From 64375c16c8316b8381ad59fa3538ad84732d90b7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Hugo=20H=C3=B6rnquist?= Date: Thu, 14 Jul 2022 23:44:03 +0200 Subject: work --- module/c/cpp-types.scm | 23 +++-- module/c/cpp-util.scm | 19 +++- module/c/lex2.scm | 44 +++++++++- module/c/preprocessor2.scm | 185 ++++++++++++++++++++------------------- module/c/to-token.scm | 12 ++- tests/test/cpp/lex2.scm | 54 ++++++++++++ tests/test/cpp/preprocessor2.scm | 80 +++++++++++++++-- 7 files changed, 306 insertions(+), 111 deletions(-) diff --git a/module/c/cpp-types.scm b/module/c/cpp-types.scm index b08e9810..6dad061e 100644 --- a/module/c/cpp-types.scm +++ b/module/c/cpp-types.scm @@ -2,6 +2,7 @@ :use-module (c lex2) :use-module (ice-9 match) :use-module (c cpp-util) + :use-module (hnh util type) :export (whitespace-token? comment-token? preprocessing-token? @@ -15,6 +16,9 @@ h-string-token? q-string-token? character-constant? + comment->whitespace + comments->whitespace + make-string-literal )) (define (whitespace-token? x) @@ -59,29 +63,32 @@ (`(pp-number ,x) x) (_ #f)))) -;; TODO this fails if there are multiple components in the string token ;; TODO rename to string-literal-token? (define (string-token? token) (and (preprocessing-token? token) (match (lexeme-body token) - (`(string-literal ,x) x) + (('string-literal x ...) (apply values x)) (_ #f)))) (define (character-constant? token) (and (preprocessing-token? token) (match (lexeme-body token) - (`(character-constant ,x) x) + (('character-constant x ...) (apply values x)) (_ #f)))) (define (h-string-token? token) (and (preprocessing-token? token) (match (lexeme-body token) - (`(h-string ,x) x) + (`(header-name (h-string ,x)) x) (_ #f)))) +;; NOTE q-string tokens are never produced by the lexer, +;; since they instead are treated as regular strings (define (q-string-token? token) - (and (preprocessing-token? token) - (match (lexeme-body token) - (`(q-string ,x) x) - (_ #f)))) + (string-token? token)) + +(define (make-string-literal parts) + (typecheck parts (list-of (or string? list?))) + (lexeme type: 'preprocessing-token + body: (cons 'string-literal parts))) diff --git a/module/c/cpp-util.scm b/module/c/cpp-util.scm index 3ea06505..633c5a0c 100644 --- a/module/c/cpp-util.scm +++ b/module/c/cpp-util.scm @@ -15,7 +15,8 @@ drop-whitespace-right drop-whitespace-both cleanup-whitespace - concatenate-tokens)) + concatenate-tokens + merge-string-literals)) ;; Does the next non-whitespace token in the stream satisfy the predicate? @@ -109,3 +110,19 @@ (define (concatenate-tokens a b) (car (lex (string-append (unlex (list a)) (unlex (list b)))))) + + +(define (merge-string-literals tokens) + (cond ((null? tokens) '()) + ((null? (cdr tokens)) tokens) + ((string-token? (car tokens)) + (lambda (a . _) a) + => (lambda parts-a + (cond ((string-token? (cadr tokens)) + (lambda (a . _) a) + => (lambda parts-b (merge-string-literals + (cons (make-string-literal (append parts-a parts-b)) + (cddr tokens))))) + (else (cons (car tokens) + (merge-string-literals (cdr tokens))))))) + (else (cons (car tokens) (merge-string-literals (cdr tokens)))))) diff --git a/module/c/lex2.scm b/module/c/lex2.scm index 049cc48c..647eff55 100644 --- a/module/c/lex2.scm +++ b/module/c/lex2.scm @@ -1,9 +1,12 @@ (define-module (c lex2) :use-module (ice-9 peg) :use-module (ice-9 match) + :use-module ((hnh util) :select (->)) :use-module (hnh util object) :use-module (hnh util type) :use-module (srfi srfi-88) + :use-module ((c trigraph) :select (replace-trigraphs)) + :use-module ((c line-fold) :select (fold-lines)) :export (lex lexeme lexeme? placemaker @@ -12,6 +15,8 @@ (noexpand . lexeme-noexpand) parse-c-number + + tokenize )) ;;; A.1 Lexical grammar @@ -268,10 +273,15 @@ (or "[" "]" "(" ")" "{" "}" "..." ; Moved to be before "." "." "->" - "++" "--" "&" "*" "+" "-" "~" "!" - "/" "%" "<<" ">>" "<" ">" "<=" ">=" "==" "!=" "^" "|" "&&" "||" + "&&" "||" + "!=" + "++" "--" + "*=" "/=" "%=" "+=" "-=" "<<=" ">>=" "&=" "^=" "|=" + "<=" ">=" "==" + "=" + "/" "%" "<<" ">>" "<" ">" "^" "|" "?" ":" ";" - "=" "*=" "/=" "%=" "+=" "-=" "<<=" ">>=" "&=" "^=" "|=" + "&" "*" "+" "-" "~" "!" "," "##" "#" ; # and ## flipped "<:" ":>" "<%" "%>" "%:%:" "%:" ; %: and %:%: flipped )) @@ -284,6 +294,8 @@ ;; (6.4.7) (define-peg-pattern header-name all (or (and (ignore "<") h-string (ignore ">")) + ;; NOTE this case will never be reached, since it's treated as a regular + ;; string instead (and (ignore "\"") q-string (ignore "\"")))) ;; (6.4.7) @@ -402,3 +414,29 @@ (define (parse-c-number string) (match-pattern constant string)) + + + +;;; 5.1.11.2 Translation phases + +(define (tokenize string) + (-> string +;;; 1. trigraph replacement + replace-trigraphs +;;; 2. Line folding + fold-lines +;;; 3. Decomposition into preprocenning tokens, whitespaces, and comments + lex + comments->whitespace)) + +;; These really belong in (c cpp-types), but that would create a dependency cycle + +(define (comment->whitespace token) + (if ;; (comment-token? token) + (and (lexeme? token) + (eq? 'comment (type token))) + (car (lex " ")) + token)) + +(define (comments->whitespace tokens) + (map comment->whitespace tokens)) diff --git a/module/c/preprocessor2.scm b/module/c/preprocessor2.scm index c1db3f08..f18ca748 100644 --- a/module/c/preprocessor2.scm +++ b/module/c/preprocessor2.scm @@ -9,18 +9,33 @@ :select (function-like-macro variadic? identifier-list)) :use-module ((c cpp-environment object-like-macro) :select (object-like-macro object-like-macro?)) :use-module ((c cpp-environment internal-macro) :select (internal-macro)) - :use-module ((hnh util) :select (-> intersperse aif swap unless unval)) + :use-module ((hnh util) :select (-> ->> intersperse aif swap unless unval)) :use-module ((hnh util lens) :select (set modify cdr*)) :use-module (hnh util path) :use-module (hnh util type) - :use-module ((c lex2) :select (lex placemaker lexeme? lexeme-body lexeme-noexpand)) - :use-module ((c trigraph) :select (replace-trigraphs)) - :use-module ((c line-fold) :select (fold-lines)) + :use-module ((hnh util values) :select (abort* on-fst on-snd apply/values)) + :use-module ((c lex2) + :select (lex + placemaker + lexeme? + lexeme-body + lexeme-noexpand + + tokenize + )) :use-module (c unlex) :use-module (c cpp-types) :use-module (c cpp-util) - :use-module (ice-9 control) - :export (defined-macro _Pragma-macro)) + :export (_Pragma-macro + defined-macro + c-search-path + handle-preprocessing-tokens)) + + +(define (read-file path) + (call-with-input-file path (@ (ice-9 rdelim) read-string))) + + (define-syntax-rule (alist-of variable key-type value-type) (build-validator-body variable (list-of (pair-of key-type value-type)))) @@ -36,18 +51,10 @@ (define (ellipsis-token? token) (equal? "..." (punctuator-token? token))) -(define-syntax-rule (abort* form) - (call-with-values (lambda () form) abort)) - -(define-syntax-rule (on-fst form) - (% form - (lambda (prompt fst . rest) - (apply values (prompt fst) rest)))) - -(define-syntax-rule (on-snd form) - (% form - (lambda (prompt fst snd . rest) - (apply values fst (prompt snd) rest)))) +;; TODO +;; > #if defined X +;; is equivalent to +;; > #if defined(X) ;; parameters is a lexeme list, as returned by parse-parameter-list @@ -335,28 +342,6 @@ identifier: "__LINE__" body: (lex (number->string (current-line environment))))))) -(define (c-search-path) (make-parameter (list "." "/usr/include"))) - -;; #include -(define (resolve-h-file string) - (typecheck string string?) - (cond ((path-absolute? string) string) - (else - (let ((filename - (find file-exists? - (map (lambda (path-prefix) - (path-append path-prefix string)) - (c-search-path))))) - (if filename filename - (scm-error 'cpp-error "resolve-h-file" - "Can't resolve file: ~s" - (list string) #f)))))) - -;; #include "myheader.h" -(define (resolve-q-file string) - (typecheck string string?) - ;; This should always be a fallback (6.10.2, p. 3) - (cond (else (resolve-h-file string)))) (define defined-macro (internal-macro @@ -412,9 +397,20 @@ environment)))) -(define (resolve-constant-expression tokens) +;; 6.10.1 p. 4 +(define (resolve-constant-expression cpp-tokens) ;; (typecheck tokens (list-of lexeme?)) + (define zero (car (lex "0"))) + #; + (define tokens + (map preprocessing-token->token + (map (lambda (token) + (cond ((identifier-token? token) zero) + (else token))) + (remove whitespace-token? tokens)))) + 'TODO + ;; eval as per 6.6 ) @@ -446,13 +442,12 @@ (on-snd (cons (car tokens) (abort* (loop (bump-line environment) (cdr tokens)))))) ((and (identifier-token? (car tokens)) (not (marked-noexpand? (car tokens)))) - (call-with-values - (lambda () (maybe-extend-identifier environment - (identifier-token? (car tokens)) - (lexeme-noexpand (car tokens)) - (cdr tokens))) - ;; Here is the after expansion - (if once? values loop))) + ;; Here is the loop after expansion + (apply/values (if once? values loop) + (maybe-extend-identifier environment + (identifier-token? (car tokens)) + (lexeme-noexpand (car tokens)) + (cdr tokens)))) (else (on-snd (cons (car tokens) (abort* (loop environment (cdr tokens))))))))) @@ -490,7 +485,37 @@ identifier) remaining-tokens))))) -(define (resolve-and-include-header environment tokens) +;; 'gcc -xc -E -v /dev/null' prints GCC:s search path +(define c-search-path + (make-parameter (list "/usr/include" + "/usr/local/include"))) + +;; #include +(define (resolve-h-file string) + (typecheck string string?) + (cond + ;; NOTE do I want this case? + ;; GCC has it + ((path-absolute? string) string) + (else + (or + (find file-exists? + (map (lambda (path-prefix) + (path-append path-prefix string)) + (c-search-path))) + (scm-error 'cpp-error "resolve-h-file" + "Can't resolve file: ~s" + (list string) #f))))) + +;; #include "myheader.h" +(define (resolve-q-file string) + (typecheck string string?) + (cond ((file-exists? string) string) + ;; This should always be a fallback (6.10.2, p. 3) + (else (resolve-h-file string)))) + + +(define (resolve-header environment tokens) (typecheck environment cpp-environment?) ;; (typecheck tokens (list-of lexeme?)) @@ -499,21 +524,17 @@ (string-append msg ", tokens: ~s") (append args (list (unlex tokens))) #f)))) (let loop ((%first-time #t) (tokens tokens)) - (cond ((null? tokens) '()) + (cond ((null? tokens) (err "Invalid #include line")) ((h-string-token? (car tokens)) => (lambda (str) (unless (null? (drop-whitespace (cdr tokens))) (err "Unexpected tokens after #include <>")) - (handle-preprocessing-tokens - environment - (-> str resolve-h-file read-file tokenize)))) + (resolve-h-file str))) ((q-string-token? (car tokens)) => (lambda (str) (unless (null? (drop-whitespace (cdr tokens))) (err "Unexpected tokens after #include \"\"")) - (handle-preprocessing-tokens - environment - (-> str resolve-q-file read-file tokenize)))) + (resolve-q-file str))) (else (unless %first-time (err "Failed parsing tokens")) ;; No newlines in #include @@ -623,11 +644,23 @@ (body (drop-whitespace (cdr line-tokens)))) (if (eq? 'include directive) ;; include is special since it returns a token stream - (call-with-values - (lambda () (resolve-and-include-header environment body)) - (lambda (environment tokens) - (loop environment - (append tokens remaining-tokens)))) + (let ((path (resolve-header environment body))) + ;; TODO change to store source location in lexemes + ;; and rewrite the following to + ;; (loop environment + ;; (append (-> path read-file tokenize) remaining-tokens)) + ;; TODO and then transfer these source locations when we move + ;; to "real" tokens (c to-token) + (let ((env* tokens* + (loop + ;; same hack as at start of loop + (-> environment + (enter-file path) + (bump-line -1)) + (append (lex "\n") + (-> path read-file tokenize))))) + (on-snd (append tokens* (abort* (loop (leave-file env*) remaining-tokens)))))) + (let ((operation ; (environment, list token) → environment (case directive ((if) resolve-for-if) @@ -674,32 +707,6 @@ (unless (null? remaining-tokens) (lex "\n")) (abort* (loop env* remaining-tokens)))))))))) - (else (err "Unexpected middle of line"))))) - - - -(define (read-file path) - (call-with-input-file path (@ (ice-9 rdelim) read-string))) + (else (err "Unexpected middle of line, (near ~s)" + (unlex tokens)))))) -(define (comment->whitespace token) - (if (comment-token? token) - (car (lex " ")) - token)) - -(define (comments->whitespace tokens) - (map comment->whitespace tokens)) - -;;; 5.1.11.2 Translation phases - -(define (tokenize string) - (-> string -;;; 1. trigraph replacement - replace-trigraphs -;;; 2. Line folding - fold-lines -;;; 3. Decomposition into preprocenning tokens, whitespaces, and comments - lex -;;; 4. Execution of preprocessing directives, all preprocessing directives are then deleted - comments->whitespace - ;; squeeze-whitespace-blocks - )) diff --git a/module/c/to-token.scm b/module/c/to-token.scm index c1efcc02..53db7e59 100644 --- a/module/c/to-token.scm +++ b/module/c/to-token.scm @@ -19,9 +19,16 @@ "_Generic" "_Imaginary" "_Noreturn" "_Static_assert" "_Thread_local")) +;; 6.4 paragraph 2 +;; Each preprocessing toket thas is converted to a token shall have the +;; lexcal form of a keyword, an identifier, a constant, a string literal, +;; or a puncturtor (define (preprocessing-token->token cpp-token) + ;; Guile's cond handles multiple from expr, if written on the form + ;; (cond (expr check => proc) ...) (cond ((string-token? cpp-token) - => (lambda (content) + (lambda (a . _) a) + => (lambda content (make-lexical-token 'string-literal #f content))) ((identifier-token? cpp-token) @@ -36,7 +43,8 @@ (make-lexical-token 'constant #f content))) ((character-constant? cpp-token) - => (lambda (x) (make-lexical-token 'constant #f x))) + (lambda (a . _) a) + => (lambda content (make-lexical-token 'constant #f content))) ((punctuator-token? cpp-token) => (lambda (s) diff --git a/tests/test/cpp/lex2.scm b/tests/test/cpp/lex2.scm index 47bb4a16..b7087c3b 100644 --- a/tests/test/cpp/lex2.scm +++ b/tests/test/cpp/lex2.scm @@ -92,3 +92,57 @@ (lexeme type: 'preprocessing-token body: '(punctuator ".")) (lexeme type: 'preprocessing-token body: '(identifier "dir"))) (lex "..\\listing.dir")) + + +(test-equal "Propper H-string" + (list (lexeme type: 'preprocessing-token body: '(header-name (h-string "a")))) + (lex "")) + +(test-equal "Unexpected h-string" + (list (lexeme type: 'preprocessing-token body: '(pp-number "1")) + (lexeme type: 'whitespace body: " ") + (lexeme type: 'preprocessing-token body: '(header-name (h-string " 2 "))) + (lexeme type: 'whitespace body: " ") + (lexeme type: 'preprocessing-token body: '(pp-number "3"))) + (lex "1 < 2 > 3")) + +(test-equal "Quotation mark inside h-string" + (list (lexeme type: 'preprocessing-token body: '(header-name (h-string "a\"b")))) + (lex "")) + +(test-equal "Interaction of h-strings and regular strings" + (test-equal "Less than string, not h-string" + (list (lexeme type: 'preprocessing-token body: '(pp-number "1")) + (lexeme type: 'preprocessing-token body: '(string-literal "<")) + (lexeme type: 'preprocessing-token body: '(punctuator ">"))) + (lex "1\"<\">")) + + (test-equal "H-string, not string" + (list (lexeme type: 'preprocessing-token body: '(pp-number "1")) + (lexeme type: 'preprocessing-token body: '(header-name (h-string "\""))) + (lexeme type: 'other body: "\"")) + (lex "1<\">\""))) + +(test-equal "Q-strings are lexed as regular strings" + (list (lexeme type: 'preprocessing-token body: '(punctuator "#")) + (lexeme type: 'preprocessing-token body: '(identifier "include")) + (lexeme type: 'whitespace body: " ") + (lexeme type: 'preprocessing-token body: '(string-literal "test"))) + ;; # include here, since generated tokens could possible depend on that context, + ;; and the reason regular strings are returned is since the lexer doesn't check + ;; that context + (lex "#include \"test\"") + ) + + + +(test-group "Unicode" + (test-equal "In string literals" + (list (lexeme type: 'preprocessing-token body: '(string-literal "åäö"))) + (lex "\"åäö\"")) + + (test-equal "Outside string literals" + (list (lexeme type: 'other body: "å") + (lexeme type: 'other body: "ä") + (lexeme type: 'other body: "ö")) + (lex "åäö"))) diff --git a/tests/test/cpp/preprocessor2.scm b/tests/test/cpp/preprocessor2.scm index 9f308c9e..71e0c1a0 100644 --- a/tests/test/cpp/preprocessor2.scm +++ b/tests/test/cpp/preprocessor2.scm @@ -38,7 +38,7 @@ :use-module (c lex2) ) -;; TODO Redefinition code isn't yet written +;; TODO Redefinition checking code isn't yet written (test-skip "Example 6") ;; See (c preprocessor2) TODO#1 @@ -63,8 +63,11 @@ (define parse-parameter-list (@@ (c preprocessor2) parse-parameter-list)) (define resolve-define (@@ (c preprocessor2) resolve-define)) (define resolve-token-stream (@@ (c preprocessor2) resolve-token-stream)) -(define tokenize (@@ (c preprocessor2) tokenize)) - +;; (define tokenize (@@ (c preprocessor2) tokenize)) +(define resolve-h-file (@@ (c preprocessor2) resolve-h-file)) +(define resolve-q-file (@@ (c preprocessor2) resolve-q-file)) +(define resolve-header (@@ (c preprocessor2) resolve-header)) +;; (define include-header (@@ (c preprocessor2) include-header)) ;; Remove the noexpand list from each token. @@ -78,6 +81,16 @@ (let ((env tokens (handle-preprocessing-tokens env (tokenize str)))) (drop-whitespace-both (remove-noexpand tokens)))) + (define (call-with-tmp-header string proc) + (let* ((filename (string-copy "/tmp/headerfile-XXXXXXX")) + (port (mkstemp! filename))) + (with-output-to-port port + (lambda () (display string) + )) + (close-port port) + (proc filename))) + + (test-group "Tokens until End Of Line" (call-with-values @@ -578,6 +591,7 @@ + (test-group "Line directive" (let ((e (make-environment))) (test-equal "#line " @@ -595,10 +609,60 @@ (lex "x")))))) -;; TODO -;; resolve-h-file -;; resolve-q-file -;; include +;; NOTE these tests assume a "regular" Unix system +(test-group "#include" + (test-group "Resolve header paths" + (test-equal "Find in path" + "/usr/include/stdio.h" + (resolve-h-file "stdio.h")) + + (test-error "Fail if not in path" + 'cpp-error + (resolve-h-file "This file doesn't exist")) + + (test-equal "Q-string with absolute path" + "/dev/null" + (resolve-q-file "/dev/null")) + (test-error "Q-File fails for missing file" + 'cpp-error (resolve-q-file "This file doesn't exists")) + + (test-equal "Q-strings also look in path" + "/usr/include/stdio.h" + (resolve-q-file "stdio.h"))) + + (test-group "resolve-header returns paths from pp tokens (from #include directive)" + (test-equal "H-string" + "/usr/include/stdio.h" + (resolve-header (make-environment) + (lex ""))) + (test-equal "Q-string" + "/usr/include/stdio.h" + (resolve-header (make-environment) + (lex "\"stdio.h\"")))) + + ;; TODO #include is subject to macro expansion + ;; test with through resolve-header + + + (test-group "Actually including stuff" + (call-with-tmp-header " +#define X 10 +int x; +" (lambda (filename) + (test-equal "Include through #include" + (lex "int x;\n\n10") + (run (format #f " +#include \"~a\" +X +" filename)))))) + + ;; NOTE should really be below "regular" __LINE__ tests + (call-with-tmp-header "__LINE__" (lambda (path) + (test-equal "__LINE__ in other file" + (lex "1") + (run (format #f "#include \"~a\"\n" path)))))) + + @@ -941,7 +1005,7 @@ g(x+(3,4)-w) | h 5) & m p() i[q()] = { q(1), r(2,3), r(4,), r(,5), r(,) }; char c[2][6] = { str(hello), str() };")))) - ;; TODO Example 4 skipped due to #include + ;; TODO Example 4 skipped due to #include in output (test-equal "Example 5" (unlex-aggressive (lex "int j[] = { 123, 45, 67, 89, 10, 11, 12, };")) -- cgit v1.2.3