From 37dc6cc5ab804da964f22787561d030898115809 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Hugo=20H=C3=B6rnquist?=
Date: Wed, 20 Jul 2022 16:25:35 +0200
Subject: Acknowledge string prefixes.

---
 module/c/cpp-util.scm      | 10 ++++++----
 module/c/lex2.scm          | 26 ++++++++++++++++++++------
 module/c/preprocessor2.scm | 20 +++++++++++++-------
 module/c/unlex.scm         | 15 ++++++++++-----
 tests/test/cpp/lex2.scm    | 35 ++++++++++++++++++++++++++++++-----
 tests/test/cpp/util.scm    | 14 ++++++++++++++
 6 files changed, 93 insertions(+), 27 deletions(-)
 create mode 100644 tests/test/cpp/util.scm

diff --git a/module/c/cpp-util.scm b/module/c/cpp-util.scm
index 633c5a0c..9674317b 100644
--- a/module/c/cpp-util.scm
+++ b/module/c/cpp-util.scm
@@ -117,12 +117,14 @@
      ((null? (cdr tokens)) tokens)
      ((string-token? (car tokens))
       (lambda (a . _) a)
-      => (lambda parts-a
+      => (lambda (prefix-a . parts-a)
            (cond ((string-token? (cadr tokens))
                   (lambda (a . _) a)
-                  => (lambda parts-b (merge-string-literals
-                                      (cons (make-string-literal (append parts-a parts-b))
-                                            (cddr tokens)))))
+                  => (lambda (prefix-b . parts-b)
+                       (merge-string-literals
+                        ;; TODO check validity of prefixes
+                        (cons (make-string-literal (cons prefix-a (append parts-a parts-b)))
+                              (cddr tokens)))))
                  (else (cons (car tokens)
                              (merge-string-literals (cdr tokens)))))))
      (else (cons (car tokens) (merge-string-literals (cdr tokens))))))
diff --git a/module/c/lex2.scm b/module/c/lex2.scm
index c09f6423..85c9be19 100644
--- a/module/c/lex2.scm
+++ b/module/c/lex2.scm
@@ -374,12 +374,26 @@
     (`(comment ,body) (lexeme body: body
                               type: 'comment
                               ))
     (`(preprocessing-token ,body)
-     (case body
-       ;; "unflatten"
-       ((string-literal)
-        (lexeme body: '(string-literal "") type: 'preprocessing-token))
-       (else
-        (lexeme body: body type: 'preprocessing-token))))
+     (match body
+       ('string-literal
+        ;; Unflatten case
+        (lexeme body: '(string-literal (encoding-prefix) "")
+                type: 'preprocessing-token))
+       (('string-literal `(encoding-prefix ,px) args ...)
+        (lexeme body: `(string-literal (encoding-prefix . ,px) ,@args)
+                type: 'preprocessing-token))
+       (('string-literal args ...)
+        (lexeme body: `(string-literal (encoding-prefix) ,@args)
+                type: 'preprocessing-token))
+       (('character-constant `(character-prefix ,px) args ...)
+        (lexeme body: `(character-constant (character-prefix . ,px)
+                                           ,@args)
+                type: 'preprocessing-token))
+       (('character-constant args ...)
+        (lexeme body: `(character-constant (character-prefix) ,@args)
+                type: 'preprocessing-token))
+       (body (lexeme body: body type: 'preprocessing-token))))
+
     ;; "unflatten"
     ('comment (lexeme body: "" type: 'comment))))
diff --git a/module/c/preprocessor2.scm b/module/c/preprocessor2.scm
index c6be3936..3f9552c5 100644
--- a/module/c/preprocessor2.scm
+++ b/module/c/preprocessor2.scm
@@ -397,11 +397,15 @@
      body: (lambda (environment arguments)
              (typecheck arguments
                         (and (list-of (list-of lexeme?))
                              (not null?)))
-             (aif (string-token? (caar arguments))
-                  (values (handle-pragma environment (lex it)) '())
-                  (scm-error 'cpp-pragma-error "_Pragma"
-                             "Invalid argument to _Pragma: ~s"
-                             (list (car arguments)) #f)))))
+             (cond ((string-token? (caar arguments))
+                    (lambda (a . _) a)
+                    ;; TODO handle rest
+                    => (lambda (encoding it . rest)
+                         (values (handle-pragma environment (lex it))
+                                 '())))
+                   (else (scm-error 'cpp-pragma-error "_Pragma"
+                                    "Invalid argument to _Pragma: ~s"
+                                    (list (car arguments)) #f))))))
 
 
@@ -591,10 +595,12 @@
                 (remaining (drop-whitespace (cdr tokens))))
            (cond ((null? remaining) (set environment current-line (1- line)))
                  ((string-token? (car remaining))
-                  => (lambda (file)
+                  (lambda (a . _) a)
+                  => (lambda (encoding . fragments)
                        (-> environment
                            (set current-line (1- line))
-                           (set current-file file))))
+                           ;; TODO properly join string
+                           (set current-file (car fragments)))))
                  ;; no newlines in #line
                  (%first-time (loop #f ((unval resolve-token-stream 1) environment tokens)))
                  (else (err))))))
diff --git a/module/c/unlex.scm b/module/c/unlex.scm
index 1f8ba0a1..e467a50f 100644
--- a/module/c/unlex.scm
+++ b/module/c/unlex.scm
@@ -59,21 +59,26 @@
 ;; Returns the "source" of the token, as a preprocessing string literal token
 (define (stringify-token preprocessing-token)
   (match (lexeme-body preprocessing-token)
-    (('string-literal `(encoding-prefix ,prefix) parts ...)
-     (stringify-string-tokens parts))
-    (('string-literal parts ...)
+    (('string-literal `(encoding-prefix . ,prefix) parts ...)
      (stringify-string-tokens parts))
+
     (`(header-name (q-string ,s)) (format #f "~s" s))
     (`(header-name (h-string ,s)) (format #f "<~a>" s))
+
     (`(identifier ,id) id)
+
     (`(pp-number ,n) n)
-    (`(character-constant ,c)
+
+    ;; TODO remaining parts
+    (('character-constant `(character-encoding . ,x) c parts ...)
      (format #f "'~a'" c))
+
     (`(punctuator ,p) p)))
 
 
 ;; takes a token list, and return a single string literal token
 (define (stringify-tokens tokens)
   (lexeme type: 'preprocessing-token
-          body: `(string-literal ,(unlex-aggressive tokens))))
+          body: `(string-literal (encoding-prefix)
+                                 ,(unlex-aggressive tokens))))
 
diff --git a/tests/test/cpp/lex2.scm b/tests/test/cpp/lex2.scm
index af6163e8..f4f9b857 100644
--- a/tests/test/cpp/lex2.scm
+++ b/tests/test/cpp/lex2.scm
@@ -14,7 +14,7 @@
   (lex "10"))
 
 (test-equal "String literal"
-  (ls `(string-literal "Hello"))
+  (ls `(string-literal (encoding-prefix) "Hello"))
   (lex "\"Hello\""))
 
 
@@ -31,7 +31,7 @@
 
 
 (test-equal "Comment inside string"
-  (ls `(string-literal "Hel/*lo"))
+  (ls `(string-literal (encoding-prefix) "Hel/*lo"))
   (lex "\"Hel/*lo\""))
 
 (test-equal "#define line"
@@ -117,7 +117,7 @@
 (test-equal "Interaction of h-strings and regular strings"
   (test-equal "Less than string, not h-string"
     (ls '(pp-number "1")
-        '(string-literal "<")
+        '(string-literal (encoding-prefix) "<")
         '(punctuator ">"))
     (lex "1\"<\">"))
 
@@ -131,7 +131,7 @@
    (list (l '(punctuator "#"))
          (l '(identifier "include"))
          (lexeme type: 'whitespace body: " ")
-         (l '(string-literal "test")))
+         (l '(string-literal (encoding-prefix) "test")))
    ;; # include here, since generated tokens could possible depend on that context,
    ;; and the reason regular strings are returned is since the lexer doesn't check
    ;; that context
@@ -142,7 +142,7 @@
 
 (test-group "Unicode"
   (test-equal "In string literals"
-    (ls '(string-literal "åäö"))
+    (ls '(string-literal (encoding-prefix) "åäö"))
     (lex "\"åäö\""))
 
   (test-equal "Outside string literals"
@@ -150,3 +150,28 @@
         (lexeme type: 'other body: "ä")
         (lexeme type: 'other body: "ö"))
     (lex "åäö")))
+
+
+
+
+(test-group "Characters with prefixes"
+  (test-equal (ls '(character-constant (character-prefix . "u")
+                                       "a"))
+    (lex "u'a'"))
+  (test-equal (ls '(character-constant (character-prefix . "U")
+                                       "a"))
+    (lex "U'a'"))
+  (test-equal (ls '(character-constant (character-prefix . "L")
+                                       "a"))
+    (lex "L'a'")))
+
+;; Note that these strings have 0 "data" components
+(test-group "Strings with prefixes"
+  (test-equal (ls '(string-literal (encoding-prefix . "u8")))
+    (lex "u8\"\""))
+  (test-equal (ls '(string-literal (encoding-prefix . "u")))
+    (lex "u\"\""))
+  (test-equal (ls '(string-literal (encoding-prefix . "U")))
+    (lex "U\"\""))
+  (test-equal (ls '(string-literal (encoding-prefix . "L")))
+    (lex "L\"\"")))
diff --git a/tests/test/cpp/util.scm b/tests/test/cpp/util.scm
new file mode 100644
index 00000000..8329294a
--- /dev/null
+++ b/tests/test/cpp/util.scm
@@ -0,0 +1,14 @@
+(define-module (test cpp util)
+  :use-module (srfi srfi-64)
+  :use-module (srfi srfi-88)
+  :use-module (c cpp-util)
+  :use-module ((c lex2) :select (lex lexeme)))
+
+(test-group "Merge string literals"
+  (test-equal "To simple strings"
+    (list (lexeme type: 'preprocessing-token
+                  body: '(string-literal (encoding-prefix) "Hello" "World")))
+    (merge-string-literals (lex "\"Hello\"\"World\"")))
+
+  ;; TODO tests with prefixes
+  )
-- 
cgit v1.2.3