From 37dc6cc5ab804da964f22787561d030898115809 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Hugo=20H=C3=B6rnquist?=
Date: Wed, 20 Jul 2022 16:25:35 +0200
Subject: Acknowledge string prefixes.

---
 module/c/cpp-util.scm      | 10 ++++++----
 module/c/lex2.scm          | 26 ++++++++++++++++++++------
 module/c/preprocessor2.scm | 20 +++++++++++++-------
 module/c/unlex.scm         | 15 ++++++++++-----
 tests/test/cpp/lex2.scm    | 35 ++++++++++++++++++++++++++++++-----
 tests/test/cpp/util.scm    | 14 ++++++++++++++
 6 files changed, 93 insertions(+), 27 deletions(-)
 create mode 100644 tests/test/cpp/util.scm

diff --git a/module/c/cpp-util.scm b/module/c/cpp-util.scm
index 633c5a0c..9674317b 100644
--- a/module/c/cpp-util.scm
+++ b/module/c/cpp-util.scm
@@ -117,12 +117,14 @@
      ((null? (cdr tokens)) tokens)
      ((string-token? (car tokens))
       (lambda (a . _) a)
-      => (lambda parts-a
+      => (lambda (prefix-a . parts-a)
            (cond ((string-token? (cadr tokens))
                   (lambda (a . _) a)
-                  => (lambda parts-b (merge-string-literals
-                                      (cons (make-string-literal (append parts-a parts-b))
-                                            (cddr tokens)))))
+                  => (lambda (prefix-b . parts-b)
+                       (merge-string-literals
+                        ;; TODO check validity of prefixes
+                        (cons (make-string-literal (cons prefix-a (append parts-a parts-b)))
+                              (cddr tokens)))))
                  (else (cons (car tokens)
                              (merge-string-literals (cdr tokens)))))))
      (else (cons (car tokens) (merge-string-literals (cdr tokens))))))
diff --git a/module/c/lex2.scm b/module/c/lex2.scm
index c09f6423..85c9be19 100644
--- a/module/c/lex2.scm
+++ b/module/c/lex2.scm
@@ -374,12 +374,26 @@
     (`(comment ,body) (lexeme body: body
                               type: 'comment
                               ))
     (`(preprocessing-token ,body)
-     (case body
-       ;; "unflatten"
-       ((string-literal)
-        (lexeme body: '(string-literal "") type: 'preprocessing-token))
-       (else
-        (lexeme body: body type: 'preprocessing-token))))
+     (match body
+       ('string-literal
+        ;; Unflatten case
+        (lexeme body: '(string-literal (encoding-prefix) "")
+                type: 'preprocessing-token))
+       (('string-literal `(encoding-prefix ,px) args ...)
+        (lexeme body: `(string-literal (encoding-prefix . ,px) ,@args)
+                type: 'preprocessing-token))
+       (('string-literal args ...)
+        (lexeme body: `(string-literal (encoding-prefix) ,@args)
+                type: 'preprocessing-token))
+       (('character-constant `(character-prefix ,px) args ...)
+        (lexeme body: `(character-constant (character-prefix . ,px)
+                                           ,@args)
+                type: 'preprocessing-token))
+       (('character-constant args ...)
+        (lexeme body: `(character-constant (character-prefix) ,@args)
+                type: 'preprocessing-token))
+       (body (lexeme body: body type: 'preprocessing-token))))
+
     ;; "unflatten"
     ('comment (lexeme body: "" type: 'comment))))
diff --git a/module/c/preprocessor2.scm b/module/c/preprocessor2.scm
index c6be3936..3f9552c5 100644
--- a/module/c/preprocessor2.scm
+++ b/module/c/preprocessor2.scm
@@ -397,11 +397,15 @@
      body: (lambda (environment arguments)
              (typecheck arguments
                         (and (list-of (list-of lexeme?))
                              (not null?)))
-             (aif (string-token? (caar arguments))
-                  (values (handle-pragma environment (lex it)) '())
-                  (scm-error 'cpp-pragma-error "_Pragma"
-                             "Invalid argument to _Pragma: ~s"
-                             (list (car arguments)) #f)))))
+             (cond ((string-token? (caar arguments))
+                    (lambda (a . _) a)
+                    ;; TODO handle rest
+                    => (lambda (encoding it . rest)
+                         (values (handle-pragma environment (lex it))
+                                 '())))
+                   (else (scm-error 'cpp-pragma-error "_Pragma"
+                                    "Invalid argument to _Pragma: ~s"
+                                    (list (car arguments)) #f))))))
 
 
@@ -591,10 +595,12 @@
                 (remaining (drop-whitespace (cdr tokens))))
            (cond ((null? remaining) (set environment current-line (1- line)))
                  ((string-token? (car remaining))
-                  => (lambda (file)
+                  (lambda (a . _) a)
+                  => (lambda (encoding . fragments)
                        (-> environment
                            (set current-line (1- line))
-                           (set current-file file))))
+                           ;; TODO properly join string
+                           (set current-file (car fragments)))))
                  ;; no newlines in #line
                  (%first-time (loop #f ((unval resolve-token-stream 1) environment tokens)))
                  (else (err))))))
diff --git a/module/c/unlex.scm b/module/c/unlex.scm
index 1f8ba0a1..e467a50f 100644
--- a/module/c/unlex.scm
+++ b/module/c/unlex.scm
@@ -59,21 +59,26 @@
 ;; Returns the "source" of the token, as a preprocessing string literal token
 (define (stringify-token preprocessing-token)
   (match (lexeme-body preprocessing-token)
-    (('string-literal `(encoding-prefix ,prefix) parts ...)
-     (stringify-string-tokens parts))
-    (('string-literal parts ...)
+    (('string-literal `(encoding-prefix . ,prefix) parts ...)
      (stringify-string-tokens parts))
+
     (`(header-name (q-string ,s)) (format #f "~s" s))
     (`(header-name (h-string ,s)) (format #f "<~a>" s))
+
     (`(identifier ,id) id)
+
     (`(pp-number ,n) n)
-    (`(character-constant ,c)
+
+    ;; TODO remaining parts
+    (('character-constant `(character-encoding . ,x) c parts ...)
      (format #f "'~a'" c))
+
     (`(punctuator ,p) p)))
 
 
 ;; takes a token list, and return a single string literal token
 (define (stringify-tokens tokens)
   (lexeme type: 'preprocessing-token
-          body: `(string-literal ,(unlex-aggressive tokens))))
+          body: `(string-literal (encoding-prefix)
+                                 ,(unlex-aggressive tokens))))
 
diff --git a/tests/test/cpp/lex2.scm b/tests/test/cpp/lex2.scm
index af6163e8..f4f9b857 100644
--- a/tests/test/cpp/lex2.scm
+++ b/tests/test/cpp/lex2.scm
@@ -14,7 +14,7 @@
   (lex "10"))
 
 (test-equal "String literal"
-  (ls `(string-literal "Hello"))
+  (ls `(string-literal (encoding-prefix) "Hello"))
   (lex "\"Hello\""))
 
 
@@ -31,7 +31,7 @@
 
 
 (test-equal "Comment inside string"
-  (ls `(string-literal "Hel/*lo"))
+  (ls `(string-literal (encoding-prefix) "Hel/*lo"))
   (lex "\"Hel/*lo\""))
 
 (test-equal "#define line"
@@ -117,7 +117,7 @@
 (test-equal "Interaction of h-strings and regular strings"
   (test-equal "Less than string, not h-string"
     (ls '(pp-number "1")
-        '(string-literal "<")
+        '(string-literal (encoding-prefix) "<")
         '(punctuator ">"))
     (lex "1\"<\">"))
 
@@ -131,7 +131,7 @@
    (list (l '(punctuator "#"))
          (l '(identifier "include"))
          (lexeme type: 'whitespace body: " ")
-         (l '(string-literal "test")))
+         (l '(string-literal (encoding-prefix) "test")))
    ;; # include here, since generated tokens could possible depend on that context,
    ;; and the reason regular strings are returned is since the lexer doesn't check
    ;; that context
@@ -142,7 +142,7 @@
 
 (test-group "Unicode"
   (test-equal "In string literals"
-    (ls '(string-literal "åäö"))
+    (ls '(string-literal (encoding-prefix) "åäö"))
     (lex "\"åäö\""))
 
   (test-equal "Outside string literals"
@@ -150,3 +150,28 @@
         (lexeme type: 'other body: "ä")
         (lexeme type: 'other body: "ö"))
     (lex "åäö")))
+
+
+
+
+(test-group "Characters with prefixes"
+  (test-equal (ls '(character-constant (character-prefix . "u")
+                                       "a"))
+    (lex "u'a'"))
+  (test-equal (ls '(character-constant (character-prefix . "U")
+                                       "a"))
+    (lex "U'a'"))
+  (test-equal (ls '(character-constant (character-prefix . "L")
+                                       "a"))
+    (lex "L'a'")))
+
+;; Note that these strings have 0 "data" components
+(test-group "Strings with prefixes"
+  (test-equal (ls '(string-literal (encoding-prefix . "u8")))
+    (lex "u8\"\""))
+  (test-equal (ls '(string-literal (encoding-prefix . "u")))
+    (lex "u\"\""))
+  (test-equal (ls '(string-literal (encoding-prefix . "U")))
+    (lex "U\"\""))
+  (test-equal (ls '(string-literal (encoding-prefix . "L")))
+    (lex "L\"\"")))
diff --git a/tests/test/cpp/util.scm b/tests/test/cpp/util.scm
new file mode 100644
index 00000000..8329294a
--- /dev/null
+++ b/tests/test/cpp/util.scm
@@ -0,0 +1,14 @@
+(define-module (test cpp util)
+  :use-module (srfi srfi-64)
+  :use-module (srfi srfi-88)
+  :use-module (c cpp-util)
+  :use-module ((c lex2) :select (lex lexeme)))
+
+(test-group "Merge string literals"
+  (test-equal "To simple strings"
+    (list (lexeme type: 'preprocessing-token
+                  body: '(string-literal (encoding-prefix) "Hello" "World")))
+    (merge-string-literals (lex "\"Hello\"\"World\"")))
+
+  ;; TODO tests with prefixes
+  )
-- 
cgit v1.2.3