From 5a24d35fd7dd0e1f9b117b43e97d54bde84c9ca4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Hugo=20H=C3=B6rnquist?= <hugo@lysator.liu.se>
Date: Wed, 20 Jul 2022 16:49:18 +0200
Subject: Major work on to-token.

---
 module/c/to-token.scm       | 121 ++++++++++++++++++++++++++++++++++++++------
 tests/test/cpp/to-token.scm |  65 ++++++++++++++++++++++++
 2 files changed, 171 insertions(+), 15 deletions(-)
 create mode 100644 tests/test/cpp/to-token.scm

diff --git a/module/c/to-token.scm b/module/c/to-token.scm
index 53db7e59..f5e459de 100644
--- a/module/c/to-token.scm
+++ b/module/c/to-token.scm
@@ -1,23 +1,90 @@
 (define-module (c to-token)
+  :use-module ((srfi srfi-1) :select (fold append-map))
+  :use-module (ice-9 match)
+  :use-module ((hnh util) :select (->))
   :use-module ((system base lalr)
                :select (make-lexical-token))
   :use-module (c cpp-types)
   :use-module ((c lex2) :select (parse-c-number))
   ;; :use-module (hnh util type)
+  :use-module ((rnrs bytevectors)
+               :select (make-bytevector
+                        bytevector-length
+                        bytevector-copy!
+                        u8-list->bytevector
+                        bytevector-u8-ref))
+  :use-module ((rnrs io ports)
+               :select (string->bytevector
+                        make-transcoder
+                        utf-8-codec))
   :export (preprocessing-token->token))
 
 (define (pp-number->c-number token)
   (parse-c-number (pp-number? token)))
 
 (define keywords
-  '("auto" "break" "case" "char" "const" "continue" "default"
-    "do" "double" "else" "enum" "extern" "float" "for" "goto"
-    "if" "inline" "int" "long" "register" "restrict" "return"
-    "short" "signed" "sizeof" "static" "struct" "switch"
-    "typedef" "union" "unsigned" "void" "volatile" "while"
-    "_Alignas" "_Alignof" "_Atomic" "_Bool" "_Complex"
-    "_Generic" "_Imaginary" "_Noreturn" "_Static_assert"
-    "_Thread_local"))
+  '(auto break case char const continue default
+    do double else enum extern float for goto
+    if inline int long register restrict return
+    short signed sizeof static struct switch
+    typedef union unsigned void volatile while
+    _Alignas _Alignof _Atomic _Bool _Complex
+    _Generic _Imaginary _Noreturn _Static_assert
+    _Thread_local))
+
+(define (cpp-char->bytevector c)
+  (match c
+    (`(escape-sequence (simple-escape-sequence      ,x))
+     (case (string-ref x 0)
+       ((#\a) #vu8(#x07)) ; #\alarm
+       ((#\b) #vu8(#x08)) ; #\backspace
+       ((#\f) #vu8(#x0C)) ; #\page
+       ((#\n) #vu8(#x0A)) ; #\newline
+       ((#\r) #vu8(#x0D)) ; #\return
+       ((#\t) #vu8(#x09)) ; #\tab
+       ((#\v) #vu8(#x0B)) ; #\vtab
+       ;; ' " ? \
+       (else (char->integer (string-ref x 0)))))
+
+    ;; TODO these u8-list->bytevector should depend on the
+    ;;      encoding prefix of the string/char
+    (`(escape-sequence (octal-escape-sequence       ,x))
+     (-> x (string->number 8) list u8-list->bytevector))
+
+    (`(escape-sequence (hexadecimal-escape-sequence ,x))
+     (-> x (string->number 16) list u8-list->bytevector))
+
+    (`(escape-sequence (universal-character-name    ,x))
+     (let ((n (string->number x 16)))
+       (when (<= #xD800 x #xDFFF)
+         (error))
+       (when (and (< x #xA0)
+                  (or (not (= x #x24))
+                      (not (= x #x40))
+                      (not (= x #x60))))
+         (error))
+       (-> n
+           integer->char
+           string
+           (string->bytevector (make-transcoder (utf-8-codec))))))
+    (_ (error))))
+
+(define (concat-bytevectors bvs)
+  (define target (make-bytevector (apply + (map bytevector-length bvs))))
+  (fold (lambda (bv offset)
+          (let ((len (bytevector-length bv)))
+            (bytevector-copy! bv 0 target offset len)
+            (+ offset len)))
+        0
+        bvs)
+  target)
+
+(define (handle-string-fragments content)
+  (map
+   (lambda (x) (if (string? x)
+              (string->bytevector x (make-transcoder (utf-8-codec)))
+              (cpp-char->bytevector x)))
+   content))
 
 ;; 6.4 paragraph 2
 ;; Each preprocessing toket thas is converted to a token shall have the
@@ -28,23 +95,47 @@
   ;; (cond (expr check => proc) ...)
   (cond ((string-token? cpp-token)
          (lambda (a . _) a)
-         => (lambda content
-              (make-lexical-token 'string-literal #f content)))
+         => (lambda (encoding . content)
+              (make-lexical-token
+               'string-literal #f
+               (concat-bytevectors
+                (append
+                 ;; TODO this should depend on encoding
+                 (handle-string-fragments content)
+                 (list #vu8(0)))))))
 
         ((identifier-token? cpp-token)
             => (lambda (name)
-                 (if (member name keywords)
-                     (string->symbol name)
-                     (make-lexical-token 'identifier #f name))))
+                 (let ((name (string->symbol name)))
+                   (if (memv name keywords)
+                       name
+                       (make-lexical-token 'identifier #f name)))))
 
         ((pp-number? cpp-token)
          => (lambda (content)
               ;; TOOD should return an integer-constant or a floating-constant
-              (make-lexical-token 'constant #f content)))
+              (make-lexical-token 'constant #f (parse-c-number content))))
 
         ((character-constant? cpp-token)
          (lambda (a . _) a)
-         => (lambda content (make-lexical-token 'constant #f content)))
+         => (lambda (encoding . content)
+              (make-lexical-token
+               'constant #f
+               ;; TODO that to do with multi-byte characters?
+               ;; > 'ab' == 'a' << 8 | 'b' == 0x6162
+               ;; > '\x1234' == 0x1234
+               ;; GCC prints 34 for the following expression
+               ;; > printf("%x\n", '\x1234');
+               ;; but 6162 for this
+               ;; > printf("%x\n", 'ab');
+
+               ;; What about
+               ;; > 'a\x1234' == a << 16 | 'b' == 0x611234
+               (let ((bv (concat-bytevectors
+                          ;; TODO this should depend on encoding
+                          (handle-string-fragments content))))
+                 ;; TODO maybe actually store multiple bytes from multi byte literals
+                 (bytevector-u8-ref bv (1- (bytevector-length bv)))))))
 
         ((punctuator-token? cpp-token)
          => (lambda (s)
diff --git a/tests/test/cpp/to-token.scm b/tests/test/cpp/to-token.scm
new file mode 100644
index 00000000..b633ce12
--- /dev/null
+++ b/tests/test/cpp/to-token.scm
@@ -0,0 +1,65 @@
+(define-module (test cpp to-token)
+  :use-module (srfi srfi-64)
+  :use-module (srfi srfi-64 test-error)
+  :use-module (srfi srfi-88)
+  :use-module (c to-token)
+  :use-module ((system base lalr)
+               :select (lexical-token-category
+                        lexical-token-value))
+  :use-module ((c lex2) :select (lex))
+  )
+
+(test-group "string tokens"
+  (let ((v (preprocessing-token->token (car (lex "\"Hello\"")))))
+    (test-equal 'string-literal (lexical-token-category v))
+    (test-equal #vu8(#x48 #x65 #x6C #x6C #x6F 0) (lexical-token-value v))
+    ;; TODO prefixes
+    ))
+
+(test-group "identifier tokens"
+  (let ((v (preprocessing-token->token (car (lex "hello")))))
+    (test-equal 'identifier (lexical-token-category v))
+    (test-equal 'hello (lexical-token-value v))))
+
+(test-group "keywords"
+  (test-equal 'auto (preprocessing-token->token (car (lex "auto")))))
+
+(test-group "numbers"
+  (test-group "Integers"
+    (test-group "Base-10"
+      (let ((v (preprocessing-token->token (car (lex "1")))))
+        (test-equal 'constant (lexical-token-category v))
+        (test-equal 1 (lexical-token-value v))))
+
+    (test-equal "Base-16"
+      16 (lexical-token-value (preprocessing-token->token (car (lex "0x10")))))
+    (test-equal "Base-8"
+      8 (lexical-token-value (preprocessing-token->token (car (lex "010")))))
+    (test-group "Suffixes"
+      'TODO
+      ))
+
+  ;; TODO floats
+  )
+
+(test-group "character constants"
+  (let ((v (preprocessing-token->token (car (lex "'a'")))))
+    (test-equal 'constant (lexical-token-category v))
+    (test-equal (char->integer #\a) (lexical-token-value v)) )
+  (let ((v (preprocessing-token->token (car (lex "'ab'")))))
+    (test-equal 'constant (lexical-token-category v))
+    (test-equal (char->integer #\b) (lexical-token-value v)))
+  (let ((v (preprocessing-token->token (car (lex "'\\x41'")))))
+    (test-equal 'constant (lexical-token-category v))
+    (test-equal #x41 (lexical-token-value v)))
+  ;; (lex "'\\x4142'")
+  ;; (lex "'L\\x4142'")
+  )
+
+(test-group "punctuators"
+  (test-equal '+ (preprocessing-token->token (car (lex "+"))))
+  (test-equal 'lbrace (preprocessing-token->token (car (lex "{")))))
+
+(test-group "other"
+  (test-error 'cpp-error (preprocessing-token->token (car (lex " ")))))
+
-- 
cgit v1.2.3