1 files changed, 549 insertions, 0 deletions
diff --git a/module/c/lex2.scm b/module/c/lex2.scm
new file mode 100644
index 00000000..af90dcce
--- /dev/null
+++ b/module/c/lex2.scm
@@ -0,0 +1,549 @@
+(define-module (c lex2)
+  :use-module (ice-9 peg)
+  :use-module (ice-9 match)
+  :use-module ((hnh util) :select (->))
+  :use-module (hnh util object)
+  :use-module (hnh util type)
+  :use-module ((srfi srfi-1) :select (fold))
+  :use-module (srfi srfi-88)
+  :use-module ((c trigraph) :select (replace-trigraphs))
+  :use-module ((c line-fold) :select (fold-lines))
+  :export (lex
+           lexeme lexeme?
+           placemaker
+           (type . lexeme-type)
+           (body . lexeme-body)
+           (noexpand . lexeme-noexpand)
+
+           parse-c-number
+
+           tokenize
+           ))
+
+;;; A.1 Lexical grammar
+;;; A.1.1 Lexical elements
+
+;; (6.4)
+(define-peg-pattern token all
+  (or keyword
+      identifier
+      constant
+      string-literal
+      punctuator
+      ))
+
+;; (6.4)
+(define-peg-pattern preprocessing-token all
+  ;; string literal moved before header-name since string literals
+  ;; otherwise became q-strings
+  (or string-literal
+      header-name
+      character-constant
+      identifier
+      pp-number
+      punctuator
+      ;; Each non-white-space character that cannot be one of the above
+      ))
+
+;;; A.1.2 Keywords
+
+;; (6.4.1)
+(define-peg-pattern keyword all
+  (or "auto" "break" "case" "char" "const" "continue" "default"
+      "do" "double" "else" "enum" "extern" "float" "for" "goto"
+      "if" "inline" "int" "long" "register" "restrict" "return"
+      "short" "signed" "sizeof" "static" "struct" "switch"
+      "typedef" "union" "unsigned" "void" "volatile" "while"
+      "_Alignas" "_Alignof" "_Atomic" "_Bool" "_Complex"
+      "_Generic" "_Imaginary" "_Noreturn" "_Static_assert"
+      "_Thread_local"))
+
+;;; A.1.3 Identifiers
+
+;; (6.4.2.1)
+(define-peg-pattern identifier all
+  (and identifier-nondigit (* (or identifier-nondigit digit))))
+
+;; (6.4.2.1)
+(define-peg-pattern identifier-nondigit body
+  (or nondigit
+      universal-character-name
+      ;; TODO other implementation-defined characters
+      ))
+
+;; (6.4.2.1)
+(define-peg-pattern nondigit body
+  (or "_"
+      (range #\A #\Z)
+      (range #\a #\z)))
+
+;; (6.4.2.1)
+(define-peg-pattern digit body
+  (range #\0 #\9))
+
+;;; A.1.4 Universal character names
+
+;; (6.4.3)
+(define-peg-pattern universal-character-name all
+  (or (and (ignore "\\u") hex-quad)
+      (and (ignore "\\U") hex-quad hex-quad)))
+
+;; (6.4.3)
+(define-peg-pattern hex-quad body
+  (and hexadecimal-digit hexadecimal-digit
+       hexadecimal-digit hexadecimal-digit))
+
+;;; A.1.5 Constants
+
+;; (6.4.4)
+(define-peg-pattern constant all
+  ;; Int and float swapped from standard since we need to try parsing
+  ;; the floats beforehand
+  (or floating-constant
+      integer-constant
+      enumeration-constant
+      character-constant))
+
+;; (6.4.4.1)
+(define-peg-pattern integer-constant all
+  (and (or decimal-constant
+           hexadecimal-constant
+           octal-constant)
+       (? integer-suffix)))
+
+;; (6.4.4.1)
+(define-peg-pattern decimal-constant all
+  (and nonzero-digit (* digit)))
+
+;; (6.4.4.1)
+(define-peg-pattern octal-constant all
+  (and "0" (* octal-digit)))
+
+;; (6.4.4.1)
+(define-peg-pattern hexadecimal-constant all
+  (and hexadecimal-prefix (+ hexadecimal-digit)))
+
+;; (6.4.4.1)
+(define-peg-pattern hexadecimal-prefix none
+  (or "0x" "0X"))
+
+;; (6.4.4.1)
+(define-peg-pattern nonzero-digit body
+  (range #\1 #\9))
+
+;; (6.4.4.1)
+(define-peg-pattern octal-digit body
+  (range #\0 #\7))
+
+;; (6.4.4.1)
+(define-peg-pattern hexadecimal-digit body
+  (or (range #\0 #\9)
+      (range #\a #\f)
+      (range #\A #\F)))
+
+;; (6.4.4.1)
+(define-peg-pattern integer-suffix all
+  (or (and unsigned-suffix (? long-suffix))
+      (and long-suffix (? unsigned-suffix))))
+
+;; (6.4.4.1)
+;; This is a merger of long-suffix and long-long-suffix
+(define-peg-pattern long-suffix body
+  (or "l" "L" "ll" "LL"))
+
+;; (6.4.4.1)
+(define-peg-pattern unsigned-suffix body
+  (or "u" "U"))
+
+;; (6.4.4.2)
+(define-peg-pattern floating-constant all
+  (or decimal-floating-constant
+      hexadecimal-floating-constant))
+
+;; (6.4.4.2)
+(define-peg-pattern decimal-floating-constant all
+  (or (and fractional-constant (? exponent-part) (? floating-suffix))
+      (and digit-sequence exponent-part (? floating-suffix))))
+
+;; (6.4.4.2)
+(define-peg-pattern hexadecimal-floating-constant all
+  (and hexadecimal-prefix
+       (or hexadecimal-fractional-constant
+           hexadecimal-digit-sequence)
+       binary-exponent-part
+       (? floating-suffix)))
+
+;; (6.4.4.2)
+(define-peg-pattern fractional-constant all
+  (or (and (? digit-sequence) "." digit-sequence)
+      (and digit-sequence ".")))
+
+;; (6.4.4.2)
+(define-peg-pattern exponent-part all
+  (and (or "e" "E") (? sign) digit-sequence))
+
+;; (6.4.4.2)
+(define-peg-pattern sign all
+  (or "+" "-"))
+
+;; (6.4.4.2)
+(define-peg-pattern digit-sequence body
+  (+ digit))
+
+;; (6.4.4.2)
+(define-peg-pattern hexadecimal-fractional-constant all
+  (or (and (? hexadecimal-digit-sequence) "." hexadecimal-digit-sequence)
+      (and    hexadecimal-digit-sequence  ".")))
+
+;; (6.4.4.2)
+(define-peg-pattern binary-exponent-part all
+  (and (ignore (or "p" "P"))
+       (? sign)
+       digit-sequence))
+
+;; (6.4.4.2)
+(define-peg-pattern hexadecimal-digit-sequence body
+  (+ hexadecimal-digit))
+
+;; (6.4.4.2)
+(define-peg-pattern floating-suffix all
+  (or "f" "l" "F" "L"))
+
+;; (6.4.4.3)
+(define-peg-pattern enumeration-constant all
+  identifier)
+
+(define-peg-pattern character-prefix all
+  (or "L" "u" "U"))
+
+;; (6.4.4.4)
+(define-peg-pattern character-constant all
+  (and (? character-prefix)
+       (ignore "'")
+       (+ c-char)
+       (ignore "'")))
+
+;; (6.4.4.4)
+(define-peg-pattern c-char body
+  (or (and (not-followed-by (or "'" "\\" "\n"))  peg-any)
+      escape-sequence))
+
+;; (6.4.4.4)
+(define-peg-pattern escape-sequence all
+  (or simple-escape-sequence
+      octal-escape-sequence
+      hexadecimal-escape-sequence
+      universal-character-name))
+
+;; (6.4.4.4)
+(define-peg-pattern simple-escape-sequence all
+  (and (ignore "\\") (or "'" "\"" "?" "\\"
+                         "a" "b" "f" "n" "r" "t" "v")))
+
+;; (6.4.4.4)
+(define-peg-pattern octal-escape-sequence all
+  (and (ignore "\\") octal-digit (? octal-digit) (? octal-digit)))
+
+;; (6.4.4.4)
+(define-peg-pattern hexadecimal-escape-sequence all
+  (and (ignore "\\x") (+ hexadecimal-digit)))
+
+;; A.1.6 String literals
+
+;; (6.4.5)
+(define-peg-pattern string-literal all
+  (and (? encoding-prefix)
+       (ignore "\"")
+       (* s-char)
+       (ignore "\"")))
+
+;; (6.4.5)
+(define-peg-pattern encoding-prefix all
+  (or "u8" "u" "U" "L"))
+
+;; (6.4.5)
+(define-peg-pattern s-char body
+  (or (and (not-followed-by (or "\"" "\\" "\n")) peg-any)
+      escape-sequence))
+
+;;; A.1.7
+
+;; (6.4.6)
+(define-peg-pattern punctuator all
+  (or "[" "]" "(" ")" "{" "}"
+      "..."                             ; Moved to be before "."
+      "." "->"
+      "&&" "||"
+      "!="
+      "++" "--"
+      "*=" "/=" "%=" "+=" "-=" "<<=" ">>=" "&=" "^=" "|="
+      "<=" ">=" "=="
+      "="
+      "/" "%" "<<" ">>" "<" ">"   "^" "|"
+      "?" ":" ";"
+      "&" "*" "+" "-" "~" "!"
+      "," "##" "#" ; # and ## flipped
+      "<:" ":>" "<%" "%>" "%:%:" "%:" ; %: and %:%: flipped
+      ))
+
+;;; A.1.8 Header names
+
+(define-peg-pattern h-string all (+ h-char))
+(define-peg-pattern q-string all (+ q-char))
+
+;; (6.4.7)
+(define-peg-pattern header-name all
+  (or (and (ignore "<")  h-string (ignore ">"))
+      ;; NOTE this case will never be reached, since it's treated as a regular
+      ;; string instead
+      (and (ignore "\"") q-string (ignore "\""))))
+
+;; (6.4.7)
+(define-peg-pattern h-char body
+  (or (and (not-followed-by (or ">" "\n")) peg-any)
+      escape-sequence))
+
+;; (6.4.7)
+(define-peg-pattern q-char body
+  (or (and (not-followed-by (or "\"" "\n")) peg-any)
+      escape-sequence))
+
+;;; A.1.9 Preprocessing numbers
+
+;; (6.4.8)
+(define-peg-pattern pp-number all
+  (and (? ".") digit
+       (* (or digit
+              identifier-nondigit
+              (and (or "e" "E" "p" "P")
+                   sign)
+              "."))))
+
+
+
+(define-peg-pattern whitespace all
+  (or "\t" "\n" "\v" "\f" " "
+      ;; "\r"
+      ))
+
+(define-peg-pattern block-comment body
+  (and (ignore "/*")
+       (* (and (not-followed-by "*/")
+               peg-any))
+       (ignore "*/")))
+
+(define-peg-pattern line-comment body
+  (and (ignore "//")
+       (* (and (not-followed-by "\n")
+               peg-any))))
+
+(define-peg-pattern comment all
+  (or line-comment block-comment))
+
+(define-peg-pattern non-whitespace all
+  (and (not-followed-by whitespace)
+       peg-any))
+
+(define-peg-pattern preprocessing-tokens all
+  (* (or whitespace
+         comment
+         preprocessing-token
+         non-whitespace)))
+
+
+
+;; comment could be merged with whitespace, but then unlex would have to know that
+
+;; other is the "each non-white-space character that cannot be one of the above"
+;; clause from 6.4 p. 1
+
+(define-type (lexeme)
+  (type type: (memv '(whitespace comment preprocessing-token other placemaker)))
+  (body type: (or string? list?))
+  (noexpand type: (list-of string?)
+            default: '()))
+
+(define (placemaker)
+  (lexeme type: 'placemaker body: '()))
+
+(define (lex-output->lexeme-object x)
+  (match x
+    (`(non-whitespace ,body)
+     (lexeme body: body type: 'other))
+    (`(whitespace ,body)
+     (lexeme body: body type: 'whitespace ))
+    (`(comment ,body)
+     (lexeme body: body type: 'comment ))
+    (`(preprocessing-token ,body)
+     (match body
+       ('string-literal
+        ;; Unflatten case
+        (lexeme body: '(string-literal (encoding-prefix) "")
+                type: 'preprocessing-token))
+       (('string-literal `(encoding-prefix ,px) args ...)
+        (lexeme body: `(string-literal (encoding-prefix . ,px) ,@args)
+                type: 'preprocessing-token))
+       (('string-literal args ...)
+        (lexeme body: `(string-literal (encoding-prefix) ,@args)
+                type: 'preprocessing-token))
+       (('character-constant `(character-prefix ,px) args ...)
+        (lexeme body: `(character-constant (character-prefix . ,px)
+                                           ,@args)
+                type: 'preprocessing-token))
+       (('character-constant args ...)
+        (lexeme body: `(character-constant (character-prefix) ,@args)
+                type: 'preprocessing-token))
+       (body (lexeme body: body type: 'preprocessing-token))))
+
+    ;; "unflatten"
+    ('comment (lexeme body: "" type: 'comment))))
+
+
+
+
+;; At a number of places I chose token depending on the order of the rule. The
+;; standard however says that the longest possible choice should be used.
+;; 6.4 p. 4
+
+;; returns a list of lexemes
+(define (lex string)
+  (if (string-null? string)
+      '()
+      (map lex-output->lexeme-object
+           (let ((result (match-pattern preprocessing-tokens string)))
+             (let ((trailing (substring (peg:string result)
+                                        (peg:end result))))
+               (unless (string-null? trailing)
+                 (scm-error 'cpp-lex-error "lex"
+                            "Failed to lex string, remaining trailing characters: ~s"
+                            (list trailing) #f)))
+             (unless (list? (peg:tree result))
+               (scm-error 'cpp-lex-error "lex"
+                          "Parsing just failed. Chars: ~s"
+                          (list (peg:string result)) #f))
+             (cdr (peg:tree result))))))
+
+
+
+
+
+;; (parse-decimals "555" 10)
+;; ⇒ 0.5549999999999999
+;; (parse-decimals "8" 16)
+;; ⇒ 0.5
+(define (parse-decimals str base)
+  (/ (fold (lambda (digit done)
+             (let ((v (string->number digit base)))
+               (+ v (/ done base))))
+           0.0
+           (map string (string->list str)))
+     base))
+
+;; parse a number on form <digits>.<digits>
+(define (parse-fractional str base)
+  (let* ((pair (string-split str #\.))
+         (integer (list-ref pair 0))
+         (decimals (list-ref pair 1)))
+    (+ (if (string-null? integer)
+           0 (string->number integer 16))
+       (if (string-null? decimals)
+           0 (parse-decimals decimals 16)))))
+
+
+(define (parse-float body)
+  (define (fractional-constant x)
+    (case x
+      ((decimal-floating-constant)                 'fractional-constant)
+      ((hexadecimal-floating-constant) 'hexadecimal-fractional-constant)))
+
+  (define (exponent-part x)
+    (case x
+      ((decimal-floating-constant)            'exponent-part)
+      ((hexadecimal-floating-constant) 'binary-exponent-part)))
+
+  (define (expt-base x)
+    (case x
+      ((decimal-floating-constant)     10)
+      ((hexadecimal-floating-constant)  2)))
+
+  (define (base x)
+    (case x
+      ((decimal-floating-constant)     10)
+      ((hexadecimal-floating-constant) 16)))
+
+  (let ((type (car body))
+        (body (cdr body)))
+    (* 1.0
+       (cond ((assoc-ref body (fractional-constant type))
+              => (lambda (fc) (parse-fractional (car fc) (base type))))
+             (else (string->number (car body) (base type))))
+       (cond ((assoc-ref body (exponent-part type))
+              => (lambda (x) (expt (expt-base type)
+                              (string->number (car x) (base type)))))
+             (else 1)))
+    ;; TODO do something with (possible) suffix
+    ;; (assoc-ref body 'floating-suffix)
+    ))
+
+(define (parse-integer body)
+  (let* (;; (suffix (assoc-ref body 'integer-suffix))
+         (value (cadr (car body)))
+         (value-type (car (car body))))
+    ;; TODO do something with suffix
+    (string->number
+     value
+     (case value-type
+       ((octal-constant)        8)
+       ((decimal-constant)     10)
+       ((hexadecimal-constant) 16)))))
+
+;; (parse-c-number "0x1.8p0")
+;; ⇒ 1.5
+
+;; TODO is "5ul" equivalent to "((unsigned long) 5)"
+(define (parse-c-number string)
+  (cond ((match-pattern constant string)
+         => (lambda (m)
+              (let ((m (cadr (peg:tree m))))       ; Strip 'constant wrapper
+                (case (car m)
+                  ((floating-constant)
+                   (parse-float (cadr m)))
+
+                  ((integer-constant)
+                   (parse-integer (cdr m)))
+
+                  ((enumeration-constant character-constant)
+                   (scm-error 'misc-error "parse-c-number"
+                              "Couldn't parse [~a] as a /number/ (~s)"
+                              (list string m) #f))))))
+
+        (else (scm-error 'misc-error "parse-c-number"
+                         "Couldn't parse [~a] as a number"
+                         (list string) #f))))
+
+
+
+
+;;; 5.1.11.2 Translation phases
+
+(define (tokenize string)
+  (-> string
+;;; 1. trigraph replacement
+      replace-trigraphs
+;;; 2. Line folding
+      fold-lines
+;;; 3. Decomposition into preprocenning tokens, whitespaces, and comments
+      lex
+      comments->whitespace))
+
+;; These really belong in (c cpp-types), but that would create a dependency cycle
+
+(define (comment->whitespace token)
+  (if ;; (comment-token? token)
+   (and (lexeme? token)
+        (eq? 'comment (type token)))
+      (car (lex " "))
+      token))
+
+(define (comments->whitespace tokens)
+  (map comment->whitespace tokens))