work

author: Hugo Hörnquist <hugo@lysator.liu.se> 2022-07-14 23:44:03 +0200
committer: Hugo Hörnquist <hugo@lysator.liu.se> 2022-07-14 23:44:03 +0200
commit: 64375c16c8316b8381ad59fa3538ad84732d90b7 (patch)
tree: cec344cb9306f1353ff0c4a4daaf7a81506eef5a /module/c/lex2.scm
parent: Add C LALR parser. (diff)
download: calp-64375c16c8316b8381ad59fa3538ad84732d90b7.tar.gz
calp-64375c16c8316b8381ad59fa3538ad84732d90b7.tar.xz
1 files changed, 41 insertions, 3 deletions
diff --git a/module/c/lex2.scm b/module/c/lex2.scm
index 049cc48c..647eff55 100644
--- a/module/c/lex2.scm
+++ b/module/c/lex2.scm
@@ -1,9 +1,12 @@
 (define-module (c lex2)
   :use-module (ice-9 peg)
   :use-module (ice-9 match)
+  :use-module ((hnh util) :select (->))
   :use-module (hnh util object)
   :use-module (hnh util type)
   :use-module (srfi srfi-88)
+  :use-module ((c trigraph) :select (replace-trigraphs))
+  :use-module ((c line-fold) :select (fold-lines))
   :export (lex
            lexeme lexeme?
            placemaker
@@ -12,6 +15,8 @@
            (noexpand . lexeme-noexpand)
 
            parse-c-number
+
+           tokenize
            ))
 
 ;;; A.1 Lexical grammar
@@ -268,10 +273,15 @@
   (or "[" "]" "(" ")" "{" "}"
       "..."                             ; Moved to be before "."
       "." "->"
-      "++" "--" "&" "*" "+" "-" "~" "!"
-      "/" "%" "<<" ">>" "<" ">" "<=" ">=" "==" "!=" "^" "|" "&&" "||"
+      "&&" "||"
+      "!="
+      "++" "--"
+      "*=" "/=" "%=" "+=" "-=" "<<=" ">>=" "&=" "^=" "|="
+      "<=" ">=" "=="
+      "="
+      "/" "%" "<<" ">>" "<" ">"   "^" "|"
       "?" ":" ";"
-      "=" "*=" "/=" "%=" "+=" "-=" "<<=" ">>=" "&=" "^=" "|="
+      "&" "*" "+" "-" "~" "!"
       "," "##" "#" ; # and ## flipped
       "<:" ":>" "<%" "%>" "%:%:" "%:" ; %: and %:%: flipped
       ))
@@ -284,6 +294,8 @@
 ;; (6.4.7)
 (define-peg-pattern header-name all
   (or (and (ignore "<")  h-string (ignore ">"))
+      ;; NOTE this case will never be reached, since it's treated as a regular
+      ;; string instead
       (and (ignore "\"") q-string (ignore "\""))))
 
 ;; (6.4.7)
@@ -402,3 +414,29 @@
 (define (parse-c-number string)
   (match-pattern constant string))
 
+
+
+
+;;; 5.1.11.2 Translation phases
+
+(define (tokenize string)
+  (-> string
+;;; 1. trigraph replacement
+      replace-trigraphs
+;;; 2. Line folding
+      fold-lines
+;;; 3. Decomposition into preprocenning tokens, whitespaces, and comments
+      lex
+      comments->whitespace))
+
+;; These really belong in (c cpp-types), but that would create a dependency cycle
+
+(define (comment->whitespace token)
+  (if ;; (comment-token? token)
+   (and (lexeme? token)
+        (eq? 'comment (type token)))
+      (car (lex " "))
+      token))
+
+(define (comments->whitespace tokens)
+  (map comment->whitespace tokens))
author	Hugo Hörnquist <hugo@lysator.liu.se>	2022-07-14 23:44:03 +0200
committer	Hugo Hörnquist <hugo@lysator.liu.se>	2022-07-14 23:44:03 +0200
commit	64375c16c8316b8381ad59fa3538ad84732d90b7 (patch)
tree	cec344cb9306f1353ff0c4a4daaf7a81506eef5a /module/c/lex2.scm
parent	Add C LALR parser. (diff)
download	calp-64375c16c8316b8381ad59fa3538ad84732d90b7.tar.gz calp-64375c16c8316b8381ad59fa3538ad84732d90b7.tar.xz