work

author: Hugo Hörnquist <hugo@lysator.liu.se> 2022-07-13 04:39:14 +0200
committer: Hugo Hörnquist <hugo@lysator.liu.se> 2022-07-13 04:39:14 +0200
commit: c1cf0693982d9c1f1b871966752140fee5d76d19 (patch)
tree: 65cf5c501f5b2029eb08bed7fa9844b9530385f3 /module
parent: Resolve # ## # (diff)
download: calp-c1cf0693982d9c1f1b871966752140fee5d76d19.tar.gz
calp-c1cf0693982d9c1f1b871966752140fee5d76d19.tar.xz
5 files changed, 171 insertions, 72 deletions
diff --git a/module/c/cpp-types.scm b/module/c/cpp-types.scm
index 555120d6..1df70594 100644
--- a/module/c/cpp-types.scm
+++ b/module/c/cpp-types.scm
@@ -53,6 +53,7 @@
          (`(pp-number ,x) x)
          (_ #f))))
 
+;; TODO this fails if there are multiple components in the string token
 (define (string-token? token)
   (and (preprocessing-token? token)
        (match (lexeme-body token)
diff --git a/module/c/cpp-util.scm b/module/c/cpp-util.scm
index fff3cc9e..7969ccd5 100644
--- a/module/c/cpp-util.scm
+++ b/module/c/cpp-util.scm
@@ -1,11 +1,15 @@
 (define-module (c cpp-util)
   :use-module ((srfi srfi-1) :select (drop-while break))
+  :use-module (srfi srfi-71)
   :use-module ((hnh util) :select (->))
   :use-module (hnh util type)
+  :use-module ((hnh util lens) :select (modify ref))
   :use-module ((c lex2) :select (lex lexeme?))
   :use-module ((c unlex) :select (unlex))
   :use-module (c cpp-types)
   :export (tokens-until-eol
+           tokens-until-cpp-directive
+           next-token-matches?
            squeeze-whitespace
            drop-whitespace
            drop-whitespace-right
@@ -13,6 +17,20 @@
            cleanup-whitespace
            concatenate-tokens))
 
+
+;; Does the next non-whitespace token in the stream satisfy the predicate?
+(define (next-token-matches? predicate tokens)
+  (let ((tokens (drop-whitespace tokens)))
+    (if (null? tokens)
+        #f
+        (predicate (car tokens)))))
+
+(define (next-token-matches/line? predicate tokens)
+  (let ((tokens (drop-whitespace/line tokens)))
+    (if (null? tokens)
+        #f
+        (predicate (car tokens)))))
+
 ;; Returns two values:
 ;; - tokens until a newline token is met
 ;; - (potentially the newline token) and the remaining tokens
@@ -20,6 +38,24 @@
   (typecheck tokens (list-of lexeme?))
   (break newline-token? tokens))
 
+;; call predicate with the remaining token stream, until we run out of token, or
+;; predicate matches
+(define (break-lexemes predicate lex-list)
+  (let loop ((rem lex-list) (done '()))
+    (cond ((null? rem)     (values (reverse done) '()))
+          ((predicate rem) (values (reverse done) rem))
+          (else (loop (cdr rem) (cons (car rem) done))))))
+
+;; Finds the next instance of "\n#" (possibly with inbetween whitespace)
+;; and return the values before and after (inclusive)
+(define (tokens-until-cpp-directive tokens)
+  (break-lexemes
+   (lambda (tokens)
+     (and (newline-token? (car tokens))
+          (next-token-matches/line?
+           (lambda (token) (equal? "#" (punctuator-token? token)))
+           (cdr tokens))))
+   tokens))
 
 ;; Replace all whitespace with single spaces.
 (define (squeeze-whitespace tokens)
@@ -41,6 +77,13 @@
   (typecheck tokens (list-of lexeme?))
   (drop-while whitespace-token? tokens))
 
+(define (drop-whitespace/line tokens)
+  (typecheck tokens (list-of lexeme?))
+  (drop-while (lambda (t)
+                (and (whitespace-token? t)
+                     (not (newline-token? t))))
+              tokens))
+
 (define (drop-whitespace-right tokens)
   (typecheck tokens (list-of lexeme?))
   (-> tokens reverse drop-whitespace reverse))
diff --git a/module/c/lex2.scm b/module/c/lex2.scm
index 652aa6c1..fcddcdc4 100644
--- a/module/c/lex2.scm
+++ b/module/c/lex2.scm
@@ -368,4 +368,15 @@
   (if (string-null? string)
       '()
       (map lex-output->lexeme-object
-           (cdr (peg:tree (match-pattern preprocessing-tokens string))))))
+           (let ((result (match-pattern preprocessing-tokens string)))
+             (let ((trailing (substring (peg:string result)
+                                        (peg:end result))))
+               (unless (string-null? trailing)
+                 (scm-error 'cpp-lex-error "lex"
+                            "Failed to lex string, remaining trailing characters: ~s"
+                            (list trailing) #f)))
+             (unless (list? (peg:tree result))
+               (scm-error 'cpp-lex-error "lex"
+                          "Parsing just failed. Chars: ~s"
+                          (list (peg:string result)) #f))
+             (cdr (peg:tree result))))))
diff --git a/module/c/preprocessor2.scm b/module/c/preprocessor2.scm
index 4678ded7..d7bf3b64 100644
--- a/module/c/preprocessor2.scm
+++ b/module/c/preprocessor2.scm
@@ -9,7 +9,7 @@
                :select (function-like-macro variadic? identifier-list))
   :use-module ((c cpp-environment object-like-macro) :select (object-like-macro object-like-macro?))
   :use-module ((c cpp-environment internal-macro) :select (internal-macro))
-  :use-module ((hnh util) :select (-> intersperse aif swap unless))
+  :use-module ((hnh util) :select (-> intersperse aif swap unless unval))
   :use-module ((hnh util lens) :select (set modify cdr*))
   :use-module (hnh util path)
   :use-module (hnh util type)
@@ -19,7 +19,8 @@
   :use-module (c unlex)
   :use-module (c cpp-types)
   :use-module (c cpp-util)
-  :export (defined-macro))
+  :use-module (ice-9 control)
+  :export (defined-macro _Pragma-macro))
 
 (define-syntax-rule (alist-of variable key-type value-type)
   (build-validator-body variable (list-of (pair-of key-type value-type))))
@@ -34,6 +35,21 @@
 (define (comma-token? token)             (equal? ","   (punctuator-token? token)))
 (define (ellipsis-token? token)          (equal? "..." (punctuator-token? token)))
 
+
+(define-syntax-rule (abort* form)
+  (call-with-values (lambda () form) abort))
+
+(define-syntax-rule (on-fst form)
+  (% form
+     (lambda (prompt fst . rest)
+       (apply values (prompt fst) rest))))
+
+(define-syntax-rule (on-snd form)
+  (% form
+     (lambda (prompt fst snd . rest)
+       (apply values fst (prompt snd) rest))))
+
+
 ;; parameters is a lexeme list, as returned by parse-parameter-list
 (define (build-parameter-map macro parameters)
   (typecheck macro cpp-macro?)
@@ -157,7 +173,9 @@
                            (if (or (concat-token? last)
                                    (next-token-matches? concat-token? tokens))
                                replacement
-                               (resolve-token-stream environment replacement once?: #t))
+                               ;; resolve-token-stream only modifies environment by updating current line
+                               ;; that can't happen in a macro body
+                               ((unval resolve-token-stream 1) environment replacement once?: #t))
                            (loop (cdr tokens) #f))))))
               ((whitespace-token? (car tokens))
                (cons (car tokens) (loop (cdr tokens) last)))
@@ -190,13 +208,6 @@
 
   (let ((name (macro-identifier macro)))
     (cond ((object-macro? macro)
-           ;; #define f(a) f(x * (a))
-           ;; #define w 0,1
-           ;; #define m(a) a(w)
-           ;; m(f)
-           ;; ⇒ f(0,1)
-           ;; instead of expected
-           ;; f(2 * (0,1))
            (values environment (append (fold (swap mark-noexpand)
                                              (expand## (macro-body macro))
                                              (cons name noexpand-list))
@@ -211,17 +222,23 @@
                                        (cons name noexpand-list))
                                  remaining)))
                (values environment
+                       ;; TODO#1 the token shouldn't be expanded here, but it should neither be marked no-expand?
+                       ;; Consider the case
+                       ;; #define m(a) a(0,1)
+                       ;; #define f(a) f(2 * (a))
+                       ;; m(f)
                        (append (mark-noexpand (lex (macro-identifier macro)) (macro-identifier macro))
                                remaining-tokens))))
 
           ((internal-macro? macro)
            (if (next-token-matches? left-parenthesis-token? remaining-tokens)
                (let ((containing remaining newlines (parse-parameter-list remaining-tokens)))
-                 (values (bump-line environment newlines)
-                         (append (fold (swap mark-noexpand)
-                                       ((macro-body macro) environment containing)
-                                       (cons name noexpand-list))
-                                 remaining)))
+                 (let ((env* tokens* ((macro-body macro) environment containing)))
+                   (values (bump-line env* newlines)
+                           (append (fold (swap mark-noexpand)
+                                         tokens*
+                                         (cons name noexpand-list))
+                                   remaining))))
                (values environment
                        (append (mark-noexpand (lex (macro-identifier macro)) (macro-identifier macro))
                                remaining-tokens))))
@@ -256,13 +273,6 @@
 
 
 
-(define (next-token-matches? predicate tokens)
-  (let ((tokens (drop-whitespace tokens)))
-    (if (null? tokens)
-        #f
-        (predicate (car tokens)))))
-
-
 ;; returns three values:
 ;; - a list of tokens where each is a parameter to the function like macro
 ;; - the remaining tokenstream
@@ -356,17 +366,24 @@
                                      (not null?)))
            (aif (identifier-token? (car (list-ref arguments 0)))
                 (let ((in-env (boolean->c-boolean (in-environment? environment it))))
-                  (lex (number->string in-env)))
+                  (values environment (lex (number->string in-env))))
                 (scm-error 'cpp-error "defined"
                            "Invalid parameter list to `defined': ~s"
                            (list arguments) #f)))))
 
-;; (lex "STDC FP_CONTRACT ON")
-;; (#<<lexeme> type: preprocessing-token body: (identifier "STDC") noexpand: ()>
-;;  #<<lexeme> type: whitespace body: " " noexpand: ()>
-;;  #<<lexeme> type: preprocessing-token body: (identifier "FP_CONTRACT") noexpand: ()>
-;;  #<<lexeme> type: whitespace body: " " noexpand: ()>
-;;  #<<lexeme> type: preprocessing-token body: (identifier "ON") noexpand: ()>)
+(define _Pragma-macro
+  (internal-macro
+   identifier: "_Pragma"
+   body: (lambda (environment arguments)
+           (typecheck arguments (and (list-of (list-of lexeme?))
+                                     (not null?)))
+           (aif (string-token? (caar arguments))
+                (values (handle-pragma environment (lex it)) '())
+                (scm-error 'cpp-pragma-error "_Pragma"
+                           "Invalid argument to _Pragma: ~s"
+                           (list (car arguments)) #f)))))
+
+
 
 ;; environment, tokens → environment
 (define (handle-pragma environment tokens)
@@ -395,13 +412,6 @@
            environment))))
 
 
-;; TODO
-;; (define _Pragma-macro
-;;   (internal-macro
-;;    identifier: "_Pragma"
-;;    body: (lambda (environment tokens)
-;;            )))
-
 (define (resolve-constant-expression tokens)
   (typecheck tokens (list-of lexeme?))
   'TODO
@@ -424,26 +434,28 @@
 
 ;; Expands a token-stream which contains no pre-processing directives (#if:s, ...)
 ;; If @var{once?} is true then the resulting body won't be scanned again for tokens to expand
+;; environment, tokens, [boolean] → environment, tokens
 (define* (resolve-token-stream environment tokens key: once?)
   (typecheck environment cpp-environment?)
   (typecheck tokens (list-of lexeme?))
   ;; (pprint-environment environment)
   ;; (format (current-error-port) "~a~%~%" (unlex tokens))
   (let loop ((environment environment) (tokens tokens))
-    (cond ((null? tokens) '())
-          ((car tokens)
-           (lambda (x) (and (identifier-token? x)
-                       (not (marked-noexpand? x))))
-           => (lambda (token)
-                (call-with-values
-                    (lambda () (maybe-extend-identifier environment
-                                                   (identifier-token? token)
-                                                   (lexeme-noexpand token)
-                                                   (cdr tokens)))
-                  ;; Here is the after expansion
-                  (if once? (lambda (_ t) t) loop))))
-          (else (cons (car tokens)
-                      (loop environment (cdr tokens)))))))
+    (cond ((null? tokens) (values environment '()))
+          ((newline-token? (car tokens))
+           (on-snd (cons (car tokens) (abort* (loop (bump-line environment) (cdr tokens))))))
+          ((and (identifier-token? (car tokens))
+                (not (marked-noexpand? (car tokens))))
+           (call-with-values
+               (lambda () (maybe-extend-identifier environment
+                                              (identifier-token? (car tokens))
+                                              (lexeme-noexpand (car tokens))
+                                              (cdr tokens)))
+             ;; Here is the after expansion
+             (if once? values loop)))
+          (else (on-snd (cons (car tokens) (abort* (loop environment (cdr tokens)))))))))
+
+
 
 ;; returns a new environment
 ;; handle body of #if
@@ -453,7 +465,8 @@
   (typecheck tokens (list-of lexeme?))
 
   (-> (extend-environment environment defined-macro)
-      (resolve-token-stream tokens)
+      ;; no newlines in #if line
+      ((unval resolve-token-stream 1) tokens)
       resolve-constant-expression
       c-boolean->boolean
       (if (enter-active-if environment)
@@ -503,7 +516,8 @@
                    (-> str resolve-q-file read-file tokenize))))
             (else
              (unless %first-time (err "Failed parsing tokens"))
-             (loop #f (resolve-token-stream environment tokens)))))))
+             ;; No newlines in #include
+             (loop #f ((unval resolve-token-stream 1) environment tokens)))))))
 
 ;; environment, tokens → environment
 (define (handle-line-directive environment tokens*)
@@ -517,17 +531,19 @@
       (cond ((null? tokens))
             ((number-token? (car tokens))
              => (lambda (line)
-                  (let ((line (string->number line)))
-                    (let ((remaining (drop-whitespace (cdr tokens))))
-                      (cond ((null? remaining) (set environment current-line (1- line)))
-                            ((string-token? (car remaining))
-                             => (lambda (file)
-                                  (-> environment
-                                      (set current-line (1- line))
-                                      (set current-file file))))
-                            (%first-time (loop #f (resolve-token-stream environment tokens)))
-                            (else (err)))))))
-            (%first-time (loop #f (resolve-token-stream environment tokens)))
+                  (let ((line (string->number line))
+                        (remaining (drop-whitespace (cdr tokens))))
+                    (cond ((null? remaining) (set environment current-line (1- line)))
+                          ((string-token? (car remaining))
+                           => (lambda (file)
+                                (-> environment
+                                    (set current-line (1- line))
+                                    (set current-file file))))
+                          ;; no newlines in #line
+                          (%first-time (loop #f ((unval resolve-token-stream 1) environment tokens)))
+                          (else (err))))))
+            ;; no newlines in #line
+            (%first-time (loop #f ((unval resolve-token-stream 1) environment tokens)))
             (else (err))))))
 
 ;; environment, tokens → environment
@@ -634,15 +650,16 @@
                                              remaining-tokens)))))))))
 
                    ;; Line is not a pre-processing directive
-                   (else (let ((line-tokens remaining-tokens (tokens-until-eol (cdr tokens))))
-                           (let ((env* tokens* (loop environment remaining-tokens)))
-                             (values env*
-                                     (append (unless (in-comment-block? environment)
-                                               (resolve-token-stream environment line-tokens))
+                   (else (let ((preceding-tokens remaining-tokens (tokens-until-cpp-directive (cdr tokens))))
+                           (let* ((env* resolved-tokens (if (in-comment-block? environment)
+                                                            (values environment '())
+                                                            (resolve-token-stream environment preceding-tokens))))
+                             (on-snd (append resolved-tokens
                                              ;; The initial newline is presreved here, for better output,
                                              ;; and to keep at least one whitespace token when there was one previously.
                                              ;; possibly also keep a newline for line-directives.
-                                             (append (unless (null? remaining-tokens) (lex "\n")) tokens*)))))))))
+                                             (unless (null? remaining-tokens) (lex "\n"))
+                                             (abort* (loop env* remaining-tokens))))))))))
 
           (else (err "Unexpected middle of line")))))
 
diff --git a/module/c/unlex.scm b/module/c/unlex.scm
index 18e800d9..e3d36f86 100644
--- a/module/c/unlex.scm
+++ b/module/c/unlex.scm
@@ -5,6 +5,7 @@
   :use-module (c cpp-types)
   :use-module (c cpp-util)
   :export (unlex
+           unlex-aggressive
            stringify-token
            stringify-tokens))
 
@@ -24,11 +25,37 @@
                 ((whitespace-token? x) " ")))
         (squeeze-whitespace tokens))))
 
+(define (stringify-escape-sequence sub-token)
+  (match sub-token
+    (`(simple-escape-sequence ,x)
+     (format #f "\\~a" x))
+    (`(octal-escape-sequence ,x)
+     (format #f "\\~a" x))
+    (`(hexadecimal-escape-sequence ,x)
+     (format #f "\\x~a" x))
+    (`(universal-character-name ,x)
+     (case (string-length x)
+       ((4) (format #f "\\u~a" x))
+       ((8) (format #f "\\U~a" x))))))
+
+(define (stringify-string-tokens fragments)
+  (with-output-to-string
+    (lambda ()
+      (display #\")
+      (for-each (match-lambda
+                  (`(escape-sequence ,x)
+                   (display (stringify-escape-sequence x)))
+                  (s (display s)))
+                fragments)
+      (display #\"))))
+
 ;; Returns the "source" of the token, as a preprocessing string literal token
 (define (stringify-token preprocessing-token)
   (match (lexeme-body preprocessing-token)
-    (`(string-literal ,s)
-     (format #f "~s" s))
+    (('string-literal `(encoding-prefix ,prefix) parts ...)
+     (stringify-string-tokens parts))
+    (('string-literal parts ...)
+     (stringify-string-tokens parts))
     (`(header-name (q-string ,s))
      (format #f "~s" s))
     (`(header-name (h-string ,s))
author	Hugo Hörnquist <hugo@lysator.liu.se>	2022-07-13 04:39:14 +0200
committer	Hugo Hörnquist <hugo@lysator.liu.se>	2022-07-13 04:39:14 +0200
commit	c1cf0693982d9c1f1b871966752140fee5d76d19 (patch)
tree	65cf5c501f5b2029eb08bed7fa9844b9530385f3 /module
parent	Resolve # ## # (diff)
download	calp-c1cf0693982d9c1f1b871966752140fee5d76d19.tar.gz calp-c1cf0693982d9c1f1b871966752140fee5d76d19.tar.xz