aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorHugo Hörnquist <hugo@lysator.liu.se>2022-07-14 23:44:03 +0200
committerHugo Hörnquist <hugo@lysator.liu.se>2022-07-14 23:44:03 +0200
commit64375c16c8316b8381ad59fa3538ad84732d90b7 (patch)
treecec344cb9306f1353ff0c4a4daaf7a81506eef5a
parentAdd C LALR parser. (diff)
downloadcalp-64375c16c8316b8381ad59fa3538ad84732d90b7.tar.gz
calp-64375c16c8316b8381ad59fa3538ad84732d90b7.tar.xz
work
-rw-r--r--module/c/cpp-types.scm23
-rw-r--r--module/c/cpp-util.scm19
-rw-r--r--module/c/lex2.scm44
-rw-r--r--module/c/preprocessor2.scm185
-rw-r--r--module/c/to-token.scm12
-rw-r--r--tests/test/cpp/lex2.scm54
-rw-r--r--tests/test/cpp/preprocessor2.scm80
7 files changed, 306 insertions, 111 deletions
diff --git a/module/c/cpp-types.scm b/module/c/cpp-types.scm
index b08e9810..6dad061e 100644
--- a/module/c/cpp-types.scm
+++ b/module/c/cpp-types.scm
@@ -2,6 +2,7 @@
:use-module (c lex2)
:use-module (ice-9 match)
:use-module (c cpp-util)
+ :use-module (hnh util type)
:export (whitespace-token?
comment-token?
preprocessing-token?
@@ -15,6 +16,9 @@
h-string-token?
q-string-token?
character-constant?
+ comment->whitespace
+ comments->whitespace
+ make-string-literal
))
(define (whitespace-token? x)
@@ -59,29 +63,32 @@
(`(pp-number ,x) x)
(_ #f))))
-;; TODO this fails if there are multiple components in the string token
;; TODO rename to string-literal-token?
(define (string-token? token)
(and (preprocessing-token? token)
(match (lexeme-body token)
- (`(string-literal ,x) x)
+ (('string-literal x ...) (apply values x))
(_ #f))))
(define (character-constant? token)
(and (preprocessing-token? token)
(match (lexeme-body token)
- (`(character-constant ,x) x)
+ (('character-constant x ...) (apply values x))
(_ #f))))
(define (h-string-token? token)
(and (preprocessing-token? token)
(match (lexeme-body token)
- (`(h-string ,x) x)
+ (`(header-name (h-string ,x)) x)
(_ #f))))
+;; NOTE q-string tokens are never produced by the lexer,
+;; since they instead are treated as regular strings
(define (q-string-token? token)
- (and (preprocessing-token? token)
- (match (lexeme-body token)
- (`(q-string ,x) x)
- (_ #f))))
+ (string-token? token))
+
+(define (make-string-literal parts)
+ (typecheck parts (list-of (or string? list?)))
+ (lexeme type: 'preprocessing-token
+ body: (cons 'string-literal parts)))
diff --git a/module/c/cpp-util.scm b/module/c/cpp-util.scm
index 3ea06505..633c5a0c 100644
--- a/module/c/cpp-util.scm
+++ b/module/c/cpp-util.scm
@@ -15,7 +15,8 @@
drop-whitespace-right
drop-whitespace-both
cleanup-whitespace
- concatenate-tokens))
+ concatenate-tokens
+ merge-string-literals))
;; Does the next non-whitespace token in the stream satisfy the predicate?
@@ -109,3 +110,19 @@
(define (concatenate-tokens a b)
(car (lex (string-append (unlex (list a))
(unlex (list b))))))
+
+
+(define (merge-string-literals tokens)
+ (cond ((null? tokens) '())
+ ((null? (cdr tokens)) tokens)
+ ((string-token? (car tokens))
+ (lambda (a . _) a)
+ => (lambda parts-a
+ (cond ((string-token? (cadr tokens))
+ (lambda (a . _) a)
+ => (lambda parts-b (merge-string-literals
+ (cons (make-string-literal (append parts-a parts-b))
+ (cddr tokens)))))
+ (else (cons (car tokens)
+ (merge-string-literals (cdr tokens)))))))
+ (else (cons (car tokens) (merge-string-literals (cdr tokens))))))
diff --git a/module/c/lex2.scm b/module/c/lex2.scm
index 049cc48c..647eff55 100644
--- a/module/c/lex2.scm
+++ b/module/c/lex2.scm
@@ -1,9 +1,12 @@
(define-module (c lex2)
:use-module (ice-9 peg)
:use-module (ice-9 match)
+ :use-module ((hnh util) :select (->))
:use-module (hnh util object)
:use-module (hnh util type)
:use-module (srfi srfi-88)
+ :use-module ((c trigraph) :select (replace-trigraphs))
+ :use-module ((c line-fold) :select (fold-lines))
:export (lex
lexeme lexeme?
placemaker
@@ -12,6 +15,8 @@
(noexpand . lexeme-noexpand)
parse-c-number
+
+ tokenize
))
;;; A.1 Lexical grammar
@@ -268,10 +273,15 @@
(or "[" "]" "(" ")" "{" "}"
"..." ; Moved to be before "."
"." "->"
- "++" "--" "&" "*" "+" "-" "~" "!"
- "/" "%" "<<" ">>" "<" ">" "<=" ">=" "==" "!=" "^" "|" "&&" "||"
+ "&&" "||"
+ "!="
+ "++" "--"
+ "*=" "/=" "%=" "+=" "-=" "<<=" ">>=" "&=" "^=" "|="
+ "<=" ">=" "=="
+ "="
+ "/" "%" "<<" ">>" "<" ">" "^" "|"
"?" ":" ";"
- "=" "*=" "/=" "%=" "+=" "-=" "<<=" ">>=" "&=" "^=" "|="
+ "&" "*" "+" "-" "~" "!"
"," "##" "#" ; # and ## flipped
"<:" ":>" "<%" "%>" "%:%:" "%:" ; %: and %:%: flipped
))
@@ -284,6 +294,8 @@
;; (6.4.7)
(define-peg-pattern header-name all
(or (and (ignore "<") h-string (ignore ">"))
+ ;; NOTE this case will never be reached, since it's treated as a regular
+ ;; string instead
(and (ignore "\"") q-string (ignore "\""))))
;; (6.4.7)
@@ -402,3 +414,29 @@
(define (parse-c-number string)
(match-pattern constant string))
+
+
+
+;;; 5.1.11.2 Translation phases
+
+(define (tokenize string)
+ (-> string
+;;; 1. trigraph replacement
+ replace-trigraphs
+;;; 2. Line folding
+ fold-lines
+;;; 3. Decomposition into preprocessing tokens, whitespaces, and comments
+ lex
+ comments->whitespace))
+
+;; These really belong in (c cpp-types), but that would create a dependency cycle
+
+(define (comment->whitespace token)
+ (if ;; (comment-token? token)
+ (and (lexeme? token)
+ (eq? 'comment (type token)))
+ (car (lex " "))
+ token))
+
+(define (comments->whitespace tokens)
+ (map comment->whitespace tokens))
diff --git a/module/c/preprocessor2.scm b/module/c/preprocessor2.scm
index c1db3f08..f18ca748 100644
--- a/module/c/preprocessor2.scm
+++ b/module/c/preprocessor2.scm
@@ -9,18 +9,33 @@
:select (function-like-macro variadic? identifier-list))
:use-module ((c cpp-environment object-like-macro) :select (object-like-macro object-like-macro?))
:use-module ((c cpp-environment internal-macro) :select (internal-macro))
- :use-module ((hnh util) :select (-> intersperse aif swap unless unval))
+ :use-module ((hnh util) :select (-> ->> intersperse aif swap unless unval))
:use-module ((hnh util lens) :select (set modify cdr*))
:use-module (hnh util path)
:use-module (hnh util type)
- :use-module ((c lex2) :select (lex placemaker lexeme? lexeme-body lexeme-noexpand))
- :use-module ((c trigraph) :select (replace-trigraphs))
- :use-module ((c line-fold) :select (fold-lines))
+ :use-module ((hnh util values) :select (abort* on-fst on-snd apply/values))
+ :use-module ((c lex2)
+ :select (lex
+ placemaker
+ lexeme?
+ lexeme-body
+ lexeme-noexpand
+
+ tokenize
+ ))
:use-module (c unlex)
:use-module (c cpp-types)
:use-module (c cpp-util)
- :use-module (ice-9 control)
- :export (defined-macro _Pragma-macro))
+ :export (_Pragma-macro
+ defined-macro
+ c-search-path
+ handle-preprocessing-tokens))
+
+
+(define (read-file path)
+ (call-with-input-file path (@ (ice-9 rdelim) read-string)))
+
+
(define-syntax-rule (alist-of variable key-type value-type)
(build-validator-body variable (list-of (pair-of key-type value-type))))
@@ -36,18 +51,10 @@
(define (ellipsis-token? token) (equal? "..." (punctuator-token? token)))
-(define-syntax-rule (abort* form)
- (call-with-values (lambda () form) abort))
-
-(define-syntax-rule (on-fst form)
- (% form
- (lambda (prompt fst . rest)
- (apply values (prompt fst) rest))))
-
-(define-syntax-rule (on-snd form)
- (% form
- (lambda (prompt fst snd . rest)
- (apply values fst (prompt snd) rest))))
+;; TODO
+;; > #if defined X
+;; is equivalent to
+;; > #if defined(X)
;; parameters is a lexeme list, as returned by parse-parameter-list
@@ -335,28 +342,6 @@
identifier: "__LINE__"
body: (lex (number->string (current-line environment)))))))
-(define (c-search-path) (make-parameter (list "." "/usr/include")))
-
-;; #include <stdio.h>
-(define (resolve-h-file string)
- (typecheck string string?)
- (cond ((path-absolute? string) string)
- (else
- (let ((filename
- (find file-exists?
- (map (lambda (path-prefix)
- (path-append path-prefix string))
- (c-search-path)))))
- (if filename filename
- (scm-error 'cpp-error "resolve-h-file"
- "Can't resolve file: ~s"
- (list string) #f))))))
-
-;; #include "myheader.h"
-(define (resolve-q-file string)
- (typecheck string string?)
- ;; This should always be a fallback (6.10.2, p. 3)
- (cond (else (resolve-h-file string))))
(define defined-macro
(internal-macro
@@ -412,9 +397,20 @@
environment))))
-(define (resolve-constant-expression tokens)
+;; 6.10.1 p. 4
+(define (resolve-constant-expression cpp-tokens)
;; (typecheck tokens (list-of lexeme?))
+ (define zero (car (lex "0")))
+ #;
+ (define tokens
+ (map preprocessing-token->token
+ (map (lambda (token)
+ (cond ((identifier-token? token) zero)
+ (else token)))
+ (remove whitespace-token? tokens))))
+
'TODO
+ ;; eval as per 6.6
)
@@ -446,13 +442,12 @@
(on-snd (cons (car tokens) (abort* (loop (bump-line environment) (cdr tokens))))))
((and (identifier-token? (car tokens))
(not (marked-noexpand? (car tokens))))
- (call-with-values
- (lambda () (maybe-extend-identifier environment
- (identifier-token? (car tokens))
- (lexeme-noexpand (car tokens))
- (cdr tokens)))
- ;; Here is the after expansion
- (if once? values loop)))
+ ;; Here is the loop after expansion
+ (apply/values (if once? values loop)
+ (maybe-extend-identifier environment
+ (identifier-token? (car tokens))
+ (lexeme-noexpand (car tokens))
+ (cdr tokens))))
(else (on-snd (cons (car tokens) (abort* (loop environment (cdr tokens)))))))))
@@ -490,7 +485,37 @@
identifier)
remaining-tokens)))))
-(define (resolve-and-include-header environment tokens)
+;; 'gcc -xc -E -v /dev/null' prints GCC:s search path
+(define c-search-path
+ (make-parameter (list "/usr/include"
+ "/usr/local/include")))
+
+;; #include <stdio.h>
+(define (resolve-h-file string)
+ (typecheck string string?)
+ (cond
+ ;; NOTE do I want this case?
+ ;; GCC has it
+ ((path-absolute? string) string)
+ (else
+ (or
+ (find file-exists?
+ (map (lambda (path-prefix)
+ (path-append path-prefix string))
+ (c-search-path)))
+ (scm-error 'cpp-error "resolve-h-file"
+ "Can't resolve file: ~s"
+ (list string) #f)))))
+
+;; #include "myheader.h"
+(define (resolve-q-file string)
+ (typecheck string string?)
+ (cond ((file-exists? string) string)
+ ;; This should always be a fallback (6.10.2, p. 3)
+ (else (resolve-h-file string))))
+
+
+(define (resolve-header environment tokens)
(typecheck environment cpp-environment?)
;; (typecheck tokens (list-of lexeme?))
@@ -499,21 +524,17 @@
(string-append msg ", tokens: ~s")
(append args (list (unlex tokens))) #f))))
(let loop ((%first-time #t) (tokens tokens))
- (cond ((null? tokens) '())
+ (cond ((null? tokens) (err "Invalid #include line"))
((h-string-token? (car tokens))
=> (lambda (str)
(unless (null? (drop-whitespace (cdr tokens)))
(err "Unexpected tokens after #include <>"))
- (handle-preprocessing-tokens
- environment
- (-> str resolve-h-file read-file tokenize))))
+ (resolve-h-file str)))
((q-string-token? (car tokens))
=> (lambda (str)
(unless (null? (drop-whitespace (cdr tokens)))
(err "Unexpected tokens after #include \"\""))
- (handle-preprocessing-tokens
- environment
- (-> str resolve-q-file read-file tokenize))))
+ (resolve-q-file str)))
(else
(unless %first-time (err "Failed parsing tokens"))
;; No newlines in #include
@@ -623,11 +644,23 @@
(body (drop-whitespace (cdr line-tokens))))
(if (eq? 'include directive)
;; include is special since it returns a token stream
- (call-with-values
- (lambda () (resolve-and-include-header environment body))
- (lambda (environment tokens)
- (loop environment
- (append tokens remaining-tokens))))
+ (let ((path (resolve-header environment body)))
+ ;; TODO change to store source location in lexemes
+ ;; and rewrite the following to
+ ;; (loop environment
+ ;; (append (-> path read-file tokenize) remaining-tokens))
+ ;; TODO and then transfer these source locations when we move
+ ;; to "real" tokens (c to-token)
+ (let ((env* tokens*
+ (loop
+ ;; same hack as at start of loop
+ (-> environment
+ (enter-file path)
+ (bump-line -1))
+ (append (lex "\n")
+ (-> path read-file tokenize)))))
+ (on-snd (append tokens* (abort* (loop (leave-file env*) remaining-tokens))))))
+
(let ((operation ; (environment, list token) → environment
(case directive
((if) resolve-for-if)
@@ -674,32 +707,6 @@
(unless (null? remaining-tokens) (lex "\n"))
(abort* (loop env* remaining-tokens))))))))))
- (else (err "Unexpected middle of line")))))
-
-
-
-(define (read-file path)
- (call-with-input-file path (@ (ice-9 rdelim) read-string)))
+ (else (err "Unexpected middle of line, (near ~s)"
+ (unlex tokens))))))
-(define (comment->whitespace token)
- (if (comment-token? token)
- (car (lex " "))
- token))
-
-(define (comments->whitespace tokens)
- (map comment->whitespace tokens))
-
-;;; 5.1.11.2 Translation phases
-
-(define (tokenize string)
- (-> string
-;;; 1. trigraph replacement
- replace-trigraphs
-;;; 2. Line folding
- fold-lines
-;;; 3. Decomposition into preprocenning tokens, whitespaces, and comments
- lex
-;;; 4. Execution of preprocessing directives, all preprocessing directives are then deleted
- comments->whitespace
- ;; squeeze-whitespace-blocks
- ))
diff --git a/module/c/to-token.scm b/module/c/to-token.scm
index c1efcc02..53db7e59 100644
--- a/module/c/to-token.scm
+++ b/module/c/to-token.scm
@@ -19,9 +19,16 @@
"_Generic" "_Imaginary" "_Noreturn" "_Static_assert"
"_Thread_local"))
+;; 6.4 paragraph 2
+;; Each preprocessing token that is converted to a token shall have the
+;; lexical form of a keyword, an identifier, a constant, a string literal,
+;; or a punctuator
(define (preprocessing-token->token cpp-token)
+ ;; Guile's cond handles multiple values from expr, if written in the form
+ ;; (cond (expr check => proc) ...)
(cond ((string-token? cpp-token)
- => (lambda (content)
+ (lambda (a . _) a)
+ => (lambda content
(make-lexical-token 'string-literal #f content)))
((identifier-token? cpp-token)
@@ -36,7 +43,8 @@
(make-lexical-token 'constant #f content)))
((character-constant? cpp-token)
- => (lambda (x) (make-lexical-token 'constant #f x)))
+ (lambda (a . _) a)
+ => (lambda content (make-lexical-token 'constant #f content)))
((punctuator-token? cpp-token)
=> (lambda (s)
diff --git a/tests/test/cpp/lex2.scm b/tests/test/cpp/lex2.scm
index 47bb4a16..b7087c3b 100644
--- a/tests/test/cpp/lex2.scm
+++ b/tests/test/cpp/lex2.scm
@@ -92,3 +92,57 @@
(lexeme type: 'preprocessing-token body: '(punctuator "."))
(lexeme type: 'preprocessing-token body: '(identifier "dir")))
(lex "..\\listing.dir"))
+
+
+(test-equal "Propper H-string"
+ (list (lexeme type: 'preprocessing-token body: '(header-name (h-string "a"))))
+ (lex "<a>"))
+
+(test-equal "Unexpected h-string"
+ (list (lexeme type: 'preprocessing-token body: '(pp-number "1"))
+ (lexeme type: 'whitespace body: " ")
+ (lexeme type: 'preprocessing-token body: '(header-name (h-string " 2 ")))
+ (lexeme type: 'whitespace body: " ")
+ (lexeme type: 'preprocessing-token body: '(pp-number "3")))
+ (lex "1 < 2 > 3"))
+
+(test-equal "Quotation mark inside h-string"
+ (list (lexeme type: 'preprocessing-token body: '(header-name (h-string "a\"b"))))
+ (lex "<a\"b>"))
+
+(test-equal "Interaction of h-strings and regular strings"
+ (test-equal "Less than string, not h-string"
+ (list (lexeme type: 'preprocessing-token body: '(pp-number "1"))
+ (lexeme type: 'preprocessing-token body: '(string-literal "<"))
+ (lexeme type: 'preprocessing-token body: '(punctuator ">")))
+ (lex "1\"<\">"))
+
+ (test-equal "H-string, not string"
+ (list (lexeme type: 'preprocessing-token body: '(pp-number "1"))
+ (lexeme type: 'preprocessing-token body: '(header-name (h-string "\"")))
+ (lexeme type: 'other body: "\""))
+ (lex "1<\">\"")))
+
+(test-equal "Q-strings are lexed as regular strings"
+ (list (lexeme type: 'preprocessing-token body: '(punctuator "#"))
+ (lexeme type: 'preprocessing-token body: '(identifier "include"))
+ (lexeme type: 'whitespace body: " ")
+ (lexeme type: 'preprocessing-token body: '(string-literal "test")))
+ ;; # include here, since generated tokens could possibly depend on that context,
+ ;; and the reason regular strings are returned is since the lexer doesn't check
+ ;; that context
+ (lex "#include \"test\"")
+ )
+
+
+
+(test-group "Unicode"
+ (test-equal "In string literals"
+ (list (lexeme type: 'preprocessing-token body: '(string-literal "åäö")))
+ (lex "\"åäö\""))
+
+ (test-equal "Outside string literals"
+ (list (lexeme type: 'other body: "å")
+ (lexeme type: 'other body: "ä")
+ (lexeme type: 'other body: "ö"))
+ (lex "åäö")))
diff --git a/tests/test/cpp/preprocessor2.scm b/tests/test/cpp/preprocessor2.scm
index 9f308c9e..71e0c1a0 100644
--- a/tests/test/cpp/preprocessor2.scm
+++ b/tests/test/cpp/preprocessor2.scm
@@ -38,7 +38,7 @@
:use-module (c lex2)
)
-;; TODO Redefinition code isn't yet written
+;; TODO Redefinition checking code isn't yet written
(test-skip "Example 6")
;; See (c preprocessor2) TODO#1
@@ -63,8 +63,11 @@
(define parse-parameter-list (@@ (c preprocessor2) parse-parameter-list))
(define resolve-define (@@ (c preprocessor2) resolve-define))
(define resolve-token-stream (@@ (c preprocessor2) resolve-token-stream))
-(define tokenize (@@ (c preprocessor2) tokenize))
-
+;; (define tokenize (@@ (c preprocessor2) tokenize))
+(define resolve-h-file (@@ (c preprocessor2) resolve-h-file))
+(define resolve-q-file (@@ (c preprocessor2) resolve-q-file))
+(define resolve-header (@@ (c preprocessor2) resolve-header))
+;; (define include-header (@@ (c preprocessor2) include-header))
;; Remove the noexpand list from each token.
@@ -78,6 +81,16 @@
(let ((env tokens (handle-preprocessing-tokens env (tokenize str))))
(drop-whitespace-both (remove-noexpand tokens))))
+ (define (call-with-tmp-header string proc)
+ (let* ((filename (string-copy "/tmp/headerfile-XXXXXXX"))
+ (port (mkstemp! filename)))
+ (with-output-to-port port
+ (lambda () (display string)
+ ))
+ (close-port port)
+ (proc filename)))
+
+
(test-group "Tokens until End Of Line"
(call-with-values
@@ -578,6 +591,7 @@
+
(test-group "Line directive"
(let ((e (make-environment)))
(test-equal "#line <number>"
@@ -595,10 +609,60 @@
(lex "x"))))))
-;; TODO
-;; resolve-h-file
-;; resolve-q-file
-;; include
+;; NOTE these tests assume a "regular" Unix system
+(test-group "#include"
+ (test-group "Resolve header paths"
+ (test-equal "Find in path"
+ "/usr/include/stdio.h"
+ (resolve-h-file "stdio.h"))
+
+ (test-error "Fail if not in path"
+ 'cpp-error
+ (resolve-h-file "This file doesn't exist"))
+
+ (test-equal "Q-string with absolute path"
+ "/dev/null"
+ (resolve-q-file "/dev/null"))
+ (test-error "Q-File fails for missing file"
+ 'cpp-error (resolve-q-file "This file doesn't exists"))
+
+ (test-equal "Q-strings also look in path"
+ "/usr/include/stdio.h"
+ (resolve-q-file "stdio.h")))
+
+ (test-group "resolve-header returns paths from pp tokens (from #include directive)"
+ (test-equal "H-string"
+ "/usr/include/stdio.h"
+ (resolve-header (make-environment)
+ (lex "<stdio.h>")))
+ (test-equal "Q-string"
+ "/usr/include/stdio.h"
+ (resolve-header (make-environment)
+ (lex "\"stdio.h\""))))
+
+ ;; TODO #include is subject to macro expansion
+ ;; test with through resolve-header
+
+
+ (test-group "Actually including stuff"
+ (call-with-tmp-header "
+#define X 10
+int x;
+" (lambda (filename)
+ (test-equal "Include through #include"
+ (lex "int x;\n\n10")
+ (run (format #f "
+#include \"~a\"
+X
+" filename))))))
+
+ ;; NOTE should really be below "regular" __LINE__ tests
+ (call-with-tmp-header "__LINE__" (lambda (path)
+ (test-equal "__LINE__ in other file"
+ (lex "1")
+ (run (format #f "#include \"~a\"\n" path))))))
+
+
@@ -941,7 +1005,7 @@ g(x+(3,4)-w) | h 5) & m
p() i[q()] = { q(1), r(2,3), r(4,), r(,5), r(,) };
char c[2][6] = { str(hello), str() };"))))
- ;; TODO Example 4 skipped due to #include
+ ;; TODO Example 4 skipped due to #include in output
(test-equal "Example 5"
(unlex-aggressive (lex "int j[] = { 123, 45, 67, 89, 10, 11, 12, };"))