diff options
Diffstat (limited to 'module/c')
-rw-r--r-- | module/c/compiler.scm | 64 | ||||
-rw-r--r-- | module/c/cpp-environment.scm | 215 | ||||
-rw-r--r-- | module/c/cpp-environment/function-like-macro.scm | 25 | ||||
-rw-r--r-- | module/c/cpp-environment/internal-macro.scm | 11 | ||||
-rw-r--r-- | module/c/cpp-environment/object-like-macro.scm | 18 | ||||
-rw-r--r-- | module/c/cpp-types.scm | 94 | ||||
-rw-r--r-- | module/c/cpp-util.scm | 130 | ||||
-rw-r--r-- | module/c/cpp.scm | 95 | ||||
-rw-r--r-- | module/c/eval.scm | 265 | ||||
-rw-r--r-- | module/c/eval/environment.scm | 34 | ||||
-rw-r--r-- | module/c/eval2.scm | 20 | ||||
-rw-r--r-- | module/c/lex.scm | 52 | ||||
-rw-r--r-- | module/c/lex2.scm | 549 | ||||
-rw-r--r-- | module/c/line-fold.scm | 29 | ||||
-rw-r--r-- | module/c/operators.scm | 3 | ||||
-rw-r--r-- | module/c/parse.scm | 411 | ||||
-rw-r--r-- | module/c/parse2.scm | 554 | ||||
-rw-r--r-- | module/c/preprocessor.scm | 370 | ||||
-rw-r--r-- | module/c/preprocessor2.scm | 752 | ||||
-rw-r--r-- | module/c/to-token.scm | 161 | ||||
-rw-r--r-- | module/c/trigraph.scm | 24 | ||||
-rw-r--r-- | module/c/unlex.scm | 84 | ||||
-rw-r--r-- | module/c/zipper.scm | 60 |
23 files changed, 3882 insertions, 138 deletions
diff --git a/module/c/compiler.scm b/module/c/compiler.scm new file mode 100644 index 00000000..09d49578 --- /dev/null +++ b/module/c/compiler.scm @@ -0,0 +1,64 @@ +(define-module (c compiler) + :use-module ((c lex2) :select (lex)) + :use-module ((c trigraph) :select (replace-trigraphs)) + :use-module ((c line-fold) :select (fold-lines)) + :use-module (c cpp-environment object-like-macro) + :use-module ((c cpp-environment) + :select (make-environment + extend-environment + enter-file)) + :use-module (hnh util) + ;; TODO importort + ;; handle-preprocessing-tokens + ;; load-and-tokenize-file + :export (run-compiler)) + +" +#define __STDC__ 1 +#define __STDC_HOSTED__ 1 +#define __STDC_VERSION__ 201112L +" + +(define now (localtime (current-time))) +(define default-macros + (list + ;; 6.10.8 + (object-like-macro + identifier: "__STDC__" + body: (lex "1")) + (object-like-macro + identifier: "__STDC_HOSTED__" + body: (lex "1")) + (object-like-macro + identifier: "__STDC_VERSION__" + body: (lex "201112L")) + (object-like-macro + identifier: "__DATE__" + ;; TODO format should always be in + ;; english, and not tranlated + body: (lex (strftime "\"%b %_d %Y\"" now))) + (object-like-macro + identifier: "__TIME__" + body: (lex (strftime "\"%H:%M:%S\"" now))))) + +(define environment + (-> (make-environment) + (extend-environment default-macros))) + + + +;;; 5.1.11.2 Translation phases + + + +(define (run-compiler path) + (define environment (enter-file (make-environment) path)) + (-> (load-and-tokenize-file path) + (handle-preprocessing-tokens environment)) +;;; 5. (something with character sets) +;;; 6. concatenation of string literals +;;; 7. Whitespace tokens are discarded, each preprocessing token is converted into a token + ;; 6.4 paragraph 2 + ;; Each preprocessing toket thas is converted to a token shall have the lexcal form of a keyword, an identifier, a constant, a string literal, or a puncturtor +;;; 8. All external objects and functions are resolved + ) diff --git a/module/c/cpp-environment.scm b/module/c/cpp-environment.scm new file mode 100644 index 00000000..a6401e71 --- /dev/null +++ b/module/c/cpp-environment.scm @@ -0,0 +1,215 @@ +(define-module (c cpp-environment) + :use-module (srfi srfi-1) + :use-module (srfi srfi-88) + :use-module (ice-9 hash-table) + :use-module (hnh util object) + :use-module (hnh util type) + :use-module (hnh util lens) + :use-module ((c cpp-environment function-like-macro) :prefix #{fun:}#) + :use-module ((c cpp-environment object-like-macro) :prefix #{obj:}#) + :use-module ((c cpp-environment internal-macro) :prefix #{int:}#) + :use-module ((c unlex) :select (unlex)) + :export ( + + macro-identifier + macro-body + macro-identifier-list + macro-variadic? + cpp-macro? + ;; pprint-macro + + enter-active-if + enter-inactive-if + flip-flop-if + leave-if + in-conditional/active? + in-conditional/inactive? + in-conditional? + + enter-file + leave-file + bump-line + current-line + current-file + + function-macro? + object-macro? + internal-macro? + + cpp-environment + cpp-environment? + cpp-if-status + ;; cpp-variables + cpp-file-stack + + make-environment in-environment? + remove-identifier add-identifier + get-identifier + extend-environment + disjoin-macro + + pprint-environment + pprint-macro + )) + +(define (macro-identifier x) + (define identifier + (cond ((obj:object-like-macro? x) obj:identifier) + ((fun:function-like-macro? x) fun:identifier) + ((int:internal-macro? x) int:identifier) + (else (scm-error 'wrong-type-arg "macro-identifier" + "Not a macro: ~s" + (list x) #f)))) + (identifier x)) + + +(define (macro-body-proc macro) + (cond ((obj:object-like-macro? macro) obj:body) + ((fun:function-like-macro? macro) fun:body) + ((int:internal-macro? macro) int:body) + (else (scm-error 'wrong-type-arg "macro-body" + "Not a macro: ~s" + (list macro) #f)))) + +(define macro-body + (case-lambda ((macro) ((macro-body-proc macro) macro)) + ((macro value) ((macro-body-proc macro) macro value)))) + +(define macro-identifier-list fun:identifier-list) +(define macro-variadic? fun:variadic?) + +(define function-macro? fun:function-like-macro?) +(define object-macro? obj:object-like-macro?) +(define internal-macro? int:internal-macro?) + +(define (cpp-macro? x) + (or (obj:object-like-macro? x) + (fun:function-like-macro? x) + (int:internal-macro? x))) + + + + +(define-type (cpp-environment) + (cpp-if-status type: (list-of (memv '(outside active-if inactive-if))) + default: '(outside)) + ;; not exported since type signatures don't hold inside the hash table + ;; TODO replace hash table with something that doesn't require copying the + ;; entire structure every time + (cpp-variables type: hash-table? default: (make-hash-table)) + (cpp-file-stack type: (and (not null?) + (list-of (pair-of string? exact-integer?))) + default: '(("*outside*" . 1)))) + + + + +(define (enter-active-if environment) + (modify environment cpp-if-status xcons 'active-if)) + +(define (enter-inactive-if environment) + (modify environment cpp-if-status xcons 'inactive-if)) + +;; for #else +(define (flip-flop-if environment) + ((if (in-conditional/inactive? environment) + enter-active-if + enter-inactive-if) + (leave-if environment))) + +(define (leave-if environment) + (modify environment cpp-if-status cdr)) + +(define (in-conditional/inactive? environment) + (eq? 'inactive-if (get environment cpp-if-status car*))) + +(define (in-conditional/active? environment) + (eq? 'active-if (get environment cpp-if-status car*))) + +(define (in-conditional? environment) + (or (in-conditional/inactive? environment) + (in-conditional/active? environment))) + + + +(define (enter-file environment filename) + (modify environment cpp-file-stack xcons (cons filename 1))) + +(define (leave-file environment) + (modify environment cpp-file-stack cdr)) + +(define current-line (compose-lenses cpp-file-stack car* cdr*)) + +(define current-file (compose-lenses cpp-file-stack car* car*)) + +(define* (bump-line environment optional: (count 1)) + (modify environment current-line + count)) + + + + +(define (make-environment) (cpp-environment)) + +(define (clone-hash-table ht) + (alist->hash-table (hash-map->list cons ht))) + +(define (clone-environment environment) + (modify environment cpp-variables clone-hash-table)) + +(define (in-environment? environment key) + (hash-get-handle (cpp-variables environment) key)) + +(define (remove-identifier environment key) + (typecheck key string?) + + (let ((environment (clone-environment environment))) + (hash-remove! (cpp-variables environment) key) + environment)) + +(define (add-identifier environment key value) + (typecheck key string?) + (typecheck value cpp-macro?) + + (let ((environment (clone-environment environment))) + (hash-set! (cpp-variables environment) key value) + environment)) + +(define (get-identifier environment key) + (hash-ref (cpp-variables environment) key)) + + +(define (extend-environment environment macros) + (typecheck macros (list-of cpp-macro?)) + (fold (lambda (m env) (add-identifier env (macro-identifier m) m)) + environment macros)) + +(define (disjoin-macro environment name) + (typecheck name string?) + (remove-identifier environment name)) + + + + +(define* (pprint-environment environment optional: (port (current-error-port))) + (display "== Environment ==\n") + (hash-for-each (lambda (key macro) + (pprint-macro macro port) + (newline port)) + (cpp-variables environment))) + +(define* (pprint-macro x optional: (p (current-output-port))) + (cond ((internal-macro? x) + (format p "/* ~a INTERNAL MACRO */" + (macro-identifier x))) + ((object-macro? x) + (format p "#define ~a ~a" + (macro-identifier x) + (unlex (macro-body x)))) + ((function-macro? x) + (format p "#define ~a(~a) ~a" + (macro-identifier x) + (string-join (append (macro-identifier-list x) + (if (macro-variadic? x) + '("...") '())) + "," 'infix) + (unlex (macro-body x)))))) diff --git a/module/c/cpp-environment/function-like-macro.scm b/module/c/cpp-environment/function-like-macro.scm new file mode 100644 index 00000000..59b47c9c --- /dev/null +++ b/module/c/cpp-environment/function-like-macro.scm @@ -0,0 +1,25 @@ +(define-module (c cpp-environment function-like-macro) + :use-module (hnh util object) + :use-module (hnh util type) + :use-module ((c lex2) :select (lexeme?)) + :use-module ((c unlex) :select (unlex)) + :export (function-like-macro + function-like-macro? + identifier + identifier-list + body + variadic?)) + +(define-type (function-like-macro + printer: (lambda (r p) + (format p "#<#define ~a~a ~a>" + (identifier r) + (append (identifier-list r) + (if (variadic? r) + '("...") '())) + (unlex (body r))))) + (identifier type: string?) + (identifier-list type: (list-of string?)) + (body type: (list-of lexeme?)) + (variadic? type: boolean? + default: #f)) diff --git a/module/c/cpp-environment/internal-macro.scm b/module/c/cpp-environment/internal-macro.scm new file mode 100644 index 00000000..3c946738 --- /dev/null +++ b/module/c/cpp-environment/internal-macro.scm @@ -0,0 +1,11 @@ +(define-module (c cpp-environment internal-macro) + :use-module (hnh util object) + :export (internal-macro + internal-macro? + identifier body)) + +(define-type (internal-macro) + (identifier type: string?) + (body type: procedure? + ;; Arity 2 + )) diff --git a/module/c/cpp-environment/object-like-macro.scm b/module/c/cpp-environment/object-like-macro.scm new file mode 100644 index 00000000..90a3ad23 --- /dev/null +++ b/module/c/cpp-environment/object-like-macro.scm @@ -0,0 +1,18 @@ +(define-module (c cpp-environment object-like-macro) + :use-module (hnh util object) + :use-module (hnh util type) + :use-module ((c lex2) :select (lexeme?)) + :use-module ((c unlex) :select (unlex)) + :export (object-like-macro + object-like-macro? + identifier + body)) + + +(define-type (object-like-macro + printer: (lambda (r p) + (format p "#<#define ~a ~a>" + (identifier r) + (unlex (body r))))) + (identifier type: string?) + (body type: (list-of lexeme?))) diff --git a/module/c/cpp-types.scm b/module/c/cpp-types.scm new file mode 100644 index 00000000..6dad061e --- /dev/null +++ b/module/c/cpp-types.scm @@ -0,0 +1,94 @@ +(define-module (c cpp-types) + :use-module (c lex2) + :use-module (ice-9 match) + :use-module (c cpp-util) + :use-module (hnh util type) + :export (whitespace-token? + comment-token? + preprocessing-token? + other-token? + placemaker-token? + newline-token? + identifier-token? + punctuator-token? + pp-number? + string-token? + h-string-token? + q-string-token? + character-constant? + comment->whitespace + comments->whitespace + make-string-literal + )) + +(define (whitespace-token? x) + (and (lexeme? x) + (eq? 'whitespace (lexeme-type x)))) + +(define (comment-token? x) + (and (lexeme? x) + (eq? 'comment (lexeme-type x)))) + +(define (preprocessing-token? x) + (and (lexeme? x) + (eq? 'preprocessing-token (lexeme-type x)))) + +(define (other-token? x) + (and (lexeme? x) + (eq? 'other (lexeme-type x)))) + +(define (placemaker-token? x) + (and (lexeme? x) + (eq? 'placemaker (lexeme-type x)))) + +(define (newline-token? x) + (and (whitespace-token? x) + (string=? "\n" (lexeme-body x)))) + +(define (identifier-token? token) + (and (preprocessing-token? token) + (match (lexeme-body token) + (`(identifier ,id) id) + (_ #f)))) + +(define (punctuator-token? token) + (and (preprocessing-token? token) + (match (lexeme-body token) + (`(punctuator ,x) x) + (_ #f)))) + +(define (pp-number? token) + (and (preprocessing-token? token) + (match (lexeme-body token) + (`(pp-number ,x) x) + (_ #f)))) + +;; TODO rename to string-literal-token? +(define (string-token? token) + (and (preprocessing-token? token) + (match (lexeme-body token) + (('string-literal x ...) (apply values x)) + (_ #f)))) + +(define (character-constant? token) + (and (preprocessing-token? token) + (match (lexeme-body token) + (('character-constant x ...) (apply values x)) + (_ #f)))) + + +(define (h-string-token? token) + (and (preprocessing-token? token) + (match (lexeme-body token) + (`(header-name (h-string ,x)) x) + (_ #f)))) + +;; NOTE q-string tokens are never produced by the lexer, +;; since they instead are treated as regular strings +(define (q-string-token? token) + (string-token? token)) + +(define (make-string-literal parts) + (typecheck parts (list-of (or string? list?))) + (lexeme type: 'preprocessing-token + body: (cons 'string-literal parts))) diff --git a/module/c/cpp-util.scm b/module/c/cpp-util.scm new file mode 100644 index 00000000..9674317b --- /dev/null +++ b/module/c/cpp-util.scm @@ -0,0 +1,130 @@ +(define-module (c cpp-util) + :use-module ((srfi srfi-1) :select (drop-while break)) + :use-module (srfi srfi-71) + :use-module ((hnh util) :select (->)) + :use-module (hnh util type) + :use-module ((hnh util lens) :select (modify ref)) + :use-module ((c lex2) :select (lex lexeme?)) + :use-module ((c unlex) :select (unlex)) + :use-module (c cpp-types) + :export (tokens-until-eol + tokens-until-cpp-directive + next-token-matches? + squeeze-whitespace + drop-whitespace + drop-whitespace-right + drop-whitespace-both + cleanup-whitespace + concatenate-tokens + merge-string-literals)) + + +;; Does the next non-whitespace token in the stream satisfy the predicate? +(define (next-token-matches? predicate tokens) + (let ((tokens (drop-whitespace tokens))) + (if (null? tokens) + #f + (predicate (car tokens))))) + +(define (next-token-matches/line? predicate tokens) + (let ((tokens (drop-whitespace/line tokens))) + (if (null? tokens) + #f + (predicate (car tokens))))) + +;; Returns two values: +;; - tokens until a newline token is met +;; - (potentially the newline token) and the remaining tokens +(define (tokens-until-eol tokens) + ;; (typecheck tokens (list-of lexeme?)) + (break newline-token? tokens)) + +;; call predicate with the remaining token stream, until we run out of token, or +;; predicate matches +(define (break-lexemes predicate lex-list) + (let loop ((rem lex-list) (done '())) + (cond ((null? rem) (values (reverse done) '())) + ((predicate rem) (values (reverse done) rem)) + (else (loop (cdr rem) (cons (car rem) done)))))) + +;; Finds the next instance of "\n#" (possibly with inbetween whitespace) +;; and return the values before and after (inclusive) +(define (tokens-until-cpp-directive tokens) + (break-lexemes + (lambda (tokens) + (and (newline-token? (car tokens)) + (next-token-matches/line? + (lambda (token) (equal? "#" (punctuator-token? token))) + (cdr tokens)))) + tokens)) + +;; Replace all whitespace with single spaces. +(define (squeeze-whitespace tokens) + (cond ((null? tokens) '()) + ((null? (cdr tokens)) + (list + (if (whitespace-token? (car tokens)) + (car (lex " ")) + (car tokens)))) + ((and (whitespace-token? (car tokens)) + (whitespace-token? (cadr tokens))) + (squeeze-whitespace (cons (car (lex " ")) + (cddr tokens)))) + (else (cons (car tokens) + (squeeze-whitespace (cdr tokens)))))) + +;; Drop leading whitespace tokens +(define (drop-whitespace tokens) + ;; (typecheck tokens (list-of lexeme?)) + (drop-while whitespace-token? tokens)) + +(define (drop-whitespace/line tokens) + ;; (typecheck tokens (list-of lexeme?)) + (drop-while (lambda (t) + (and (whitespace-token? t) + (not (newline-token? t)))) + tokens)) + +(define (drop-whitespace-right tokens) + ;; (typecheck tokens (list-of lexeme?)) + (-> tokens reverse drop-whitespace reverse)) + +(define (drop-whitespace-both tokens) + ;; (typecheck tokens (list-of lexeme?)) + (-> tokens + drop-whitespace + drop-whitespace-right)) + +;; helper procedure to parse-parameter-list. +;; If a parameter is complex then whitespace is kept, but squeezed to single spaces. Surounding whitespace is removed. +;; Example: +;; #define str(x, y) #y +;; str(x, ( 2, 4 ) ) +;; expands to: +;; "( 2, 4 )" +;; 6.10.3.2 p 2 +(define (cleanup-whitespace tokens) + ;; (typecheck tokens (list-of lexeme?)) + (-> tokens drop-whitespace-both squeeze-whitespace)) + +(define (concatenate-tokens a b) + (car (lex (string-append (unlex (list a)) + (unlex (list b)))))) + + +(define (merge-string-literals tokens) + (cond ((null? tokens) '()) + ((null? (cdr tokens)) tokens) + ((string-token? (car tokens)) + (lambda (a . _) a) + => (lambda (prefix-a . parts-a) + (cond ((string-token? (cadr tokens)) + (lambda (a . _) a) + => (lambda (prefix-b . parts-b) + (merge-string-literals + ;; TODO check validity of prefixes + (cons (make-string-literal (cons prefix-a (append parts-a parts-b))) + (cddr tokens))))) + (else (cons (car tokens) + (merge-string-literals (cdr tokens))))))) + (else (cons (car tokens) (merge-string-literals (cdr tokens)))))) diff --git a/module/c/cpp.scm b/module/c/cpp.scm index a2935352..aed496f2 100644 --- a/module/c/cpp.scm +++ b/module/c/cpp.scm @@ -5,47 +5,37 @@ :use-module (ice-9 match) :use-module (ice-9 regex) :use-module ((rnrs io ports) :select (call-with-port)) + :use-module ((rnrs bytevectors) :select (bytevector?)) :use-module (ice-9 format) :use-module ((hnh util io) :select (read-lines)) :use-module (hnh util graph) :use-module (c lex) :use-module (c parse) :use-module (c operators) - :export (do-funcall replace-symbols include#) + :export (replace-symbols include#) ) ;; input "#define F(x, y) x + y" -;; 1 full define | F(x, y) +;; 1 full define | F(x,y) ;; 2 macro name | F -;; 3 macro args | (x, y) -;; 4 macro body | x + y -(define define-re (make-regexp "^#define ((\\w+)(\\([^)]*\\))?) (.*)")) +;; 3 macro args | (x,y) +;; 5 macro body | x + y or #f +(define define-re (make-regexp "^#define ((\\w+)([(][^)]*[)])?)( (.*))?")) (define (tokenize-define-line header-line) (aif (regexp-exec define-re header-line) (cons (match:substring it 1) - (match:substring it 4)) + (let ((body (match:substring it 5))) + (if (or (eqv? body #f) + (string-null? body)) + "1" body))) (scm-error 'c-parse-error "tokenize-define-line" "Line dosen't match: ~s" (list header-line) #f))) -(define (do-funcall function arguments) - (if (list? arguments) - (apply function arguments) - (function arguments))) - -(define symb-map - `((,(symbol #\|) . logior) - (funcall . (@ (c cpp) do-funcall)) - (&& . and) - (& . logand) - (== . =) - (!= . (negate =)) - )) - (define (replace-symbols tree dict) (if (not (list? tree)) (or (assoc-ref dict tree) tree) @@ -55,12 +45,27 @@ ;; Direct values. Lisp also has quoted symbols in this group. (define (immediate? x) (or (number? x) - (char? x) - (string? x))) + (bytevector? x))) +;; TODO replace this with something sensible +;; like a correct list extracted from (c eval) +;; and not thinging that types are variables ;; built in symbols. Should never be marked as dependencies (define (primitive? x) - (memv x (cons 'funcall binary-operators))) + (memv x `( + ;; language primitives + sizeof + + ;; special forms introduced by parser + funcall ternary struct-type as-type + + ;; unary operatons which aren't also binary operators + ++ -- ! ~ + not compl dereference pointer + pre-increment pre-decrement + post-increment post-decrement + ,@binary-operators + ))) @@ -77,7 +82,6 @@ [arg (list arg)])) (define right (f (cdr pair))) - (define alt-right (replace-symbols right symb-map)) (define dependencies (lset-difference eq? @@ -91,12 +95,12 @@ dependencies (match left [('funcall name ('#{,}# args ...)) - (cons name `(lambda ,args ,alt-right))] + (cons name `(lambda ,args ,right))] [('funcall name arg) - (cons name `(lambda (,arg) ,alt-right))] + (cons name `(lambda (,arg) ,right))] - [name (cons name alt-right)]))) + [name (cons name right)]))) (define (parse-cpp-file lines) @@ -104,7 +108,9 @@ (catch #t (lambda () (parse-cpp-define line)) (lambda (err caller fmt args data) - (format #t "~a ~?~%" fmt args) + (format #t "~a in ~a: ~?~%" + err caller fmt args) + (format #t "~s~%" line) #f))) lines)) @@ -114,29 +120,32 @@ (define (tokenize-header-file header-file) (map tokenize-define-line (call-with-port - (open-input-pipe - (string-append "cpp -dM " header-file)) + (open-pipe* OPEN_READ "cpp" "-dM" header-file) read-lines))) -(define-macro (include# header-file . args) - - (define define-form (if (null? args) 'define (car args))) - - (define lines (remove (compose private-c-symbol? car) - (tokenize-header-file header-file))) +(define (load-cpp-file header-file) + (define lines (tokenize-header-file header-file)) (define forms (parse-cpp-file lines)) - (define graph* - (fold (lambda (node graph) - (add-node graph (cdr node) (car node))) - (make-graph car) - (filter identity forms))) + (fold (lambda (node graph) + (add-node graph (cdr node) (car node))) + (make-graph car) + (filter identity forms))) +(define (include% header-file) + (define graph* (load-cpp-file header-file)) ;; Hack for termios since this symbol isn't defined. ;; (including in the above removed private c symbols) - (define graph (add-node graph* (cons '_POSIX_VDISABLE #f) '())) + (define graph (add-node graph* (cons '_POSIX_VDISABLE 0) '())) + ;; TODO expand bodies + ;; (remove (compose private-c-symbol? car)) + (resolve-dependency-graph graph)) + +(define-macro (include# header-file . args) + + (define define-form (if (null? args) 'define (car args))) `(begin ,@(map (lambda (pair) `(,define-form ,(car pair) ,(cdr pair))) - (resolve-dependency-graph graph)))) + (include% header-file)))) diff --git a/module/c/eval.scm b/module/c/eval.scm new file mode 100644 index 00000000..67d0075d --- /dev/null +++ b/module/c/eval.scm @@ -0,0 +1,265 @@ +(define-module (c eval) + :use-module (hnh util) + :use-module (srfi srfi-1) + :use-module (srfi srfi-71) + :use-module (srfi srfi-88) + :use-module (ice-9 match) + :use-module (ice-9 curried-definitions) + :use-module ((rnrs bytevectors) + :select (bytevector?)) + :use-module ((rnrs arithmetic bitwise) + :select (bitwise-not + bitwise-and + bitwise-ior + bitwise-xor + bitwise-arithmetic-shift-left + bitwise-arithmetic-shift-right)) + :use-module (c eval environment) + :export (c-procedure? + procedure-formals + procedure-body + procedure-arity + + c-eval + )) + +(define C-TRUE 1) +(define C-FALSE 0) + +(define (boolean->c-boolean bool) + (if bool C-TRUE C-FALSE)) + +(define (c-boolean->boolean bool) + (not (zero? bool))) + +(define (c-not b) + (-> b c-boolean->boolean not boolean->c-boolean)) + +(define (c-procedure? expr) + (and (list? expr) + (not (null? expr)) + (eq? 'lambda (car expr)))) + +(define* (ensure-c-procedure expr optional: calling-procedure) + (unless (c-procedure? expr) + (scm-error 'c-eval-error calling-procedure + "Value not a procedure: ~s" + (list procedure #f)))) + +(define (procedure-formals procedure) + (ensure-c-procedure procedure "procedure-formals") + (list-ref procedure 1)) + +(define (procedure-body procedure) + (ensure-c-procedure procedure "procedure-body") + (list-ref procedure 2)) + +(define (procedure-arity procedure) + (length (procedure-formals procedure))) + +(define (literal? expression) + (or (number? expression) + (bytevector? expression))) + + + +;; Internal helper procedures + +(define (mod-set operation) + (lambda (env slot value) + ;; a += b + ;; a = a + b + (c-eval env `(= ,slot (,operation ,slot ,value))))) + +(define (fold-2 proc init lst) + (car+cdr + (fold (lambda (arg env+done) + (let ((env* arg* (proc (car env+done) arg))) + (cons* env* arg* (cdr env+done)))) + init + lst))) + +;; TODO this disregards +;; - floating point convertions +;; - integer truncation +(define ((simple-infix operation) env . operands) + (let ((env done (fold-2 c-eval (cons env '()) operands))) + (values env (apply operation (reverse done))))) + +(define ((binary-operator proc) env i c) + (let ((env* i* (c-eval env i))) + (let ((env** c* (c-eval env* c))) + (values env** (proc i* c*))))) + + + + +;; The order of evaluation for a number of these is undefined, meaning +;; that any side effects without sequence points is undefined. +;; However, for many of these I do the sensible thing and evaluate them +;; from left to right, instead of ignoring all side effects. + +;; TODO double check these with a C standard + + +;; Operators have their own namespace. They are called without funcall +;; in the pseudo-lisp which C is compiled to, and they expand more like +;; lisp macros, since its up to each operator what to do with its operands. +;; This to allow assignment and short circuting. +(define primitives + `((and . ,(lambda (env . operands) + (let loop ((env env) (operands operands)) + (if (null? operands) + (values env C-TRUE) + (let* ((env* result (c-eval env (car operands)))) + (if (c-boolean->boolean result) + (loop env* (cdr operands)) + (values env* result))))))) + (or . ,(lambda (env . operands) + (let loop ((env env) (operands operands)) + (if (null? operands) + (values env C-FALSE) + (let* ((env* result (c-eval env (car operands)))) + (if (false? result) + (values env* result) + (loop env* (cdr operands)))))))) + (= . ,(lambda (env slot value) + ;; TOOD if slot isn't a variable, but a field (or array index) + ;; then it needs to be resolved... + (let ((env* result (c-eval env value))) + (values (env-set! env* slot result) + result)))) + (and_eq ,(mod-set 'bitand)) ; &= + (or_eq ,(mod-set 'bitor)) ; |= + (xor_eq ,(mod-set 'xor)) ; ^= + (+= ,(mod-set '+)) + (-= ,(mod-set '-)) + (*= ,(mod-set '*)) + (/= ,(mod-set '/)) + (<<= ,(mod-set '<<)) + (>>= ,(mod-set '>>)) + (%= ,(mod-set '%)) + (+ . ,(simple-infix +)) + (* . ,(simple-infix *)) + (/ . ,(simple-infix /)) + (- . ,(lambda (env op . operands) + (if (null? operands) + (let ((env* value (c-eval env op))) + (values env* (- value))) + (apply (simple-infix -) + env op operands)))) + (bitor . ,(simple-infix bitwise-ior)) + (bitand . ,(simple-infix bitwise-and)) + (xor . ,(simple-infix bitwise-xor)) + (not_eq . ,(lambda (env a b) (c-eval env `(not (== ,a ,b))))) ; != + (<< . ,(binary-operator bitwise-arithmetic-shift-left)) + (>> . ,(binary-operator bitwise-arithmetic-shift-right)) + (< . ,(binary-operator (compose boolean->c-boolean <))) + (> . ,(binary-operator (compose boolean->c-boolean >))) + ;; this assumes that = handles pointers + (== . ,(binary-operator (compose boolean->c-boolean =))) + (<= . ,(binary-operator (compose boolean->c-boolean <=))) + (>= . ,(binary-operator (compose boolean->c-boolean >=))) + (% . ,(binary-operator modulo)) + + (not . ,(lambda (env value) + (let ((env* result (c-eval env value))) + (values env* (c-not result))))) + (compl . ,(lambda (env value) + (let ((env* result (c-eval env value))) + (values env* (bitwise-not result))))) + + ;; ++C + (pre-increment . ,(lambda (env slot) (c-eval env `(+= ,slot 1)))) + (pre-decrement . ,(lambda (env slot) (c-eval env `(-= ,slot 1)))) + ;; TODO these (C++, C--) need to handle if slot isn't a direct variabl + (post-increment . ,(lambda (env slot) + (let ((value (env-ref env slot))) + (values (env-set! env slot (1+ value)) + value)))) + (pre-decrement . ,(lambda (env slot) + (let ((value (env-ref env slot))) + (values (env-set! env slot (1+ value)) + value)))) + + (ternary . ,(lambda (env test true-clause false-clause) + (let ((env* value (c-eval env test))) + (c-eval env* + (if (c-boolean->boolean value) + true-clause false-clause))))) + + ;; TODO remaining operations + (as-type . ,(lambda (env target-type value) + (format (current-error-port) "cast<~s>(~s)~%" target-type value) + (values env value))) + + (object-slot . ,(lambda (env object slot) + (scm-error 'not-implemented "object-slot" + "Object slots are not implemented, when accessing ~s.~s" + (list object slot) #f))) + (dereference-slot ,(lambda (env ptr slot) + (scm-error 'not-implemented "dereference-slot" + "Object slots are not implemented, when accessing ~s->~s" + (list object slot) #f))) + (dereference . ,(lambda (env ptr) + (scm-error 'not-implemented "dereference" + "Poiner dereferencing is not implemented: *~s" + (list ptr) #f))) + (pointer . ,(lambda (env value) + (scm-error 'not-implemented "pointer" + "Pointer of is not implemented: &~s" + (list value) #f))))) + +;; TODO |,| + + +(define (c-eval environment expression) + (match expression + (('lambda formals body) (values environment `(lambda ,formals ,body))) + ;; hack since sizeof really should be a operator + (('funcall 'sizeof arg) + ;; TODO + (format (current-error-port) "sizeof ~s~%" arg) + (values environment 1)) + + (('funcall procedure-name ('#{,}# args ...)) + (let ((procedure (env-ref environment procedure-name))) + (ensure-c-procedure procedure "c-eval") + (unless (= (length args) (procedure-arity procedure)) + (scm-error 'c-eval-error "c-eval" + "Procedure arity mismatch for ~s, expected ~s, got ~s" + (list procedure-name + (procedure-arity procedure) + (length args)) + #f)) + (let ((env args* (fold-2 c-eval (cons environment '()) args ))) + (let ((inner-environment + (fold (lambda (name value env) (env-set! env name value)) + (push-frame! env) + (procedure-formals procedure) args*))) + (let ((resulting-environment + result-value + (c-eval inner-environment (procedure-body procedure)))) + (values (pop-frame! resulting-environment) + result-value)))))) + + (('funcall procedure arg) + (c-eval environment `(funcall ,procedure (#{,}# ,arg)))) + + ((operator operands ...) + (apply (or (assoc-ref primitives operator) + (scm-error 'c-eval-error "c-eval" + "Applying non-existant primitive operator: ~s, operands: ~s" + (list operator operands) #f)) + environment operands)) + + ;; "f()" gets compiled to simply f + ;; meaning that we instead use the environment to determine + ;; if something is a variable or procedure + (expr + (if (literal? expr) + (values environment expr) + (let ((value (env-ref environment expr))) + (if (c-procedure? value) + (c-eval environment `(funcall ,value (#{,}#))) + (values environment value))))))) diff --git a/module/c/eval/environment.scm b/module/c/eval/environment.scm new file mode 100644 index 00000000..12eefaf7 --- /dev/null +++ b/module/c/eval/environment.scm @@ -0,0 +1,34 @@ +(define-module (c eval environment) + :use-module (srfi srfi-1) + :export (make-environment + env-set! env-ref push-frame! pop-frame!)) + +(define (make-frame) + (make-hash-table)) + +(define (make-environment) + (list (make-frame))) + +;; Returns an updated environment, linear update +(define (env-set! env key value) + ;; get handle to differentiate #f + ;; (even though #f should never be stored since it's not a C value) + (cond ((find (lambda (frame) (hashq-get-handle frame key)) env) + => (lambda (frame) (hashq-set! frame key value))) + (else (hashq-set! (car env) key value))) + env) + +(define (env-ref env key) + (cond ((null? env) + (scm-error 'misc-error "env-ref" + "~s unbound" + (list key) + #f)) + ((hashq-get-handle (car env) key) => cdr) + (else (env-ref (cdr env) key)))) + +(define (push-frame! environment) + (cons (make-frame) environment)) + +(define (pop-frame! environment) + (cdr environment)) diff --git a/module/c/eval2.scm b/module/c/eval2.scm new file mode 100644 index 00000000..d58f20bf --- /dev/null +++ b/module/c/eval2.scm @@ -0,0 +1,20 @@ +(define-module (c eval2) + :use-module ((hnh util) :select (->)) + :export (C-TRUE + C-FALSE + boolean->c-boolean + c-boolean->boolean + c-not)) + + +(define C-TRUE 1) +(define C-FALSE 0) + +(define (boolean->c-boolean bool) + (if bool C-TRUE C-FALSE)) + +(define (c-boolean->boolean bool) + (not (zero? bool))) + +(define (c-not b) + (-> b c-boolean->boolean not boolean->c-boolean)) diff --git a/module/c/lex.scm b/module/c/lex.scm index 34e52d88..0bde5c9e 100644 --- a/module/c/lex.scm +++ b/module/c/lex.scm @@ -43,8 +43,23 @@ (define-peg-pattern integer all (and (or base-8 base-16 base-10) (? integer-suffix))) +(define-peg-pattern float-suffix all + (* (or "f" "F" "l" "L"))) + +(define-peg-pattern exponent all + (and (ignore (or "e" "E")) (? (or "+" "-")) base-10)) + +;; Helper patterns for creating named groups in float +(define-peg-pattern float-integer all base-10) +(define-peg-pattern float-decimal all base-10) + +(define-peg-pattern float all + (or (and float-integer exponent (? float-suffix)) + (and (? float-integer) (ignore ".") float-decimal (? exponent) (? float-suffix)) + (and float-integer (ignore ".") (? exponent) (? float-suffix)))) + (define-peg-pattern number body - (or integer)) + (or float integer)) (define-peg-pattern group all (and (ignore "(") expr (ignore ")"))) @@ -65,11 +80,16 @@ (define-peg-pattern char all (and (ignore "'") (or escaped-char peg-any) (ignore "'"))) +(define-peg-pattern quot none "\"") + +(define-peg-pattern string all + (and quot (* (or escaped-char (and (not-followed-by "\"") peg-any))) quot)) (define-peg-pattern* operator all `(or ,@(map symbol->string symbol-binary-operators) ,@(map (lambda (op) `(and ,(symbol->string op) ws)) - wordy-binary-operators))) + wordy-binary-operators) + "?" ":")) ;; whitespace (define-peg-pattern ws none @@ -89,17 +109,23 @@ base-10-digit)))) (define-peg-pattern prefix-operator all - (or "!" "~" "*" "&" "++" "--" "+" "-")) + ;; It's important that ++ and -- are BEFORE + and - + ;; otherwise the first + is found, leaving the second +, which fails + ;; to lex since it's an invalid token + ;; TODO sizeof can be written as a prefix operator + ;; (without parenthesis) if the operand is an expression. + (or "*" "&" "++" "--" + "!" "~" "+" "-")) + ;;; Note that stacked pre or postfix operators without parenthesis ;;; dosen't work. So `*&C' is invalid, while `*(&C)' is valid. (define-peg-pattern prefix all - (and prefix-operator sp (or variable group funcall #; postfix - ))) + (and prefix-operator sp (or variable group funcall literal))) (define-peg-pattern postfix-operator all - (or "++" "--")) + (or "++" "--" "*")) (define-peg-pattern postfix all ;; literals can't be in-place incremented and decremented @@ -111,15 +137,25 @@ ;; first case is "same" as expr, but in different order to prevent ;; infinite self reference. Pre and postfix not here, solved by having ;; them before infix in expr - (and (or funcall postfix prefix group char number variable) + (and (or funcall postfix prefix group literal variable) sp operator sp expr)) (define-peg-pattern funcall all (and variable sp group)) +(define-peg-pattern literal body + (or char string number)) + ;;; main parser (define-peg-pattern expr body - (+ (and sp (or infix postfix prefix funcall group char number variable) + (+ (and sp (or + ;; float must be BEFORE infix, otherwise 3.2 is parsed as (infix 3 (operator ".") 2) + ;; that however breaks the infix logic, meaning that floating point numbers can't be + ;; used in basic arithmetic. + ;; TODO remove all implicit order of operations handling in the lexer, and move it to + ;; the parser. This should also fix the case of typecasts being applied incorrectly. + float + infix postfix prefix funcall group literal variable) sp))) diff --git a/module/c/lex2.scm b/module/c/lex2.scm new file mode 100644 index 00000000..af90dcce --- /dev/null +++ b/module/c/lex2.scm @@ -0,0 +1,549 @@ +(define-module (c lex2) + :use-module (ice-9 peg) + :use-module (ice-9 match) + :use-module ((hnh util) :select (->)) + :use-module (hnh util object) + :use-module (hnh util type) + :use-module ((srfi srfi-1) :select (fold)) + :use-module (srfi srfi-88) + :use-module ((c trigraph) :select (replace-trigraphs)) + :use-module ((c line-fold) :select (fold-lines)) + :export (lex + lexeme lexeme? + placemaker + (type . lexeme-type) + (body . lexeme-body) + (noexpand . lexeme-noexpand) + + parse-c-number + + tokenize + )) + +;;; A.1 Lexical grammar +;;; A.1.1 Lexical elements + +;; (6.4) +(define-peg-pattern token all + (or keyword + identifier + constant + string-literal + punctuator + )) + +;; (6.4) +(define-peg-pattern preprocessing-token all + ;; string literal moved before header-name since string literals + ;; otherwise became q-strings + (or string-literal + header-name + character-constant + identifier + pp-number + punctuator + ;; Each non-white-space character that cannot be one of the above + )) + +;;; A.1.2 Keywords + +;; (6.4.1) +(define-peg-pattern keyword all + (or "auto" "break" "case" "char" "const" "continue" "default" + "do" "double" "else" "enum" "extern" "float" "for" "goto" + "if" "inline" "int" "long" "register" "restrict" "return" + "short" "signed" "sizeof" "static" "struct" "switch" + "typedef" "union" "unsigned" "void" "volatile" "while" + "_Alignas" "_Alignof" "_Atomic" "_Bool" "_Complex" + "_Generic" "_Imaginary" "_Noreturn" "_Static_assert" + "_Thread_local")) + +;;; A.1.3 Identifiers + +;; (6.4.2.1) +(define-peg-pattern identifier all + (and identifier-nondigit (* (or identifier-nondigit digit)))) + +;; (6.4.2.1) +(define-peg-pattern identifier-nondigit body + (or nondigit + universal-character-name + ;; TODO other implementation-defined characters + )) + +;; (6.4.2.1) +(define-peg-pattern nondigit body + (or "_" + (range #\A #\Z) + (range #\a #\z))) + +;; (6.4.2.1) +(define-peg-pattern digit body + (range #\0 #\9)) + +;;; A.1.4 Universal character names + +;; (6.4.3) +(define-peg-pattern universal-character-name all + (or (and (ignore "\\u") hex-quad) + (and (ignore "\\U") hex-quad hex-quad))) + +;; (6.4.3) +(define-peg-pattern hex-quad body + (and hexadecimal-digit hexadecimal-digit + hexadecimal-digit hexadecimal-digit)) + +;;; A.1.5 Constants + +;; (6.4.4) +(define-peg-pattern constant all + ;; Int and float swapped from standard since we need to try parsing + ;; the floats beforehand + (or floating-constant + integer-constant + enumeration-constant + character-constant)) + +;; (6.4.4.1) +(define-peg-pattern integer-constant all + (and (or decimal-constant + hexadecimal-constant + octal-constant) + (? integer-suffix))) + +;; (6.4.4.1) +(define-peg-pattern decimal-constant all + (and nonzero-digit (* digit))) + +;; (6.4.4.1) +(define-peg-pattern octal-constant all + (and "0" (* octal-digit))) + +;; (6.4.4.1) +(define-peg-pattern hexadecimal-constant all + (and hexadecimal-prefix (+ hexadecimal-digit))) + +;; (6.4.4.1) +(define-peg-pattern hexadecimal-prefix none + (or "0x" "0X")) + +;; (6.4.4.1) +(define-peg-pattern nonzero-digit body + (range #\1 #\9)) + +;; (6.4.4.1) +(define-peg-pattern octal-digit body + (range #\0 #\7)) + +;; (6.4.4.1) +(define-peg-pattern hexadecimal-digit body + (or (range #\0 #\9) + (range #\a #\f) + (range #\A #\F))) + +;; (6.4.4.1) +(define-peg-pattern integer-suffix all + (or (and unsigned-suffix (? long-suffix)) + (and long-suffix (? unsigned-suffix)))) + +;; (6.4.4.1) +;; This is a merger of long-suffix and long-long-suffix +(define-peg-pattern long-suffix body + (or "l" "L" "ll" "LL")) + +;; (6.4.4.1) +(define-peg-pattern unsigned-suffix body + (or "u" "U")) + +;; (6.4.4.2) +(define-peg-pattern floating-constant all + (or decimal-floating-constant + hexadecimal-floating-constant)) + +;; (6.4.4.2) +(define-peg-pattern decimal-floating-constant all + (or (and fractional-constant (? exponent-part) (? floating-suffix)) + (and digit-sequence exponent-part (? floating-suffix)))) + +;; (6.4.4.2) +(define-peg-pattern hexadecimal-floating-constant all + (and hexadecimal-prefix + (or hexadecimal-fractional-constant + hexadecimal-digit-sequence) + binary-exponent-part + (? floating-suffix))) + +;; (6.4.4.2) +(define-peg-pattern fractional-constant all + (or (and (? digit-sequence) "." digit-sequence) + (and digit-sequence "."))) + +;; (6.4.4.2) +(define-peg-pattern exponent-part all + (and (or "e" "E") (? sign) digit-sequence)) + +;; (6.4.4.2) +(define-peg-pattern sign all + (or "+" "-")) + +;; (6.4.4.2) +(define-peg-pattern digit-sequence body + (+ digit)) + +;; (6.4.4.2) +(define-peg-pattern hexadecimal-fractional-constant all + (or (and (? hexadecimal-digit-sequence) "." hexadecimal-digit-sequence) + (and hexadecimal-digit-sequence "."))) + +;; (6.4.4.2) +(define-peg-pattern binary-exponent-part all + (and (ignore (or "p" "P")) + (? sign) + digit-sequence)) + +;; (6.4.4.2) +(define-peg-pattern hexadecimal-digit-sequence body + (+ hexadecimal-digit)) + +;; (6.4.4.2) +(define-peg-pattern floating-suffix all + (or "f" "l" "F" "L")) + +;; (6.4.4.3) +(define-peg-pattern enumeration-constant all + identifier) + +(define-peg-pattern character-prefix all + (or "L" "u" "U")) + +;; (6.4.4.4) +(define-peg-pattern character-constant all + (and (? character-prefix) + (ignore "'") + (+ c-char) + (ignore "'"))) + +;; (6.4.4.4) +(define-peg-pattern c-char body + (or (and (not-followed-by (or "'" "\\" "\n")) peg-any) + escape-sequence)) + +;; (6.4.4.4) +(define-peg-pattern escape-sequence all + (or simple-escape-sequence + octal-escape-sequence + hexadecimal-escape-sequence + universal-character-name)) + +;; (6.4.4.4) +(define-peg-pattern simple-escape-sequence all + (and (ignore "\\") (or "'" "\"" "?" "\\" + "a" "b" "f" "n" "r" "t" "v"))) + +;; (6.4.4.4) +(define-peg-pattern octal-escape-sequence all + (and (ignore "\\") octal-digit (? octal-digit) (? octal-digit))) + +;; (6.4.4.4) +(define-peg-pattern hexadecimal-escape-sequence all + (and (ignore "\\x") (+ hexadecimal-digit))) + +;; A.1.6 String literals + +;; (6.4.5) +(define-peg-pattern string-literal all + (and (? encoding-prefix) + (ignore "\"") + (* s-char) + (ignore "\""))) + +;; (6.4.5) +(define-peg-pattern encoding-prefix all + (or "u8" "u" "U" "L")) + +;; (6.4.5) +(define-peg-pattern s-char body + (or (and (not-followed-by (or "\"" "\\" "\n")) peg-any) + escape-sequence)) + +;;; A.1.7 + +;; (6.4.6) +(define-peg-pattern punctuator all + (or "[" "]" "(" ")" "{" "}" + "..." ; Moved to be before "." + "." "->" + "&&" "||" + "!=" + "++" "--" + "*=" "/=" "%=" "+=" "-=" "<<=" ">>=" "&=" "^=" "|=" + "<=" ">=" "==" + "=" + "/" "%" "<<" ">>" "<" ">" "^" "|" + "?" ":" ";" + "&" "*" "+" "-" "~" "!" + "," "##" "#" ; # and ## flipped + "<:" ":>" "<%" "%>" "%:%:" "%:" ; %: and %:%: flipped + )) + +;;; A.1.8 Header names + +(define-peg-pattern h-string all (+ h-char)) +(define-peg-pattern q-string all (+ q-char)) + +;; (6.4.7) +(define-peg-pattern header-name all + (or (and (ignore "<") h-string (ignore ">")) + ;; NOTE this case will never be reached, since it's treated as a regular + ;; string instead + (and (ignore "\"") q-string (ignore "\"")))) + +;; (6.4.7) +(define-peg-pattern h-char body + (or (and (not-followed-by (or ">" "\n")) peg-any) + escape-sequence)) + +;; (6.4.7) +(define-peg-pattern q-char body + (or (and (not-followed-by (or "\"" "\n")) peg-any) + escape-sequence)) + +;;; A.1.9 Preprocessing numbers + +;; (6.4.8) +(define-peg-pattern pp-number all + (and (? ".") digit + (* (or digit + identifier-nondigit + (and (or "e" "E" "p" "P") + sign) + ".")))) + + + +(define-peg-pattern whitespace all + (or "\t" "\n" "\v" "\f" " " + ;; "\r" + )) + +(define-peg-pattern block-comment body + (and (ignore "/*") + (* (and (not-followed-by "*/") + peg-any)) + (ignore "*/"))) + +(define-peg-pattern line-comment body + (and (ignore "//") + (* (and (not-followed-by "\n") + peg-any)))) + +(define-peg-pattern comment all + (or line-comment block-comment)) + +(define-peg-pattern non-whitespace all + (and (not-followed-by whitespace) + peg-any)) + +(define-peg-pattern preprocessing-tokens all + (* (or whitespace + comment + preprocessing-token + non-whitespace))) + + + +;; comment could be merged with whitespace, but then unlex would have to know that + +;; other is the "each non-white-space character that cannot be one of the above" +;; clause from 6.4 p. 1 + +(define-type (lexeme) + (type type: (memv '(whitespace comment preprocessing-token other placemaker))) + (body type: (or string? list?)) + (noexpand type: (list-of string?) + default: '())) + +(define (placemaker) + (lexeme type: 'placemaker body: '())) + +(define (lex-output->lexeme-object x) + (match x + (`(non-whitespace ,body) + (lexeme body: body type: 'other)) + (`(whitespace ,body) + (lexeme body: body type: 'whitespace )) + (`(comment ,body) + (lexeme body: body type: 'comment )) + (`(preprocessing-token ,body) + (match body + ('string-literal + ;; Unflatten case + (lexeme body: '(string-literal (encoding-prefix) "") + type: 'preprocessing-token)) + (('string-literal `(encoding-prefix ,px) args ...) + (lexeme body: `(string-literal (encoding-prefix . ,px) ,@args) + type: 'preprocessing-token)) + (('string-literal args ...) + (lexeme body: `(string-literal (encoding-prefix) ,@args) + type: 'preprocessing-token)) + (('character-constant `(character-prefix ,px) args ...) + (lexeme body: `(character-constant (character-prefix . ,px) + ,@args) + type: 'preprocessing-token)) + (('character-constant args ...) + (lexeme body: `(character-constant (character-prefix) ,@args) + type: 'preprocessing-token)) + (body (lexeme body: body type: 'preprocessing-token)))) + + ;; "unflatten" + ('comment (lexeme body: "" type: 'comment)))) + + + + +;; At a number of places I chose token depending on the order of the rule. The +;; standard however says that the longest possible choice should be used. +;; 6.4 p. 4 + +;; returns a list of lexemes +(define (lex string) + (if (string-null? string) + '() + (map lex-output->lexeme-object + (let ((result (match-pattern preprocessing-tokens string))) + (let ((trailing (substring (peg:string result) + (peg:end result)))) + (unless (string-null? trailing) + (scm-error 'cpp-lex-error "lex" + "Failed to lex string, remaining trailing characters: ~s" + (list trailing) #f))) + (unless (list? (peg:tree result)) + (scm-error 'cpp-lex-error "lex" + "Parsing just failed. Chars: ~s" + (list (peg:string result)) #f)) + (cdr (peg:tree result)))))) + + + + + +;; (parse-decimals "555" 10) +;; ⇒ 0.5549999999999999 +;; (parse-decimals "8" 16) +;; ⇒ 0.5 +(define (parse-decimals str base) + (/ (fold (lambda (digit done) + (let ((v (string->number digit base))) + (+ v (/ done base)))) + 0.0 + (map string (string->list str))) + base)) + +;; parse a number on form <digits>.<digits> +(define (parse-fractional str base) + (let* ((pair (string-split str #\.)) + (integer (list-ref pair 0)) + (decimals (list-ref pair 1))) + (+ (if (string-null? integer) + 0 (string->number integer 16)) + (if (string-null? decimals) + 0 (parse-decimals decimals 16))))) + + +(define (parse-float body) + (define (fractional-constant x) + (case x + ((decimal-floating-constant) 'fractional-constant) + ((hexadecimal-floating-constant) 'hexadecimal-fractional-constant))) + + (define (exponent-part x) + (case x + ((decimal-floating-constant) 'exponent-part) + ((hexadecimal-floating-constant) 'binary-exponent-part))) + + (define (expt-base x) + (case x + ((decimal-floating-constant) 10) + ((hexadecimal-floating-constant) 2))) + + (define (base x) + (case x + ((decimal-floating-constant) 10) + ((hexadecimal-floating-constant) 16))) + + (let ((type (car body)) + (body (cdr body))) + (* 1.0 + (cond ((assoc-ref body (fractional-constant type)) + => (lambda (fc) (parse-fractional (car fc) (base type)))) + (else (string->number (car body) (base type)))) + (cond ((assoc-ref body (exponent-part type)) + => (lambda (x) (expt (expt-base type) + (string->number (car x) (base type))))) + (else 1))) + ;; TODO do something with (possible) suffix + ;; (assoc-ref body 'floating-suffix) + )) + +(define (parse-integer body) + (let* (;; (suffix (assoc-ref body 'integer-suffix)) + (value (cadr (car body))) + (value-type (car (car body)))) + ;; TODO do something with suffix + (string->number + value + (case value-type + ((octal-constant) 8) + ((decimal-constant) 10) + ((hexadecimal-constant) 16))))) + +;; (parse-c-number "0x1.8p0") +;; ⇒ 1.5 + +;; TODO is "5ul" equivalent to "((unsigned long) 5)" +(define (parse-c-number string) + (cond ((match-pattern constant string) + => (lambda (m) + (let ((m (cadr (peg:tree m)))) ; Strip 'constant wrapper + (case (car m) + ((floating-constant) + (parse-float (cadr m))) + + ((integer-constant) + (parse-integer (cdr m))) + + ((enumeration-constant character-constant) + (scm-error 'misc-error "parse-c-number" + "Couldn't parse [~a] as a /number/ (~s)" + (list string m) #f)))))) + + (else (scm-error 'misc-error "parse-c-number" + "Couldn't parse [~a] as a number" + (list string) #f)))) + + + + +;;; 5.1.11.2 Translation phases + +(define (tokenize string) + (-> string +;;; 1. trigraph replacement + replace-trigraphs +;;; 2. Line folding + fold-lines +;;; 3. Decomposition into preprocenning tokens, whitespaces, and comments + lex + comments->whitespace)) + +;; These really belong in (c cpp-types), but that would create a dependency cycle + +(define (comment->whitespace token) + (if ;; (comment-token? token) + (and (lexeme? token) + (eq? 'comment (type token))) + (car (lex " ")) + token)) + +(define (comments->whitespace tokens) + (map comment->whitespace tokens)) diff --git a/module/c/line-fold.scm b/module/c/line-fold.scm new file mode 100644 index 00000000..c61c2c70 --- /dev/null +++ b/module/c/line-fold.scm @@ -0,0 +1,29 @@ +(define-module (c line-fold) + :use-module (srfi srfi-1) + :use-module (srfi srfi-71) + :export (fold-lines)) + +(define (line-continued? line) + (and (not (string-null? line)) + (char=? #\\ (string-ref line (1- (string-length line)))))) + +(define (strip-backslash line) + (string-drop-right line 1)) + +(define (fold-lines string) + (with-output-to-string + (lambda () + (let loop ((lines (string-split string #\newline))) + (cond ((null? lines) 'NOOP) + ((null? (cdr lines)) + ;; TODO error message if last character is a backslash + (display (car lines)) + (newline)) + (else + (let ((to-merge remaining (span line-continued? lines))) + (for-each display (map strip-backslash to-merge)) + (display (car remaining)) + (newline) + (for-each (lambda _ (newline)) + (iota (length to-merge))) + (loop (cdr remaining))))))))) diff --git a/module/c/operators.scm b/module/c/operators.scm index ab1b3e7c..910dc8a9 100644 --- a/module/c/operators.scm +++ b/module/c/operators.scm @@ -9,8 +9,9 @@ `(+ - * / & ,(symbol #\|) ^ << >> % < > =)) ;; apparently part of C +;; https://en.cppreference.com/w/cpp/language/operator_alternative (define wordy-binary-operators - '(bitand and_eq and bitor or_eq or xor_eq xor)) + '(bitand and_eq and bitor or_eq or xor_eq xor not_eq)) (define symbol-binary-operators (append (map (lambda (x) (symbol-append x '=)) simple-operators) diff --git a/module/c/parse.scm b/module/c/parse.scm index 8030da77..7d11ea17 100644 --- a/module/c/parse.scm +++ b/module/c/parse.scm @@ -1,11 +1,14 @@ (define-module (c parse) :use-module (hnh util) :use-module (srfi srfi-1) + :use-module (srfi srfi-71) :use-module (ice-9 match) + :use-module ((rnrs io ports) + :select (string->bytevector make-transcoder utf-8-codec)) + :use-module (rnrs bytevectors) :export (parse-lexeme-tree)) -;;; Rename this -(define (perms set) +(define (permutations set) (concatenate (map (lambda (key) (map (lambda (o) (cons key o)) @@ -21,23 +24,124 @@ (define valid-sequences (delete 'dummy (lset-union eq? '(dummy) - (map symbol-concat (perms '(() U L))) - (map symbol-concat (perms '(() U LL)))))) + (map symbol-concat (permutations '(() U L))) + (map symbol-concat (permutations '(() U LL)))))) ;; => (LLU ULL LL LU UL L U) (aif (memv (string->symbol (string-upcase str)) valid-sequences) (case (car it) - [(LLU ULL) '(unsigned long-long)] + [(LLU ULL) '(unsigned long long)] [(LU UL) '(unsigned long)] - [(LL) '(long-long)] + [(LL) '(long long)] [(L) '(long)] [(U) '(unsigned)]) (scm-error 'c-parse-error "parse-integer-suffix" "Invalid integer suffix ~s" (list str) #f))) +(define (parse-float-suffix str) + (case (string->symbol str) + ((f F) '(float)) + ((l L) '(long double)))) + +(define (group-body->type vars) + (concatenate + (map + (match-lambda (('variable var) (list (parse-lexeme-tree `(variable ,var)))) + (('postfix ('variable var) + ('postfix-operator "*")) + (list (parse-lexeme-tree `(variable ,var)) + '*)) + (else (scm-error 'c-parse-error "parse-lexeme-tree" + "Invalid token ~s in typecast form: ~s" + (list else vars) #f))) + vars))) + +;; Takes a list of strings and integers, and merges it all into a single +;; bytevector representing a c string +(define* (string-fragments->c-string + fragments optional: (transcoder (make-transcoder (utf-8-codec)))) + + (define fragments-fixed + (map (lambda (frag) + (if (string? frag) + (string->bytevector frag transcoder) + frag)) + fragments)) + + (define bv-length + (fold (lambda (item sum) (+ sum (if (bytevector? item) + (bytevector-length item) + 1))) + 0 fragments-fixed)) + + (define bv (make-bytevector (1+ bv-length))) + ;; trailing null byte + (bytevector-u8-set! bv bv-length 0) + (fold (lambda (item idx) + (cond ((bytevector? item) + (bytevector-copy! item 0 + bv idx + (bytevector-length item)) + (+ idx (bytevector-length item))) + (else (bytevector-u8-set! bv idx item) + (+ idx 1)))) + 0 + fragments-fixed) + bv) + +(define (parse-float-form float-form) + (let ((float-string + (fold (lambda (arg str) + (string-append + str + (match arg + (('float-integer ('base-10 n)) n) + (('float-decimal ('base-10 n)) (string-append "." n)) + (('exponent "+" ('base-10 n)) (string-append "e" n)) + (('exponent ('base-10 n)) (string-append "e" n)) + (('exponent "-" ('base-10 n)) (string-append "e-" n))))) + "" float-form))) + ;; exact->inexact is a no-op if we already have an inexact number, but + ;; ensures we get an inexact number when we have an exact number (which we + ;; can get from the "1." case). Returning an inexact number here is important + ;; to avoid arithmetic suprises later. + (exact->inexact + (or (string->number float-string) + (scm-error 'c-parse-error "parse-lexeme-tree" + "Couldn't parse expression as float: ~s" + (list `(float ,@args)) #f))))) + + +(define (resolve-escaped-char form) + (match form + (('base-8-char n) (string->number n 8)) + (('base-16-char n) (string->number n 16)) + (c (char->integer + (case (string-ref c 0) + ((#\a) #\alarm) + ((#\b) #\backspace) + ((#\e) #\esc) ;; non-standard + ((#\f) #\page) + ((#\n) #\newline) + ((#\r) #\return) + ((#\t) #\tab) + ((#\v) #\vtab) + ((#\\) #\\) + ;; These are valid in both strings and chars + ((#\') #\') + ((#\") #\")))))) + +;; Takes a list of strings and escaped-char form +;; and returns a list of strings and integers +(define (resolve-string-fragment fragment) + (match fragment + (('escaped-char c) + (resolve-escaped-char c)) + (fargment fragment))) + (define (parse-lexeme-tree tree) (match tree ['() '()] @@ -50,55 +154,67 @@ [('integer n ('integer-suffix suffix)) `(as-type ,(parse-integer-suffix suffix) - ,(parse-lexeme-tree n)) - ] + ,(parse-lexeme-tree n))] + [('integer n) (parse-lexeme-tree n)] + + [('float args ... ('float-suffix suffix)) + `(as-type ,(parse-float-suffix suffix) + ;; parse rest of float as if it lacked a suffix + ,(parse-lexeme-tree `(float ,@args)))] + + [('float args ...) (parse-float-form args)] + ;; Character literals, stored as raw integers ;; so mathematical operations keep working on them. - [('char ('escaped-char ('base-8-char n))) - (-> n (string->number 8) #; integer->char)] - [('char ('escaped-char ('base-16-char n))) - (-> n (string->number 16) #; integer->char)] - [('char ('escaped-char c)) - (char->integer - (case (string-ref c 0) - ((#\a) #\alarm) - ((#\b) #\backspace) - ((#\e) #\esc) - ((#\f) #\page) - ((#\n) #\newline) - ((#\r) #\return) - ((#\t) #\tab) - ((#\v) #\vtab) - ((#\\) #\\) - ((#\') #\')))] + [('char ('escaped-char c)) (resolve-escaped-char c)] + [('char c) (char->integer (string-ref c 0))] [('variable var) (string->symbol var)] + + ;; normalize some binary operators to their wordy equivalent + ;; (which also happens to match better with scheme) + [('operator "&&") 'and] + [('operator "&=") 'and_eq] + [('operator "&") 'bitand] + [('operator "|") 'bitor] + [('operator "!=") 'not_eq] + [('operator "||") 'or] + [('operator "|=") 'or_eq] + [('operator "^") 'xor] + [('operator "^=") 'xor_eq] + ;; Change these names to something scheme can handle better + [('operator ".") 'object-slot] + [('operator "->") 'dereference-slot] [('operator op) (string->symbol op)] + [('prefix-operator op) (case (string->symbol op) + ((!) 'not) + ((~) 'compl) ((*) 'dereference) ((&) 'pointer) ((++) 'pre-increment) ((--) 'pre-decrement) - (else => identity))] + ((-) '-) + (else (scm-error 'c-parse-error "parse-lexeme-tree" + "Unknown prefix operator ~s" + (list op) #f)))] [('postfix-operator op) (case (string->symbol op) [(++) 'post-increment] [(--) 'post-decrement] - [else => identity])] + [else (scm-error 'c-parse-error "parse-lexeme-tree" + "Unknown postfix operator ~s" + (list op) #f)])] ;; Parenthesis grouping - [('group args) + [('group args ...) (parse-lexeme-tree args)] - ;; Atomic item. Used by flatten-infix - [('atom body) - (parse-lexeme-tree body)] - [('prefix op arg) `(,(parse-lexeme-tree op) ,(parse-lexeme-tree arg))] @@ -107,81 +223,204 @@ `(,(parse-lexeme-tree op) ,(parse-lexeme-tree arg))] + + + + + ;; resolved-operator and ternary are the return "types" + ;; of resolve-order-of-operations + [(('resolved-operator op) args ...) + `(,op ,@(map parse-lexeme-tree args))] + + [('ternary a b c) + `(ternary ,(parse-lexeme-tree a) + ,(parse-lexeme-tree b) + ,(parse-lexeme-tree c))] + + + + + ;; Is it OK for literal strings to be "stored" inline? + ;; Or must they be a pointer? + ['string #vu8(0)] + [('string str ...) + (-> (map resolve-string-fragment str) + string-fragments->c-string)] + + ;; implicit concatenation of string literals + [(('string str ...) ...) + (-> (map resolve-string-fragment (concatenate str)) + string-fragments->c-string)] + [('infix args ...) - (resolve-order-of-operations - (flatten-infix (cons 'infix args)))] + (let ((r (resolve-order-of-operations + (flatten-infix (cons 'infix args))))) + (parse-lexeme-tree r))] + [('funcall function ('group arguments)) `(funcall ,(parse-lexeme-tree function) ,(parse-lexeme-tree arguments))] - [bare (scm-error 'c-parse-error - "parse-lexeme-tree" - "Naked literal in lex-tree: ~s" - (list bare) - #f)])) + [(('variable "struct") ('variable value) ..1) + `(struct-type ,@(map string->symbol value)) + ] + + ;; A list of variables. Most likely a type signature + ;; [(('variable value) ..1) + ;; ] + + ;; A typecast with only variables must (?) be a typecast? + [(('group groups ..1) ... value) + (fold-right (lambda (type done) `(as-type ,type ,done)) + (parse-lexeme-tree value) + (map group-body->type groups))] + + ;; Type name resolution? + ;; https://en.wikipedia.org/wiki/C_data_types + ;; base types with spaces: + ;; ======================= + ;; [[un]signed] char + ;; [[un]signed] short [int] + ;; [[un]signed] int + ;; [un]signed [int] + ;; [[un]signed] long [int] + ;; [[un]signed] long long [int] + ;; float + ;; [long] double + + ;; https://en.wikipedia.org/wiki/Type_qualifier + ;; qualifiers + ;; const + ;; volatile + ;; restrict + ;; _Atomic + + + ;; Storage specifiers + ;; auto + ;; register + ;; static + ;; extern + + ;; struct <typename> + ;; enum <typename> + ;; union <typename> + + ;; https://en.wikipedia.org/wiki/C_syntax + ;; int (*ptr_to_array)[100] + + + [(? symbol? bare) + (scm-error 'c-parse-error + "parse-lexeme-tree" + "Naked literal in lex-tree: ~s" + (list bare) + #f)] + + [form + (scm-error 'c-parse-error + "parse-lexeme-tree" + "Unknown form in lex-tree: ~s" + (list form) #f) + ])) ;; https://en.wikipedia.org/wiki/Operators_in_C_and_C%2B%2B +;; https://en.cppreference.com/w/c/language/operator_precedence (define order-of-operations (reverse - (concatenate - ;; This is only for binary operations - `((-> ,(symbol #\.)) - (* / %) - (+ -) - (<< >>) - (< <= > >=) - (== !=) - (&) - (^) - (,(symbol #\|)) - (&&) - (,(symbol #\| #\|)) - (= += -= *= /= %= <<= >>= &= ^= ,(symbol #\| #\=)) - (,(symbol #\,)) - )))) - -(define (mark-other form) - (if (list? form) (cons '*other* form) form)) + ;; This is only for binary operations + `((-> ,(symbol #\.)) + ;; All unary procedures go here, incnluding typecasts, and sizeof + (* / %) + (+ -) + (<< >>) + (< <= > >=) + (== != not_eq) + (& bitand) + (^ xorg) + (,(symbol #\|) bitor) + (&& and) + (,(symbol #\| #\|) or) + (? :) + (= += -= *= /= %= <<= >>= &= ^= ,(symbol #\| #\=) + and_eq or_eq xor_eq) + (,(symbol #\,)) + ))) + +;; a.b->c.d (. (-> (. a b) c) d) +;; 2 * 3 / 4 * 5 => (* (/ (* 2 3) 4) 5) +;; eller => (* 2 (/ 3 4) 5) (define* (resolve-order-of-operations tree optional: (order order-of-operations)) (if (null? order) - (car tree) + (scm-error 'c-parse-error + "resolve-order-of-operations" + "Out of operations to resolve when resolving expression ~s" + (list tree) #f) (match tree - [('*other* body ...) body] - [(form) (resolve-order-of-operations form order)] - [(forms ...) - (match (split-by forms (car order)) - [(group) (resolve-order-of-operations group (cdr order))] - [groups - (cons (car order) - (map (lambda (form) (resolve-order-of-operations - form order-of-operations)) - groups))])] - [a a]))) + [('fixed-infix form) form] + [('fixed-infix forms ...) + (match (split-by-one-of forms (car order)) + [(group) + (resolve-order-of-operations (cons 'fixed-infix group) + (cdr order))] + [(a ('? b ...) (': c ...)) + `(ternary ,(resolve-order-of-operations (cons 'fixed-infix a) (cdr order)) + ,(resolve-order-of-operations (cons 'fixed-infix b) (cdr order)) + ,(resolve-order-of-operations (cons 'fixed-infix c) (cdr order)))] + [(first rest ...) + ;; TODO this is only valid for the associative operators (+, ...) + ;; but not some other (<, ...) + (if (apply eq? (map car rest)) + (let ((op (caar rest))) + `((resolved-operator ,op) + ,@(map (lambda (x) (resolve-order-of-operations (cons 'fixed-infix x) + (cdr order))) + (cons first (map cdr rest))))) + (fold (lambda (item done) + (let ((operator args (car+cdr item))) + `((resolved-operator ,operator) + ,done ,(resolve-order-of-operations + (cons 'fixed-infix args) + (cdr order))))) + (resolve-order-of-operations (cons 'fixed-infix first) + (cdr order)) + rest))])]))) + +;; 1 * 2 / 3 * 4 +;; ⇒ ((1) (* 2) (/ 3) (* 4)) +;; (1) +;; (* (1) 2) +;; (/ (* (1) 2) 3) +;; (* (/ (* (1) 2) 3) 4) ;; Flatens a tree of infix triples. Stops when it should. ;; (parenthesis, function calls, ...) (define (flatten-infix form) - (match form - [('infix left op ('infix right ...)) - (cons* (parse-lexeme-tree left) - (parse-lexeme-tree op) - (flatten-infix (cons 'infix right)))] - - [('infix left op right) - (list (mark-other (parse-lexeme-tree left)) - (parse-lexeme-tree op) - (mark-other (parse-lexeme-tree right)))] - - [other (scm-error 'c-parse-error - "flatten-infix" - "Not an infix tree ~a" - (list other) - #f)])) + (cons 'fixed-infix + (let loop ((form form)) + (match form + [('infix left op ('infix right ...)) + (cons* left + (parse-lexeme-tree op) + (loop (cons 'infix right)))] + + [('infix left op right) + (list left + (parse-lexeme-tree op) + right)] + + [('infix form) form] + + [other (scm-error 'c-parse-error + "flatten-infix" + "Not an infix tree ~a" + (list other) + #f)])))) diff --git a/module/c/parse2.scm b/module/c/parse2.scm new file mode 100644 index 00000000..fad2ffd8 --- /dev/null +++ b/module/c/parse2.scm @@ -0,0 +1,554 @@ +(define-module (c parse2) + :use-module (hnh util) + :use-module (system base lalr)) + +(define (make-parser) + (lalr-parser + (#{out-table:}# "/tmp/c-parser.txt") + (#{output:}# c-parser "/tmp/c-parser.scm") + ;; (#{driver:}# glr) + + + ( + ;; keywords + auto break case char const continue default do double else enum + extern float for goto if inline int long register restrict + return short signed sizeof static struct switch typedef union + unsigned void volatile while _Alignas _Alignof _Atomic _Bool + _Complex _Generic _Imaginary _Noreturn _Static_assert + _Thread_local + + ;; punctuators + ;; 6.4.6 + lbrack rbrack ; [] + lparen rparen ; () + lbrace rbrace ; {} + dot ; . + -> + ++ -- & * + - ~ ! + / % << >> < > <= >= == != ^ + pipe pipe2 ; | and || scheme handles these fine, but not emacs + && + ? : + semicolon + ... + = *= /= %= += -= <<= >>= &= ^= + pipe= ; |= + comma ; , + hash ; # + hash2 ; ## + ;; TODO digraphs + + ;; 6.4 + ;; keyword - already translated + identifier + constant + string-literal + ;; punctuator - already translated + ) + + ;; Primitives + + ;; (identifier) : $1 + ;; (constant) : $1 + ;; (string-literal) : $1 + + + ;; compounds + + (primary-expression + + ;; 6.5.1 + (identifier) + (constant) + (string-literal) + (lparen expression rparen) + (generic-selection)) + + (enumeration-constant + (identifier)) + + + ;; 6.5.1.1 + (generic-selection + (_Generic lparen assignment-expression comma generic-assoc-list)) + + (generic-assoc-list + (generic-association) + (generic-assoc-list comma generic-association)) + + (generic-association + (type-name : assignment-expression) + (default : assignment-expression)) + + ;; 6.5.2 + (postfix-expression + (primary-expression) + (postfix-expression lbrack expression rbrack) + (postfix-expression lparen rparen) + (postfix-expression lparen argument-expression-list rparen) + (postfix-expression dot identifier) + (postfix-expression -> identifier) + (postfix-expression ++) + (postfix-expression --) + (lparen type-name rparen lbrace initializer-list rbrace) + (lparen type-name rparen lbrace initializer-list comma rbrace)) + + (argument-expression-list + (assignment-expression) + (argument-expression-list comma assignment-expression)) + + ;; 6.5.3 + (unary-expression + (postfix-expression) + (++ unary-expression) + (-- unary-expression) + (unary-operator cast-expression) + (sizeof unary-expression) + (sizeof lparen type-name rparen) + (_Alignof lparen type-name rparen)) + + (unary-operator + (&) + (*) + (+) + (-) + (~) + (!)) + + ;; 6.5.4 + (cast-expression + (unary-expression) + (lparen type-name rparen cast-expression)) + + ;; 6.5.5 + (multiplicative-expression + (cast-expression) + (multiplicative-expression * cast-expression) + (multiplicative-expression / cast-expression) + (multiplicative-expression % cast-expression)) + + ;; 6.5.6 + (additive-expression + (multiplicative-expression) + (additive-expression + multiplicative-expression) + (additive-expression - multiplicative-expression)) + + + ;; 6.5.7 + (shift-expression + (additive-expression) + (shift-expression << additive-expression) + (shift-expression >> additive-expression)) + + ;; 6.5.8 + (relational-expression + (shift-expression) + (relational-expression < shift-expression) + (relational-expression > shift-expression) + (relational-expression <= shift-expression) + (relational-expression >= shift-expression)) + + ;; 6.5.9 + (equality-expression + (relational-expression) + (equality-expression == relational-expression) + (equality-expression != relational-expression)) + + + ;; 6.5.10 + (AND-expression + (equality-expression) + (AND-expression & equality-expression)) + + ;; 6.5.11 + (exclusive-OR-expression + (AND-expression) + (exclusive-OR-expression ^ AND-expression)) + + ;; 6.5.12 + (inclusive-OR-expression + (exclusive-OR-expression) + (inclusive-OR-expression pipe exclusive-OR-expression)) + + ;; 6.5.13 + (logical-AND-expression + (inclusive-OR-expression) + (logical-AND-expression && inclusive-OR-expression)) + + ;; 6.5.14 + (logical-OR-expression + (logical-AND-expression) + (logical-OR-expression pipe2 logical-AND-expression)) + + ;; 6.5.15 + (conditional-expression + (logical-OR-expression) + (logical-OR-expression ? expression : conditional-expression)) + + ;; 6.5.16 + (assignment-expression + (conditional-expression) + (unary-expression assignment-operator assignment-expression)) + + (assignment-operator + (=) + (*=) + (/=) + (%=) + (+=) + (-=) + (<<=) + (>>=) + (&=) + (^=) + (pipe=)) + + ;; 6.5.17 + (expression + (assignment-expression) + (expression comma assignment-expression)) + + ;; 6.6 constant expression + (constant-expression + (expression)) + + ;; 6.7 + (declaration + (declaration-specifiers semicolon) + (declaration-specifiers init-declarator-list semicolon) + (static_assert-declaration)) + + (declaration-specifiers + (storage-class-specifier) + (storage-class-specifier declaration-specifiers) + + (type-specifier) + (type-specifier declaration-specifiers) + + (type-qualifier) + (type-qualifier declaration-specifiers) + + (function-specifier) + (function-specifier declaration-specifiers) + + (alignment-specifier) + (alignment-specifier declaration-specifiers)) + + (init-declarator-list + (init-declarator) + (init-declarator-list comma init-declarator)) + + (init-declarator + (declarator) + (declarator = initializer)) + + + ;; 6.7.1 + (storage-class-specifier + (typedef) + (extern) + (static) + (_Thread_local) + (auto) + (register)) + + + ;; 6.7.2 + (type-specifier + (void) + (char) + (short) + (int) + (long) + (float) + (double) + (signed) + (unsigned) + (_Bool) + (_Complex) + (atomic-type-specifier) + (struct-or-union-specifier) + (enum-specifier) + (typedef-name)) + + ;; 6.7.2.1 + (struct-or-union-specifier + (struct-or-union lbrace struct-declaration-list rbrace) + (struct-or-union identifier lbrace struct-declaration-list rbrace) + (struct-or-union identifier)) + + (struct-or-union + (struct) + (union)) + + (struct-declaration-list + (struct-declaration) + (struct-declaration-list struct-declaration)) + + (struct-declaration + (specifier-qualifier-list semicolon) + (specifier-qualifier-list struct-declarator-list semicolon) + (static_assert-declaration)) + + (specifier-qualifier-list + (type-specifier) + (type-specifier specifier-qualifier-list) + + (type-qualifier) + (type-qualifier specifier-qualifier-list)) + + (struct-declarator-list + (struct-declarator) + (struct-declarator-list comma struct-declarator)) + + (struct-declarator + (declarator) + (: constant-expression) + (declarator : constant-expression)) + + ;; 6.7.2.2 + (enum-specifier + (enum identifier lbrace enumerator-list rbrace) + (enum lbrace enumerator-list rbrace) + + (enum identifier lbrace enumerator-list comma rbrace) + (enum lbrace enumerator-list comma rbrace) + + (enum identifier)) + + (enumerator-list + (enumerator) + (enumerator-list comma enumerator)) + + (enumerator + (enumeration-constant) + (enumeration-constant = constant-expression)) + + ;; 6.7.2.4 + (atomic-type-specifier + (_Atomic lparen type-name rparen)) + + ;; 6.7.3 + (type-qualifier + (const) + (restrict) + (volatile) + (_Atomic)) + + ;; 6.7.4 + (function-specifier + (inline) + (_Noreturn)) + + ;; 6.7.5 + (alignment-specifier + (_Alignas lparen type-name rparen) + (_Alignas lparen constant-expression rparen)) + + ;; 6.7.6 + + (declarator + (pointer direct-declarator) + (direct-declarator)) + + (direct-declarator + (identifier) + ( lparen declarator rparen ) + + (direct-declarator lbrack type-qualifier-list assignment-expression rbrack ) + (direct-declarator lbrack assignment-expression rbrack ) + (direct-declarator lbrack type-qualifier-list rbrack ) + (direct-declarator lbrack rbrack ) + + (direct-declarator lbrack static type-qualifier-list assignment-expression rbrack) + (direct-declarator lbrack static assignment-expression rbrack) + + (direct-declarator lbrack type-qualifier-list static assignment-expression rbrack) + + (direct-declarator lbrack type-qualifier-list * rbrack) + (direct-declarator lbrack * rbrack) + + (direct-declarator lparen parameter-type-list rparen ) + (direct-declarator lparen identifier-list rparen ) + (direct-declarator lparen rparen )) + + (pointer + (* type-qualifier-list) + (*) + (* type-qualifier-list pointer) + (* pointer)) + + (type-qualifier-list + (type-qualifier) + (type-qualifier-list type-qualifier)) + + (parameter-type-list + (parameter-list) + (parameter-list comma ...)) + + (parameter-list + (parameter-declaration) + (parameter-list comma parameter-declaration)) + + (parameter-declaration + (declaration-specifiers declarator) + (declaration-specifiers abstract-declarator) + (declaration-specifiers)) + + (identifier-list + (identifier) + (identifier-list comma identifier)) + + ;; 6.7.7 + (type-name + (specifier-qualifier-list) + (specifier-qualifier-list abstract-declarator)) + + (abstract-declarator + (pointer) + (pointer direct-abstract-declarator) + ( direct-abstract-declarator)) + + (direct-abstract-declarator + ( lparen abstract-declarator rparen ) + (direct-abstract-declarator lbrack type-qualifier-list assignment-expression rbrack ) + (direct-abstract-declarator lbrack type-qualifier-list rbrack ) + (direct-abstract-declarator lbrack assignment-expression rbrack ) + ( lbrack rbrack ) + ( lbrack type-qualifier-list assignment-expression rbrack ) + ( lbrack type-qualifier-list rbrack ) + (direct-abstract-declarator lbrack * rbrack) + ( lbrack * rbrack) + (direct-abstract-declarator lparen parameter-type-list rparen ) + (direct-abstract-declarator lparen rparen ) + ( lparen parameter-type-list rparen ) + ( lparen rparen )) + + ;; 6.7.8 + (typedef-name + (identifier)) + + ;; 6.7.9 + (initializer + (assignment-expression) + (lbrace initializer-list rbrace) + (lbrace initializer-list comma rbrace)) + + (initializer-list + (designation initializer) + (initializer) + (initializer-list comma designation initializer) + (initializer-list comma initializer)) + + (designation + (designator-list =)) + + (designator-list + (designator) + (designator-list designator)) + + (designator + (lbrack constant-expression rbrack) + (dot identifier)) + + ;; 6.7.10 + (static_assert-declaration + (_Static_assert lparen constant-expression comma string-literal rparen semicolon)) + + ;; 6.8 + (statement + (labeled-statement) + (compound-statement) + (expression-statement) + (selection-statement) + (iteration-statement) + (jump-statement)) + + ;; 6.8.1 + (labeled-statement + (identifier : statement) + (case constant-expression : statement) + (default : statement)) + + ;; 6.8.2 + (compound-statement + (lbrace block-item-list rbrace) + (lbrace rbrace)) + + (block-item-list + (block-item) + (block-item-list block-item)) + + (block-item + (declaration) + (statement)) + + ;; 6.8.3 + (expression-statement + (expression semicolon) + (semicolon)) + + (selection-statement + (if lparen expression rparen statement) + (if lparen expression rparen statement else statement) + (switch lparen expression rparen statement)) + + ;; 6.8.5 + (iteration-statement + (while lparen expression rparen statement) + (do statement while lparen expression rparen semicolon) + (for lparen expression semicolon expression semicolon expression rparen statement) + (for lparen expression semicolon expression semicolon rparen statement) + (for lparen expression semicolon semicolon expression rparen statement) + (for lparen semicolon semicolon rparen statement) + (for lparen semicolon expression semicolon expression rparen statement) + (for lparen semicolon expression semicolon rparen statement) + (for lparen declaration expression semicolon expression rparen statement) + (for lparen declaration expression semicolon rparen statement) + (for lparen declaration semicolon expression rparen statement) + (for lparen declaration semicolon rparen statement)) + + ;; 6.8.6 + (jump-statement + (goto identifier semicolon) + (continue semicolon) + (break semicolon) + (return expression semicolon) + (return semicolon)) + + + ;; 6.9 + (translation-unit + (external-declaration) + (translation-unit external-declaration)) + + (external-declaration + (function-definition) + (declaration)) + + ;; 6.9.1 + (function-definition + (declaration-specifiers declarator declaration-list compound-statement) + (declaration-specifiers declarator compound-statement)) + + (declaration-list + (declaration) + (declaration-list declaration)))) + + +(define (build-lexical-analyzer tokens) + (let ((tokens tokens)) + (lambda () + (if (null? tokens) + '*eoi* + (begin1 (car tokens) + (set! tokens (cdr tokens))))))) + + +;; (build-lexical-analyzer (list (cons 'string "hello"))) + +(define (error-procedure a b) + (throw 'parse-error a b)) + +;; (parser lexical-analyzer error-procedure) diff --git a/module/c/preprocessor.scm b/module/c/preprocessor.scm new file mode 100644 index 00000000..71712b17 --- /dev/null +++ b/module/c/preprocessor.scm @@ -0,0 +1,370 @@ +(define-module (c preprocessor) + :use-module (srfi srfi-1) + :use-module (srfi srfi-9 gnu) + :use-module (ice-9 rdelim) + :use-module (ice-9 regex) + :use-module (hnh util object) + + :use-module (hnh util) + :use-module (hnh util object) + ) + +(define (read-lines port) + (let loop ((done '())) + (let ((line (read-line port))) + (if (eof-object? line) + (reverse done) + (loop (cons line done)))))) + +;; The source line of a give readen line +(define line-number (make-object-property)) +;; The source file of a given readen line +(define line-file (make-object-property)) + + +(define (mark-with-property! items property property-value) + (for-each (lambda (item) (set! (property item) property-value)) + items)) + +(define trigraph-rx (make-regexp "??([=()/'<>!-])")) +(define (expand-trigraphs line) + (regexp-substitute/global + #f trigraph-rx + line + 'pre (lambda (m) (case (string-ref (match:substring m 1) 1) + ((#\=) "#") + ((#\() "[") + ((#\)) "]") + ((#\/) "\\") + ((#\') "^") + ((#\<) "{") + ((#\>) "}") + ((#\!) "|") + ((#\-) "~"))) + 'post)) + +(define (number-lines lines) + (for-each (lambda (line number) + (set! (line-number line) number)) + lines + (iota (length lines) 1)) + lines) + +;; Should this line be merged with the next +(define (line-continued? line) + (case (string-length line) + ((0) #f) + ((1) (string=? "\\" line)) + (else + (let ((len (string-length line))) + ;; TODO can extra backslashes change this? + (and (char=? #\\ (string-ref line (- len 1))) + (not (char=? #\\ (string-ref line (- len 2))))))))) + +(define (transfer-line-number to from) + (set! (line-number to) (line-number from)) + to) + +;; Merge two lines, assuming that upper ends with a backslash +(define (merge-lines upper lower) + (let ((new-string (string-append (string-drop-right upper 1) lower))) + (transfer-line-number new-string upper) + new-string)) + +(define (fold-lines lines) + (fold-right (lambda (line done) + (if (line-continued? line) + (cons (merge-lines line (car done)) (cdr done)) + (cons line done))) + '() + lines)) + + +(define comment-rx (make-regexp "(//|/[*]|[*]/)")) + +(define (strip-comments lines) + (let loop ((in-comment #f) (lines lines) (done '())) + (if (null? lines) + (reverse done) + (let ((line (car lines))) + (cond ((regexp-exec comment-rx line) + => (lambda (m) + (format (current-output-port) "~s ~s substr = ~s~%" in-comment (line-number line) (match:substring m)) + (cond ((and in-comment (string=? "*/" (match:substring m))) + (loop #f (cons (transfer-line-number (match:suffix m) line) + (cdr lines)) + done)) + (in-comment (loop #t (cdr lines) done)) + ((string=? "//" (match:substring m)) + (loop #f (cdr lines) (cons (transfer-line-number (match:prefix m) line) + done))) + ((string=? "/*" (match:substring m)) + (loop #t (cons (transfer-line-number (match:suffix m) line) (cdr lines)) done)) + (else + (scm-error 'cpp-error "strip-comments" + "Unexpected */ in file ~a on line ~a" + (list (line-file line) (line-number line)) + #f))))) + (else (loop in-comment (cdr lines) (cons line done)))))))) + + +(define-immutable-record-type <preprocessor-directive> + (make-preprocessor-directive type body) + proprocessor-directive? + (type directive-type) + (body directive-body)) + +(define cpp-directive-rx (make-regexp "\\s*#\\s*((\\w+)(.*))?")) +(define (preprocessor-directive? line) + (cond ((regexp-exec cpp-directive-rx line) + => (lambda (m) + (if (match:substring m 2) + (make-preprocessor-directive + (string->symbol (match:substring m 2)) + (string-trim-both (match:substring m 3) char-set:whitespace)) + 'sort-of))) + (else #f))) + +;; defined + +;; TODO _Pragma + + +(define (expand-function-line-macro environment macro . params) + (expand-macro environment (apply-macro macro (map (lambda (param) (expand-macro environment param)) params)))) + +;; (define (environment-ref )) + +(define (list-of? lst predicate) + (every predicate lst)) + + +;; Parantheses when defining macro +(define (parse-parameter-string string) + (map string-trim-both + (string-split (string-trim-both string (char-set #\( #\))) + #\,))) + + +(define-type (object-macro) + (body type: string?)) + +(define-type (function-macro) + (formals type: (list-of? string?)) + (body type: string?)) + + +;; The interesting part +;; environment, (list string) -> (values (list string) (list strings)) +;; multiple lines since since a function-like macro can extend over multiple lines +;; (define (expand-macros environment strings) +;; ) + + +(define (crash-if-not-if body guilty) + (scm-error 'cpp-error guilty + "else, elif, and endif invalid outside if scope: ~s~%file: ~s line: ~s" + (list body (line-file body) (line-number body)))) + +;; (environment, lines) -> environment x lines +(define (parse-directives environment lines) + (let loop ((environment environment) (lines lines) (done '())) + (let* ((line (car line)) + (directive? (preprocessor-directive? line))) + (case directive? + ((#f) ; regular line + (loop environment (cdr lines) + ;; TODO this doesn't work, since parse-macros works on multiple lines + (cons (parse-macros environment (car lines)) done))) + ((sort-of) ; a no-op directive + (loop environment (cdr lines) done)) + (else ; an actual directive + (case (car (cpp-if-status environment)) + ((outside) + (case (directive-type m) + ((ifndef endif else) + (scm-error 'cpp-error "parse-directives" + "Unexpected directive: ~s" + (list line) #f)) + (else ; inside if, ifelse or else + ;; outside active-if inactive-if + (case (directive-type m) + ;; stack ending directives + ((endif) + (case (car (cpp-if-status environment)) + ((outside) (crash-if-not-if (directive-body m) "endif")) + (else (loop (modify environment cpp-if-status cdr) + (cdr lines) + done)))) + + ;; stack nudging directives + ((else) + (case (car (cpp-if-status environment)) + ((outside) (crash-if-not-if (directive-body m) "else")) + (else (loop (modify environment (lens-compose cpp-if-status car*) + (lambda (old) + (case old + ((active-if) 'inactive-if) + ((inactive-if) 'active-if)))) + (cdr lines) + done)))) + ((elif) + (case (car (cpp-if-status environment)) + ((outside) (crash-if-not-if (directive-body m) "elif")) + (else 'TODO ;; TODO + ) + )) + + ;; stack creating directives + ;; on inactive-if each creates a new frame, which also is inactive + ((ifndef) + (case (car (cpp-if-status environment)) + ((inactive-if) (loop (modify environment cpp-if-status + xcons 'inactive-if) + (cdr lines) + done)) + (else (loop (modify environment cpp-if-status + xcons (if (in-environment? environment (directive-body directive?)) + 'inactive-if 'active-if)) + (cdr lines) + done)))) + + ((ifdef) + (case (car (cpp-if-status environment)) + ((inactive-if) (loop (modify environment cpp-if-status + xcons 'inactive-if) + (cdr lines) + done)) + (else + (loop (modify environment cpp-if-status + xcons (if (in-environment? environment (directive-body directive?)) + 'active-if 'inactive-if)) + (cdr lines) + done)))) + + ((if) + (case (car (cpp-if-status environment)) + ((inactive-if) (loop (modify environment cpp-if-status + xcons 'inactive-if) + (cdr lines) + done)) + (else 'TODO ;; TODO + ))) + + + ;; other directives + ((include) (cond ((string-match "[<\"](.*)" + => (lambda (m) + (let ((fileneme (string-drop-right (directive-body m) 1))) + (case (string-ref (match:substring m 1) 0) + ;; TODO include-path + ((#\<) (handle-file environment filename)) + ((#\") (handle-file environment filename)))))) + (else (scm-error 'cpp-error "parse-directives" + "Invalid include" + '() #f))))) + ((define) + ;; TODO what are valid names? + (cond ((string-match "^(\\w+)([(][^)]*[)])?\\s+(.*)" (directive-body directive?)) + => (lambda (m) + (loop (let ((macro-body (string-trim-both (match:substring m 3)))) + (add-identifier! + environment + (match:substring m 1) + (cond ((match:substring m 2) + => (lambda (parameter-string) + (function-macro + formals: (parse-parameter-string parameter-string) + body: macro-body))) + (else (object-macro body: macro-body))))) + (cdr lines) + done))) + (else (scm-error 'cpp-error "parse-directives" + "Invalid #define line, ~s" + (list (directive-body directive?)) + #f)))) + + ((undef) + (case (car (cpp-if-status environment)) + ((inactive-if) (loop environment (cdr lines) done)) + (else (loop (remove-identifier environment (directive-body directive?)) + (cdr lines) + done)))) + + ((line) + (case (car (cpp-if-status environment)) + ((inactive-if) (loop environment (cdr lines) done)) + ;; TODO add first-run parameter to loop, in case expand-macros still return something invalid + (else (let parse-line-directive ((tokens (string-tokenize (directive-body directive?)))) + (cond ((= 1 (length tokens)) + ;; TODO parse token + (if (integer? (car tokens)) + ;; TODO update current line + (loop environment (cdr lines) done) + (parse-line-directive (expand-macros environment (directive-body directive?))))) + ((= 2 (length tokens)) + ;; TODO parse tokens + (if (and (integer? (car tokens)) + (string-literal? (cadr tokens))) + ;; TODO update current line and file + (loop environment (cdr lines) done) + (parse-line-directive (expand-macros environment (directive-body directive?))))) + (else (parse-line-directive (expand-macros environment (directive-body directive?))))))))) + + ((error) + (throw 'cpp-error-directive + (directive-body directive?))) + + ((warning) + (format (current-error-port) "#warning ~a~%" + (directive-body directive?)) + (loop environment (cdr lines) done)) + + ((pragma) + (format (current-error-port) + "PRAGMA: ~s~%" (directive-body directive?)) + (loop environment (cdr lines) done)) + + ((ident sccs) + (format (current-error-port) + "IDENT: ~s~%" (directive-body directive?)) + (loop environment (cdr lines) done)) + + (else + (scm-error 'cpp-error "parse-directives" + "Unknown pre-processor directive: ~s" + (list line) #f) + ))))))))) + )) + + +(define* (writeln expr optional: (port (current-output-port))) + (write expr port) + (newline port)) + +(define (handle-lines environment lines) + (parse-directive environment + (compose strip-comments fold-lines number-lines))) + + ;; parse-directives environment + +;; Return a number of lines +(define (read-file file-path) + (define raw-lines (call-with-input-file file-path read-lines)) + (mark-with-property! line line-file file-path) + (handle-lines raw-lines)) + + +;; pre defined macros +;; see info manual for cpp 3.7.1 Standard Predefined Macros +;; __FILE__ +;; __LINE__ +;; __DATE__ "Feb 12 1996" +;; __TIME__ "23:59:01" +;; __STDC__ 1 +;; __STDC_VERSION__ 201112L +;; __STDC_HOSTED__ 1 + +;; __cplusplus +;; __OBJC__ +;; __ASSEMBLER__ diff --git a/module/c/preprocessor2.scm b/module/c/preprocessor2.scm new file mode 100644 index 00000000..3f9552c5 --- /dev/null +++ b/module/c/preprocessor2.scm @@ -0,0 +1,752 @@ +(define-module (c preprocessor2) + :use-module (srfi srfi-1) + :use-module (srfi srfi-71) + :use-module (srfi srfi-88) + + :use-module (c cpp-environment) + :use-module (c eval2) + :use-module ((c cpp-environment function-like-macro) + :select (function-like-macro variadic? identifier-list)) + :use-module ((c cpp-environment object-like-macro) :select (object-like-macro object-like-macro?)) + :use-module ((c cpp-environment internal-macro) :select (internal-macro)) + :use-module ((hnh util) :select (-> ->> intersperse aif swap unless unval break/all)) + :use-module ((hnh util lens) :select (set modify cdr*)) + :use-module (hnh util path) + :use-module (hnh util type) + :use-module (hnh util object) + :use-module ((hnh util values) :select (abort* on-fst on-snd apply/values)) + :use-module ((c lex2) + :select (lex + placemaker + lexeme? + lexeme-body + lexeme-noexpand + + tokenize + )) + :use-module (c unlex) + :use-module (c cpp-types) + :use-module (c cpp-util) + :export (_Pragma-macro + defined-macro + c-search-path + handle-preprocessing-tokens)) + + +(define (read-file path) + (call-with-input-file path (@ (ice-9 rdelim) read-string))) + + + +(define-syntax-rule (alist-of variable key-type value-type) + (build-validator-body variable (list-of (pair-of key-type value-type)))) + +(define (list-of-length lst n) + (= n (length lst))) + +(define parameter-map? (of-type? (alist-of string? (list-of lexeme?)))) + +(define (concat-token? token) (and (equal? "##" (punctuator-token? token)) + (not (member "##" (lexeme-noexpand token))))) +(define (stringify-token? token) (equal? "#" (punctuator-token? token))) +(define (left-parenthesis-token? token) (equal? "(" (punctuator-token? token))) +(define (right-parenthesis-token? token) (equal? ")" (punctuator-token? token))) +(define (comma-token? token) (equal? "," (punctuator-token? token))) +(define (ellipsis-token? token) (equal? "..." (punctuator-token? token))) + + +;; TODO +;; > #if defined X +;; is equivalent to +;; > #if defined(X) + + +;; parameters is a lexeme list, as returned by parse-parameter-list +(define (build-parameter-map macro parameters) + (typecheck macro cpp-macro?) + (typecheck parameters (list-of (list-of lexeme?))) + (map (lambda (pair) (modify pair cdr* drop-whitespace-both)) + (if (macro-variadic? macro) + (let ((head rest (split-at parameters (length (macro-identifier-list macro))))) + (cons (cons "__VA_ARGS__" (concatenate (intersperse + (lex ",") + rest))) + (map cons (macro-identifier-list macro) head))) + (map cons + (macro-identifier-list macro) + parameters)))) + +(define (expand# macro parameter-map) + (typecheck macro cpp-macro?) + (typecheck parameter-map parameter-map?) + (let loop ((tokens (macro-body macro))) + (cond ((null? tokens) '()) + ((stringify-token? (car tokens)) + (let* ((head rest (car+cdr (drop-whitespace (cdr tokens)))) + (x (identifier-token? head))) + (cond ((assoc-ref parameter-map x) + => (lambda (tokens) + (cons (stringify-tokens tokens) + (loop rest)))) + (else + (scm-error 'macro-expand-error "expand#" + "'#' is not followed by a macro parameter: ~s" + (list x) #f))))) + (else (cons (car tokens) + (loop (cdr tokens))))))) + + +;; 6.10.3.3 +(define (expand## tokens) + ;; (typecheck tokens (list-of lexeme?)) + + (let loop ((left '()) + (right tokens)) + (cond ((null? right) + (reverse left)) + ((concat-token? (car right)) + (let ((l (drop-whitespace left)) + (r (drop-whitespace (cdr right)))) + (cond ((or (null? l) (null? r)) + (scm-error 'cpp-error "expand##" + "## can't be first or last token: ~s" + (list (unlex tokens)) #f)) + ((and (placemaker-token? (car l)) + (placemaker-token? (car r))) + (loop (cdr l) (cons (placemaker) (cdr r)))) + ((placemaker-token? (car l)) + (loop (cdr l) r)) + ((placemaker-token? (car r)) + (loop (cdr l) (cons (car l) (cdr r)))) + (else + ;; 6.10.3.3 p. 3 + ;; I believe that ## is the only special case where the + ;; result of concatenation is differente from the token directly. + (let ((token (concatenate-tokens (car l) (car r)))) + (let ((token (if (concat-token? token) + (modify token lexeme-noexpand xcons "##") + token))) + (loop (cdr l) (cons token (cdr r))))))))) + (else + (let ((pre post (break concat-token? right))) + (loop (append left (reverse pre)) post)))))) + + +(define (check-arity macro parameters) + (if (variadic? macro) + (unless (>= (length parameters) + (length (macro-identifier-list macro))) + (scm-error 'cpp-arity-error "apply-macro" + "Too few arguments to variadic macro ~s, expected at least ~s, got ~s" + (list (macro-identifier macro) + (length (macro-identifier-list macro)) + (length parameters)) + (list macro))) + (unless (or (and (= 0 (length (macro-identifier-list macro))) + (= 1 (length parameters)) + (null? (car parameters))) + (= (length (macro-identifier-list macro)) + (length parameters))) + (scm-error 'cpp-arity-error "apply-macro" + "Wrong number of arguments to macro ~s, expected ~s, got ~s" + (list (macro-identifier macro) + (length (macro-identifier-list macro)) + (length parameters)) + (list macro))))) + +;; expand function like macro +;; parameter is a list of lexeme-lists, each "top level" element matching one +;; argument to the macro +(define (apply-macro environment macro parameters) + (typecheck environment cpp-environment?) + ;; Each element should be the lexeme list for that argument + (typecheck parameters (list-of (list-of lexeme?))) + (typecheck macro cpp-macro?) + (check-arity macro parameters) + + (let () + + (define (resolve-cpp-variables tokens parameter-map) + (define (bound-identifier? id) + (assoc-ref parameter-map id)) + + ;; expand parameters, and place placemaker tokens + (let loop ((tokens tokens) (last #f)) + (cond ((null? tokens) '()) + ((identifier-token? (car tokens)) + bound-identifier? + => (lambda (id) + (let ((replacement (assoc-ref parameter-map id))) + (if (null? replacement) + (cons (placemaker) (loop (cdr tokens) #f)) + ;; macroexpand replacement here! But only if the token isn't used with ## (or #) + (append + (if (or (concat-token? last) + (next-token-matches? concat-token? tokens)) + replacement + ;; resolve-token-stream only modifies environment by updating current line + ;; that can't happen in a macro body + ((unval resolve-token-stream 1) environment replacement once?: #t)) + (loop (cdr tokens) #f)))))) + ((whitespace-token? (car tokens)) + (cons (car tokens) (loop (cdr tokens) last))) + (else (cons (car tokens) (loop (cdr tokens) (car tokens))))))) + + + (define parameter-map (build-parameter-map macro parameters)) + (remove placemaker-token? + (-> macro + (expand# parameter-map) + (resolve-cpp-variables parameter-map) + expand##)))) + + + +;; Expand object-like macro + +;; #define VALUE 10 +;; #define str(x) #x +;; #define OTHER str(VALUE) +;; OTHER +;; ⇒ "VALUE" + +;; remaining-tokens should be the token stream just after the name of the macro +(define (expand-macro environment macro noexpand-list remaining-tokens) + (typecheck environment cpp-environment?) + (typecheck macro cpp-macro?) + ;; (typecheck remaining-tokens (list-of lexeme?)) + (typecheck noexpand-list (list-of string?)) + + (let ((name (macro-identifier macro))) + (cond ((object-macro? macro) + (values environment (append (fold (swap mark-noexpand) + (expand## (macro-body macro)) + (cons name noexpand-list)) + remaining-tokens))) + + ((function-macro? macro) + (if (next-token-matches? left-parenthesis-token? remaining-tokens) + (let ((containing remaining newlines (parse-parameter-list remaining-tokens))) + (values (bump-line environment newlines) + (append (fold (swap mark-noexpand) + (apply-macro environment macro containing) + (cons name noexpand-list)) + remaining))) + (values environment + ;; TODO#1 the token shouldn't be expanded here, but it should neither be marked no-expand? + ;; Consider the case + ;; #define m(a) a(0,1) + ;; #define f(a) f(2 * (a)) + ;; m(f) + (append (mark-noexpand (lex (macro-identifier macro)) (macro-identifier macro)) + remaining-tokens)))) + + ((internal-macro? macro) + (if (next-token-matches? left-parenthesis-token? remaining-tokens) + (let ((containing remaining newlines (parse-parameter-list remaining-tokens))) + (let ((env* tokens* ((macro-body macro) environment containing))) + (values (bump-line env* newlines) + (append (fold (swap mark-noexpand) + tokens* + (cons name noexpand-list)) + remaining)))) + (values environment + (append (mark-noexpand (lex (macro-identifier macro)) (macro-identifier macro)) + remaining-tokens)))) + + (else + (scm-error 'wrong-type-arg "expand-macro" + "Macro isn't a macro: ~s" + (list macro) #f))))) + + + +(define-type (parenthesis-group) + (parenthesis-group-tokens + type: (list-of (or lexeme? parenthesis-group?)))) + +(define (make-parenthesis-group tokens) + (parenthesis-group parenthesis-group-tokens: tokens)) + + +(define (flatten-group tokens) + (cond ((null? tokens) '()) + ((lexeme? (car tokens)) + (cons (car tokens) (flatten-group (cdr tokens)))) + ((parenthesis-group? (car tokens)) + (append (lex "(") + (flatten-group (parenthesis-group-tokens (car tokens))) + (lex ")") + (flatten-group (cdr tokens)))))) + + +;; Takes a list of preprocessing tokens, and returns three values +;; - if the last token was '...' +;; - a list of strings of all token names +;; - the remaining tokens +;; Note that this is ONLY #define f(x) forms +;; not usage forms +(define (parse-identifier-list tokens) + ;; (typecheck tokens (list-of lexeme?)) + (let* ((group remaining (parse-group (drop-whitespace tokens))) + (groups (reverse (map drop-whitespace-both + (break/all comma-token? (parenthesis-group-tokens group)))))) + ;; Checks that there where no nested parenthesis + (cond ((equal? '(()) groups) + (values #f '() remaining)) + (else + (typecheck groups (list-of (and (list-of-length 1) + (list-of lexeme?)))) + + (let ((variadic? groups (if (ellipsis-token? (caar groups)) + (values #t (cdr groups)) + (values #f groups)))) + (values + variadic? + (map (lambda (x) (or (identifier-token? x) + (scm-error 'cpp-error "parse-identifier-list" + "Unexpected preprocessing-token in identifier list: ~s" + (list x) #f))) + (map car (reverse groups))) + remaining)))))) + + + +(define (newline-count group) + (let loop ((tokens (parenthesis-group-tokens group))) + (fold (lambda (item nls) + (+ nls + (cond ((newline-token? item) 1) + ((parenthesis-group? item) (newline-count item)) + (else 0)))) + 0 tokens))) + +;; tokens ⇒ parenthesis-group, remaining-tokens +(define (parse-group tokens) + (typecheck tokens (not null?)) + (typecheck (car tokens) left-parenthesis-token?) + + (let loop ((stack '()) (remaining tokens)) + (cond ((and (not (null? stack)) + (null? (cdr stack)) + (car stack)) + parenthesis-group? + => (lambda (item) (values item remaining))) + ((null? remaining) + (scm-error 'misc-error "parse-group" + "Ran out of tokens while parsing: ~s (stack: ~s)" + (list (unlex tokens) stack) #f)) + (else + (let ((token remaining (car+cdr remaining))) + (loop (cond ((right-parenthesis-token? token) + (let ((group rest (break left-parenthesis-token? stack))) + (cons (make-parenthesis-group (reverse group)) + ;; Remove left-parenthesis + (cdr rest)))) + (else (cons token stack))) + remaining)))))) + + +;; returns three values: +;; - a list of tokens where each is a parameter to the function like macro +;; - the remaining tokenstream +;; - how many newlines were encountered +;; The standard might call these "replacement lists" +;; Note that each returned token-list might have padding whitespace which should be trimmed. +;; It's kept to allow __VA_ARGS__ to "remember" its whitespace +(define (parse-parameter-list tokens) + (let ((group remaining (parse-group (drop-whitespace tokens)))) + ;; Checks that no inner groups where here + ;; (typecheck tokens (list-of lexeme?)) + (values (map flatten-group + (break/all comma-token? (parenthesis-group-tokens group))) + remaining + (newline-count group)))) + + +;; Add __FILE__ and __LINE__ object macros to the environment +(define (join-file-line environment) + (extend-environment + environment + ;; 6.10.8 + (list + (object-like-macro + identifier: "__FILE__" + body: (lex (format #f "~s" (current-file environment)))) + (object-like-macro + identifier: "__LINE__" + body: (lex (number->string (current-line environment))))))) + + +(define defined-macro + (internal-macro + identifier: "defined" + body: (lambda (environment arguments) + (typecheck arguments (and (list-of (list-of lexeme?)) + (not null?))) + (aif (identifier-token? (car (list-ref arguments 0))) + (let ((in-env (boolean->c-boolean (in-environment? environment it)))) + (values environment (lex (number->string in-env)))) + (scm-error 'cpp-error "defined" + "Invalid parameter list to `defined': ~s" + (list arguments) #f))))) + +(define _Pragma-macro + (internal-macro + identifier: "_Pragma" + body: (lambda (environment arguments) + (typecheck arguments (and (list-of (list-of lexeme?)) + (not null?))) + (cond ((string-token? (caar arguments)) + (lambda (a . _) a) + ;; TODO handle rest + => (lambda (encoding it . rest) + (values (handle-pragma environment (lex it)) + '()))) + (else (scm-error 'cpp-pragma-error "_Pragma" + "Invalid argument to _Pragma: ~s" + (list (car arguments)) #f)))))) + + + +;; environment, tokens → environment +(define (handle-pragma environment tokens) + (typecheck environment cpp-environment?) + ;; (typecheck tokens (list-of lexeme?)) + + (let ((err (lambda () + (scm-error 'cpp-pragma-error "handle-pragma" + "Invalid pragma directive: ~a" + (list (unlex tokens)) #f)))) + + (cond ((null? tokens) (err)) + ((equal? "STDC" (identifier-token? (car tokens))) + (call-with-values (lambda () (apply values (filter identifier-token? (cdr tokens)))) + (case-lambda ((identifier on-off-switch) + (format (current-output-port) + "#Pragma STDC ~a ~a" + (unlex (list identifier)) + (unlex (list on-off-switch))) + environment) + (_ (err))))) + (else + (format (current-output-port) + "Non-standard #Pragma: ~a" + (unlex tokens)) + environment)))) + + +;; 6.10.1 p. 4 +(define (resolve-constant-expression cpp-tokens) + ;; (typecheck tokens (list-of lexeme?)) + (define zero (car (lex "0"))) + #; + (define tokens + (map preprocessing-token->token + (map (lambda (token) + (cond ((identifier-token? token) zero) + (else token))) + (remove whitespace-token? tokens)))) + + 'TODO + ;; eval as per 6.6 + ) + + + +(define (mark-noexpand1 token name) + (modify token lexeme-noexpand xcons name)) + +(define (mark-noexpand tokens name) + ;; (typecheck tokens (list-of lexeme?)) + ;; (typecheck name string?) + (map (lambda (token) (mark-noexpand1 token name)) tokens)) + +(define (marked-noexpand? token) + (cond ((identifier-token? token) + => (lambda (id) (member id (lexeme-noexpand token)))) + (else #f))) + +;; Expands a token-stream which contains no pre-processing directives (#if:s, ...) +;; If @var{once?} is true then the resulting body won't be scanned again for tokens to expand +;; environment, tokens, [boolean] → environment, tokens +(define* (resolve-token-stream environment tokens key: once?) + (typecheck environment cpp-environment?) + ;; (typecheck tokens (list-of lexeme?)) + ;; (pprint-environment environment) + ;; (format (current-error-port) "~a~%~%" (unlex tokens)) + (let loop ((environment environment) (tokens tokens)) + (cond ((null? tokens) (values environment '())) + ((newline-token? (car tokens)) + (on-snd (cons (car tokens) (abort* (loop (bump-line environment) (cdr tokens)))))) + ((and (identifier-token? (car tokens)) + (not (marked-noexpand? (car tokens)))) + ;; Here is the loop after expansion + (apply/values (if once? values loop) + (maybe-extend-identifier environment + (identifier-token? (car tokens)) + (lexeme-noexpand (car tokens)) + (cdr tokens)))) + (else (on-snd (cons (car tokens) (abort* (loop environment (cdr tokens))))))))) + + + +;; returns a new environment +;; handle body of #if +;; environment, (list token) → environment +(define (resolve-for-if environment tokens) + (typecheck environment cpp-environment?) + ;; (typecheck tokens (list-of lexeme?)) + + (-> (extend-environment environment (list defined-macro)) + ;; no newlines in #if line + ((unval resolve-token-stream 1) tokens) + resolve-constant-expression + c-boolean->boolean + (if (enter-active-if environment) + (enter-inactive-if environment)))) + +;; environment, string, (list token) → environment, (list token) +(define (maybe-extend-identifier environment identifier noexpand-list remaining-tokens) + (typecheck environment cpp-environment?) + (typecheck identifier string?) + ;; (typecheck remaining-tokens (list-of lexeme?)) + (typecheck noexpand-list (list-of string?)) + (cond ((get-identifier (join-file-line environment) identifier) + => (lambda (value) + (expand-macro (join-file-line environment) + value + noexpand-list + remaining-tokens))) + (else ; It wasn't an identifier, leave it as is + (values environment + (append (mark-noexpand (lex identifier) + identifier) + remaining-tokens))))) + +;; 'gcc -xc -E -v /dev/null' prints GCC:s search path +(define c-search-path + (make-parameter (list "/usr/include" + "/usr/local/include"))) + +;; #include <stdio.h> +(define (resolve-h-file string) + (typecheck string string?) + (cond + ;; NOTE do I want this case? + ;; GCC has it + ((path-absolute? string) string) + (else + (or + (find file-exists? + (map (lambda (path-prefix) + (path-append path-prefix string)) + (c-search-path))) + (scm-error 'cpp-error "resolve-h-file" + "Can't resolve file: ~s" + (list string) #f))))) + +;; #include "myheader.h" +(define (resolve-q-file string) + (typecheck string string?) + (cond ((file-exists? string) string) + ;; This should always be a fallback (6.10.2, p. 3) + (else (resolve-h-file string)))) + + +(define (resolve-header environment tokens) + (typecheck environment cpp-environment?) + ;; (typecheck tokens (list-of lexeme?)) + + (let ((err (lambda (msg . args) + (scm-error 'cpp-error "resolve-and-include-header" + (string-append msg ", tokens: ~s") + (append args (list (unlex tokens))) #f)))) + (let loop ((%first-time #t) (tokens tokens)) + (cond ((null? tokens) (err "Invalid #include line")) + ((h-string-token? (car tokens)) + => (lambda (str) + (unless (null? (drop-whitespace (cdr tokens))) + (err "Unexpected tokens after #include <>")) + (resolve-h-file str))) + ((q-string-token? (car tokens)) + => (lambda (str) + (unless (null? (drop-whitespace (cdr tokens))) + (err "Unexpected tokens after #include \"\"")) + (resolve-q-file str))) + (else + (unless %first-time (err "Failed parsing tokens")) + ;; No newlines in #include + (loop #f ((unval resolve-token-stream 1) environment tokens))))))) + +;; environment, tokens → environment +(define (handle-line-directive environment tokens*) + (typecheck environment cpp-environment?) + ;; (typecheck tokens* (list-of lexeme?)) + + (let ((err (lambda () (scm-error 'cpp-error "handle-line-directive" + "Invalid line directive: ~s" + (list tokens*) #f)))) + (let loop ((%first-time #t) (tokens tokens*)) + (cond ((null? tokens)) + ((pp-number? (car tokens)) + => (lambda (line) + (let ((line (string->number line)) + (remaining (drop-whitespace (cdr tokens)))) + (cond ((null? remaining) (set environment current-line (1- line))) + ((string-token? (car remaining)) + (lambda (a . _) a) + => (lambda (encoding . fragments) + (-> environment + (set current-line (1- line)) + ;; TODO properly join string + (set current-file (car fragments))))) + ;; no newlines in #line + (%first-time (loop #f ((unval resolve-token-stream 1) environment tokens))) + (else (err)))))) + ;; no newlines in #line + (%first-time (loop #f ((unval resolve-token-stream 1) environment tokens))) + (else (err)))))) + +;; environment, tokens → environment +(define (resolve-define environment tokens) + (typecheck environment cpp-environment?) + ;; (typecheck tokens (list-of lexeme?)) + + (let ((identifier (identifier-token? (car tokens))) + (tail (cdr tokens))) + (-> environment + bump-line + (add-identifier + identifier + (cond ((and (not (null? tail)) + (left-parenthesis-token? (car tail))) + ;; function like macro + (let ((variadic? identifiers replacement-list + (parse-identifier-list tail))) + (function-like-macro + identifier: identifier + variadic?: variadic? + identifier-list: identifiers + ;; surrounding whitespace is not part of the replacement list + ;; (6.10.3 p.7) + body: (drop-whitespace-both replacement-list)))) + (else (object-like-macro + identifier: identifier + body: (drop-whitespace-both tail)))))))) + + + + +;; environment, tokens -> environment, tokens +(define (handle-preprocessing-tokens environment tokens) + ;; Prepend a newline to ensure that the token stream always starts with a + ;; newline (otherwise guaranteed by how we loop). Decrement line-counter + ;; by one to compensate. + (let loop ((environment (bump-line environment -1)) + (tokens (append (lex "\n") tokens))) + + (define (err fmt . args) + (scm-error 'cpp-error "handle-preprocessing-tokens" + (string-append "~a:~a " fmt) + (cons* (current-file environment) + (current-line environment) + args) + #f)) + + (cond ((null? tokens) (values environment '())) + ((newline-token? (car tokens)) + (let ((environment (bump-line environment)) + (tokens* (drop-whitespace (cdr tokens)))) + (cond ((null? tokens*) (values environment '())) + ((equal? "#" (punctuator-token? (car tokens*))) + (let ((line-tokens remaining-tokens (tokens-until-eol (cdr tokens*)))) + ;; drop whitespace after to not "eat" the next newline token + (let ((line-tokens (drop-whitespace line-tokens))) + (cond ((null? line-tokens) + ;; null directive + (loop environment remaining-tokens)) + + ((in-conditional/inactive? environment) + (case (string->symbol (identifier-token? (car line-tokens))) + ((ifdef if) (loop (enter-inactive-if environment) remaining-tokens)) + ((else) (loop (flip-flop-if environment) remaining-tokens)) + ((endif) (loop (leave-if environment) remaining-tokens)) + ((elif) (loop (resolve-for-if + (leave-if environment) + (drop-whitespace (cdr line-tokens))) + remaining-tokens)) + (else (loop environment remaining-tokens)))) + + ;; From here on we are not in a comment block + (else + (let ((directive (string->symbol (identifier-token? (car line-tokens)))) + (body (drop-whitespace (cdr line-tokens)))) + (if (eq? 'include directive) + ;; include is special since it returns a token stream + (let ((path (resolve-header environment body))) + ;; TODO change to store source location in lexemes + ;; and rewrite the following to + ;; (loop environment + ;; (append (-> path read-file tokenize) remaining-tokens)) + ;; TODO and then transfer these source locations when we move + ;; to "real" tokens (c to-token) + (let ((env* tokens* + (loop + ;; same hack as at start of loop + (-> environment + (enter-file path) + (bump-line -1)) + (append (lex "\n") + (-> path read-file tokenize))))) + (on-snd (append tokens* (abort* (loop (leave-file env*) remaining-tokens)))))) + + (let ((operation ; (environment, list token) → environment + (case directive + ((if) resolve-for-if) + ((ifdef) + (lambda (env body) + ((if (in-environment? env (identifier-token? (car body))) + enter-active-if enter-inactive-if) + env))) + ((ifndef) + (lambda (env body) + ((if (in-environment? env (identifier-token? (car body))) + enter-inactive-if enter-active-if) + env))) + ;; NOTE possibly validate that body is empty for endif and else + ((endif) (lambda (env _) + (unless (in-conditional? env) + (err "#endif outside conditional")) + (leave-if env))) + ((else) (lambda (env _) + (unless (in-conditional? env) + (err "#else outside conditional")) + (flip-flop-if env))) + ;; ((elif) (lambda )) + ((define) resolve-define) + ((undef) (lambda (env body) (remove-identifier env (identifier-token? (car body))))) + ((line) handle-line-directive) + ((error) (lambda (_ tokens) + (throw 'cpp-error-directive (unlex tokens)))) + ((pragma) handle-pragma) + (else (err "Unknown preprocessing directive: ~s" + (list line-tokens)))))) + (loop (operation environment body) + remaining-tokens))))))))) + + ;; Line is not a pre-processing directive + (else (let ((preceding-tokens remaining-tokens (tokens-until-cpp-directive (cdr tokens)))) + (let* ((env* resolved-tokens (if (in-conditional/inactive? environment) + (values environment '()) + (resolve-token-stream environment preceding-tokens)))) + (on-snd (append resolved-tokens + ;; The initial newline is presreved here, for better output, + ;; and to keep at least one whitespace token when there was one previously. + ;; possibly also keep a newline for line-directives. + (unless (null? remaining-tokens) (lex "\n")) + (abort* (loop env* remaining-tokens)))))))))) + + (else (err "Unexpected middle of line, (near ~s)" + (unlex tokens)))))) + diff --git a/module/c/to-token.scm b/module/c/to-token.scm new file mode 100644 index 00000000..f5e459de --- /dev/null +++ b/module/c/to-token.scm @@ -0,0 +1,161 @@ +(define-module (c to-token) + :use-module ((srfi srfi-1) :select (fold append-map)) + :use-module (ice-9 match) + :use-module ((hnh util) :select (->)) + :use-module ((system base lalr) + :select (make-lexical-token)) + :use-module (c cpp-types) + :use-module ((c lex2) :select (parse-c-number)) + ;; :use-module (hnh util type) + :use-module ((rnrs bytevectors) + :select (make-bytevector + bytevector-length + bytevector-copy! + u8-list->bytevector + bytevector-u8-ref)) + :use-module ((rnrs io ports) + :select (string->bytevector + make-transcoder + utf-8-codec)) + :export (preprocessing-token->token)) + +(define (pp-number->c-number token) + (parse-c-number (pp-number? token))) + +(define keywords + '(auto break case char const continue default + do double else enum extern float for goto + if inline int long register restrict return + short signed sizeof static struct switch + typedef union unsigned void volatile while + _Alignas _Alignof _Atomic _Bool _Complex + _Generic _Imaginary _Noreturn _Static_assert + _Thread_local)) + +(define (cpp-char->bytevector c) + (match c + (`(escape-sequence (simple-escape-sequence ,x)) + (case (string-ref x 0) + ((#\a) #vu8(#x07)) ; #\alarm + ((#\b) #vu8(#x08)) ; #\backspace + ((#\f) #vu8(#x0C)) ; #\page + ((#\n) #vu8(#x0A)) ; #\newline + ((#\r) #vu8(#x0D)) ; #\return + ((#\t) #vu8(#x09)) ; #\tab + ((#\v) #vu8(#x0B)) ; #\vtab + ;; ' " ? \ + (else (char->integer (string-ref x 0))))) + + ;; TODO these u8-list->bytevector should depend on the + ;; encoding prefix of the string/char + (`(escape-sequence (octal-escape-sequence ,x)) + (-> x (string->number 8) list u8-list->bytevector)) + + (`(escape-sequence (hexadecimal-escape-sequence ,x)) + (-> x (string->number 16) list u8-list->bytevector)) + + (`(escape-sequence (universal-character-name ,x)) + (let ((n (string->number x 16))) + (when (<= #xD800 x #xDFFF) + (error)) + (when (and (< x #xA0) + (or (not (= x #x24)) + (not (= x #x40)) + (not (= x #x60)))) + (error)) + (-> n + integer->char + string + (string->bytevector (make-transcoder (utf-8-codec)))))) + (_ (error)))) + +(define (concat-bytevectors bvs) + (define target (make-bytevector (apply + (map bytevector-length bvs)))) + (fold (lambda (bv offset) + (let ((len (bytevector-length bv))) + (bytevector-copy! bv 0 target offset len) + (+ offset len))) + 0 + bvs) + target) + +(define (handle-string-fragments content) + (map + (lambda (x) (if (string? x) + (string->bytevector x (make-transcoder (utf-8-codec))) + (cpp-char->bytevector x))) + content)) + +;; 6.4 paragraph 2 +;; Each preprocessing toket thas is converted to a token shall have the +;; lexcal form of a keyword, an identifier, a constant, a string literal, +;; or a puncturtor +(define (preprocessing-token->token cpp-token) + ;; Guile's cond handles multiple from expr, if written on the form + ;; (cond (expr check => proc) ...) + (cond ((string-token? cpp-token) + (lambda (a . _) a) + => (lambda (encoding . content) + (make-lexical-token + 'string-literal #f + (concat-bytevectors + (append + ;; TODO this should depend on encoding + (handle-string-fragments content) + (list #vu8(0))))))) + + ((identifier-token? cpp-token) + => (lambda (name) + (let ((name (string->symbol name))) + (if (memv name keywords) + name + (make-lexical-token 'identifier #f name))))) + + ((pp-number? cpp-token) + => (lambda (content) + ;; TOOD should return an integer-constant or a floating-constant + (make-lexical-token 'constant #f (parse-c-number content)))) + + ((character-constant? cpp-token) + (lambda (a . _) a) + => (lambda (encoding . content) + (make-lexical-token + 'constant #f + ;; TODO that to do with multi-byte characters? + ;; > 'ab' == 'a' << 8 | 'b' == 0x6162 + ;; > '\x1234' == 0x1234 + ;; GCC prints 34 for the following expression + ;; > printf("%x\n", '\x1234'); + ;; but 6162 for this + ;; > printf("%x\n", 'ab'); + + ;; What about + ;; > 'a\x1234' == a << 16 | 'b' == 0x611234 + (let ((bv (concat-bytevectors + ;; TODO this should depend on encoding + (handle-string-fragments content)))) + ;; TODO maybe actually store multiple bytes from multi byte literals + (bytevector-u8-ref bv (1- (bytevector-length bv))))))) + + ((punctuator-token? cpp-token) + => (lambda (s) + (cond ((string=? s "{") 'lbrace) + ((string=? s "}") 'rbrace) + ((string=? s "[") 'lbrack) + ((string=? s "]") 'rbrack) + ((string=? s "(") 'lparen) + ((string=? s ")") 'rparen) + ((string=? s ".") 'dot) + ((string=? s "|") 'pipe) + ((string=? s "||") 'pipe2) + ((string=? s ";") 'semicolon) + ((string=? s "|=") 'pipe=) + ((string=? s ",") 'comma) + ((string=? s "#") 'hash) + ((string=? s "##") 'hash2) + (else (string->symbol s))))) + + (else + (scm-error 'cpp-error "preprocessing-token->token" + "Can't convert ~s into a \"regular\" token." + (list cpp-token) #f)))) diff --git a/module/c/trigraph.scm b/module/c/trigraph.scm new file mode 100644 index 00000000..197e01a4 --- /dev/null +++ b/module/c/trigraph.scm @@ -0,0 +1,24 @@ +(define-module (c trigraph) + :use-module (ice-9 regex) + :export (replace-trigraphs)) + +(define rx (make-regexp "\\?\\?([=\\(\\)'!<>/-])")) + +(define (proc m) + (case (string-ref (match:substring m 2) 0) + ((#\=) "#") + ((#\() "[") + ((#\)) "]") + ((#\') "^") + ((#\<) "{") + ((#\>) "}") + ((#\!) "|") + ((#\-) "~") + ((#\/) "\\"))) + +(define (replace-trigraphs string) + (call-with-output-string + (lambda (port) + (regexp-substitute/global + port rx string + 'pre proc 'post)))) diff --git a/module/c/unlex.scm b/module/c/unlex.scm new file mode 100644 index 00000000..e467a50f --- /dev/null +++ b/module/c/unlex.scm @@ -0,0 +1,84 @@ +(define-module (c unlex) + :use-module (hnh util type) + :use-module (ice-9 match) + :use-module (c lex2) + :use-module (c cpp-types) + :use-module (c cpp-util) + :use-module ((texinfo string-utils) :select (escape-special-chars)) + :export (unlex + unlex-aggressive + stringify-token + stringify-tokens)) + +(define (unlex tokens) + (typecheck tokens (list-of lexeme?)) + (string-concatenate + (map (lambda (x) (cond (x preprocessing-token? => stringify-token) + ((whitespace-token? x) (lexeme-body x)) + ((other-token? x) (lexeme-body x)))) + tokens))) + +;; takes a list of preprocessing-token's, and return a "source" string +(define (unlex-aggressive tokens) + (typecheck tokens (list-of lexeme?)) + (string-concatenate + (map (lambda (x) + (cond ((preprocessing-token? x) (stringify-token x)) + ((whitespace-token? x) " ") + ((other-token? x) (lexeme-body x)))) + (squeeze-whitespace tokens)))) + +(define (stringify-escape-sequence sub-token) + (match sub-token + (`(simple-escape-sequence ,x) + (format #f "\\~a" x)) + (`(octal-escape-sequence ,x) + (format #f "\\~a" x)) + (`(hexadecimal-escape-sequence ,x) + (format #f "\\x~a" x)) + (`(universal-character-name ,x) + (case (string-length x) + ((4) (format #f "\\u~a" x)) + ((8) (format #f "\\U~a" x)))))) + +(define (stringify-string-tokens fragments) + (with-output-to-string + (lambda () + (display #\") + (for-each (match-lambda + (`(escape-sequence ,x) + (display (stringify-escape-sequence x))) + ;; Backslash in source strings is usually encoded by an + ;; 'escape-sequence, but literal backslashes can be in + ;; "regular" string fragments as result of the stringification + ;; operator (#). + (s (display (escape-special-chars s "\"\\" #\\)))) + fragments) + (display #\")))) + +;; Returns the "source" of the token, as a preprocessing string literal token +(define (stringify-token preprocessing-token) + (match (lexeme-body preprocessing-token) + (('string-literal `(encoding-prefix . ,prefix) parts ...) + (stringify-string-tokens parts)) + + (`(header-name (q-string ,s)) + (format #f "~s" s)) + + (`(header-name (h-string ,s)) + (format #f "<~a>" s)) + + (`(identifier ,id) id) + + (`(pp-number ,n) n) + + ;; TODO remaining parts + (('character-constant `(character-encoding . ,x) c parts ...) + (format #f "'~a'" c)) + + (`(punctuator ,p) p))) + +;; takes a token list, and return a single string literal token +(define (stringify-tokens tokens) + (lexeme type: 'preprocessing-token + body: `(string-literal (encoding-prefix) ,(unlex-aggressive tokens)))) diff --git a/module/c/zipper.scm b/module/c/zipper.scm new file mode 100644 index 00000000..65cea211 --- /dev/null +++ b/module/c/zipper.scm @@ -0,0 +1,60 @@ +;;; Commentary: +;; Zipper data structure. Could be moved to (hnh util), but would then need to +;; be at least slightly more thorough. +;;; Code: + +(define-module (c zipper) + :use-module (srfi srfi-88) + :use-module (hnh util object) + :export (list-zipper + list-zipper? + left focused right + zip-left + zip-right + zip-find-right + list->zipper + zipper->list + rezip)) + +(define-type (list-zipper) + (left type: list?) + focused + (right type: list?)) + +;; Move zipper one step to the left +(define (zip-left zipper) + (if (null? (left zipper)) + zipper + (list-zipper left: (cdr (left zipper)) + right: (cons (focused zipper) (right zipper)) + focused: (car (left zipper))))) + +;; Move zipper one step to the right +(define (zip-right zipper) + (if (null? (right zipper)) + zipper + (list-zipper left: (cons (focused zipper) (left zipper)) + right: (cdr (right zipper)) + focused: (car (right zipper))))) + +;; find first element matching predicate, going right +(define (zip-find-right predicate zipper) + (cond ((null? (right zipper)) zipper) + ((predicate (focused zipper)) zipper) + (else (zip-find-right predicate (zip-right zipper))))) + +(define (list->zipper list) + (list-zipper left: '() + focused: (car list) + right: (cdr list))) + + +(define (rezip zipper) + (if (null? (left zipper)) + zipper + (rezip (zip-left zipper)))) + +(define (zipper->list zipper) + (let ((z (rezip zipper))) + (cons (focused z) + (right z)))) |