diff options
Diffstat (limited to 'module')
-rw-r--r-- | module/c/compiler.scm | 66 | ||||
-rw-r--r-- | module/c/cpp-environment.scm | 144 | ||||
-rw-r--r-- | module/c/cpp-environment/function-like-macro.scm | 18 | ||||
-rw-r--r-- | module/c/cpp-environment/internal-macro.scm | 11 | ||||
-rw-r--r-- | module/c/cpp-environment/object-like-macro.scm | 13 | ||||
-rw-r--r-- | module/c/cpp.scm | 95 | ||||
-rw-r--r-- | module/c/eval.scm | 265 | ||||
-rw-r--r-- | module/c/eval/environment.scm | 34 | ||||
-rw-r--r-- | module/c/eval2.scm | 20 | ||||
-rw-r--r-- | module/c/lex.scm | 52 | ||||
-rw-r--r-- | module/c/lex2.scm | 326 | ||||
-rw-r--r-- | module/c/line-fold.scm | 29 | ||||
-rw-r--r-- | module/c/operators.scm | 3 | ||||
-rw-r--r-- | module/c/parse.scm | 411 | ||||
-rw-r--r-- | module/c/preprocessor.scm | 370 | ||||
-rw-r--r-- | module/c/preprocessor2.scm | 590 | ||||
-rw-r--r-- | module/c/trigraph.scm | 24 | ||||
-rw-r--r-- | module/hnh/util.scm | 25 | ||||
-rw-r--r-- | module/hnh/util/path.scm | 3 | ||||
-rw-r--r-- | module/srfi/srfi-64/util.scm | 11 |
20 files changed, 2372 insertions, 138 deletions
diff --git a/module/c/compiler.scm b/module/c/compiler.scm new file mode 100644 index 00000000..801c3752 --- /dev/null +++ b/module/c/compiler.scm @@ -0,0 +1,66 @@ +(define-module (c compiler) + :use-module ((c lex2) :select (lex)) + :use-module ((c trigraph) :select (replace-trigraphs)) + :use-module ((c line-fold) :select (fold-lines)) + :use-module (c cpp-environment object-like-macro) + :use-module ((c cpp-environment) + :select (make-environment + extend-environment + enter-file)) + :use-module (hnh util) + ;; TODO importort + ;; handle-preprocessing-tokens + ;; load-and-tokenize-file + :export (run-compiler)) + +" +#define __STDC__ 1 +#define __STDC_HOSTED__ 1 +#define __STDC_VERSION__ 201112L +" + +(define now (localtime (current-time))) +(define default-macros + (list + ;; 6.10.8 + (object-like-macro + identifier: "__STDC__" + body: '(preprocessing-token (pp-number "1"))) + (object-like-macro + identifier: "__STDC_HOSTED__" + body: '(preprocessing-token (pp-number "1"))) + (object-like-macro + identifier: "__STDC_VERSION__" + body: '(preprocessing-token (pp-number "201112L"))) + (object-like-macro + identifier: "__DATE__" + ;; TODO format should always be in + ;; english, and not tranlated + body: `(preprocessing-token (string-literal ,(strftime "%b %_d %Y" now)))) + (object-like-macro + identifier: "__TIME__" + body: `(preprocessing-token + (string-literal + ,(strftime "%H:%M:%S" now)))))) + +(define environment + (-> (make-environment) + (extend-environment default-macros))) + + + +;;; 5.1.11.2 Translation phases + + + +(define (run-compiler path) + (define environment (enter-file (make-environment) path)) + (-> (load-and-tokenize-file path) + (handle-preprocessing-tokens environment)) +;;; 5. (something with character sets) +;;; 6. concatenation of string literals +;;; 7. Whitespace tokens are discarded, each preprocessing token is converted into a token + ;; 6.4 paragraph 2 + ;; Each preprocessing toket thas is converted to a token shall have the lexcal form of a keyword, an identifier, a constant, a string literal, or a puncturtor +;;; 8. All external objects and functions are resolved + ) diff --git a/module/c/cpp-environment.scm b/module/c/cpp-environment.scm new file mode 100644 index 00000000..d6c86f7a --- /dev/null +++ b/module/c/cpp-environment.scm @@ -0,0 +1,144 @@ +(define-module (c cpp-environment) + :use-module (srfi srfi-1) + :use-module (srfi srfi-88) + :use-module (ice-9 hash-table) + :use-module (hnh util object) + :use-module (hnh util type) + :use-module (hnh util lens) + :use-module ((c cpp-environment function-like-macro) :prefix #{fun:}#) + :use-module ((c cpp-environment object-like-macro) :prefix #{obj:}#) + :use-module ((c cpp-environment internal-macro) :prefix #{int:}#) + :export ( + + macro-identifier + macro-body + macro-identifier-list + macro-variadic? + macro? + + enter-active-if + enter-inactive-if + leave-if + + enter-file + leave-file + bump-line + current-line + current-file + + function-macro? + object-macro? + internal-macro? + + cpp-environment + cpp-if-status cpp-variables + + make-environment in-environment? + remove-identifier! add-identifier! + get-identifier + extend-environment + + )) + +(define (macro-identifier x) + (define identifier + (cond ((obj:object-like-macro? x) obj:identifier) + ((fun:function-like-macro? x) fun:identifier) + ((int:internal-macro? x) int:identifier) + (else (scm-error 'wrong-type-arg "macro-identifier" + "Not a macro: ~s" + (list x) #f)))) + (identifier x)) + + +(define (macro-body macro) + (define body-proc + (cond ((obj:object-like-macro? macro) obj:body) + ((fun:function-like-macro? macro) fun:body) + ((int:internal-macro? macro) int:body) + (else (scm-error 'wrong-type-arg "macro-body" + "Not a macro: ~s" + (list macro) #f)))) + (body-proc macro)) + +(define macro-identifier-list fun:identifier-list) +(define macro-variadic? fun:variadic?) + +(define function-macro? fun:function-like-macro?) +(define object-macro? obj:object-like-macro?) +(define internal-macro? int:internal-macro?) + +(define (macro? x) + (or (obj:object-like-macro? x) + (fun:function-like-macro? x) + (int:internal-macro? x))) + +(define-type (cpp-environment) + (cpp-if-status type: (list-of (memv '(outside active-if inactive-if))) + default: '(outside)) + (cpp-variables type: hash-table? default: (make-hash-table)) + (cpp-file-stack type: (and ((negate null?)) + (list-of (pair-of string? exact-integer?))) + default: '(("*outside*" . 1)))) + + + +(define (enter-active-if environment) + (modify environment cpp-if-status xcons 'active-if)) + +(define (enter-inactive-if environment) + (modify environment cpp-if-status xcons 'inactive-if)) + +(define (leave-if environment) + (modify environment cpp-if-status cdr)) + + + +(define (enter-file environment filename) + (modify environment cpp-file-stack xcons (cons filename 1))) + +(define (leave-file environment) + (modify environment cpp-file-stack cdr)) + +(define current-line (compose-lenses cpp-file-stack car* cdr*)) + +(define current-file (compose-lenses cpp-file-stack car* car*)) + +(define* (bump-line environment optional: (count 1)) + (modify environment current-line + count)) + + + + +(define (make-environment) (cpp-environment)) + +(define (in-envirnoment? environment key) + (hash-get-handle (cpp-variables environment) key)) + +(define (remove-identifier! environment key) + (hash-remove! (cpp-variables environment) key) + environment) + +(define (add-identifier! environment key value) + (unless (string? key) + (scm-error 'wrong-type-arg "add-identifier!" + "Key must be a string, got: ~s" + (list key) #f)) + (unless (macro? value) + (scm-error 'wrong-type-arg "add-identifier!" + "Value must be a macro, got: ~s" + (list value) #f)) + (hash-set! (cpp-variables environment) key value) + environment) + +(define (get-identifier environment key) + (hash-ref (cpp-variables environment) key)) + +(define (clone-hash-table ht) + (alist->hash-table (hash-map->list cons ht))) + +(define (extend-environment environment macros) + (let ((env (modify environment cpp-variables clone-hash-table))) + (fold (lambda (m env) (add-identifier! env (macro-identifier m) m)) + env macros))) + diff --git a/module/c/cpp-environment/function-like-macro.scm b/module/c/cpp-environment/function-like-macro.scm new file mode 100644 index 00000000..26512439 --- /dev/null +++ b/module/c/cpp-environment/function-like-macro.scm @@ -0,0 +1,18 @@ +(define-module (c cpp-environment function-like-macro) + :use-module (hnh util object) + :use-module (hnh util type) + :export (function-like-macro + function-like-macro? + identifier + identifier-list + body + variadic?)) + +(define-type (function-like-macro) + (identifier type: string?) + (identifier-list type: (list-of string?)) + ;; TODO import these + (body type: list? ; (list-of (or whitespace-token? preprocessing-token?)) + ) + (variadic? type: boolean? + default: #f)) diff --git a/module/c/cpp-environment/internal-macro.scm b/module/c/cpp-environment/internal-macro.scm new file mode 100644 index 00000000..3c946738 --- /dev/null +++ b/module/c/cpp-environment/internal-macro.scm @@ -0,0 +1,11 @@ +(define-module (c cpp-environment internal-macro) + :use-module (hnh util object) + :export (internal-macro + internal-macro? + identifier body)) + +(define-type (internal-macro) + (identifier type: string?) + (body type: procedure? + ;; Arity 2 + )) diff --git a/module/c/cpp-environment/object-like-macro.scm b/module/c/cpp-environment/object-like-macro.scm new file mode 100644 index 00000000..5d4c8810 --- /dev/null +++ b/module/c/cpp-environment/object-like-macro.scm @@ -0,0 +1,13 @@ +(define-module (c cpp-environment object-like-macro) + :use-module (hnh util object) + :export (object-like-macro + object-like-macro? + identifier + body)) + + +(define-type (object-like-macro) + (identifier type: string?) + ;; TODO import these + (body type: list? ; (list-of (or whitespace-token? preprocessing-token?)) + )) diff --git a/module/c/cpp.scm b/module/c/cpp.scm index a2935352..aed496f2 100644 --- a/module/c/cpp.scm +++ b/module/c/cpp.scm @@ -5,47 +5,37 @@ :use-module (ice-9 match) :use-module (ice-9 regex) :use-module ((rnrs io ports) :select (call-with-port)) + :use-module ((rnrs bytevectors) :select (bytevector?)) :use-module (ice-9 format) :use-module ((hnh util io) :select (read-lines)) :use-module (hnh util graph) :use-module (c lex) :use-module (c parse) :use-module (c operators) - :export (do-funcall replace-symbols include#) + :export (replace-symbols include#) ) ;; input "#define F(x, y) x + y" -;; 1 full define | F(x, y) +;; 1 full define | F(x,y) ;; 2 macro name | F -;; 3 macro args | (x, y) -;; 4 macro body | x + y -(define define-re (make-regexp "^#define ((\\w+)(\\([^)]*\\))?) (.*)")) +;; 3 macro args | (x,y) +;; 5 macro body | x + y or #f +(define define-re (make-regexp "^#define ((\\w+)([(][^)]*[)])?)( (.*))?")) (define (tokenize-define-line header-line) (aif (regexp-exec define-re header-line) (cons (match:substring it 1) - (match:substring it 4)) + (let ((body (match:substring it 5))) + (if (or (eqv? body #f) + (string-null? body)) + "1" body))) (scm-error 'c-parse-error "tokenize-define-line" "Line dosen't match: ~s" (list header-line) #f))) -(define (do-funcall function arguments) - (if (list? arguments) - (apply function arguments) - (function arguments))) - -(define symb-map - `((,(symbol #\|) . logior) - (funcall . (@ (c cpp) do-funcall)) - (&& . and) - (& . logand) - (== . =) - (!= . (negate =)) - )) - (define (replace-symbols tree dict) (if (not (list? tree)) (or (assoc-ref dict tree) tree) @@ -55,12 +45,27 @@ ;; Direct values. Lisp also has quoted symbols in this group. (define (immediate? x) (or (number? x) - (char? x) - (string? x))) + (bytevector? x))) +;; TODO replace this with something sensible +;; like a correct list extracted from (c eval) +;; and not thinging that types are variables ;; built in symbols. Should never be marked as dependencies (define (primitive? x) - (memv x (cons 'funcall binary-operators))) + (memv x `( + ;; language primitives + sizeof + + ;; special forms introduced by parser + funcall ternary struct-type as-type + + ;; unary operatons which aren't also binary operators + ++ -- ! ~ + not compl dereference pointer + pre-increment pre-decrement + post-increment post-decrement + ,@binary-operators + ))) @@ -77,7 +82,6 @@ [arg (list arg)])) (define right (f (cdr pair))) - (define alt-right (replace-symbols right symb-map)) (define dependencies (lset-difference eq? @@ -91,12 +95,12 @@ dependencies (match left [('funcall name ('#{,}# args ...)) - (cons name `(lambda ,args ,alt-right))] + (cons name `(lambda ,args ,right))] [('funcall name arg) - (cons name `(lambda (,arg) ,alt-right))] + (cons name `(lambda (,arg) ,right))] - [name (cons name alt-right)]))) + [name (cons name right)]))) (define (parse-cpp-file lines) @@ -104,7 +108,9 @@ (catch #t (lambda () (parse-cpp-define line)) (lambda (err caller fmt args data) - (format #t "~a ~?~%" fmt args) + (format #t "~a in ~a: ~?~%" + err caller fmt args) + (format #t "~s~%" line) #f))) lines)) @@ -114,29 +120,32 @@ (define (tokenize-header-file header-file) (map tokenize-define-line (call-with-port - (open-input-pipe - (string-append "cpp -dM " header-file)) + (open-pipe* OPEN_READ "cpp" "-dM" header-file) read-lines))) -(define-macro (include# header-file . args) - - (define define-form (if (null? args) 'define (car args))) - - (define lines (remove (compose private-c-symbol? car) - (tokenize-header-file header-file))) +(define (load-cpp-file header-file) + (define lines (tokenize-header-file header-file)) (define forms (parse-cpp-file lines)) - (define graph* - (fold (lambda (node graph) - (add-node graph (cdr node) (car node))) - (make-graph car) - (filter identity forms))) + (fold (lambda (node graph) + (add-node graph (cdr node) (car node))) + (make-graph car) + (filter identity forms))) +(define (include% header-file) + (define graph* (load-cpp-file header-file)) ;; Hack for termios since this symbol isn't defined. ;; (including in the above removed private c symbols) - (define graph (add-node graph* (cons '_POSIX_VDISABLE #f) '())) + (define graph (add-node graph* (cons '_POSIX_VDISABLE 0) '())) + ;; TODO expand bodies + ;; (remove (compose private-c-symbol? car)) + (resolve-dependency-graph graph)) + +(define-macro (include# header-file . args) + + (define define-form (if (null? args) 'define (car args))) `(begin ,@(map (lambda (pair) `(,define-form ,(car pair) ,(cdr pair))) - (resolve-dependency-graph graph)))) + (include% header-file)))) diff --git a/module/c/eval.scm b/module/c/eval.scm new file mode 100644 index 00000000..67d0075d --- /dev/null +++ b/module/c/eval.scm @@ -0,0 +1,265 @@ +(define-module (c eval) + :use-module (hnh util) + :use-module (srfi srfi-1) + :use-module (srfi srfi-71) + :use-module (srfi srfi-88) + :use-module (ice-9 match) + :use-module (ice-9 curried-definitions) + :use-module ((rnrs bytevectors) + :select (bytevector?)) + :use-module ((rnrs arithmetic bitwise) + :select (bitwise-not + bitwise-and + bitwise-ior + bitwise-xor + bitwise-arithmetic-shift-left + bitwise-arithmetic-shift-right)) + :use-module (c eval environment) + :export (c-procedure? + procedure-formals + procedure-body + procedure-arity + + c-eval + )) + +(define C-TRUE 1) +(define C-FALSE 0) + +(define (boolean->c-boolean bool) + (if bool C-TRUE C-FALSE)) + +(define (c-boolean->boolean bool) + (not (zero? bool))) + +(define (c-not b) + (-> b c-boolean->boolean not boolean->c-boolean)) + +(define (c-procedure? expr) + (and (list? expr) + (not (null? expr)) + (eq? 'lambda (car expr)))) + +(define* (ensure-c-procedure expr optional: calling-procedure) + (unless (c-procedure? expr) + (scm-error 'c-eval-error calling-procedure + "Value not a procedure: ~s" + (list procedure #f)))) + +(define (procedure-formals procedure) + (ensure-c-procedure procedure "procedure-formals") + (list-ref procedure 1)) + +(define (procedure-body procedure) + (ensure-c-procedure procedure "procedure-body") + (list-ref procedure 2)) + +(define (procedure-arity procedure) + (length (procedure-formals procedure))) + +(define (literal? expression) + (or (number? expression) + (bytevector? expression))) + + + +;; Internal helper procedures + +(define (mod-set operation) + (lambda (env slot value) + ;; a += b + ;; a = a + b + (c-eval env `(= ,slot (,operation ,slot ,value))))) + +(define (fold-2 proc init lst) + (car+cdr + (fold (lambda (arg env+done) + (let ((env* arg* (proc (car env+done) arg))) + (cons* env* arg* (cdr env+done)))) + init + lst))) + +;; TODO this disregards +;; - floating point convertions +;; - integer truncation +(define ((simple-infix operation) env . operands) + (let ((env done (fold-2 c-eval (cons env '()) operands))) + (values env (apply operation (reverse done))))) + +(define ((binary-operator proc) env i c) + (let ((env* i* (c-eval env i))) + (let ((env** c* (c-eval env* c))) + (values env** (proc i* c*))))) + + + + +;; The order of evaluation for a number of these is undefined, meaning +;; that any side effects without sequence points is undefined. +;; However, for many of these I do the sensible thing and evaluate them +;; from left to right, instead of ignoring all side effects. + +;; TODO double check these with a C standard + + +;; Operators have their own namespace. They are called without funcall +;; in the pseudo-lisp which C is compiled to, and they expand more like +;; lisp macros, since its up to each operator what to do with its operands. +;; This to allow assignment and short circuting. +(define primitives + `((and . ,(lambda (env . operands) + (let loop ((env env) (operands operands)) + (if (null? operands) + (values env C-TRUE) + (let* ((env* result (c-eval env (car operands)))) + (if (c-boolean->boolean result) + (loop env* (cdr operands)) + (values env* result))))))) + (or . ,(lambda (env . operands) + (let loop ((env env) (operands operands)) + (if (null? operands) + (values env C-FALSE) + (let* ((env* result (c-eval env (car operands)))) + (if (false? result) + (values env* result) + (loop env* (cdr operands)))))))) + (= . ,(lambda (env slot value) + ;; TOOD if slot isn't a variable, but a field (or array index) + ;; then it needs to be resolved... + (let ((env* result (c-eval env value))) + (values (env-set! env* slot result) + result)))) + (and_eq ,(mod-set 'bitand)) ; &= + (or_eq ,(mod-set 'bitor)) ; |= + (xor_eq ,(mod-set 'xor)) ; ^= + (+= ,(mod-set '+)) + (-= ,(mod-set '-)) + (*= ,(mod-set '*)) + (/= ,(mod-set '/)) + (<<= ,(mod-set '<<)) + (>>= ,(mod-set '>>)) + (%= ,(mod-set '%)) + (+ . ,(simple-infix +)) + (* . ,(simple-infix *)) + (/ . ,(simple-infix /)) + (- . ,(lambda (env op . operands) + (if (null? operands) + (let ((env* value (c-eval env op))) + (values env* (- value))) + (apply (simple-infix -) + env op operands)))) + (bitor . ,(simple-infix bitwise-ior)) + (bitand . ,(simple-infix bitwise-and)) + (xor . ,(simple-infix bitwise-xor)) + (not_eq . ,(lambda (env a b) (c-eval env `(not (== ,a ,b))))) ; != + (<< . ,(binary-operator bitwise-arithmetic-shift-left)) + (>> . ,(binary-operator bitwise-arithmetic-shift-right)) + (< . ,(binary-operator (compose boolean->c-boolean <))) + (> . ,(binary-operator (compose boolean->c-boolean >))) + ;; this assumes that = handles pointers + (== . ,(binary-operator (compose boolean->c-boolean =))) + (<= . ,(binary-operator (compose boolean->c-boolean <=))) + (>= . ,(binary-operator (compose boolean->c-boolean >=))) + (% . ,(binary-operator modulo)) + + (not . ,(lambda (env value) + (let ((env* result (c-eval env value))) + (values env* (c-not result))))) + (compl . ,(lambda (env value) + (let ((env* result (c-eval env value))) + (values env* (bitwise-not result))))) + + ;; ++C + (pre-increment . ,(lambda (env slot) (c-eval env `(+= ,slot 1)))) + (pre-decrement . ,(lambda (env slot) (c-eval env `(-= ,slot 1)))) + ;; TODO these (C++, C--) need to handle if slot isn't a direct variabl + (post-increment . ,(lambda (env slot) + (let ((value (env-ref env slot))) + (values (env-set! env slot (1+ value)) + value)))) + (pre-decrement . ,(lambda (env slot) + (let ((value (env-ref env slot))) + (values (env-set! env slot (1+ value)) + value)))) + + (ternary . ,(lambda (env test true-clause false-clause) + (let ((env* value (c-eval env test))) + (c-eval env* + (if (c-boolean->boolean value) + true-clause false-clause))))) + + ;; TODO remaining operations + (as-type . ,(lambda (env target-type value) + (format (current-error-port) "cast<~s>(~s)~%" target-type value) + (values env value))) + + (object-slot . ,(lambda (env object slot) + (scm-error 'not-implemented "object-slot" + "Object slots are not implemented, when accessing ~s.~s" + (list object slot) #f))) + (dereference-slot ,(lambda (env ptr slot) + (scm-error 'not-implemented "dereference-slot" + "Object slots are not implemented, when accessing ~s->~s" + (list object slot) #f))) + (dereference . ,(lambda (env ptr) + (scm-error 'not-implemented "dereference" + "Poiner dereferencing is not implemented: *~s" + (list ptr) #f))) + (pointer . ,(lambda (env value) + (scm-error 'not-implemented "pointer" + "Pointer of is not implemented: &~s" + (list value) #f))))) + +;; TODO |,| + + +(define (c-eval environment expression) + (match expression + (('lambda formals body) (values environment `(lambda ,formals ,body))) + ;; hack since sizeof really should be a operator + (('funcall 'sizeof arg) + ;; TODO + (format (current-error-port) "sizeof ~s~%" arg) + (values environment 1)) + + (('funcall procedure-name ('#{,}# args ...)) + (let ((procedure (env-ref environment procedure-name))) + (ensure-c-procedure procedure "c-eval") + (unless (= (length args) (procedure-arity procedure)) + (scm-error 'c-eval-error "c-eval" + "Procedure arity mismatch for ~s, expected ~s, got ~s" + (list procedure-name + (procedure-arity procedure) + (length args)) + #f)) + (let ((env args* (fold-2 c-eval (cons environment '()) args ))) + (let ((inner-environment + (fold (lambda (name value env) (env-set! env name value)) + (push-frame! env) + (procedure-formals procedure) args*))) + (let ((resulting-environment + result-value + (c-eval inner-environment (procedure-body procedure)))) + (values (pop-frame! resulting-environment) + result-value)))))) + + (('funcall procedure arg) + (c-eval environment `(funcall ,procedure (#{,}# ,arg)))) + + ((operator operands ...) + (apply (or (assoc-ref primitives operator) + (scm-error 'c-eval-error "c-eval" + "Applying non-existant primitive operator: ~s, operands: ~s" + (list operator operands) #f)) + environment operands)) + + ;; "f()" gets compiled to simply f + ;; meaning that we instead use the environment to determine + ;; if something is a variable or procedure + (expr + (if (literal? expr) + (values environment expr) + (let ((value (env-ref environment expr))) + (if (c-procedure? value) + (c-eval environment `(funcall ,value (#{,}#))) + (values environment value))))))) diff --git a/module/c/eval/environment.scm b/module/c/eval/environment.scm new file mode 100644 index 00000000..12eefaf7 --- /dev/null +++ b/module/c/eval/environment.scm @@ -0,0 +1,34 @@ +(define-module (c eval environment) + :use-module (srfi srfi-1) + :export (make-environment + env-set! env-ref push-frame! pop-frame!)) + +(define (make-frame) + (make-hash-table)) + +(define (make-environment) + (list (make-frame))) + +;; Returns an updated environment, linear update +(define (env-set! env key value) + ;; get handle to differentiate #f + ;; (even though #f should never be stored since it's not a C value) + (cond ((find (lambda (frame) (hashq-get-handle frame key)) env) + => (lambda (frame) (hashq-set! frame key value))) + (else (hashq-set! (car env) key value))) + env) + +(define (env-ref env key) + (cond ((null? env) + (scm-error 'misc-error "env-ref" + "~s unbound" + (list key) + #f)) + ((hashq-get-handle (car env) key) => cdr) + (else (env-ref (cdr env) key)))) + +(define (push-frame! environment) + (cons (make-frame) environment)) + +(define (pop-frame! environment) + (cdr environment)) diff --git a/module/c/eval2.scm b/module/c/eval2.scm new file mode 100644 index 00000000..d58f20bf --- /dev/null +++ b/module/c/eval2.scm @@ -0,0 +1,20 @@ +(define-module (c eval2) + :use-module ((hnh util) :select (->)) + :export (C-TRUE + C-FALSE + boolean->c-boolean + c-boolean->boolean + c-not)) + + +(define C-TRUE 1) +(define C-FALSE 0) + +(define (boolean->c-boolean bool) + (if bool C-TRUE C-FALSE)) + +(define (c-boolean->boolean bool) + (not (zero? bool))) + +(define (c-not b) + (-> b c-boolean->boolean not boolean->c-boolean)) diff --git a/module/c/lex.scm b/module/c/lex.scm index 34e52d88..0bde5c9e 100644 --- a/module/c/lex.scm +++ b/module/c/lex.scm @@ -43,8 +43,23 @@ (define-peg-pattern integer all (and (or base-8 base-16 base-10) (? integer-suffix))) +(define-peg-pattern float-suffix all + (* (or "f" "F" "l" "L"))) + +(define-peg-pattern exponent all + (and (ignore (or "e" "E")) (? (or "+" "-")) base-10)) + +;; Helper patterns for creating named groups in float +(define-peg-pattern float-integer all base-10) +(define-peg-pattern float-decimal all base-10) + +(define-peg-pattern float all + (or (and float-integer exponent (? float-suffix)) + (and (? float-integer) (ignore ".") float-decimal (? exponent) (? float-suffix)) + (and float-integer (ignore ".") (? exponent) (? float-suffix)))) + (define-peg-pattern number body - (or integer)) + (or float integer)) (define-peg-pattern group all (and (ignore "(") expr (ignore ")"))) @@ -65,11 +80,16 @@ (define-peg-pattern char all (and (ignore "'") (or escaped-char peg-any) (ignore "'"))) +(define-peg-pattern quot none "\"") + +(define-peg-pattern string all + (and quot (* (or escaped-char (and (not-followed-by "\"") peg-any))) quot)) (define-peg-pattern* operator all `(or ,@(map symbol->string symbol-binary-operators) ,@(map (lambda (op) `(and ,(symbol->string op) ws)) - wordy-binary-operators))) + wordy-binary-operators) + "?" ":")) ;; whitespace (define-peg-pattern ws none @@ -89,17 +109,23 @@ base-10-digit)))) (define-peg-pattern prefix-operator all - (or "!" "~" "*" "&" "++" "--" "+" "-")) + ;; It's important that ++ and -- are BEFORE + and - + ;; otherwise the first + is found, leaving the second +, which fails + ;; to lex since it's an invalid token + ;; TODO sizeof can be written as a prefix operator + ;; (without parenthesis) if the operand is an expression. + (or "*" "&" "++" "--" + "!" "~" "+" "-")) + ;;; Note that stacked pre or postfix operators without parenthesis ;;; dosen't work. So `*&C' is invalid, while `*(&C)' is valid. (define-peg-pattern prefix all - (and prefix-operator sp (or variable group funcall #; postfix - ))) + (and prefix-operator sp (or variable group funcall literal))) (define-peg-pattern postfix-operator all - (or "++" "--")) + (or "++" "--" "*")) (define-peg-pattern postfix all ;; literals can't be in-place incremented and decremented @@ -111,15 +137,25 @@ ;; first case is "same" as expr, but in different order to prevent ;; infinite self reference. Pre and postfix not here, solved by having ;; them before infix in expr - (and (or funcall postfix prefix group char number variable) + (and (or funcall postfix prefix group literal variable) sp operator sp expr)) (define-peg-pattern funcall all (and variable sp group)) +(define-peg-pattern literal body + (or char string number)) + ;;; main parser (define-peg-pattern expr body - (+ (and sp (or infix postfix prefix funcall group char number variable) + (+ (and sp (or + ;; float must be BEFORE infix, otherwise 3.2 is parsed as (infix 3 (operator ".") 2) + ;; that however breaks the infix logic, meaning that floating point numbers can't be + ;; used in basic arithmetic. + ;; TODO remove all implicit order of operations handling in the lexer, and move it to + ;; the parser. This should also fix the case of typecasts being applied incorrectly. + float + infix postfix prefix funcall group literal variable) sp))) diff --git a/module/c/lex2.scm b/module/c/lex2.scm new file mode 100644 index 00000000..6083190f --- /dev/null +++ b/module/c/lex2.scm @@ -0,0 +1,326 @@ +(define-module (c lex2) + :use-module (ice-9 peg) + :export (lex)) + +;;; A.1 Lexical grammar +;;; A.1.1 Lexical elements + +;; (6.4) +(define-peg-pattern token all + (or keyword + identifier + constant + string-literal + punctuator + )) + +;; (6.4) +(define-peg-pattern preprocessing-token all + ;; string literal moved before header-name since string literals + ;; otherwise became q-strings + (or string-literal + header-name + identifier + pp-number + character-constant + punctuator + ;; Each non-white-space character that cannot be one of the above + )) + +;;; A.1.2 Keywords + +;; (6.4.1) +(define-peg-pattern keyword all + (or "auto" "break" "case" "char" "const" "continue" "default" + "do" "double" "else" "enum" "extern" "float" "for" "goto" + "if" "inline" "int" "long" "register" "restrict" "return" + "short" "signed" "sizeof" "static" "struct" "switch" + "typedef" "union" "unsigned" "void" "volatile" "while" + "_Alignas" "_Alignof" "_Atomic" "_Bool" "_Complex" + "_Generic" "_Imaginary" "_Noreturn" "_Static_assert" + "_Thread_local")) + +;;; A.1.3 Identifiers + +;; (6.4.2.1) +(define-peg-pattern identifier all + (and identifier-nondigit (* (or identifier-nondigit digit)))) + +;; (6.4.2.1) +(define-peg-pattern identifier-nondigit body + (or nondigit + universal-character-name + ;; TODO other implementation-defined characters + )) + +;; (6.4.2.1) +(define-peg-pattern nondigit body + (or "_" + (range #\A #\Z) + (range #\a #\z))) + +;; (6.4.2.1) +(define-peg-pattern digit body + (range #\0 #\9)) + +;;; A.1.4 Universal character names + +;; (6.4.3) +(define-peg-pattern universal-character-name all + (or (and "\\u" hex-quad) + (and "\\U" hex-quad hex-quad))) + +;; (6.4.3) +(define-peg-pattern hex-quad body + (and hexadecimal-digit hexadecimal-digit + hexadecimal-digit hexadecimal-digit)) + +;;; A.1.5 Constants + +;; (6.4.4) +(define-peg-pattern constant all + ;; Int and float swapped from standard since we need to try parsing + ;; the floats beforehand + (or floating-constant + integer-constant + enumeration-constant + character-constant)) + +;; (6.4.4.1) +(define-peg-pattern integer-constant all + (and (or decimal-constant + octal-constant + hexadecimal-constant) + integer-suffix)) + +;; (6.4.4.1) +(define-peg-pattern decimal-constant all + (and nonzero-digit + (+ digit))) + +;; (6.4.4.1) +(define-peg-pattern octal-constant all + (+ octal-digit)) + +;; (6.4.4.1) +(define-peg-pattern hexadecimal-constant all + (and hexadecimal-prefix (+ hexadecimal-digit))) + +;; (6.4.4.1) +(define-peg-pattern hexadecimal-prefix none + (or "0x" "0X")) + +;; (6.4.4.1) +(define-peg-pattern nonzero-digit body + (range #\1 #\9)) + +;; (6.4.4.1) +(define-peg-pattern octal-digit body + (range #\0 #\7)) + +;; (6.4.4.1) +(define-peg-pattern hexadecimal-digit body + (or (range #\0 #\9) + (range #\a #\f) + (range #\A #\F))) + +;; (6.4.4.1) +(define-peg-pattern integer-suffix all + (or (and unsigned-suffix (? long-suffix)) + (and long-suffix (? unsigned-suffix)))) + +;; (6.4.4.1) +;; This is a merger of long-suffix and long-long-suffix +(define-peg-pattern long-suffix body + (or "l" "L" "ll" "LL")) + +;; (6.4.4.1) +(define-peg-pattern unsigned-suffix body + (or "u" "U")) + +;; (6.4.4.2) +(define-peg-pattern floating-constant all + (or decimal-floating-constant + hexadecimal-floating-constant)) + +;; (6.4.4.2) +(define-peg-pattern decimal-floating-constant all + (or (and fractional-constant (? exponent-part) (? floating-suffix)) + (and digit-sequence exponent-part (? floating-suffix)))) + +;; (6.4.4.2) +(define-peg-pattern hexadecimal-floating-constant all + (and hexadecimal-prefix + (or hexadecimal-fractional-constant + hexadecimal-digit-sequence) + binary-exponent-part + (? floating-suffix))) + +;; (6.4.4.2) +(define-peg-pattern fractional-constant all + (or (and (? digit-sequence) "." digit-sequence) + (and digit-sequence "."))) + +;; (6.4.4.2) +(define-peg-pattern exponent-part all + (and (or "e" "E") (? sign) digit-sequence)) + +;; (6.4.4.2) +(define-peg-pattern sign all + (or "+" "-")) + +;; (6.4.4.2) +(define-peg-pattern digit-sequence body + (+ digit)) + +;; (6.4.4.2) +(define-peg-pattern hexadecimal-fractional-constant all + (or (and (? hexadecimal-digit-sequence) "." hexadecimal-digit-sequence) + (and hexadecimal-digit-sequence "."))) + +;; (6.4.4.2) +(define-peg-pattern binary-exponent-part all + (and (or "p" "P") + (? sign) + digit-sequence)) + +;; (6.4.4.2) +(define-peg-pattern hexadecimal-digit-sequence body + (+ hexadecimal-digit)) + +;; (6.4.4.2) +(define-peg-pattern floating-suffix all + (or "f" "l" "F" "L")) + +;; (6.4.4.3) +(define-peg-pattern enumeration-constant all + identifier) + +(define-peg-pattern character-prefix all + (or "L" "u" "U")) + +;; (6.4.4.4) +(define-peg-pattern character-constant all + (and (? character-prefix) + (ignore "'") + (+ c-char) + (ignore "'"))) + +;; (6.4.4.4) +(define-peg-pattern c-char body + (or (and (not-followed-by (or "'" "\\" "\n")) peg-any) + escape-sequence)) + +;; (6.4.4.4) +(define-peg-pattern escape-sequence all + (or simple-escape-sequence + octal-escape-sequence + hexadecimal-escape-sequence + universal-character-name)) + +;; (6.4.4.4) +(define-peg-pattern simple-escape-sequence all + (and (ignore "\\") (or "'" "\"" "?" "\\" + "a" "b" "f" "n" "r" "t" "v"))) + +;; (6.4.4.4) +(define-peg-pattern octal-escape-sequence all + (and (ignore "\\") octal-digit (? octal-digit) (? octal-digit))) + +;; (6.4.4.4) +(define-peg-pattern hexadecimal-escape-sequence all + (and (ignore "\\x") (+ hexadecimal-digit))) + +;; A.1.6 String literals + +;; (6.4.5) +(define-peg-pattern string-literal all + (and (? encoding-prefix) + (ignore "\"") + (* s-char) + (ignore "\""))) + +;; (6.4.5) +(define-peg-pattern encoding-prefix all + (or "u8" "u" "U" "L")) + +;; (6.4.5) +(define-peg-pattern s-char body + (or (and (not-followed-by (or "\"" "\\" "\n")) peg-any) + escape-sequence)) + +;;; A.1.7 + +;; (6.4.6) +(define-peg-pattern punctuator all + (or "[" "]" "(" ")" "{" "}" + "..." ; Moved to be before "." + "." "->" + "++" "--" "&" "*" "+" "-" "~" "!" + "/" "%" "<<" ">>" "<" ">" "<=" ">=" "==" "!=" "^" "|" "&&" "||" + "?" ":" ";" + "=" "*=" "/=" "%=" "+=" "-=" "<<=" ">>=" "&=" "^=" "|=" + "," "#" "##" + "<:" ":>" "<%" "%>" "%:" "%:%:")) + +;;; A.1.8 Header names + +(define-peg-pattern h-string all (+ h-char)) +(define-peg-pattern q-string all (+ q-char)) + +;; (6.4.7) +(define-peg-pattern header-name all + (or (and (ignore "<") h-string (ignore ">")) + (and (ignore "\"") q-string (ignore "\"")))) + +;; (6.4.7) +(define-peg-pattern h-char body + (or (and (not-followed-by (or ">" "\n")) peg-any) + escape-sequence)) + +;; (6.4.7) +(define-peg-pattern q-char body + (or (and (not-followed-by (or "\"" "\n")) peg-any) + escape-sequence)) + +;;; A.1.9 Preprocessing numbers + +;; (6.4.8) +(define-peg-pattern pp-number all + (and (? ".") digit + (* (or digit + identifier-nondigit + (and (or "e" "E" "p" "P") + sign) + ".")))) + + + +(define-peg-pattern whitespace all + (or "\t" "\n" "\v" "\f" " " + ;; "\r" + )) + +(define-peg-pattern block-comment body + (and (ignore "/*") + (* (and (not-followed-by "*/") + peg-any)) + (ignore "*/"))) + +(define-peg-pattern line-comment body + (and (ignore "//") + (* (and (not-followed-by "\n") + peg-any)))) + +(define-peg-pattern comment all + (or line-comment block-comment)) + +(define-peg-pattern preprocessing-tokens all + (* (or whitespace + comment + preprocessing-token))) + + +;; returns a list of lexemes +(define (lex string) + (cdr (peg:tree (match-pattern preprocessing-tokens string)))) diff --git a/module/c/line-fold.scm b/module/c/line-fold.scm new file mode 100644 index 00000000..c61c2c70 --- /dev/null +++ b/module/c/line-fold.scm @@ -0,0 +1,29 @@ +(define-module (c line-fold) + :use-module (srfi srfi-1) + :use-module (srfi srfi-71) + :export (fold-lines)) + +(define (line-continued? line) + (and (not (string-null? line)) + (char=? #\\ (string-ref line (1- (string-length line)))))) + +(define (strip-backslash line) + (string-drop-right line 1)) + +(define (fold-lines string) + (with-output-to-string + (lambda () + (let loop ((lines (string-split string #\newline))) + (cond ((null? lines) 'NOOP) + ((null? (cdr lines)) + ;; TODO error message if last character is a backslash + (display (car lines)) + (newline)) + (else + (let ((to-merge remaining (span line-continued? lines))) + (for-each display (map strip-backslash to-merge)) + (display (car remaining)) + (newline) + (for-each (lambda _ (newline)) + (iota (length to-merge))) + (loop (cdr remaining))))))))) diff --git a/module/c/operators.scm b/module/c/operators.scm index ab1b3e7c..910dc8a9 100644 --- a/module/c/operators.scm +++ b/module/c/operators.scm @@ -9,8 +9,9 @@ `(+ - * / & ,(symbol #\|) ^ << >> % < > =)) ;; apparently part of C +;; https://en.cppreference.com/w/cpp/language/operator_alternative (define wordy-binary-operators - '(bitand and_eq and bitor or_eq or xor_eq xor)) + '(bitand and_eq and bitor or_eq or xor_eq xor not_eq)) (define symbol-binary-operators (append (map (lambda (x) (symbol-append x '=)) simple-operators) diff --git a/module/c/parse.scm b/module/c/parse.scm index 8030da77..7d11ea17 100644 --- a/module/c/parse.scm +++ b/module/c/parse.scm @@ -1,11 +1,14 @@ (define-module (c parse) :use-module (hnh util) :use-module (srfi srfi-1) + :use-module (srfi srfi-71) :use-module (ice-9 match) + :use-module ((rnrs io ports) + :select (string->bytevector make-transcoder utf-8-codec)) + :use-module (rnrs bytevectors) :export (parse-lexeme-tree)) -;;; Rename this -(define (perms set) +(define (permutations set) (concatenate (map (lambda (key) (map (lambda (o) (cons key o)) @@ -21,23 +24,124 @@ (define valid-sequences (delete 'dummy (lset-union eq? '(dummy) - (map symbol-concat (perms '(() U L))) - (map symbol-concat (perms '(() U LL)))))) + (map symbol-concat (permutations '(() U L))) + (map symbol-concat (permutations '(() U LL)))))) ;; => (LLU ULL LL LU UL L U) (aif (memv (string->symbol (string-upcase str)) valid-sequences) (case (car it) - [(LLU ULL) '(unsigned long-long)] + [(LLU ULL) '(unsigned long long)] [(LU UL) '(unsigned long)] - [(LL) '(long-long)] + [(LL) '(long long)] [(L) '(long)] [(U) '(unsigned)]) (scm-error 'c-parse-error "parse-integer-suffix" "Invalid integer suffix ~s" (list str) #f))) +(define (parse-float-suffix str) + (case (string->symbol str) + ((f F) '(float)) + ((l L) '(long double)))) + +(define (group-body->type vars) + (concatenate + (map + (match-lambda (('variable var) (list (parse-lexeme-tree `(variable ,var)))) + (('postfix ('variable var) + ('postfix-operator "*")) + (list (parse-lexeme-tree `(variable ,var)) + '*)) + (else (scm-error 'c-parse-error "parse-lexeme-tree" + "Invalid token ~s in typecast form: ~s" + (list else vars) #f))) + vars))) + +;; Takes a list of strings and integers, and merges it all into a single +;; bytevector representing a c string +(define* (string-fragments->c-string + fragments optional: (transcoder (make-transcoder (utf-8-codec)))) + + (define fragments-fixed + (map (lambda (frag) + (if (string? frag) + (string->bytevector frag transcoder) + frag)) + fragments)) + + (define bv-length + (fold (lambda (item sum) (+ sum (if (bytevector? item) + (bytevector-length item) + 1))) + 0 fragments-fixed)) + + (define bv (make-bytevector (1+ bv-length))) + ;; trailing null byte + (bytevector-u8-set! bv bv-length 0) + (fold (lambda (item idx) + (cond ((bytevector? item) + (bytevector-copy! item 0 + bv idx + (bytevector-length item)) + (+ idx (bytevector-length item))) + (else (bytevector-u8-set! bv idx item) + (+ idx 1)))) + 0 + fragments-fixed) + bv) + +(define (parse-float-form float-form) + (let ((float-string + (fold (lambda (arg str) + (string-append + str + (match arg + (('float-integer ('base-10 n)) n) + (('float-decimal ('base-10 n)) (string-append "." n)) + (('exponent "+" ('base-10 n)) (string-append "e" n)) + (('exponent ('base-10 n)) (string-append "e" n)) + (('exponent "-" ('base-10 n)) (string-append "e-" n))))) + "" float-form))) + ;; exact->inexact is a no-op if we already have an inexact number, but + ;; ensures we get an inexact number when we have an exact number (which we + ;; can get from the "1." case). Returning an inexact number here is important + ;; to avoid arithmetic suprises later. + (exact->inexact + (or (string->number float-string) + (scm-error 'c-parse-error "parse-lexeme-tree" + "Couldn't parse expression as float: ~s" + (list `(float ,@args)) #f))))) + + +(define (resolve-escaped-char form) + (match form + (('base-8-char n) (string->number n 8)) + (('base-16-char n) (string->number n 16)) + (c (char->integer + (case (string-ref c 0) + ((#\a) #\alarm) + ((#\b) #\backspace) + ((#\e) #\esc) ;; non-standard + ((#\f) #\page) + ((#\n) #\newline) + ((#\r) #\return) + ((#\t) #\tab) + ((#\v) #\vtab) + ((#\\) #\\) + ;; These are valid in both strings and chars + ((#\') #\') + ((#\") #\")))))) + +;; Takes a list of strings and escaped-char form +;; and returns a list of strings and integers +(define (resolve-string-fragment fragment) + (match fragment + (('escaped-char c) + (resolve-escaped-char c)) + (fargment fragment))) + (define (parse-lexeme-tree tree) (match tree ['() '()] @@ -50,55 +154,67 @@ [('integer n ('integer-suffix suffix)) `(as-type ,(parse-integer-suffix suffix) - ,(parse-lexeme-tree n)) - ] + ,(parse-lexeme-tree n))] + [('integer n) (parse-lexeme-tree n)] + + [('float args ... ('float-suffix suffix)) + `(as-type ,(parse-float-suffix suffix) + ;; parse rest of float as if it lacked a suffix + ,(parse-lexeme-tree `(float ,@args)))] + + [('float args ...) (parse-float-form args)] + ;; Character literals, stored as raw integers ;; so mathematical operations keep working on them. - [('char ('escaped-char ('base-8-char n))) - (-> n (string->number 8) #; integer->char)] - [('char ('escaped-char ('base-16-char n))) - (-> n (string->number 16) #; integer->char)] - [('char ('escaped-char c)) - (char->integer - (case (string-ref c 0) - ((#\a) #\alarm) - ((#\b) #\backspace) - ((#\e) #\esc) - ((#\f) #\page) - ((#\n) #\newline) - ((#\r) #\return) - ((#\t) #\tab) - ((#\v) #\vtab) - ((#\\) #\\) - ((#\') #\')))] + [('char ('escaped-char c)) (resolve-escaped-char c)] + [('char c) (char->integer (string-ref c 0))] [('variable var) (string->symbol var)] + + ;; normalize some binary operators to their wordy equivalent + ;; (which also happens to match better with scheme) + [('operator "&&") 'and] + [('operator "&=") 'and_eq] + [('operator "&") 'bitand] + [('operator "|") 'bitor] + [('operator "!=") 'not_eq] + [('operator "||") 'or] + [('operator "|=") 'or_eq] + [('operator "^") 'xor] + [('operator "^=") 'xor_eq] + ;; Change these names to something scheme can handle better + [('operator ".") 'object-slot] + [('operator "->") 'dereference-slot] [('operator op) (string->symbol op)] + [('prefix-operator op) (case (string->symbol op) + ((!) 'not) + ((~) 'compl) ((*) 'dereference) ((&) 'pointer) ((++) 'pre-increment) ((--) 'pre-decrement) - (else => identity))] + ((-) '-) + (else (scm-error 'c-parse-error "parse-lexeme-tree" + "Unknown prefix operator ~s" + (list op) #f)))] [('postfix-operator op) (case (string->symbol op) [(++) 'post-increment] [(--) 'post-decrement] - [else => identity])] + [else (scm-error 'c-parse-error "parse-lexeme-tree" + "Unknown postfix operator ~s" + (list op) #f)])] ;; Parenthesis grouping - [('group args) + [('group args ...) (parse-lexeme-tree args)] - ;; Atomic item. Used by flatten-infix - [('atom body) - (parse-lexeme-tree body)] - [('prefix op arg) `(,(parse-lexeme-tree op) ,(parse-lexeme-tree arg))] @@ -107,81 +223,204 @@ `(,(parse-lexeme-tree op) ,(parse-lexeme-tree arg))] + + + + + ;; resolved-operator and ternary are the return "types" + ;; of resolve-order-of-operations + [(('resolved-operator op) args ...) + `(,op ,@(map parse-lexeme-tree args))] + + [('ternary a b c) + `(ternary ,(parse-lexeme-tree a) + ,(parse-lexeme-tree b) + ,(parse-lexeme-tree c))] + + + + + ;; Is it OK for literal strings to be "stored" inline? + ;; Or must they be a pointer? + ['string #vu8(0)] + [('string str ...) + (-> (map resolve-string-fragment str) + string-fragments->c-string)] + + ;; implicit concatenation of string literals + [(('string str ...) ...) + (-> (map resolve-string-fragment (concatenate str)) + string-fragments->c-string)] + [('infix args ...) - (resolve-order-of-operations - (flatten-infix (cons 'infix args)))] + (let ((r (resolve-order-of-operations + (flatten-infix (cons 'infix args))))) + (parse-lexeme-tree r))] + [('funcall function ('group arguments)) `(funcall ,(parse-lexeme-tree function) ,(parse-lexeme-tree arguments))] - [bare (scm-error 'c-parse-error - "parse-lexeme-tree" - "Naked literal in lex-tree: ~s" - (list bare) - #f)])) + [(('variable "struct") ('variable value) ..1) + `(struct-type ,@(map string->symbol value)) + ] + + ;; A list of variables. Most likely a type signature + ;; [(('variable value) ..1) + ;; ] + + ;; A typecast with only variables must (?) be a typecast? + [(('group groups ..1) ... value) + (fold-right (lambda (type done) `(as-type ,type ,done)) + (parse-lexeme-tree value) + (map group-body->type groups))] + + ;; Type name resolution? + ;; https://en.wikipedia.org/wiki/C_data_types + ;; base types with spaces: + ;; ======================= + ;; [[un]signed] char + ;; [[un]signed] short [int] + ;; [[un]signed] int + ;; [un]signed [int] + ;; [[un]signed] long [int] + ;; [[un]signed] long long [int] + ;; float + ;; [long] double + + ;; https://en.wikipedia.org/wiki/Type_qualifier + ;; qualifiers + ;; const + ;; volatile + ;; restrict + ;; _Atomic + + + ;; Storage specifiers + ;; auto + ;; register + ;; static + ;; extern + + ;; struct <typename> + ;; enum <typename> + ;; union <typename> + + ;; https://en.wikipedia.org/wiki/C_syntax + ;; int (*ptr_to_array)[100] + + + [(? symbol? bare) + (scm-error 'c-parse-error + "parse-lexeme-tree" + "Naked literal in lex-tree: ~s" + (list bare) + #f)] + + [form + (scm-error 'c-parse-error + "parse-lexeme-tree" + "Unknown form in lex-tree: ~s" + (list form) #f) + ])) ;; https://en.wikipedia.org/wiki/Operators_in_C_and_C%2B%2B +;; https://en.cppreference.com/w/c/language/operator_precedence (define order-of-operations (reverse - (concatenate - ;; This is only for binary operations - `((-> ,(symbol #\.)) - (* / %) - (+ -) - (<< >>) - (< <= > >=) - (== !=) - (&) - (^) - (,(symbol #\|)) - (&&) - (,(symbol #\| #\|)) - (= += -= *= /= %= <<= >>= &= ^= ,(symbol #\| #\=)) - (,(symbol #\,)) - )))) - -(define (mark-other form) - (if (list? form) (cons '*other* form) form)) + ;; This is only for binary operations + `((-> ,(symbol #\.)) + ;; All unary procedures go here, incnluding typecasts, and sizeof + (* / %) + (+ -) + (<< >>) + (< <= > >=) + (== != not_eq) + (& bitand) + (^ xorg) + (,(symbol #\|) bitor) + (&& and) + (,(symbol #\| #\|) or) + (? :) + (= += -= *= /= %= <<= >>= &= ^= ,(symbol #\| #\=) + and_eq or_eq xor_eq) + (,(symbol #\,)) + ))) + +;; a.b->c.d (. (-> (. a b) c) d) +;; 2 * 3 / 4 * 5 => (* (/ (* 2 3) 4) 5) +;; eller => (* 2 (/ 3 4) 5) (define* (resolve-order-of-operations tree optional: (order order-of-operations)) (if (null? order) - (car tree) + (scm-error 'c-parse-error + "resolve-order-of-operations" + "Out of operations to resolve when resolving expression ~s" + (list tree) #f) (match tree - [('*other* body ...) body] - [(form) (resolve-order-of-operations form order)] - [(forms ...) - (match (split-by forms (car order)) - [(group) (resolve-order-of-operations group (cdr order))] - [groups - (cons (car order) - (map (lambda (form) (resolve-order-of-operations - form order-of-operations)) - groups))])] - [a a]))) + [('fixed-infix form) form] + [('fixed-infix forms ...) + (match (split-by-one-of forms (car order)) + [(group) + (resolve-order-of-operations (cons 'fixed-infix group) + (cdr order))] + [(a ('? b ...) (': c ...)) + `(ternary ,(resolve-order-of-operations (cons 'fixed-infix a) (cdr order)) + ,(resolve-order-of-operations (cons 'fixed-infix b) (cdr order)) + ,(resolve-order-of-operations (cons 'fixed-infix c) (cdr order)))] + [(first rest ...) + ;; TODO this is only valid for the associative operators (+, ...) + ;; but not some other (<, ...) + (if (apply eq? (map car rest)) + (let ((op (caar rest))) + `((resolved-operator ,op) + ,@(map (lambda (x) (resolve-order-of-operations (cons 'fixed-infix x) + (cdr order))) + (cons first (map cdr rest))))) + (fold (lambda (item done) + (let ((operator args (car+cdr item))) + `((resolved-operator ,operator) + ,done ,(resolve-order-of-operations + (cons 'fixed-infix args) + (cdr order))))) + (resolve-order-of-operations (cons 'fixed-infix first) + (cdr order)) + rest))])]))) + +;; 1 * 2 / 3 * 4 +;; ⇒ ((1) (* 2) (/ 3) (* 4)) +;; (1) +;; (* (1) 2) +;; (/ (* (1) 2) 3) +;; (* (/ (* (1) 2) 3) 4) ;; Flatens a tree of infix triples. Stops when it should. ;; (parenthesis, function calls, ...) (define (flatten-infix form) - (match form - [('infix left op ('infix right ...)) - (cons* (parse-lexeme-tree left) - (parse-lexeme-tree op) - (flatten-infix (cons 'infix right)))] - - [('infix left op right) - (list (mark-other (parse-lexeme-tree left)) - (parse-lexeme-tree op) - (mark-other (parse-lexeme-tree right)))] - - [other (scm-error 'c-parse-error - "flatten-infix" - "Not an infix tree ~a" - (list other) - #f)])) + (cons 'fixed-infix + (let loop ((form form)) + (match form + [('infix left op ('infix right ...)) + (cons* left + (parse-lexeme-tree op) + (loop (cons 'infix right)))] + + [('infix left op right) + (list left + (parse-lexeme-tree op) + right)] + + [('infix form) form] + + [other (scm-error 'c-parse-error + "flatten-infix" + "Not an infix tree ~a" + (list other) + #f)])))) diff --git a/module/c/preprocessor.scm b/module/c/preprocessor.scm new file mode 100644 index 00000000..71712b17 --- /dev/null +++ b/module/c/preprocessor.scm @@ -0,0 +1,370 @@ +(define-module (c preprocessor) + :use-module (srfi srfi-1) + :use-module (srfi srfi-9 gnu) + :use-module (ice-9 rdelim) + :use-module (ice-9 regex) + :use-module (hnh util object) + + :use-module (hnh util) + :use-module (hnh util object) + ) + +(define (read-lines port) + (let loop ((done '())) + (let ((line (read-line port))) + (if (eof-object? line) + (reverse done) + (loop (cons line done)))))) + +;; The source line of a give readen line +(define line-number (make-object-property)) +;; The source file of a given readen line +(define line-file (make-object-property)) + + +(define (mark-with-property! items property property-value) + (for-each (lambda (item) (set! (property item) property-value)) + items)) + +(define trigraph-rx (make-regexp "??([=()/'<>!-])")) +(define (expand-trigraphs line) + (regexp-substitute/global + #f trigraph-rx + line + 'pre (lambda (m) (case (string-ref (match:substring m 1) 1) + ((#\=) "#") + ((#\() "[") + ((#\)) "]") + ((#\/) "\\") + ((#\') "^") + ((#\<) "{") + ((#\>) "}") + ((#\!) "|") + ((#\-) "~"))) + 'post)) + +(define (number-lines lines) + (for-each (lambda (line number) + (set! (line-number line) number)) + lines + (iota (length lines) 1)) + lines) + +;; Should this line be merged with the next +(define (line-continued? line) + (case (string-length line) + ((0) #f) + ((1) (string=? "\\" line)) + (else + (let ((len (string-length line))) + ;; TODO can extra backslashes change this? + (and (char=? #\\ (string-ref line (- len 1))) + (not (char=? #\\ (string-ref line (- len 2))))))))) + +(define (transfer-line-number to from) + (set! (line-number to) (line-number from)) + to) + +;; Merge two lines, assuming that upper ends with a backslash +(define (merge-lines upper lower) + (let ((new-string (string-append (string-drop-right upper 1) lower))) + (transfer-line-number new-string upper) + new-string)) + +(define (fold-lines lines) + (fold-right (lambda (line done) + (if (line-continued? line) + (cons (merge-lines line (car done)) (cdr done)) + (cons line done))) + '() + lines)) + + +(define comment-rx (make-regexp "(//|/[*]|[*]/)")) + +(define (strip-comments lines) + (let loop ((in-comment #f) (lines lines) (done '())) + (if (null? lines) + (reverse done) + (let ((line (car lines))) + (cond ((regexp-exec comment-rx line) + => (lambda (m) + (format (current-output-port) "~s ~s substr = ~s~%" in-comment (line-number line) (match:substring m)) + (cond ((and in-comment (string=? "*/" (match:substring m))) + (loop #f (cons (transfer-line-number (match:suffix m) line) + (cdr lines)) + done)) + (in-comment (loop #t (cdr lines) done)) + ((string=? "//" (match:substring m)) + (loop #f (cdr lines) (cons (transfer-line-number (match:prefix m) line) + done))) + ((string=? "/*" (match:substring m)) + (loop #t (cons (transfer-line-number (match:suffix m) line) (cdr lines)) done)) + (else + (scm-error 'cpp-error "strip-comments" + "Unexpected */ in file ~a on line ~a" + (list (line-file line) (line-number line)) + #f))))) + (else (loop in-comment (cdr lines) (cons line done)))))))) + + +(define-immutable-record-type <preprocessor-directive> + (make-preprocessor-directive type body) + proprocessor-directive? + (type directive-type) + (body directive-body)) + +(define cpp-directive-rx (make-regexp "\\s*#\\s*((\\w+)(.*))?")) +(define (preprocessor-directive? line) + (cond ((regexp-exec cpp-directive-rx line) + => (lambda (m) + (if (match:substring m 2) + (make-preprocessor-directive + (string->symbol (match:substring m 2)) + (string-trim-both (match:substring m 3) char-set:whitespace)) + 'sort-of))) + (else #f))) + +;; defined + +;; TODO _Pragma + + +(define (expand-function-line-macro environment macro . params) + (expand-macro environment (apply-macro macro (map (lambda (param) (expand-macro environment param)) params)))) + +;; (define (environment-ref )) + +(define (list-of? lst predicate) + (every predicate lst)) + + +;; Parantheses when defining macro +(define (parse-parameter-string string) + (map string-trim-both + (string-split (string-trim-both string (char-set #\( #\))) + #\,))) + + +(define-type (object-macro) + (body type: string?)) + +(define-type (function-macro) + (formals type: (list-of? string?)) + (body type: string?)) + + +;; The interesting part +;; environment, (list string) -> (values (list string) (list strings)) +;; multiple lines since since a function-like macro can extend over multiple lines +;; (define (expand-macros environment strings) +;; ) + + +(define (crash-if-not-if body guilty) + (scm-error 'cpp-error guilty + "else, elif, and endif invalid outside if scope: ~s~%file: ~s line: ~s" + (list body (line-file body) (line-number body)))) + +;; (environment, lines) -> environment x lines +(define (parse-directives environment lines) + (let loop ((environment environment) (lines lines) (done '())) + (let* ((line (car line)) + (directive? (preprocessor-directive? line))) + (case directive? + ((#f) ; regular line + (loop environment (cdr lines) + ;; TODO this doesn't work, since parse-macros works on multiple lines + (cons (parse-macros environment (car lines)) done))) + ((sort-of) ; a no-op directive + (loop environment (cdr lines) done)) + (else ; an actual directive + (case (car (cpp-if-status environment)) + ((outside) + (case (directive-type m) + ((ifndef endif else) + (scm-error 'cpp-error "parse-directives" + "Unexpected directive: ~s" + (list line) #f)) + (else ; inside if, ifelse or else + ;; outside active-if inactive-if + (case (directive-type m) + ;; stack ending directives + ((endif) + (case (car (cpp-if-status environment)) + ((outside) (crash-if-not-if (directive-body m) "endif")) + (else (loop (modify environment cpp-if-status cdr) + (cdr lines) + done)))) + + ;; stack nudging directives + ((else) + (case (car (cpp-if-status environment)) + ((outside) (crash-if-not-if (directive-body m) "else")) + (else (loop (modify environment (lens-compose cpp-if-status car*) + (lambda (old) + (case old + ((active-if) 'inactive-if) + ((inactive-if) 'active-if)))) + (cdr lines) + done)))) + ((elif) + (case (car (cpp-if-status environment)) + ((outside) (crash-if-not-if (directive-body m) "elif")) + (else 'TODO ;; TODO + ) + )) + + ;; stack creating directives + ;; on inactive-if each creates a new frame, which also is inactive + ((ifndef) + (case (car (cpp-if-status environment)) + ((inactive-if) (loop (modify environment cpp-if-status + xcons 'inactive-if) + (cdr lines) + done)) + (else (loop (modify environment cpp-if-status + xcons (if (in-environment? environment (directive-body directive?)) + 'inactive-if 'active-if)) + (cdr lines) + done)))) + + ((ifdef) + (case (car (cpp-if-status environment)) + ((inactive-if) (loop (modify environment cpp-if-status + xcons 'inactive-if) + (cdr lines) + done)) + (else + (loop (modify environment cpp-if-status + xcons (if (in-environment? environment (directive-body directive?)) + 'active-if 'inactive-if)) + (cdr lines) + done)))) + + ((if) + (case (car (cpp-if-status environment)) + ((inactive-if) (loop (modify environment cpp-if-status + xcons 'inactive-if) + (cdr lines) + done)) + (else 'TODO ;; TODO + ))) + + + ;; other directives + ((include) (cond ((string-match "[<\"](.*)" + => (lambda (m) + (let ((fileneme (string-drop-right (directive-body m) 1))) + (case (string-ref (match:substring m 1) 0) + ;; TODO include-path + ((#\<) (handle-file environment filename)) + ((#\") (handle-file environment filename)))))) + (else (scm-error 'cpp-error "parse-directives" + "Invalid include" + '() #f))))) + ((define) + ;; TODO what are valid names? + (cond ((string-match "^(\\w+)([(][^)]*[)])?\\s+(.*)" (directive-body directive?)) + => (lambda (m) + (loop (let ((macro-body (string-trim-both (match:substring m 3)))) + (add-identifier! + environment + (match:substring m 1) + (cond ((match:substring m 2) + => (lambda (parameter-string) + (function-macro + formals: (parse-parameter-string parameter-string) + body: macro-body))) + (else (object-macro body: macro-body))))) + (cdr lines) + done))) + (else (scm-error 'cpp-error "parse-directives" + "Invalid #define line, ~s" + (list (directive-body directive?)) + #f)))) + + ((undef) + (case (car (cpp-if-status environment)) + ((inactive-if) (loop environment (cdr lines) done)) + (else (loop (remove-identifier environment (directive-body directive?)) + (cdr lines) + done)))) + + ((line) + (case (car (cpp-if-status environment)) + ((inactive-if) (loop environment (cdr lines) done)) + ;; TODO add first-run parameter to loop, in case expand-macros still return something invalid + (else (let parse-line-directive ((tokens (string-tokenize (directive-body directive?)))) + (cond ((= 1 (length tokens)) + ;; TODO parse token + (if (integer? (car tokens)) + ;; TODO update current line + (loop environment (cdr lines) done) + (parse-line-directive (expand-macros environment (directive-body directive?))))) + ((= 2 (length tokens)) + ;; TODO parse tokens + (if (and (integer? (car tokens)) + (string-literal? (cadr tokens))) + ;; TODO update current line and file + (loop environment (cdr lines) done) + (parse-line-directive (expand-macros environment (directive-body directive?))))) + (else (parse-line-directive (expand-macros environment (directive-body directive?))))))))) + + ((error) + (throw 'cpp-error-directive + (directive-body directive?))) + + ((warning) + (format (current-error-port) "#warning ~a~%" + (directive-body directive?)) + (loop environment (cdr lines) done)) + + ((pragma) + (format (current-error-port) + "PRAGMA: ~s~%" (directive-body directive?)) + (loop environment (cdr lines) done)) + + ((ident sccs) + (format (current-error-port) + "IDENT: ~s~%" (directive-body directive?)) + (loop environment (cdr lines) done)) + + (else + (scm-error 'cpp-error "parse-directives" + "Unknown pre-processor directive: ~s" + (list line) #f) + ))))))))) + )) + + +(define* (writeln expr optional: (port (current-output-port))) + (write expr port) + (newline port)) + +(define (handle-lines environment lines) + (parse-directive environment + (compose strip-comments fold-lines number-lines))) + + ;; parse-directives environment + +;; Return a number of lines +(define (read-file file-path) + (define raw-lines (call-with-input-file file-path read-lines)) + (mark-with-property! line line-file file-path) + (handle-lines raw-lines)) + + +;; pre defined macros +;; see info manual for cpp 3.7.1 Standard Predefined Macros +;; __FILE__ +;; __LINE__ +;; __DATE__ "Feb 12 1996" +;; __TIME__ "23:59:01" +;; __STDC__ 1 +;; __STDC_VERSION__ 201112L +;; __STDC_HOSTED__ 1 + +;; __cplusplus +;; __OBJC__ +;; __ASSEMBLER__ diff --git a/module/c/preprocessor2.scm b/module/c/preprocessor2.scm new file mode 100644 index 00000000..e99b1049 --- /dev/null +++ b/module/c/preprocessor2.scm @@ -0,0 +1,590 @@ +(define-module (c preprocessor2) + :use-module (srfi srfi-1) + :use-module (srfi srfi-71) + :use-module (srfi srfi-88) + :use-module (ice-9 match) + :use-module (c cpp-environment) + :use-module (c eval2) + :use-module ((c cpp-environment function-like-macro) :select (function-like-macro)) + :use-module ((c cpp-environment object-like-macro) :select (object-like-macro)) + :use-module ((c cpp-environment internal-macro) :select (internal-macro)) + :use-module ((hnh util) :select (->)) + :use-module ((hnh util lens) :select (set)) + :use-module (hnh util path) + :use-module ((c lex2) :select (lex)) + :use-module ((c trigraph) :select (replace-trigraphs)) + :use-module ((c line-fold) :select (fold-lines)) + :export ()) + +;; Returns two values: +;; - tokens until a newline token is met +;; - (potentially the newline token) and the remaining tokens +(define (tokens-until-eol tokens) + (break (lambda (token) (equal? token '(whitespace "\n"))) + tokens)) + +;; match in predicates so non-lists fail properly +(define (whitespace-token? token) + (match token + (`(whitespace ,_) #t) + (_ #f))) + +(define (unwrap-preprocessing-token token) + (match token + (`(preprocessing-token ,x) x) + (_ (scm-error 'wrong-type-arg "unwrap-preprocessing-token" + "Not a preprocessing token: ~s" (list token) + #f)))) + +(define (preprocessing-token? token) + (catch 'wrong-type-arg + (lambda () (unwrap-preprocessing-token token)) + (const #f))) + + +;; Replace all whitespace with single spaces. +(define (squeeze-whitespace tokens) + (match tokens + ('() '()) + ((`(whitespace ,_) `(whitespace ,_) rest ...) + (squeeze-whitespace (cons '(whitespace " ") rest))) + ((`(whitespace ,_) rest ...) + (cons '(whitespace " ") (squeeze-whitespace rest))) + ((token rest ...) + (cons token (squeeze-whitespace rest))))) + +;; Returns the "source" of the token, as a preprocessing string literal token +(define (stringify-token unwrapped-preprocessing-token) + (match unwrapped-preprocessing-token + (`(string-literal ,s) + (format #f "~s" s)) + (`(header-name (q-string ,s)) + (format #f "~s" s)) + (`(header-name (h-string ,s)) + (format #f "<~a>" s)) + (`(identifier ,id) id) + (`(pp-number ,n) n) + (`(character-constant ,c) + (format #f "'~a'" c)) + (`(punctuator ,p) p))) + +(define (stringify-tokens tokens) + `(preprocessing-token + (string-literal + ,(string-concatenate + (map (match-lambda (`(preprocessing-token ,body) (stringify-token body)) + (`(whitespace ,_) " ")) + (squeeze-whitespace tokens)))))) + +;; Expand ## tokens +;; TODO +(define (expand-join macro tokens) + tokens) + +;; parameters is a lexeme list, as returned by parse-parameter-list +(define (build-parameter-map macro parameters) + (if (macro-variadic? macro) + (let ((head rest (split-at parameters (length (macro-identifier-list macro))))) + ;; TODO commas (,) should be interleaved with rest + (cons (cons "__VA_ARGS__" rest) + (map cons (macro-identifier-list macro) head))) + (map cons + (macro-identifier-list macro) + parameters))) + +;; Drop leading whitespace tokens +(define (drop-whitespace tokens) + (drop-while whitespace-token? tokens)) + +(define (drop-whitespace-right tokens) + (-> tokens reverse drop-whitespace reverse)) + +(define (drop-whitespace-both tokens) + (-> tokens + drop-whitespace + drop-whitespace-right)) + +(define (expand-stringifiers macro parameter-map) + (let loop ((tokens (macro-body macro))) + (match tokens + (('(preprocessing-token (punctuator "#")) + rest ...) + (match (drop-whitespace rest) + ((`(preprocessing-token (identifier ,x)) rest ...) + (unless (member x (macro-identifier-list macro)) + (scm-error 'macro-expand-error "expand-stringifiers" + "'#' is not followed by a macro parameter: ~s" + (list x) #f)) + (cons (stringify-tokens (assoc-ref parameter-map x)) + (loop rest))))) + ('() '()) + ((token rest ...) + (cons token (loop rest)))))) + +;; expand function like macro +(define (apply-macro environment macro parameters) + (define parameter-map (build-parameter-map macro parameters)) + (define stringify-resolved (expand-stringifiers macro parameter-map)) + ;; TODO resolve ## + (define resulting-body stringify-resolved #; (expand-join macro stringify-resolved) + ) + (resolve-token-stream (extend-environment environment parameter-map) + resulting-body)) + + + +;; Expand object-like macro + +;; #define VALUE 10 +;; #define str(x) #x +;; #define OTHER str(VALUE) +;; OTHER +;; ⇒ "VALUE" + +;; token should be the token stream just after the name of the macro +(define (expand-macro environment macro tokens) + (cond ((object-macro? macro) + ;; Shouldn't we expand the macro body here? + (values environment (append (macro-body macro) tokens))) + + ((function-macro? macro) + (let ((containing remaining newlines (parse-parameter-list tokens))) + (values (bump-line environment newlines) + ;; Macro output can be macro expanded + ;; TODO self-referential macros? + (append (apply-macro environment macro containing) remaining)))) + + ((internal-macro? macro) + (let ((containing remaining newlines (parse-parameter-list tokens))) + (values (bump-line environment newlines) + (append ((macro-body macro) environment containing) + remaining)))) + + (else + (scm-error 'wrong-type-arg "expand-macro" + "Macro isn't a macro: ~s" + (list macro) #f)))) + +;; Takes a list of preprocessing tokens, and returns two values +;; if the last token was '...' +;; and a list of strings of all token names +;; Note that this is ONLY #define f(x) forms +;; not usage forms +(define (parse-identifier-list tokens) + (let loop ((tokens (remove whitespace-token? tokens)) (done '())) + (match tokens + ('() (values #f (reverse done))) + + ((`(preprocessing-token (identifier ,id)) rest ...) + (loop rest (cons id done))) + + ((`(preprocessing-token (punctuator "..."))) + (values #t (reverse done))) + + ((`(preprocessing-token (punctuator "...")) rest ...) + (scm-error 'cpp-error "parse-identifier-list" + "'...' only allowed as last argument in identifier list. Rest: ~s" + (list rest) #f)) + + ((`(preprocessing-token (punctuator ",")) rest ...) + (loop rest done)) + + ((`(preprocessing-token ,other) rest ...) + (scm-error 'cpp-error "parse-identifier-list" + "Unexpected preprocessing-token in identifier list: ~s" + (list other) #f))))) + + + +;; helper procedure to parse-parameter-list. +;; If a parameter is complex then whitespace is kept, but squeezed to single spaces. Surounding whitespace is removed. +;; Example: +;; #define str(x, y) #y +;; str(x, ( 2, 4 ) ) +;; expands to: +;; "( 2, 4 )" +;; 6.10.3.2 p 2 +(define (cleanup-whitespace tokens) + (-> tokens drop-whitespace-both squeeze-whitespace)) + +;; returns three values: +;; - a list of tokens where each is a parameter to the function like macro +;; - the remaining tokenstream +;; - how many newlines were encountered +;; The standard might call these "replacement lists" +(define (parse-parameter-list tokens) + (let %loop ((depth 0) (newlines 0) (current '()) + (parameters '()) (tokens tokens) (%first-iteration? #t)) + (define* (loop tokens key: + (depth depth) (newlines newlines) + (current current) (parameters parameters)) + (%loop depth newlines current parameters tokens #f)) + (let ((current* (if (zero? depth) + current + (cons (car tokens) current)))) + (match tokens + (('(whitespace "\n") rest ...) + (loop rest newlines: (1+ newlines) current: current*)) + ((`(whitespace ,_) rest ...) + (loop rest current: current*)) + (('(preprocessing-token (punctuator "(")) rest ...) + (loop rest depth: (1+ depth) current: current*)) + (('(preprocessing-token (punctuator ")")) rest ...) + (if (= 1 depth) + ;; return value + (values + (if (null? parameters) + (cond ((null? current) '()) + ((every whitespace-token? current) '()) + (else (reverse + (cons (cleanup-whitespace (reverse current)) + parameters)))) + (reverse + (cond ((null? current) parameters) + ((every whitespace-token? current) parameters) + (else (cons (cleanup-whitespace (reverse current)) + parameters))))) + + rest + newlines) + (loop rest + depth: (1- depth) + current: current*))) + (('(preprocessing-token (punctuator ",")) rest ...) + (if (= 1 depth) + (loop rest + current: '() + parameters: + (cons (cond ((null? current) '()) + ((every whitespace-token? current) '()) + (else (cleanup-whitespace (reverse current)))) + parameters)) + (loop rest current: current*))) + ((_ rest ...) + (loop rest current: current*)))))) + + +(define (join-file-line environment) + (define file (current-file environment)) + (define line (current-line environment)) + (extend-environment + environment + ;; 6.10.8 + (list + (object-like-macro + identifier: "__FILE__" + body: `((preprocessing-token (string-literal ,file)))) + (object-like-macro + identifier: "__LINE__" + body: `((preprocessing-token (pp-number ,(number->string line)))))))) + +(define (c-search-path) (make-parameter (list "." "/usr/include"))) + +;; #include <stdio.h> +(define (resolve-h-file string) + (cond ((path-absolute? string) string) + (else + (let ((filename + (find file-exists? + (map (lambda (path-prefix) + (path-append path-prefix string)) + (c-search-path))))) + (if filename filename + (scm-error 'cpp-error "resolve-h-file" + "Can't resolve file: ~s" + (list string) #f)))))) + +;; #include "myheader.h" +(define (resolve-q-file string) + ;; This should always be a fallback (6.10.2, p. 3) + (cond (else (resolve-h-file string)))) + +(define defined-macro + (internal-macro + identifier: "defined" + body: (lambda (environment tokens) + (match tokens + (`((preprocessing-token (identifier ,id))) + `(preprocessing-token (pp-number ,(boolean->c-boolean (in-environment? environment id))))) + (_ (scm-error 'cpp-error "defined" + "Invalid parameter list to `defined': ~s" + (list tokens) #f)))))) + +;; environment, tokens → environment +(define (handle-pragma environment tokens) + (match tokens + (`((preprocessing-token (identifier "STDC")) (whitespace ,_) ... + (preprocessing-token (identifier ,identifier)) (whitespace ,_) ... + (preprocessing-token (identifier ,on-off-switch)) (whitespace ,_) ...) + ;; TODO actually do something with the pragmas (probably just store them in the environment) + (format (current-error-port) + "#Pragma STDC ~a ~a" identifier on-off-switch) + environment) + (_ (format (current-error-port) + "Non-standard #Pragma: ~s~%" tokens) + environment))) + + +;; TODO +;; (define _Pragma-macro +;; (internal-macro +;; identifier: "_Pragma" +;; body: (lambda (environment tokens) +;; ))) + +;; TODO +(define (resolve-constant-expression tokens) + 'TODO + ) + +;; Expands a token-stream which contains no pre-processing directives (#if:s, ...) +(define (resolve-token-stream environment tokens) + (let loop ((tokens tokens)) + (match tokens + ('() '()) + ((`(preprocessing-token (identifier ,id)) rest ...) + (call-with-values (lambda () (maybe-extend-identifier environment id rest)) + (lambda (_ tokens) (loop tokens)))) + ((`(whitespace ,_) rest ...) + (loop rest)) + ((token rest ...) + (cons token (loop rest)))))) + +;; returns a new environment +;; handle body of #if +;; environment, (list token) → environment +(define (resolve-for-if environment tokens) + (-> (extend-environment environment defined-macro) + (resolve-token-stream tokens) + resolve-constant-expression + c-boolean->boolean + (if (enter-active-if environment) + (enter-inactive-if environment)))) + +;; environment, string, (list token) → environment, (list token) +(define (maybe-extend-identifier environment identifier remaining-tokens) + (cond ((get-identifier environment identifier) + => (lambda (value) (expand-macro (join-file-line environment) + value + remaining-tokens))) + (else ; It wasn't an identifier, leave it as is + ;; TODO shouldn't we include the identifier in the remaining tokens stream? + (values environment remaining-tokens)))) + +(define (resolve-and-include-header environment tokens) + (let loop ((%first-time #t) (tokens tokens)) + (match (drop-whitespace tokens) + ((`(header-name (h-string ,str)) rest ...) + (cond ((remove whitespace-token? rest) + (negate null?) + => (lambda (tokens) + (scm-error 'cpp-error "resolve-and-include-header" + "Unexpected tokens after #include <>: ~s" + (list tokens) #f)))) + (handle-preprocessing-tokens + environment + (-> str resolve-h-file read-file tokenize))) + + ((`(header-name (q-string ,str)) rest ...) + (cond ((remove whitespace-token? rest) + (negate null?) + => (lambda (tokens) + (scm-error 'cpp-error "resolve-and-include-header" + "Unexpected tokens after #include <>: ~s" + (list tokens) + #f)))) + (handle-preprocessing-tokens + environment + (-> str resolve-q-file read-file tokenize))) + + (tokens + (unless %first-time + (scm-error 'cpp-error "resolve-and-include-header" + "Failed parsing tokens: ~s" + (list tokens) #f)) + (loop #f (resolve-token-stream environment tokens)))))) + +;; environment, tokens → environment +(define (handle-line-directive environment tokens*) + (let loop ((%first-time #t) (tokens tokens*)) + (match tokens + (`((preprocessing-token (pp-number ,line)) (whitespace ,_) ... rest ...) + (match rest + (`((preprocessing-token (string-literal ,file)) (whitespace ,_) ...) + (-> environment + (set current-line line) + (set current-file file))) + (`((whitespace ,_) ...) + (set environment current-line line)) + (_ (unless %first-time + (scm-error 'cpp-error "handle-line-directive" + "Invalid line directive: ~s" + (list tokens*) #f)) + (loop #f (resolve-token-stream environment tokens))))) + (_ (unless %first-time + (scm-error 'cpp-error "handle-line-directive" + "Invalid line directive: ~s" + (list tokens*) #f)) + (loop #f (resolve-token-stream environment tokens)))))) + +;; environment, tokens → environment +(define (resolve-define environment tokens) + (match tokens + ((`(preprocessing-token (identifier ,identifier)) tail ...) + (-> environment + bump-line + (add-identifier! + identifier + (match tail + (('(preprocessing-token (punctuator "(")) rest ...) + ;; function like macro + (call-with-values (lambda () (break (lambda (token) (equal? token '(preprocessing-token (punctuator ")")))) + rest)) + (lambda (identifier-list replacement-list) + (let ((variadic? identifiers (parse-identifier-list identifier-list))) + + (function-like-macro + identifier: identifier + variadic?: variadic? + identifier-list: identifiers + ;; NOTE 6.10.3 states that there needs to be at least on whitespace here + body: (cdr replacement-list)))))) + (_ (object-like-macro + identifier: identifier + body: tail)))))))) + + + +;; environment, tokens -> environment, tokens +(define (handle-preprocessing-tokens environment tokens) + (let loop ((environment environment) (tokens tokens)) + (define (err fmt . args) + (scm-error 'cpp-error "handle-preprocessing-tokens" + (string-append "~a:~a " fmt) + (cons* (current-file environment) + (current-line environment) + args) + #f)) + + ;; TODO all of this needs to be surounded with a conditional for + ;; environmentns if status. However, ensure that each directive + ;; starts at start of line + + (match tokens + ('() '()) + ((`(whitespace "\n") `(whitespace ,_) '... '(preprocessing-token (puntuator "#")) rest ...) + ;; Line tokens are those in this line, + ;; while remaining tokens are the newline, follewed by the rest of the files tokens + (let ((line-tokens remaining-tokens (tokens-until-eol rest))) + ;; Actual tokens just removes all whitespace between "#" and "define" + (let ((actual-tokens (drop-whitespace line-tokens))) + (if (null? actual-tokens) + (loop (bump-line environment) remaining-tokens) + (match (car actual-tokens) + (`(preprocessing-token (identifier "if")) + (let ((environment (resolve-for-if environment actual-tokens))) + (loop environment remaining-tokens))) + + (`(preprocessing-token (identifier "ifdef")) + (match actual-tokens + ((`(preprocessing-token (identifier ,id)) _ ...) + (loop + ((if (in-environment? environment id) + enter-active-if enter-inactive-if) + environment) + remaining-tokens)) + (_ (err "Non identifier in ifdef: ~s" actual-tokens)))) + + (`(preprocessing-token (identifier "ifndef")) + (match actual-tokens + ((`(preprocessing-token (identifier ,id)) _ ...) + (loop + ((if (in-environment? environment id) + enter-inactive-if enter-active-if) + environment) + remaining-tokens)) + (_ (err "Non identifier in ifndef: ~s" actual-tokens)))) + + ('(preprocessing-token (identifier "else")) + ;; TODO + 'TODO + ) + + ('(preprocessing-token (identifier "elif")) + (-> environment leave-if + (resolve-for-if actual-tokens) + (loop remaining-tokens))) + + ('(preprocessing-token (identifier "endif")) + (loop (leave-if environment) remaining-tokens)) + + ('(preprocessing-token (identifier "include")) + (call-with-values + (lambda () (resolve-and-include-header environment (cdr actual-tokens))) + (lambda (environment tokens) + (loop environment (append tokens remaining-tokens))))) + + ('(preprocessing-token (identifier "define")) + (let ((env (resolve-define environment (cdr actual-tokens)))) + (loop env remaining-tokens)) + ) + + ('(preprocessing-token (identifier "undef")) + (loop (match actual-tokens + (`((preprocessing-token (identifier ,id))) + (-> environment bump-line (remove-identifier! id)))) + remaining-tokens)) + + ('(preprocessing-token (identifier "line")) + (loop (handle-line-directive environment actual-tokens) + remaining-tokens)) + + ('(preprocessing-token (identifier "error")) + ;; NOTE this is an "expected" error + (throw 'cpp-error actual-tokens)) + + ('(preprocessing-token (identifier "pragma")) + (loop (handle-pragma environment actual-tokens) + remaining-tokens))))))) + + ((`(preprocessing-token (identifier ,id)) rest ...) + (call-with-values (lambda () (maybe-extend-identifier environment id rest)) + loop)) + + (('(whitespace "\n") rest ...) + (cons '(whitespace "\n") (loop (bump-line environment) rest))) + + ((token rest ...) (cons token (loop environment rest)))))) + + + +(define (comment->whitespace expr) + (match expr + (('comment _) '(whitespace " ")) + (other other))) + +(define (read-file path) + (call-with-input-file path (@ (ice-9 rdelim) read-string))) + +(define (comment->whitespace token) + (match token + (`(comment ,_) '(whitespace " ")) + (other other))) + +(define (comments->whitespace tokens) + (map comment->whitespace tokens)) + +;;; 5.1.11.2 Translation phases + +(define (tokenize string) + (-> string +;;; 1. trigraph replacement + replace-trigraphs +;;; 2. Line folding + fold-lines +;;; 3. Decomposition into preprocenning tokens, whitespaces, and comments + lex +;;; 4. Execution of preprocessing directives, all preprocessing directives are then deleted + comments->whitespace + ;; squeeze-whitespace-blocks + )) diff --git a/module/c/trigraph.scm b/module/c/trigraph.scm new file mode 100644 index 00000000..197e01a4 --- /dev/null +++ b/module/c/trigraph.scm @@ -0,0 +1,24 @@ +(define-module (c trigraph) + :use-module (ice-9 regex) + :export (replace-trigraphs)) + +(define rx (make-regexp "\\?\\?([=\\(\\)'!<>/-])")) + +(define (proc m) + (case (string-ref (match:substring m 2) 0) + ((#\=) "#") + ((#\() "[") + ((#\)) "]") + ((#\') "^") + ((#\<) "{") + ((#\>) "}") + ((#\!) "|") + ((#\-) "~") + ((#\/) "\\"))) + +(define (replace-trigraphs string) + (call-with-output-string + (lambda (port) + (regexp-substitute/global + port rx string + 'pre proc 'post)))) diff --git a/module/hnh/util.scm b/module/hnh/util.scm index d2c0dd5f..9a45704b 100644 --- a/module/hnh/util.scm +++ b/module/hnh/util.scm @@ -34,6 +34,8 @@ group-by split-by + split-by-one-of + span-upto cross-product @@ -341,6 +343,29 @@ (cdr rem))]))) +(define (split-by-one-of lst items) + (cond ((null? items) + (scm-error 'wrong-type-arg "split-by-one-of" + "Must have at least one item to split by, when splitting ~s" + (cons items '()) #f)) + ((not (list? items)) + (scm-error 'wrong-type-arg "split-by-one-of" + "Items must be list of list of symbols, got ~s" + (list items) #f)) + (else + (call-with-values + (lambda () + (car+cdr + (let loop ((token 'sentinel-token) (lst lst)) + (let ((head tail (break (lambda (item) (memv item items)) + lst))) + (let ((group (cons token head))) + (if (null? tail) + (list group) + (cons group (loop (car tail) (cdr tail))))))))) + ;; Remove the sentinel token + (lambda (first rest) (cons (cdr first) rest)))))) + ;; Simar to span from srfi-1, but never takes more than ;; @var{count} items. Can however still take less. diff --git a/module/hnh/util/path.scm b/module/hnh/util/path.scm index ea081e85..0c8af48a 100644 --- a/module/hnh/util/path.scm +++ b/module/hnh/util/path.scm @@ -3,6 +3,7 @@ :use-module (srfi srfi-71) :use-module (hnh util) :export (path-append + path-absolute? path-join path-split file-hidden? @@ -12,6 +13,8 @@ (define // file-name-separator-string) (define /? file-name-separator?) +(define path-absolute? absolute-file-name?) + (define (path-append . strings) (fold (lambda (s done) (string-append diff --git a/module/srfi/srfi-64/util.scm b/module/srfi/srfi-64/util.scm new file mode 100644 index 00000000..a371227f --- /dev/null +++ b/module/srfi/srfi-64/util.scm @@ -0,0 +1,11 @@ +(define-module (srfi srfi-64 util) + :use-module (ice-9 curried-definitions) + :use-module ((srfi srfi-1) :select (every)) + :use-module (srfi srfi-64) + :export (test-match-group)) + +;; Specifier for name of group +(define ((test-match-group name . names) runner) + (every string=? + (reverse (cons name names)) + (test-runner-group-stack runner))) |