From 690034ca8abcd931e2fb6bb8129450deee701179 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Hugo=20H=C3=B6rnquist?= Date: Tue, 28 Jun 2022 23:46:08 +0200 Subject: Major work on c parser. --- module/c/cpp.scm | 60 +++++++++------------ module/c/lex.scm | 25 +++++++-- module/c/operators.scm | 1 + module/c/parse.scm | 138 ++++++++++++++++++++++++++++++++++++++++++------- 4 files changed, 168 insertions(+), 56 deletions(-) (limited to 'module/c') diff --git a/module/c/cpp.scm b/module/c/cpp.scm index a2935352..9a8245ad 100644 --- a/module/c/cpp.scm +++ b/module/c/cpp.scm @@ -11,7 +11,7 @@ :use-module (c lex) :use-module (c parse) :use-module (c operators) - :export (do-funcall replace-symbols include#) + :export (replace-symbols include#) ) @@ -20,7 +20,7 @@ ;; 2 macro name | F ;; 3 macro args | (x, y) ;; 4 macro body | x + y -(define define-re (make-regexp "^#define ((\\w+)(\\([^)]*\\))?) (.*)")) +(define define-re (make-regexp "^#define ((\\w+)([(][^)]*[)])?) (.*)")) (define (tokenize-define-line header-line) (aif (regexp-exec define-re header-line) @@ -32,20 +32,6 @@ (list header-line) #f))) -(define (do-funcall function arguments) - (if (list? arguments) - (apply function arguments) - (function arguments))) - -(define symb-map - `((,(symbol #\|) . logior) - (funcall . (@ (c cpp) do-funcall)) - (&& . and) - (& . logand) - (== . =) - (!= . (negate =)) - )) - (define (replace-symbols tree dict) (if (not (list? tree)) (or (assoc-ref dict tree) tree) @@ -77,7 +63,6 @@ [arg (list arg)])) (define right (f (cdr pair))) - (define alt-right (replace-symbols right symb-map)) (define dependencies (lset-difference eq? @@ -91,12 +76,12 @@ dependencies (match left [('funcall name ('#{,}# args ...)) - (cons name `(lambda ,args ,alt-right))] + (cons name `(lambda ,args ,right))] [('funcall name arg) - (cons name `(lambda (,arg) ,alt-right))] + (cons name `(lambda (,arg) ,right))] - [name (cons name alt-right)]))) + [name (cons name right)]))) (define (parse-cpp-file lines) @@ -104,7 +89,9 @@ (catch #t (lambda () (parse-cpp-define line)) (lambda (err caller fmt args data) - (format #t "~a ~?~%" fmt args) + (format #t "~a in ~a: ~?~%" + err caller fmt args) + (format #t "~s~%" line) #f))) lines)) @@ -114,29 +101,32 @@ (define (tokenize-header-file header-file) (map tokenize-define-line (call-with-port - (open-input-pipe - (string-append "cpp -dM " header-file)) + (open-pipe* OPEN_READ "cpp" "-dM" header-file) read-lines))) -(define-macro (include# header-file . args) - - (define define-form (if (null? args) 'define (car args))) - - (define lines (remove (compose private-c-symbol? car) - (tokenize-header-file header-file))) +(define (load-cpp-file header-file) + (define lines (tokenize-header-file header-file)) (define forms (parse-cpp-file lines)) - (define graph* - (fold (lambda (node graph) - (add-node graph (cdr node) (car node))) - (make-graph car) - (filter identity forms))) + (fold (lambda (node graph) + (add-node graph (cdr node) (car node))) + (make-graph car) + (filter identity forms))) +(define (include% header-file) + (define graph* (load-cpp-file header-file)) ;; Hack for termios since this symbol isn't defined. ;; (including in the above removed private c symbols) (define graph (add-node graph* (cons '_POSIX_VDISABLE #f) '())) + ;; TODO expand bodies + ;; (remove (compose private-c-symbol? car)) + (resolve-dependency-graph graph)) + +(define-macro (include# header-file . args) + + (define define-form (if (null? args) 'define (car args))) `(begin ,@(map (lambda (pair) `(,define-form ,(car pair) ,(cdr pair))) - (resolve-dependency-graph graph)))) + (include% header-file)))) diff --git a/module/c/lex.scm b/module/c/lex.scm index 34e52d88..5f395322 100644 --- a/module/c/lex.scm +++ b/module/c/lex.scm @@ -43,8 +43,22 @@ (define-peg-pattern integer all (and (or base-8 base-16 base-10) (? integer-suffix))) +;; (define-peg-pattern float-suffix all +;; (* (or "f" "F" "l" "L"))) + +;; (define-peg-pattern exponent all +;; (and (ignore (or "e" "E")) (? (or "+" "-")) base-10)) + +;; (define-peg-pattern float all +;; (or +;; (and base-10 exponent (? float-suffix)) +;; (and base-10 (ignore ".") (? exponent) (? float-suffix)) +;; (and (? base-10) (ignore ".") base-10 (? exponent) (? float-suffix)))) + (define-peg-pattern number body - (or integer)) + (or ; float + integer + )) (define-peg-pattern group all (and (ignore "(") expr (ignore ")"))) @@ -65,11 +79,16 @@ (define-peg-pattern char all (and (ignore "'") (or escaped-char peg-any) (ignore "'"))) +;; (define-peg-pattern quot none (string "\"")) + +;; (define-peg-pattern string all +;; (and quot (* (or escaped-char (or peg-any))) quot)) (define-peg-pattern* operator all `(or ,@(map symbol->string symbol-binary-operators) ,@(map (lambda (op) `(and ,(symbol->string op) ws)) - wordy-binary-operators))) + wordy-binary-operators) + "?" ":")) ;; whitespace (define-peg-pattern ws none @@ -99,7 +118,7 @@ ))) (define-peg-pattern postfix-operator all - (or "++" "--")) + (or "++" "--" "*")) (define-peg-pattern postfix all ;; literals can't be in-place incremented and decremented diff --git a/module/c/operators.scm b/module/c/operators.scm index ab1b3e7c..131c702c 100644 --- a/module/c/operators.scm +++ b/module/c/operators.scm @@ -9,6 +9,7 @@ `(+ - * / & ,(symbol #\|) ^ << >> % < > =)) ;; apparently part of C +;; https://en.cppreference.com/w/cpp/language/operator_alternative (define wordy-binary-operators '(bitand and_eq and bitor or_eq or xor_eq xor)) diff --git a/module/c/parse.scm b/module/c/parse.scm index 8030da77..d407f5ac 100644 --- a/module/c/parse.scm +++ b/module/c/parse.scm @@ -38,6 +38,19 @@ "Invalid integer suffix ~s" (list str) #f))) +(define (group-body->type vars) + (concatenate + (map + (match-lambda (('variable var) (list (parse-lexeme-tree `(variable ,var)))) + (('postfix ('variable var) + ('postfix-operator "*")) + (list (parse-lexeme-tree `(variable ,var)) + '*)) + (else (scm-error 'c-parse-error "parse-lexeme-tree" + "Invalid token ~s in typecast form: ~s" + (list else vars) #f))) + vars))) + (define (parse-lexeme-tree tree) (match tree ['() '()] @@ -74,25 +87,43 @@ ((#\v) #\vtab) ((#\\) #\\) ((#\') #\')))] + [('char c) (char->integer (string-ref c 0))] [('variable var) (string->symbol var)] + + [('operator "&&") 'and] + [('operator "&=") 'and_eq] + [('operator "&") 'bitand] + [('operator "|") 'bitor] + [('operator "!=") 'not_eq] + [('operator "||") 'or] + [('operator "|=") 'or_eq] + [('operator "^") 'xor] + [('operator "^=") 'xor_eq] [('operator op) (string->symbol op)] + [('prefix-operator op) (case (string->symbol op) + ((!) 'not) + ((~) 'compl) ((*) 'dereference) ((&) 'pointer) ((++) 'pre-increment) ((--) 'pre-decrement) - (else => identity))] + (else (scm-error 'c-parse-error "parse-lexeme-tree" + "Unknown prefix operator ~s" + (list op) #f)))] [('postfix-operator op) (case (string->symbol op) [(++) 'post-increment] [(--) 'post-decrement] - [else => identity])] + [else (scm-error 'c-parse-error "parse-lexeme-tree" + "Unknown postfix operator ~s" + (list op) #f)])] ;; Parenthesis grouping - [('group args) + [('group args ...) (parse-lexeme-tree args)] ;; Atomic item. Used by flatten-infix @@ -108,37 +139,104 @@ ,(parse-lexeme-tree arg))] [('infix args ...) - (resolve-order-of-operations - (flatten-infix (cons 'infix args)))] + (let ((r + (resolve-order-of-operations + (flatten-infix (cons 'infix args))))) + (match r + (`(: (? ,op ,true) + ,false) + `(ternary ,op ,true ,false)) + (_ r)))] [('funcall function ('group arguments)) `(funcall ,(parse-lexeme-tree function) ,(parse-lexeme-tree arguments))] - [bare (scm-error 'c-parse-error - "parse-lexeme-tree" - "Naked literal in lex-tree: ~s" - (list bare) - #f)])) + [(('variable "struct") ('variable value) ..1) + `(struct-type ,@(map string->symbol value)) + ] + + ;; A list of variables. Most likely a type signature + ;; [(('variable value) ..1) + ;; ] + + ;; A typecast with only variables must (?) be a typecast? + [(('group groups ..1) ... value) + (fold-right (lambda (type done) `(as-type ,type ,done)) + (parse-lexeme-tree value) + (map group-body->type groups))] + + ;; Type name resolution? + ;; https://en.wikipedia.org/wiki/C_data_types + ;; base types with spaces: + ;; ======================= + ;; [[un]signed] char + ;; [[un]signed] short [int] + ;; [[un]signed] int + ;; [un]signed [int] + ;; [[un]signed] long [int] + ;; [[un]signed] long long [int] + ;; float + ;; [long] double + + ;; https://en.wikipedia.org/wiki/Type_qualifier + ;; qualifiers + ;; const + ;; volatile + ;; restrict + ;; _Atomic + + + ;; Storage specifiers + ;; auto + ;; register + ;; static + ;; extern + + ;; struct + ;; enum + ;; union + + ;; https://en.wikipedia.org/wiki/C_syntax + ;; int (*ptr_to_array)[100] + + + [(? symbol? bare) + (scm-error 'c-parse-error + "parse-lexeme-tree" + "Naked literal in lex-tree: ~s" + (list bare) + #f)] + + [form + (scm-error 'c-parse-error + "parse-lexeme-tree" + "Unknown form in lex-tree: ~s" + (list form) #f) + ])) ;; https://en.wikipedia.org/wiki/Operators_in_C_and_C%2B%2B +;; https://en.cppreference.com/w/c/language/operator_precedence (define order-of-operations (reverse (concatenate ;; This is only for binary operations `((-> ,(symbol #\.)) + ;; All unary procedures go here, incnluding typecasts, and sizeof (* / %) (+ -) (<< >>) (< <= > >=) - (== !=) - (&) - (^) - (,(symbol #\|)) - (&&) - (,(symbol #\| #\|)) - (= += -= *= /= %= <<= >>= &= ^= ,(symbol #\| #\=)) + (== != not_eq) + (& bitand) + (^ xorg) + (,(symbol #\|) bitor) + (&& and) + (,(symbol #\| #\|) or) + (? :) + (= += -= *= /= %= <<= >>= &= ^= ,(symbol #\| #\=) + and_eq or_eq xor_eq) (,(symbol #\,)) )))) @@ -148,8 +246,12 @@ (define* (resolve-order-of-operations tree optional: (order order-of-operations)) + (format (current-error-port) "~s~%" tree) (if (null? order) - (car tree) + (scm-error 'c-parse-error + "resolve-order-of-operations" + "Out of operations to resolve when resolving expression ~s" + (list tree) #f) (match tree [('*other* body ...) body] [(form) (resolve-order-of-operations form order)] -- cgit v1.2.3