diff options
Diffstat (limited to 'module/c/old')
-rw-r--r-- | module/c/old/cpp.scm | 151 | ||||
-rw-r--r-- | module/c/old/lex.scm | 163 | ||||
-rw-r--r-- | module/c/old/operators.scm | 24 | ||||
-rw-r--r-- | module/c/old/parse.scm | 426 |
4 files changed, 764 insertions, 0 deletions
diff --git a/module/c/old/cpp.scm b/module/c/old/cpp.scm new file mode 100644 index 00000000..1623bd11 --- /dev/null +++ b/module/c/old/cpp.scm @@ -0,0 +1,151 @@ +(define-module (c old cpp) + :use-module (hnh util) + :use-module (srfi srfi-1) + :use-module (ice-9 popen) + :use-module (ice-9 match) + :use-module (ice-9 regex) + :use-module ((rnrs io ports) :select (call-with-port)) + :use-module ((rnrs bytevectors) :select (bytevector?)) + :use-module (ice-9 format) + :use-module ((hnh util io) :select (read-lines)) + :use-module (hnh util graph) + :use-module (c old lex) + :use-module (c old parse) + :use-module (c old operators) + :export (replace-symbols include#) + ) + + +;; input "#define F(x, y) x + y" +;; 1 full define | F(x,y) +;; 2 macro name | F +;; 3 macro args | (x,y) +;; 5 macro body | x + y or #f +(define define-re (make-regexp "^#define ((\\w+)([(][^)]*[)])?)( (.*))?")) + +(define (tokenize-define-line header-line) + (aif (regexp-exec define-re header-line) + (cons (match:substring it 1) + (let ((body (match:substring it 5))) + (if (or (eqv? body #f) + (string-null? body)) + "1" body))) + (scm-error 'c-parse-error + "tokenize-define-line" + "Line dosen't match: ~s" + (list header-line) #f))) + + +(define (replace-symbols tree dict) + (if (not (list? tree)) + (or (assoc-ref dict tree) tree) + (map (lambda (node) (replace-symbols node dict)) + tree))) + +;; Direct values. Lisp also has quoted symbols in this group. +(define (immediate? x) + (or (number? x) + (bytevector? x))) + +;; TODO replace this with something sensible +;; like a correct list extracted from (c eval) +;; and not thinging that types are variables +;; built in symbols. Should never be marked as dependencies +(define (primitive? x) + (memv x `( + ;; language primitives + sizeof + + ;; special forms introduced by parser + funcall ternary struct-type as-type + + ;; unary operatons which aren't also binary operators + ++ -- ! ~ + not compl dereference pointer + pre-increment pre-decrement + post-increment post-decrement + ,@binary-operators + ))) + + + +;; (symbol . value) -> (list (dependencies . symbol . value) +(define (parse-cpp-define pair) + (define f (compose parse-lexeme-tree lex)) + (define left (f (car pair))) + (define proc-args + (match (and (pair? left) + (eq? 'funcall (car left)) + (caddr left)) + [#f '()] + [(_ args ...) args] + [arg (list arg)])) + + (define right (f (cdr pair))) + (define dependencies + (lset-difference + eq? + (remove primitive? + (remove immediate? + (flatten (if (list? right) + right (list right))))) + proc-args)) + + (cons + dependencies + (match left + [('funcall name ('#{,}# args ...)) + (cons name `(lambda ,args ,right))] + + [('funcall name arg) + (cons name `(lambda (,arg) ,right))] + + [name (cons name right)]))) + + +(define (parse-cpp-file lines) + (map (lambda (line) + (catch #t + (lambda () (parse-cpp-define line)) + (lambda (err caller fmt args data) + (format #t "~a in ~a: ~?~%" + err caller fmt args) + (format #t "~s~%" line) + #f))) + lines)) + +(define (private-c-symbol? string) + (char=? #\_ (string-ref string 0))) + +(define (tokenize-header-file header-file) + (map tokenize-define-line + (call-with-port + (open-pipe* OPEN_READ "cpp" "-dM" header-file) + read-lines))) + +(define (load-cpp-file header-file) + + (define lines (tokenize-header-file header-file)) + (define forms (parse-cpp-file lines)) + + (fold (lambda (node graph) + (add-node graph (cdr node) (car node))) + (make-graph car) + (filter identity forms))) + +(define (include% header-file) + (define graph* (load-cpp-file header-file)) + ;; Hack for termios since this symbol isn't defined. + ;; (including in the above removed private c symbols) + (define graph (add-node graph* (cons '_POSIX_VDISABLE 0) '())) + ;; TODO expand bodies + ;; (remove (compose private-c-symbol? car)) + (resolve-dependency-graph graph)) + +(define-macro (include# header-file . args) + + (define define-form (if (null? args) 'define (car args))) + + `(begin + ,@(map (lambda (pair) `(,define-form ,(car pair) ,(cdr pair))) + (include% header-file)))) diff --git a/module/c/old/lex.scm b/module/c/old/lex.scm new file mode 100644 index 00000000..dcc7336d --- /dev/null +++ b/module/c/old/lex.scm @@ -0,0 +1,163 @@ +(define-module (c old lex) + :use-module (ice-9 peg) + :use-module (c old operators) + :export (lex)) + + +;; Like define-peg-pattern, but body is evaluated +(define-syntax define-peg-pattern* + (lambda (stx) + (syntax-case stx () + ((_ sym accum pat) + #`(define sym + (let ((matchf (compile-peg-pattern (datum->syntax #'stx pat) 'accum))) + (let ((syn ((@ (ice-9 peg codegen) wrap-parser-for-users) #'stx matchf 'accum 'sym))) + ((@ (system base compile) compile) + ((@ (ice-9 peg cache) cg-cached-parser) + syn))))))))) + + + + +(define-peg-pattern base-8-digit body + (range #\0 #\7)) + +(define-peg-pattern base-10-digit body + (range #\0 #\9)) + +(define-peg-pattern base-16-digit body + (or (range #\0 #\9) + (range #\A #\F) + (range #\a #\f))) + +;; https://en.cppreference.com/w/cpp/language/integer_literal +(define-peg-pattern base-10 all (+ base-10-digit)) +(define-peg-pattern base-8 all (and (ignore "0") (+ base-8-digit))) +(define-peg-pattern base-16 all (and (ignore (and "0" (or "x" "X"))) + (+ base-16-digit))) + +;; accept anything now, ensure correctnes later +(define-peg-pattern integer-suffix all + (* (or "u" "U" "l" "L"))) + +(define-peg-pattern integer all + (and (or base-8 base-16 base-10) (? integer-suffix))) + +(define-peg-pattern float-suffix all + (* (or "f" "F" "l" "L"))) + +(define-peg-pattern exponent all + (and (ignore (or "e" "E")) (? (or "+" "-")) base-10)) + +;; Helper patterns for creating named groups in float +(define-peg-pattern float-integer all base-10) +(define-peg-pattern float-decimal all base-10) + +(define-peg-pattern float all + (or (and float-integer exponent (? float-suffix)) + (and (? float-integer) (ignore ".") float-decimal (? exponent) (? float-suffix)) + (and float-integer (ignore ".") (? exponent) (? float-suffix)))) + +(define-peg-pattern number body + (or float integer)) + +(define-peg-pattern group all + (and (ignore "(") expr (ignore ")"))) + +(define-peg-pattern base-8-char all + (and base-8-digit + (? base-8-digit) + (? base-8-digit))) + +(define-peg-pattern base-16-char all + (and (ignore "x") base-16-digit (? base-16-digit))) + +(define-peg-pattern escaped-char all + (and (ignore "\\") (or base-16-char + base-8-char + peg-any))) + +(define-peg-pattern char all + (and (ignore "'") (or escaped-char peg-any) (ignore "'"))) + +(define-peg-pattern quot none "\"") + +(define-peg-pattern string all + (and quot (* (or escaped-char (and (not-followed-by "\"") peg-any))) quot)) + +(define-peg-pattern* operator all + `(or ,@(map symbol->string symbol-binary-operators) + ,@(map (lambda (op) `(and ,(symbol->string op) ws)) + wordy-binary-operators) + "?" ":")) + +;; whitespace +(define-peg-pattern ws none + (or " " " " "\n")) + +;; space (for when whitespace is optional) +(define-peg-pattern sp none (* ws)) + +(define-peg-pattern safe-letter body + (or "_" + (range #\A #\Z) + (range #\a #\z))) + +(define-peg-pattern variable all + (and safe-letter + (* (or safe-letter + base-10-digit)))) + +(define-peg-pattern prefix-operator all + ;; It's important that ++ and -- are BEFORE + and - + ;; otherwise the first + is found, leaving the second +, which fails + ;; to lex since it's an invalid token + ;; TODO sizeof can be written as a prefix operator + ;; (without parenthesis) if the operand is an expression. + (or "*" "&" "++" "--" + "!" "~" "+" "-")) + + +;;; Note that stacked pre or postfix operators without parenthesis +;;; dosen't work. So `*&C' is invalid, while `*(&C)' is valid. + +(define-peg-pattern prefix all + (and prefix-operator sp (or variable group funcall literal))) + +(define-peg-pattern postfix-operator all + (or "++" "--" "*")) + +(define-peg-pattern postfix all + ;; literals can't be in-place incremented and decremented + ;; Make sure we don't match postfix-operator here, since + ;; that also gives us an infinite loop. + (and (or prefix funcall group variable) sp postfix-operator)) + +(define-peg-pattern infix all + ;; first case is "same" as expr, but in different order to prevent + ;; infinite self reference. Pre and postfix not here, solved by having + ;; them before infix in expr + (and (or funcall postfix prefix group literal variable) + sp operator sp expr)) + +(define-peg-pattern funcall all + (and variable sp group)) + +(define-peg-pattern literal body + (or char string number)) + +;;; main parser +(define-peg-pattern expr body + (+ (and sp (or + ;; float must be BEFORE infix, otherwise 3.2 is parsed as (infix 3 (operator ".") 2) + ;; that however breaks the infix logic, meaning that floating point numbers can't be + ;; used in basic arithmetic. + ;; TODO remove all implicit order of operations handling in the lexer, and move it to + ;; the parser. This should also fix the case of typecasts being applied incorrectly. + float + infix postfix prefix funcall group literal variable) + sp))) + + +(define (lex string) + (peg:tree (match-pattern expr string))) diff --git a/module/c/old/operators.scm b/module/c/old/operators.scm new file mode 100644 index 00000000..0b253ada --- /dev/null +++ b/module/c/old/operators.scm @@ -0,0 +1,24 @@ +(define-module (c old operators) + :export (wordy-binary-operators + symbol-binary-operators + binary-operators)) + + +;;; Simple operators are those which can be combined with '=' +(define simple-operators + `(+ - * / & ,(symbol #\|) ^ << >> % < > =)) + +;; apparently part of C +;; https://en.cppreference.com/w/cpp/language/operator_alternative +(define wordy-binary-operators + '(bitand and_eq and bitor or_eq or xor_eq xor not_eq)) + +(define symbol-binary-operators + (append (map (lambda (x) (symbol-append x '=)) simple-operators) + `(&& ,(symbol #\| #\|) != ,(symbol #\,) + -> ,(symbol #\.)) + simple-operators)) + +(define binary-operators + (append symbol-binary-operators + wordy-binary-operators)) diff --git a/module/c/old/parse.scm b/module/c/old/parse.scm new file mode 100644 index 00000000..d598e3c9 --- /dev/null +++ b/module/c/old/parse.scm @@ -0,0 +1,426 @@ +(define-module (c old parse) + :use-module (hnh util) + :use-module (srfi srfi-1) + :use-module (srfi srfi-71) + :use-module (ice-9 match) + :use-module ((rnrs io ports) + :select (string->bytevector make-transcoder utf-8-codec)) + :use-module (rnrs bytevectors) + :export (parse-lexeme-tree)) + +(define (permutations set) + (concatenate + (map (lambda (key) + (map (lambda (o) (cons key o)) + (delete key set))) + set))) + +(define (symbol-concat pair) + (cond [(null? (car pair)) (cdr pair)] + [(null? (cdr pair)) (car pair)] + [else (symbol-append (car pair) (cdr pair))])) + +(define (parse-integer-suffix str) + (define valid-sequences + (delete 'dummy + (lset-union eq? '(dummy) + (map symbol-concat (permutations '(() U L))) + (map symbol-concat (permutations '(() U LL)))))) + + ;; => (LLU ULL LL LU UL L U) + + (aif (memv (string->symbol (string-upcase str)) + valid-sequences) + (case (car it) + [(LLU ULL) '(unsigned long long)] + [(LU UL) '(unsigned long)] + [(LL) '(long long)] + [(L) '(long)] + [(U) '(unsigned)]) + (scm-error 'c-parse-error "parse-integer-suffix" + "Invalid integer suffix ~s" + (list str) #f))) + +(define (parse-float-suffix str) + (case (string->symbol str) + ((f F) '(float)) + ((l L) '(long double)))) + +(define (group-body->type vars) + (concatenate + (map + (match-lambda (('variable var) (list (parse-lexeme-tree `(variable ,var)))) + (('postfix ('variable var) + ('postfix-operator "*")) + (list (parse-lexeme-tree `(variable ,var)) + '*)) + (else (scm-error 'c-parse-error "parse-lexeme-tree" + "Invalid token ~s in typecast form: ~s" + (list else vars) #f))) + vars))) + +;; Takes a list of strings and integers, and merges it all into a single +;; bytevector representing a c string +(define* (string-fragments->c-string + fragments optional: (transcoder (make-transcoder (utf-8-codec)))) + + (define fragments-fixed + (map (lambda (frag) + (if (string? frag) + (string->bytevector frag transcoder) + frag)) + fragments)) + + (define bv-length + (fold (lambda (item sum) (+ sum (if (bytevector? item) + (bytevector-length item) + 1))) + 0 fragments-fixed)) + + (define bv (make-bytevector (1+ bv-length))) + ;; trailing null byte + (bytevector-u8-set! bv bv-length 0) + (fold (lambda (item idx) + (cond ((bytevector? item) + (bytevector-copy! item 0 + bv idx + (bytevector-length item)) + (+ idx (bytevector-length item))) + (else (bytevector-u8-set! bv idx item) + (+ idx 1)))) + 0 + fragments-fixed) + bv) + +(define (parse-float-form float-form) + (let ((float-string + (fold (lambda (arg str) + (string-append + str + (match arg + (('float-integer ('base-10 n)) n) + (('float-decimal ('base-10 n)) (string-append "." n)) + (('exponent "+" ('base-10 n)) (string-append "e" n)) + (('exponent ('base-10 n)) (string-append "e" n)) + (('exponent "-" ('base-10 n)) (string-append "e-" n))))) + "" float-form))) + ;; exact->inexact is a no-op if we already have an inexact number, but + ;; ensures we get an inexact number when we have an exact number (which we + ;; can get from the "1." case). Returning an inexact number here is important + ;; to avoid arithmetic suprises later. + (exact->inexact + (or (string->number float-string) + (scm-error 'c-parse-error "parse-lexeme-tree" + "Couldn't parse expression as float: ~s" + (list `(float ,@args)) #f))))) + + +(define (resolve-escaped-char form) + (match form + (('base-8-char n) (string->number n 8)) + (('base-16-char n) (string->number n 16)) + (c (char->integer + (case (string-ref c 0) + ((#\a) #\alarm) + ((#\b) #\backspace) + ((#\e) #\esc) ;; non-standard + ((#\f) #\page) + ((#\n) #\newline) + ((#\r) #\return) + ((#\t) #\tab) + ((#\v) #\vtab) + ((#\\) #\\) + ;; These are valid in both strings and chars + ((#\') #\') + ((#\") #\")))))) + +;; Takes a list of strings and escaped-char form +;; and returns a list of strings and integers +(define (resolve-string-fragment fragment) + (match fragment + (('escaped-char c) + (resolve-escaped-char c)) + (fargment fragment))) + +(define (parse-lexeme-tree tree) + (match tree + ['() '()] + + ;; Number constants + [('base-10 n) (string->number n 10)] + [('base-8 n) (string->number n 8)] + [('base-16 n) (string->number n 16)] + + [('integer n ('integer-suffix suffix)) + `(as-type + ,(parse-integer-suffix suffix) + ,(parse-lexeme-tree n))] + + [('integer n) + (parse-lexeme-tree n)] + + + [('float args ... ('float-suffix suffix)) + `(as-type ,(parse-float-suffix suffix) + ;; parse rest of float as if it lacked a suffix + ,(parse-lexeme-tree `(float ,@args)))] + + [('float args ...) (parse-float-form args)] + + ;; Character literals, stored as raw integers + ;; so mathematical operations keep working on them. + [('char ('escaped-char c)) (resolve-escaped-char c)] + + [('char c) (char->integer (string-ref c 0))] + + [('variable var) (string->symbol var)] + + ;; normalize some binary operators to their wordy equivalent + ;; (which also happens to match better with scheme) + [('operator "&&") 'and] + [('operator "&=") 'and_eq] + [('operator "&") 'bitand] + [('operator "|") 'bitor] + [('operator "!=") 'not_eq] + [('operator "||") 'or] + [('operator "|=") 'or_eq] + [('operator "^") 'xor] + [('operator "^=") 'xor_eq] + ;; Change these names to something scheme can handle better + [('operator ".") 'object-slot] + [('operator "->") 'dereference-slot] + [('operator op) (string->symbol op)] + + [('prefix-operator op) + (case (string->symbol op) + ((!) 'not) + ((~) 'compl) + ((*) 'dereference) + ((&) 'pointer) + ((++) 'pre-increment) + ((--) 'pre-decrement) + ((-) '-) + (else (scm-error 'c-parse-error "parse-lexeme-tree" + "Unknown prefix operator ~s" + (list op) #f)))] + [('postfix-operator op) + (case (string->symbol op) + [(++) 'post-increment] + [(--) 'post-decrement] + [else (scm-error 'c-parse-error "parse-lexeme-tree" + "Unknown postfix operator ~s" + (list op) #f)])] + + ;; Parenthesis grouping + [('group args ...) + (parse-lexeme-tree args)] + + [('prefix op arg) + `(,(parse-lexeme-tree op) + ,(parse-lexeme-tree arg))] + + [('postfix arg op) + `(,(parse-lexeme-tree op) + ,(parse-lexeme-tree arg))] + + + + + + ;; resolved-operator and ternary are the return "types" + ;; of resolve-order-of-operations + [(('resolved-operator op) args ...) + `(,op ,@(map parse-lexeme-tree args))] + + [('ternary a b c) + `(ternary ,(parse-lexeme-tree a) + ,(parse-lexeme-tree b) + ,(parse-lexeme-tree c))] + + + + + ;; Is it OK for literal strings to be "stored" inline? + ;; Or must they be a pointer? + ['string #vu8(0)] + [('string str ...) + (-> (map resolve-string-fragment str) + string-fragments->c-string)] + + ;; implicit concatenation of string literals + [(('string str ...) ...) + (-> (map resolve-string-fragment (concatenate str)) + string-fragments->c-string)] + + [('infix args ...) + (let ((r (resolve-order-of-operations + (flatten-infix (cons 'infix args))))) + (parse-lexeme-tree r))] + + + [('funcall function ('group arguments)) + `(funcall ,(parse-lexeme-tree function) + ,(parse-lexeme-tree arguments))] + + [(('variable "struct") ('variable value) ..1) + `(struct-type ,@(map string->symbol value)) + ] + + ;; A list of variables. Most likely a type signature + ;; [(('variable value) ..1) + ;; ] + + ;; A typecast with only variables must (?) be a typecast? + [(('group groups ..1) ... value) + (fold-right (lambda (type done) `(as-type ,type ,done)) + (parse-lexeme-tree value) + (map group-body->type groups))] + + ;; Type name resolution? + ;; https://en.wikipedia.org/wiki/C_data_types + ;; base types with spaces: + ;; ======================= + ;; [[un]signed] char + ;; [[un]signed] short [int] + ;; [[un]signed] int + ;; [un]signed [int] + ;; [[un]signed] long [int] + ;; [[un]signed] long long [int] + ;; float + ;; [long] double + + ;; https://en.wikipedia.org/wiki/Type_qualifier + ;; qualifiers + ;; const + ;; volatile + ;; restrict + ;; _Atomic + + + ;; Storage specifiers + ;; auto + ;; register + ;; static + ;; extern + + ;; struct <typename> + ;; enum <typename> + ;; union <typename> + + ;; https://en.wikipedia.org/wiki/C_syntax + ;; int (*ptr_to_array)[100] + + + [(? symbol? bare) + (scm-error 'c-parse-error + "parse-lexeme-tree" + "Naked literal in lex-tree: ~s" + (list bare) + #f)] + + [form + (scm-error 'c-parse-error + "parse-lexeme-tree" + "Unknown form in lex-tree: ~s" + (list form) #f) + ])) + +;; https://en.wikipedia.org/wiki/Operators_in_C_and_C%2B%2B +;; https://en.cppreference.com/w/c/language/operator_precedence + +(define order-of-operations + (reverse + ;; This is only for binary operations + `((-> ,(symbol #\.)) + ;; All unary procedures go here, incnluding typecasts, and sizeof + (* / %) + (+ -) + (<< >>) + (< <= > >=) + (== != not_eq) + (& bitand) + (^ xorg) + (,(symbol #\|) bitor) + (&& and) + (,(symbol #\| #\|) or) + (? :) + (= += -= *= /= %= <<= >>= &= ^= ,(symbol #\| #\=) + and_eq or_eq xor_eq) + (,(symbol #\,)) + ))) + +;; a.b->c.d (. (-> (. a b) c) d) +;; 2 * 3 / 4 * 5 => (* (/ (* 2 3) 4) 5) +;; eller => (* 2 (/ 3 4) 5) + +(define* (resolve-order-of-operations + tree optional: (order order-of-operations)) + + (if (null? order) + (scm-error 'c-parse-error + "resolve-order-of-operations" + "Out of operations to resolve when resolving expression ~s" + (list tree) #f) + (match tree + [('fixed-infix form) form] + [('fixed-infix forms ...) + (match (split-by-one-of forms (car order)) + [(group) + (resolve-order-of-operations (cons 'fixed-infix group) + (cdr order))] + [(a ('? b ...) (': c ...)) + `(ternary ,(resolve-order-of-operations (cons 'fixed-infix a) (cdr order)) + ,(resolve-order-of-operations (cons 'fixed-infix b) (cdr order)) + ,(resolve-order-of-operations (cons 'fixed-infix c) (cdr order)))] + [(first rest ...) + ;; TODO this is only valid for the associative operators (+, ...) + ;; but not some other (<, ...) + (if (apply eq? (map car rest)) + (let ((op (caar rest))) + `((resolved-operator ,op) + ,@(map (lambda (x) (resolve-order-of-operations (cons 'fixed-infix x) + (cdr order))) + (cons first (map cdr rest))))) + (fold (lambda (item done) + (let ((operator args (car+cdr item))) + `((resolved-operator ,operator) + ,done ,(resolve-order-of-operations + (cons 'fixed-infix args) + (cdr order))))) + (resolve-order-of-operations (cons 'fixed-infix first) + (cdr order)) + rest))])]))) + +;; 1 * 2 / 3 * 4 +;; ⇒ ((1) (* 2) (/ 3) (* 4)) +;; (1) +;; (* (1) 2) +;; (/ (* (1) 2) 3) +;; (* (/ (* (1) 2) 3) 4) + +;; Flatens a tree of infix triples. Stops when it should. +;; (parenthesis, function calls, ...) +(define (flatten-infix form) + (cons 'fixed-infix + (let loop ((form form)) + (match form + [('infix left op ('infix right ...)) + (cons* left + (parse-lexeme-tree op) + (loop (cons 'infix right)))] + + [('infix left op right) + (list left + (parse-lexeme-tree op) + right)] + + [('infix form) form] + + [other (scm-error 'c-parse-error + "flatten-infix" + "Not an infix tree ~a" + (list other) + #f)])))) + + + |