(define-module (c parse) :use-module (hnh util) :use-module (srfi srfi-1) :use-module (srfi srfi-71) :use-module (ice-9 match) :use-module ((rnrs io ports) :select (string->bytevector make-transcoder utf-8-codec)) :use-module (rnrs bytevectors) :export (parse-lexeme-tree)) (define (permutations set) (concatenate (map (lambda (key) (map (lambda (o) (cons key o)) (delete key set))) set))) (define (symbol-concat pair) (cond [(null? (car pair)) (cdr pair)] [(null? (cdr pair)) (car pair)] [else (symbol-append (car pair) (cdr pair))])) (define (parse-integer-suffix str) (define valid-sequences (delete 'dummy (lset-union eq? '(dummy) (map symbol-concat (permutations '(() U L))) (map symbol-concat (permutations '(() U LL)))))) ;; => (LLU ULL LL LU UL L U) (aif (memv (string->symbol (string-upcase str)) valid-sequences) (case (car it) [(LLU ULL) '(unsigned long long)] [(LU UL) '(unsigned long)] [(LL) '(long long)] [(L) '(long)] [(U) '(unsigned)]) (scm-error 'c-parse-error "parse-integer-suffix" "Invalid integer suffix ~s" (list str) #f))) (define (parse-float-suffix str) (case (string->symbol str) ((f F) '(float)) ((l L) '(long double)))) (define (group-body->type vars) (concatenate (map (match-lambda (('variable var) (list (parse-lexeme-tree `(variable ,var)))) (('postfix ('variable var) ('postfix-operator "*")) (list (parse-lexeme-tree `(variable ,var)) '*)) (else (scm-error 'c-parse-error "parse-lexeme-tree" "Invalid token ~s in typecast form: ~s" (list else vars) #f))) vars))) ;; Takes a list of strings and integers, and merges it all into a single ;; bytevector representing a c string (define* (string-fragments->c-string fragments optional: (transcoder (make-transcoder (utf-8-codec)))) (define fragments-fixed (map (lambda (frag) (if (string? frag) (string->bytevector frag transcoder) frag)) fragments)) (define bv-length (fold (lambda (item sum) (+ sum (if (bytevector? item) (bytevector-length item) 1))) 0 fragments-fixed)) (define bv (make-bytevector (1+ bv-length))) ;; trailing null byte (bytevector-u8-set! bv bv-length 0) (fold (lambda (item idx) (cond ((bytevector? item) (bytevector-copy! item 0 bv idx (bytevector-length item)) (+ idx (bytevector-length item))) (else (bytevector-u8-set! bv idx item) (+ idx 1)))) 0 fragments-fixed) bv) (define (parse-float-form float-form) (let ((float-string (fold (lambda (arg str) (string-append str (match arg (('float-integer ('base-10 n)) n) (('float-decimal ('base-10 n)) (string-append "." n)) (('exponent "+" ('base-10 n)) (string-append "e" n)) (('exponent ('base-10 n)) (string-append "e" n)) (('exponent "-" ('base-10 n)) (string-append "e-" n))))) "" float-form))) ;; exact->inexact is a no-op if we already have an inexact number, but ;; ensures we get an inexact number when we have an exact number (which we ;; can get from the "1." case). Returning an inexact number here is important ;; to avoid arithmetic suprises later. (exact->inexact (or (string->number float-string) (scm-error 'c-parse-error "parse-lexeme-tree" "Couldn't parse expression as float: ~s" (list `(float ,@args)) #f))))) (define (resolve-escaped-char form) (match form (('base-8-char n) (string->number n 8)) (('base-16-char n) (string->number n 16)) (c (char->integer (case (string-ref c 0) ((#\a) #\alarm) ((#\b) #\backspace) ((#\e) #\esc) ;; non-standard ((#\f) #\page) ((#\n) #\newline) ((#\r) #\return) ((#\t) #\tab) ((#\v) #\vtab) ((#\\) #\\) ;; These are valid in both strings and chars ((#\') #\') ((#\") #\")))))) ;; Takes a list of strings and escaped-char form ;; and returns a list of strings and integers (define (resolve-string-fragment fragment) (match fragment (('escaped-char c) (resolve-escaped-char c)) (fargment fragment))) (define (parse-lexeme-tree tree) (match tree ['() '()] ;; Number constants [('base-10 n) (string->number n 10)] [('base-8 n) (string->number n 8)] [('base-16 n) (string->number n 16)] [('integer n ('integer-suffix suffix)) `(as-type ,(parse-integer-suffix suffix) ,(parse-lexeme-tree n))] [('integer n) (parse-lexeme-tree n)] [('float args ... ('float-suffix suffix)) `(as-type ,(parse-float-suffix suffix) ;; parse rest of float as if it lacked a suffix ,(parse-lexeme-tree `(float ,@args)))] [('float args ...) (parse-float-form args)] ;; Character literals, stored as raw integers ;; so mathematical operations keep working on them. [('char ('escaped-char c)) (resolve-escaped-char c)] [('char c) (char->integer (string-ref c 0))] [('variable var) (string->symbol var)] ;; normalize some binary operators to their wordy equivalent ;; (which also happens to match better with scheme) [('operator "&&") 'and] [('operator "&=") 'and_eq] [('operator "&") 'bitand] [('operator "|") 'bitor] [('operator "!=") 'not_eq] [('operator "||") 'or] [('operator "|=") 'or_eq] [('operator "^") 'xor] [('operator "^=") 'xor_eq] ;; Change these names to something scheme can handle better [('operator ".") 'object-slot] [('operator "->") 'dereference-slot] [('operator op) (string->symbol op)] [('prefix-operator op) (case (string->symbol op) ((!) 'not) ((~) 'compl) ((*) 'dereference) ((&) 'pointer) ((++) 'pre-increment) ((--) 'pre-decrement) ((-) '-) (else (scm-error 'c-parse-error "parse-lexeme-tree" "Unknown prefix operator ~s" (list op) #f)))] [('postfix-operator op) (case (string->symbol op) [(++) 'post-increment] [(--) 'post-decrement] [else (scm-error 'c-parse-error "parse-lexeme-tree" "Unknown postfix operator ~s" (list op) #f)])] ;; Parenthesis grouping [('group args ...) (parse-lexeme-tree args)] [('prefix op arg) `(,(parse-lexeme-tree op) ,(parse-lexeme-tree arg))] [('postfix arg op) `(,(parse-lexeme-tree op) ,(parse-lexeme-tree arg))] ;; resolved-operator and ternary are the return "types" ;; of resolve-order-of-operations [(('resolved-operator op) args ...) `(,op ,@(map parse-lexeme-tree args))] [('ternary a b c) `(ternary ,(parse-lexeme-tree a) ,(parse-lexeme-tree b) ,(parse-lexeme-tree c))] ;; Is it OK for literal strings to be "stored" inline? ;; Or must they be a pointer? ['string #vu8(0)] [('string str ...) (-> (map resolve-string-fragment str) string-fragments->c-string)] ;; implicit concatenation of string literals [(('string str ...) ...) (-> (map resolve-string-fragment (concatenate str)) string-fragments->c-string)] [('infix args ...) (let ((r (resolve-order-of-operations (flatten-infix (cons 'infix args))))) (parse-lexeme-tree r))] [('funcall function ('group arguments)) `(funcall ,(parse-lexeme-tree function) ,(parse-lexeme-tree arguments))] [(('variable "struct") ('variable value) ..1) `(struct-type ,@(map string->symbol value)) ] ;; A list of variables. Most likely a type signature ;; [(('variable value) ..1) ;; ] ;; A typecast with only variables must (?) be a typecast? [(('group groups ..1) ... value) (fold-right (lambda (type done) `(as-type ,type ,done)) (parse-lexeme-tree value) (map group-body->type groups))] ;; Type name resolution? ;; https://en.wikipedia.org/wiki/C_data_types ;; base types with spaces: ;; ======================= ;; [[un]signed] char ;; [[un]signed] short [int] ;; [[un]signed] int ;; [un]signed [int] ;; [[un]signed] long [int] ;; [[un]signed] long long [int] ;; float ;; [long] double ;; https://en.wikipedia.org/wiki/Type_qualifier ;; qualifiers ;; const ;; volatile ;; restrict ;; _Atomic ;; Storage specifiers ;; auto ;; register ;; static ;; extern ;; struct ;; enum ;; union ;; https://en.wikipedia.org/wiki/C_syntax ;; int (*ptr_to_array)[100] [(? symbol? bare) (scm-error 'c-parse-error "parse-lexeme-tree" "Naked literal in lex-tree: ~s" (list bare) #f)] [form (scm-error 'c-parse-error "parse-lexeme-tree" "Unknown form in lex-tree: ~s" (list form) #f) ])) ;; https://en.wikipedia.org/wiki/Operators_in_C_and_C%2B%2B ;; https://en.cppreference.com/w/c/language/operator_precedence (define order-of-operations (reverse ;; This is only for binary operations `((-> ,(symbol #\.)) ;; All unary procedures go here, incnluding typecasts, and sizeof (* / %) (+ -) (<< >>) (< <= > >=) (== != not_eq) (& bitand) (^ xorg) (,(symbol #\|) bitor) (&& and) (,(symbol #\| #\|) or) (? :) (= += -= *= /= %= <<= >>= &= ^= ,(symbol #\| #\=) and_eq or_eq xor_eq) (,(symbol #\,)) ))) ;; a.b->c.d (. (-> (. a b) c) d) ;; 2 * 3 / 4 * 5 => (* (/ (* 2 3) 4) 5) ;; eller => (* 2 (/ 3 4) 5) (define* (resolve-order-of-operations tree optional: (order order-of-operations)) (if (null? order) (scm-error 'c-parse-error "resolve-order-of-operations" "Out of operations to resolve when resolving expression ~s" (list tree) #f) (match tree [('fixed-infix form) form] [('fixed-infix forms ...) (match (split-by-one-of forms (car order)) [(group) (resolve-order-of-operations (cons 'fixed-infix group) (cdr order))] [(a ('? b ...) (': c ...)) `(ternary ,(resolve-order-of-operations (cons 'fixed-infix a) (cdr order)) ,(resolve-order-of-operations (cons 'fixed-infix b) (cdr order)) ,(resolve-order-of-operations (cons 'fixed-infix c) (cdr order)))] [(first rest ...) ;; TODO this is only valid for the associative operators (+, ...) ;; but not some other (<, ...) (if (apply eq? (map car rest)) (let ((op (caar rest))) `((resolved-operator ,op) ,@(map (lambda (x) (resolve-order-of-operations (cons 'fixed-infix x) (cdr order))) (cons first (map cdr rest))))) (fold (lambda (item done) (let ((operator args (car+cdr item))) `((resolved-operator ,operator) ,done ,(resolve-order-of-operations (cons 'fixed-infix args) (cdr order))))) (resolve-order-of-operations (cons 'fixed-infix first) (cdr order)) rest))])]))) ;; 1 * 2 / 3 * 4 ;; ⇒ ((1) (* 2) (/ 3) (* 4)) ;; (1) ;; (* (1) 2) ;; (/ (* (1) 2) 3) ;; (* (/ (* (1) 2) 3) 4) ;; Flatens a tree of infix triples. Stops when it should. ;; (parenthesis, function calls, ...) (define (flatten-infix form) (cons 'fixed-infix (let loop ((form form)) (match form [('infix left op ('infix right ...)) (cons* left (parse-lexeme-tree op) (loop (cons 'infix right)))] [('infix left op right) (list left (parse-lexeme-tree op) right)] [('infix form) form] [other (scm-error 'c-parse-error "flatten-infix" "Not an infix tree ~a" (list other) #f)]))))