aboutsummaryrefslogtreecommitdiff
path: root/module/c/parse.scm
diff options
context:
space:
mode:
Diffstat (limited to 'module/c/parse.scm')
-rw-r--r--module/c/parse.scm411
1 files changed, 325 insertions, 86 deletions
diff --git a/module/c/parse.scm b/module/c/parse.scm
index 8030da77..7d11ea17 100644
--- a/module/c/parse.scm
+++ b/module/c/parse.scm
@@ -1,11 +1,14 @@
(define-module (c parse)
:use-module (hnh util)
:use-module (srfi srfi-1)
+ :use-module (srfi srfi-71)
:use-module (ice-9 match)
+ :use-module ((rnrs io ports)
+ :select (string->bytevector make-transcoder utf-8-codec))
+ :use-module (rnrs bytevectors)
:export (parse-lexeme-tree))
-;;; Rename this
-(define (perms set)
+(define (permutations set)
(concatenate
(map (lambda (key)
(map (lambda (o) (cons key o))
@@ -21,23 +24,124 @@
(define valid-sequences
(delete 'dummy
(lset-union eq? '(dummy)
- (map symbol-concat (perms '(() U L)))
- (map symbol-concat (perms '(() U LL))))))
+ (map symbol-concat (permutations '(() U L)))
+ (map symbol-concat (permutations '(() U LL))))))
;; => (LLU ULL LL LU UL L U)
(aif (memv (string->symbol (string-upcase str))
valid-sequences)
(case (car it)
- [(LLU ULL) '(unsigned long-long)]
+ [(LLU ULL) '(unsigned long long)]
[(LU UL) '(unsigned long)]
- [(LL) '(long-long)]
+ [(LL) '(long long)]
[(L) '(long)]
[(U) '(unsigned)])
(scm-error 'c-parse-error "parse-integer-suffix"
"Invalid integer suffix ~s"
(list str) #f)))
+(define (parse-float-suffix str)
+ (case (string->symbol str)
+ ((f F) '(float))
+ ((l L) '(long double))))
+
+(define (group-body->type vars)
+ (concatenate
+ (map
+ (match-lambda (('variable var) (list (parse-lexeme-tree `(variable ,var))))
+ (('postfix ('variable var)
+ ('postfix-operator "*"))
+ (list (parse-lexeme-tree `(variable ,var))
+ '*))
+ (else (scm-error 'c-parse-error "parse-lexeme-tree"
+ "Invalid token ~s in typecast form: ~s"
+ (list else vars) #f)))
+ vars)))
+
+;; Takes a list of strings and integers, and merges it all into a single
+;; bytevector representing a c string
+(define* (string-fragments->c-string
+ fragments optional: (transcoder (make-transcoder (utf-8-codec))))
+
+ (define fragments-fixed
+ (map (lambda (frag)
+ (if (string? frag)
+ (string->bytevector frag transcoder)
+ frag))
+ fragments))
+
+ (define bv-length
+ (fold (lambda (item sum) (+ sum (if (bytevector? item)
+ (bytevector-length item)
+ 1)))
+ 0 fragments-fixed))
+
+ (define bv (make-bytevector (1+ bv-length)))
+ ;; trailing null byte
+ (bytevector-u8-set! bv bv-length 0)
+ (fold (lambda (item idx)
+ (cond ((bytevector? item)
+ (bytevector-copy! item 0
+ bv idx
+ (bytevector-length item))
+ (+ idx (bytevector-length item)))
+ (else (bytevector-u8-set! bv idx item)
+ (+ idx 1))))
+ 0
+ fragments-fixed)
+ bv)
+
+(define (parse-float-form float-form)
+ (let ((float-string
+ (fold (lambda (arg str)
+ (string-append
+ str
+ (match arg
+ (('float-integer ('base-10 n)) n)
+ (('float-decimal ('base-10 n)) (string-append "." n))
+ (('exponent "+" ('base-10 n)) (string-append "e" n))
+ (('exponent ('base-10 n)) (string-append "e" n))
+ (('exponent "-" ('base-10 n)) (string-append "e-" n)))))
+ "" float-form)))
+ ;; exact->inexact is a no-op if we already have an inexact number, but
+ ;; ensures we get an inexact number when we have an exact number (which we
+ ;; can get from the "1." case). Returning an inexact number here is important
+ ;; to avoid arithmetic suprises later.
+ (exact->inexact
+ (or (string->number float-string)
+ (scm-error 'c-parse-error "parse-lexeme-tree"
+ "Couldn't parse expression as float: ~s"
+ (list `(float ,@args)) #f)))))
+
+
+(define (resolve-escaped-char form)
+ (match form
+ (('base-8-char n) (string->number n 8))
+ (('base-16-char n) (string->number n 16))
+ (c (char->integer
+ (case (string-ref c 0)
+ ((#\a) #\alarm)
+ ((#\b) #\backspace)
+ ((#\e) #\esc) ;; non-standard
+ ((#\f) #\page)
+ ((#\n) #\newline)
+ ((#\r) #\return)
+ ((#\t) #\tab)
+ ((#\v) #\vtab)
+ ((#\\) #\\)
+ ;; These are valid in both strings and chars
+ ((#\') #\')
+ ((#\") #\"))))))
+
+;; Takes a list of strings and escaped-char form
+;; and returns a list of strings and integers
+(define (resolve-string-fragment fragment)
+ (match fragment
+ (('escaped-char c)
+ (resolve-escaped-char c))
+ (fargment fragment)))
+
(define (parse-lexeme-tree tree)
(match tree
['() '()]
@@ -50,55 +154,67 @@
[('integer n ('integer-suffix suffix))
`(as-type
,(parse-integer-suffix suffix)
- ,(parse-lexeme-tree n))
- ]
+ ,(parse-lexeme-tree n))]
+
[('integer n)
(parse-lexeme-tree n)]
+
+ [('float args ... ('float-suffix suffix))
+ `(as-type ,(parse-float-suffix suffix)
+ ;; parse rest of float as if it lacked a suffix
+ ,(parse-lexeme-tree `(float ,@args)))]
+
+ [('float args ...) (parse-float-form args)]
+
;; Character literals, stored as raw integers
;; so mathematical operations keep working on them.
- [('char ('escaped-char ('base-8-char n)))
- (-> n (string->number 8) #; integer->char)]
- [('char ('escaped-char ('base-16-char n)))
- (-> n (string->number 16) #; integer->char)]
- [('char ('escaped-char c))
- (char->integer
- (case (string-ref c 0)
- ((#\a) #\alarm)
- ((#\b) #\backspace)
- ((#\e) #\esc)
- ((#\f) #\page)
- ((#\n) #\newline)
- ((#\r) #\return)
- ((#\t) #\tab)
- ((#\v) #\vtab)
- ((#\\) #\\)
- ((#\') #\')))]
+ [('char ('escaped-char c)) (resolve-escaped-char c)]
+
[('char c) (char->integer (string-ref c 0))]
[('variable var) (string->symbol var)]
+
+ ;; normalize some binary operators to their wordy equivalent
+ ;; (which also happens to match better with scheme)
+ [('operator "&&") 'and]
+ [('operator "&=") 'and_eq]
+ [('operator "&") 'bitand]
+ [('operator "|") 'bitor]
+ [('operator "!=") 'not_eq]
+ [('operator "||") 'or]
+ [('operator "|=") 'or_eq]
+ [('operator "^") 'xor]
+ [('operator "^=") 'xor_eq]
+ ;; Change these names to something scheme can handle better
+ [('operator ".") 'object-slot]
+ [('operator "->") 'dereference-slot]
[('operator op) (string->symbol op)]
+
[('prefix-operator op)
(case (string->symbol op)
+ ((!) 'not)
+ ((~) 'compl)
((*) 'dereference)
((&) 'pointer)
((++) 'pre-increment)
((--) 'pre-decrement)
- (else => identity))]
+ ((-) '-)
+ (else (scm-error 'c-parse-error "parse-lexeme-tree"
+ "Unknown prefix operator ~s"
+ (list op) #f)))]
[('postfix-operator op)
(case (string->symbol op)
[(++) 'post-increment]
[(--) 'post-decrement]
- [else => identity])]
+ [else (scm-error 'c-parse-error "parse-lexeme-tree"
+ "Unknown postfix operator ~s"
+ (list op) #f)])]
;; Parenthesis grouping
- [('group args)
+ [('group args ...)
(parse-lexeme-tree args)]
- ;; Atomic item. Used by flatten-infix
- [('atom body)
- (parse-lexeme-tree body)]
-
[('prefix op arg)
`(,(parse-lexeme-tree op)
,(parse-lexeme-tree arg))]
@@ -107,81 +223,204 @@
`(,(parse-lexeme-tree op)
,(parse-lexeme-tree arg))]
+
+
+
+
+ ;; resolved-operator and ternary are the return "types"
+ ;; of resolve-order-of-operations
+ [(('resolved-operator op) args ...)
+ `(,op ,@(map parse-lexeme-tree args))]
+
+ [('ternary a b c)
+ `(ternary ,(parse-lexeme-tree a)
+ ,(parse-lexeme-tree b)
+ ,(parse-lexeme-tree c))]
+
+
+
+
+ ;; Is it OK for literal strings to be "stored" inline?
+ ;; Or must they be a pointer?
+ ['string #vu8(0)]
+ [('string str ...)
+ (-> (map resolve-string-fragment str)
+ string-fragments->c-string)]
+
+ ;; implicit concatenation of string literals
+ [(('string str ...) ...)
+ (-> (map resolve-string-fragment (concatenate str))
+ string-fragments->c-string)]
+
[('infix args ...)
- (resolve-order-of-operations
- (flatten-infix (cons 'infix args)))]
+ (let ((r (resolve-order-of-operations
+ (flatten-infix (cons 'infix args)))))
+ (parse-lexeme-tree r))]
+
[('funcall function ('group arguments))
`(funcall ,(parse-lexeme-tree function)
,(parse-lexeme-tree arguments))]
- [bare (scm-error 'c-parse-error
- "parse-lexeme-tree"
- "Naked literal in lex-tree: ~s"
- (list bare)
- #f)]))
+ [(('variable "struct") ('variable value) ..1)
+ `(struct-type ,@(map string->symbol value))
+ ]
+
+ ;; A list of variables. Most likely a type signature
+ ;; [(('variable value) ..1)
+ ;; ]
+
+ ;; A typecast with only variables must (?) be a typecast?
+ [(('group groups ..1) ... value)
+ (fold-right (lambda (type done) `(as-type ,type ,done))
+ (parse-lexeme-tree value)
+ (map group-body->type groups))]
+
+ ;; Type name resolution?
+ ;; https://en.wikipedia.org/wiki/C_data_types
+ ;; base types with spaces:
+ ;; =======================
+ ;; [[un]signed] char
+ ;; [[un]signed] short [int]
+ ;; [[un]signed] int
+ ;; [un]signed [int]
+ ;; [[un]signed] long [int]
+ ;; [[un]signed] long long [int]
+ ;; float
+ ;; [long] double
+
+ ;; https://en.wikipedia.org/wiki/Type_qualifier
+ ;; qualifiers
+ ;; const
+ ;; volatile
+ ;; restrict
+ ;; _Atomic
+
+
+ ;; Storage specifiers
+ ;; auto
+ ;; register
+ ;; static
+ ;; extern
+
+ ;; struct <typename>
+ ;; enum <typename>
+ ;; union <typename>
+
+ ;; https://en.wikipedia.org/wiki/C_syntax
+ ;; int (*ptr_to_array)[100]
+
+
+ [(? symbol? bare)
+ (scm-error 'c-parse-error
+ "parse-lexeme-tree"
+ "Naked literal in lex-tree: ~s"
+ (list bare)
+ #f)]
+
+ [form
+ (scm-error 'c-parse-error
+ "parse-lexeme-tree"
+ "Unknown form in lex-tree: ~s"
+ (list form) #f)
+ ]))
;; https://en.wikipedia.org/wiki/Operators_in_C_and_C%2B%2B
+;; https://en.cppreference.com/w/c/language/operator_precedence
(define order-of-operations
(reverse
- (concatenate
- ;; This is only for binary operations
- `((-> ,(symbol #\.))
- (* / %)
- (+ -)
- (<< >>)
- (< <= > >=)
- (== !=)
- (&)
- (^)
- (,(symbol #\|))
- (&&)
- (,(symbol #\| #\|))
- (= += -= *= /= %= <<= >>= &= ^= ,(symbol #\| #\=))
- (,(symbol #\,))
- ))))
-
-(define (mark-other form)
- (if (list? form) (cons '*other* form) form))
+ ;; This is only for binary operations
+ `((-> ,(symbol #\.))
+ ;; All unary procedures go here, incnluding typecasts, and sizeof
+ (* / %)
+ (+ -)
+ (<< >>)
+ (< <= > >=)
+ (== != not_eq)
+ (& bitand)
+ (^ xorg)
+ (,(symbol #\|) bitor)
+ (&& and)
+ (,(symbol #\| #\|) or)
+ (? :)
+ (= += -= *= /= %= <<= >>= &= ^= ,(symbol #\| #\=)
+ and_eq or_eq xor_eq)
+ (,(symbol #\,))
+ )))
+
+;; a.b->c.d (. (-> (. a b) c) d)
+;; 2 * 3 / 4 * 5 => (* (/ (* 2 3) 4) 5)
+;; eller => (* 2 (/ 3 4) 5)
(define* (resolve-order-of-operations
tree optional: (order order-of-operations))
(if (null? order)
- (car tree)
+ (scm-error 'c-parse-error
+ "resolve-order-of-operations"
+ "Out of operations to resolve when resolving expression ~s"
+ (list tree) #f)
(match tree
- [('*other* body ...) body]
- [(form) (resolve-order-of-operations form order)]
- [(forms ...)
- (match (split-by forms (car order))
- [(group) (resolve-order-of-operations group (cdr order))]
- [groups
- (cons (car order)
- (map (lambda (form) (resolve-order-of-operations
- form order-of-operations))
- groups))])]
- [a a])))
+ [('fixed-infix form) form]
+ [('fixed-infix forms ...)
+ (match (split-by-one-of forms (car order))
+ [(group)
+ (resolve-order-of-operations (cons 'fixed-infix group)
+ (cdr order))]
+ [(a ('? b ...) (': c ...))
+ `(ternary ,(resolve-order-of-operations (cons 'fixed-infix a) (cdr order))
+ ,(resolve-order-of-operations (cons 'fixed-infix b) (cdr order))
+ ,(resolve-order-of-operations (cons 'fixed-infix c) (cdr order)))]
+ [(first rest ...)
+ ;; TODO this is only valid for the associative operators (+, ...)
+ ;; but not some other (<, ...)
+ (if (apply eq? (map car rest))
+ (let ((op (caar rest)))
+ `((resolved-operator ,op)
+ ,@(map (lambda (x) (resolve-order-of-operations (cons 'fixed-infix x)
+ (cdr order)))
+ (cons first (map cdr rest)))))
+ (fold (lambda (item done)
+ (let ((operator args (car+cdr item)))
+ `((resolved-operator ,operator)
+ ,done ,(resolve-order-of-operations
+ (cons 'fixed-infix args)
+ (cdr order)))))
+ (resolve-order-of-operations (cons 'fixed-infix first)
+ (cdr order))
+ rest))])])))
+
+;; 1 * 2 / 3 * 4
+;; ⇒ ((1) (* 2) (/ 3) (* 4))
+;; (1)
+;; (* (1) 2)
+;; (/ (* (1) 2) 3)
+;; (* (/ (* (1) 2) 3) 4)
;; Flatens a tree of infix triples. Stops when it should.
;; (parenthesis, function calls, ...)
(define (flatten-infix form)
- (match form
- [('infix left op ('infix right ...))
- (cons* (parse-lexeme-tree left)
- (parse-lexeme-tree op)
- (flatten-infix (cons 'infix right)))]
-
- [('infix left op right)
- (list (mark-other (parse-lexeme-tree left))
- (parse-lexeme-tree op)
- (mark-other (parse-lexeme-tree right)))]
-
- [other (scm-error 'c-parse-error
- "flatten-infix"
- "Not an infix tree ~a"
- (list other)
- #f)]))
+ (cons 'fixed-infix
+ (let loop ((form form))
+ (match form
+ [('infix left op ('infix right ...))
+ (cons* left
+ (parse-lexeme-tree op)
+ (loop (cons 'infix right)))]
+
+ [('infix left op right)
+ (list left
+ (parse-lexeme-tree op)
+ right)]
+
+ [('infix form) form]
+
+ [other (scm-error 'c-parse-error
+ "flatten-infix"
+ "Not an infix tree ~a"
+ (list other)
+ #f)]))))