(define-module (c lex) :use-module (ice-9 peg) :use-module (c operators) :export (lex)) ;; Like define-peg-pattern, but body is evaluated (define-syntax define-peg-pattern* (lambda (stx) (syntax-case stx () ((_ sym accum pat) #`(define sym (let ((matchf (compile-peg-pattern (datum->syntax #'stx pat) 'accum))) (let ((syn ((@ (ice-9 peg codegen) wrap-parser-for-users) #'stx matchf 'accum 'sym))) ((@ (system base compile) compile) ((@ (ice-9 peg cache) cg-cached-parser) syn))))))))) (define-peg-pattern base-8-digit body (range #\0 #\7)) (define-peg-pattern base-10-digit body (range #\0 #\9)) (define-peg-pattern base-16-digit body (or (range #\0 #\9) (range #\A #\F) (range #\a #\f))) ;; https://en.cppreference.com/w/cpp/language/integer_literal (define-peg-pattern base-10 all (+ base-10-digit)) (define-peg-pattern base-8 all (and (ignore "0") (+ base-8-digit))) (define-peg-pattern base-16 all (and (ignore (and "0" (or "x" "X"))) (+ base-16-digit))) ;; accept anything now, ensure correctnes later (define-peg-pattern integer-suffix all (* (or "u" "U" "l" "L"))) (define-peg-pattern integer all (and (or base-8 base-16 base-10) (? integer-suffix))) (define-peg-pattern float-suffix all (* (or "f" "F" "l" "L"))) (define-peg-pattern exponent all (and (ignore (or "e" "E")) (? (or "+" "-")) base-10)) ;; Helper patterns for creating named groups in float (define-peg-pattern float-integer all base-10) (define-peg-pattern float-decimal all base-10) (define-peg-pattern float all (or (and float-integer exponent (? float-suffix)) (and (? float-integer) (ignore ".") float-decimal (? exponent) (? float-suffix)) (and float-integer (ignore ".") (? exponent) (? float-suffix)))) (define-peg-pattern number body (or float integer)) (define-peg-pattern group all (and (ignore "(") expr (ignore ")"))) (define-peg-pattern base-8-char all (and base-8-digit (? base-8-digit) (? base-8-digit))) (define-peg-pattern base-16-char all (and (ignore "x") base-16-digit (? base-16-digit))) (define-peg-pattern escaped-char all (and (ignore "\\") (or base-16-char base-8-char peg-any))) (define-peg-pattern char all (and (ignore "'") (or escaped-char peg-any) (ignore "'"))) (define-peg-pattern quot none "\"") (define-peg-pattern string all (and quot (* (or escaped-char (and (not-followed-by "\"") peg-any))) quot)) (define-peg-pattern* operator all `(or ,@(map symbol->string symbol-binary-operators) ,@(map (lambda (op) `(and ,(symbol->string op) ws)) wordy-binary-operators) "?" ":")) ;; whitespace (define-peg-pattern ws none (or " " " " "\n")) ;; space (for when whitespace is optional) (define-peg-pattern sp none (* ws)) (define-peg-pattern safe-letter body (or "_" (range #\A #\Z) (range #\a #\z))) (define-peg-pattern variable all (and safe-letter (* (or safe-letter base-10-digit)))) (define-peg-pattern prefix-operator all ;; It's important that ++ and -- are BEFORE + and - ;; otherwise the first + is found, leaving the second +, which fails ;; to lex since it's an invalid token ;; TODO sizeof can be written as a prefix operator ;; (without parenthesis) if the operand is an expression. (or "*" "&" "++" "--" "!" "~" "+" "-")) ;;; Note that stacked pre or postfix operators without parenthesis ;;; dosen't work. So `*&C' is invalid, while `*(&C)' is valid. (define-peg-pattern prefix all (and prefix-operator sp (or variable group funcall literal))) (define-peg-pattern postfix-operator all (or "++" "--" "*")) (define-peg-pattern postfix all ;; literals can't be in-place incremented and decremented ;; Make sure we don't match postfix-operator here, since ;; that also gives us an infinite loop. (and (or prefix funcall group variable) sp postfix-operator)) (define-peg-pattern infix all ;; first case is "same" as expr, but in different order to prevent ;; infinite self reference. Pre and postfix not here, solved by having ;; them before infix in expr (and (or funcall postfix prefix group literal variable) sp operator sp expr)) (define-peg-pattern funcall all (and variable sp group)) (define-peg-pattern literal body (or char string number)) ;;; main parser (define-peg-pattern expr body (+ (and sp (or ;; float must be BEFORE infix, otherwise 3.2 is parsed as (infix 3 (operator ".") 2) ;; that however breaks the infix logic, meaning that floating point numbers can't be ;; used in basic arithmetic. ;; TODO remove all implicit order of operations handling in the lexer, and move it to ;; the parser. This should also fix the case of typecasts being applied incorrectly. float infix postfix prefix funcall group literal variable) sp))) (define (lex string) (peg:tree (match-pattern expr string)))