diff options
Diffstat (limited to 'module/c/lex2.scm')
-rw-r--r-- | module/c/lex2.scm | 549 |
1 files changed, 549 insertions, 0 deletions
diff --git a/module/c/lex2.scm b/module/c/lex2.scm new file mode 100644 index 00000000..af90dcce --- /dev/null +++ b/module/c/lex2.scm @@ -0,0 +1,549 @@ +(define-module (c lex2) + :use-module (ice-9 peg) + :use-module (ice-9 match) + :use-module ((hnh util) :select (->)) + :use-module (hnh util object) + :use-module (hnh util type) + :use-module ((srfi srfi-1) :select (fold)) + :use-module (srfi srfi-88) + :use-module ((c trigraph) :select (replace-trigraphs)) + :use-module ((c line-fold) :select (fold-lines)) + :export (lex + lexeme lexeme? + placemaker + (type . lexeme-type) + (body . lexeme-body) + (noexpand . lexeme-noexpand) + + parse-c-number + + tokenize + )) + +;;; A.1 Lexical grammar +;;; A.1.1 Lexical elements + +;; (6.4) +(define-peg-pattern token all + (or keyword + identifier + constant + string-literal + punctuator + )) + +;; (6.4) +(define-peg-pattern preprocessing-token all + ;; string literal moved before header-name since string literals + ;; otherwise became q-strings + (or string-literal + header-name + character-constant + identifier + pp-number + punctuator + ;; Each non-white-space character that cannot be one of the above + )) + +;;; A.1.2 Keywords + +;; (6.4.1) +(define-peg-pattern keyword all + (or "auto" "break" "case" "char" "const" "continue" "default" + "do" "double" "else" "enum" "extern" "float" "for" "goto" + "if" "inline" "int" "long" "register" "restrict" "return" + "short" "signed" "sizeof" "static" "struct" "switch" + "typedef" "union" "unsigned" "void" "volatile" "while" + "_Alignas" "_Alignof" "_Atomic" "_Bool" "_Complex" + "_Generic" "_Imaginary" "_Noreturn" "_Static_assert" + "_Thread_local")) + +;;; A.1.3 Identifiers + +;; (6.4.2.1) +(define-peg-pattern identifier all + (and identifier-nondigit (* (or identifier-nondigit digit)))) + +;; (6.4.2.1) +(define-peg-pattern identifier-nondigit body + (or nondigit + universal-character-name + ;; TODO other implementation-defined characters + )) + +;; (6.4.2.1) +(define-peg-pattern nondigit body + (or "_" + (range #\A #\Z) + (range #\a #\z))) + +;; (6.4.2.1) +(define-peg-pattern digit body + (range #\0 #\9)) + +;;; A.1.4 Universal character names + +;; (6.4.3) +(define-peg-pattern universal-character-name all + (or (and (ignore "\\u") hex-quad) + (and (ignore "\\U") hex-quad hex-quad))) + +;; (6.4.3) +(define-peg-pattern hex-quad body + (and hexadecimal-digit hexadecimal-digit + hexadecimal-digit hexadecimal-digit)) + +;;; A.1.5 Constants + +;; (6.4.4) +(define-peg-pattern constant all + ;; Int and float swapped from standard since we need to try parsing + ;; the floats beforehand + (or floating-constant + integer-constant + enumeration-constant + character-constant)) + +;; (6.4.4.1) +(define-peg-pattern integer-constant all + (and (or decimal-constant + hexadecimal-constant + octal-constant) + (? integer-suffix))) + +;; (6.4.4.1) +(define-peg-pattern decimal-constant all + (and nonzero-digit (* digit))) + +;; (6.4.4.1) +(define-peg-pattern octal-constant all + (and "0" (* octal-digit))) + +;; (6.4.4.1) +(define-peg-pattern hexadecimal-constant all + (and hexadecimal-prefix (+ hexadecimal-digit))) + +;; (6.4.4.1) +(define-peg-pattern hexadecimal-prefix none + (or "0x" "0X")) + +;; (6.4.4.1) +(define-peg-pattern nonzero-digit body + (range #\1 #\9)) + +;; (6.4.4.1) +(define-peg-pattern octal-digit body + (range #\0 #\7)) + +;; (6.4.4.1) +(define-peg-pattern hexadecimal-digit body + (or (range #\0 #\9) + (range #\a #\f) + (range #\A #\F))) + +;; (6.4.4.1) +(define-peg-pattern integer-suffix all + (or (and unsigned-suffix (? long-suffix)) + (and long-suffix (? unsigned-suffix)))) + +;; (6.4.4.1) +;; This is a merger of long-suffix and long-long-suffix +(define-peg-pattern long-suffix body + (or "l" "L" "ll" "LL")) + +;; (6.4.4.1) +(define-peg-pattern unsigned-suffix body + (or "u" "U")) + +;; (6.4.4.2) +(define-peg-pattern floating-constant all + (or decimal-floating-constant + hexadecimal-floating-constant)) + +;; (6.4.4.2) +(define-peg-pattern decimal-floating-constant all + (or (and fractional-constant (? exponent-part) (? floating-suffix)) + (and digit-sequence exponent-part (? floating-suffix)))) + +;; (6.4.4.2) +(define-peg-pattern hexadecimal-floating-constant all + (and hexadecimal-prefix + (or hexadecimal-fractional-constant + hexadecimal-digit-sequence) + binary-exponent-part + (? floating-suffix))) + +;; (6.4.4.2) +(define-peg-pattern fractional-constant all + (or (and (? digit-sequence) "." digit-sequence) + (and digit-sequence "."))) + +;; (6.4.4.2) +(define-peg-pattern exponent-part all + (and (or "e" "E") (? sign) digit-sequence)) + +;; (6.4.4.2) +(define-peg-pattern sign all + (or "+" "-")) + +;; (6.4.4.2) +(define-peg-pattern digit-sequence body + (+ digit)) + +;; (6.4.4.2) +(define-peg-pattern hexadecimal-fractional-constant all + (or (and (? hexadecimal-digit-sequence) "." hexadecimal-digit-sequence) + (and hexadecimal-digit-sequence "."))) + +;; (6.4.4.2) +(define-peg-pattern binary-exponent-part all + (and (ignore (or "p" "P")) + (? sign) + digit-sequence)) + +;; (6.4.4.2) +(define-peg-pattern hexadecimal-digit-sequence body + (+ hexadecimal-digit)) + +;; (6.4.4.2) +(define-peg-pattern floating-suffix all + (or "f" "l" "F" "L")) + +;; (6.4.4.3) +(define-peg-pattern enumeration-constant all + identifier) + +(define-peg-pattern character-prefix all + (or "L" "u" "U")) + +;; (6.4.4.4) +(define-peg-pattern character-constant all + (and (? character-prefix) + (ignore "'") + (+ c-char) + (ignore "'"))) + +;; (6.4.4.4) +(define-peg-pattern c-char body + (or (and (not-followed-by (or "'" "\\" "\n")) peg-any) + escape-sequence)) + +;; (6.4.4.4) +(define-peg-pattern escape-sequence all + (or simple-escape-sequence + octal-escape-sequence + hexadecimal-escape-sequence + universal-character-name)) + +;; (6.4.4.4) +(define-peg-pattern simple-escape-sequence all + (and (ignore "\\") (or "'" "\"" "?" "\\" + "a" "b" "f" "n" "r" "t" "v"))) + +;; (6.4.4.4) +(define-peg-pattern octal-escape-sequence all + (and (ignore "\\") octal-digit (? octal-digit) (? octal-digit))) + +;; (6.4.4.4) +(define-peg-pattern hexadecimal-escape-sequence all + (and (ignore "\\x") (+ hexadecimal-digit))) + +;; A.1.6 String literals + +;; (6.4.5) +(define-peg-pattern string-literal all + (and (? encoding-prefix) + (ignore "\"") + (* s-char) + (ignore "\""))) + +;; (6.4.5) +(define-peg-pattern encoding-prefix all + (or "u8" "u" "U" "L")) + +;; (6.4.5) +(define-peg-pattern s-char body + (or (and (not-followed-by (or "\"" "\\" "\n")) peg-any) + escape-sequence)) + +;;; A.1.7 + +;; (6.4.6) +(define-peg-pattern punctuator all + (or "[" "]" "(" ")" "{" "}" + "..." ; Moved to be before "." + "." "->" + "&&" "||" + "!=" + "++" "--" + "*=" "/=" "%=" "+=" "-=" "<<=" ">>=" "&=" "^=" "|=" + "<=" ">=" "==" + "=" + "/" "%" "<<" ">>" "<" ">" "^" "|" + "?" ":" ";" + "&" "*" "+" "-" "~" "!" + "," "##" "#" ; # and ## flipped + "<:" ":>" "<%" "%>" "%:%:" "%:" ; %: and %:%: flipped + )) + +;;; A.1.8 Header names + +(define-peg-pattern h-string all (+ h-char)) +(define-peg-pattern q-string all (+ q-char)) + +;; (6.4.7) +(define-peg-pattern header-name all + (or (and (ignore "<") h-string (ignore ">")) + ;; NOTE this case will never be reached, since it's treated as a regular + ;; string instead + (and (ignore "\"") q-string (ignore "\"")))) + +;; (6.4.7) +(define-peg-pattern h-char body + (or (and (not-followed-by (or ">" "\n")) peg-any) + escape-sequence)) + +;; (6.4.7) +(define-peg-pattern q-char body + (or (and (not-followed-by (or "\"" "\n")) peg-any) + escape-sequence)) + +;;; A.1.9 Preprocessing numbers + +;; (6.4.8) +(define-peg-pattern pp-number all + (and (? ".") digit + (* (or digit + identifier-nondigit + (and (or "e" "E" "p" "P") + sign) + ".")))) + + + +(define-peg-pattern whitespace all + (or "\t" "\n" "\v" "\f" " " + ;; "\r" + )) + +(define-peg-pattern block-comment body + (and (ignore "/*") + (* (and (not-followed-by "*/") + peg-any)) + (ignore "*/"))) + +(define-peg-pattern line-comment body + (and (ignore "//") + (* (and (not-followed-by "\n") + peg-any)))) + +(define-peg-pattern comment all + (or line-comment block-comment)) + +(define-peg-pattern non-whitespace all + (and (not-followed-by whitespace) + peg-any)) + +(define-peg-pattern preprocessing-tokens all + (* (or whitespace + comment + preprocessing-token + non-whitespace))) + + + +;; comment could be merged with whitespace, but then unlex would have to know that + +;; other is the "each non-white-space character that cannot be one of the above" +;; clause from 6.4 p. 1 + +(define-type (lexeme) + (type type: (memv '(whitespace comment preprocessing-token other placemaker))) + (body type: (or string? list?)) + (noexpand type: (list-of string?) + default: '())) + +(define (placemaker) + (lexeme type: 'placemaker body: '())) + +(define (lex-output->lexeme-object x) + (match x + (`(non-whitespace ,body) + (lexeme body: body type: 'other)) + (`(whitespace ,body) + (lexeme body: body type: 'whitespace )) + (`(comment ,body) + (lexeme body: body type: 'comment )) + (`(preprocessing-token ,body) + (match body + ('string-literal + ;; Unflatten case + (lexeme body: '(string-literal (encoding-prefix) "") + type: 'preprocessing-token)) + (('string-literal `(encoding-prefix ,px) args ...) + (lexeme body: `(string-literal (encoding-prefix . ,px) ,@args) + type: 'preprocessing-token)) + (('string-literal args ...) + (lexeme body: `(string-literal (encoding-prefix) ,@args) + type: 'preprocessing-token)) + (('character-constant `(character-prefix ,px) args ...) + (lexeme body: `(character-constant (character-prefix . ,px) + ,@args) + type: 'preprocessing-token)) + (('character-constant args ...) + (lexeme body: `(character-constant (character-prefix) ,@args) + type: 'preprocessing-token)) + (body (lexeme body: body type: 'preprocessing-token)))) + + ;; "unflatten" + ('comment (lexeme body: "" type: 'comment)))) + + + + +;; At a number of places I chose token depending on the order of the rule. The +;; standard however says that the longest possible choice should be used. +;; 6.4 p. 4 + +;; returns a list of lexemes +(define (lex string) + (if (string-null? string) + '() + (map lex-output->lexeme-object + (let ((result (match-pattern preprocessing-tokens string))) + (let ((trailing (substring (peg:string result) + (peg:end result)))) + (unless (string-null? trailing) + (scm-error 'cpp-lex-error "lex" + "Failed to lex string, remaining trailing characters: ~s" + (list trailing) #f))) + (unless (list? (peg:tree result)) + (scm-error 'cpp-lex-error "lex" + "Parsing just failed. Chars: ~s" + (list (peg:string result)) #f)) + (cdr (peg:tree result)))))) + + + + + +;; (parse-decimals "555" 10) +;; ⇒ 0.5549999999999999 +;; (parse-decimals "8" 16) +;; ⇒ 0.5 +(define (parse-decimals str base) + (/ (fold (lambda (digit done) + (let ((v (string->number digit base))) + (+ v (/ done base)))) + 0.0 + (map string (string->list str))) + base)) + +;; parse a number on form <digits>.<digits> +(define (parse-fractional str base) + (let* ((pair (string-split str #\.)) + (integer (list-ref pair 0)) + (decimals (list-ref pair 1))) + (+ (if (string-null? integer) + 0 (string->number integer 16)) + (if (string-null? decimals) + 0 (parse-decimals decimals 16))))) + + +(define (parse-float body) + (define (fractional-constant x) + (case x + ((decimal-floating-constant) 'fractional-constant) + ((hexadecimal-floating-constant) 'hexadecimal-fractional-constant))) + + (define (exponent-part x) + (case x + ((decimal-floating-constant) 'exponent-part) + ((hexadecimal-floating-constant) 'binary-exponent-part))) + + (define (expt-base x) + (case x + ((decimal-floating-constant) 10) + ((hexadecimal-floating-constant) 2))) + + (define (base x) + (case x + ((decimal-floating-constant) 10) + ((hexadecimal-floating-constant) 16))) + + (let ((type (car body)) + (body (cdr body))) + (* 1.0 + (cond ((assoc-ref body (fractional-constant type)) + => (lambda (fc) (parse-fractional (car fc) (base type)))) + (else (string->number (car body) (base type)))) + (cond ((assoc-ref body (exponent-part type)) + => (lambda (x) (expt (expt-base type) + (string->number (car x) (base type))))) + (else 1))) + ;; TODO do something with (possible) suffix + ;; (assoc-ref body 'floating-suffix) + )) + +(define (parse-integer body) + (let* (;; (suffix (assoc-ref body 'integer-suffix)) + (value (cadr (car body))) + (value-type (car (car body)))) + ;; TODO do something with suffix + (string->number + value + (case value-type + ((octal-constant) 8) + ((decimal-constant) 10) + ((hexadecimal-constant) 16))))) + +;; (parse-c-number "0x1.8p0") +;; ⇒ 1.5 + +;; TODO is "5ul" equivalent to "((unsigned long) 5)" +(define (parse-c-number string) + (cond ((match-pattern constant string) + => (lambda (m) + (let ((m (cadr (peg:tree m)))) ; Strip 'constant wrapper + (case (car m) + ((floating-constant) + (parse-float (cadr m))) + + ((integer-constant) + (parse-integer (cdr m))) + + ((enumeration-constant character-constant) + (scm-error 'misc-error "parse-c-number" + "Couldn't parse [~a] as a /number/ (~s)" + (list string m) #f)))))) + + (else (scm-error 'misc-error "parse-c-number" + "Couldn't parse [~a] as a number" + (list string) #f)))) + + + + +;;; 5.1.11.2 Translation phases + +(define (tokenize string) + (-> string +;;; 1. trigraph replacement + replace-trigraphs +;;; 2. Line folding + fold-lines +;;; 3. Decomposition into preprocenning tokens, whitespaces, and comments + lex + comments->whitespace)) + +;; These really belong in (c cpp-types), but that would create a dependency cycle + +(define (comment->whitespace token) + (if ;; (comment-token? token) + (and (lexeme? token) + (eq? 'comment (type token))) + (car (lex " ")) + token)) + +(define (comments->whitespace tokens) + (map comment->whitespace tokens)) |