aboutsummaryrefslogtreecommitdiff
path: root/module/c/lex2.scm
diff options
context:
space:
mode:
Diffstat (limited to 'module/c/lex2.scm')
-rw-r--r--module/c/lex2.scm549
1 files changed, 549 insertions, 0 deletions
diff --git a/module/c/lex2.scm b/module/c/lex2.scm
new file mode 100644
index 00000000..af90dcce
--- /dev/null
+++ b/module/c/lex2.scm
@@ -0,0 +1,549 @@
+(define-module (c lex2)
+ :use-module (ice-9 peg)
+ :use-module (ice-9 match)
+ :use-module ((hnh util) :select (->))
+ :use-module (hnh util object)
+ :use-module (hnh util type)
+ :use-module ((srfi srfi-1) :select (fold))
+ :use-module (srfi srfi-88)
+ :use-module ((c trigraph) :select (replace-trigraphs))
+ :use-module ((c line-fold) :select (fold-lines))
+ :export (lex
+ lexeme lexeme?
+ placemaker
+ (type . lexeme-type)
+ (body . lexeme-body)
+ (noexpand . lexeme-noexpand)
+
+ parse-c-number
+
+ tokenize
+ ))
+
+;;; A.1 Lexical grammar
+;;; A.1.1 Lexical elements
+
+;; (6.4)
+(define-peg-pattern token all
+ (or keyword
+ identifier
+ constant
+ string-literal
+ punctuator
+ ))
+
+;; (6.4)
+(define-peg-pattern preprocessing-token all
+ ;; string literal moved before header-name since string literals
+ ;; otherwise became q-strings
+ (or string-literal
+ header-name
+ character-constant
+ identifier
+ pp-number
+ punctuator
+ ;; Each non-white-space character that cannot be one of the above
+ ))
+
+;;; A.1.2 Keywords
+
+;; (6.4.1)
+(define-peg-pattern keyword all
+ (or "auto" "break" "case" "char" "const" "continue" "default"
+ "do" "double" "else" "enum" "extern" "float" "for" "goto"
+ "if" "inline" "int" "long" "register" "restrict" "return"
+ "short" "signed" "sizeof" "static" "struct" "switch"
+ "typedef" "union" "unsigned" "void" "volatile" "while"
+ "_Alignas" "_Alignof" "_Atomic" "_Bool" "_Complex"
+ "_Generic" "_Imaginary" "_Noreturn" "_Static_assert"
+ "_Thread_local"))
+
+;;; A.1.3 Identifiers
+
+;; (6.4.2.1)
+(define-peg-pattern identifier all
+ (and identifier-nondigit (* (or identifier-nondigit digit))))
+
+;; (6.4.2.1)
+(define-peg-pattern identifier-nondigit body
+ (or nondigit
+ universal-character-name
+ ;; TODO other implementation-defined characters
+ ))
+
+;; (6.4.2.1)
+(define-peg-pattern nondigit body
+ (or "_"
+ (range #\A #\Z)
+ (range #\a #\z)))
+
+;; (6.4.2.1)
+(define-peg-pattern digit body
+ (range #\0 #\9))
+
+;;; A.1.4 Universal character names
+
+;; (6.4.3)
+(define-peg-pattern universal-character-name all
+ (or (and (ignore "\\u") hex-quad)
+ (and (ignore "\\U") hex-quad hex-quad)))
+
+;; (6.4.3)
+(define-peg-pattern hex-quad body
+ (and hexadecimal-digit hexadecimal-digit
+ hexadecimal-digit hexadecimal-digit))
+
+;;; A.1.5 Constants
+
+;; (6.4.4)
+(define-peg-pattern constant all
+ ;; Int and float swapped from standard since we need to try parsing
+ ;; the floats beforehand
+ (or floating-constant
+ integer-constant
+ enumeration-constant
+ character-constant))
+
+;; (6.4.4.1)
+(define-peg-pattern integer-constant all
+ (and (or decimal-constant
+ hexadecimal-constant
+ octal-constant)
+ (? integer-suffix)))
+
+;; (6.4.4.1)
+(define-peg-pattern decimal-constant all
+ (and nonzero-digit (* digit)))
+
+;; (6.4.4.1)
+(define-peg-pattern octal-constant all
+ (and "0" (* octal-digit)))
+
+;; (6.4.4.1)
+(define-peg-pattern hexadecimal-constant all
+ (and hexadecimal-prefix (+ hexadecimal-digit)))
+
+;; (6.4.4.1)
+(define-peg-pattern hexadecimal-prefix none
+ (or "0x" "0X"))
+
+;; (6.4.4.1)
+(define-peg-pattern nonzero-digit body
+ (range #\1 #\9))
+
+;; (6.4.4.1)
+(define-peg-pattern octal-digit body
+ (range #\0 #\7))
+
+;; (6.4.4.1)
+(define-peg-pattern hexadecimal-digit body
+ (or (range #\0 #\9)
+ (range #\a #\f)
+ (range #\A #\F)))
+
+;; (6.4.4.1)
+(define-peg-pattern integer-suffix all
+ (or (and unsigned-suffix (? long-suffix))
+ (and long-suffix (? unsigned-suffix))))
+
+;; (6.4.4.1)
+;; This is a merger of long-suffix and long-long-suffix
+(define-peg-pattern long-suffix body
+ (or "l" "L" "ll" "LL"))
+
+;; (6.4.4.1)
+(define-peg-pattern unsigned-suffix body
+ (or "u" "U"))
+
+;; (6.4.4.2)
+(define-peg-pattern floating-constant all
+ (or decimal-floating-constant
+ hexadecimal-floating-constant))
+
+;; (6.4.4.2)
+(define-peg-pattern decimal-floating-constant all
+ (or (and fractional-constant (? exponent-part) (? floating-suffix))
+ (and digit-sequence exponent-part (? floating-suffix))))
+
+;; (6.4.4.2)
+(define-peg-pattern hexadecimal-floating-constant all
+ (and hexadecimal-prefix
+ (or hexadecimal-fractional-constant
+ hexadecimal-digit-sequence)
+ binary-exponent-part
+ (? floating-suffix)))
+
+;; (6.4.4.2)
+(define-peg-pattern fractional-constant all
+ (or (and (? digit-sequence) "." digit-sequence)
+ (and digit-sequence ".")))
+
+;; (6.4.4.2)
+(define-peg-pattern exponent-part all
+ (and (or "e" "E") (? sign) digit-sequence))
+
+;; (6.4.4.2)
+(define-peg-pattern sign all
+ (or "+" "-"))
+
+;; (6.4.4.2)
+(define-peg-pattern digit-sequence body
+ (+ digit))
+
+;; (6.4.4.2)
+(define-peg-pattern hexadecimal-fractional-constant all
+ (or (and (? hexadecimal-digit-sequence) "." hexadecimal-digit-sequence)
+ (and hexadecimal-digit-sequence ".")))
+
+;; (6.4.4.2)
+(define-peg-pattern binary-exponent-part all
+ (and (ignore (or "p" "P"))
+ (? sign)
+ digit-sequence))
+
+;; (6.4.4.2)
+(define-peg-pattern hexadecimal-digit-sequence body
+ (+ hexadecimal-digit))
+
+;; (6.4.4.2)
+(define-peg-pattern floating-suffix all
+ (or "f" "l" "F" "L"))
+
+;; (6.4.4.3)
+(define-peg-pattern enumeration-constant all
+ identifier)
+
+(define-peg-pattern character-prefix all
+ (or "L" "u" "U"))
+
+;; (6.4.4.4)
+(define-peg-pattern character-constant all
+ (and (? character-prefix)
+ (ignore "'")
+ (+ c-char)
+ (ignore "'")))
+
+;; (6.4.4.4)
+(define-peg-pattern c-char body
+ (or (and (not-followed-by (or "'" "\\" "\n")) peg-any)
+ escape-sequence))
+
+;; (6.4.4.4)
+(define-peg-pattern escape-sequence all
+ (or simple-escape-sequence
+ octal-escape-sequence
+ hexadecimal-escape-sequence
+ universal-character-name))
+
+;; (6.4.4.4)
+(define-peg-pattern simple-escape-sequence all
+ (and (ignore "\\") (or "'" "\"" "?" "\\"
+ "a" "b" "f" "n" "r" "t" "v")))
+
+;; (6.4.4.4)
+(define-peg-pattern octal-escape-sequence all
+ (and (ignore "\\") octal-digit (? octal-digit) (? octal-digit)))
+
+;; (6.4.4.4)
+(define-peg-pattern hexadecimal-escape-sequence all
+ (and (ignore "\\x") (+ hexadecimal-digit)))
+
+;; A.1.6 String literals
+
+;; (6.4.5)
+(define-peg-pattern string-literal all
+ (and (? encoding-prefix)
+ (ignore "\"")
+ (* s-char)
+ (ignore "\"")))
+
+;; (6.4.5)
+(define-peg-pattern encoding-prefix all
+ (or "u8" "u" "U" "L"))
+
+;; (6.4.5)
+(define-peg-pattern s-char body
+ (or (and (not-followed-by (or "\"" "\\" "\n")) peg-any)
+ escape-sequence))
+
+;;; A.1.7
+
+;; (6.4.6)
+(define-peg-pattern punctuator all
+ (or "[" "]" "(" ")" "{" "}"
+ "..." ; Moved to be before "."
+ "." "->"
+ "&&" "||"
+ "!="
+ "++" "--"
+ "*=" "/=" "%=" "+=" "-=" "<<=" ">>=" "&=" "^=" "|="
+ "<=" ">=" "=="
+ "="
+ "/" "%" "<<" ">>" "<" ">" "^" "|"
+ "?" ":" ";"
+ "&" "*" "+" "-" "~" "!"
+ "," "##" "#" ; # and ## flipped
+ "<:" ":>" "<%" "%>" "%:%:" "%:" ; %: and %:%: flipped
+ ))
+
+;;; A.1.8 Header names
+
+(define-peg-pattern h-string all (+ h-char))
+(define-peg-pattern q-string all (+ q-char))
+
+;; (6.4.7)
+(define-peg-pattern header-name all
+ (or (and (ignore "<") h-string (ignore ">"))
+ ;; NOTE this case will never be reached, since it's treated as a regular
+ ;; string instead
+ (and (ignore "\"") q-string (ignore "\""))))
+
+;; (6.4.7)
+(define-peg-pattern h-char body
+ (or (and (not-followed-by (or ">" "\n")) peg-any)
+ escape-sequence))
+
+;; (6.4.7)
+(define-peg-pattern q-char body
+ (or (and (not-followed-by (or "\"" "\n")) peg-any)
+ escape-sequence))
+
+;;; A.1.9 Preprocessing numbers
+
+;; (6.4.8)
+(define-peg-pattern pp-number all
+ (and (? ".") digit
+ (* (or digit
+ identifier-nondigit
+ (and (or "e" "E" "p" "P")
+ sign)
+ "."))))
+
+
+
+(define-peg-pattern whitespace all
+ (or "\t" "\n" "\v" "\f" " "
+ ;; "\r"
+ ))
+
+(define-peg-pattern block-comment body
+ (and (ignore "/*")
+ (* (and (not-followed-by "*/")
+ peg-any))
+ (ignore "*/")))
+
+(define-peg-pattern line-comment body
+ (and (ignore "//")
+ (* (and (not-followed-by "\n")
+ peg-any))))
+
+(define-peg-pattern comment all
+ (or line-comment block-comment))
+
+(define-peg-pattern non-whitespace all
+ (and (not-followed-by whitespace)
+ peg-any))
+
+(define-peg-pattern preprocessing-tokens all
+ (* (or whitespace
+ comment
+ preprocessing-token
+ non-whitespace)))
+
+
+
+;; comment could be merged with whitespace, but then unlex would have to know that
+
+;; other is the "each non-white-space character that cannot be one of the above"
+;; clause from 6.4 p. 1
+
+(define-type (lexeme)
+ (type type: (memv '(whitespace comment preprocessing-token other placemaker)))
+ (body type: (or string? list?))
+ (noexpand type: (list-of string?)
+ default: '()))
+
+(define (placemaker)
+ (lexeme type: 'placemaker body: '()))
+
+(define (lex-output->lexeme-object x)
+ (match x
+ (`(non-whitespace ,body)
+ (lexeme body: body type: 'other))
+ (`(whitespace ,body)
+ (lexeme body: body type: 'whitespace ))
+ (`(comment ,body)
+ (lexeme body: body type: 'comment ))
+ (`(preprocessing-token ,body)
+ (match body
+ ('string-literal
+ ;; Unflatten case
+ (lexeme body: '(string-literal (encoding-prefix) "")
+ type: 'preprocessing-token))
+ (('string-literal `(encoding-prefix ,px) args ...)
+ (lexeme body: `(string-literal (encoding-prefix . ,px) ,@args)
+ type: 'preprocessing-token))
+ (('string-literal args ...)
+ (lexeme body: `(string-literal (encoding-prefix) ,@args)
+ type: 'preprocessing-token))
+ (('character-constant `(character-prefix ,px) args ...)
+ (lexeme body: `(character-constant (character-prefix . ,px)
+ ,@args)
+ type: 'preprocessing-token))
+ (('character-constant args ...)
+ (lexeme body: `(character-constant (character-prefix) ,@args)
+ type: 'preprocessing-token))
+ (body (lexeme body: body type: 'preprocessing-token))))
+
+ ;; "unflatten"
+ ('comment (lexeme body: "" type: 'comment))))
+
+
+
+
+;; At a number of places I chose token depending on the order of the rule. The
+;; standard however says that the longest possible choice should be used.
+;; 6.4 p. 4
+
+;; returns a list of lexemes
+(define (lex string)
+ (if (string-null? string)
+ '()
+ (map lex-output->lexeme-object
+ (let ((result (match-pattern preprocessing-tokens string)))
+ (let ((trailing (substring (peg:string result)
+ (peg:end result))))
+ (unless (string-null? trailing)
+ (scm-error 'cpp-lex-error "lex"
+ "Failed to lex string, remaining trailing characters: ~s"
+ (list trailing) #f)))
+ (unless (list? (peg:tree result))
+ (scm-error 'cpp-lex-error "lex"
+ "Parsing just failed. Chars: ~s"
+ (list (peg:string result)) #f))
+ (cdr (peg:tree result))))))
+
+
+
+
+
+;; (parse-decimals "555" 10)
+;; ⇒ 0.5549999999999999
+;; (parse-decimals "8" 16)
+;; ⇒ 0.5
+(define (parse-decimals str base)
+ (/ (fold (lambda (digit done)
+ (let ((v (string->number digit base)))
+ (+ v (/ done base))))
+ 0.0
+ (map string (string->list str)))
+ base))
+
+;; parse a number on form <digits>.<digits>
+(define (parse-fractional str base)
+ (let* ((pair (string-split str #\.))
+ (integer (list-ref pair 0))
+ (decimals (list-ref pair 1)))
+ (+ (if (string-null? integer)
+ 0 (string->number integer 16))
+ (if (string-null? decimals)
+ 0 (parse-decimals decimals 16)))))
+
+
+(define (parse-float body)
+ (define (fractional-constant x)
+ (case x
+ ((decimal-floating-constant) 'fractional-constant)
+ ((hexadecimal-floating-constant) 'hexadecimal-fractional-constant)))
+
+ (define (exponent-part x)
+ (case x
+ ((decimal-floating-constant) 'exponent-part)
+ ((hexadecimal-floating-constant) 'binary-exponent-part)))
+
+ (define (expt-base x)
+ (case x
+ ((decimal-floating-constant) 10)
+ ((hexadecimal-floating-constant) 2)))
+
+ (define (base x)
+ (case x
+ ((decimal-floating-constant) 10)
+ ((hexadecimal-floating-constant) 16)))
+
+ (let ((type (car body))
+ (body (cdr body)))
+ (* 1.0
+ (cond ((assoc-ref body (fractional-constant type))
+ => (lambda (fc) (parse-fractional (car fc) (base type))))
+ (else (string->number (car body) (base type))))
+ (cond ((assoc-ref body (exponent-part type))
+ => (lambda (x) (expt (expt-base type)
+ (string->number (car x) (base type)))))
+ (else 1)))
+ ;; TODO do something with (possible) suffix
+ ;; (assoc-ref body 'floating-suffix)
+ ))
+
+(define (parse-integer body)
+ (let* (;; (suffix (assoc-ref body 'integer-suffix))
+ (value (cadr (car body)))
+ (value-type (car (car body))))
+ ;; TODO do something with suffix
+ (string->number
+ value
+ (case value-type
+ ((octal-constant) 8)
+ ((decimal-constant) 10)
+ ((hexadecimal-constant) 16)))))
+
+;; (parse-c-number "0x1.8p0")
+;; ⇒ 1.5
+
+;; TODO is "5ul" equivalent to "((unsigned long) 5)"
+(define (parse-c-number string)
+ (cond ((match-pattern constant string)
+ => (lambda (m)
+ (let ((m (cadr (peg:tree m)))) ; Strip 'constant wrapper
+ (case (car m)
+ ((floating-constant)
+ (parse-float (cadr m)))
+
+ ((integer-constant)
+ (parse-integer (cdr m)))
+
+ ((enumeration-constant character-constant)
+ (scm-error 'misc-error "parse-c-number"
+ "Couldn't parse [~a] as a /number/ (~s)"
+ (list string m) #f))))))
+
+ (else (scm-error 'misc-error "parse-c-number"
+ "Couldn't parse [~a] as a number"
+ (list string) #f))))
+
+
+
+
+;;; 5.1.11.2 Translation phases
+
+(define (tokenize string)
+ (-> string
+;;; 1. trigraph replacement
+ replace-trigraphs
+;;; 2. Line folding
+ fold-lines
+;;; 3. Decomposition into preprocenning tokens, whitespaces, and comments
+ lex
+ comments->whitespace))
+
+;; These really belong in (c cpp-types), but that would create a dependency cycle
+
+(define (comment->whitespace token)
+ (if ;; (comment-token? token)
+ (and (lexeme? token)
+ (eq? 'comment (type token)))
+ (car (lex " "))
+ token))
+
+(define (comments->whitespace tokens)
+ (map comment->whitespace tokens))