aboutsummaryrefslogtreecommitdiff
path: root/module/c/to-token.scm
diff options
context:
space:
mode:
Diffstat (limited to '')
-rw-r--r--module/c/to-token.scm161
1 files changed, 161 insertions, 0 deletions
diff --git a/module/c/to-token.scm b/module/c/to-token.scm
new file mode 100644
index 00000000..f5e459de
--- /dev/null
+++ b/module/c/to-token.scm
@@ -0,0 +1,161 @@
+(define-module (c to-token)
+ :use-module ((srfi srfi-1) :select (fold append-map))
+ :use-module (ice-9 match)
+ :use-module ((hnh util) :select (->))
+ :use-module ((system base lalr)
+ :select (make-lexical-token))
+ :use-module (c cpp-types)
+ :use-module ((c lex2) :select (parse-c-number))
+ ;; :use-module (hnh util type)
+ :use-module ((rnrs bytevectors)
+ :select (make-bytevector
+ bytevector-length
+ bytevector-copy!
+ u8-list->bytevector
+ bytevector-u8-ref))
+ :use-module ((rnrs io ports)
+ :select (string->bytevector
+ make-transcoder
+ utf-8-codec))
+ :export (preprocessing-token->token))
+
+(define (pp-number->c-number token)
+ (parse-c-number (pp-number? token)))
+
+(define keywords
+ '(auto break case char const continue default
+ do double else enum extern float for goto
+ if inline int long register restrict return
+ short signed sizeof static struct switch
+ typedef union unsigned void volatile while
+ _Alignas _Alignof _Atomic _Bool _Complex
+ _Generic _Imaginary _Noreturn _Static_assert
+ _Thread_local))
+
+(define (cpp-char->bytevector c)
+ (match c
+ (`(escape-sequence (simple-escape-sequence ,x))
+ (case (string-ref x 0)
+ ((#\a) #vu8(#x07)) ; #\alarm
+ ((#\b) #vu8(#x08)) ; #\backspace
+ ((#\f) #vu8(#x0C)) ; #\page
+ ((#\n) #vu8(#x0A)) ; #\newline
+ ((#\r) #vu8(#x0D)) ; #\return
+ ((#\t) #vu8(#x09)) ; #\tab
+ ((#\v) #vu8(#x0B)) ; #\vtab
+ ;; ' " ? \
+ (else (char->integer (string-ref x 0)))))
+
+ ;; TODO these u8-list->bytevector should depend on the
+ ;; encoding prefix of the string/char
+ (`(escape-sequence (octal-escape-sequence ,x))
+ (-> x (string->number 8) list u8-list->bytevector))
+
+ (`(escape-sequence (hexadecimal-escape-sequence ,x))
+ (-> x (string->number 16) list u8-list->bytevector))
+
+ (`(escape-sequence (universal-character-name ,x))
+ (let ((n (string->number x 16)))
+ (when (<= #xD800 x #xDFFF)
+ (error))
+ (when (and (< x #xA0)
+ (or (not (= x #x24))
+ (not (= x #x40))
+ (not (= x #x60))))
+ (error))
+ (-> n
+ integer->char
+ string
+ (string->bytevector (make-transcoder (utf-8-codec))))))
+ (_ (error))))
+
+(define (concat-bytevectors bvs)
+ (define target (make-bytevector (apply + (map bytevector-length bvs))))
+ (fold (lambda (bv offset)
+ (let ((len (bytevector-length bv)))
+ (bytevector-copy! bv 0 target offset len)
+ (+ offset len)))
+ 0
+ bvs)
+ target)
+
+(define (handle-string-fragments content)
+ (map
+ (lambda (x) (if (string? x)
+ (string->bytevector x (make-transcoder (utf-8-codec)))
+ (cpp-char->bytevector x)))
+ content))
+
+;; 6.4 paragraph 2
+;; Each preprocessing toket thas is converted to a token shall have the
+;; lexcal form of a keyword, an identifier, a constant, a string literal,
+;; or a puncturtor
+(define (preprocessing-token->token cpp-token)
+ ;; Guile's cond handles multiple from expr, if written on the form
+ ;; (cond (expr check => proc) ...)
+ (cond ((string-token? cpp-token)
+ (lambda (a . _) a)
+ => (lambda (encoding . content)
+ (make-lexical-token
+ 'string-literal #f
+ (concat-bytevectors
+ (append
+ ;; TODO this should depend on encoding
+ (handle-string-fragments content)
+ (list #vu8(0)))))))
+
+ ((identifier-token? cpp-token)
+ => (lambda (name)
+ (let ((name (string->symbol name)))
+ (if (memv name keywords)
+ name
+ (make-lexical-token 'identifier #f name)))))
+
+ ((pp-number? cpp-token)
+ => (lambda (content)
+ ;; TOOD should return an integer-constant or a floating-constant
+ (make-lexical-token 'constant #f (parse-c-number content))))
+
+ ((character-constant? cpp-token)
+ (lambda (a . _) a)
+ => (lambda (encoding . content)
+ (make-lexical-token
+ 'constant #f
+ ;; TODO that to do with multi-byte characters?
+ ;; > 'ab' == 'a' << 8 | 'b' == 0x6162
+ ;; > '\x1234' == 0x1234
+ ;; GCC prints 34 for the following expression
+ ;; > printf("%x\n", '\x1234');
+ ;; but 6162 for this
+ ;; > printf("%x\n", 'ab');
+
+ ;; What about
+ ;; > 'a\x1234' == a << 16 | 'b' == 0x611234
+ (let ((bv (concat-bytevectors
+ ;; TODO this should depend on encoding
+ (handle-string-fragments content))))
+ ;; TODO maybe actually store multiple bytes from multi byte literals
+ (bytevector-u8-ref bv (1- (bytevector-length bv)))))))
+
+ ((punctuator-token? cpp-token)
+ => (lambda (s)
+ (cond ((string=? s "{") 'lbrace)
+ ((string=? s "}") 'rbrace)
+ ((string=? s "[") 'lbrack)
+ ((string=? s "]") 'rbrack)
+ ((string=? s "(") 'lparen)
+ ((string=? s ")") 'rparen)
+ ((string=? s ".") 'dot)
+ ((string=? s "|") 'pipe)
+ ((string=? s "||") 'pipe2)
+ ((string=? s ";") 'semicolon)
+ ((string=? s "|=") 'pipe=)
+ ((string=? s ",") 'comma)
+ ((string=? s "#") 'hash)
+ ((string=? s "##") 'hash2)
+ (else (string->symbol s)))))
+
+ (else
+ (scm-error 'cpp-error "preprocessing-token->token"
+ "Can't convert ~s into a \"regular\" token."
+ (list cpp-token) #f))))