aboutsummaryrefslogtreecommitdiff
path: root/module/c/to-token.scm
diff options
context:
space:
mode:
Diffstat (limited to 'module/c/to-token.scm')
-rw-r--r--module/c/to-token.scm121
1 files changed, 106 insertions, 15 deletions
diff --git a/module/c/to-token.scm b/module/c/to-token.scm
index 53db7e59..f5e459de 100644
--- a/module/c/to-token.scm
+++ b/module/c/to-token.scm
@@ -1,23 +1,90 @@
(define-module (c to-token)
+ :use-module ((srfi srfi-1) :select (fold append-map))
+ :use-module (ice-9 match)
+ :use-module ((hnh util) :select (->))
:use-module ((system base lalr)
:select (make-lexical-token))
:use-module (c cpp-types)
:use-module ((c lex2) :select (parse-c-number))
;; :use-module (hnh util type)
+ :use-module ((rnrs bytevectors)
+ :select (make-bytevector
+ bytevector-length
+ bytevector-copy!
+ u8-list->bytevector
+ bytevector-u8-ref))
+ :use-module ((rnrs io ports)
+ :select (string->bytevector
+ make-transcoder
+ utf-8-codec))
:export (preprocessing-token->token))
(define (pp-number->c-number token)
(parse-c-number (pp-number? token)))
(define keywords
- '("auto" "break" "case" "char" "const" "continue" "default"
- "do" "double" "else" "enum" "extern" "float" "for" "goto"
- "if" "inline" "int" "long" "register" "restrict" "return"
- "short" "signed" "sizeof" "static" "struct" "switch"
- "typedef" "union" "unsigned" "void" "volatile" "while"
- "_Alignas" "_Alignof" "_Atomic" "_Bool" "_Complex"
- "_Generic" "_Imaginary" "_Noreturn" "_Static_assert"
- "_Thread_local"))
+ '(auto break case char const continue default
+ do double else enum extern float for goto
+ if inline int long register restrict return
+ short signed sizeof static struct switch
+ typedef union unsigned void volatile while
+ _Alignas _Alignof _Atomic _Bool _Complex
+ _Generic _Imaginary _Noreturn _Static_assert
+ _Thread_local))
+
+(define (cpp-char->bytevector c)
+ (match c
+ (`(escape-sequence (simple-escape-sequence ,x))
+ (case (string-ref x 0)
+ ((#\a) #vu8(#x07)) ; #\alarm
+ ((#\b) #vu8(#x08)) ; #\backspace
+ ((#\f) #vu8(#x0C)) ; #\page
+ ((#\n) #vu8(#x0A)) ; #\newline
+ ((#\r) #vu8(#x0D)) ; #\return
+ ((#\t) #vu8(#x09)) ; #\tab
+ ((#\v) #vu8(#x0B)) ; #\vtab
+ ;; ' " ? \
+ (else (char->integer (string-ref x 0)))))
+
+ ;; TODO these u8-list->bytevector should depend on the
+ ;; encoding prefix of the string/char
+ (`(escape-sequence (octal-escape-sequence ,x))
+ (-> x (string->number 8) list u8-list->bytevector))
+
+ (`(escape-sequence (hexadecimal-escape-sequence ,x))
+ (-> x (string->number 16) list u8-list->bytevector))
+
+ (`(escape-sequence (universal-character-name ,x))
+ (let ((n (string->number x 16)))
+ (when (<= #xD800 x #xDFFF)
+ (error))
+ (when (and (< x #xA0)
+ (or (not (= x #x24))
+ (not (= x #x40))
+ (not (= x #x60))))
+ (error))
+ (-> n
+ integer->char
+ string
+ (string->bytevector (make-transcoder (utf-8-codec))))))
+ (_ (error))))
+
+(define (concat-bytevectors bvs)
+ (define target (make-bytevector (apply + (map bytevector-length bvs))))
+ (fold (lambda (bv offset)
+ (let ((len (bytevector-length bv)))
+ (bytevector-copy! bv 0 target offset len)
+ (+ offset len)))
+ 0
+ bvs)
+ target)
+
+(define (handle-string-fragments content)
+ (map
+ (lambda (x) (if (string? x)
+ (string->bytevector x (make-transcoder (utf-8-codec)))
+ (cpp-char->bytevector x)))
+ content))
;; 6.4 paragraph 2
;; Each preprocessing toket thas is converted to a token shall have the
@@ -28,23 +95,47 @@
;; (cond (expr check => proc) ...)
(cond ((string-token? cpp-token)
(lambda (a . _) a)
- => (lambda content
- (make-lexical-token 'string-literal #f content)))
+ => (lambda (encoding . content)
+ (make-lexical-token
+ 'string-literal #f
+ (concat-bytevectors
+ (append
+ ;; TODO this should depend on encoding
+ (handle-string-fragments content)
+ (list #vu8(0)))))))
((identifier-token? cpp-token)
=> (lambda (name)
- (if (member name keywords)
- (string->symbol name)
- (make-lexical-token 'identifier #f name))))
+ (let ((name (string->symbol name)))
+ (if (memv name keywords)
+ name
+ (make-lexical-token 'identifier #f name)))))
((pp-number? cpp-token)
=> (lambda (content)
;; TOOD should return an integer-constant or a floating-constant
- (make-lexical-token 'constant #f content)))
+ (make-lexical-token 'constant #f (parse-c-number content))))
((character-constant? cpp-token)
(lambda (a . _) a)
- => (lambda content (make-lexical-token 'constant #f content)))
+ => (lambda (encoding . content)
+ (make-lexical-token
+ 'constant #f
+ ;; TODO that to do with multi-byte characters?
+ ;; > 'ab' == 'a' << 8 | 'b' == 0x6162
+ ;; > '\x1234' == 0x1234
+ ;; GCC prints 34 for the following expression
+ ;; > printf("%x\n", '\x1234');
+ ;; but 6162 for this
+ ;; > printf("%x\n", 'ab');
+
+ ;; What about
+ ;; > 'a\x1234' == a << 16 | 'b' == 0x611234
+ (let ((bv (concat-bytevectors
+ ;; TODO this should depend on encoding
+ (handle-string-fragments content))))
+ ;; TODO maybe actually store multiple bytes from multi byte literals
+ (bytevector-u8-ref bv (1- (bytevector-length bv)))))))
((punctuator-token? cpp-token)
=> (lambda (s)