diff options
Diffstat (limited to '')
-rw-r--r-- | module/c/to-token.scm | 161 |
1 files changed, 161 insertions, 0 deletions
diff --git a/module/c/to-token.scm b/module/c/to-token.scm new file mode 100644 index 00000000..f5e459de --- /dev/null +++ b/module/c/to-token.scm @@ -0,0 +1,161 @@ +(define-module (c to-token) + :use-module ((srfi srfi-1) :select (fold append-map)) + :use-module (ice-9 match) + :use-module ((hnh util) :select (->)) + :use-module ((system base lalr) + :select (make-lexical-token)) + :use-module (c cpp-types) + :use-module ((c lex2) :select (parse-c-number)) + ;; :use-module (hnh util type) + :use-module ((rnrs bytevectors) + :select (make-bytevector + bytevector-length + bytevector-copy! + u8-list->bytevector + bytevector-u8-ref)) + :use-module ((rnrs io ports) + :select (string->bytevector + make-transcoder + utf-8-codec)) + :export (preprocessing-token->token)) + +(define (pp-number->c-number token) + (parse-c-number (pp-number? token))) + +(define keywords + '(auto break case char const continue default + do double else enum extern float for goto + if inline int long register restrict return + short signed sizeof static struct switch + typedef union unsigned void volatile while + _Alignas _Alignof _Atomic _Bool _Complex + _Generic _Imaginary _Noreturn _Static_assert + _Thread_local)) + +(define (cpp-char->bytevector c) + (match c + (`(escape-sequence (simple-escape-sequence ,x)) + (case (string-ref x 0) + ((#\a) #vu8(#x07)) ; #\alarm + ((#\b) #vu8(#x08)) ; #\backspace + ((#\f) #vu8(#x0C)) ; #\page + ((#\n) #vu8(#x0A)) ; #\newline + ((#\r) #vu8(#x0D)) ; #\return + ((#\t) #vu8(#x09)) ; #\tab + ((#\v) #vu8(#x0B)) ; #\vtab + ;; ' " ? \ + (else (char->integer (string-ref x 0))))) + + ;; TODO these u8-list->bytevector should depend on the + ;; encoding prefix of the string/char + (`(escape-sequence (octal-escape-sequence ,x)) + (-> x (string->number 8) list u8-list->bytevector)) + + (`(escape-sequence (hexadecimal-escape-sequence ,x)) + (-> x (string->number 16) list u8-list->bytevector)) + + (`(escape-sequence (universal-character-name ,x)) + (let ((n (string->number x 16))) + (when (<= #xD800 x #xDFFF) + (error)) + (when (and (< x #xA0) + (or (not (= x #x24)) + (not (= x #x40)) + (not (= x #x60)))) + (error)) + (-> n + integer->char + string + (string->bytevector (make-transcoder (utf-8-codec)))))) + (_ (error)))) + +(define (concat-bytevectors bvs) + (define target (make-bytevector (apply + (map bytevector-length bvs)))) + (fold (lambda (bv offset) + (let ((len (bytevector-length bv))) + (bytevector-copy! bv 0 target offset len) + (+ offset len))) + 0 + bvs) + target) + +(define (handle-string-fragments content) + (map + (lambda (x) (if (string? x) + (string->bytevector x (make-transcoder (utf-8-codec))) + (cpp-char->bytevector x))) + content)) + +;; 6.4 paragraph 2 +;; Each preprocessing toket thas is converted to a token shall have the +;; lexcal form of a keyword, an identifier, a constant, a string literal, +;; or a puncturtor +(define (preprocessing-token->token cpp-token) + ;; Guile's cond handles multiple from expr, if written on the form + ;; (cond (expr check => proc) ...) + (cond ((string-token? cpp-token) + (lambda (a . _) a) + => (lambda (encoding . content) + (make-lexical-token + 'string-literal #f + (concat-bytevectors + (append + ;; TODO this should depend on encoding + (handle-string-fragments content) + (list #vu8(0))))))) + + ((identifier-token? cpp-token) + => (lambda (name) + (let ((name (string->symbol name))) + (if (memv name keywords) + name + (make-lexical-token 'identifier #f name))))) + + ((pp-number? cpp-token) + => (lambda (content) + ;; TOOD should return an integer-constant or a floating-constant + (make-lexical-token 'constant #f (parse-c-number content)))) + + ((character-constant? cpp-token) + (lambda (a . _) a) + => (lambda (encoding . content) + (make-lexical-token + 'constant #f + ;; TODO that to do with multi-byte characters? + ;; > 'ab' == 'a' << 8 | 'b' == 0x6162 + ;; > '\x1234' == 0x1234 + ;; GCC prints 34 for the following expression + ;; > printf("%x\n", '\x1234'); + ;; but 6162 for this + ;; > printf("%x\n", 'ab'); + + ;; What about + ;; > 'a\x1234' == a << 16 | 'b' == 0x611234 + (let ((bv (concat-bytevectors + ;; TODO this should depend on encoding + (handle-string-fragments content)))) + ;; TODO maybe actually store multiple bytes from multi byte literals + (bytevector-u8-ref bv (1- (bytevector-length bv))))))) + + ((punctuator-token? cpp-token) + => (lambda (s) + (cond ((string=? s "{") 'lbrace) + ((string=? s "}") 'rbrace) + ((string=? s "[") 'lbrack) + ((string=? s "]") 'rbrack) + ((string=? s "(") 'lparen) + ((string=? s ")") 'rparen) + ((string=? s ".") 'dot) + ((string=? s "|") 'pipe) + ((string=? s "||") 'pipe2) + ((string=? s ";") 'semicolon) + ((string=? s "|=") 'pipe=) + ((string=? s ",") 'comma) + ((string=? s "#") 'hash) + ((string=? s "##") 'hash2) + (else (string->symbol s))))) + + (else + (scm-error 'cpp-error "preprocessing-token->token" + "Can't convert ~s into a \"regular\" token." + (list cpp-token) #f)))) |