(define-module (c to-token) :use-module ((srfi srfi-1) :select (fold append-map)) :use-module (ice-9 match) :use-module ((hnh util) :select (->)) :use-module ((system base lalr) :select (make-lexical-token)) :use-module (c cpp-types) :use-module ((c lex2) :select (parse-c-number)) ;; :use-module (hnh util type) :use-module ((rnrs bytevectors) :select (make-bytevector bytevector-length bytevector-copy! u8-list->bytevector bytevector-u8-ref)) :use-module ((rnrs io ports) :select (string->bytevector make-transcoder utf-8-codec)) :export (preprocessing-token->token)) (define (pp-number->c-number token) (parse-c-number (pp-number? token))) (define keywords '(auto break case char const continue default do double else enum extern float for goto if inline int long register restrict return short signed sizeof static struct switch typedef union unsigned void volatile while _Alignas _Alignof _Atomic _Bool _Complex _Generic _Imaginary _Noreturn _Static_assert _Thread_local)) (define (cpp-char->bytevector c) (match c (`(escape-sequence (simple-escape-sequence ,x)) (case (string-ref x 0) ((#\a) #vu8(#x07)) ; #\alarm ((#\b) #vu8(#x08)) ; #\backspace ((#\f) #vu8(#x0C)) ; #\page ((#\n) #vu8(#x0A)) ; #\newline ((#\r) #vu8(#x0D)) ; #\return ((#\t) #vu8(#x09)) ; #\tab ((#\v) #vu8(#x0B)) ; #\vtab ;; ' " ? \ (else (char->integer (string-ref x 0))))) ;; TODO these u8-list->bytevector should depend on the ;; encoding prefix of the string/char (`(escape-sequence (octal-escape-sequence ,x)) (-> x (string->number 8) list u8-list->bytevector)) (`(escape-sequence (hexadecimal-escape-sequence ,x)) (-> x (string->number 16) list u8-list->bytevector)) (`(escape-sequence (universal-character-name ,x)) (let ((n (string->number x 16))) (when (<= #xD800 x #xDFFF) (error)) (when (and (< x #xA0) (or (not (= x #x24)) (not (= x #x40)) (not (= x #x60)))) (error)) (-> n integer->char string (string->bytevector (make-transcoder (utf-8-codec)))))) (_ (error)))) (define (concat-bytevectors bvs) (define target (make-bytevector (apply + (map bytevector-length bvs)))) (fold (lambda (bv offset) (let ((len (bytevector-length bv))) (bytevector-copy! bv 0 target offset len) (+ offset len))) 0 bvs) target) (define (handle-string-fragments content) (map (lambda (x) (if (string? x) (string->bytevector x (make-transcoder (utf-8-codec))) (cpp-char->bytevector x))) content)) ;; 6.4 paragraph 2 ;; Each preprocessing toket thas is converted to a token shall have the ;; lexcal form of a keyword, an identifier, a constant, a string literal, ;; or a puncturtor (define (preprocessing-token->token cpp-token) ;; Guile's cond handles multiple from expr, if written on the form ;; (cond (expr check => proc) ...) (cond ((string-token? cpp-token) (lambda (a . _) a) => (lambda (encoding . content) (make-lexical-token 'string-literal #f (concat-bytevectors (append ;; TODO this should depend on encoding (handle-string-fragments content) (list #vu8(0))))))) ((identifier-token? cpp-token) => (lambda (name) (let ((name (string->symbol name))) (if (memv name keywords) name (make-lexical-token 'identifier #f name))))) ((pp-number? cpp-token) => (lambda (content) ;; TOOD should return an integer-constant or a floating-constant (make-lexical-token 'constant #f (parse-c-number content)))) ((character-constant? cpp-token) (lambda (a . _) a) => (lambda (encoding . content) (make-lexical-token 'constant #f ;; TODO that to do with multi-byte characters? ;; > 'ab' == 'a' << 8 | 'b' == 0x6162 ;; > '\x1234' == 0x1234 ;; GCC prints 34 for the following expression ;; > printf("%x\n", '\x1234'); ;; but 6162 for this ;; > printf("%x\n", 'ab'); ;; What about ;; > 'a\x1234' == a << 16 | 'b' == 0x611234 (let ((bv (concat-bytevectors ;; TODO this should depend on encoding (handle-string-fragments content)))) ;; TODO maybe actually store multiple bytes from multi byte literals (bytevector-u8-ref bv (1- (bytevector-length bv))))))) ((punctuator-token? cpp-token) => (lambda (s) (cond ((string=? s "{") 'lbrace) ((string=? s "}") 'rbrace) ((string=? s "[") 'lbrack) ((string=? s "]") 'rbrack) ((string=? s "(") 'lparen) ((string=? s ")") 'rparen) ((string=? s ".") 'dot) ((string=? s "|") 'pipe) ((string=? s "||") 'pipe2) ((string=? s ";") 'semicolon) ((string=? s "|=") 'pipe=) ((string=? s ",") 'comma) ((string=? s "#") 'hash) ((string=? s "##") 'hash2) (else (string->symbol s))))) (else (scm-error 'cpp-error "preprocessing-token->token" "Can't convert ~s into a \"regular\" token." (list cpp-token) #f))))