From 5a24d35fd7dd0e1f9b117b43e97d54bde84c9ca4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Hugo=20H=C3=B6rnquist?= Date: Wed, 20 Jul 2022 16:49:18 +0200 Subject: Major work on to-token. --- module/c/to-token.scm | 121 ++++++++++++++++++++++++++++++++++++++------ tests/test/cpp/to-token.scm | 65 ++++++++++++++++++++++++ 2 files changed, 171 insertions(+), 15 deletions(-) create mode 100644 tests/test/cpp/to-token.scm diff --git a/module/c/to-token.scm b/module/c/to-token.scm index 53db7e59..f5e459de 100644 --- a/module/c/to-token.scm +++ b/module/c/to-token.scm @@ -1,23 +1,90 @@ (define-module (c to-token) + :use-module ((srfi srfi-1) :select (fold append-map)) + :use-module (ice-9 match) + :use-module ((hnh util) :select (->)) :use-module ((system base lalr) :select (make-lexical-token)) :use-module (c cpp-types) :use-module ((c lex2) :select (parse-c-number)) ;; :use-module (hnh util type) + :use-module ((rnrs bytevectors) + :select (make-bytevector + bytevector-length + bytevector-copy! + u8-list->bytevector + bytevector-u8-ref)) + :use-module ((rnrs io ports) + :select (string->bytevector + make-transcoder + utf-8-codec)) :export (preprocessing-token->token)) (define (pp-number->c-number token) (parse-c-number (pp-number? token))) (define keywords - '("auto" "break" "case" "char" "const" "continue" "default" - "do" "double" "else" "enum" "extern" "float" "for" "goto" - "if" "inline" "int" "long" "register" "restrict" "return" - "short" "signed" "sizeof" "static" "struct" "switch" - "typedef" "union" "unsigned" "void" "volatile" "while" - "_Alignas" "_Alignof" "_Atomic" "_Bool" "_Complex" - "_Generic" "_Imaginary" "_Noreturn" "_Static_assert" - "_Thread_local")) + '(auto break case char const continue default + do double else enum extern float for goto + if inline int long register restrict return + short signed sizeof static struct switch + typedef union unsigned void volatile while + _Alignas _Alignof _Atomic _Bool _Complex + _Generic _Imaginary _Noreturn _Static_assert + _Thread_local)) + +(define (cpp-char->bytevector c) + (match c + (`(escape-sequence (simple-escape-sequence ,x)) + (case (string-ref x 0) + ((#\a) #vu8(#x07)) ; #\alarm + ((#\b) #vu8(#x08)) ; #\backspace + ((#\f) #vu8(#x0C)) ; #\page + ((#\n) #vu8(#x0A)) ; #\newline + ((#\r) #vu8(#x0D)) ; #\return + ((#\t) #vu8(#x09)) ; #\tab + ((#\v) #vu8(#x0B)) ; #\vtab + ;; ' " ? \ + (else (char->integer (string-ref x 0))))) + + ;; TODO these u8-list->bytevector should depend on the + ;; encoding prefix of the string/char + (`(escape-sequence (octal-escape-sequence ,x)) + (-> x (string->number 8) list u8-list->bytevector)) + + (`(escape-sequence (hexadecimal-escape-sequence ,x)) + (-> x (string->number 16) list u8-list->bytevector)) + + (`(escape-sequence (universal-character-name ,x)) + (let ((n (string->number x 16))) + (when (<= #xD800 x #xDFFF) + (error)) + (when (and (< x #xA0) + (or (not (= x #x24)) + (not (= x #x40)) + (not (= x #x60)))) + (error)) + (-> n + integer->char + string + (string->bytevector (make-transcoder (utf-8-codec)))))) + (_ (error)))) + +(define (concat-bytevectors bvs) + (define target (make-bytevector (apply + (map bytevector-length bvs)))) + (fold (lambda (bv offset) + (let ((len (bytevector-length bv))) + (bytevector-copy! bv 0 target offset len) + (+ offset len))) + 0 + bvs) + target) + +(define (handle-string-fragments content) + (map + (lambda (x) (if (string? x) + (string->bytevector x (make-transcoder (utf-8-codec))) + (cpp-char->bytevector x))) + content)) ;; 6.4 paragraph 2 ;; Each preprocessing toket thas is converted to a token shall have the @@ -28,23 +95,47 @@ ;; (cond (expr check => proc) ...) (cond ((string-token? cpp-token) (lambda (a . _) a) - => (lambda content - (make-lexical-token 'string-literal #f content))) + => (lambda (encoding . content) + (make-lexical-token + 'string-literal #f + (concat-bytevectors + (append + ;; TODO this should depend on encoding + (handle-string-fragments content) + (list #vu8(0))))))) ((identifier-token? cpp-token) => (lambda (name) - (if (member name keywords) - (string->symbol name) - (make-lexical-token 'identifier #f name)))) + (let ((name (string->symbol name))) + (if (memv name keywords) + name + (make-lexical-token 'identifier #f name))))) ((pp-number? cpp-token) => (lambda (content) ;; TOOD should return an integer-constant or a floating-constant - (make-lexical-token 'constant #f content))) + (make-lexical-token 'constant #f (parse-c-number content)))) ((character-constant? cpp-token) (lambda (a . _) a) - => (lambda content (make-lexical-token 'constant #f content))) + => (lambda (encoding . content) + (make-lexical-token + 'constant #f + ;; TODO that to do with multi-byte characters? + ;; > 'ab' == 'a' << 8 | 'b' == 0x6162 + ;; > '\x1234' == 0x1234 + ;; GCC prints 34 for the following expression + ;; > printf("%x\n", '\x1234'); + ;; but 6162 for this + ;; > printf("%x\n", 'ab'); + + ;; What about + ;; > 'a\x1234' == a << 16 | 'b' == 0x611234 + (let ((bv (concat-bytevectors + ;; TODO this should depend on encoding + (handle-string-fragments content)))) + ;; TODO maybe actually store multiple bytes from multi byte literals + (bytevector-u8-ref bv (1- (bytevector-length bv))))))) ((punctuator-token? cpp-token) => (lambda (s) diff --git a/tests/test/cpp/to-token.scm b/tests/test/cpp/to-token.scm new file mode 100644 index 00000000..b633ce12 --- /dev/null +++ b/tests/test/cpp/to-token.scm @@ -0,0 +1,65 @@ +(define-module (test cpp to-token) + :use-module (srfi srfi-64) + :use-module (srfi srfi-64 test-error) + :use-module (srfi srfi-88) + :use-module (c to-token) + :use-module ((system base lalr) + :select (lexical-token-category + lexical-token-value)) + :use-module ((c lex2) :select (lex)) + ) + +(test-group "string tokens" + (let ((v (preprocessing-token->token (car (lex "\"Hello\""))))) + (test-equal 'string-literal (lexical-token-category v)) + (test-equal #vu8(#x48 #x65 #x6C #x6C #x6F 0) (lexical-token-value v)) + ;; TODO prefixes + )) + +(test-group "identifier tokens" + (let ((v (preprocessing-token->token (car (lex "hello"))))) + (test-equal 'identifier (lexical-token-category v)) + (test-equal 'hello (lexical-token-value v)))) + +(test-group "keywords" + (test-equal 'auto (preprocessing-token->token (car (lex "auto"))))) + +(test-group "numbers" + (test-group "Integers" + (test-group "Base-10" + (let ((v (preprocessing-token->token (car (lex "1"))))) + (test-equal 'constant (lexical-token-category v)) + (test-equal 1 (lexical-token-value v)))) + + (test-equal "Base-16" + 16 (lexical-token-value (preprocessing-token->token (car (lex "0x10"))))) + (test-equal "Base-8" + 8 (lexical-token-value (preprocessing-token->token (car (lex "010"))))) + (test-group "Suffixes" + 'TODO + )) + + ;; TODO floats + ) + +(test-group "character constants" + (let ((v (preprocessing-token->token (car (lex "'a'"))))) + (test-equal 'constant (lexical-token-category v)) + (test-equal (char->integer #\a) (lexical-token-value v)) ) + (let ((v (preprocessing-token->token (car (lex "'ab'"))))) + (test-equal 'constant (lexical-token-category v)) + (test-equal (char->integer #\b) (lexical-token-value v))) + (let ((v (preprocessing-token->token (car (lex "'\\x41'"))))) + (test-equal 'constant (lexical-token-category v)) + (test-equal #x41 (lexical-token-value v))) + ;; (lex "'\\x4142'") + ;; (lex "'L\\x4142'") + ) + +(test-group "punctuators" + (test-equal '+ (preprocessing-token->token (car (lex "+")))) + (test-equal 'lbrace (preprocessing-token->token (car (lex "{"))))) + +(test-group "other" + (test-error 'cpp-error (preprocessing-token->token (car (lex " "))))) + -- cgit v1.2.3