aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorHugo Hörnquist <hugo@lysator.liu.se>2022-07-20 16:49:18 +0200
committerHugo Hörnquist <hugo@lysator.liu.se>2022-07-21 14:27:23 +0200
commit5a24d35fd7dd0e1f9b117b43e97d54bde84c9ca4 (patch)
treee2caab98e204998563702041badcfa14ebbc705d
parentAcknowledge string prefixes. (diff)
downloadcalp-5a24d35fd7dd0e1f9b117b43e97d54bde84c9ca4.tar.gz
calp-5a24d35fd7dd0e1f9b117b43e97d54bde84c9ca4.tar.xz
Major work on to-token.
-rw-r--r--module/c/to-token.scm121
-rw-r--r--tests/test/cpp/to-token.scm65
2 files changed, 171 insertions, 15 deletions
diff --git a/module/c/to-token.scm b/module/c/to-token.scm
index 53db7e59..f5e459de 100644
--- a/module/c/to-token.scm
+++ b/module/c/to-token.scm
@@ -1,23 +1,90 @@
(define-module (c to-token)
+ :use-module ((srfi srfi-1) :select (fold append-map))
+ :use-module (ice-9 match)
+ :use-module ((hnh util) :select (->))
:use-module ((system base lalr)
:select (make-lexical-token))
:use-module (c cpp-types)
:use-module ((c lex2) :select (parse-c-number))
;; :use-module (hnh util type)
+ :use-module ((rnrs bytevectors)
+ :select (make-bytevector
+ bytevector-length
+ bytevector-copy!
+ u8-list->bytevector
+ bytevector-u8-ref))
+ :use-module ((rnrs io ports)
+ :select (string->bytevector
+ make-transcoder
+ utf-8-codec))
:export (preprocessing-token->token))
(define (pp-number->c-number token)
(parse-c-number (pp-number? token)))
(define keywords
- '("auto" "break" "case" "char" "const" "continue" "default"
- "do" "double" "else" "enum" "extern" "float" "for" "goto"
- "if" "inline" "int" "long" "register" "restrict" "return"
- "short" "signed" "sizeof" "static" "struct" "switch"
- "typedef" "union" "unsigned" "void" "volatile" "while"
- "_Alignas" "_Alignof" "_Atomic" "_Bool" "_Complex"
- "_Generic" "_Imaginary" "_Noreturn" "_Static_assert"
- "_Thread_local"))
+ '(auto break case char const continue default
+ do double else enum extern float for goto
+ if inline int long register restrict return
+ short signed sizeof static struct switch
+ typedef union unsigned void volatile while
+ _Alignas _Alignof _Atomic _Bool _Complex
+ _Generic _Imaginary _Noreturn _Static_assert
+ _Thread_local))
+
+(define (cpp-char->bytevector c)
+ (match c
+ (`(escape-sequence (simple-escape-sequence ,x))
+ (case (string-ref x 0)
+ ((#\a) #vu8(#x07)) ; #\alarm
+ ((#\b) #vu8(#x08)) ; #\backspace
+ ((#\f) #vu8(#x0C)) ; #\page
+ ((#\n) #vu8(#x0A)) ; #\newline
+ ((#\r) #vu8(#x0D)) ; #\return
+ ((#\t) #vu8(#x09)) ; #\tab
+ ((#\v) #vu8(#x0B)) ; #\vtab
+ ;; ' " ? \
+ (else (char->integer (string-ref x 0)))))
+
+ ;; TODO these u8-list->bytevector should depend on the
+ ;; encoding prefix of the string/char
+ (`(escape-sequence (octal-escape-sequence ,x))
+ (-> x (string->number 8) list u8-list->bytevector))
+
+ (`(escape-sequence (hexadecimal-escape-sequence ,x))
+ (-> x (string->number 16) list u8-list->bytevector))
+
+ (`(escape-sequence (universal-character-name ,x))
+ (let ((n (string->number x 16)))
+ (when (<= #xD800 x #xDFFF)
+ (error))
+ (when (and (< x #xA0)
+ (or (not (= x #x24))
+ (not (= x #x40))
+ (not (= x #x60))))
+ (error))
+ (-> n
+ integer->char
+ string
+ (string->bytevector (make-transcoder (utf-8-codec))))))
+ (_ (error))))
+
+(define (concat-bytevectors bvs)
+ (define target (make-bytevector (apply + (map bytevector-length bvs))))
+ (fold (lambda (bv offset)
+ (let ((len (bytevector-length bv)))
+ (bytevector-copy! bv 0 target offset len)
+ (+ offset len)))
+ 0
+ bvs)
+ target)
+
+(define (handle-string-fragments content)
+ (map
+ (lambda (x) (if (string? x)
+ (string->bytevector x (make-transcoder (utf-8-codec)))
+ (cpp-char->bytevector x)))
+ content))
;; 6.4 paragraph 2
;; Each preprocessing toket thas is converted to a token shall have the
@@ -28,23 +95,47 @@
;; (cond (expr check => proc) ...)
(cond ((string-token? cpp-token)
(lambda (a . _) a)
- => (lambda content
- (make-lexical-token 'string-literal #f content)))
+ => (lambda (encoding . content)
+ (make-lexical-token
+ 'string-literal #f
+ (concat-bytevectors
+ (append
+ ;; TODO this should depend on encoding
+ (handle-string-fragments content)
+ (list #vu8(0)))))))
((identifier-token? cpp-token)
=> (lambda (name)
- (if (member name keywords)
- (string->symbol name)
- (make-lexical-token 'identifier #f name))))
+ (let ((name (string->symbol name)))
+ (if (memv name keywords)
+ name
+ (make-lexical-token 'identifier #f name)))))
((pp-number? cpp-token)
=> (lambda (content)
;; TOOD should return an integer-constant or a floating-constant
- (make-lexical-token 'constant #f content)))
+ (make-lexical-token 'constant #f (parse-c-number content))))
((character-constant? cpp-token)
(lambda (a . _) a)
- => (lambda content (make-lexical-token 'constant #f content)))
+ => (lambda (encoding . content)
+ (make-lexical-token
+ 'constant #f
+ ;; TODO that to do with multi-byte characters?
+ ;; > 'ab' == 'a' << 8 | 'b' == 0x6162
+ ;; > '\x1234' == 0x1234
+ ;; GCC prints 34 for the following expression
+ ;; > printf("%x\n", '\x1234');
+ ;; but 6162 for this
+ ;; > printf("%x\n", 'ab');
+
+ ;; What about
+ ;; > 'a\x1234' == a << 16 | 'b' == 0x611234
+ (let ((bv (concat-bytevectors
+ ;; TODO this should depend on encoding
+ (handle-string-fragments content))))
+ ;; TODO maybe actually store multiple bytes from multi byte literals
+ (bytevector-u8-ref bv (1- (bytevector-length bv)))))))
((punctuator-token? cpp-token)
=> (lambda (s)
diff --git a/tests/test/cpp/to-token.scm b/tests/test/cpp/to-token.scm
new file mode 100644
index 00000000..b633ce12
--- /dev/null
+++ b/tests/test/cpp/to-token.scm
@@ -0,0 +1,65 @@
+(define-module (test cpp to-token)
+ :use-module (srfi srfi-64)
+ :use-module (srfi srfi-64 test-error)
+ :use-module (srfi srfi-88)
+ :use-module (c to-token)
+ :use-module ((system base lalr)
+ :select (lexical-token-category
+ lexical-token-value))
+ :use-module ((c lex2) :select (lex))
+ )
+
+(test-group "string tokens"
+ (let ((v (preprocessing-token->token (car (lex "\"Hello\"")))))
+ (test-equal 'string-literal (lexical-token-category v))
+ (test-equal #vu8(#x48 #x65 #x6C #x6C #x6F 0) (lexical-token-value v))
+ ;; TODO prefixes
+ ))
+
+(test-group "identifier tokens"
+ (let ((v (preprocessing-token->token (car (lex "hello")))))
+ (test-equal 'identifier (lexical-token-category v))
+ (test-equal 'hello (lexical-token-value v))))
+
+(test-group "keywords"
+ (test-equal 'auto (preprocessing-token->token (car (lex "auto")))))
+
+(test-group "numbers"
+ (test-group "Integers"
+ (test-group "Base-10"
+ (let ((v (preprocessing-token->token (car (lex "1")))))
+ (test-equal 'constant (lexical-token-category v))
+ (test-equal 1 (lexical-token-value v))))
+
+ (test-equal "Base-16"
+ 16 (lexical-token-value (preprocessing-token->token (car (lex "0x10")))))
+ (test-equal "Base-8"
+ 8 (lexical-token-value (preprocessing-token->token (car (lex "010")))))
+ (test-group "Suffixes"
+ 'TODO
+ ))
+
+ ;; TODO floats
+ )
+
+(test-group "character constants"
+ (let ((v (preprocessing-token->token (car (lex "'a'")))))
+ (test-equal 'constant (lexical-token-category v))
+ (test-equal (char->integer #\a) (lexical-token-value v)) )
+ (let ((v (preprocessing-token->token (car (lex "'ab'")))))
+ (test-equal 'constant (lexical-token-category v))
+ (test-equal (char->integer #\b) (lexical-token-value v)))
+ (let ((v (preprocessing-token->token (car (lex "'\\x41'")))))
+ (test-equal 'constant (lexical-token-category v))
+ (test-equal #x41 (lexical-token-value v)))
+ ;; (lex "'\\x4142'")
+ ;; (lex "'L\\x4142'")
+ )
+
+(test-group "punctuators"
+ (test-equal '+ (preprocessing-token->token (car (lex "+"))))
+ (test-equal 'lbrace (preprocessing-token->token (car (lex "{")))))
+
+(test-group "other"
+ (test-error 'cpp-error (preprocessing-token->token (car (lex " ")))))
+