From 7b78cf68200118fac395592da3d78b22b4be0cd7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Hugo=20H=C3=B6rnquist?= Date: Wed, 13 Jul 2022 11:35:39 +0200 Subject: Add support for "other" in preprocessing-tokens. --- module/c/cpp-types.scm | 7 ++++++- module/c/lex2.scm | 16 ++++++++++++++-- module/c/unlex.scm | 6 ++++-- tests/test/cpp/lex2.scm | 12 ++++++++++++ 4 files changed, 36 insertions(+), 5 deletions(-) diff --git a/module/c/cpp-types.scm b/module/c/cpp-types.scm index 1df70594..82ebb922 100644 --- a/module/c/cpp-types.scm +++ b/module/c/cpp-types.scm @@ -5,8 +5,9 @@ :export (whitespace-token? comment-token? preprocessing-token? - newline-token? + other-token? placemaker-token? + newline-token? identifier-token? punctuator-token? number-token? @@ -27,6 +28,10 @@ (and (lexeme? x) (eq? 'preprocessing-token (lexeme-type x)))) +(define (other-token? x) + (and (lexeme? x) + (eq? 'other (lexeme-type x)))) + (define (placemaker-token? x) (and (lexeme? x) (eq? 'placemaker (lexeme-type x)))) diff --git a/module/c/lex2.scm b/module/c/lex2.scm index fcddcdc4..72f79f55 100644 --- a/module/c/lex2.scm +++ b/module/c/lex2.scm @@ -325,15 +325,25 @@ (define-peg-pattern comment all (or line-comment block-comment)) +(define-peg-pattern non-whitespace all + (and (not-followed-by whitespace) + peg-any)) + (define-peg-pattern preprocessing-tokens all (* (or whitespace comment - preprocessing-token))) + preprocessing-token + non-whitespace))) +;; comment could be merged with whitespace, but then unlex would have to know that + +;; other is the "each non-white-space character that cannot be one of the above" +;; clause from 6.4 p. 1 + (define-type (lexeme) - (type type: (memv '(whitespace comment preprocessing-token placemaker))) + (type type: (memv '(whitespace comment preprocessing-token other placemaker))) (body type: (or string? list?)) (noexpand type: (list-of string?) default: '())) @@ -343,6 +353,8 @@ (define (lex-output->lexeme-object x) (match x + (`(non-whitespace ,body) + (lexeme body: body type: 'other)) (`(whitespace ,body) (lexeme body: body type: 'whitespace )) (`(comment ,body) diff --git a/module/c/unlex.scm b/module/c/unlex.scm index e3d36f86..e714816d 100644 --- a/module/c/unlex.scm +++ b/module/c/unlex.scm @@ -13,7 +13,8 @@ (typecheck tokens (list-of lexeme?)) (string-concatenate (map (lambda (x) (cond (x preprocessing-token? => stringify-token) - ((whitespace-token? x) (lexeme-body x)))) + ((whitespace-token? x) (lexeme-body x)) + ((other-token? x) (lexeme-body x)))) tokens))) ;; takes a list of preprocessing-token's, and return a "source" string @@ -22,7 +23,8 @@ (string-concatenate (map (lambda (x) (cond ((preprocessing-token? x) (stringify-token x)) - ((whitespace-token? x) " "))) + ((whitespace-token? x) " ") + ((other-token? x) (lexeme-body x)))) (squeeze-whitespace tokens)))) (define (stringify-escape-sequence sub-token) diff --git a/tests/test/cpp/lex2.scm b/tests/test/cpp/lex2.scm index b80bcf37..47bb4a16 100644 --- a/tests/test/cpp/lex2.scm +++ b/tests/test/cpp/lex2.scm @@ -80,3 +80,15 @@ (lexeme type: 'whitespace body: "\n") (lexeme type: 'whitespace body: " ")) (lex " \n ")) + + +;; Refer to 6.4 p.1 for the syntax requirement +;; 6.10.9 p. 2 for the sample string +(test-equal "each non-white-space character that cannot be one of the above" + (list (lexeme type: 'preprocessing-token body: '(punctuator ".")) + (lexeme type: 'preprocessing-token body: '(punctuator ".")) + (lexeme type: 'other body: "\\") ; <- Interesting part + (lexeme type: 'preprocessing-token body: '(identifier "listing")) + (lexeme type: 'preprocessing-token body: '(punctuator ".")) + (lexeme type: 'preprocessing-token body: '(identifier "dir"))) + (lex "..\\listing.dir")) -- cgit v1.2.3