From 7b78cf68200118fac395592da3d78b22b4be0cd7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Hugo=20H=C3=B6rnquist?= <hugo@lysator.liu.se>
Date: Wed, 13 Jul 2022 11:35:39 +0200
Subject: Add support for "other" in preprocessing-tokens.

---
 module/c/cpp-types.scm  |  7 ++++++-
 module/c/lex2.scm       | 16 ++++++++++++++--
 module/c/unlex.scm      |  6 ++++--
 tests/test/cpp/lex2.scm | 12 ++++++++++++
 4 files changed, 36 insertions(+), 5 deletions(-)

diff --git a/module/c/cpp-types.scm b/module/c/cpp-types.scm
index 1df70594..82ebb922 100644
--- a/module/c/cpp-types.scm
+++ b/module/c/cpp-types.scm
@@ -5,8 +5,9 @@
   :export (whitespace-token?
            comment-token?
            preprocessing-token?
-           newline-token?
+           other-token?
            placemaker-token?
+           newline-token?
            identifier-token?
            punctuator-token?
            number-token?
@@ -27,6 +28,10 @@
   (and (lexeme? x)
        (eq? 'preprocessing-token (lexeme-type x))))
 
+(define (other-token? x)
+  (and (lexeme? x)
+       (eq? 'other (lexeme-type x))))
+
 (define (placemaker-token? x)
   (and (lexeme? x)
        (eq? 'placemaker (lexeme-type x))))
diff --git a/module/c/lex2.scm b/module/c/lex2.scm
index fcddcdc4..72f79f55 100644
--- a/module/c/lex2.scm
+++ b/module/c/lex2.scm
@@ -325,15 +325,25 @@
 (define-peg-pattern comment all
   (or line-comment block-comment))
 
+(define-peg-pattern non-whitespace all
+  (and (not-followed-by whitespace)
+       peg-any))
+
 (define-peg-pattern preprocessing-tokens all
   (* (or whitespace
          comment
-         preprocessing-token)))
+         preprocessing-token
+         non-whitespace)))
 
 
 
+;; comment could be merged with whitespace, but then unlex would have to know that
+
+;; other is the "each non-white-space character that cannot be one of the above"
+;; clause from 6.4 p. 1
+
 (define-type (lexeme)
-  (type type: (memv '(whitespace comment preprocessing-token placemaker)))
+  (type type: (memv '(whitespace comment preprocessing-token other placemaker)))
   (body type: (or string? list?))
   (noexpand type: (list-of string?)
             default: '()))
@@ -343,6 +353,8 @@
 
 (define (lex-output->lexeme-object x)
   (match x
+    (`(non-whitespace ,body)
+     (lexeme body: body type: 'other))
     (`(whitespace ,body)
      (lexeme body: body type: 'whitespace ))
     (`(comment ,body)
diff --git a/module/c/unlex.scm b/module/c/unlex.scm
index e3d36f86..e714816d 100644
--- a/module/c/unlex.scm
+++ b/module/c/unlex.scm
@@ -13,7 +13,8 @@
   (typecheck tokens (list-of lexeme?))
   (string-concatenate
    (map (lambda (x) (cond (x preprocessing-token? => stringify-token)
-                     ((whitespace-token? x) (lexeme-body x))))
+                     ((whitespace-token? x) (lexeme-body x))
+                     ((other-token? x) (lexeme-body x))))
         tokens)))
 
 ;; takes a list of preprocessing-token's, and return a "source" string
@@ -22,7 +23,8 @@
   (string-concatenate
    (map (lambda (x)
           (cond ((preprocessing-token? x) (stringify-token x))
-                ((whitespace-token? x) " ")))
+                ((whitespace-token? x) " ")
+                ((other-token? x) (lexeme-body x))))
         (squeeze-whitespace tokens))))
 
 (define (stringify-escape-sequence sub-token)
diff --git a/tests/test/cpp/lex2.scm b/tests/test/cpp/lex2.scm
index b80bcf37..47bb4a16 100644
--- a/tests/test/cpp/lex2.scm
+++ b/tests/test/cpp/lex2.scm
@@ -80,3 +80,15 @@
         (lexeme type: 'whitespace body: "\n")
         (lexeme type: 'whitespace body: " "))
   (lex "  \n "))
+
+
+;; Refer to 6.4 p.1 for the syntax requirement
+;; 6.10.9 p. 2 for the sample string
+(test-equal "each non-white-space character that cannot be one of the above"
+  (list (lexeme type: 'preprocessing-token body: '(punctuator "."))
+        (lexeme type: 'preprocessing-token body: '(punctuator "."))
+        (lexeme type: 'other body: "\\")  ; <- Interesting part
+        (lexeme type: 'preprocessing-token body: '(identifier "listing"))
+        (lexeme type: 'preprocessing-token body: '(punctuator "."))
+        (lexeme type: 'preprocessing-token body: '(identifier "dir")))
+  (lex "..\\listing.dir"))
-- 
cgit v1.2.3