aboutsummaryrefslogtreecommitdiff
path: root/module/c/unlex.scm
blob: 139686984e4788c8c04455b8283a78c08a0b6a50 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
;; Module (c unlex): turns lexed tokens back into source text.
;; Exports `unlex' and `unlex-aggressive' (token list -> string),
;; `stringify-token' (single preprocessing token -> string) and
;; `stringify-tokens' (token list -> one string-literal token).
(define-module (c unlex)
  :use-module (hnh util type)
  :use-module (ice-9 match)
  :use-module (c lex2)
  :use-module (c cpp-types)
  :use-module (c cpp-util)
  ;; escape-special-chars is used when re-quoting string literal fragments
  :use-module ((texinfo string-utils) :select (escape-special-chars))
  :export (unlex
           unlex-aggressive
           stringify-token
           stringify-tokens))

;; Reassemble TOKENS, a list of lexemes, into a single string.
;; Preprocessing tokens are rendered through `stringify-token'; whitespace
;; tokens and "other" tokens contribute their lexeme body unchanged.
(define (unlex tokens)
  (typecheck tokens (list-of lexeme?))
  (let ((token->string
         (lambda (token)
           (cond ((preprocessing-token? token) (stringify-token token))
                 ((or (whitespace-token? token)
                      (other-token? token))
                  (lexeme-body token))))))
    (string-concatenate (map token->string tokens))))

;; Render TOKENS, a list of lexemes, as a "source" string.
;; The list is first passed through `squeeze-whitespace'; every whitespace
;; token that remains is emitted as a single space, preprocessing tokens go
;; through `stringify-token', and other tokens contribute their lexeme body.
(define (unlex-aggressive tokens)
  (typecheck tokens (list-of lexeme?))
  (let ((render
         (lambda (token)
           ;; Clause order is significant if the predicates overlap.
           (cond ((preprocessing-token? token) (stringify-token token))
                 ((whitespace-token? token) " ")
                 ((other-token? token) (lexeme-body token))))))
    (string-concatenate
     (map render (squeeze-whitespace tokens)))))

;; Return the source spelling (a string beginning with a backslash) of
;; SUB-TOKEN, a two element list (KIND PAYLOAD) where KIND is one of
;; simple-escape-sequence, octal-escape-sequence,
;; hexadecimal-escape-sequence or universal-character-name.
;; A universal character name is written with \u when its payload is 4
;; characters long and with \U when it is 8 characters long; no other
;; length is handled.
(define (stringify-escape-sequence sub-token)
  (match sub-token
    ;; Simple and octal escapes are both just a backslash plus the payload.
    (((or 'simple-escape-sequence 'octal-escape-sequence) payload)
     (format #f "\\~a" payload))
    (('hexadecimal-escape-sequence digits)
     (format #f "\\x~a" digits))
    (('universal-character-name digits)
     (let ((len (string-length digits)))
       (cond ((= len 4) (format #f "\\u~a" digits))
             ((= len 8) (format #f "\\U~a" digits)))))))

;; Build the source spelling of a string literal from FRAGMENTS and return
;; it as a string wrapped in double quotes.
;; Each fragment is either (escape-sequence SUB-TOKEN), rendered by
;; `stringify-escape-sequence', or anything else, which is handed to
;; `escape-special-chars' so that double quotes and backslashes get a
;; leading backslash.
(define (stringify-string-tokens fragments)
  (call-with-output-string
    (lambda (port)
      (write-char #\" port)
      (for-each
       (lambda (fragment)
         (display
          (match fragment
            (('escape-sequence sequence)
             (stringify-escape-sequence sequence))
            ;; Backslashes in source strings normally arrive as
            ;; escape-sequence fragments, but the stringification
            ;; operator (#) can leave literal backslashes inside
            ;; ordinary fragments, so those are escaped here.
            (text
             (escape-special-chars text "\"\\" #\\)))
          port))
       fragments)
      (write-char #\" port))))

;; Return the "source" spelling of PREPROCESSING-TOKEN as a string.
;;
;; The lexeme body is dispatched on its head symbol:
;;   string-literal      quoted and re-escaped by `stringify-string-tokens'
;;   header-name         "name" for q-strings, <name> for h-strings
;;   identifier          the identifier itself
;;   pp-number           the number itself
;;   character-constant  optional prefix, then the first character between
;;                       single quotes
;;   punctuator          the punctuator itself
;; Any other body raises a `cpp-error'.
(define (stringify-token preprocessing-token)
  (match (lexeme-body preprocessing-token)
    ;; NOTE(review): the encoding prefix is matched but not emitted here,
    ;; unlike the character-constant case below -- confirm this is intended.
    (('string-literal `(encoding-prefix . ,prefix) parts ...)
     (stringify-string-tokens parts))

    ;; ~s writes the name in Scheme string syntax, i.e. inside double quotes.
    (`(header-name (q-string ,s))
     (format #f "~s" s))

    (`(header-name (h-string ,s))
     (format #f "<~a>" s))

    (`(identifier ,id) id)

    (`(pp-number ,n) n)

    (('character-constant `(character-prefix . ,prefix) c parts ...)
     (with-output-to-string
       (lambda ()
         ;; PREFIX is the (possibly empty) tail of the character-prefix form.
         (unless (null? prefix)
           (display (car prefix)))
         (display #\')
         (match c
           (`(escape-sequence (simple-escape-sequence ,x))
            (format #t "\\~a" x))
           (`(escape-sequence (octal-escape-sequence ,x))
            (format #t "\\~a" x))
           ;; This clause used to repeat the octal pattern, which made it
           ;; unreachable and sent hexadecimal escapes to the catch-all.
           (`(escape-sequence (hexadecimal-escape-sequence ,x))
            (format #t "\\x~a" x))
           ;; \u for 4 character names, \U for 8 character names.
           (`(escape-sequence (universal-character-name ,x))
            (format #t "\\~a~a"
                    (case (string-length x)
                      ((4) #\u) ((8) #\U))
                    x))
           ;; Anything that is not a recognised escape is printed as is.
           (_ (display c)))
         ;; TODO remaining parts
         (display #\'))))

    (`(punctuator ,p) p)
    (_ (scm-error 'cpp-error "stringify-token"
                  "No matching patterns for: ~s"
                  (list preprocessing-token) #f))))

;; Collapse TOKENS into a single string-literal preprocessing token.
;; The literal has an empty encoding prefix and one fragment: the string
;; produced by running `unlex-aggressive' over TOKENS.
(define (stringify-tokens tokens)
  (let ((source-text (unlex-aggressive tokens)))
    (lexeme type: 'preprocessing-token
            body: (list 'string-literal '(encoding-prefix) source-text))))