aboutsummaryrefslogtreecommitdiff
path: root/module/c/unlex.scm
blob: 1f8ba0a1d4ef8394a6dfd905f92adc6a2471de22 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
(define-module (c unlex)
  :use-module (hnh util type)
  :use-module (ice-9 match)
  :use-module (c lex2)
  :use-module (c cpp-types)
  :use-module (c cpp-util)
  :use-module ((texinfo string-utils) :select (escape-special-chars))
  :export (unlex
           unlex-aggressive
           stringify-token
           stringify-tokens))

(define (unlex tokens)
  (typecheck tokens (list-of lexeme?))
  (string-concatenate
   (map (lambda (x) (cond (x preprocessing-token? => stringify-token)
                     ((whitespace-token? x) (lexeme-body x))
                     ((other-token? x) (lexeme-body x))))
        tokens)))

;; takes a list of preprocessing-token's, and return a "source" string
(define (unlex-aggressive tokens)
  (typecheck tokens (list-of lexeme?))
  (string-concatenate
   (map (lambda (x)
          (cond ((preprocessing-token? x) (stringify-token x))
                ((whitespace-token? x) " ")
                ((other-token? x) (lexeme-body x))))
        (squeeze-whitespace tokens))))

(define (stringify-escape-sequence sub-token)
  (match sub-token
    (`(simple-escape-sequence ,x)
     (format #f "\\~a" x))
    (`(octal-escape-sequence ,x)
     (format #f "\\~a" x))
    (`(hexadecimal-escape-sequence ,x)
     (format #f "\\x~a" x))
    (`(universal-character-name ,x)
     (case (string-length x)
       ((4) (format #f "\\u~a" x))
       ((8) (format #f "\\U~a" x))))))

(define (stringify-string-tokens fragments)
  (with-output-to-string
    (lambda ()
      (display #\")
      (for-each (match-lambda
                  (`(escape-sequence ,x)
                   (display (stringify-escape-sequence x)))
                  ;; Backslash in source strings is usually encoded by an
                  ;; 'escape-sequence, but literal backslashes can be in
                  ;; "regular" string fragments as result of the stringification
                  ;; operator (#).
                  (s (display (escape-special-chars s "\"\\" #\\))))
                fragments)
      (display #\"))))

;; Returns the "source" of the token, as a preprocessing string literal token
(define (stringify-token preprocessing-token)
  (match (lexeme-body preprocessing-token)
    (('string-literal `(encoding-prefix ,prefix) parts ...)
     (stringify-string-tokens parts))
    (('string-literal parts ...)
     (stringify-string-tokens parts))
    (`(header-name (q-string ,s))
     (format #f "~s" s))
    (`(header-name (h-string ,s))
     (format #f "<~a>" s))
    (`(identifier ,id) id)
    (`(pp-number ,n) n)
    (`(character-constant ,c)
     (format #f "'~a'" c))
    (`(punctuator ,p) p)))

;; takes a token list, and return a single string literal token
(define (stringify-tokens tokens)
  (lexeme type: 'preprocessing-token
          body: `(string-literal ,(unlex-aggressive tokens))))