1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
|
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <unistd.h>

#include <gumbo.h>
#include <libguile.h>
SCM handle_node (GumboNode* n) {
SCM dummytagname, scm_children = SCM_EOL;
GumboNode* ch;
switch (n->type) {
case GUMBO_NODE_DOCUMENT:
;
GumboDocument doc = n->v.document;
for (int i = 0; i < doc.children.length; i++) {
ch = doc.children.data[i];
scm_children = scm_cons (handle_node(ch), scm_children);
}
scm_children = scm_reverse_x(scm_children, SCM_EOL);
return scm_cons ( scm_from_utf8_symbol("*TOP*"),
scm_cons ( SCM_BOOL_F, scm_children));
case GUMBO_NODE_TEMPLATE:
case GUMBO_NODE_ELEMENT:
;
SCM attributes;
GumboElement el = n->v.element;
GumboVector attr = el.attributes;
attributes = scm_c_eval_string("(make-hash-table)");
GumboAttribute* a;
for (int i = 0; i < attr.length; i++) {
a = attr.data[i];
scm_hashq_set_x (attributes,
scm_from_utf8_symbol(a->name),
scm_from_utf8_stringn(a->value, strlen(a->value)));
}
for (int i = 0; i < el.children.length; i++) {
ch = el.children.data[i];
scm_children = scm_cons (handle_node(ch), scm_children);
}
scm_children = scm_reverse_x(scm_children, SCM_EOL);
return scm_cons (scm_from_utf8_symbol(gumbo_normalized_tagname(el.tag)),
scm_cons (attributes, scm_children));
case GUMBO_NODE_TEXT:
case GUMBO_NODE_WHITESPACE:
return scm_from_utf8_stringn(n->v.text.text, strlen(n->v.text.text));
case GUMBO_NODE_CDATA:
dummytagname = scm_from_utf8_symbol("cdata");
goto rettext;
case GUMBO_NODE_COMMENT:
dummytagname = scm_from_utf8_symbol("comment");
rettext:
return scm_cons (dummytagname,
scm_cons ( SCM_BOOL_F,
scm_from_utf8_stringn(n->v.text.text, strlen(n->v.text.text))));
}
return SCM_BOOL_F;
}
SCM_DEFINE (parse_html, "parse-html", 1, 0, 0,
            (SCM filename),
            "Parse the HTML file named @var{filename} and return its root\n"
            "element converted to Scheme data.  On failure a message is\n"
            "printed to standard error and one of the symbols\n"
            "@code{open-err}, @code{stat-err} or @code{mmap-err} is returned.")
{
  /* scm_to_utf8_string returns a malloc'd copy; it is only needed for
     open(), so release it right away.  */
  char *fname = scm_to_utf8_string (filename);
  int fd = open (fname, O_RDONLY);
  free (fname);
  if (fd == -1) {
    fprintf (stderr, "Error opening file\n");
    return scm_from_utf8_symbol ("open-err");
  }

  struct stat sb;
  if (fstat (fd, &sb) == -1) {
    fprintf (stderr, "Error reading file size\n");
    close (fd);
    return scm_from_utf8_symbol ("stat-err");
  }

  size_t len = (size_t) sb.st_size;
  const char *buf = "";
  void *map = MAP_FAILED;

  /* mmap rejects a zero-length mapping, so an empty file is parsed as the
     empty string instead of being reported as an error.  */
  if (len > 0) {
    map = mmap (NULL, len, PROT_READ, MAP_SHARED, fd, 0);
    /* mmap signals failure with MAP_FAILED, never with NULL.  */
    if (map == MAP_FAILED) {
      fprintf(stderr, "Error MMAPing file\n");
      close (fd);
      return scm_from_utf8_symbol("mmap-err");
    }
    buf = map;
  }
  /* The mapping stays valid after the descriptor is closed.  */
  close (fd);

  /* Pass the length explicitly: the mapped file is not guaranteed to be
     NUL-terminated, which plain gumbo_parse() relies on.  */
  GumboOutput *output =
      gumbo_parse_with_options (&kGumboDefaultOptions, buf, len);
  /* Conversion starts at output->root, not at the document node.  */
  SCM ret = handle_node (output->root);
  gumbo_destroy_output (&kGumboDefaultOptions, output);

  if (map != MAP_FAILED) {
    munmap (map, len);
  }
  return ret;
}
/* Initialisation entry point for this extension.  The body consists solely
   of the contents of "html.x", pulled in by the preprocessor; the include is
   suppressed while SCM_MAGIC_SNARFER is defined.
   NOTE(review): html.x is presumably the guile-snarf output that registers
   the SCM_DEFINE'd procedures of this file -- confirm against the build.  */
void
init_html (void)
{
#if !defined(SCM_MAGIC_SNARFER)
#include "html.x"
#endif
}
|