aboutsummaryrefslogtreecommitdiff
path: root/html.c
blob: 79d861563baab35d3cec53b26d2ef7ed2cfa9ab8 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <sys/types.h>
#include <sys/stat.h>
#include <sys/mman.h>
#include <fcntl.h>
#include <unistd.h>

#include <gumbo.h>
#include <libguile.h>

SCM handle_node (GumboNode* n) {

	SCM dummytagname, scm_children = SCM_EOL;
	GumboNode* ch;
	switch (n->type) {
		case GUMBO_NODE_DOCUMENT:
			;
			GumboDocument doc = n->v.document;
			for (int i = 0; i < doc.children.length; i++) {
				ch = doc.children.data[i];
				scm_children = scm_cons (handle_node(ch), scm_children);
			}
			scm_children = scm_reverse_x(scm_children, SCM_EOL);

			return scm_cons ( scm_from_utf8_symbol("*TOP*"),
					scm_cons ( SCM_BOOL_F, scm_children));

		case GUMBO_NODE_TEMPLATE:
		case GUMBO_NODE_ELEMENT:
			;
			SCM attributes;

			GumboElement el = n->v.element;
			GumboVector attr = el.attributes;

			attributes = scm_c_eval_string("(make-hash-table)");

			GumboAttribute* a;
			for (int i = 0; i < attr.length; i++) {
				a = attr.data[i];
				scm_hashq_set_x (attributes,
						scm_from_utf8_symbol(a->name),
						scm_from_utf8_stringn(a->value, strlen(a->value)));
			}

			for (int i = 0; i < el.children.length; i++) {
				ch = el.children.data[i];
				scm_children = scm_cons (handle_node(ch), scm_children);
			}
			scm_children = scm_reverse_x(scm_children, SCM_EOL);

			return scm_cons (scm_from_utf8_symbol(gumbo_normalized_tagname(el.tag)),
					scm_cons (attributes, scm_children));

		case GUMBO_NODE_TEXT:
		case GUMBO_NODE_WHITESPACE:
			return scm_from_utf8_stringn(n->v.text.text, strlen(n->v.text.text));

		case GUMBO_NODE_CDATA:
			dummytagname = scm_from_utf8_symbol("cdata");
			goto rettext;
		case GUMBO_NODE_COMMENT:
			dummytagname = scm_from_utf8_symbol("comment");

rettext:
			return scm_cons (dummytagname,
				scm_cons ( SCM_BOOL_F,
					scm_from_utf8_stringn(n->v.text.text, strlen(n->v.text.text))));
	}
	return SCM_BOOL_F;
}

/*
 * parse-html FILENAME
 *
 * Open the file named by the Scheme string FILENAME, map it into memory,
 * parse it with Gumbo and return the tree produced by handle_node() for
 * the parse root.
 *
 * On failure a message is written to stderr and a symbol is returned
 * instead of a tree: open-err (open failed), stat-err (fstat failed) or
 * mmap-err (mmap failed).  An empty file is parsed as an empty document.
 */
SCM_DEFINE (parse_html, "parse-html", 1, 0, 0,
		(SCM filename),
		"Parse the HTML file @var{filename} and return its document tree.")
{
	/* scm_to_utf8_string() returns a malloc'ed copy; it is only needed
	 * for open(), so release it right away. */
	char *fname = scm_to_utf8_string(filename);
	int fd = open(fname, O_RDONLY);
	free(fname);
	if (fd < 0) {
		fprintf(stderr, "Error opening file\n");
		return scm_from_utf8_symbol("open-err");
	}

	struct stat sb;
	if (fstat(fd, &sb) != 0) {
		fprintf(stderr, "Error reading file size\n");
		close(fd);
		return scm_from_utf8_symbol("stat-err");
	}

	size_t len = (size_t) sb.st_size;
	const char *buf = "";
	void *map = MAP_FAILED;

	/* mmap() rejects a zero length, so an empty file is parsed from the
	 * empty string instead of a mapping. */
	if (len > 0) {
		map = mmap(NULL, len, PROT_READ, MAP_SHARED, fd, 0);
		/* mmap() signals failure with MAP_FAILED, not NULL. */
		if (map == MAP_FAILED) {
			fprintf(stderr, "Error MMAPing file\n");
			close(fd);
			return scm_from_utf8_symbol("mmap-err");
		}
		buf = map;
	}
	/* The mapping stays valid after the descriptor is closed. */
	close(fd);

	/* The mapped file is not NUL-terminated, so pass the length
	 * explicitly rather than letting Gumbo call strlen() on it. */
	GumboOutput *output = gumbo_parse_with_options(&kGumboDefaultOptions, buf, len);

	/* handle_node() copies all text into Scheme objects, so the Gumbo
	 * output and the mapping can be released once it returns. */
	SCM ret = handle_node (output->root);

	gumbo_destroy_output(&kGumboDefaultOptions, output);
	if (map != MAP_FAILED) {
		munmap(map, len);
	}

	return ret;
}

/*
 * Initialisation entry point for this file.
 *
 * The body consists solely of the contents of "html.x", which is pulled
 * in unless SCM_MAGIC_SNARFER is defined (presumably html.x is the
 * snarfer-generated registration code for the SCM_DEFINE'd procedures
 * in this file -- confirm against the build rules).
 */
void
init_html (void)
{
#if !defined(SCM_MAGIC_SNARFER)
# include "html.x"
#endif
}