aboutsummaryrefslogtreecommitdiff
path: root/html.c
diff options
context:
space:
mode:
Diffstat (limited to 'html.c')
-rw-r--r--html.c101
1 files changed, 101 insertions, 0 deletions
diff --git a/html.c b/html.c
new file mode 100644
index 0000000..79d8615
--- /dev/null
+++ b/html.c
@@ -0,0 +1,101 @@
+#include <gumbo.h>
+#include <libguile.h>
+
+#include <stdio.h>
+#include <stdlib.h>
+
+#include <sys/mman.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <unistd.h>
+
+/* Forward declaration: children_to_list and handle_node call each other. */
+SCM handle_node (GumboNode* n);
+
+/* Convert every node stored in `children` with handle_node and return the
+ * results as a Scheme list, in the same order as they appear in the vector. */
+static SCM children_to_list (const GumboVector* children) {
+    SCM list = SCM_EOL;
+
+    /* Walk backwards so that consing onto the front yields vector order
+     * without a separate reverse pass. */
+    for (unsigned int i = children->length; i-- > 0; ) {
+        list = scm_cons (handle_node(children->data[i]), list);
+    }
+    return list;
+}
+
+/* Build the pair structure (TAG #f . "text") used for CDATA and comment
+ * nodes.  Note that the text is the final cdr, not a one-element list. */
+static SCM tagged_text (const char* tag, const char* text) {
+    return scm_cons (scm_from_utf8_symbol(tag),
+                     scm_cons (SCM_BOOL_F, scm_from_utf8_string(text)));
+}
+
+/*
+ * Convert a Gumbo parse-tree node (and, recursively, its children) into a
+ * Scheme value:
+ *
+ *   document          -> (*TOP* #f child ...)
+ *   element/template  -> (tagname attributes child ...)
+ *                        where `attributes` is a hash table mapping
+ *                        attribute-name symbols to value strings
+ *   text/whitespace   -> "text"
+ *   cdata             -> (cdata #f . "text")
+ *   comment           -> (comment #f . "text")
+ *
+ * Any other node type yields #f.  All strings are copied into Scheme
+ * objects, so the result does not point into the Gumbo tree.
+ *
+ * NOTE(review): the tag symbol comes from gumbo_normalized_tagname(); check
+ * what it returns for tags Gumbo does not recognise (GUMBO_TAG_UNKNOWN).
+ */
+SCM handle_node (GumboNode* n) {
+    switch (n->type) {
+    case GUMBO_NODE_DOCUMENT:
+        return scm_cons (scm_from_utf8_symbol("*TOP*"),
+                         scm_cons (SCM_BOOL_F,
+                                   children_to_list(&n->v.document.children)));
+
+    case GUMBO_NODE_TEMPLATE:
+    case GUMBO_NODE_ELEMENT: {
+        const GumboElement* el = &n->v.element;
+
+        /* Create the table through the C API rather than evaluating a
+         * Scheme string, which would depend on the current module. */
+        SCM attributes = scm_c_make_hash_table(0);
+
+        /* GumboVector.length is unsigned; use a matching index type. */
+        for (unsigned int i = 0; i < el->attributes.length; i++) {
+            const GumboAttribute* a = el->attributes.data[i];
+            scm_hashq_set_x (attributes,
+                             scm_from_utf8_symbol(a->name),
+                             scm_from_utf8_string(a->value));
+        }
+
+        return scm_cons (scm_from_utf8_symbol(gumbo_normalized_tagname(el->tag)),
+                         scm_cons (attributes, children_to_list(&el->children)));
+    }
+
+    case GUMBO_NODE_TEXT:
+    case GUMBO_NODE_WHITESPACE:
+        return scm_from_utf8_string(n->v.text.text);
+
+    case GUMBO_NODE_CDATA:
+        return tagged_text("cdata", n->v.text.text);
+
+    case GUMBO_NODE_COMMENT:
+        return tagged_text("comment", n->v.text.text);
+
+    default:
+        break;
+    }
+    return SCM_BOOL_F;
+}
+
+/*
+ * Scheme procedure `parse-html`: read the file named by FILENAME, parse it
+ * with Gumbo and return the tree produced by handle_node for the root
+ * element.
+ *
+ * On failure a message is printed to stderr and one of the symbols
+ * `open-err`, `stat-err` or `mmap-err` is returned instead of a tree.
+ */
+SCM_DEFINE (parse_html, "parse-html", 1, 0, 0,
+    (SCM filename),
+    "")
+{
+    /* scm_to_utf8_string hands back a malloc'd copy that we must free. */
+    char* fname = scm_to_utf8_string(filename);
+    int fd = open(fname, O_RDONLY);
+    free(fname);
+    if (fd == -1) {
+        fprintf(stderr, "Error opening file\n");
+        return scm_from_utf8_symbol("open-err");
+    }
+
+    struct stat sb;
+    if (fstat(fd, &sb) == -1) {
+        fprintf(stderr, "Error stat-ing file\n");
+        close(fd);
+        return scm_from_utf8_symbol("stat-err");
+    }
+
+    size_t len = (size_t) sb.st_size;
+    void* map = MAP_FAILED;
+    const char* buf = "";
+
+    /* mmap rejects a zero length, so an empty file is parsed as an empty
+     * buffer without mapping anything. */
+    if (len > 0) {
+        map = mmap(NULL, len, PROT_READ, MAP_SHARED, fd, 0);
+        /* mmap signals failure with MAP_FAILED, not NULL. */
+        if (map == MAP_FAILED) {
+            fprintf(stderr, "Error MMAPing file\n");
+            close(fd);
+            return scm_from_utf8_symbol("mmap-err");
+        }
+        buf = map;
+    }
+
+    /* The mapping remains valid after the descriptor is closed. */
+    close(fd);
+
+    /* The mapped file is not NUL-terminated, so pass the length explicitly
+     * instead of letting the parser look for a terminator. */
+    GumboOutput* output =
+        gumbo_parse_with_options(&kGumboDefaultOptions, buf, len);
+
+    /* handle_node copies all text into Scheme objects, so the Gumbo tree
+     * and the mapping can be released once it returns. */
+    SCM ret = handle_node (output->root);
+
+    gumbo_destroy_output(&kGumboDefaultOptions, output);
+    if (map != MAP_FAILED) {
+        munmap(map, len);
+    }
+
+    return ret;
+}
+
+/* Module initialiser: pulls in the snarfed registration code from "html.x".
+ * The include is skipped while the snarfer itself is scanning this file
+ * (SCM_MAGIC_SNARFER defined). */
+void init_html (void)
+{
+#if !defined(SCM_MAGIC_SNARFER)
+#include "html.x"
+#endif
+}