From 7352d1932e15b6da85774853e6953c0b390fd75b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Hugo=20H=C3=B6rnquist?= Date: Mon, 18 Mar 2019 14:57:14 +0100 Subject: Working. --- html.c | 101 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 101 insertions(+) create mode 100644 html.c (limited to 'html.c') diff --git a/html.c b/html.c new file mode 100644 index 0000000..79d8615 --- /dev/null +++ b/html.c @@ -0,0 +1,101 @@ +#include +#include + +#include +#include +#include +#include + +SCM handle_node (GumboNode* n) { + + SCM dummytagname, scm_children = SCM_EOL; + GumboNode* ch; + switch (n->type) { + case GUMBO_NODE_DOCUMENT: + ; + GumboDocument doc = n->v.document; + for (int i = 0; i < doc.children.length; i++) { + ch = doc.children.data[i]; + scm_children = scm_cons (handle_node(ch), scm_children); + } + scm_children = scm_reverse_x(scm_children, SCM_EOL); + + return scm_cons ( scm_from_utf8_symbol("*TOP*"), + scm_cons ( SCM_BOOL_F, scm_children)); + + case GUMBO_NODE_TEMPLATE: + case GUMBO_NODE_ELEMENT: + ; + SCM attributes; + + GumboElement el = n->v.element; + GumboVector attr = el.attributes; + + attributes = scm_c_eval_string("(make-hash-table)"); + + GumboAttribute* a; + for (int i = 0; i < attr.length; i++) { + a = attr.data[i]; + scm_hashq_set_x (attributes, + scm_from_utf8_symbol(a->name), + scm_from_utf8_stringn(a->value, strlen(a->value))); + } + + for (int i = 0; i < el.children.length; i++) { + ch = el.children.data[i]; + scm_children = scm_cons (handle_node(ch), scm_children); + } + scm_children = scm_reverse_x(scm_children, SCM_EOL); + + return scm_cons (scm_from_utf8_symbol(gumbo_normalized_tagname(el.tag)), + scm_cons (attributes, scm_children)); + + case GUMBO_NODE_TEXT: + case GUMBO_NODE_WHITESPACE: + return scm_from_utf8_stringn(n->v.text.text, strlen(n->v.text.text)); + + case GUMBO_NODE_CDATA: + dummytagname = scm_from_utf8_symbol("cdata"); + goto rettext; + case GUMBO_NODE_COMMENT: + dummytagname = scm_from_utf8_symbol("comment"); + +rettext: + return scm_cons (dummytagname, + scm_cons ( SCM_BOOL_F, + scm_from_utf8_stringn(n->v.text.text, strlen(n->v.text.text)))); + } + return SCM_BOOL_F; +} + +SCM_DEFINE (parse_html, "parse-html", 1, 0, 0, + (SCM filename), + "") +{ + char* fname = scm_to_utf8_string(filename); + int fd = open(fname, O_RDONLY); + struct stat sb; + fstat(fd, &sb); + const char* buf = mmap(NULL, sb.st_size, PROT_READ, MAP_SHARED, fd, 0); + + + if (buf == NULL) { + fprintf(stderr, "Error MMAPing file\n"); + return scm_from_utf8_symbol("mmap-err"); + } + + GumboOutput* output = gumbo_parse(buf); + + SCM ret = handle_node (output->root); + // GumboNode* p = n->parent; + + gumbo_destroy_output(&kGumboDefaultOptions, output); + + return ret; +} + +void init_html (void) { +#ifndef SCM_MAGIC_SNARFER +#include "html.x" +#endif +} -- cgit v1.2.3