From 7352d1932e15b6da85774853e6953c0b390fd75b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Hugo=20H=C3=B6rnquist?= Date: Mon, 18 Mar 2019 14:57:14 +0100 Subject: Working. --- .gitignore | 5 ++ 100-2.html | 25 ---------- 100.html | 46 ------------------ Main.hs | 8 ---- Makefile | 27 +++++++++++ README | 4 ++ fmt-stack.scm | 55 ++++++++++++++++++++++ html.c | 101 ++++++++++++++++++++++++++++++++++++++++ html.scm | 19 ++++++++ json.c | 112 ++++++++++++++++++++++++++++++++++++++++++++ json.scm | 8 ++++ main.scm | 147 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ monad | 2 +- script.scm | 136 ----------------------------------------------------- texttv.py | 24 ---------- ttv | 10 ++++ wrapper.sh | 2 - 17 files changed, 489 insertions(+), 242 deletions(-) create mode 100644 .gitignore delete mode 100644 100-2.html delete mode 100644 100.html delete mode 100644 Main.hs create mode 100644 Makefile create mode 100644 README create mode 100644 fmt-stack.scm create mode 100644 html.c create mode 100644 html.scm create mode 100644 json.c create mode 100644 json.scm create mode 100755 main.scm delete mode 100755 script.scm delete mode 100755 texttv.py create mode 100755 ttv delete mode 100755 wrapper.sh diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..5abbd84 --- /dev/null +++ b/.gitignore @@ -0,0 +1,5 @@ +.last-updated +texttv.json +cache/ +html +json diff --git a/100-2.html b/100-2.html deleted file mode 100644 index 8871c2d..0000000 --- a/100-2.html +++ /dev/null @@ -1,25 +0,0 @@ -
100 SVT Text Fredag 09 nov 2018 - - - - - -

Ätstörningar bland ungdomar fördubblas

- Allt fler i åldern 10-14 år söker vård - 106 - - Man död efter skjutning - i Uppsala i går kväll - 108 - -

Kalifornien-stad förstörd i skogsbrand

- Räddningstjänsten: Staden helt ödelagd - 130 - - Österrikisk överste spionmisstänkt 136 - - - - -

Inrikes 101 Utrikes 104 Innehåll 700

-
diff --git a/100.html b/100.html deleted file mode 100644 index defc465..0000000 --- a/100.html +++ /dev/null @@ -1,46 +0,0 @@ - - -
100 SVT Text Fredag 09 nov 2018 - - - - - -

Ätstörningar bland ungdomar fördubblas

- Allt fler i åldern 10-14 år söker vård - 106 - - Man död efter skjutning - i Uppsala i går kväll - 108 - -

Kalifornien-stad förstörd i skogsbrand

- Räddningstjänsten: Staden helt ödelagd - 130 - - Österrikisk överste spionmisstänkt 136 - - - - -

Inrikes 101 Utrikes 104 Innehåll 700

-
diff --git a/Main.hs b/Main.hs deleted file mode 100644 index c0e4ed0..0000000 --- a/Main.hs +++ /dev/null @@ -1,8 +0,0 @@ -import Text.ParserCombinators.Parsec - -docParser :: GenParser Char () Doc - -parseLine :: - -main :: IO () -main = putStrLn "HaHa" diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..8f867c9 --- /dev/null +++ b/Makefile @@ -0,0 +1,27 @@ +CFLAGS := -fPIC $(shell guile-config compile) \ + $(shell pkg-config --cflags gumbo) \ + -I/usr/local/include/ -Wall +LFLAGS := $(shell guile-config link) \ + $(shell pkg-config --libs gumbo) \ + -L/usr/local/lib -lcjson +CC := gcc + +.PHONY: all clean + +all: libguile-json.so libguile-html.so + +%: %.oo + $(CC) -o $@ $< $(LFLAGS) + +%.x: %.c + guile-snarf -o $@ $< $(CFLAGS) + +libguile-%.so: %.oo + $(CC) -shared -o $@ $< $(LFLAGS) + +%.oo: %.c %.x + $(CC) -c -o $@ $(CFLAGS) $< + +clean: + -rm *.oo + -rm libguile-*.so diff --git a/README b/README new file mode 100644 index 0000000..b744e05 --- /dev/null +++ b/README @@ -0,0 +1,4 @@ +Dependencies +------------ +- cJSON +- Gumbo diff --git a/fmt-stack.scm b/fmt-stack.scm new file mode 100644 index 0000000..ced6be3 --- /dev/null +++ b/fmt-stack.scm @@ -0,0 +1,55 @@ +(define-module (fmt-stack) + #:export (get-attr + set-fg set-bg set-style + make-fmt-frame empty-fmt-frame + fmt-frame->ansi-esc) + + #:use-module (control monad) + #:use-module (control monad state) + + #:use-module (data optional) + + #:use-module (srfi srfi-1) + #:use-module (srfi srfi-9 gnu)) + +(define-immutable-record-type fmt-frame + (make-fmt-frame style fg bg) + fmt-frame? + (style get-style set-style) + (fg get-fg set-fg) + (bg get-bg set-bg)) + +(define (empty-fmt-frame) + (make-fmt-frame #f #f #f)) + +(define (fmt-frame->ansi-esc frame) + (string-append + "\x1b[m" + (case (get-fg frame) + ((B) "\x1b[0;34m") + ((Y) "\x1b[0;33m") + ((C) "\x1b[0;36m") + (else "")) + + (case (get-bg frame) + ((B) "\x1b[44m") + ((Y) "\x1b[43m") + ((C) "\x1b[46m") + (else "")) + + (case (get-style frame) + ((underline) "\x1b[4m") + ((italic) "\x1b[3m") + ((bold) "\x1b[1m") + (else "")))) + +(define (get-attr) + (do stack <- (get) + (return-state + (fmt-frame->ansi-esc + (make-fmt-frame + (get-style (find get-style stack)) + (get-fg (find get-fg stack)) + (get-bg (find get-bg stack))))))) + + diff --git a/html.c b/html.c new file mode 100644 index 0000000..79d8615 --- /dev/null +++ b/html.c @@ -0,0 +1,101 @@ +#include +#include + +#include +#include +#include +#include + +SCM handle_node (GumboNode* n) { + + SCM dummytagname, scm_children = SCM_EOL; + GumboNode* ch; + switch (n->type) { + case GUMBO_NODE_DOCUMENT: + ; + GumboDocument doc = n->v.document; + for (int i = 0; i < doc.children.length; i++) { + ch = doc.children.data[i]; + scm_children = scm_cons (handle_node(ch), scm_children); + } + scm_children = scm_reverse_x(scm_children, SCM_EOL); + + return scm_cons ( scm_from_utf8_symbol("*TOP*"), + scm_cons ( SCM_BOOL_F, scm_children)); + + case GUMBO_NODE_TEMPLATE: + case GUMBO_NODE_ELEMENT: + ; + SCM attributes; + + GumboElement el = n->v.element; + GumboVector attr = el.attributes; + + attributes = scm_c_eval_string("(make-hash-table)"); + + GumboAttribute* a; + for (int i = 0; i < attr.length; i++) { + a = attr.data[i]; + scm_hashq_set_x (attributes, + scm_from_utf8_symbol(a->name), + scm_from_utf8_stringn(a->value, strlen(a->value))); + } + + for (int i = 0; i < el.children.length; i++) { + ch = el.children.data[i]; + scm_children = scm_cons (handle_node(ch), scm_children); + } + scm_children = scm_reverse_x(scm_children, SCM_EOL); + + return scm_cons (scm_from_utf8_symbol(gumbo_normalized_tagname(el.tag)), + scm_cons (attributes, scm_children)); + + case GUMBO_NODE_TEXT: + case GUMBO_NODE_WHITESPACE: + return scm_from_utf8_stringn(n->v.text.text, strlen(n->v.text.text)); + + case GUMBO_NODE_CDATA: + dummytagname = scm_from_utf8_symbol("cdata"); + goto rettext; + case GUMBO_NODE_COMMENT: + dummytagname = scm_from_utf8_symbol("comment"); + +rettext: + return scm_cons (dummytagname, + scm_cons ( SCM_BOOL_F, + scm_from_utf8_stringn(n->v.text.text, strlen(n->v.text.text)))); + } + return SCM_BOOL_F; +} + +SCM_DEFINE (parse_html, "parse-html", 1, 0, 0, + (SCM filename), + "") +{ + char* fname = scm_to_utf8_string(filename); + int fd = open(fname, O_RDONLY); + struct stat sb; + fstat(fd, &sb); + const char* buf = mmap(NULL, sb.st_size, PROT_READ, MAP_SHARED, fd, 0); + + + if (buf == NULL) { + fprintf(stderr, "Error MMAPing file\n"); + return scm_from_utf8_symbol("mmap-err"); + } + + GumboOutput* output = gumbo_parse(buf); + + SCM ret = handle_node (output->root); + // GumboNode* p = n->parent; + + gumbo_destroy_output(&kGumboDefaultOptions, output); + + return ret; +} + +void init_html (void) { +#ifndef SCM_MAGIC_SNARFER +#include "html.x" +#endif +} diff --git a/html.scm b/html.scm new file mode 100644 index 0000000..90dd039 --- /dev/null +++ b/html.scm @@ -0,0 +1,19 @@ +(define-module (html) + #:export (parse-html) + #:re-export (write) + #:use-module (oop goops) + ) + +;; (setenv "LD_LIBRARY_PATH" (dirname (current-filename))) + +;; exposes the parse-html symbol +(load-extension "libguile-html" "init_html") + +(define (fmt-hash-table t) + `(@ ,@(hash-map->list cons t))) + +(define-method (write (t ) port) + (display (fmt-hash-table t) port)) + +(define-method (display (t ) port) + (display (fmt-hash-table t) port)) diff --git a/json.c b/json.c new file mode 100644 index 0000000..9504e35 --- /dev/null +++ b/json.c @@ -0,0 +1,112 @@ +#include +#include + +#include +#include +#include +#include + +static SCM json_type; + +static inline void assert_json (SCM item) + { scm_assert_foreign_object_type (json_type, item); } +static inline cJSON* to_json (SCM item) + { return scm_foreign_object_ref (item, 0); } +static inline SCM from_json (cJSON* item) + { return scm_make_foreign_object_1 (json_type, item); } + +SCM_DEFINE (parse_json, "parse-json", 1, 0, 0, + (SCM filename), + "") +{ + char* fname = scm_to_utf8_stringn (filename, NULL); + + int fd = open(fname, O_RDONLY); + struct stat sb; + fstat(fd, &sb); + const char* buf = mmap(NULL, sb.st_size, PROT_READ, MAP_SHARED, fd, 0); + + return from_json (cJSON_Parse(buf)); +} + +SCM json_to_scheme (cJSON* j) { + cJSON* node; + switch (j->type) { + case cJSON_Invalid: + return scm_from_utf8_symbol("invalid"); + + case cJSON_False: + return SCM_BOOL_F; + + case cJSON_True: + return SCM_BOOL_T; + + case cJSON_NULL: + return SCM_EOL; + + case cJSON_Number: + return scm_values(scm_list_2( + scm_from_int(j->valueint), + scm_from_double(j->valuedouble))); + + case cJSON_String: + return scm_from_utf8_string(j->valuestring); + + case cJSON_Array: + node = j->child; + + SCM list = SCM_EOL; + while (node != NULL) { + list = scm_cons(json_to_scheme(node), list); + node = node->next; + } + return scm_vector(scm_reverse_x (list, SCM_EOL)); + + case cJSON_Object: + node = j->child; + SCM table = scm_c_eval_string ("(make-hash-table)"); + while (node->next != NULL) { + scm_hashq_set_x (table, + scm_from_utf8_symbol(node->string), + json_to_scheme(node)); + node = node->next; + } + return table; + + + case cJSON_Raw: + return scm_from_utf8_symbol("raw"); + } + + return scm_from_utf8_symbol("nocase"); +} + +SCM_DEFINE(json_to_scheme_, "json->scm", 1, 0, 0, + (SCM json), + "") +{ + assert_json(json); + cJSON* j = to_json(json); + + return json_to_scheme (j); +} + +static void finilize_json (SCM json) { + cJSON* j = to_json(json); + cJSON_Delete (j); +} + +static void init_json_type (void) { + SCM name = scm_from_utf8_symbol("json"); + SCM slot = scm_list_1(scm_from_utf8_symbol("data")); + scm_t_struct_finalize finilizer = finilize_json;; + json_type = scm_make_foreign_object_type (name, slot, finilizer); +} + +void init_json (void) { + init_json_type (); + +#ifndef SCM_MAGIC_SNARFER +#include "json.x" +#endif +} diff --git a/json.scm b/json.scm new file mode 100644 index 0000000..bb33e5d --- /dev/null +++ b/json.scm @@ -0,0 +1,8 @@ +(define-module (json) + #:export (get-json parse-json json->scm)) +;; (setenv "LD_LIBRARY_PATH" (string-append "/usr/local/lib:" (getcwd))) + +(load-extension "libguile-json" "init_json") + +(define (get-json filename) + (json->scm (parse-json filename))) diff --git a/main.scm b/main.scm new file mode 100755 index 0000000..fd4489e --- /dev/null +++ b/main.scm @@ -0,0 +1,147 @@ +#!/usr/bin/guile \ +-q -e main -s +!# + +(add-to-load-path (dirname (current-filename))) +(add-to-load-path (string-append (dirname (current-filename)) "/monad/")) +(add-to-load-path "/home/hugo/lib/guile") + +(add-to-load-path "/home/hugo/code/calparse") ; For (util), move that to a library + +(setenv "LD_LIBRARY_PATH" (dirname (current-filename))) + +(use-modules #; (sxml simple) + ;; (sxml match) + + (srfi srfi-1) + (srfi srfi-26) + (srfi srfi-43) ; Vector iteration + + ;; (ice-9 rdelim) + (ice-9 regex) + (ice-9 popen) + + (macros arrow) + (util) + + (control monad) + (control monad state) + (data stack) + + (fmt-stack) + (html) + (json)) + +(define-macro (regex-case str . cases) + `(cond + ,@(map (lambda (case) + (let ((pattern (car case)) + (rest (cdr case))) + (if (eq? pattern 'else) + `(else ,@rest) + `((string-match ,pattern ,str) ,@rest)))) + cases))) + +(define (substr-1 match) + (match:substring match 1)) + +(define (class-handlers class) + (fold (lambda (cl obj) + (regex-case + cl + ("^DH$" (set-style obj 'bold)) + ("^bg([BYC])$" => (compose (cut set-bg obj <>) string->symbol substr-1)) + ("^([BYC])$" => (compose (cut set-fg obj <>) string->symbol substr-1)) + (else obj))) + (empty-fmt-frame) + (string-split class #\space))) + +;;; TODO every push and pop should emit current ANSI-escape after it has run. + +(use-modules (ice-9 match)) + +;;; TODO Every clause should return a string, in the state context of a state. + +;; +-- State storage, "mutable" +;; | +- Return value, static +;; V V +;; sxml → State +(define (fmt-tag tag) + (match tag + + [('a _ body ...) + (do fmt-before <- (get-attr) + (push (make-fmt-frame 'underline 'B #f)) + fmt-with <- (get-attr) + (pop) + (return-state + (string-append fmt-with (car body) fmt-before)))] + + [((or 'h1 'span) attrs nodes ...) + (do let class = (hashq-ref attrs 'class) + fmt-before <- (get-attr) + (push (class-handlers class)) + fmt-with <- (get-attr) + ret <- (fmap (cut string-append fmt-with <> fmt-before) + (fmap string-concatenate (sequence (map fmt-tag nodes)))) + (pop) + (return-state ret))] + + ;; Default rule, since the above case requires a class list + [(tag _ node nodes ...) + (fmap string-concatenate (sequence (map fmt-tag (cons node nodes))))] + + ;; Just ignore tags without children + [(tag _) (return-state "")] + + [(? string? str) (return-state str)] + + [default (return-state (format #f "[|~a|]" default))])) + +(define (parse-html-string str) + (let ((fname (tmpnam))) + (with-output-to-file fname + (lambda () (display str))) + (parse-html fname))) + +(define (text-for-num j int) + (let* ((table (vector-ref j (- int 100))) + (arr (hashq-ref table 'content)) + (text (array-ref arr 0))) + text)) + +(define (max-date vect) + (vector-fold (lambda (i accum el) (max accum (hashq-ref el 'date_updated_unix))) + 0 vect)) + +(define (main args) + (let* (((self filename pagestr) args) + (page (string->number pagestr))) + + (let ((last-updated (call-with-input-file ".last-updated" read))) + (when (> (- (current-time) last-updated) 10000) + (display "Downloading new data, please stand by...") + (system* "curl" + "-s" "http://api.texttv.nu/api/get/100-999?app=hugonikanor" + "-o" filename) + + (let* ((json (get-json filename)) + (update (max-date json))) + + (with-output-to-file ".last-updated" (lambda () (display update))) + + (vector-for-each + (lambda (i el) + (let ((num (hashq-ref el 'num)) + (text (vector-ref (hashq-ref el 'content) 0))) + (with-output-to-file (format #f "cache/~a.ansi" num) + (lambda () (-> (parse-html-string text) + fmt-tag + (run-state (list (make-fmt-frame "" "" ""))) + car display))))) + json))) + + (let ((p1 (format #f "cache/~a.ansi" page)) + (p2 (format #f "cache/~a.ansi" (1+ page)))) + (system* "paste" p1 p2)) + ))) diff --git a/monad b/monad index f122520..a4a7edd 160000 --- a/monad +++ b/monad @@ -1 +1 @@ -Subproject commit f1225201c9ded1078ef1f98fbf4969a8480d3b38 +Subproject commit a4a7edd487d4d2207829462d06c40578dc757337 diff --git a/script.scm b/script.scm deleted file mode 100755 index b43428d..0000000 --- a/script.scm +++ /dev/null @@ -1,136 +0,0 @@ -#!/usr/bin/guile \ --e main -s -!# - -(use-modules (sxml simple) - (sxml match) - (ice-9 match) - (srfi srfi-1)) - -(define (setmode . args) - "Sets display mode" - (with-output-to-string - (lambda () - (for-each display - `(#\escape #\[ ,@args #\m))))) - - -(define* (deffunc name idx - #:optional (func-pre '#{}#) (mode-pre "")) - (let ((str (gensym))) - `(define (,(symbol-append func-pre name) . ,str) - (string-append - (setmode ,mode-pre ,idx) - #; (string #\escape #\[ #\K) - (string-concatenate ,str) - (setmode 0))))) - -(define-macro (create-modes modes) - `(begin - ,@(map deffunc - modes - (iota (length modes))))) - -(define-macro (create-colors modes) - `(begin - ,@(apply append - (map (lambda (itm idx) - (list (deffunc itm idx 'fg- "0;3") - (deffunc itm idx 'bg- "0;4"))) - modes - (iota (length modes)))))) - -(create-modes (off bold dim slant underline)) -(create-colors (black red green yellow blue purple cyan white)) - -(define classmap - (match-lambda ("B" fg-blue) - ("bgB" bg-blue) - ("Y" fg-yellow) - ("C" fg-cyan) - ("DH" bold) - (_ identity))) - -(define (push item stack) (cons item stack)) -(define (pop stack) (unless (null? stack) - (car+cdr stack))) -(define (peek stack) (if (null? stack) '() (car stack))) - -(define (class-handlers class-str) - (fold compose identity - (map classmap (string-split class-str #\space)))) - -(define (fmt-sub nodes) - (string-concatenate (map fmt-tag nodes))) - -(define (fmt-tag tag) - (sxml-match tag - [(a #; (@ (class ,class)) ,text) - (underline text)] - - [(h1 (@ (class ,class)) ,nodes ...) - ((class-handlers class) - (bold (fmt-sub nodes)))] - - [(span (@ (class ,class)) ,nodes ...) - ((class-handlers class) - (fmt-sub nodes))] - - [,str (guard (string? str)) str] - - [,default (format #f "[|~a|]" default)] - )) - -;;; -;;; -;;; - -(define (fmt-tag tag) - (sxml-match tag - [(a #; (@ (class ,class)) ,text) - (underline text)] - - [(h1 (@ (class ,class)) ,nodes ...) - ((class-handlers class) - (bold (fmt-sub nodes)))] - - [(span (@ (class ,class)) ,nodes ...) - ((class-handlers class) - (fmt-sub nodes))] - - [,str (guard (string? str)) str] - - [,default (format #f "[|~a|]" default)] - )) - -(define (parse-doc sxml) - (sxml-match sxml - [(*TOP* (div #; (span (@ (class "toprow")) ,top-row ...) - ,spans ...)) - spans - ])) - -'(span (@ (class "B")) - "A" - (span (@ (class "G")) - "B") - "C") - -;; => (blue A (green B) C) -;; => set-blue A set-green B unset-green/set-blue C unset-blue - -;; push blue : print-esc -;; print A -;; push green : print-esc -;; print B -;; pop : print-esc -;; print C -;; pop : print-esc - - - - -(define (main args) - (define d (call-with-input-file "100-2.html" xml->sxml)) - (display (string-concatenate (map fmt-tag (parse-doc d)))) - (newline)) diff --git a/texttv.py b/texttv.py deleted file mode 100755 index a6cf2b1..0000000 --- a/texttv.py +++ /dev/null @@ -1,24 +0,0 @@ -#!/usr/bin/env python2 - -from httplib2 import Http -from urllib import urlencode -import json -import re -import sys - -def cleanhtml(raw_html): - cleanr = re.compile('<.*?>') - cleantext = re.sub(cleanr, '', raw_html) - return cleantext - -if len(sys.argv) > 1: - page = sys.argv[1] -else: - page = '100' - -http = Http() -url = 'http://api.texttv.nu/api/get/' + page + '?app=python' -headers = {'Accept': 'application/json', 'Accept-Language': 'en-US,en'} -response, content = http.request(url, 'GET', headers=headers) -data = json.loads(content) -print cleanhtml(data[0]['content'][0]) diff --git a/ttv b/ttv new file mode 100755 index 0000000..6e527c6 --- /dev/null +++ b/ttv @@ -0,0 +1,10 @@ +#!/bin/bash + +pushd $(dirname $(realpath $0)) > /dev/null + +LD_LIBRARY_PATH=$PWD + +file=texttv.json +page=${1:- 100} + +./main.scm $file $page diff --git a/wrapper.sh b/wrapper.sh deleted file mode 100755 index 11d9236..0000000 --- a/wrapper.sh +++ /dev/null @@ -1,2 +0,0 @@ -curl http://api.texttv.nu/api/get/100?app=hugonikanor \ - | jq -j '.[0].content[0]' \ -- cgit v1.2.3