#!/usr/bin/env python3
"""Export one heading of a vimwiki file as plain text or LaTeX.

Every ``[[link]]`` found below the selected heading is replaced by a
reference to an attachment ("bilaga"), and the linked resource (mail,
file, image, pdf or another wiki page) is rendered and appended after the
main text.
"""

import re
import os
import pwd
import subprocess
from subprocess import PIPE
import hashlib
import os.path as path
from os.path import dirname
import email
from email.policy import default
from datetime import datetime
import argparse
import sys
import shutil
import tempfile

try:
    from dateutil import parser as dtparser
except ImportError:
    # dateutil is only needed to parse a free-form ``date ::`` option;
    # without it _parse_date falls back to ISO-8601 parsing.
    dtparser = None

# Directory that relative ``file:`` / ``local:`` links are resolved against.
# Set by main() to the directory of the input file.
path_base = '/'

# Number of the attachment currently being resolved; passed as
# ``--attach-prefix`` to recursive invocations by resolve_vimwiki().
global_bilaga = 0

# Section counters for txt mode, shared between main() and resolve_mail().
sectioncounter = 0
subsectioncounter = 0

self_prog = os.path.realpath(sys.argv[0])

# A definition-list line such as ``date :: 2020-01-02``.
_OPTION_RE = re.compile(r'^(\w+)\s*::\s*(.*)')

# A wiki heading line: indent, opening '=', title, closing '=', trailing blanks.
_HEADING_LINE_RE = re.compile(r'(?m)^( *)(=+)(.*?)(=+)( *)$')

# [[url#fragment|comment]] where fragment and comment are optional.
_LINK_RE = re.compile(r"\[\[([^|#\]]*)(?:#([^|\]]*))?(?:[|]([^\]]*))?\]\]")

_ANGLIFY_TABLE = str.maketrans({'å': 'a', 'ä': 'a', 'ö': 'o'})

_TEX_PREAMBLE = r'''
\PassOptionsToPackage{unicode}{hyperref}
\PassOptionsToPackage{hyphens}{url}
\documentclass[a4paper]{article}
\usepackage[T1]{fontenc}
\usepackage[utf8]{inputenc}
\usepackage[swedish]{babel}
\usepackage{verbatim}
\usepackage{fullpage}
\usepackage{pdfpages}
\usepackage{subfiles}
\usepackage{parskip}
\usepackage{fancyhdr}
\usepackage{hyperref}
\usepackage{soul}
\usepackage{graphics}
\usepackage{amsmath,amssymb}
\usepackage{lmodern}
\usepackage{iftex}
\usepackage{xcolor}
\providecommand{\tightlist}{\setlength{\itemsep}{0pt}\setlength{\parskip}{0pt}}
'''


def anglify(str):
    """Return *str* with lowercase å/ä/ö replaced by a/a/o."""
    return str.translate(_ANGLIFY_TABLE)


# ------------------------------------------------------------

def includepdf(mode, addr, full_file):
    """Return the markup that embeds the pdf *full_file* as a subsection.

    *addr* is the name shown to the reader.  Returns LaTeX for mode 'tex',
    a one-line reference for mode 'txt', and None for any other mode.
    """
    if mode == 'tex':
        pagecommand = '\\thispagestyle{fancy}'
        arg = ','.join([
            'frame',
            'pages={-}',
            f'pagecommand={{{pagecommand}}}',
            'width=\\textwidth',
        ])
        return (
            f'\\rhead{{Bilaga \\Alph{{section}}.\\arabic{{subsection}} {texify_filename(addr)} }}\n'
            '\\phantomsection\\stepcounter{subsection}\n'
            f'\\includepdf[{arg}]{{\\detokenize{{{full_file}}}}}\n'
        )
    elif mode == 'txt':
        return f"Se pdf {addr}\n"


def includeimg(mode, addr, full_file):
    """Return the markup that embeds the image *full_file* as a subsection.

    Returns LaTeX for mode 'tex', a one-line reference for mode 'txt', and
    None for any other mode.
    """
    if mode == 'tex':
        arg = 'width=\\textwidth'
        return f'\\phantomsection\\stepcounter{{subsection}}\n\\includegraphics[{arg}]{{{full_file}}}\n'
    elif mode == 'txt':
        return f"Se bild {addr}\n"


def shorten_mail(str):
    """Collapse each run of quoted ('>') lines in a mail body to '> [...]'.

    Empty lines are kept and do not end a quoted run; any other unquoted
    line does.
    """
    in_quote = False
    out = []
    for line in str.split('\n'):
        if not line:
            out.append('')
            continue
        if line.startswith('>'):
            if not in_quote:
                out.append('> [...]')
                in_quote = True
            continue
        in_quote = False
        out.append(line)
    return '\n'.join(out)


def texify_filename(str):
    """Escape underscores in *str* so it can be typeset by LaTeX."""
    return str.replace('_', '\\_')


def split_into_paragraphs(str):
    """Split *str* into paragraphs, each a list of consecutive non-empty lines.

    Paragraphs are separated by one or more empty lines.  The final
    paragraph is included even when the text does not end with an empty
    line.
    """
    paragraphs = []
    paragraph = []
    for line in str.split('\n'):
        if line:
            paragraph.append(line)
        elif paragraph:
            paragraphs.append(paragraph)
            paragraph = []
    if paragraph:
        # Text that ends without a trailing blank line still has a last
        # paragraph pending here.
        paragraphs.append(paragraph)
    return paragraphs


def parse_options_paragraph(paragraphs):
    """Parse a leading ``key :: value`` definition list into options.

    Lines at the start of the first paragraph that look like
    ``key :: value`` are consumed.  Returns ``(options, remaining)`` where
    *remaining* is a new list holding the unconsumed lines of the first
    paragraph (if any) followed by the other paragraphs.  The argument is
    not modified.
    """
    options = {}
    if not paragraphs:
        return options, []

    first, rest = paragraphs[0], list(paragraphs[1:])
    for i, line in enumerate(first):
        m = _OPTION_RE.match(line)
        if m is None:
            # Keep this line and everything after it as the first paragraph.
            return options, [first[i:]] + rest
        options[m[1]] = m[2]
    # The whole first paragraph was options; drop it.
    return options, rest


def resolve_mail(mode, source_file_name, addr, frag, comment):
    """Render the mail with message-id *addr*, located through ``mu find``.

    *frag* selects the rendering: 'attachments-only' skips headers and
    body, 'short' collapses quoted text in the body, anything else prints
    the body in full.  Pdf and image attachments are written to /tmp and
    included; other attachments are inlined as text.

    Raises FileNotFoundError when mu reports no file for the message-id.
    """
    global subsectioncounter

    outstr = ''
    msg_id = addr

    # Argument list instead of a shell string, so the message-id is never
    # interpreted by a shell.
    found = subprocess.run(
        ['mu', 'find', '-u', f'i:{msg_id}', '--fields', 'l'],
        stdout=PIPE).stdout.decode('UTF-8').strip()
    if not found:
        raise FileNotFoundError(f'mu found no mail with message-id {msg_id!r}')
    mailfile = found.split('\n')[0]

    with open(mailfile, 'rb') as f:
        msg = email.message_from_binary_file(f, policy=default)

    if frag != 'attachments-only':
        if mode == 'tex':
            outstr += '\\begin{verbatim}\n'

        for field in ['Date', 'From', 'To', 'Cc', 'Subject']:
            h = msg.get(field)
            if h:
                outstr += f'{field}: {h}\n'
        outstr += '\n'

        body = msg.get_body(preferencelist=('plain', 'html'))
        # get_body returns None for a mail without a text part.
        body_text = body.get_content() if body is not None else ''
        if frag == 'short':
            outstr += shorten_mail(body_text)
        else:
            outstr += body_text

        if mode == 'tex':
            outstr += '\\end{verbatim}\n'

    # TODO merge this logic with the resolve file logic below
    for part in msg.iter_attachments():
        content_type = part.get_content_type()
        filename = part.get_filename() or 'attachment'

        if content_type == 'application/pdf' or content_type.split('/')[0] == 'image':
            safe_filename = anglify(filename.replace('/', '_'))
            tmp_path = '/tmp/' + safe_filename
            with open(tmp_path, 'wb') as f:
                f.write(part.get_content())
            if content_type == 'application/pdf':
                outstr += includepdf(mode, safe_filename, tmp_path)
            else:
                outstr += includeimg(mode, safe_filename, tmp_path)
            continue

        if mode == 'tex':
            outstr += f'\\subsection{{{texify_filename(filename)}}}\n'
        elif mode == 'txt':
            subsectioncounter += 1
            outstr += f' {sectioncounter}.{subsectioncounter}. {filename}\n'

        content = part.get_content()
        if isinstance(content, bytes):
            content = content.decode('utf-8', errors='replace')

        if max(len(s) for s in content.split('\n')) > 100:
            # Long lines would overflow a verbatim block; emit them as
            # running text instead.
            outstr += content + '\n'
        else:
            if mode == 'tex':
                outstr += '\n\\begin{verbatim}'
            outstr += content + '\n'
            if mode == 'tex':
                outstr += '\\end{verbatim}'

    return outstr


def resolve_help(mode, source_file_name, addr, frag, comment):
    """Resolver for ``help:`` links; they produce no attachment text."""
    return ''


def resolve_file(mode, source_file_name, addr, frag, comment):
    """Render the file *addr*, resolved relative to ``path_base``.

    Pdf files and images (jpg, jpeg, png, gif) are included by reference,
    matched case-insensitively on the extension.  Any other file is read
    as text; in tex mode it becomes a section titled *comment* wrapped in
    a verbatim block.
    """
    extension = addr.split('.')[-1].lower()
    full_file = path.join(path_base, addr)

    if extension == 'pdf':
        s = ''
        if mode == 'tex':
            s += '\\stepcounter{section}\n'
            s += '\\phantomsection\n'
        # TODO header for txt mode?
        s += includepdf(mode, addr, full_file)
        return s

    if extension in ['jpg', 'jpeg', 'png', 'gif']:
        return includeimg(mode, addr, full_file)

    # TODO: include .tex files through \subfile
    print('PWD =', os.getcwd(), full_file)
    with open(full_file, 'r') as f:
        s = f.read() + '\n'
    if mode == 'tex':
        st = f'\\section{{{comment}}}'
        st += r'\begin{verbatim}' + s + r'\end{verbatim}' + '\n'
        return st
    return s


def generate_attachement(mode, data):
    """Render the wiki text *data* as an attachment body.

    A leading ``mode :: tex`` option marks the text as raw LaTeX.  In tex
    mode any other text is wrapped in a verbatim block.
    """
    opts, paragraphs = parse_options_paragraph(split_into_paragraphs(data))
    options = {
        'mode': 'verbatim',
        **opts
    }
    wrap_verbatim = mode == 'tex' and options['mode'] != 'tex'

    outstr = ''
    if wrap_verbatim:
        outstr += '\\begin{verbatim}\n'
    for paragraph in paragraphs:
        outstr += '\n'.join(paragraph)
        outstr += '\n\n'
    if wrap_verbatim:
        outstr += '\\end{verbatim}\n'
    return outstr


def resolve_vimwiki(mode, source_file_name, addr, frag, comment):
    """Render heading *frag* of wiki page *addr* by re-running this program.

    The recursive run writes to /tmp.  In tex mode its output is compiled
    with latexmk and the resulting pdf is included; in txt mode the text
    output is included directly.
    """
    here = os.getcwd()
    print('Resolving vimwiki')
    fname = anglify(f'/tmp/{addr}-{frag}.{mode}')
    cmd = subprocess.run([self_prog,
                          '--mode', mode,
                          '--output', fname,
                          '--attach-prefix', f'{global_bilaga}',
                          f'{here}/{addr}.wiki', frag])
    if cmd.returncode != 0:
        print(f'Recursive call failed with {cmd.returncode}')

    if mode == 'tex':
        # Run latexmk in the output directory so the pdf lands next to the
        # tex file, without changing this process' working directory.
        cmd = subprocess.run(['latexmk', '-lualatex', fname],
                             cwd=os.path.dirname(fname))
        if cmd.returncode != 0:
            print(f'latexmk failed with {cmd.returncode}')
        ofile = anglify(f'/tmp/{addr}-{frag}.pdf')
    else:
        ofile = fname

    return resolve_file(mode, source_file_name, ofile, '', comment)


def resolve_default(mode, source_file_name, addr, frag, comment):
    """Fallback resolver for links whose protocol has no dedicated resolver.

    An empty *addr* refers to heading *frag* of the source file itself.
    Otherwise ``<addr>.wiki`` next to the source file is read, and either
    returned whole (empty *frag*) or reduced to heading *frag*.
    """
    if addr == '':
        with open(source_file_name) as f:
            dat = f.read()
        return generate_attachement(mode, get_heading(frag, dat))

    with open(path.join(dirname(source_file_name), f'{addr}.wiki')) as f:
        dat = f.read()
    if frag == '':
        return dat
    return generate_attachement(mode, get_heading(frag, dat))


# Resolvers all take (mode, source file name, url without protocol,
# fragment, comment) and return the complete text of the attachment,
# formatted according to mode.
resolvers = {
    'mail': resolve_mail,
    'file': resolve_file,
    'local': resolve_file,
    'help': resolve_help,
    'vimwiki': resolve_vimwiki,
}

# ------------------------------------------------------------


def get_heading(name, data):
    """Return the text below heading *name* in the wiki text *data*.

    The section ends at the next heading of the same or a higher level, at
    a horizontal rule (``--`` or longer), or at the end of *data*.  The
    heading line itself is not included.

    Raises ValueError when no heading called *name* exists.
    """
    print('data =', data)
    print('name =', name)

    # Escape the name: headings such as "C++" or "Notes (draft)" contain
    # regex metacharacters.
    start_match = re.search(f'(?m)^(=+) {re.escape(name)} =+$', data)
    print('start_match =', start_match)
    if start_match is None:
        raise ValueError(f'heading {name!r} not found')

    heading_level = len(start_match.group(1))
    print(start_match.end())
    tail = data[start_match.end():]

    pat = f'(?m)^(={{1,{heading_level}}} .*|---*)$'
    print(pat)
    end_match = re.search(pat, tail)
    print(end_match)

    if end_match:
        # Drop the newline that precedes the terminating line.
        return tail[:max(end_match.start() - 1, 0)]
    return tail


# ------------------------------------------------------------

def renumber_wiki_headings(text):
    """Shift all wiki headings in *text* so the shallowest becomes level 1.

    Only heading lines are rewritten; '=' characters elsewhere in the text
    are left alone.  Text without headings is returned unchanged.
    """
    levels = [len(m[2]) for m in _HEADING_LINE_RE.finditer(text)]
    if not levels:
        return text
    shift = min(levels) - 1
    if shift == 0:
        return text

    def relevel(m):
        opening = m[2][shift:]
        # Malformed headings may have fewer closing than opening '=';
        # always keep at least one.
        closing = '=' * max(len(m[4]) - shift, 1)
        return f'{m[1]}{opening}{m[3]}{closing}{m[5]}'

    return _HEADING_LINE_RE.sub(relevel, text)


def format_wikitext(output, mode, paragraphs, options=None):
    """Write the main text *paragraphs* to the open file *output*.

    In txt mode the paragraphs are written as they are.  In tex mode they
    are converted with pandoc when it is installed; otherwise they are
    written inside a verbatim block, or raw when ``options['mode']`` is
    'tex'.

    *options* is the document option dict; when omitted the text is
    treated as plain wiki text.
    """
    if options is None:
        options = {'mode': 'vimwiki'}

    if mode == 'txt':
        for paragraph in paragraphs:
            output.write('\n'.join(paragraph))
            output.write('\n\n')
    elif mode == 'tex':
        if shutil.which('pandoc'):
            text = '\n\n'.join('\n'.join(par) for par in paragraphs) + '\n\n'
            full_text = renumber_wiki_headings(text)
            # A private directory keeps concurrent runs from overwriting
            # each other's intermediate files.
            with tempfile.TemporaryDirectory() as tmpdir:
                wiki_file = path.join(tmpdir, 'file.vimwiki')
                tex_file = path.join(tmpdir, 'file.tex')
                with open(wiki_file, 'w') as f:
                    f.write(full_text)
                subprocess.run(['pandoc',
                                '-f', 'vimwiki',
                                '-t', 'latex',
                                '-o', tex_file,
                                wiki_file],
                               check=True)
                with open(tex_file) as f:
                    for line in f:
                        output.write(line)
        else:
            # No pandoc available.
            wrap_verbatim = options['mode'] != 'tex'
            if wrap_verbatim:
                output.write('\\begin{verbatim}\n')
            for paragraph in paragraphs:
                output.write('\n'.join(paragraph))
                output.write('\n\n')
            if wrap_verbatim:
                output.write('\\end{verbatim}\n')


def _parse_date(value):
    """Parse the ``date`` option; ISO-8601 only when dateutil is missing."""
    if dtparser is not None:
        return dtparser.parse(value)
    return datetime.fromisoformat(value.strip())


def main():
    """Command-line entry point: render one heading of a wiki file."""
    global path_base, global_bilaga, sectioncounter, subsectioncounter

    parser = argparse.ArgumentParser(
        description='parse vimwiki files',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--mode', choices=["tex", "txt"], default='txt', help="Format of output")
    parser.add_argument('--output')
    parser.add_argument('--attach-prefix', dest='attach_prefix')
    parser.add_argument('infile', help="Wiki-file to use for input")
    parser.add_argument('heading', help="Which heading to output")
    args = parser.parse_args()

    attach_prefix = f'{args.attach_prefix}.' if args.attach_prefix else ''

    mode = args.mode
    heading = args.heading

    print(f'Running on {args.infile} # [{heading}]')

    # Resolve both paths before changing directory, so relative arguments
    # keep referring to the caller's working directory.
    infile = os.path.abspath(args.infile)
    outfile = os.path.realpath(args.output or f'doc.{mode}')
    print(f'outfile = {outfile}')

    os.chdir(os.path.dirname(infile))

    with open(infile, 'r') as f:
        data = f.read()

    path_base = path.dirname(infile)

    os.makedirs(os.path.dirname(outfile), exist_ok=True)

    sectioncounter = 0
    subsectioncounter = 0

    with open(outfile, 'w') as output:
        if mode == 'tex':
            output.write(_TEX_PREAMBLE)

        page1 = get_heading(heading, data)

        bilagor = []
        outstr = ""
        pos = 0

        # Resolve all links
        bilaga_nr = 1
        global_bilaga = bilaga_nr

        for match in _LINK_RE.finditer(page1):
            url = match.group(1) or ''
            frag = match.group(2) or ''
            comment = match.group(3) or ''

            outstr += page1[pos:match.start()]
            pos = match.end()

            uid = hashlib.md5((url + frag).encode('UTF-8')).hexdigest()
            title = comment or url or frag

            # A link without a protocol points at another wiki page.
            proto, sep, addr = url.partition(':')
            if not sep:
                proto = 'vimwiki'
                addr = url

            print('proto, addr =', proto, addr)

            if proto in ['http', 'https']:
                # Web links stay inline and do not become attachments.
                url = f'{proto}:{addr}'
                if frag:
                    url += f'#{frag}'
                if mode == 'tex':
                    # NOTE(review): the trailing ')' has no matching '(';
                    # kept as found, confirm the intended markup.
                    outstr += f'[[{url}|{title}]] [[{url}|{url}]])'
                else:
                    outstr += f' {title} <{url}>'
                continue

            # TODO: reference the attachment through \ref and its label
            outstr += title + f' (bilaga {attach_prefix}{chr(bilaga_nr + 64)})'
            bilaga_nr += 1
            global_bilaga = bilaga_nr

            attach_str = ''
            if mode == 'tex':
                # Section headings are disabled for full pdfs.
                if proto not in ['file', 'local', 'vimwiki']:
                    attach_str += f"\\section{{{title}}}\n\\label{{{uid}}}\n"
                else:
                    attach_str += f"\\phantomsection\n\\label{{{uid}}}\n"
            elif mode == 'txt':
                sectioncounter += 1
                subsectioncounter = 0
                if proto not in ['file', 'local']:
                    st = f"{attach_prefix}{chr(sectioncounter + 64)}. {title}"
                    attach_str += f'\n{st}\n{"="*len(st)}\n'

            resolver = resolvers.get(proto, resolve_default)
            attach_str += resolver(mode, infile, addr, frag, comment)
            bilagor.append(attach_str)

        outstr += page1[pos:]

        options = {
            'date': '\\today' if mode == 'tex' else f'{datetime.now():%Y-%m-%d}',
            'mode': 'vimwiki',
            'author': pwd.getpwuid(os.getuid()).pw_gecos
        }

        # Split the text into paragraph blocks
        paragraphs = split_into_paragraphs(outstr)
        override_options, paragraphs = parse_options_paragraph(paragraphs)

        # Merge found options with default options
        for key, value in override_options.items():
            if key == 'date':
                # Parse and record the date option
                dt = _parse_date(value)
                if mode == 'tex':
                    options['date'] = f'{dt:%Y--%m--%d}'
                else:
                    options['date'] = f'{dt:%Y-%m-%d}'
            else:
                options[key] = value

        # Preamble
        if mode == 'tex':
            output.write(f"""
\\title{{{heading}}}
\\date{{{options['date']}}}
\\author{{{options['author']}}}

\\begin{{document}}
\\maketitle
% \\tableofcontents
""")
        elif mode == 'txt':
            d = options['date']
            output.write(f'\n{heading.center(40)}\n{d.center(40)}\n')

        format_wikitext(output, mode, paragraphs, options)

        # Postamble
        if mode == 'tex':
            output.write('\\appendix\n')

        output.write('\n'.join(bilagor))

        if mode == 'tex':
            output.write('\\end{document}')


if __name__ == '__main__':
    main()