aboutsummaryrefslogtreecommitdiff
path: root/tests/validate-html/fetch_data.py
blob: 14ecca7571c0724dc4fc2ece7550001cbf3fa246 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
#!/usr/bin/env python

import subprocess
import urllib.request

from selenium import webdriver
from selenium.webdriver.firefox.options import Options

def fetch_rendered(url, port):
    options = Options()
    options.add_argument('--headless')
    driver = webdriver.Firefox(options=options)

    driver.get(url)
    page_source = driver.page_source

    # TODO check encoding from driver
    page_encoded = page_source.encode('utf-8')

    cmd = subprocess.run(['xmllint', '--format', '-'],
            input=page_encoded,
            capture_output=True)

    if cmd.returncode == 0:
        port.write(cmd.stdout)
    else:
        port.write(page_encoded)

def fetch_raw(url, port):
    response = urllib.request.urlopen(url)
    data = response.read()
    port.write(data)
    
url = 'http://localhost:8080/week/2022-03-31.html'

with open('raw.xhtml', 'wb') as f:
    fetch_raw(url, f)

# with open('raw.html', 'wb') as f:
#     fetch_raw(f'{url}?html', f)

with open('selenium.xhtml', 'wb') as f:
    fetch_rendered(url, f)

# with open('selenium.html', 'wb') as f:
#     fetch_rendered(f'{url}?html', f)