1 files changed, 63 insertions, 19 deletions
diff --git a/mu4web/main.py b/mu4web/main.py
index e6b1092..981682a 100644
--- a/mu4web/main.py
+++ b/mu4web/main.py
@@ -472,25 +472,62 @@ class IMGParser(HTMLParser):
         self.result = result
         self.msg_id = msg_id
 
-    def handle_starttag(self, _, attrs):
-        self.result += '<img '
-        for key, value in attrs:
-            if key == 'src':
-                if m := IMGParser.rx.match(value):
-                    params = urlencode({'id': self.msg_id, 'cid': m[1]})
-                    data = '/cid?' + params
+    def handle_starttag(self, tag, attrs):
+        """Append a sanitized rewrite of one start tag to self.result.
+
+        attrs holds (name, value) pairs; value is None for a bare attribute.
+        Tags other than img, script and a are passed through unchanged.
+        """
+        # TODO this will also get called for self closing tags
+        # (<img/>), which will drop that slash. FIX
+
+        if tag == 'img':
+            # - Expand img tags with CID: URLs to point to our server.
+            #   These should be safe (from a tracking perspective) since
+            #   they are downloaded as part of the mail.
+            # - Other images are blocked, a piece of javascript is
+            #   later added to unblock them on click
+            self.result += '<img '
+            for key, value in attrs:
+                if key == 'src':
+                    # A bare "src" has value None; treat it as an empty URL.
+                    if m := IMGParser.rx.match(value or ''):
+                        params = urlencode({'id': self.msg_id, 'cid': m[1]})
+                        self.result += f' src="/cid?{params}"'
+                    else:
+                        # TODO Insert information about the blocked
+                        # resource into the image, such as title text,
+                        # original src, approximate size (and from
+                        # that: infer if it's an image only used for tracking)
+                        data = html.escape(value or '', quote=True)
+                        self.result += f' data-src="{data}"'
+                        self.result += ' src="/static/content-blocked.svg"'
                 else:
-                    data = value
-            else:
-                data = value
-            key = html.escape(key)
-            data = html.escape(data, quote=True)
-
-            self.result += f'{key}="{data}"'
+                    key = html.escape(key)
+                    data = html.escape(value or '', quote=True)
+
+                    self.result += f' {key}="{data}"'
+            self.result += '>'
+
+        elif tag == 'script':
+            # Keep script tag contents, but change it to text. I'm not
+            # sure how many try to inject javascript into their
+            # emails, but we don't want any of it.
+            args = ' '.join(f'{key}={value}' for (key, value) in attrs)
+            self.result += '<pre>' + html.escape(f'<script {args}>') + '</pre>'
+
+        elif tag == 'a':
+            # Add target="_parent" to all anchors. This causes links
+            # in iframes (where the content will probably be shown)
+            # to open in the current (top level) page, instead of
+            # inside the iframe. Values are quoted so they cannot split.
+            args = ' '.join(
+                f'{html.escape(key)}="{html.escape(value or "")}"'
+                for (key, value) in [*attrs, ('target', '_parent')])
+            self.result += f'<a {args}>'
 
-        # TODO this will also get called for self closing tags
-        # (<a/>), which will drop that slash. FIX
-        self.result += '>'
+        else:
+            self.result += self.get_starttag_text()  # e.g. <abbr>, <audio>
 
 
 @app.route('/part')
@@ -519,15 +556,22 @@ def attachement_part_page():
         # on or off
 
         result = MutableString()
-        idx = 0
         # Content encoding here?
         source = attachement.get_content()
         parser = IMGParser(result, msg_id)
 
-        for m in re.finditer(r'< *img[^>]*>', source):
+        idx = 0
+        for m in re.finditer(r'< *(a|img|script)[^>]*>', source):
             result += source[idx:m.start()]
             idx = m.end()
             parser.feed(m[0])
+        result += source[idx:]
+
+        # This script adds an onclick event for each image we blocked
+        # above, which unblocks it.
+        # TODO this "fails" for images wrapped in anchor tags, since
+        # the anchor tag has priority.
+        result += "\n<script src='/static/enable_images.js'></script>"
 
         return str(result)