1 files changed, 188 insertions, 137 deletions
diff --git a/pyenc/enumerate_classes.py b/pyenc/enumerate_classes.py
index 822852e..d4197bd 100644
--- a/pyenc/enumerate_classes.py
+++ b/pyenc/enumerate_classes.py
@@ -1,44 +1,39 @@
-
 """
 Loads all puppet files in environment, parse them, and store the
 parsed data in the database.
 """
 
+# TODO write propper tests
+# Which escpecially tests environments
+
+from typing import Union
 import hashlib
 import json
 import os
 import os.path
 import subprocess
-import time
-from sqlalchemy.sql import text
-
-import threading
-from threading import Lock, Thread
-from queue import Queue
+# import time
 
+from sqlalchemy.sql import text
 
 # import pyenc
-from pyenc.model import db
-from pyenc import model
+from pyenc.app.model import db
+from pyenc.app import model
 
-from typing import Union, Generator
 
-
-def with_lock(lock, proc):
-    try:
-        lock.acquire()
-        proc()
-    finally:
-        lock.release()
+Path = Union[str, bytes]
 
 
-def call(proc, *args):
-    proc(*args)
+def find(path: Path, **kvs) -> list[bytes]:
+    """
+    Wrapper around find(1).
 
-path = Union[str, bytes]
+    variables:
+    path -- base path for the find operation
 
-def find(path: path, **kvs) -> list[bytes]:
-    """Wrapper around find(1)."""
+    key word args:
+    any that find(1) takes, but with the leading dash (-) removed.
+    """
     cmdline = ['find', path]
     for key, value in kvs.items():
         cmdline.append(f'-{key}')
@@ -46,10 +41,13 @@ def find(path: path, **kvs) -> list[bytes]:
     cmdline.append('-print0')
 
     cmd = subprocess.run(cmdline, capture_output=True, check=True)
-    return (f for f in cmd.stdout.split(b'\0') if f)
+    return (f.decode('UTF-8') for f in cmd.stdout.split(b'\0') if f)
 
 
 class PuppetParseError(Exception):
+    """
+    Error holding failure result of `puppet parser dump`.
+    """
     def __init__(self, code, msg):
         super().__init__()
         self.code = code
@@ -62,62 +60,39 @@ class PuppetParseError(Exception):
         return repr(self)
 
 
-def puppet_parse(file: path) -> bytes:
+def puppet_parse(file: Path) -> bytes:
+    """
+    Runs the external puppet parser, and returns json as bytes.
+
+    Note that this is really slow.
+
+    file -- Path to the file to check
+    """
     with subprocess.Popen(
             ['puppet', 'parser', 'dump', '--format', 'json', file],
+            text=True,
             stdout=subprocess.PIPE,
             stderr=subprocess.PIPE) as cmd:
-        if cmd.retuncode and cmd.returncode != 0:
-            raise PuppetParseError(cmd.returncode, cmd.stderr.read().decode('UTF-8'))
+        if cmd.returncode and cmd.returncode != 0:
+            raise PuppetParseError(cmd.returncode, cmd.stderr.read())
 
         json_data = cmd.stdout.read()
 
         if (value := cmd.wait()) != 0:
-            raise PuppetParseError(value, cmd.stderr.read().decode('UTF-8'))
+            raise PuppetParseError(value, cmd.stderr.read())
 
         return json_data
 
 
-def parse_files(files: list[path]) -> Generator[model.PuppetFile]:
-    for i, file in enumerate(files):
-        try:
-            stat = os.stat(file)
-
-            last_modify = stat.st_mtime
-            old_object = model.PuppetFile.query \
-                              .where(model.PuppetFile.path == file) \
-                              .first()
-
-            if old_object and old_object.last_parse > last_modify:
-                # file unchanged since our last parse, skip
-                continue
-
-            print(f'{i}/{len(files)}: {file}')
-
-            if old_object:
-                puppet_file = old_object
-            else:
-                puppet_file = model.PuppetFile(path=file)
-            puppet_file.last_parse = time.time()
-            # m.json = puppet_parse(file)
-
-            yield puppet_file
-
-        except PuppetParseError as err:
-            # TODO cache error
-            print('Error:', err)
-            continue
-
-
 def interpret_file(json_data: dict) -> list[str]:
-    """Find all classes in json-representation of file."""
+    """Find all puppet class names in json-representation of file."""
     top = json_data['^']
     if top[0] == 'class':
         tmp = top[1]['#']
         idx = tmp.index('name')
         return [tmp[idx + 1]]
         # print(tmp[idx + 1])
-    elif top[0] == 'block':
+    if top[0] == 'block':
         ret_value = []
         for element in top[1:]:
             if element['^'][0] == 'class':
@@ -125,111 +100,187 @@ def interpret_file(json_data: dict) -> list[str]:
                 idx = tmp.index('name')
                 ret_value.append(tmp[idx + 1])
         return ret_value
-    else:
-        return []
+    return []
 
 
+def enumerate_files(path_base, environment):
+    """
+    Enumerate all puppete files in a puppet environment.
 
-def enumerate_files(path_base, environment_name):
+    Updates the database so that the query
+    >>> SELECT path FROM puppet_file WHERE environment = :environment
+    Returns how the directory tree looks *right now*.
+    """
     path = os.path.join(path_base, environment.name)
-
-    files = list(find(path, type='f', name='*.pp'))
+    files = find(path, type='f', name='*.pp')
 
     try:
-        for puppet_file in parse_files(files):
+        for puppet_file in (model.PuppetFile(path=file) for file in files):
             with open(puppet_file.path, 'rb') as f:
                 checksum = hashlib.sha256(f.read()).hexdigest()
             # Returns puppet_file.path, relative to path_base
-            puppet_file.path = os.path.relpath(puppet_file.path, path.encode('UTF-8'))
-            # TODO does flask want the whole environment object?
-            puppet_file.environment = environment.id
-            puppet_file.checksum = checksum
-            db.session.add(puppet_file)
+
+            # This works in at least postgres and sqlite3
+            db.engine.execute(text("""
+            INSERT INTO puppet_file (path, environment, checksum)
+            VALUES (:path, :environment, :checksum)
+            ON CONFLICT (path, environment)
+            DO UPDATE SET checksum = EXCLUDED.checksum
+            """), {
+                'path': os.path.relpath(puppet_file.path, path),
+                'environment': environment.id,
+                'checksum': checksum,
+            })
 
     finally:
         db.session.commit()
 
 
-def run(path_base: path, environment_name: str):
+def ensure_environment(name):
+    """
+    Returns a valid PuppetEnvironment object for the named environment.
+
+    If it already exists in the database the return the existing,
+    otherwise create it and return that.
+    """
+    environment = model \
+        .PuppetEnvironment \
+        .query \
+        .where(model.PuppetEnvironment.name == name) \
+        .one_or_none()
 
-    ### Ensure that we have oru environment
-    environment = model.PuppetEnvironment.query.where(model.PuppetEnvironment.name == environment_name).first()
     if not environment:
-        environment = model.PuppetEnvironment(name=environment_name)
+        environment = model.PuppetEnvironment(name=name)
         db.session.add(environment)
-        # TODO does this update the environment object
+        # This also updates our environment object, filling in
+        # autogenerated fieldsfields.
         db.session.commit()
 
-    ### Identify all puppet files, and note the base of their content
-    # enumerate_files(path_base, environment_name)
+    return environment
+
 
+def run(path_base: Path = '/etc/puppetlabs/code/environments',
+        environment_name: str = 'production'):
+    """
+    Runs the class enumeration.
+
+    Arguments:
+    path_base -- Path to where each puppet environment resides
+    environment_name -- Which puppet environment to parse
+    """
+
+    ### Ensure that we have our environment
+    environment = ensure_environment(environment_name)
+
+    ### Identify all puppet files, and note the base of their content
+    enumerate_files(path_base, environment)
 
     ### Find all puppet files which we haven't parsed
 
-    result = db.engine.execute(text("""
-    SELECT 
-        f.id,
-        f.path, 
-        f.last_parse, 
-        f.checksum,
-        env.name
-    FROM puppet_file f
-    LEFT OUTER JOIN puppet_file_content c
-        ON f.checksum = c.checksum
-    LEFT JOIN puppet_environment env
-        ON f.environment = env.id
-    WHERE c.json IS NULL
-    """))
-
-    # db_lock = Lock()
-    threads = []
-    q = Queue()
-    for (id, path, last, checksum, environment) in result:
-        print(environment, path)
-        # return
-        full_path = os.path.join(path_base.encode('UTF-8'), environment.encode('UTF-8'), path)
-
-        with open(full_path, 'rb') as f:
-            current_checksum = hashlib.sha256(f.read()).hexdigest()
-
-        if current_checksum != checksum:
-            print(f'Checksum changed for {environment}/{path}')
-            # db.engine.execute(model.PuppetFile.delete().where(model.PuppetFile.id == id))
-            continue
-
-        thread = Thread(target=lambda checksum, full_path: (checksum, puppet_parse(full_path)),
-                args=(checksum, full_path),
-                name=f'{environment}/{path}')
-        thread.start()
-        threads.append(thread)
+    base = model.PuppetFile \
+        .query \
+        .outerjoin(model.PuppetFileContent,
+                   model.PuppetFile.checksum == model.PuppetFileContent.checksum) \
+        .where(model.PuppetFileContent.json == None)  # noqa: E711
 
-    try:
-        # for thread in threads:
-            # print(f'Waiting on {thread.name}')
-            # thread.join()
-            # print(f'{thread.name} joined')
-        while not q.empty():
-            print('Getting something from queue')
-            (checksum, item) = q.get()
-            print(checksum)
-            pfc = model.PuppetFileContent(checksum=checksum, json=item)
-            db.session.add(pfc)
-            q.task_done()
-    finally:
-        db.session.commit()
+    # count for progress bar
+    count = base.count()
 
-    return
+    result = base \
+        .join(model.PuppetEnvironment) \
+        .add_column(model.PuppetEnvironment.name) \
+        .all()
 
+    db.session.commit()
     try:
-        for puppet_file in model.PuppetFile.query.all():
+        for (i, (puppet_file, env)) in enumerate(result):
+            print(env, puppet_file.path)
+            print(f'{i} / {count}', end='\r')
+
+            full_path = os.path.join(path_base, env, puppet_file.path)
+
             try:
-                class_names = interpret_file(json.loads(os.path.join(path, puppet_file.json)))
-                for class_name in class_names:
-                    db.session.add(model.PuppetClass(
-                        class_name=class_name,
-                        comes_from=puppet_file))
-            except Exception as e:
+                item = puppet_parse(full_path)
+            except PuppetParseError as e:
                 print(e)
-                print(f'Failed: {puppet_file.path}')
+                continue
+
+            # Check that the file we just parsed is the file we
+            # expected.
+            # NOTE this is technically incorrect, consider
+            # | Us                     | Attacker     |
+            # |------------------------|--------------|
+            # | initial checksum       |              |
+            # |                        | replace file |
+            # | parse                  |              |
+            # |                        | restore file |
+            # | second checksum (this) |              |
+
+            with open(full_path, 'rb') as f:
+                current_checksum = hashlib.sha256(f.read()).hexdigest()
+
+            if current_checksum != puppet_file.checksum:
+                print(f'Checksum changed for {env}/{puppet_file.path}')
+                continue
+
+            # File parsed was file we expected to parse, addit to the
+            # database
+            pfc = model.PuppetFileContent(file_id=puppet_file.id,
+                                          checksum=puppet_file.checksum,
+                                          json=item)
+            db.session.add(pfc)
+
+        print('loop finished')
     finally:
+        # TODO sqlite fails here, complains that the "database is locked"
         db.session.commit()
+
+    for file_content in model.PuppetFileContent.query.all():
+        try:
+            class_names = interpret_file(json.loads(file_content.json))
+            for class_name in class_names:
+                # cls = model.PuppetClass(class_name=class_name)
+                # cls.environments.append(environment)
+                # cls.files.append(file_content.file)
+
+                # Add classs (if not exists)
+                db.engine.execute(text("""
+                INSERT INTO puppet_class (class_name)
+                VALUES (:name)
+                ON CONFLICT (class_name) DO NOTHING
+                """), {'name': class_name})
+
+                # Add class to environment (if not already there)
+                db.engine.execute(text("""
+                INSERT INTO environment_classes (environment_id, class_id)
+                SELECT :env, id FROM puppet_class WHERE class_name = :name
+                ON CONFLICT (environment_id, class_id) DO NOTHING
+                """), {'env': environment.id, 'name': class_name})
+
+                # Add class to file mapping (if not already there)
+                db.engine.execute(text("""
+                INSERT INTO class_files (file_id, class_id)
+                SELECT :file, id FROM puppet_class WHERE class_name = :name
+                ON CONFLICT (file_id, class_id) DO NOTHING
+                """), {'file': file_content.file_id, 'name': class_name})
+
+        except Exception as e:
+            print(e)
+            # print(f'Failed: {puppet_file.path}')
+
+    db.session.commit()
+
+
+def gc_puppet_files():
+    """
+    Remove unused puppet file content.
+
+    Removes all puppet file contents which no longer has an "owning" file.
+    """
+
+    db.engine.execute(text("""
+    DELETE FROM puppet_file_content WHERE pfc.id IN
+    ( SELECT pfc.id FROM puppet_file_content pfc
+      LEFT JOIN puppet_file f ON pfc.checksum = f.checksum
+      WHERE f.id IS NULL
+    ) """))