diff options
Diffstat (limited to 'pyenc/enumerate_classes.py')
-rw-r--r-- | pyenc/enumerate_classes.py | 325 |
1 files changed, 188 insertions, 137 deletions
diff --git a/pyenc/enumerate_classes.py b/pyenc/enumerate_classes.py index 822852e..d4197bd 100644 --- a/pyenc/enumerate_classes.py +++ b/pyenc/enumerate_classes.py @@ -1,44 +1,39 @@ - """ Loads all puppet files in environment, parse them, and store the parsed data in the database. """ +# TODO write propper tests +# Which escpecially tests environments + +from typing import Union import hashlib import json import os import os.path import subprocess -import time -from sqlalchemy.sql import text - -import threading -from threading import Lock, Thread -from queue import Queue +# import time +from sqlalchemy.sql import text # import pyenc -from pyenc.model import db -from pyenc import model +from pyenc.app.model import db +from pyenc.app import model -from typing import Union, Generator - -def with_lock(lock, proc): - try: - lock.acquire() - proc() - finally: - lock.release() +Path = Union[str, bytes] -def call(proc, *args): - proc(*args) +def find(path: Path, **kvs) -> list[bytes]: + """ + Wrapper around find(1). -path = Union[str, bytes] + variables: + path -- base path for the find operation -def find(path: path, **kvs) -> list[bytes]: - """Wrapper around find(1).""" + key word args: + any that find(1) takes, but with the leading dash (-) removed. + """ cmdline = ['find', path] for key, value in kvs.items(): cmdline.append(f'-{key}') @@ -46,10 +41,13 @@ def find(path: path, **kvs) -> list[bytes]: cmdline.append('-print0') cmd = subprocess.run(cmdline, capture_output=True, check=True) - return (f for f in cmd.stdout.split(b'\0') if f) + return (f.decode('UTF-8') for f in cmd.stdout.split(b'\0') if f) class PuppetParseError(Exception): + """ + Error holding failure result of `puppet parser dump`. + """ def __init__(self, code, msg): super().__init__() self.code = code @@ -62,62 +60,39 @@ class PuppetParseError(Exception): return repr(self) -def puppet_parse(file: path) -> bytes: +def puppet_parse(file: Path) -> bytes: + """ + Runs the external puppet parser, and returns json as bytes. + + Note that this is really slow. + + file -- Path to the file to check + """ with subprocess.Popen( ['puppet', 'parser', 'dump', '--format', 'json', file], + text=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) as cmd: - if cmd.retuncode and cmd.returncode != 0: - raise PuppetParseError(cmd.returncode, cmd.stderr.read().decode('UTF-8')) + if cmd.returncode and cmd.returncode != 0: + raise PuppetParseError(cmd.returncode, cmd.stderr.read()) json_data = cmd.stdout.read() if (value := cmd.wait()) != 0: - raise PuppetParseError(value, cmd.stderr.read().decode('UTF-8')) + raise PuppetParseError(value, cmd.stderr.read()) return json_data -def parse_files(files: list[path]) -> Generator[model.PuppetFile]: - for i, file in enumerate(files): - try: - stat = os.stat(file) - - last_modify = stat.st_mtime - old_object = model.PuppetFile.query \ - .where(model.PuppetFile.path == file) \ - .first() - - if old_object and old_object.last_parse > last_modify: - # file unchanged since our last parse, skip - continue - - print(f'{i}/{len(files)}: {file}') - - if old_object: - puppet_file = old_object - else: - puppet_file = model.PuppetFile(path=file) - puppet_file.last_parse = time.time() - # m.json = puppet_parse(file) - - yield puppet_file - - except PuppetParseError as err: - # TODO cache error - print('Error:', err) - continue - - def interpret_file(json_data: dict) -> list[str]: - """Find all classes in json-representation of file.""" + """Find all puppet class names in json-representation of file.""" top = json_data['^'] if top[0] == 'class': tmp = top[1]['#'] idx = tmp.index('name') return [tmp[idx + 1]] # print(tmp[idx + 1]) - elif top[0] == 'block': + if top[0] == 'block': ret_value = [] for element in top[1:]: if element['^'][0] == 'class': @@ -125,111 +100,187 @@ def interpret_file(json_data: dict) -> list[str]: idx = tmp.index('name') ret_value.append(tmp[idx + 1]) return ret_value - else: - return [] + return [] +def enumerate_files(path_base, environment): + """ + Enumerate all puppete files in a puppet environment. -def enumerate_files(path_base, environment_name): + Updates the database so that the query + >>> SELECT path FROM puppet_file WHERE environment = :environment + Returns how the directory tree looks *right now*. + """ path = os.path.join(path_base, environment.name) - - files = list(find(path, type='f', name='*.pp')) + files = find(path, type='f', name='*.pp') try: - for puppet_file in parse_files(files): + for puppet_file in (model.PuppetFile(path=file) for file in files): with open(puppet_file.path, 'rb') as f: checksum = hashlib.sha256(f.read()).hexdigest() # Returns puppet_file.path, relative to path_base - puppet_file.path = os.path.relpath(puppet_file.path, path.encode('UTF-8')) - # TODO does flask want the whole environment object? - puppet_file.environment = environment.id - puppet_file.checksum = checksum - db.session.add(puppet_file) + + # This works in at least postgres and sqlite3 + db.engine.execute(text(""" + INSERT INTO puppet_file (path, environment, checksum) + VALUES (:path, :environment, :checksum) + ON CONFLICT (path, environment) + DO UPDATE SET checksum = EXCLUDED.checksum + """), { + 'path': os.path.relpath(puppet_file.path, path), + 'environment': environment.id, + 'checksum': checksum, + }) finally: db.session.commit() -def run(path_base: path, environment_name: str): +def ensure_environment(name): + """ + Returns a valid PuppetEnvironment object for the named environment. + + If it already exists in the database the return the existing, + otherwise create it and return that. + """ + environment = model \ + .PuppetEnvironment \ + .query \ + .where(model.PuppetEnvironment.name == name) \ + .one_or_none() - ### Ensure that we have oru environment - environment = model.PuppetEnvironment.query.where(model.PuppetEnvironment.name == environment_name).first() if not environment: - environment = model.PuppetEnvironment(name=environment_name) + environment = model.PuppetEnvironment(name=name) db.session.add(environment) - # TODO does this update the environment object + # This also updates our environment object, filling in + # autogenerated fieldsfields. db.session.commit() - ### Identify all puppet files, and note the base of their content - # enumerate_files(path_base, environment_name) + return environment + +def run(path_base: Path = '/etc/puppetlabs/code/environments', + environment_name: str = 'production'): + """ + Runs the class enumeration. + + Arguments: + path_base -- Path to where each puppet environment resides + environment_name -- Which puppet environment to parse + """ + + ### Ensure that we have our environment + environment = ensure_environment(environment_name) + + ### Identify all puppet files, and note the base of their content + enumerate_files(path_base, environment) ### Find all puppet files which we haven't parsed - result = db.engine.execute(text(""" - SELECT - f.id, - f.path, - f.last_parse, - f.checksum, - env.name - FROM puppet_file f - LEFT OUTER JOIN puppet_file_content c - ON f.checksum = c.checksum - LEFT JOIN puppet_environment env - ON f.environment = env.id - WHERE c.json IS NULL - """)) - - # db_lock = Lock() - threads = [] - q = Queue() - for (id, path, last, checksum, environment) in result: - print(environment, path) - # return - full_path = os.path.join(path_base.encode('UTF-8'), environment.encode('UTF-8'), path) - - with open(full_path, 'rb') as f: - current_checksum = hashlib.sha256(f.read()).hexdigest() - - if current_checksum != checksum: - print(f'Checksum changed for {environment}/{path}') - # db.engine.execute(model.PuppetFile.delete().where(model.PuppetFile.id == id)) - continue - - thread = Thread(target=lambda checksum, full_path: (checksum, puppet_parse(full_path)), - args=(checksum, full_path), - name=f'{environment}/{path}') - thread.start() - threads.append(thread) + base = model.PuppetFile \ + .query \ + .outerjoin(model.PuppetFileContent, + model.PuppetFile.checksum == model.PuppetFileContent.checksum) \ + .where(model.PuppetFileContent.json == None) # noqa: E711 - try: - # for thread in threads: - # print(f'Waiting on {thread.name}') - # thread.join() - # print(f'{thread.name} joined') - while not q.empty(): - print('Getting something from queue') - (checksum, item) = q.get() - print(checksum) - pfc = model.PuppetFileContent(checksum=checksum, json=item) - db.session.add(pfc) - q.task_done() - finally: - db.session.commit() + # count for progress bar + count = base.count() - return + result = base \ + .join(model.PuppetEnvironment) \ + .add_column(model.PuppetEnvironment.name) \ + .all() + db.session.commit() try: - for puppet_file in model.PuppetFile.query.all(): + for (i, (puppet_file, env)) in enumerate(result): + print(env, puppet_file.path) + print(f'{i} / {count}', end='\r') + + full_path = os.path.join(path_base, env, puppet_file.path) + try: - class_names = interpret_file(json.loads(os.path.join(path, puppet_file.json))) - for class_name in class_names: - db.session.add(model.PuppetClass( - class_name=class_name, - comes_from=puppet_file)) - except Exception as e: + item = puppet_parse(full_path) + except PuppetParseError as e: print(e) - print(f'Failed: {puppet_file.path}') + continue + + # Check that the file we just parsed is the file we + # expected. + # NOTE this is technically incorrect, consider + # | Us | Attacker | + # |------------------------|--------------| + # | initial checksum | | + # | | replace file | + # | parse | | + # | | restore file | + # | second checksum (this) | | + + with open(full_path, 'rb') as f: + current_checksum = hashlib.sha256(f.read()).hexdigest() + + if current_checksum != puppet_file.checksum: + print(f'Checksum changed for {env}/{puppet_file.path}') + continue + + # File parsed was file we expected to parse, addit to the + # database + pfc = model.PuppetFileContent(file_id=puppet_file.id, + checksum=puppet_file.checksum, + json=item) + db.session.add(pfc) + + print('loop finished') finally: + # TODO sqlite fails here, complains that the "database is locked" db.session.commit() + + for file_content in model.PuppetFileContent.query.all(): + try: + class_names = interpret_file(json.loads(file_content.json)) + for class_name in class_names: + # cls = model.PuppetClass(class_name=class_name) + # cls.environments.append(environment) + # cls.files.append(file_content.file) + + # Add classs (if not exists) + db.engine.execute(text(""" + INSERT INTO puppet_class (class_name) + VALUES (:name) + ON CONFLICT (class_name) DO NOTHING + """), {'name': class_name}) + + # Add class to environment (if not already there) + db.engine.execute(text(""" + INSERT INTO environment_classes (environment_id, class_id) + SELECT :env, id FROM puppet_class WHERE class_name = :name + ON CONFLICT (environment_id, class_id) DO NOTHING + """), {'env': environment.id, 'name': class_name}) + + # Add class to file mapping (if not already there) + db.engine.execute(text(""" + INSERT INTO class_files (file_id, class_id) + SELECT :file, id FROM puppet_class WHERE class_name = :name + ON CONFLICT (file_id, class_id) DO NOTHING + """), {'file': file_content.file_id, 'name': class_name}) + + except Exception as e: + print(e) + # print(f'Failed: {puppet_file.path}') + + db.session.commit() + + +def gc_puppet_files(): + """ + Remove unused puppet file content. + + Removes all puppet file contents which no longer has an "owning" file. + """ + + db.engine.execute(text(""" + DELETE FROM puppet_file_content WHERE pfc.id IN + ( SELECT pfc.id FROM puppet_file_content pfc + LEFT JOIN puppet_file f ON pfc.checksum = f.checksum + WHERE f.id IS NULL + ) """)) |