"""
Loads all puppet files in environment, parse them, and store the
parsed data in the database.
"""
# TODO write proper tests
# which especially test environments

from typing import Iterator, Union
import hashlib
import json
import os
import os.path
import subprocess
import traceback

from sqlalchemy.sql import text

from pyenc.app.model import db
from pyenc.app import model


Path = Union[str, bytes]


def find(path: Path, **kvs) -> Iterator[str]:
    """
    Wrapper around find(1).

    Arguments:
    path -- base path for the find operation

    Keyword arguments:
    any that find(1) takes, but with the leading dash (-) removed.

    Returns an iterator over the found paths, decoded as UTF-8.
    """
    cmdline = ['find', path]
    for key, value in kvs.items():
        cmdline.append(f'-{key}')
        cmdline.append(value)
    cmdline.append('-print0')
    cmd = subprocess.run(cmdline, capture_output=True, check=True)
    # -print0 NUL-separates the entries; the trailing NUL produces one
    # empty field, which the `if f` filters out.
    return (f.decode('UTF-8') for f in cmd.stdout.split(b'\0') if f)


class PuppetParseError(Exception):
    """Error holding failure result of `puppet parser dump`."""

    def __init__(self, code, msg):
        super().__init__()
        # Exit code of the puppet process.
        self.code = code
        # Captured stderr output.
        self.msg = msg

    def __repr__(self):
        return f'PuppetParseError({self.code}, {self.msg})'

    def __str__(self):
        return repr(self)


def puppet_parse(file: Path) -> str:
    """
    Run the external puppet parser and return its JSON output.

    Note that this is really slow.

    Arguments:
    file -- Path to the file to check

    Raises:
    PuppetParseError -- if the parser exits with a non-zero status.
    """
    with subprocess.Popen(
            ['puppet', 'parser', 'dump', '--format', 'json', file],
            text=True,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE) as cmd:
        # communicate() drains both pipes concurrently, avoiding the
        # deadlock possible when reading stdout and stderr sequentially,
        # and waits for the process to exit so returncode is valid.
        json_data, stderr = cmd.communicate()
        if cmd.returncode != 0:
            raise PuppetParseError(cmd.returncode, stderr)
        return json_data


def interpret_file(json_data: dict) -> list[str]:
    """Find all puppet class names in json-representation of file."""
    top = json_data['^']
    if top[0] == 'class':
        # A file containing a single class definition.
        tmp = top[1]['#']
        idx = tmp.index('name')
        return [tmp[idx + 1]]
    if top[0] == 'block':
        # A block may contain multiple class definitions.
        ret_value = []
        for element in top[1:]:
            if element['^'][0] == 'class':
                tmp = element['^'][1]['#']
                idx = tmp.index('name')
                ret_value.append(tmp[idx + 1])
        return ret_value
    return []


def enumerate_files(path_base, environment):
    """
    Enumerate all puppet files in a puppet environment.

    Updates the database so that the query

    >>> SELECT path FROM puppet_file WHERE environment = :environment

    Returns how the directory tree looks *right now*.
    """
    path = os.path.join(path_base, environment.name)
    # Only keep manifests: *.pp files directly inside a "manifests"
    # directory.
    files = [f for f in find(path, type='f', name='*.pp')
             if os.path.basename(os.path.dirname(f)) == 'manifests']
    try:
        for file in files:
            with open(file, 'rb') as f:
                checksum = hashlib.sha256(f.read()).hexdigest()
            # Stores the path relative to path_base.
            # This upsert works in at least postgres and sqlite3.
            db.engine.execute(text("""
            INSERT INTO puppet_file (path, environment_id, checksum)
            VALUES (:path, :environment, :checksum)
            ON CONFLICT (path, environment_id)
            DO UPDATE SET checksum = EXCLUDED.checksum
            """), {
                'path': os.path.relpath(file, path),
                'environment': environment.id,
                'checksum': checksum,
            })
    finally:
        db.session.commit()


def ensure_environment(name):
    """
    Returns a valid PuppetEnvironment object for the named environment.

    If it already exists in the database the return the existing,
    otherwise create it and return that.
    """
    environment = model \
        .Environment \
        .query \
        .where(model.Environment.name == name) \
        .one_or_none()
    if not environment:
        environment = model.Environment(name=name)
        db.session.add(environment)
        # This also updates our environment object, filling in
        # autogenerated fields.
        db.session.commit()
    return environment


def run(path_base: Path = '/etc/puppetlabs/code/environments',
        environment_name: str = 'production'):
    """
    Runs the class enumeration.

    Arguments:
    path_base        -- Path to where each puppet environment resides
    environment_name -- Which puppet environment to parse
    """
    ### Ensure that we have our environment
    environment = ensure_environment(environment_name)

    ### Identify all puppet files, and note the base of their content
    enumerate_files(path_base, environment)

    ### Find all puppet files which we haven't parsed
    subexpr = model.db.session \
        .query(model.PuppetFile.path,
               model.PuppetFile.checksum,
               # Selects any of the available environments.  Since the
               # checksum is the same the file should also be the same,
               # regardless of which environment we choose.
               model.db.func.min(model.PuppetFile.environment_id)
                    .label('env_id')) \
        .outerjoin(model.PuppetFileContent,
                   model.PuppetFile.checksum ==
                   model.PuppetFileContent.checksum) \
        .where(model.PuppetFileContent.json == None) \
        .group_by(model.PuppetFile.checksum, model.PuppetFile.path) \
        .cte()
    base = model.db.session \
        .query(subexpr.c.path, subexpr.c.checksum, model.Environment.name) \
        .join(model.Environment, model.Environment.id == subexpr.c.env_id)
    files = base.all()
    count = base.count()
    db.session.commit()

    # Parse all puppet files, and store their output into
    # puppet_file_content
    try:
        for (i, (path, checksum, env)) in enumerate(files):
            print(f'\x1b[2K{env} {path}')
            print(f'{i} / {count}', end='\r')
            full_path = os.path.join(path_base, env, path)
            try:
                item = puppet_parse(full_path)
            except PuppetParseError as e:
                print(e)
                continue

            # Check that the file we just parsed is the file we
            # expected.
            # NOTE this is technically incorrect, consider
            # | Us                     | Attacker     |
            # |------------------------|--------------|
            # | initial checksum       |              |
            # |                        | replace file |
            # | parse                  |              |
            # |                        | restore file |
            # | second checksum (this) |              |
            with open(full_path, 'rb') as f:
                current_checksum = hashlib.sha256(f.read()).hexdigest()
            if current_checksum != checksum:
                print(f'Checksum changed for {env}/{path}')
                continue

            # File parsed was file we expected to parse, add it to the
            # database
            pfc = model.PuppetFileContent(checksum=checksum, json=item)
            db.session.add(pfc)
        print('loop finished')
    finally:
        # TODO sqlite fails here, complains that the "database is locked"
        db.session.commit()

    # Interpret the parsed result of all parsed puppet files
    # This takes a few seconds
    for file in model.PuppetFile.query.where(model.PuppetFile.content).all():
        try:
            class_names = interpret_file(json.loads(file.content.json))
            for class_name in class_names:
                db.engine.execute(text("""
                INSERT INTO puppet_class (name) VALUES (:name)
                ON CONFLICT (name) DO NOTHING
                """), {'name': class_name})

                # Add class to environment (if not already there)
                # TODO this adds too much
                db.engine.execute(text("""
                INSERT INTO environment_classes (environment_id, class_id)
                SELECT :env, id FROM puppet_class
                WHERE puppet_class.name = :name
                ON CONFLICT (environment_id, class_id) DO NOTHING
                """), {'env': environment.id, 'name': class_name})

                # Add class to file mapping (if not already there)
                db.engine.execute(text("""
                INSERT INTO class_files (file_id, class_id)
                SELECT :file, id FROM puppet_class
                WHERE puppet_class.name = :name
                ON CONFLICT (file_id, class_id) DO NOTHING
                """), {'file': file.id, 'name': class_name})
        except Exception as e:
            print(f'Error for {file.id} ({file.path}) - {e}')
            traceback.print_exc()
    db.session.commit()


def gc_puppet_files():
    """
    Remove unused puppet file content.

    Removes all puppet file contents which no longer has an "owning"
    file.
    """
    # The outer DELETE cannot reference the subquery's alias (the
    # original `WHERE pfc.id IN` was invalid SQL); filter on the
    # table's own id column instead.
    db.engine.execute(text("""
    DELETE FROM puppet_file_content
    WHERE id IN (
        SELECT pfc.id
        FROM puppet_file_content pfc
        LEFT JOIN puppet_file f ON pfc.checksum = f.checksum
        WHERE f.id IS NULL
    )
    """))