Diffstat (limited to 'pyenc/enumerate_classes.py')
-rw-r--r--  pyenc/enumerate_classes.py  325
1 file changed, 188 insertions, 137 deletions
diff --git a/pyenc/enumerate_classes.py b/pyenc/enumerate_classes.py
index 822852e..d4197bd 100644
--- a/pyenc/enumerate_classes.py
+++ b/pyenc/enumerate_classes.py
@@ -1,44 +1,39 @@
-
"""
Loads all puppet files in environment, parse them, and store the
parsed data in the database.
"""
+# TODO write proper tests,
+# especially ones testing environments
+
+from typing import Iterator, Union
import hashlib
import json
import os
import os.path
import subprocess
-import time
-from sqlalchemy.sql import text
-
-import threading
-from threading import Lock, Thread
-from queue import Queue
+# import time
+from sqlalchemy.sql import text
# import pyenc
-from pyenc.model import db
-from pyenc import model
+from pyenc.app.model import db
+from pyenc.app import model
-from typing import Union, Generator
-
-def with_lock(lock, proc):
- try:
- lock.acquire()
- proc()
- finally:
- lock.release()
+Path = Union[str, bytes]
-def call(proc, *args):
- proc(*args)
+def find(path: Path, **kvs) -> Iterator[str]:
+ """
+ Wrapper around find(1).
-path = Union[str, bytes]
+    Arguments:
+ path -- base path for the find operation
-def find(path: path, **kvs) -> list[bytes]:
- """Wrapper around find(1)."""
+    Keyword args:
+ any that find(1) takes, but with the leading dash (-) removed.
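+
+    Example (path is illustrative)::
+
+        find('/tmp/code', type='f', name='*.pp')
+        # runs: find /tmp/code -type f -name '*.pp' -print0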
+ """
cmdline = ['find', path]
for key, value in kvs.items():
cmdline.append(f'-{key}')
@@ -46,10 +41,13 @@ def find(path: path, **kvs) -> list[bytes]:
cmdline.append('-print0')
cmd = subprocess.run(cmdline, capture_output=True, check=True)
- return (f for f in cmd.stdout.split(b'\0') if f)
+ return (f.decode('UTF-8') for f in cmd.stdout.split(b'\0') if f)
class PuppetParseError(Exception):
+ """
+ Error holding failure result of `puppet parser dump`.
+ """
def __init__(self, code, msg):
super().__init__()
self.code = code
@@ -62,62 +60,39 @@ class PuppetParseError(Exception):
return repr(self)
-def puppet_parse(file: path) -> bytes:
+def puppet_parse(file: Path) -> str:
+    """
+    Runs the external puppet parser and returns its JSON output as a string.
+
+ Note that this is really slow.
+
+ file -- Path to the file to check
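+
+    Roughly equivalent to the shell command (file name illustrative):
+        puppet parser dump --format json site.pp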
+ """
with subprocess.Popen(
['puppet', 'parser', 'dump', '--format', 'json', file],
+ text=True,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE) as cmd:
- if cmd.retuncode and cmd.returncode != 0:
- raise PuppetParseError(cmd.returncode, cmd.stderr.read().decode('UTF-8'))
+        if cmd.returncode:
+ raise PuppetParseError(cmd.returncode, cmd.stderr.read())
json_data = cmd.stdout.read()
if (value := cmd.wait()) != 0:
- raise PuppetParseError(value, cmd.stderr.read().decode('UTF-8'))
+ raise PuppetParseError(value, cmd.stderr.read())
return json_data
-def parse_files(files: list[path]) -> Generator[model.PuppetFile]:
- for i, file in enumerate(files):
- try:
- stat = os.stat(file)
-
- last_modify = stat.st_mtime
- old_object = model.PuppetFile.query \
- .where(model.PuppetFile.path == file) \
- .first()
-
- if old_object and old_object.last_parse > last_modify:
- # file unchanged since our last parse, skip
- continue
-
- print(f'{i}/{len(files)}: {file}')
-
- if old_object:
- puppet_file = old_object
- else:
- puppet_file = model.PuppetFile(path=file)
- puppet_file.last_parse = time.time()
- # m.json = puppet_parse(file)
-
- yield puppet_file
-
- except PuppetParseError as err:
- # TODO cache error
- print('Error:', err)
- continue
-
-
def interpret_file(json_data: dict) -> list[str]:
- """Find all classes in json-representation of file."""
+ """Find all puppet class names in json-representation of file."""
top = json_data['^']
if top[0] == 'class':
tmp = top[1]['#']
idx = tmp.index('name')
return [tmp[idx + 1]]
# print(tmp[idx + 1])
- elif top[0] == 'block':
+ if top[0] == 'block':
ret_value = []
for element in top[1:]:
if element['^'][0] == 'class':
@@ -125,111 +100,187 @@ def interpret_file(json_data: dict) -> list[str]:
idx = tmp.index('name')
ret_value.append(tmp[idx + 1])
return ret_value
- else:
- return []
+ return []
+def enumerate_files(path_base, environment):
+ """
+    Enumerate all puppet files in a puppet environment.
-def enumerate_files(path_base, environment_name):
+ Updates the database so that the query
+ >>> SELECT path FROM puppet_file WHERE environment = :environment
+    reflects how the directory tree looks *right now*.
+ """
path = os.path.join(path_base, environment.name)
-
- files = list(find(path, type='f', name='*.pp'))
+ files = find(path, type='f', name='*.pp')
try:
- for puppet_file in parse_files(files):
+ for puppet_file in (model.PuppetFile(path=file) for file in files):
with open(puppet_file.path, 'rb') as f:
checksum = hashlib.sha256(f.read()).hexdigest()
# Returns puppet_file.path, relative to path_base
- puppet_file.path = os.path.relpath(puppet_file.path, path.encode('UTF-8'))
- # TODO does flask want the whole environment object?
- puppet_file.environment = environment.id
- puppet_file.checksum = checksum
- db.session.add(puppet_file)
+
+            # This works in at least PostgreSQL and SQLite 3.
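+            # (the ON CONFLICT clause assumes a UNIQUE constraint
+            # over (path, environment))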
+ db.engine.execute(text("""
+ INSERT INTO puppet_file (path, environment, checksum)
+ VALUES (:path, :environment, :checksum)
+ ON CONFLICT (path, environment)
+ DO UPDATE SET checksum = EXCLUDED.checksum
+ """), {
+ 'path': os.path.relpath(puppet_file.path, path),
+ 'environment': environment.id,
+ 'checksum': checksum,
+ })
finally:
db.session.commit()
-def run(path_base: path, environment_name: str):
+def ensure_environment(name):
+ """
+ Returns a valid PuppetEnvironment object for the named environment.
+
+    If it already exists in the database, return the existing one;
+    otherwise create it and return that.
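+
+    Illustrative use:
+    >>> ensure_environment('production').name
+    'production'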
+ """
+ environment = model \
+ .PuppetEnvironment \
+ .query \
+ .where(model.PuppetEnvironment.name == name) \
+ .one_or_none()
- ### Ensure that we have oru environment
- environment = model.PuppetEnvironment.query.where(model.PuppetEnvironment.name == environment_name).first()
if not environment:
- environment = model.PuppetEnvironment(name=environment_name)
+ environment = model.PuppetEnvironment(name=name)
db.session.add(environment)
- # TODO does this update the environment object
+ # This also updates our environment object, filling in
+    # autogenerated fields.
db.session.commit()
- ### Identify all puppet files, and note the base of their content
- # enumerate_files(path_base, environment_name)
+ return environment
+
+def run(path_base: Path = '/etc/puppetlabs/code/environments',
+ environment_name: str = 'production'):
+ """
+ Runs the class enumeration.
+
+ Arguments:
+ path_base -- Path to where each puppet environment resides
+ environment_name -- Which puppet environment to parse
+ """
+
+ ### Ensure that we have our environment
+ environment = ensure_environment(environment_name)
+
+    ### Identify all puppet files, and note the checksum of their content
+ enumerate_files(path_base, environment)
### Find all puppet files which we haven't parsed
- result = db.engine.execute(text("""
- SELECT
- f.id,
- f.path,
- f.last_parse,
- f.checksum,
- env.name
- FROM puppet_file f
- LEFT OUTER JOIN puppet_file_content c
- ON f.checksum = c.checksum
- LEFT JOIN puppet_environment env
- ON f.environment = env.id
- WHERE c.json IS NULL
- """))
-
- # db_lock = Lock()
- threads = []
- q = Queue()
- for (id, path, last, checksum, environment) in result:
- print(environment, path)
- # return
- full_path = os.path.join(path_base.encode('UTF-8'), environment.encode('UTF-8'), path)
-
- with open(full_path, 'rb') as f:
- current_checksum = hashlib.sha256(f.read()).hexdigest()
-
- if current_checksum != checksum:
- print(f'Checksum changed for {environment}/{path}')
- # db.engine.execute(model.PuppetFile.delete().where(model.PuppetFile.id == id))
- continue
-
- thread = Thread(target=lambda checksum, full_path: (checksum, puppet_parse(full_path)),
- args=(checksum, full_path),
- name=f'{environment}/{path}')
- thread.start()
- threads.append(thread)
+ base = model.PuppetFile \
+ .query \
+ .outerjoin(model.PuppetFileContent,
+ model.PuppetFile.checksum == model.PuppetFileContent.checksum) \
+ .where(model.PuppetFileContent.json == None) # noqa: E711
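+    # (`== None` rather than `is None`: SQLAlchemy overloads `==` to
+    # emit `IS NULL`; `is` cannot be overloaded, hence the noqa)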
- try:
- # for thread in threads:
- # print(f'Waiting on {thread.name}')
- # thread.join()
- # print(f'{thread.name} joined')
- while not q.empty():
- print('Getting something from queue')
- (checksum, item) = q.get()
- print(checksum)
- pfc = model.PuppetFileContent(checksum=checksum, json=item)
- db.session.add(pfc)
- q.task_done()
- finally:
- db.session.commit()
+ # count for progress bar
+ count = base.count()
- return
+ result = base \
+ .join(model.PuppetEnvironment) \
+ .add_column(model.PuppetEnvironment.name) \
+ .all()
+ db.session.commit()
try:
- for puppet_file in model.PuppetFile.query.all():
+ for (i, (puppet_file, env)) in enumerate(result):
+ print(env, puppet_file.path)
+ print(f'{i} / {count}', end='\r')
+
+ full_path = os.path.join(path_base, env, puppet_file.path)
+
try:
- class_names = interpret_file(json.loads(os.path.join(path, puppet_file.json)))
- for class_name in class_names:
- db.session.add(model.PuppetClass(
- class_name=class_name,
- comes_from=puppet_file))
- except Exception as e:
+ item = puppet_parse(full_path)
+ except PuppetParseError as e:
print(e)
- print(f'Failed: {puppet_file.path}')
+ continue
+
+ # Check that the file we just parsed is the file we
+ # expected.
+ # NOTE this is technically incorrect, consider
+ # | Us | Attacker |
+ # |------------------------|--------------|
+ # | initial checksum | |
+ # | | replace file |
+ # | parse | |
+ # | | restore file |
+ # | second checksum (this) | |
+
+ with open(full_path, 'rb') as f:
+ current_checksum = hashlib.sha256(f.read()).hexdigest()
+
+ if current_checksum != puppet_file.checksum:
+ print(f'Checksum changed for {env}/{puppet_file.path}')
+ continue
+
+            # The file parsed is the file we expected to parse; add
+            # it to the database.
+ pfc = model.PuppetFileContent(file_id=puppet_file.id,
+ checksum=puppet_file.checksum,
+ json=item)
+ db.session.add(pfc)
+
+ print('loop finished')
finally:
+ # TODO sqlite fails here, complains that the "database is locked"
db.session.commit()
+
+ for file_content in model.PuppetFileContent.query.all():
+ try:
+ class_names = interpret_file(json.loads(file_content.json))
+ for class_name in class_names:
+ # cls = model.PuppetClass(class_name=class_name)
+ # cls.environments.append(environment)
+ # cls.files.append(file_content.file)
+
+                # Add class (if it does not exist)
+ db.engine.execute(text("""
+ INSERT INTO puppet_class (class_name)
+ VALUES (:name)
+ ON CONFLICT (class_name) DO NOTHING
+ """), {'name': class_name})
+
+ # Add class to environment (if not already there)
+ db.engine.execute(text("""
+ INSERT INTO environment_classes (environment_id, class_id)
+ SELECT :env, id FROM puppet_class WHERE class_name = :name
+ ON CONFLICT (environment_id, class_id) DO NOTHING
+ """), {'env': environment.id, 'name': class_name})
+
+ # Add class to file mapping (if not already there)
+ db.engine.execute(text("""
+ INSERT INTO class_files (file_id, class_id)
+ SELECT :file, id FROM puppet_class WHERE class_name = :name
+ ON CONFLICT (file_id, class_id) DO NOTHING
+ """), {'file': file_content.file_id, 'name': class_name})
+
+ except Exception as e:
+ print(e)
+ # print(f'Failed: {puppet_file.path}')
+
+ db.session.commit()
+
+
+def gc_puppet_files():
+ """
+ Remove unused puppet file content.
+
+    Removes all puppet file contents which no longer have an "owning" file.
+ """
+
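+    # Anti-join: select content rows whose checksum matches no
+    # remaining puppet_file row, then delete them by id.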
+ db.engine.execute(text("""
+    DELETE FROM puppet_file_content WHERE id IN
+ ( SELECT pfc.id FROM puppet_file_content pfc
+ LEFT JOIN puppet_file f ON pfc.checksum = f.checksum
+ WHERE f.id IS NULL
+ ) """))