From 57a283a5634d47f559557f966e64d621843e4035 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Hugo=20H=C3=B6rnquist?= Date: Wed, 3 Aug 2022 16:24:08 +0200 Subject: work --- .pylintrc | 3 + enumerate_classes.py | 142 ------------------------ pyenc/__init__.py | 8 +- pyenc/cmdline.py | 29 +++++ pyenc/db.py | 24 ---- pyenc/enc.py | 16 --- pyenc/enumerate_classes.py | 235 ++++++++++++++++++++++++++++++++++++++++ pyenc/model.py | 44 +++++++- pyenc/templates/start_page.html | 1 + 9 files changed, 314 insertions(+), 188 deletions(-) delete mode 100755 enumerate_classes.py create mode 100644 pyenc/cmdline.py delete mode 100644 pyenc/db.py create mode 100644 pyenc/enumerate_classes.py diff --git a/.pylintrc b/.pylintrc index ab6f45d..85507dc 100644 --- a/.pylintrc +++ b/.pylintrc @@ -1,3 +1,6 @@ [MESSAGES CONTROL] disable=consider-using-f-string, missing-module-docstring + +[FORMAT] +good-names=f,i diff --git a/enumerate_classes.py b/enumerate_classes.py deleted file mode 100755 index c9e1c4b..0000000 --- a/enumerate_classes.py +++ /dev/null @@ -1,142 +0,0 @@ -#!/usr/bin/env python3 - -""" -Loads all puppet files in environment, parse them, and store the -parsed data in the database. -""" - -import subprocess -import json -import os -import time - -import pyenc -from pyenc.db import db -import pyenc.model as model - - -def find(path, **kvs): - """Wrapper around find(1).""" - cmdline = ['find', path] - for k, v in kvs.items(): - cmdline.append(f'-{k}') - cmdline.append(v) - cmdline.append('-print0') - - cmd = subprocess.run(cmdline, capture_output=True) - return (f for f in cmd.stdout.split(b'\0') if f) - - -class PuppetParseError(Exception): - def __init__(self, code, msg): - self.code = code - self.msg = msg - - def __repr__(self): - return f'PuppetParserError({self.code}, {self.msg})' - - def __str__(self): - return repr(self) - - -def puppet_parse(file): - cmd = subprocess.Popen( - ['puppet', 'parser', 'dump', '--format', 'json', file], - stdout=subprocess.PIPE, - stderr=subprocess.PIPE) - if cmd.returncode and cmd.returncode != 0: - raise PuppetParseError(cmd.returncode, cmd.stderr.read().decode('UTF-8')) - else: - json = cmd.stdout.read() - - if (value := cmd.wait()) != 0: - raise PuppetParseError(value, cmd.stderr.read().decode('UTF-8')) - - return json - - -def parse_files(files): - for i, file in enumerate(files): - try: - st = os.stat(file) - - last_modify = st.st_mtime - old_object = model.PuppetFile.query \ - .where(model.PuppetFile.path == file) \ - .first() - - if old_object and old_object.last_parse > last_modify: - # file unchanged since our last parse, skip - continue - - print(f'{i}/{len(files)}: {file}') - - if old_object: - m = old_object - else: - m = model.PuppetFile(path=file) - m.last_parse = time.time() - m.json = puppet_parse(file) - - yield m - - except PuppetParseError as e: - # TODO cache error - print('Error:', e) - continue - - -def interpret_file(json_data): - """Find all classes in json-representation of file.""" - top = json_data['^'] - if top[0] == 'class': - tmp = top[1]['#'] - idx = tmp.index('name') - return [tmp[idx + 1]] - # print(tmp[idx + 1]) - elif top[0] == 'block': - ret_value = [] - for element in top[1:]: - if element['^'][0] == 'class': - tmp = element['^'][1]['#'] - idx = tmp.index('name') - ret_value.append(tmp[idx + 1]) - return ret_value - else: - return [] - - - - -def main(): - app = pyenc.create_app() - app.app_context().push() - - path = '/var/lib/machines/busting/etc/puppetlabs/code/environments/production' - - files_gen = find(path, type='f', name='*.pp') - files = [f for f in files_gen] - - try: - for puppet_file in parse_files(files): - db.session.add(puppet_file) - finally: - db.session.commit() - - - try: - for puppet_file in model.PuppetFile.query.all(): - try: - class_names = interpret_file(json.loads(puppet_file.json)) - for class_name in class_names: - db.session.add(model.PuppetClass( - class_name=class_name, - comes_from=puppet_file)) - except Exception as e: - print(e) - print(f'Failed: {puppet_file.path}') - finally: - db.session.commit() - -if __name__ == '__main__': - main() diff --git a/pyenc/__init__.py b/pyenc/__init__.py index cd0d57d..7249936 100644 --- a/pyenc/__init__.py +++ b/pyenc/__init__.py @@ -15,8 +15,7 @@ from flask import ( ) from . import model -from . import db -from . import enc +from . import cmdline from . import api @@ -33,9 +32,10 @@ def create_app(): app.config.from_pyfile('settings.py') + model.db.init_app(app) + for module in [ - db, - enc, + cmdline, api, ]: module.init_app(app) diff --git a/pyenc/cmdline.py b/pyenc/cmdline.py new file mode 100644 index 0000000..099018d --- /dev/null +++ b/pyenc/cmdline.py @@ -0,0 +1,29 @@ +import click +from flask.cli import AppGroup + +app_group = AppGroup('user', help="Testt") + +@app_group.command('enc') +@click.argument('fqdn') +def enc(fqdn): + from . import enc + enc.run_enc(fqdn) + +@app_group.command('init-db') +def initialize_database(): + from . import model + model.db.create_all() + +@app_group.command('enumerate-classes') +def enumerate_classes(): + from . import enumerate_classes + environment_name = 'production' + path_base = '/var/lib/machines/busting/etc/puppetlabs/code/environments/' + enumerate_classes.run( + path_base=path_base, + environment_name=environment_name) + +def init_app(app): + """Add command line options to current flask app.""" + app.cli.add_command(app_group) + diff --git a/pyenc/db.py b/pyenc/db.py deleted file mode 100644 index 38edda1..0000000 --- a/pyenc/db.py +++ /dev/null @@ -1,24 +0,0 @@ -"""Database connection for application.""" - -import click -from flask import current_app, g -from flask.cli import with_appcontext -from .model import db - - -@click.command('init-db') -@with_appcontext -def init_db_command(): - """Create database from command line.""" - # init_db() - # print(db) - print(db) - db.create_all() - click.echo('Initialized the database.') - - -def init_app(app): - """Add database (and click) to given flask app.""" - # app.teardown_appcontext(close_db) - db.init_app(app) - app.cli.add_command(init_db_command) diff --git a/pyenc/enc.py b/pyenc/enc.py index e0f3f8d..8732af2 100644 --- a/pyenc/enc.py +++ b/pyenc/enc.py @@ -2,21 +2,9 @@ Command line entry point for Puppet External Node Classifier (enc). """ -import click import yaml -# from flask import current_app, g -# from flask.cli import with_appcontext, AppGroup -from flask.cli import AppGroup -# from .db import db from . import model - - -app_group = AppGroup('user', help="Testt") - -@app_group.command('enc') -@click.argument('fqdn') -# @with_appcontext def run_enc(fqdn): """ Run the puppet node classifier. @@ -34,7 +22,3 @@ def run_enc(fqdn): print(yaml.dump(out)) return 0 - -def init_app(app): - """Add puppet enc click to current flask app.""" - app.cli.add_command(app_group) diff --git a/pyenc/enumerate_classes.py b/pyenc/enumerate_classes.py new file mode 100644 index 0000000..822852e --- /dev/null +++ b/pyenc/enumerate_classes.py @@ -0,0 +1,235 @@ + +""" +Loads all puppet files in environment, parse them, and store the +parsed data in the database. +""" + +import hashlib +import json +import os +import os.path +import subprocess +import time +from sqlalchemy.sql import text + +import threading +from threading import Lock, Thread +from queue import Queue + + +# import pyenc +from pyenc.model import db +from pyenc import model + +from typing import Union, Generator + + +def with_lock(lock, proc): + try: + lock.acquire() + proc() + finally: + lock.release() + + +def call(proc, *args): + proc(*args) + +path = Union[str, bytes] + +def find(path: path, **kvs) -> list[bytes]: + """Wrapper around find(1).""" + cmdline = ['find', path] + for key, value in kvs.items(): + cmdline.append(f'-{key}') + cmdline.append(value) + cmdline.append('-print0') + + cmd = subprocess.run(cmdline, capture_output=True, check=True) + return (f for f in cmd.stdout.split(b'\0') if f) + + +class PuppetParseError(Exception): + def __init__(self, code, msg): + super().__init__() + self.code = code + self.msg = msg + + def __repr__(self): + return f'PuppetParserError({self.code}, {self.msg})' + + def __str__(self): + return repr(self) + + +def puppet_parse(file: path) -> bytes: + with subprocess.Popen( + ['puppet', 'parser', 'dump', '--format', 'json', file], + stdout=subprocess.PIPE, + stderr=subprocess.PIPE) as cmd: + if cmd.retuncode and cmd.returncode != 0: + raise PuppetParseError(cmd.returncode, cmd.stderr.read().decode('UTF-8')) + + json_data = cmd.stdout.read() + + if (value := cmd.wait()) != 0: + raise PuppetParseError(value, cmd.stderr.read().decode('UTF-8')) + + return json_data + + +def parse_files(files: list[path]) -> Generator[model.PuppetFile]: + for i, file in enumerate(files): + try: + stat = os.stat(file) + + last_modify = stat.st_mtime + old_object = model.PuppetFile.query \ + .where(model.PuppetFile.path == file) \ + .first() + + if old_object and old_object.last_parse > last_modify: + # file unchanged since our last parse, skip + continue + + print(f'{i}/{len(files)}: {file}') + + if old_object: + puppet_file = old_object + else: + puppet_file = model.PuppetFile(path=file) + puppet_file.last_parse = time.time() + # m.json = puppet_parse(file) + + yield puppet_file + + except PuppetParseError as err: + # TODO cache error + print('Error:', err) + continue + + +def interpret_file(json_data: dict) -> list[str]: + """Find all classes in json-representation of file.""" + top = json_data['^'] + if top[0] == 'class': + tmp = top[1]['#'] + idx = tmp.index('name') + return [tmp[idx + 1]] + # print(tmp[idx + 1]) + elif top[0] == 'block': + ret_value = [] + for element in top[1:]: + if element['^'][0] == 'class': + tmp = element['^'][1]['#'] + idx = tmp.index('name') + ret_value.append(tmp[idx + 1]) + return ret_value + else: + return [] + + + +def enumerate_files(path_base, environment_name): + path = os.path.join(path_base, environment.name) + + files = list(find(path, type='f', name='*.pp')) + + try: + for puppet_file in parse_files(files): + with open(puppet_file.path, 'rb') as f: + checksum = hashlib.sha256(f.read()).hexdigest() + # Returns puppet_file.path, relative to path_base + puppet_file.path = os.path.relpath(puppet_file.path, path.encode('UTF-8')) + # TODO does flask want the whole environment object? + puppet_file.environment = environment.id + puppet_file.checksum = checksum + db.session.add(puppet_file) + + finally: + db.session.commit() + + +def run(path_base: path, environment_name: str): + + ### Ensure that we have oru environment + environment = model.PuppetEnvironment.query.where(model.PuppetEnvironment.name == environment_name).first() + if not environment: + environment = model.PuppetEnvironment(name=environment_name) + db.session.add(environment) + # TODO does this update the environment object + db.session.commit() + + ### Identify all puppet files, and note the base of their content + # enumerate_files(path_base, environment_name) + + + ### Find all puppet files which we haven't parsed + + result = db.engine.execute(text(""" + SELECT + f.id, + f.path, + f.last_parse, + f.checksum, + env.name + FROM puppet_file f + LEFT OUTER JOIN puppet_file_content c + ON f.checksum = c.checksum + LEFT JOIN puppet_environment env + ON f.environment = env.id + WHERE c.json IS NULL + """)) + + # db_lock = Lock() + threads = [] + q = Queue() + for (id, path, last, checksum, environment) in result: + print(environment, path) + # return + full_path = os.path.join(path_base.encode('UTF-8'), environment.encode('UTF-8'), path) + + with open(full_path, 'rb') as f: + current_checksum = hashlib.sha256(f.read()).hexdigest() + + if current_checksum != checksum: + print(f'Checksum changed for {environment}/{path}') + # db.engine.execute(model.PuppetFile.delete().where(model.PuppetFile.id == id)) + continue + + thread = Thread(target=lambda checksum, full_path: (checksum, puppet_parse(full_path)), + args=(checksum, full_path), + name=f'{environment}/{path}') + thread.start() + threads.append(thread) + + try: + # for thread in threads: + # print(f'Waiting on {thread.name}') + # thread.join() + # print(f'{thread.name} joined') + while not q.empty(): + print('Getting something from queue') + (checksum, item) = q.get() + print(checksum) + pfc = model.PuppetFileContent(checksum=checksum, json=item) + db.session.add(pfc) + q.task_done() + finally: + db.session.commit() + + return + + try: + for puppet_file in model.PuppetFile.query.all(): + try: + class_names = interpret_file(json.loads(os.path.join(path, puppet_file.json))) + for class_name in class_names: + db.session.add(model.PuppetClass( + class_name=class_name, + comes_from=puppet_file)) + except Exception as e: + print(e) + print(f'Failed: {puppet_file.path}') + finally: + db.session.commit() diff --git a/pyenc/model.py b/pyenc/model.py index 0a240d9..e774014 100644 --- a/pyenc/model.py +++ b/pyenc/model.py @@ -43,26 +43,66 @@ class Host(db.Model): for column in self.__table__.columns} +class PuppetEnvironment(db.Model): + """ + A puppet environment. + + An enviromnet is a collection of modules, but here we only keep + the files of the modules, in PuppetFile. + """ + __tablename__ = 'puppet_environment' + id = db.Column(db.Integer, primary_key=True) + name = db.Column(db.Text, nullable=False) + + class PuppetFile(db.Model): """ Puppet source code file. Keeps track of known puppet files. Each file contains 0 to many puppet classes. + + Each file is uniquely identified by the pair (path, environment). """ __tablename__ = 'puppet_file' id = db.Column(db.Integer, primary_key=True) # Where we found the file + # TODO normalize this to path = db.Column(db.Text, nullable=False) - # Output of 'puppet parser dump --format json ' - json = db.Column(db.Text, nullable=False) + + environment = db.Column(db.Integer, db.ForeignKey(f'{PuppetEnvironment.__tablename__}.id')) + + # Checksum of the content, should be usable as a key in + # PuppetFileContent + # TODO flask weak keys? + checksum = db.Column(db.Text) + # When we last read data into json last_parse = db.Column(db.Float) # classes = db.relationship('PuppetClass', back_populates='comes_from') classes = db.relationship('PuppetClass', backref='comes_from') +class PuppetFileContent(db.Model): + """ + (Parsed) contents of puppet source files. + + Separate from PuppetFile since many environments can share files, + and I don't want to store reduntand data. + """ + __tablename__ = 'puppet_file_content' + + id = db.Column(db.Integer, primary_key=True) + + # Checksum of the original file + checksum = db.Column(db.Text, nullable=False) + + # Output of 'puppet parser dump --format json ' + json = db.Column(db.Text, nullable=False) + + + class PuppetClass(db.Model): """ A puppet class. diff --git a/pyenc/templates/start_page.html b/pyenc/templates/start_page.html index 8df08a3..268f7d1 100644 --- a/pyenc/templates/start_page.html +++ b/pyenc/templates/start_page.html @@ -19,6 +19,7 @@
  • + Soruce
  • {% endwith %} {% endfor %} -- cgit v1.2.3