pyenc/enumerate_classes.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296

"""
Load all puppet files in an environment, parse them, and store the
parsed data in the database.
"""

# TODO write proper tests,
# especially ones which test environments

from typing import Union
import hashlib
import json
import os
import os.path
import subprocess
# import time
import traceback

from sqlalchemy.sql import text

# import pyenc
from pyenc.app.model import db
from pyenc.app import model


Path = Union[str, bytes]


def find(path: Path, **kvs) -> list[str]:
    """
    Wrapper around find(1).

    variables:
    path -- base path for the find operation

    key word args:
    any that find(1) takes, but with the leading dash (-) removed.
    Each keyword is passed on as a `-key value` pair, in the order
    given.

    Returns a list with the path of each match, decoded as UTF-8.

    Raises subprocess.CalledProcessError if find(1) exits non-zero.
    """
    cmdline = ['find', path]
    for key, value in kvs.items():
        cmdline += [f'-{key}', value]
    cmdline.append('-print0')

    cmd = subprocess.run(cmdline, capture_output=True, check=True)
    # -print0 separates entries with NUL bytes. The output ends with a
    # NUL, so the split leaves an empty last element which is dropped.
    # A list (rather than a generator) is returned so that the result
    # can be iterated more than once and matches the annotation.
    return [f.decode('UTF-8') for f in cmd.stdout.split(b'\0') if f]


class PuppetParseError(Exception):
    """
    Error holding failure result of `puppet parser dump`.

    Attributes:
    code -- exit status reported for the failed command
    msg -- error output of the failed command
    """
    def __init__(self, code, msg):
        # Forward the arguments so that `args` is populated. Copying or
        # pickling an exception re-creates it by calling the class with
        # `args`, which fails for a two-argument constructor if `args`
        # is left empty.
        super().__init__(code, msg)
        self.code = code
        self.msg = msg

    def __repr__(self):
        # Use the real class name, so the text matches the exception
        # type (and any subclass of it).
        return f'{type(self).__name__}({self.code}, {self.msg})'

    def __str__(self):
        return repr(self)


def puppet_parse(file: Path) -> str:
    """
    Runs the external puppet parser, and returns its json output.

    The output is returned as text, exactly as written by the parser;
    it is not decoded here.

    Note that this is really slow.

    file -- Path to the file to check

    Raises PuppetParseError, carrying the exit status and the contents
    of stderr, if the parser exits with a non-zero status.
    """
    # subprocess.run collects stdout and stderr concurrently. Reading
    # one pipe to the end before touching the other can deadlock when
    # the child fills the pipe which isn't being read.
    cmd = subprocess.run(
        ['puppet', 'parser', 'dump', '--format', 'json', file],
        text=True,
        capture_output=True,
        check=False)

    if cmd.returncode != 0:
        raise PuppetParseError(cmd.returncode, cmd.stderr)

    return cmd.stdout


def interpret_file(json_data: dict) -> list[str]:
    """
    Collect the names of the puppet classes declared in a parsed file.

    json_data -- json-representation of one file, as a dict

    Only a class at the top of the file, or classes placed directly
    inside a top-level block, are looked at. Any other top-level
    element gives an empty list.
    """
    def declared_name(node):
        # The class name is the entry following 'name' in the list
        # stored under '#'.
        attributes = node[1]['#']
        return attributes[attributes.index('name') + 1]

    root = json_data['^']
    kind = root[0]

    if kind == 'class':
        return [declared_name(root)]

    if kind == 'block':
        return [declared_name(child['^'])
                for child in root[1:]
                if child['^'][0] == 'class']

    return []


def enumerate_files(path_base, environment):
    """
    Enumerate all puppet files in a puppet environment.

    Every `*.pp` file placed directly in a directory named `manifests`
    somewhere below the environment's directory is checksummed
    (SHA-256), and inserted into puppet_file. If a row for the same
    (path, environment) already exists its checksum is updated instead.

    After this the query
    >>> SELECT path FROM puppet_file WHERE environment_id = :environment
    includes every such file present *right now*.
    NOTE rows for files which have since been removed from disk are
    not deleted here.

    Arguments:
    path_base -- directory in which the environment's directory resides
    environment -- environment to scan, its `name` is the name of the
                   directory and its `id` is stored with each file
    """
    path = os.path.join(path_base, environment.name)
    # find(1) is only run once; it walks the whole environment.
    files = [f for f in find(path, type='f', name='*.pp')
             if os.path.basename(os.path.dirname(f)) == 'manifests']

    try:
        for manifest in files:
            with open(manifest, 'rb') as f:
                checksum = hashlib.sha256(f.read()).hexdigest()

            # This works in at least postgres and sqlite3
            db.engine.execute(text("""
            INSERT INTO puppet_file (path, environment_id, checksum)
            VALUES (:path, :environment, :checksum)
            ON CONFLICT (path, environment_id)
            DO UPDATE SET checksum = EXCLUDED.checksum
            """), {
                # Stored relative to the environment's directory
                'path': os.path.relpath(manifest, path),
                'environment': environment.id,
                'checksum': checksum,
            })

    finally:
        db.session.commit()


def ensure_environment(name):
    """
    Look up the environment called `name`, creating it when missing.

    Returns the model.Environment object for that name. If none was
    stored a new one is added to the session and committed before it
    is returned.
    """
    lookup = model.Environment.query.where(model.Environment.name == name)
    existing = lookup.one_or_none()
    if existing:
        return existing

    created = model.Environment(name=name)
    db.session.add(created)
    # The commit also updates the new object, filling in
    # autogenerated fields.
    db.session.commit()
    return created


def run(path_base: Path = '/etc/puppetlabs/code/environments',
        environment_name: str = 'production'):
    """
    Runs the class enumeration.

    1. Makes sure the environment exists in the database.
    2. Records all puppet files of the environment, with checksums.
    3. Parses every file whose checksum has no stored parse result,
       and stores the parser output in puppet_file_content.
    4. Extracts class names from all files which have parsed content,
       and records the classes, which environment they belong to, and
       which file they came from.

    Progress and errors are printed to stdout. A file which fails to
    parse, or which changed while being parsed, is skipped.

    Arguments:
    path_base -- Path to where each puppet environment resides
    environment_name -- Which puppet environment to parse
    """

    # Ensure that we have our environment
    environment = ensure_environment(environment_name)

    # Identify all puppet files, and record a checksum of their content
    enumerate_files(path_base, environment)

    # Find all puppet files which we haven't parsed
    subexpr = model.db.session \
                   .query(model.PuppetFile.path,
                          model.PuppetFile.checksum,
                          # Selects any of the available environments. Since the checksum
                          # is the same the file should also be the same, regardless of
                          # which environment we chose
                          model.db.func.min(model.PuppetFile.environment_id).label('env_id')) \
                   .outerjoin(model.PuppetFileContent,
                              model.PuppetFile.checksum == model.PuppetFileContent.checksum) \
                   .where(model.PuppetFileContent.json.is_(None)) \
                   .group_by(model.PuppetFile.checksum,
                             model.PuppetFile.path) \
                   .cte()

    base = model.db.session \
                   .query(subexpr.c.path,
                          subexpr.c.checksum,
                          model.Environment.name) \
                   .join(model.Environment,
                         model.Environment.id == subexpr.c.env_id)

    files = base.all()
    # Count the rows we actually got, instead of asking the database a
    # second time (which costs a query, and might give another answer).
    count = len(files)

    db.session.commit()

    # The query above is grouped on (checksum, path), so identical
    # content found under different paths shows up once per path.
    # Each checksum only needs to be parsed and stored once.
    handled_checksums = set()

    # Parse all puppet files, and store their output into puppet_file_content
    try:
        for (i, (path, checksum, env)) in enumerate(files):
            print(f'\x1b[2K{env} {path}')
            print(f'{i} / {count}', end='\r')

            if checksum in handled_checksums:
                continue

            full_path = os.path.join(path_base, env, path)

            try:
                item = puppet_parse(full_path)
            except PuppetParseError as e:
                print(e)
                # Identical content would fail in the same way
                handled_checksums.add(checksum)
                continue

            # Check that the file we just parsed is the file we
            # expected.
            # NOTE this is technically incorrect, consider
            # | Us                     | Attacker     |
            # |------------------------|--------------|
            # | initial checksum       |              |
            # |                        | replace file |
            # | parse                  |              |
            # |                        | restore file |
            # | second checksum (this) |              |

            with open(full_path, 'rb') as f:
                current_checksum = hashlib.sha256(f.read()).hexdigest()

            if current_checksum != checksum:
                # Not marked as handled, another path with the same
                # recorded checksum may still hold the expected content.
                print(f'Checksum changed for {env}/{path}')
                continue

            # File parsed was file we expected to parse, add it to the
            # database
            pfc = model.PuppetFileContent(checksum=checksum,
                                          json=item)
            db.session.add(pfc)
            handled_checksums.add(checksum)

        print('loop finished')
    finally:
        # TODO sqlite fails here, complains that the "database is locked"
        db.session.commit()

    # Interpret the parsed result of all parsed puppet files
    # This takes a few seconds
    for file in model.PuppetFile.query.where(model.PuppetFile.content).all():
        try:
            class_names = interpret_file(json.loads(file.content.json))
            for class_name in class_names:
                db.engine.execute(text("""
                INSERT INTO puppet_class (name)
                VALUES (:name)
                ON CONFLICT (name) DO NOTHING
                """), {'name': class_name})

                # Add class to environment (if not already there).
                # The loop covers files from every environment, so the
                # class is tied to the environment of the file it was
                # found in, rather than to the environment being run.
                db.engine.execute(text("""
                INSERT INTO environment_classes (environment_id, class_id)
                SELECT :env, id FROM puppet_class WHERE puppet_class.name = :name
                ON CONFLICT (environment_id, class_id) DO NOTHING
                """), {'env': file.environment_id, 'name': class_name})

                # Add class to file mapping (if not already there)
                db.engine.execute(text("""
                INSERT INTO class_files (file_id, class_id)
                SELECT :file, id FROM puppet_class WHERE puppet_class.name = :name
                ON CONFLICT (file_id, class_id) DO NOTHING
                """), {'file': file.id, 'name': class_name})
        except Exception as e:
            # Best effort: one broken file shouldn't stop the others
            print(f'Error for {file.id} ({file.path}) - {e}')
            traceback.print_exc()

    db.session.commit()


def gc_puppet_files():
    """
    Remove unused puppet file content.

    Removes all puppet file contents which no longer has an "owning"
    file, that is, rows of puppet_file_content whose checksum isn't
    found in any row of puppet_file.
    """

    # The alias `pfc` only exists inside the subquery, the outer
    # DELETE has to refer to its own (unaliased) id column.
    db.engine.execute(text("""
    DELETE FROM puppet_file_content WHERE id IN
    ( SELECT pfc.id FROM puppet_file_content pfc
      LEFT JOIN puppet_file f ON pfc.checksum = f.checksum
      WHERE f.id IS NULL
    ) """))