Source code for workbench.server.bro.bro_log_reader

"""This module handles the mechanics around easily pulling in Bro Log data.

   The read_log method is a generator (in the Python sense) over the rows of a
   Bro log; because of this, it is memory efficient and never reads the entire
   file into memory.
"""

import datetime
import optparse
import time


class BroLogReader(object):
    """Pure-Python reader for tab-separated Bro log files.

    Rows are produced lazily by :meth:`read_log`, one dictionary per log
    line, so the whole file is never held in memory.

    Attributes:
        delimiter: Column separator used to split header and data lines
            (always a tab).
        convert_datetimes: When True, timestamp values are returned as
            ``datetime.datetime`` objects instead of numbers.
    """

    def __init__(self, convert_datetimes=True):
        """Init for BroLogReader.

        Args:
            convert_datetimes: Convert timestamp values into
                ``datetime.datetime`` objects (default True).
        """
        self.delimiter = '\t'
        self.convert_datetimes = convert_datetimes

    @staticmethod
    def _as_text(line):
        """Return ``line`` as a native string.

        Files opened in binary mode hand back ``bytes`` on Python 3, which
        cannot be compared against ``'#close'`` or split on ``'\\t'``.  On
        Python 2 ``bytes`` *is* ``str``, so the line is returned untouched.
        """
        if isinstance(line, bytes) and not isinstance(line, str):
            return line.decode('utf-8', 'replace')
        return line

    def read_log(self, logfile):
        """The read_log method returns a memory efficient generator for rows in a Bro log.

        Usage:
            rows = my_bro_reader.read_log(logfile)
            for row in rows:
                do something with row

        Iteration stops at the ``#close`` trailer line, or at end of file if
        the log has no trailer (e.g. it is still being written).  A file
        without a ``#fields`` header yields no rows.

        Args:
            logfile: The Bro Log file (an open, seekable, line-iterable
                file object).

        Yields:
            One dict per data line, mapping field name to the cast value.
        """

        # Make sure we're at the beginning
        logfile.seek(0)

        # First parse the header of the bro log
        field_names, field_types = self._parse_bro_header(logfile)
        type_by_field = dict(zip(field_names, field_types))

        # Note: csv.DictReader on Bro files was doing something weird with
        #       generator output that affected zeroRPC and gave a
        #       'could not route _zpc_more' error, hence the hand-rolled
        #       parsing and the short sleep once the log is exhausted.
        for raw_line in logfile:
            # Strip only the line terminator: a plain strip() would also eat
            # leading/trailing tabs and shift the columns of a row whose
            # first or last field is empty.
            line = self._as_text(raw_line).rstrip('\r\n')

            if line.startswith('#close'):
                time.sleep(.1)  # Give time for zeroRPC to finish messages
                break

            # A blank line carries no record; skip it rather than emit a
            # bogus one-field row.
            if not line.strip():
                continue

            row = dict(zip(field_names, line.split(self.delimiter)))
            yield self._cast_dict(row, type_by_field)

    def _parse_bro_header(self, logfile):
        """This method tries to parse the Bro log header section.

        Note: The header format is not formally documented anywhere obvious,
        so this makes a few assumptions and skips the lines it does not need.
        Assumption 1: The delimiter is a tab.
        Assumption 2: Types are either time, string, int or float
        Assumption 3: The header always ends with #fields and #types as
                      the last two lines.

        Format example:
            #separator \\x09
            #set_separator	,
            #empty_field	(empty)
            #unset_field	-
            #path	httpheader_recon
            #fields	ts	origin	useragent	header_events_json
            #types	time	string	string	string

        Args:
            logfile: The Bro log file.

        Returns:
            A tuple of 2 lists. One for field names and other for field types.
            Either list is empty when the corresponding header line is
            missing (end of file reached, or the line after ``#fields`` is
            not a ``#types`` line).
        """
        field_names = []
        field_types = []

        # Skip until you find the #fields line
        for raw_line in logfile:
            line = self._as_text(raw_line)
            if not line.startswith('#fields'):
                continue

            # Read in the field names (first token is the '#fields' tag)
            field_names = line.strip().split(self.delimiter)[1:]

            # Read in the types; the default keeps a truncated header from
            # leaking StopIteration into the read_log generator.
            types_line = self._as_text(next(logfile, ''))
            if types_line.startswith('#types'):
                field_types = types_line.strip().split(self.delimiter)[1:]
            break

        # Return the header info
        return field_names, field_types

    def _cast_dict(self, data_dict, field_types=None):
        """Internal method that makes sure any dictionary elements
        are properly cast into the correct types, instead of
        just treating everything like a string from the csv file.

        The dictionary is modified in place and also returned.

        Args:
            data_dict: dictionary containing bro log data.
            field_types: optional dict mapping field name to the Bro type
                declared in the ``#types`` header line.  Fields missing from
                it are cast heuristically.

        Returns:
            Cleaned Data dict.
        """
        # Fixme: resp_body_data can be very large so removing it for now
        # (dropped before casting so the large value is never converted).
        data_dict.pop('resp_body_data', None)

        declared = field_types or {}

        # Iterate over a snapshot of the keys: works on Python 2 and 3 and
        # never walks the dict while it is being assigned to.
        for key in list(data_dict):
            data_dict[key] = self._cast_value(data_dict[key], declared.get(key))

        return data_dict

    def _cast_value(self, value, field_type=None):
        """Internal method that makes sure every value in dictionary
        is properly cast into the correct types, instead of
        just treating everything like a string from the csv file.

        Args:
            value : The value to be casted
            field_type: optional Bro type name for the value's column.  When
                given, only ``'time'`` columns are turned into datetimes, so
                ports, counts and other numbers stay numeric.  When omitted
                (None), any numeric-looking value is tried as a timestamp,
                which is the historical behaviour.

        Returns:
            A casted Value: a ``datetime.datetime`` for timestamps (if
            requested), otherwise an int, a float, or the original string.
        """
        # Try to convert to a datetime (if requested)
        if self.convert_datetimes and field_type in (None, 'time'):
            try:
                date_time = datetime.datetime.fromtimestamp(float(value))
            except (ValueError, OverflowError, OSError):
                # Not a number, or a number outside the platform's timestamp
                # range (e.g. 'inf', '1e20'): fall through to basic types.
                date_time = None
            if date_time is not None and date_time >= datetime.datetime(1970, 1, 1):
                return date_time

        # Try conversion to basic types
        for cast in (int, float, str):
            try:
                return cast(value)
            except ValueError:
                continue
        return value


if __name__ == '__main__':
    # Command-line entry point: print every row of the log given by --logfile.

    # Handle command-line arguments
    PARSER = optparse.OptionParser()
    PARSER.add_option('--logfile', default=None, help='Logfile to read from.  Default: %default')
    (OPTIONS, ARGUMENTS) = PARSER.parse_args()
    # Single-argument print() produces the same output on Python 2 and 3.
    print('%s %s' % (OPTIONS, ARGUMENTS))

    # Without a logfile there is nothing to read; exit with a usage error
    # instead of crashing inside open(None, ...).
    if OPTIONS.logfile is None:
        PARSER.error('--logfile is required')

    # Create a BRO log file reader and pull from the logfile; the context
    # manager guarantees the file handle is closed when reading finishes.
    BRO_LOG = BroLogReader()
    with open(OPTIONS.logfile, 'rb') as LOG_HANDLE:
        RECORDS = BRO_LOG.read_log(LOG_HANDLE)
        for row in RECORDS:
            print(row)
Quick search

Source code for workbench.server.bro.bro_log_reader

Navigation