"""This module handles the mechanics around easily pulling in Bro Log data.
The read_log method is a generator (in the python sense) for rows in a Bro log,
because of this, it's memory efficient and does not read the entire file into memory.
"""
import datetime
import optparse
import time


class BroLogReader(object):
    """This class implements a Python based Bro Log Reader."""

    def __init__(self, convert_datetimes=True):
        """Init for BroLogReader."""
        self.delimiter = '\t'
        self.convert_datetimes = convert_datetimes
    def read_log(self, logfile):
        """The read_log method returns a memory efficient generator for rows in a Bro log.

        Usage:
            rows = my_bro_reader.read_log(logfile)
            for row in rows:
                do something with row

        Args:
            logfile: The Bro log file (an open file handle).
        """
        # Make sure we're at the beginning
        logfile.seek(0)

        # First parse the header of the Bro log
        field_names, _ = self._parse_bro_header(logfile)

        # Note: csv.DictReader on Bro files was doing something odd with
        # generator output that affected zeroRPC and gave a
        # 'could not route _zpc_more' error, so this reader is hand-rolled
        # with a short sleep at the end, which seems to fix it.
        for _line in logfile:
            _line = _line.strip()
            if not _line.startswith('#close'):
                yield self._cast_dict(dict(zip(field_names, _line.split(self.delimiter))))
            else:
                time.sleep(.1)  # Give zeroRPC time to finish its messages
                break
    def _parse_bro_header(self, logfile):
        r"""This method parses the Bro log header section.

        Note: Documentation on the header format is sparse, so this method
        makes a few simplifying assumptions and skips most of the header.
            Assumption 1: The delimiter is a tab.
            Assumption 2: Types are either time, string, int or float.
            Assumption 3: The header always ends with #fields and #types as
                          the last two lines.

        Format example:
            #separator \x09
            #set_separator ,
            #empty_field (empty)
            #unset_field -
            #path httpheader_recon
            #fields ts origin useragent header_events_json
            #types time string string string

        Args:
            logfile: The Bro log file.

        Returns:
            A tuple of two lists: one for field names, one for field types.
        """
        # Skip until we find the #fields line
        _line = next(logfile)
        while not _line.startswith('#fields'):
            _line = next(logfile)

        # Read in the field names
        _field_names = _line.strip().split(self.delimiter)[1:]

        # Read in the field types
        _line = next(logfile)
        _field_types = _line.strip().split(self.delimiter)[1:]

        # Return the header info
        return _field_names, _field_types
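
    # For the header example shown in the docstring above, this parser
    # would return:
    #   (['ts', 'origin', 'useragent', 'header_events_json'],
    #    ['time', 'string', 'string', 'string'])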
    def _cast_dict(self, data_dict):
        """Internal method that ensures the dictionary values are cast
        into the correct types, instead of treating everything like a
        string from the csv file.

        Args:
            data_dict: Dictionary containing Bro log data.

        Returns:
            The cleaned data dict.
        """
        for key, value in data_dict.items():
            data_dict[key] = self._cast_value(value)

        # Fixme: resp_body_data can be very large so removing it for now
        if 'resp_body_data' in data_dict:
            del data_dict['resp_body_data']
        return data_dict
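
    # For illustration, a raw row such as (hypothetical values)
    #   {'ts': '1425600000.0', 'id.resp_p': '80'}
    # comes back with each value passed through _cast_value() in place.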
    def _cast_value(self, value):
        """Internal method that casts a single value into the correct type,
        instead of treating everything like a string from the csv file.

        Args:
            value: The value to be cast.

        Returns:
            The cast value.
        """
        # Try to convert to a datetime (if requested)
        if self.convert_datetimes:
            try:
                date_time = datetime.datetime.fromtimestamp(float(value))
                if datetime.datetime(1970, 1, 1) > date_time:
                    raise ValueError
                return date_time
            except ValueError:
                pass

        # Next try a set of primitive types
        for cast in (int, float):
            try:
                return cast(value)
            except ValueError:
                continue

        # Fall back to the original string
        return value
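
    # Illustrative casting behavior (hypothetical values):
    #   _cast_value('1425600000.0') -> a datetime for that epoch timestamp
    #                                  (when convert_datetimes is True)
    #   _cast_value('80')           -> 80 (int) when the datetime conversion
    #                                  is disabled or fails
    #   _cast_value('tcp')          -> 'tcp' (left as a string)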


if __name__ == '__main__':
    # Handle command-line arguments
    PARSER = optparse.OptionParser()
    PARSER.add_option('--logfile', default=None, help='Logfile to read from. Default: %default')
    (OPTIONS, ARGUMENTS) = PARSER.parse_args()
    print(OPTIONS, ARGUMENTS)

    # Create a Bro log reader and pull rows from the logfile
    BRO_LOG = BroLogReader()
    with open(OPTIONS.logfile, 'r') as LOGFILE:
        for row in BRO_LOG.read_log(LOGFILE):
            print(row)
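
# Example invocation (hypothetical path to a Bro log):
#   python bro_log_reader.py --logfile=http.log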