Source code for workbench.server.workbench

"""Workbench: Open Source Security Framework """

from gevent import monkey; monkey.patch_all(thread=False) # Monkey!
from gevent import signal as gevent_signal
import signal
import sys, os
import zerorpc
import zmq
import logging
logging.basicConfig()
import StringIO
import json
import hashlib
import inspect
import funcsigs
import ConfigParser
import magic

# Workbench server imports
try:
    from . import data_store
    from . import els_indexer
    from . import neo_db
    from . import plugin_manager
    from . import help_system
    from bro import bro_log_reader

# Okay, this happens when you're running workbench in a debugger, so having
# this is super handy and we'll keep it even though it hurts the coverage score.
except ValueError:
    import data_store
    import els_indexer
    import neo_db
    import plugin_manager
    import help_system
    from bro import bro_log_reader


class WorkBench(object):
[docs] """Workbench: Open Source Security Framework.""" def __init__(self, store_args=None, els_hosts=None, neo_uri=None): """Initialize the Framework. Args: store_args: Dictionary with keys uri,database,samples_cap, worker_cap. els_hosts: The address where Elastic Search Indexer is running. neo_uri: The address where Neo4j is running. """ # Announce Version try: print '<<< Workbench Version %s >>>' % sys.modules['workbench'].__version__ except (AttributeError, KeyError): print '<<< Workbench Version %s >>>' % 'DEBUGGING' # Open DataStore self.data_store = data_store.DataStore(**store_args) # ELS Indexer try: self.indexer = els_indexer.ELSIndexer(**{'hosts': els_hosts} if els_hosts else {}) except SystemExit: print 'Could not connect to ELS. Is it running?' self.indexer = els_indexer.ELSStubIndexer(**{'uri': neo_uri} if neo_uri else {}) # Neo4j DB try: self.neo_db = neo_db.NeoDB(**{'uri': neo_uri} if neo_uri else {}) except RuntimeError: print 'Could not connect to Neo4j DB. Is it running? $ neo4j start' self.neo_db = neo_db.NeoDBStub(**{'uri': neo_uri} if neo_uri else {}) # Create Plugin Manager self.plugin_meta = {} plugin_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)),'../workers') self.plugin_manager = plugin_manager.PluginManager(self._new_plugin, plugin_dir=plugin_dir) # Get Help System self.help_system = help_system.HelpSystem(self) ####################### # Sample Methods ####################### def store_sample(self, filename, input_bytes, type_tag):
[docs] """ Store a sample into the DataStore. Args: filename: name of the file (used purely as meta data not for lookup) input_bytes: the actual bytes of the sample e.g. f.read() type_tag: ('exe','pcap','pdf','json','swf', or ...) Returns: the md5 of the sample. """ # If the sample comes in with an unknown type_tag try to determine it if type_tag == 'unknown': print '<<< Unknown File: Trying to Determine Type >>>' type_tag = self.guess_type_tag(input_bytes) return self.data_store.store_sample(filename, input_bytes, type_tag) def get_sample(self, md5):
[docs] """ Get a sample from the DataStore. Args: md5: the md5 of the sample Returns: A dictionary of meta data about the sample which includes a ['raw_bytes'] key that contains the raw bytes. """ sample = self.data_store.get_sample(md5) return {'sample': sample} def get_sample_window(self, type_tag, size):
[docs] """ Get a sample from the DataStore. Args: type_tag: the type of samples ('pcap','exe','pdf') size: the size of the window in MegaBytes (10 = 10MB) Returns: A list of md5s representing the newest samples within the size window """ return self.data_store.get_sample_window(type_tag, size) def has_sample(self, md5):
[docs] """ Do we have this sample in the DataStore. Args: md5: the md5 of the sample Returns: True or False """ return self.data_store.has_sample(md5) def list_samples(self, predicate={}):
[docs] """List all samples that meet the predicate or all if predicate is not specified. Args: predicate: Match samples against this predicate (or all if not specified) Returns: List of dictionaries with matching samples {'md5':md5, 'filename': 'foo.exe', 'type_tag': 'exe'} """ return self.data_store.list_samples(predicate) @zerorpc.stream

    @zerorpc.stream
    def stream_sample(self, md5, max_rows):
[docs] """ Stream the sample by giving back a generator, typically used on 'logs'. Args: md5: the md5 of the sample max_rows: the maximum number of rows to return (None for all) Returns: A generator that yields rows of the file/log """ # Grab the sample and it's raw bytes sample = self.data_store.get_sample(md5) raw_bytes = sample['raw_bytes'] # Figure out the type of file to be streamed type_tag = sample['type_tag'] if type_tag == 'bro': bro_log = bro_log_reader.BroLogReader(convert_datetimes=False) mem_file = StringIO.StringIO(raw_bytes) generator = bro_log.read_log(mem_file) return generator elif type_tag == 'els_query': els_log = json.loads(raw_bytes) # Try to determine a couple of different types of ELS query results if 'fields' in els_log['hits']['hits'][0]: generator = (row['fields'] for row in els_log['hits']['hits'][:max_rows]) else: generator = (row['_source'] for row in els_log['hits']['hits'][:max_rows]) return generator elif type_tag == 'log': generator = ({'row':row} for row in raw_bytes.split('\n')[:max_rows]) return generator elif type_tag == 'json': generator = (row for row in json.loads(raw_bytes)[:max_rows]) return generator else: raise RuntimeError('Cannot stream file %s with type_tag:%s' % (md5, type_tag)) def guess_type_tag(self, input_bytes):
[docs] """ Try to guess the type_tag for this sample """ mime_to_type = {'application/x-dosexec': 'exe', 'application/pdf': 'pdf', 'application/zip': 'zip', 'application/jar': 'jar', 'application/vnd.ms-cab-compressed': 'cab', 'text/plain': 'txt', 'image/gif': 'gif', 'image/jpeg': 'jpg', 'image/png': 'png', 'text/html': 'html', 'application/vnd.ms-fontobject': 'ms_font', 'application/x-shockwave-flash': 'swf'} # See what filemagic can determine with magic.Magic(flags=magic.MAGIC_MIME_TYPE) as mag: mime_type = mag.id_buffer(input_bytes[:1024]) if mime_type in mime_to_type: return mime_to_type[mime_type] else: print '<<< Sample Type could not be Determined >>>' return 'unknown' ####################### # Index Methods ####################### def index_sample(self, md5, index_name):
[docs] """ Index a stored sample with the Indexer. Args: md5: the md5 of the sample index_name: the name of the index Returns: Nothing """ generator = self.stream_sample(md5, None) for row in generator: self.indexer.index_data(row, index_name) def index_worker_output(self, worker_name, md5, index_name, subfield):
[docs] """ Index worker output with the Indexer. Args: worker_name: 'strings', 'pe_features', whatever md5: the md5 of the sample index_name: the name of the index subfield: index just this subfield (None for all) Returns: Nothing """ # Grab the data if subfield: data = self.work_request(worker_name, md5)[worker_name][subfield] else: data = self.work_request(worker_name, md5)[worker_name] # Okay now index the data self.indexer.index_data(data, index_name=index_name, doc_type='unknown') def search(self, index_name, query):
[docs] """ Search a particular index in the Indexer Args: index_name: the name of the index query: the query against the index Returns: All matches to the query """ return self.indexer.search(index_name, query) ####################### # Graph Methods ####################### def add_node(self, node_id, name, labels):
[docs] """ Add a node to the graph with name and labels. Args: node_id: the unique node_id e.g. 'www.evil4u.com' name: the display name of the node e.g. 'evil4u' labels: a list of labels e.g. ['domain','evil'] Returns: Nothing """ self.neo_db.add_node(node_id, name, labels) def has_node(self, node_id):
[docs] """ Does the Graph DB have this node Args: node_id: the unique node_id e.g. 'www.evil4u.com' Returns: True/False """ return self.neo_db.has_node(node_id) def add_rel(self, source_id, target_id, rel):
[docs] """ Add a relationship: source, target must already exist (see add_node) 'rel' is the name of the relationship 'contains' or whatever. Args: source_id: the unique node_id of the source target_id: the unique node_id of the target rel: name of the relationship Returns: Nothing """ self.neo_db.add_rel(source_id, target_id, rel) def clear_graph_db(self):
[docs] """ Clear the Graph Database of all nodes and edges. Args: None Returns: Nothing """ self.neo_db.clear_db() def clear_db(self):
[docs] """ Clear the Main Database of all samples and worker output. Args: None Returns: Nothing """ self.data_store.clear_db() ####################### # Work Request Methods ####################### def work_request(self, worker_name, md5, subkeys=None):
[docs] """ Make a work request for an existing stored sample. Args: worker_name: 'strings', 'pe_features', whatever md5: the md5 of the sample subkeys: just return a subfield e.g. 'foo' or 'foo.bar' (None for all) Returns: The output of the worker or just the subfield of the worker output """ # Check valid if worker_name not in self.plugin_meta.keys(): raise RuntimeError('Invalid work request for class %s (not found)' % (worker_name)) # Get results (even if we have to wait for them) # Note: Yes, we're going to wait. Gevent concurrent execution will mean this # code gets spawned off and new requests can be handled without issue. work_results = self._recursive_work_resolver(worker_name, md5) # Subkeys? (Fixme this is super klutzy) if subkeys: try: sub_results = {} for subkey in subkeys: tmp = work_results[worker_name] for key in subkey.split('.'): tmp = tmp[key] sub_results[key] = tmp work_results = sub_results except (KeyError, TypeError): raise RuntimeError('Could not get one or more subkeys for: %s' % (work_results)) # Clean it and ship it work_results = self.data_store.clean_for_serialization(work_results) return work_results @zerorpc.stream

    @zerorpc.stream
    def batch_work_request(self, worker_name, kwargs={}):
[docs] """Make a batch work request for an existing set of stored samples. A subset of sample can be specified with kwargs. Args: worker_name: 'strings', 'pe_features', whatever kwargs: a way of specifying subsets of samples ({} for all) type_tag: subset based on sample type (e.g. type_tag='exe') md5_list: subset just the samples in this list subkeys: return just this subkey (e.g. 'foo' or 'foo.bar') Returns: A generator that yields rows of worker output or subfields of the worker output """ type_tag = kwargs.get('type_tag',None) md5_list = kwargs.get('md5_list',None) subkeys = kwargs.get('subkeys',None) # If no md5_list specified put all samples (of type type_tag if not None) if not md5_list: md5_list = self.data_store.all_sample_md5s(type_tag) # Loop through all the md5s and return a generator with yield for md5 in md5_list: try: if subkeys: yield self.work_request(worker_name, md5, subkeys) else: yield self.work_request(worker_name, md5)[worker_name] except KeyError: continue def store_sample_set(self, md5_list):
[docs] """ Store a sample set (which is just a list of md5s). Note: All md5s must already be in the data store. Args: md5_list: a list of the md5s in this set (all must exist in data store) Returns: The md5 of the set (the actual md5 of the set """ for md5 in md5_list: if not self.has_sample(md5): raise RuntimeError('Sample not found all items in sample_set\ must be in the datastore: %s (not found)' % (md5)) set_md5 = hashlib.md5(str(md5_list)).hexdigest() self._store_work_results({'md5_list':md5_list}, 'sample_set', set_md5) return set_md5 def get_sample_set(self, md5):
[docs] """ Store a sample set (which is just a list of md5s). Args: md5_list: a list of the md5s in this set (all must exist in data store) Returns: The md5 of the set (the actual md5 of the set """ return self._get_work_results('sample_set', md5) @zerorpc.stream

    @zerorpc.stream
    def stream_sample_set(self, md5):
[docs] """ Stream a sample set (which is just a list of md5s). Args: md5: the md5 of the sample_set Returns: A generator that yields the md5s in the sample set """ for md5 in self._get_work_results('sample_set', md5)['md5_list']: yield md5 def get_datastore_uri(self):
[docs] """ Gives you the current datastore URL. Args: None Returns: The URI of the data store currently being used by Workbench """ return self.data_store.get_uri() ################## # Help ################## def help(self, cli=False):
[docs] """ Returns help commands """ return self.help_system.help(cli) def help_basic(self, cli=False):
[docs] """ Returns basic help commands """ return self.help_system.help_basic(cli) def help_commands(self, cli=False):
[docs] """ Returns a big string of Workbench commands and signatures """ return self.help_system.help_commands(cli) def help_command(self, command, cli=False):
[docs] """ Returns a specific Workbench command and docstring """ return self.help_system.help_command(command,cli) def help_workers(self, cli=False):
[docs] """ Returns a big string of the loaded Workbench workers and their dependencies """ return self.help_system.help_workers(cli) def help_worker(self, worker, cli=False):
[docs] """ Returns a specific Workbench worker and docstring """ return self.help_system.help_worker(worker, cli) def help_advanced(self, cli=False):
[docs] """ Returns advanced help commands """ return self.help_system.help_advanced(cli) def help_everything(self, cli=False):
[docs] """ Returns advanced help commands """ return self.help_system.help_everything(cli) ################## # Introspection ################## def list_all_commands(self):
[docs] """ Returns a list of all the Workbench commands""" commands = [name for name, _ in inspect.getmembers(self, predicate=inspect.ismethod) if not name.startswith('_')] commands.append('batch_work_request') # I think the zerorpc decorator messes up inspect return commands def list_all_workers(self):
[docs] """ List all the currently loaded workers """ return self.plugin_meta.keys() def worker_info(self, worker_name):
[docs] """ Get the information about this worker """ plugin = self.plugin_meta[worker_name] return {'dependencies': plugin['class'].dependencies, 'doc': plugin['class'].__doc__} ################## # Testing ################## def test_worker(self, worker_name):
[docs] """ Run the test for a specific worker """ # First find the plugin try: plugin = self.plugin_meta[worker_name] except KeyError: return '%s worker not found.. misspelled?' % worker_name # Now try to run the test try: return plugin['test']() except (AttributeError, KeyError) as error: output = 'Failure for plugin: %s' % (worker_name) output += 'Error: %s' % error return output #################### # Internal Methods #################### def _new_plugin(self, plugin, mod_time):
""" Internal: This method handles the mechanics around new plugins. """ print '\t- %s: loaded...' % (plugin['name']) plugin['time_stamp'] = mod_time # datetime.datetime.utcnow() self.plugin_meta[plugin['name']] = plugin def _store_work_results(self, results, collection, md5): """ Internal: Stores the work results of a worker.""" self.data_store.store_work_results(results, collection, md5) def _get_work_results(self, collection, md5): """ Internal: Method for fetching work results.""" results = self.data_store.get_work_results(collection, md5) return {collection: results} if results else None # So the trick here is that since each worker just stores it's input dependencies # we can resursively backtrack and all the needed work gets done. def _recursive_work_resolver(self, worker_name, md5): """ Internal: Input dependencies are resursively backtracked, invoked and then passed down the pipeline until htting the requested worker. """ # Looking for the sample or sample_set? if (worker_name == 'sample'): # If we have a sample set with this md5, return it if self.get_sample_set(md5): return self.get_sample_set(md5) # Return the sample (might raise a RuntimeError) return self.get_sample(md5) # Do I actually have this plugin? (might have failed, etc) if (worker_name not in self.plugin_meta): print 'Request for non-existing or failed plugin: %s' % (worker_name) return {} # If the results exist and the time_stamp is newer than the plugin's, I'm done collection = self.plugin_meta[worker_name]['name'] work_results = self._get_work_results(collection, md5) if work_results: if self.plugin_meta[worker_name]['time_stamp'] < work_results[collection]['__time_stamp']: print 'Returning cached work results for plugin: %s' % (worker_name) return work_results else: print 'Updating work results for new plugin: %s' % (worker_name) # Okay either need to generate (or re-generate) the work results dependencies = self.plugin_meta[worker_name]['dependencies'] dependant_results = {} for dependency in dependencies: dependant_results.update(self._recursive_work_resolver(dependency, md5)) print 'New work for plugin: %s' % (worker_name) work_results = self.plugin_meta[worker_name]['class']().execute(dependant_results) # Enforce dictionary output if not isinstance(work_results, dict): print 'Critical: Plugin %s MUST produce a python dictionary!' % worker_name return None # Store the results and return self._store_work_results(work_results, collection, md5) return self._get_work_results(collection, md5) def _find_element(self,d,k): if k in d: return d[k] submatch = [d[_k][k] for _k in d if k in d[_k]] return submatch[0] if submatch else None def run():
[docs] """ Run the workbench server """ # Load the configuration file relative to this script location config_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'config.ini') workbench_conf = ConfigParser.ConfigParser() config_ini = workbench_conf.read(config_path) if not config_ini: print 'Could not locate config.ini file, tried %s : exiting...' % config_path exit(1) # Pull configuration settings datastore_uri = workbench_conf.get('workbench', 'datastore_uri') database = workbench_conf.get('workbench', 'database') worker_cap = workbench_conf.getint('workbench', 'worker_cap') samples_cap = workbench_conf.getint('workbench', 'samples_cap') # Spin up Workbench ZeroRPC try: store_args = {'uri': datastore_uri, 'database': database, 'worker_cap':worker_cap, 'samples_cap':samples_cap} print '<<< Workbench Server at %s >>>' % ('tcp://0.0.0.0:4242') workbench = zerorpc.Server(WorkBench(store_args=store_args), name='workbench', heartbeat=60) workbench.bind('tcp://0.0.0.0:4242') print '\nWorkbench is ready and feeling super duper!' gevent_signal(signal.SIGTERM, workbench.stop) gevent_signal(signal.SIGINT, workbench.stop) gevent_signal(signal.SIGKILL, workbench.stop) workbench.run() print '\nWorkbench Server Shutting Down... and dreaming of sheep...' except zmq.error.ZMQError: print '\nInfo: Could not start Workbench server (no worries, probably already running...)\n' # Test that just calls main def test():

# Test that just calls main
def test():
    run()


if __name__ == '__main__':
    run()