# Source code for workbench.server.workbench

"""Workbench: Open Source Security Framework """

from gevent import monkey; monkey.patch_all(thread=False) # Monkey!
from gevent import signal as gevent_signal
import signal
import sys, os
import zerorpc
import zmq
import logging
logging.basicConfig()
import StringIO
import json
import hashlib
import inspect
import funcsigs
import ConfigParser
import magic

# Workbench server imports
try:
    from . import data_store
    from . import els_indexer
    from . import neo_db
    from . import plugin_manager
    from . import help_system
    from bro import bro_log_reader

# Okay this happens when you're running workbench in a debugger so having
# this is super handy and we'll keep it even though it hurts coverage score.
except ValueError:
    import data_store
    import els_indexer
    import neo_db
    import plugin_manager
    import help_system
    from bro import bro_log_reader


class WorkBench(object):
    """Workbench: Open Source Security Framework."""

    def __init__(self, store_args=None, els_hosts=None, neo_uri=None):
        """Initialize the Framework.

        Args:
            store_args: Dictionary with keys uri, database, samples_cap, worker_cap
                (None means the DataStore is built with no keyword arguments).
            els_hosts: The address where Elastic Search Indexer is running.
            neo_uri: The address where Neo4j is running.
        """
        # Announce Version
        try:
            version = sys.modules['workbench'].__version__
        except (AttributeError, KeyError):
            version = 'DEBUGGING'
        print('<<< Workbench Version %s >>>' % version)

        # Open DataStore ('or {}' keeps the documented store_args=None default usable)
        self.data_store = data_store.DataStore(**(store_args or {}))

        # ELS Indexer (falls back to the stub when the real indexer bails out)
        # NOTE(review): the stub used to receive {'uri': neo_uri}, which looks like a
        # copy/paste from the Neo4j block below; it now gets the same keyword
        # arguments as the real indexer - confirm ELSStubIndexer accepts 'hosts'.
        els_kwargs = {'hosts': els_hosts} if els_hosts else {}
        try:
            self.indexer = els_indexer.ELSIndexer(**els_kwargs)
        except SystemExit:
            print('Could not connect to ELS. Is it running?')
            self.indexer = els_indexer.ELSStubIndexer(**els_kwargs)

        # Neo4j DB (falls back to the stub when the connection fails)
        neo_kwargs = {'uri': neo_uri} if neo_uri else {}
        try:
            self.neo_db = neo_db.NeoDB(**neo_kwargs)
        except RuntimeError:
            print('Could not connect to Neo4j DB. Is it running?  $ neo4j start')
            self.neo_db = neo_db.NeoDBStub(**neo_kwargs)

        # Create Plugin Manager
        # plugin_meta must exist first: _new_plugin (the callback handed to the
        # manager) writes into it.
        self.plugin_meta = {}
        plugin_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)), '../workers')
        self.plugin_manager = plugin_manager.PluginManager(self._new_plugin, plugin_dir=plugin_dir)

        # Get Help System
        self.help_system = help_system.HelpSystem(self)


    #######################
    # Sample Methods
    #######################
    def store_sample(self, filename, input_bytes, type_tag):
        """Store a sample into the DataStore.

        Args:
            filename: name of the file (used purely as meta data not for lookup)
            input_bytes: the actual bytes of the sample e.g. f.read()
            type_tag: ('exe','pcap','pdf','json','swf', or ...); the value
                'unknown' makes this method guess the tag from the bytes first
        Returns:
            Whatever data_store.store_sample returns (documented as the md5
            of the sample).
        """
        needs_guess = (type_tag == 'unknown')
        if needs_guess:
            print('<<< Unknown File: Trying to Determine Type >>>')
            type_tag = self.guess_type_tag(input_bytes)

        stored = self.data_store.store_sample(filename, input_bytes, type_tag)
        return stored

    def get_sample(self, md5):
        """Fetch a stored sample from the DataStore.

        Args:
            md5: the md5 of the sample
        Returns:
            A one-key dictionary {'sample': <result of data_store.get_sample(md5)>}.
            The wrapped value is documented as the sample's meta data, including
            a ['raw_bytes'] key that contains the raw bytes.
        """
        return {'sample': self.data_store.get_sample(md5)}

    def get_sample_window(self, type_tag, size):
        """Get a window of samples of one type from the DataStore.

        Args:
            type_tag: the type of samples ('pcap','exe','pdf')
            size: the size of the window in MegaBytes (10 = 10MB)
        Returns:
            Whatever data_store.get_sample_window returns (documented as a list
            of md5s representing the newest samples within the size window).
        """
        window = self.data_store.get_sample_window(type_tag, size)
        return window

    def has_sample(self, md5):
        """Report whether the DataStore holds a sample with this md5.

        Args:
            md5: the md5 of the sample
        Returns:
            The result of data_store.has_sample(md5) (documented as True or False).
        """
        found = self.data_store.has_sample(md5)
        return found

    def list_samples(self, predicate=None):
        """List all samples that meet the predicate or all if predicate is not specified.

        Args:
            predicate: Match samples against this predicate (None or {} for all)

        Returns:
            Whatever data_store.list_samples returns; documented as a list of
            dictionaries with matching samples
            {'md5':md5, 'filename': 'foo.exe', 'type_tag': 'exe'}
        """
        # Build a fresh dict per call instead of sharing one mutable default
        # object between every caller.
        if predicate is None:
            predicate = {}
        return self.data_store.list_samples(predicate)

    @zerorpc.stream
    def stream_sample(self, md5, max_rows):
        """Stream the sample by giving back a generator, typically used on 'logs'.

        Args:
            md5: the md5 of the sample
            max_rows: the maximum number of rows to return (None for all)
        Returns:
            An iterator that yields rows of the file/log. An 'els_query'
            sample with no hits yields nothing.
        Raises:
            RuntimeError: if the sample's type_tag is not one of
                'bro', 'els_query', 'log' or 'json'.
        """
        from itertools import islice

        # Grab the sample and its raw bytes
        sample = self.data_store.get_sample(md5)
        raw_bytes = sample['raw_bytes']

        # Figure out the type of file to be streamed
        type_tag = sample['type_tag']

        if type_tag == 'bro':
            bro_log = bro_log_reader.BroLogReader(convert_datetimes=False)
            mem_file = StringIO.StringIO(raw_bytes)
            # islice applies max_rows lazily; a stop of None means "no limit"
            return islice(bro_log.read_log(mem_file), max_rows)

        if type_tag == 'els_query':
            hits = json.loads(raw_bytes)['hits']['hits']
            if not hits:
                # Nothing matched: there is no first hit to inspect
                return iter(())
            # Try to determine a couple of different types of ELS query results
            field = 'fields' if 'fields' in hits[0] else '_source'
            return (row[field] for row in hits[:max_rows])

        if type_tag == 'log':
            return ({'row': row} for row in raw_bytes.split('\n')[:max_rows])

        if type_tag == 'json':
            return (row for row in json.loads(raw_bytes)[:max_rows])

        raise RuntimeError('Cannot stream file %s with type_tag:%s' % (md5, type_tag))

    def guess_type_tag(self, input_bytes):
        """Try to guess the type_tag for this sample.

        Args:
            input_bytes: the bytes of the sample; only the first 1024 are
                handed to filemagic
        Returns:
            The type_tag mapped from the detected mime type, or 'unknown'
            when the mime type is not in the table below.
        """
        mime_to_type = {
            'application/x-dosexec': 'exe',
            'application/pdf': 'pdf',
            'application/zip': 'zip',
            'application/jar': 'jar',
            'application/vnd.ms-cab-compressed': 'cab',
            'text/plain': 'txt',
            'image/gif': 'gif',
            'image/jpeg': 'jpg',
            'image/png': 'png',
            'text/html': 'html',
            'application/vnd.ms-fontobject': 'ms_font',
            'application/x-shockwave-flash': 'swf',
        }

        # See what filemagic can determine
        with magic.Magic(flags=magic.MAGIC_MIME_TYPE) as mag:
            mime_type = mag.id_buffer(input_bytes[:1024])

        guessed = mime_to_type.get(mime_type)
        if guessed is None:
            print('<<< Sample Type could not be Determined >>>')
            return 'unknown'
        return guessed



    #######################
    # Index Methods
    #######################
    def index_sample(self, md5, index_name):
        """Index a stored sample with the Indexer, one streamed row at a time.

        Args:
            md5: the md5 of the sample
            index_name: the name of the index
        Returns:
            Nothing
        """
        # max_rows=None asks stream_sample for every row
        for row in self.stream_sample(md5, None):
            self.indexer.index_data(row, index_name)

    def index_worker_output(self, worker_name, md5, index_name, subfield):
        """Index worker output with the Indexer.

        Args:
            worker_name: 'strings', 'pe_features', whatever
            md5: the md5 of the sample
            index_name: the name of the index
            subfield: index just this subfield (None for all)
        Returns:
            Nothing
        """
        # Grab the worker's output, then narrow it down if a subfield was requested
        worker_output = self.work_request(worker_name, md5)[worker_name]
        data = worker_output[subfield] if subfield else worker_output

        # Okay now index the data
        self.indexer.index_data(data, index_name=index_name, doc_type='unknown')

    def search(self, index_name, query):
        """Search a particular index in the Indexer.

        Args:
            index_name: the name of the index
            query: the query against the index
        Returns:
            Whatever indexer.search returns (documented as all matches to the query).
        """
        matches = self.indexer.search(index_name, query)
        return matches


    #######################
    # Graph Methods
    #######################
    def add_node(self, node_id, name, labels):
        """Add a node to the graph with name and labels.

        Args:
            node_id: the unique node_id e.g. 'www.evil4u.com'
            name: the display name of the node e.g. 'evil4u'
            labels: a list of labels e.g. ['domain','evil']
        Returns:
            Nothing
        """
        graph = self.neo_db
        graph.add_node(node_id, name, labels)

    def has_node(self, node_id):
        """Ask the Graph DB whether it has this node.

        Args:
            node_id: the unique node_id e.g. 'www.evil4u.com'
        Returns:
            The result of neo_db.has_node(node_id) (documented as True/False).
        """
        exists = self.neo_db.has_node(node_id)
        return exists

    def add_rel(self, source_id, target_id, rel):
        """Add a relationship between two nodes.

        Source and target must already exist (see add_node); 'rel' is the
        name of the relationship, 'contains' or whatever.

        Args:
            source_id: the unique node_id of the source
            target_id: the unique node_id of the target
            rel: name of the relationship
        Returns:
            Nothing
        """
        graph = self.neo_db
        graph.add_rel(source_id, target_id, rel)

    def clear_graph_db(self):
        """Clear the Graph Database of all nodes and edges.

        Returns:
            Nothing
        """
        graph = self.neo_db
        graph.clear_db()

    def clear_db(self):
        """Clear the Main Database of all samples and worker output.

        Returns:
            Nothing
        """
        store = self.data_store
        store.clear_db()


    #######################
    # Work Request Methods
    #######################
    def work_request(self, worker_name, md5, subkeys=None):
        """Make a work request for an existing stored sample.

        Args:
            worker_name: 'strings', 'pe_features', whatever
            md5: the md5 of the sample
            subkeys: a list of dotted paths into the worker output, e.g.
                ['foo', 'foo.bar']; a single string such as 'foo.bar' is
                accepted as well (None for all)
        Returns:
            The output of the worker, or - when subkeys are given - a dictionary
            mapping the last component of each path to the value found there.
        Raises:
            RuntimeError: if worker_name is not a loaded plugin, or if one of
                the subkeys cannot be resolved in the worker output.
        """

        # Check valid
        if worker_name not in self.plugin_meta:
            raise RuntimeError('Invalid work request for class %s (not found)' % (worker_name))

        # Get results (even if we have to wait for them)
        # Note: Yes, we're going to wait. Gevent concurrent execution will mean this
        #       code gets spawned off and new requests can be handled without issue.
        work_results = self._recursive_work_resolver(worker_name, md5)

        if subkeys:
            # A bare string would otherwise be walked character by character
            if hasattr(subkeys, 'split'):
                subkeys = [subkeys]
            try:
                sub_results = {}
                for subkey in subkeys:
                    value = work_results[worker_name]
                    path = subkey.split('.')
                    for key in path:
                        value = value[key]
                    # Only the leaf is reported; intermediate containers are
                    # just stepping stones and must not leak into the result
                    sub_results[path[-1]] = value
                work_results = sub_results
            except (KeyError, TypeError):
                raise RuntimeError('Could not get one or more subkeys for: %s' % (work_results))

        # Clean it and ship it
        work_results = self.data_store.clean_for_serialization(work_results)
        return work_results

    @zerorpc.stream
    def batch_work_request(self, worker_name, kwargs=None):
        """Make a batch work request for an existing set of stored samples.

        A subset of sample can be specified with kwargs.

        Args:
            worker_name: 'strings', 'pe_features', whatever
            kwargs: a way of specifying subsets of samples (None or {} for all)
                type_tag: subset based on sample type (e.g. type_tag='exe')
                md5_list: subset just the samples in this list
                subkeys: return just this subkey (e.g. 'foo' or 'foo.bar')
        Returns:
            A generator that yields rows of worker output or subfields of the
            worker output. Samples whose request raises KeyError are skipped.
        """
        # None stands in for the former shared mutable {} default
        kwargs = kwargs or {}
        type_tag = kwargs.get('type_tag')
        md5_list = kwargs.get('md5_list')
        subkeys = kwargs.get('subkeys')

        # If no md5_list specified put all samples (of type type_tag if not None)
        if not md5_list:
            md5_list = self.data_store.all_sample_md5s(type_tag)

        # Loop through all the md5s and return a generator with yield
        for md5 in md5_list:
            try:
                if subkeys:
                    yield self.work_request(worker_name, md5, subkeys)
                else:
                    yield self.work_request(worker_name, md5)[worker_name]
            except KeyError:
                continue

    def store_sample_set(self, md5_list):
        """Store a sample set (which is just a list of md5s).

        Note: All md5s must already be in the data store.

        Args:
            md5_list: a list of the md5s in this set (all must exist in data store)

        Returns:
            The md5 of the set: the md5 hex digest of str(md5_list).

        Raises:
            RuntimeError: if any md5 in the list is not in the data store.
        """
        for md5 in md5_list:
            if not self.has_sample(md5):
                # Adjacent literals keep the message on one line; a backslash
                # continuation inside the string embedded the source indentation.
                raise RuntimeError('Sample not found all items in sample_set '
                                   'must be in the datastore: %s (not found)' % (md5))
        set_md5 = hashlib.md5(str(md5_list)).hexdigest()
        self._store_work_results({'md5_list': md5_list}, 'sample_set', set_md5)
        return set_md5

    def get_sample_set(self, md5):
        """Retrieve a previously stored sample set.

        Args:
            md5: the md5 of the sample set (as returned by store_sample_set)

        Returns:
            {'sample_set': <stored results>} when the data store has results
            for this md5 in the 'sample_set' collection, otherwise None.
        """
        sample_set = self._get_work_results('sample_set', md5)
        return sample_set

    @zerorpc.stream
    def stream_sample_set(self, md5):
        """Stream a sample set (which is just a list of md5s).

        Args:
            md5: the md5 of the sample_set

        Returns:
            A generator that yields the md5s in the sample set

        Raises:
            RuntimeError: if no sample set is stored under this md5.
        """
        sample_set = self._get_work_results('sample_set', md5)
        if not sample_set:
            raise RuntimeError('Sample set not found: %s' % (md5))

        # _get_work_results wraps the stored document as {collection: results},
        # so the md5 list lives one level down under 'sample_set'.
        for member_md5 in sample_set['sample_set']['md5_list']:
            yield member_md5

    def get_datastore_uri(self):
        """Give the URI of the current data store.

        Returns:
            The result of data_store.get_uri() (documented as the URI of the
            data store currently being used by Workbench).
        """
        uri = self.data_store.get_uri()
        return uri


    ##################
    # Help
    ##################
    def help(self, cli=False):
        """Return the help commands from the help system.

        Args:
            cli: forwarded unchanged to help_system.help
        """
        help_text = self.help_system.help(cli)
        return help_text

    def help_basic(self, cli=False):
        """Return the basic help commands from the help system.

        Args:
            cli: forwarded unchanged to help_system.help_basic
        """
        help_text = self.help_system.help_basic(cli)
        return help_text

    def help_commands(self, cli=False):
        """Return a big string of Workbench commands and signatures.

        Args:
            cli: forwarded unchanged to help_system.help_commands
        """
        help_text = self.help_system.help_commands(cli)
        return help_text

    def help_command(self, command, cli=False):
        """Return a specific Workbench command and docstring.

        Args:
            command: the command name, forwarded to help_system.help_command
            cli: forwarded unchanged to help_system.help_command
        """
        help_text = self.help_system.help_command(command, cli)
        return help_text

    def help_workers(self, cli=False):
        """Return a big string of the loaded Workbench workers and their dependencies.

        Args:
            cli: forwarded unchanged to help_system.help_workers
        """
        help_text = self.help_system.help_workers(cli)
        return help_text

    def help_worker(self, worker, cli=False):
        """Return a specific Workbench worker and docstring.

        Args:
            worker: the worker name, forwarded to help_system.help_worker
            cli: forwarded unchanged to help_system.help_worker
        """
        help_text = self.help_system.help_worker(worker, cli)
        return help_text

    def help_advanced(self, cli=False):
        """Return the advanced help commands from the help system.

        Args:
            cli: forwarded unchanged to help_system.help_advanced
        """
        help_text = self.help_system.help_advanced(cli)
        return help_text

    def help_everything(self, cli=False):
        """Return whatever help_system.help_everything produces.

        Args:
            cli: forwarded unchanged to help_system.help_everything
        """
        help_text = self.help_system.help_everything(cli)
        return help_text


    ##################
    # Introspection
    ##################
    def list_all_commands(self):
        """Return a list of all the Workbench commands.

        Returns:
            The names of the bound methods that do not start with an
            underscore, followed by any of the streaming commands that the
            inspect-based scan did not already report.
        """
        commands = [name for name, _ in inspect.getmembers(self, predicate=inspect.ismethod)
                    if not name.startswith('_')]

        # NOTE(review): presumably the zerorpc.stream decorator is what hides
        # these from inspect.ismethod (the original only re-added
        # batch_work_request) - verify. The membership check avoids duplicates
        # if a decorated method does show up in the scan.
        for streamed in ('batch_work_request', 'stream_sample', 'stream_sample_set'):
            if streamed not in commands:
                commands.append(streamed)
        return commands

    def list_all_workers(self):
        """List the names of all the currently loaded workers.

        Returns:
            A list of the keys of plugin_meta.
        """
        return list(self.plugin_meta)

    def worker_info(self, worker_name):
        """Get the information about this worker.

        Args:
            worker_name: key into plugin_meta (an unknown name raises KeyError)
        Returns:
            {'dependencies': ..., 'doc': ...} read from the worker's class.
        """
        worker_class = self.plugin_meta[worker_name]['class']
        return {'dependencies': worker_class.dependencies, 'doc': worker_class.__doc__}


    ##################
    # Testing
    ##################
    def test_worker(self, worker_name):
        """Run the test for a specific worker.

        Args:
            worker_name: the name of a loaded worker
        Returns:
            The return value of the worker's 'test' callable, or a message
            string when the worker is unknown or the test raises
            AttributeError/KeyError.
        """

        # First find the plugin
        try:
            plugin = self.plugin_meta[worker_name]
        except KeyError:
            return '%s worker not found.. misspelled?' % worker_name

        # Now try to run the test
        try:
            return plugin['test']()
        except (AttributeError, KeyError) as error:
            # The space keeps the worker name and 'Error:' from running together
            return 'Failure for plugin: %s Error: %s' % (worker_name, error)


    ####################
    # Internal Methods
    ####################
    def _new_plugin(self, plugin, mod_time):
        """ Internal: Register a newly loaded plugin in plugin_meta.

            The plugin dictionary is stamped with mod_time under 'time_stamp'
            and stored under its own 'name'.
        """
        plugin_name = plugin['name']
        print('\t- %s: loaded...' % (plugin_name))
        plugin['time_stamp'] = mod_time  # datetime.datetime.utcnow()
        self.plugin_meta[plugin_name] = plugin

    def _store_work_results(self, results, collection, md5):
        """ Internal: Hand the work results of a worker to the data store."""
        store = self.data_store
        store.store_work_results(results, collection, md5)
    def _get_work_results(self, collection, md5):
        """ Internal: Fetch work results, wrapped as {collection: results}.

            Returns None when the data store has nothing (or something falsy)
            for this collection/md5.
        """
        results = self.data_store.get_work_results(collection, md5)
        if not results:
            return None
        return {collection: results}


    # So the trick here is that since each worker just stores its input dependencies
    # we can recursively backtrack and all the needed work gets done.
    def _recursive_work_resolver(self, worker_name, md5):
        """ Internal: Input dependencies are recursively backtracked, invoked and then
               passed down the pipeline until hitting the requested worker.

            Returns:
                - for worker_name 'sample': the sample set stored under md5 if
                  there is one, otherwise the result of get_sample(md5)
                - {} when the plugin is not loaded
                - None when the plugin does not produce a dictionary
                - otherwise {collection: results}, freshly computed or cached
        """

        # Looking for the sample or sample_set?
        if worker_name == 'sample':
            # If we have a sample set with this md5, return it
            # (looked up once; the result is reused instead of querying twice)
            sample_set = self.get_sample_set(md5)
            if sample_set:
                return sample_set
            # Return the sample (might raise a RuntimeError)
            return self.get_sample(md5)

        # Do I actually have this plugin? (might have failed, etc)
        if worker_name not in self.plugin_meta:
            print('Request for non-existing or failed plugin: %s' % (worker_name))
            return {}

        # If the results exist and the time_stamp is newer than the plugin's, I'm done
        collection = self.plugin_meta[worker_name]['name']
        work_results = self._get_work_results(collection, md5)
        if work_results:
            if self.plugin_meta[worker_name]['time_stamp'] < work_results[collection]['__time_stamp']:
                print('Returning cached work results for plugin: %s' % (worker_name))
                return work_results
            else:
                print('Updating work results for new plugin: %s' % (worker_name))

        # Okay either need to generate (or re-generate) the work results:
        # resolve every dependency first and merge their outputs into one input dict
        dependencies = self.plugin_meta[worker_name]['dependencies']
        dependant_results = {}
        for dependency in dependencies:
            dependant_results.update(self._recursive_work_resolver(dependency, md5))
        print('New work for plugin: %s' % (worker_name))
        work_results = self.plugin_meta[worker_name]['class']().execute(dependant_results)

        # Enforce dictionary output
        if not isinstance(work_results, dict):
            print('Critical: Plugin %s MUST produce a python dictionary!' % worker_name)
            return None

        # Store the results and return (re-read so the caller gets the stored form)
        self._store_work_results(work_results, collection, md5)
        return self._get_work_results(collection, md5)

    def _find_element(self, d, k):
        """ Internal: Look up key k in dictionary d, or one level down.

            Args:
                d: the dictionary to search
                k: the key to look for
            Returns:
                d[k] when k is a top-level key; otherwise the value of k in the
                first nested dictionary that contains it; None when not found.
        """
        if k in d:
            return d[k]
        # Only nested dictionaries are searched: probing other value types
        # (ints, strings, ...) with 'in' / indexing raises TypeError.
        for value in d.values():
            if isinstance(value, dict) and k in value:
                return value[k]
        return None


def run():
    """ Run the workbench server.

    Reads config.ini from the directory of this file, builds a WorkBench from
    the [workbench] settings and serves it over ZeroRPC on tcp://0.0.0.0:4242.
    SIGTERM and SIGINT are wired to the server's stop method. Exits with
    status 1 when config.ini cannot be read.
    """
    server_uri = 'tcp://0.0.0.0:4242'

    # Load the configuration file relative to this script location
    config_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'config.ini')
    workbench_conf = ConfigParser.ConfigParser()
    config_ini = workbench_conf.read(config_path)
    if not config_ini:
        print('Could not locate config.ini file, tried %s : exiting...' % config_path)
        # sys.exit rather than the site-provided exit() helper
        sys.exit(1)

    # Pull configuration settings
    datastore_uri = workbench_conf.get('workbench', 'datastore_uri')
    database = workbench_conf.get('workbench', 'database')
    worker_cap = workbench_conf.getint('workbench', 'worker_cap')
    samples_cap = workbench_conf.getint('workbench', 'samples_cap')

    # Spin up Workbench ZeroRPC
    try:
        store_args = {'uri': datastore_uri, 'database': database,
                      'worker_cap': worker_cap, 'samples_cap': samples_cap}
        print('<<< Workbench Server at %s >>>' % (server_uri))
        workbench = zerorpc.Server(WorkBench(store_args=store_args), name='workbench', heartbeat=60)
        workbench.bind(server_uri)
        print('\nWorkbench is ready and feeling super duper!')

        # SIGKILL is deliberately not registered: it cannot be caught or handled
        for signum in (signal.SIGTERM, signal.SIGINT):
            gevent_signal(signum, workbench.stop)

        workbench.run()
        print('\nWorkbench Server Shutting Down... and dreaming of sheep...')

    except zmq.error.ZMQError:
        print('\nInfo: Could not start Workbench server (no worries, probably already running...)\n')


# Test that just calls main
def test():
    """ Test entry point: simply runs the server via run(). """
    run()

# Script entry point: start the server when this file is executed directly.
if __name__ == '__main__':
    run()
# Quick search
#
# Source code for workbench.server.workbench
#
# Navigation