# Source code for workbench.clients.pe_sim_graph

"""This client generates a similarity graph from features in PE Files."""

import zerorpc
import os
import workbench_client

def add_it(workbench, file_list, labels):
    """Add the given file_list to workbench as samples, also add them as nodes.

    Each file is read in binary mode, stored through ``workbench.store_sample``
    with the type tag ``'exe'``, and then registered as a graph node whose
    display name is the first six characters of its md5.

    Args:
        workbench: Instance of Workbench Client.
        file_list: list of file paths.
        labels: labels for the nodes.

    Returns:
        A list of md5s, one per stored file, in ``file_list`` order.
    """
    md5s = []
    for filename in file_list:
        base_name = os.path.basename(filename)

        # Skip macOS folder metadata. The comparison has to be on the base
        # name: callers hand in full paths, which never equal '.DS_Store'.
        if base_name == '.DS_Store':
            continue

        # Read inside the context manager so the handle is closed before
        # the (potentially slow) remote calls are made.
        with open(filename, 'rb') as pe_file:
            raw_bytes = pe_file.read()

        # NOTE(review): the (name, bytes, type_tag) argument order is taken
        # from the positions of the surviving arguments in the original call;
        # confirm against the store_sample signature of the server in use.
        md5 = workbench.store_sample(base_name, raw_bytes, 'exe')
        workbench.add_node(md5, md5[:6], labels)
        md5s.append(md5)
    return md5s
def jaccard_sims(feature_list):
    """Compute pairwise similarities between all observations in the feature list.

    Every ordered pair of entries with differing md5s is scored with
    ``jaccard_sim``; pairs scoring strictly above 0.5 are reported. Because
    the pairs are ordered, a similar pair shows up twice (once per direction).

    Args:
        feature_list: a list of dictionaries, each having structure as
            { 'md5' : String, 'features': list of Strings }

    Returns:
        list of dictionaries with structure as
        {'source': md5 String, 'target': md5 String, 'sim': similarity Number}
    """
    edges = []
    for source in feature_list:
        source_md5 = source['md5']
        source_features = source['features']
        for target in feature_list:
            target_md5 = target['md5']
            target_features = target['features']

            # Entries sharing an md5 (including an entry paired with itself)
            # are never linked.
            if source_md5 != target_md5:
                score = jaccard_sim(source_features, target_features)
                if score > .5:
                    edges.append({'source': source_md5,
                                  'target': target_md5,
                                  'sim': score})
    return edges
def jaccard_sim(features1, features2):
    """Compute similarity between two feature collections.

    Both inputs are reduced to sets first, so duplicates do not count. The
    score is the size of the intersection divided by the size of the
    *larger* set (note: not the size of the union, as a textbook Jaccard
    index would use).

    Args:
        features1: list of PE Symbols.
        features2: list of PE Symbols.

    Returns:
        A float between 0.0 and 1.0, or the int 0 when both inputs are empty.
    """
    first = set(features1)
    second = set(features2)
    larger = max(len(first), len(second))

    # Two empty sets would divide by zero; they are scored as 0.
    if larger == 0:
        return 0
    return len(first & second) / float(larger)
def run():
    """This client generates a similarity graph from features in PE Files.

    Steps, in order:
      1. Connect to the workbench server named by the command line args.
      2. Take up to five files each from ``../data/pe/bad`` and
         ``../data/pe/good`` (relative to this file), store them as samples
         and add them as graph nodes.
      3. Pull imported symbols, PE warning strings and strings for all
         samples and add a relationship for every pair that
         ``jaccard_sims`` reports.
      4. Add ``ssdeep`` relationships from the ``pe_deep_sim`` worker.
      5. Print the query to run in the graph browser.

    The graph database is cleared before anything is added.
    """

    # Grab server args
    args = workbench_client.grab_server_args()

    # Start up workbench connection
    workbench = zerorpc.Client(timeout=300, heartbeat=60)
    workbench.connect('tcp://'+args['server']+':'+args['port'])

    # Test out PEFile -> pe_deep_sim -> pe_jaccard_sim -> graph
    data_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), '../data/pe/bad')
    bad_files = [os.path.join(data_path, child) for child in os.listdir(data_path)][:5]
    data_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), '../data/pe/good')
    good_files = [os.path.join(data_path, child) for child in os.listdir(data_path)][:5]

    # Clear any graph in the Neo4j database
    workbench.clear_graph_db()

    # First throw them into workbench and add them as nodes into the graph
    all_md5s = add_it(workbench, bad_files, ['exe', 'bad']) + add_it(workbench, good_files, ['exe', 'good'])

    # One row per feature family:
    #   (relationship type, worker name, requested subkey, key in each result)
    # Fixme: pe_peid is left out until we figure out why peid is SO slow.
    # To re-enable it, add this row:
    #   ('peids', 'pe_peid', 'match_list', 'match_list'),
    feature_requests = [
        ('imports', 'pe_features', 'sparse_features.imported_symbols', 'imported_symbols'),
        ('warnings', 'pe_features', 'sparse_features.pe_warning_strings', 'pe_warning_strings'),
        ('strings', 'strings', 'string_list', 'string_list'),
    ]

    # Pull back every feature family first (only the sparse subkeys are
    # requested), fully consuming each response before the next request.
    feature_sets = []
    for rel_type, worker, subkey, result_key in feature_requests:
        response = workbench.batch_work_request(
            worker, {'md5_list': all_md5s, 'subkeys': ['md5', subkey]})
        features = [{'md5': r['md5'], 'features': r[result_key]} for r in response]
        feature_sets.append((rel_type, features))

    # Then compute the similarities per family and store them as relationships
    for rel_type, features in feature_sets:
        for sim_info in jaccard_sims(features):
            workbench.add_rel(sim_info['source'], sim_info['target'], rel_type)

    # Compute pe_deep_sim on all files of type pe
    results = workbench.batch_work_request('pe_deep_sim', {'type_tag': 'exe'})

    # Store the ssdeep sims as relationships. The response is materialised
    # with list() before add_rel is called on the same client.
    for result in list(results):
        for sim_info in result['sim_list']:
            workbench.add_rel(result['md5'], sim_info['md5'], 'ssdeep')

    # Let them know where they can get their graph. A single parenthesised
    # argument keeps this line valid (and its output identical) on both
    # Python 2 and Python 3.
    print('All done: go to http://localhost:7474/browser and execute this query: "%s"' %
          ('match (n)-[r]-() return n,r'))
import pytest


# Marked xfail so a failing run() is reported as an expected failure.
# NOTE(review): presumably because run() needs a live workbench server and
# the sample data directories -- confirm.
@pytest.mark.xfail
def test():
    """Executes pe_sim_graph test by calling run()."""
    run()


# Allow the client to be launched directly as a script.
if __name__ == '__main__':
    run()