#!/usr/bin/env python
# coding: utf-8

##############################################################################
#                                                                            #
#  Code for the USENIX Security '22 paper:                                   #
#  How Machine Learning Is Solving the Binary Function Similarity Problem.   #
#                                                                            #
#  MIT License                                                               #
#                                                                            #
#  Copyright (c) 2019-2022 Cisco Talos                                       #
#                                                                            #
#  Permission is hereby granted, free of charge, to any person obtaining     #
#  a copy of this software and associated documentation files (the           #
#  "Software"), to deal in the Software without restriction, including       #
#  without limitation the rights to use, copy, modify, merge, publish,       #
#  distribute, sublicense, and/or sell copies of the Software, and to        #
#  permit persons to whom the Software is furnished to do so, subject to     #
#  the following conditions:                                                 #
#                                                                            #
#  The above copyright notice and this permission notice shall be            #
#  included in all copies or substantial portions of the Software.           #
#                                                                            #
#  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,           #
#  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF        #
#  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND                     #
#  NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE    #
#  LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION    #
#  OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION     #
#  WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.           #
#                                                                            #
#  Dataset-1 creation                                                        #
#                                                                            #
##############################################################################

import json
import os
import pandas as pd

from collections import defaultdict
from tqdm import tqdm

# Directory where the new dataset files (CSV and JSON) are saved
OUTPUT_DIR = "."

# The starting point: the CSV with the list of functions produced by
# IDA flowchart, and the MOTIF metadata file (read as JSON lines)
CSV_FLOWCHART_FP = "flowchart_Dataset-MOTIF.csv"
MOTIF_METADATA = 'motif_dataset.jsonl'

# teach the CSV parser how to handle file, function names with commas
def names_w_strings(entries):
    """Repair a CSV row whose file and/or function name contains commas.

    Intended to be passed as the ``on_bad_lines`` callable of
    ``pd.read_csv``, which calls it with the already-split fields of a line
    it could not parse.

    The first two fields starting with '0x' are used as anchors: every field
    before the first anchor is re-joined into the file name, and every field
    between the two anchors is re-joined into the function name. The last
    five fields are copied as they are, so they are assumed not to have been
    split by extra commas.

    Args:
        entries: list of strings, the fields of the offending line.

    Returns:
        A list of 8 strings: the file name, the first '0x' field, the
        function name and the last five fields of the line.

    Raises:
        ValueError: if fewer than two fields start with '0x', in which case
            the boundaries of the two names cannot be determined.
    """
    # Indices of the fields that can delimit the file and function names
    hex_positions = [
        idx for idx, entry in enumerate(entries) if entry.startswith('0x')
    ]
    if len(hex_positions) < 2:
        # Fail with the offending line instead of an opaque UnboundLocalError
        raise ValueError(
            "cannot locate file/function name boundaries in line: "
            "{!r}".format(entries)
        )

    # Only the first two anchors matter; later '0x' fields are regular columns
    first_hex, second_hex = hex_positions[0], hex_positions[1]

    full_file_name = ",".join(entries[:first_hex])
    full_fn_name = ",".join(entries[first_hex + 1:second_hex])

    return [
        full_file_name,
        entries[first_hex],
        full_fn_name,
        entries[-5],
        entries[-4],
        entries[-3],
        entries[-2],
        entries[-1],
    ]

# Parse the list of functions written by IDA flowchart; lines the parser
# cannot handle are passed to names_w_strings
df = pd.read_csv(
    CSV_FLOWCHART_FP,
    engine='python',
    on_bad_lines=names_w_strings,
)
print(f"Shape: {df.shape}")

# The 'bb_list' column is not kept in the new dataset
df.drop(columns='bb_list', inplace=True)

# Read the MOTIF metadata (one JSON document per line) and index it by md5
metadata = pd.read_json(path_or_buf=MOTIF_METADATA, lines=True).set_index('md5')

# key = file hash, value = malware family label
labels = metadata['label']
labels_dict = labels.to_dict()

# Replace the 'func_name' column with the label
def get_label(idb_path):
    """Return the entry of ``labels_dict`` that matches an IDB path.

    ``idb_path`` looks like
    IDBs/Dataset-MOTIF/MOTIF_eb8e26645a457fdc7158cb9be749c5d2.i64.
    The lookup key is the last path component with '.i64' and 'MOTIF_'
    removed; a key missing from ``labels_dict`` raises ``KeyError``.
    """
    # Keep only what follows the last '/'
    file_name = idb_path.rsplit('/', 1)[-1]
    hash_key = file_name.replace('.i64', '').replace('MOTIF_', '')
    return labels_dict[hash_key]

# Look up the label of every row from its 'idb_path'. The column is iterated
# directly: df.iloc[i] would build a whole row Series on every iteration.
labels = [get_label(path) for path in tqdm(df['idb_path'])]

# Overwrite the 'func_name' column with the labels
df['func_name'] = labels

df.to_csv(os.path.join(OUTPUT_DIR, "Dataset-MOTIF.csv"))

# Save the "selected functions" to a JSON.
# This is useful to limit the IDA analysis to some functions only.
# fset holds the unique (idb_path, fva) pairs of the dataset.
fset = {tuple(x) for x in df[['idb_path', 'fva']].values}
print("{}: {} functions".format("all", len(fset)))

# Group the function addresses (hexadecimal strings converted to integers) by
# IDB path. The pairs are visited in sorted order because the iteration order
# of a set of strings changes between runs, which would make the content
# order of the output JSON non-reproducible.
selected_functions = defaultdict(list)
for idb_path, fva in sorted(fset, key=lambda pair: (pair[0], int(pair[1], 16))):
    selected_functions[idb_path].append(int(fva, 16))

# Test: every unique pair must have been stored exactly once
assert sum(len(v) for v in selected_functions.values()) == len(fset)

# Save to file
with open(os.path.join(OUTPUT_DIR, "selected_Dataset-MOTIF.json"), "w") as f_out:
    json.dump(selected_functions, f_out)




