{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# jTrans preprocessing codes for Assemblage\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 1. Flatten the assemblage binaries folder into single layer folders\n",
    "\n",
    "dataset_path : Original downloaded Assemblage folder\n",
    "flatten_dir : Destination folder\n",
    "dbfile : SQLite path"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "import sys\n",
    "import os\n",
    "import sqlite3\n",
    "import glob\n",
    "from tqdm import tqdm\n",
    "import shutil\n",
    "import hashlib\n",
    "\n",
    "def getmd5(s):\n",
    "    return hashlib.md5(s.encode()).hexdigest()\n",
    "\n",
    "dbfile = 'sept25.sqlite' # SQlite database file\n",
    "dataset_path = 'dataset_sept25' # Path to the dataset\n",
    "flatten_dir = \"dataset\" # Path to the flatten dataset, choose anywhere you like, but it will be deleted first!\n",
    "\n",
    "if os.path.exists(flatten_dir):\n",
    "    os.system(f\"rm -rf {flatten_dir}\")\n",
    "os.makedirs(flatten_dir)\n",
    "\n",
    "connection = sqlite3.connect(dbfile)\n",
    "cursor = connection.cursor()\n",
    "\n",
    "infos = cursor.execute('SELECT id, path, file_name, optimization, github_url, toolset_version FROM binaries;')\n",
    "for binid, path, file_name, opt, github_url,toolset_version in tqdm(infos):\n",
    "    full_path = os.path.join(dataset_path, path.replace(\"\\\\\", \"/\"))\n",
    "    if not os.path.isfile(full_path):\n",
    "        print(\"Missing!\", full_path)\n",
    "        continue\n",
    "    if not os.path.isdir(os.path.join(flatten_dir, str(binid))):\n",
    "        os.makedirs(os.path.join(flatten_dir, str(binid)))\n",
    "    # Original format datautils/dataset/libcap-git-setcap-O2-8dc43f20ea80b7703f6973a1ea86e8b8\n",
    "    shutil.copy(full_path, os.path.join(flatten_dir, str(binid), f\"{binid}_{file_name}-{toolset_version}-{opt}-{getmd5(github_url)}\"))\n",
    "    newcursor = connection.cursor()\n",
    "    pdbs = newcursor.execute('SELECT pdb_path FROM pdbs where binary_id = ?', (binid,))\n",
    "    for pdb in pdbs:\n",
    "        full_path = os.path.join(dataset_path, pdb[0].replace(\"\\\\\", \"/\"))\n",
    "        if not os.path.isfile(full_path):\n",
    "            print(\"Missing!\", full_path)\n",
    "            continue\n",
    "        shutil.copy(full_path, os.path.join(flatten_dir, str(binid), os.path.basename(pdb[0].replace(\"\\\\\", \"/\"))))\n",
    "\n",
    "# Remove excessive pdb prefixes\n",
    "import glob\n",
    "import os\n",
    "for f in glob.glob(\"{flatten_dir}/**/*\", recursive=True):\n",
    "    if f.endswith(\".pdb\"):\n",
    "        dirname = os.path.dirname(f)\n",
    "        basename = os.path.basename(f)\n",
    "        os.rename(f, os.path.join(dirname, basename.split(\"_\")[-1]))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 2. Optional: run jtrans IDA dumping code\n",
    "\n",
    "You can also use author's ida script, this is a multiprocessing version with everything the same"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import subprocess\n",
    "import multiprocessing\n",
    "import time\n",
    "from util.pairdata import pairdata\n",
    "from subprocess import STDOUT, check_output\n",
    "import glob\n",
    "import shutil\n",
    "\n",
    "ida_path=\"idat64\"\n",
    "script_path = \"./process_pe.py\"\n",
    "\n",
    "if os.path.exists(\"extract\"):\n",
    "    os.system(\"rm -rf extract\")\n",
    "if os.path.exists(\"log\"):\n",
    "    os.system(\"rm -rf log\")\n",
    "if os.path.exists(\"idb\"):\n",
    "    os.system(\"rm -rf idb\")\n",
    "os.makedirs(\"extract\")\n",
    "os.makedirs(\"log\")\n",
    "os.makedirs(\"idb\")\n",
    "\n",
    "def getTarget(path, prefixfilter=None):\n",
    "    return [x for x in glob.glob(f'{path}/**/*', recursive=True) if os.path.isfile(x) and (prefixfilter is None or any([x.startswith(y) for y in prefixfilter]))]\n",
    "\n",
    "def cmd_warp(cmd, timeout):\n",
    "    output = check_output(cmd, stderr=STDOUT, timeout=timeout)\n",
    "    print(cmd, output)\n",
    "    return\n",
    "\n",
    "start = time.time()\n",
    "target_list = getTarget(flatten_dir)\n",
    "\n",
    "pool = multiprocessing.Pool(processes=128)\n",
    "for target in target_list:\n",
    "    if target.lower().endswith(\"lib\") or target.lower().endswith(\"pdb\"):\n",
    "        # Skip lib and pdb files\n",
    "        continue\n",
    "    filename = os.path.basename(target)\n",
    "    filename_strip = filename\n",
    "    cmd = [ida_path, f'-Llog/{filename}.log', '-c', '-A', f'-S{script_path}', f'-oidb/{filename}.idb', f'{target}']\n",
    "    pool.apply_async(cmd_warp, args=(cmd, 600, ))\n",
    "\n",
    "\n",
    "pool.close()\n",
    "pool.join()\n",
    "\n",
    "from util.pairdata_assemblage_pe import pairdata\n",
    "pairdata(\"extract\")\n",
    "\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Now you should have a `extract` folder that is used on jTrans"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Optionl: check function hash by IDA and PE File Module\n",
    "\n",
    "It's not necessary, as Windows PE function are sometime sliced into pieces, from our experience IDA recovers more than 95% correct functions with pdb, around 90% correct when given function entry address"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import glob\n",
    "import hashlib\n",
    "import sys\n",
    "import os\n",
    "import sqlite3\n",
    "import glob\n",
    "from tqdm import tqdm\n",
    "import shutil\n",
    "import hashlib\n",
    "import pickle\n",
    "from hashlib import sha256\n",
    "\n",
    "def getmd5(s):\n",
    "    return hashlib.md5(s.encode()).hexdigest()\n",
    "\n",
    "dbfile = 'sept25.sqlite'\n",
    "\n",
    "connection = sqlite3.connect(dbfile)\n",
    "cursor = connection.cursor()\n",
    "\n",
    "\n",
    "def sha256sum(b):\n",
    "    h1 = sha256()\n",
    "    h1.update(b)\n",
    "    return h1.digest().hex()\n",
    "\n",
    "# Calculate the hash of each function\n",
    "for f in glob.glob(\"extract/**/*\", recursive=True):\n",
    "    if f.endswith(\"saved_index.pkl\"):\n",
    "        with open(f, \"rb\") as f:\n",
    "            saved_index = pickle.load(f)\n",
    "        for x in saved_index:\n",
    "            print(x)\n",
    "            for y in saved_index[x]:\n",
    "                if type(y[-3]) == bytes:\n",
    "                    hash = sha256sum(y[-3])\n",
    "                    infos = cursor.execute(f'SELECT name FROM functions where hash=\"{hash}\";')\n",
    "                    for q in infos:\n",
    "                        if q[0]!=x:\n",
    "                            print(\"Name diff\", q[0])\n",
    "                            # It is probably fine, as some functions may be have same bytes\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "jtrans",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.18"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
