{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import gzip\n",
    "import subprocess\n",
    "\n",
    "for data in ['down', 'original', 'hicplus' 'vehicle', 'hicsr', 'deephic', 'hicbridge']:\n",
    "    for chr in [4, 14, 16, 20]:\n",
    "        f = \"./hicqc_inputs/\" + data + \"_\" + str(chr) + \".gz\"\n",
    "\n",
    "        # read file with pandas\n",
    "        df = pd.read_csv(f, sep=\"\\t\", header=None)\n",
    "\n",
    "        # get all item in column 1 without repeats\n",
    "        midpoint = df[1].unique()\n",
    "\n",
    "        # sum of all values in column 4 for each item in column 1\n",
    "        sums = df.groupby(1)[4].sum()\n",
    "\n",
    "        # create new text file with midpoint and sums and gzip it\n",
    "        with gzip.open(\"./hicqc_inputs/K562/\" + data + \"_\" + str(chr) + \"_fragement.gz\", \"wt\") as f:\n",
    "            for i, s in zip(midpoint, sums):\n",
    "                f.write(str(chr) + \"\\t\" + str(0) + \"\\t\" + str(i) + \"\\t\" + str(s) + \"\\t\" + str(0) + \"\\n\")\n",
    "\n",
    "        f.close()\n",
    "        subprocess.run('fithic -i .\\hicqc_inputs\\\\'+data+'_'+str(chr)+'.gz -f .\\hicqc_inputs\\\\'+data+'_'+str(chr)+'_fragement.gz -r 10000 -o fithic_'+data+str(chr))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "\n",
    "for chr in [4, 14, 16, 20]:\n",
    "    print(chr)\n",
    "    original = './fithic_original'+str(chr)+'/FitHiC.spline_pass1.res10000.significances.txt.gz'\n",
    "    down = './fithic_down'+str(chr)+'/FitHiC.spline_pass1.res10000.significances.txt.gz'\n",
    "    hicbridge = './fithic_hicbridge'+str(chr)+'/FitHiC.spline_pass1.res10000.significances.txt.gz'\n",
    "    hicplus = './fithic_hicplus'+str(chr)+'/FitHiC.spline_pass1.res10000.significances.txt.gz'\n",
    "    hicsr = './fithic_hicsr'+str(chr)+'/FitHiC.spline_pass1.res10000.significances.txt.gz'\n",
    "    deephic = './fithic_deephic'+str(chr)+'/FitHiC.spline_pass1.res10000.significances.txt.gz'\n",
    "    vehicle = './fithic_vehicle'+str(chr)+'/FitHiC.spline_pass1.res10000.significances.txt.gz'\n",
    "\n",
    "\n",
    "    genome_distance = [20e3, 1e6]\n",
    "\n",
    "    df_origin = pd.read_csv(original, sep=\"\\t\")\n",
    "    df_origin = df_origin[df_origin['q-value'] < 1e-6]\n",
    "    df_origin = df_origin[(df_origin['fragmentMid2'] - df_origin['fragmentMid1'] < genome_distance[1]) & (df_origin['fragmentMid2'] - df_origin['fragmentMid1'] > genome_distance[0])]\n",
    "\n",
    "    for compare in [down, vehicle, hicplus, hicsr, deephic, hicbridge]:\n",
    "        df_compare = pd.read_csv(compare, sep=\"\\t\")\n",
    "        df_compare = df_compare[df_compare['q-value'] < 1e-6]\n",
    "        df_compare = df_compare[(df_compare['fragmentMid2'] - df_compare['fragmentMid1'] < genome_distance[1]) & (df_compare['fragmentMid2'] - df_compare['fragmentMid1'] > genome_distance[0])]\n",
    "\n",
    "        df_inter = pd.merge(df_compare, df_origin, on=['chr1', 'chr2', 'fragmentMid1', 'fragmentMid2', 'bias1', 'bias2'])\n",
    "\n",
    "        #print total length of df_origin, df_compare and df_inter\n",
    "        print(compare)\n",
    "\n",
    "        print(\"jaccard\")\n",
    "        print(len(df_inter) / (len(df_origin) + len(df_compare) - len(df_inter)))"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "JaeminKim",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.7"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
