import argparse
import multiprocessing as mp
import subprocess

import h5py
import numpy as np
import pandas as pd

def get_cmd_arguments():
    """Parse the command line of the annotation script.

    Two positional arguments are expected, in this order: the original ICGC
    SNV bed file and the path the annotated file is written to.

    Returns:
        argparse.Namespace with the attributes ``input_path`` and
        ``output_path``.
    """
    ap = argparse.ArgumentParser()
    # Positional arguments are always required and take their attribute name
    # from the argument name itself; argparse rejects ``required=`` and
    # ``dest=`` for them.  ``metavar`` keeps the names shown in the usage text.
    ap.add_argument('input_path', metavar='input_file', type=str,
                    help='path to original ICGC snv bed file')
    ap.add_argument('output_path', metavar='output_file', type=str,
                    help='path to output the annotated file')
    return ap.parse_args()

# Severity rank assigned to each variant annotation.  Kept at module level
# because collapse_annot resolves these tables as globals; names that exist
# only as locals of another function are not visible to it.
annot_dic = {
    'downstream_gene_variant': 0, 'intergenic_region': 0,
    'upstream_gene_variant': 0, 'exon_variant': 0, 'intron_variant': 0,
    'missense_variant': 1, '3_prime_UTR_variant': 0, 'synonymous_variant': 0,
    '5_prime_UTR_variant': 0, 'splice_region_variant': 0,
    '5_prime_UTR_premature_start_codon_gain_variant': 0,
    'splice_donor_variant': 2, 'splice_acceptor_variant': 2, 'stop_gained': 2,
    'stop_retained_variant': 0, 'stop_lost': 2, 'start_lost': 2,
    'initiator_codon_variant': 0, 'intragenic_variant': 0,
}
# Label written to the output for each severity rank.
rev_annot_dic = {0: 'low_impact', 1: 'Missense_Mutation', 2: 'Nonsense_Mutation'}


def collapse_annot(grp):
    """Collapse a group of rows into one row carrying the highest-ranked label.

    Args:
        grp: DataFrame with an ``Annot`` column whose values are keys of
            ``annot_dic``.

    Returns:
        A copy of the first row of ``grp`` whose ``Annot`` field is replaced
        by the ``rev_annot_dic`` label of the highest rank found in the group.
        ``grp`` itself is not modified.

    Raises:
        KeyError: if an annotation is not a key of ``annot_dic``.
    """
    worst_rank = max(annot_dic[value] for value in grp['Annot'].values)
    # Copy so that assigning the label never writes back into the group.
    entry = grp.iloc[0].copy()
    entry['Annot'] = rev_annot_dic[worst_rank]
    return entry

def reduce_df(df, num):
    """Collapse every set of rows sharing an index label into a single row.

    Args:
        df: table whose index labels identify the rows to merge.
        num: unused by the body; accepted so existing callers keep working.

    Returns:
        The result of applying ``collapse_annot`` to each index-label group.
    """
    rows_by_label = df.groupby(by=df.index)
    return rows_by_label.apply(collapse_annot)

def _install_annotation_tables(annot_table, rev_annot_table):
    """Expose the annotation lookup tables as module globals in a pool worker.

    collapse_annot reads ``annot_dic`` and ``rev_annot_dic`` as globals, so
    every worker process needs them at module level rather than as locals of
    main().
    """
    global annot_dic, rev_annot_dic
    annot_dic = annot_table
    rev_annot_dic = rev_annot_table


def _chunk_edges(dups, n_chunks):
    """Return ascending row offsets cutting the table into at most n_chunks slices.

    Args:
        dups: boolean Series, True for every row that has a duplicate key.
        n_chunks: positive upper bound on the number of slices.

    Returns:
        List of offsets starting at 0 and ending at ``len(dups)``; consecutive
        pairs delimit the slices.  No slice is empty unless the table is.
    """
    n_rows = len(dups)
    flags = dups.values
    chunksize = max(1, int(np.ceil(n_rows / n_chunks)))
    edges = [0]
    for cut in range(chunksize, n_rows, chunksize):
        # Push the cut forward until it lands on a row without duplicates,
        # never past the end of the table.
        # NOTE(review): this only keeps a duplicate group in one slice if rows
        # sharing a key are adjacent in the input -- confirm the file is sorted.
        while cut < n_rows and flags[cut]:
            cut += 1
        # Drop cuts that would create an empty slice or duplicate the end edge.
        if edges[-1] < cut < n_rows:
            edges.append(cut)
    edges.append(n_rows)
    return edges


def main(n_workers=30):
    """Run the extraction script, collapse duplicate rows and write the result.

    Steps: run ``./ICGC_SNV_annot_extract.sh`` with the input and output paths
    from the command line, load the annotated bed table, merge rows that share
    chromosome, start, end and donor into one row (in parallel, via
    reduce_df), sort by position and write a headerless tab-separated file to
    the output path.

    Args:
        n_workers: positive upper bound on both the number of table slices and
            the number of worker processes.

    Raises:
        subprocess.CalledProcessError: if the extraction script exits non-zero.
    """
    args = get_cmd_arguments()
    # check_call instead of call: a failed extraction must not be followed by
    # processing stale or missing data.
    subprocess.check_call(['./ICGC_SNV_annot_extract.sh', args.input_path, args.output_path])

    # NOTE(review): this path is hard-coded and independent of the command-line
    # arguments; presumably it is the file produced by the script above --
    # confirm and derive it from the arguments if so.
    annot_bed = '/scratch1/maxas/ICGC_Roadmap/cancer_mutations_ICGC/cancer_mutations/skin/SNV_skin_melanoma_MELA_AU.annot.bed.gz'
    column_types = {'CHROM': str, 'START': int, 'END': int, 'REF': str,
                    'ALT': str, 'DONOR': str, 'Annot': str}
    # ``names`` takes the column labels only; the types have to go to ``dtype``
    # or they are silently ignored.
    mut_table = pd.read_csv(annot_bed, index_col=False, names=list(column_types),
                            dtype=column_types, low_memory=False,
                            compression='gzip', sep='\t', header=None)

    annot_dic = {'downstream_gene_variant':0, 'intergenic_region':0,
       'upstream_gene_variant':0, 'exon_variant':0, 'intron_variant':0,
       'missense_variant':1, '3_prime_UTR_variant':0, 'synonymous_variant':0,
       '5_prime_UTR_variant':0, 'splice_region_variant':0,
       '5_prime_UTR_premature_start_codon_gain_variant':0,
       'splice_donor_variant':2, 'splice_acceptor_variant':2, 'stop_gained':2,
       'stop_retained_variant':0, 'stop_lost':2, 'start_lost':2,
       'initiator_codon_variant':0, 'intragenic_variant':0}
    rev_annot_dic = {0:'low_impact', 1:'Missense_Mutation',2:'Nonsense_Mutation'}

    if mut_table.empty:
        # Nothing to collapse: emit an empty output instead of failing on a
        # zero-sized chunk.
        mut_table.to_csv(args.output_path, sep='\t', header=False, index=False)
        return

    # Rows with the same position and donor get the same index label, which is
    # what reduce_df groups on.
    mut_table.index = ['chr{}:{}-{}:{}'.format(*r) for r in zip(
        mut_table.CHROM, mut_table.START, mut_table.END, mut_table.DONOR)]
    dups = mut_table.duplicated(['CHROM', 'START', 'END', 'DONOR'], keep=False)

    edges = _chunk_edges(dups, n_workers)
    chunks = [mut_table.iloc[lo:hi] for lo, hi in zip(edges[:-1], edges[1:])]

    # The initializer hands the lookup tables to every worker, whichever start
    # method multiprocessing uses.
    with mp.Pool(min(n_workers, len(chunks)),
                 initializer=_install_annotation_tables,
                 initargs=(annot_dic, rev_annot_dic)) as pool:
        # starmap blocks until all slices are done and keeps their order.
        res_lst = pool.starmap(reduce_df, [(chunk, 1) for chunk in chunks])

    reduced = pd.concat(res_lst)
    reduced_sorted = reduced.sort_values(by=['CHROM', 'START', 'END'])
    reduced_sorted.to_csv(args.output_path, sep='\t', header=False, index=False)

# Run the pipeline only when executed as a script.  The guard also keeps
# multiprocessing workers that re-import this module from calling main() again.
if __name__ == "__main__":
    main()
