import os
import subprocess

import pandas as pd
from Bio import SeqIO
from Bio.Seq import Seq

from model.utils.utils import load_config

"""
提取同时含有重链和轻链并且家族为NA或者unknow的抗体并尝试使用ANARCI注释家族信息
"""


def is_unknown_subclass(subclass):
    """Return a boolean mask marking entries whose subclass is unknown.

    An entry counts as unknown when it is missing (NaN/None) or when its text
    contains ``unknown`` or ``NA``.  Missing values are included on purpose:
    ``pandas.read_csv`` converts the literal ``NA`` to NaN by default, so a
    plain ``str.contains('NA')`` never matches those rows.

    Args:
        subclass: pandas Series holding the subclass column.

    Returns:
        Boolean pandas Series aligned with ``subclass``.
    """
    # astype(str) keeps the .str accessor usable even if the column is all-NaN.
    as_text = subclass.astype(str)
    return subclass.isna() | as_text.str.contains('unknown|NA', na=False)


def split_paired_chains(paired_seq):
    """Split a ``heavy:light`` sequence string into its two chains.

    Args:
        paired_seq: Sequence text in which the two chains are joined by ``:``.

    Returns:
        ``(heavy, light)`` tuple, or ``None`` when the text does not consist
        of exactly two ``:``-separated parts.
    """
    parts = paired_seq.split(':')
    if len(parts) != 2:
        return None
    return parts[0], parts[1]


def run_anarci(fasta_path, output_path):
    """Run the ANARCI command-line tool with IMGT numbering and CSV output.

    The command is passed as an argument list (no shell), so paths containing
    spaces or shell metacharacters are handled safely.  Failures are reported
    but do not abort the script.

    Args:
        fasta_path: Input FASTA file handed to ``-i``.
        output_path: Output location handed to ``-o``.

    Returns:
        The process return code, or ``None`` if the executable could not be
        started.
    """
    command = ['ANARCI', '-i', fasta_path, '-o', output_path,
               '--csv', '--scheme', 'imgt']
    try:
        return_code = subprocess.run(command, check=False).returncode
    except OSError as err:
        print('Could not run ANARCI on %s: %s' % (fasta_path, err))
        return None
    if return_code != 0:
        print('ANARCI exited with status %d for %s' % (return_code, fasta_path))
    return return_code


def main():
    """Write FASTA files of unknown-subclass chains and run ANARCI on them."""
    config, _ = load_config('../configs/base_pretrain_ddp.yaml')
    root_dir = config.dataset.root_path_dir

    summary = pd.read_csv(config.dataset.summary_path_dir, sep='\t')
    # Keep only entries where both the heavy and the light chain are known.
    paired = summary[summary['Hchain'].notna() & summary['Lchain'].notna()]
    # Keep entries whose heavy or light chain subclass is unknown.
    unknown = paired[is_unknown_subclass(paired['heavy_subclass']) |
                     is_unknown_subclass(paired['light_subclass'])]

    fasta_records = SeqIO.to_dict(
        SeqIO.parse(os.path.join(root_dir, 'SAbDab_processed_onlyV.fasta'), 'fasta'))

    heavy_records = []
    light_records = []
    for _, row in unknown.iterrows():
        seq_id = '%s_%s_%s_%s' % (row.pdb, row.Hchain, row.Lchain, row.antigen_chain)
        record = fasta_records.get(seq_id)
        if record is None:
            # Entry is absent from the processed FASTA file; nothing to annotate.
            continue
        chains = split_paired_chains(str(record.seq))
        if chains is None:
            # Sequence is not in the expected "heavy:light" form.
            continue
        heavy_seq, light_seq = chains
        heavy_records.append(SeqIO.SeqRecord(Seq(heavy_seq), id=seq_id, description=""))
        light_records.append(SeqIO.SeqRecord(Seq(light_seq), id=seq_id, description=""))

    for chain_name, records in (('heavy', heavy_records), ('light', light_records)):
        fasta_path = os.path.join(root_dir, '%s_unknown_subclass_seqs.fasta' % chain_name)
        csv_path = os.path.join(root_dir, '%s_unknown_subclass_seqs_imgt.csv' % chain_name)
        SeqIO.write(records, fasta_path, "fasta")
        run_anarci(fasta_path, csv_path)
    # NOTE: inspection showed that the family information could not be
    # recovered, so the step that merges the ANARCI results back is omitted.


if __name__ == '__main__':
    main()
