import os
from utils import extract_seq_from_pdb
from tqdm import tqdm


def parse_cath(root="./RAG/cath42", modes=("train", "validation", "test")):
    """Split each mode's alignment file into one hit list per query.

    For every mode, reads ``<root>/<mode>/aln``, groups its lines by query
    and writes ``<root>/<mode>/rag_seq/<query>`` containing one
    ``"<query> <target> <score>"`` line per hit, in input order.

    Each alignment line is split on whitespace: the first field is the query
    id (only the part before the first ``_`` is kept), the second is the
    target and the third is the score. Any further fields are ignored and
    blank lines are skipped.

    Args:
        root: Directory holding one sub-directory per mode.
        modes: Names of the sub-directories to process.

    Raises:
        FileNotFoundError: If a mode's ``aln`` file does not exist.
        IndexError: If a non-blank line has fewer than three fields.
    """
    for mode in modes:
        data_dir = os.path.join(root, mode)
        aln_path = os.path.join(data_dir, "aln")
        rag_seq_dir = os.path.join(data_dir, "rag_seq")
        os.makedirs(rag_seq_dir, exist_ok=True)

        # Read everything up front so tqdm knows the total and can show
        # a real progress bar rather than a bare counter.
        with open(aln_path) as f:
            lines = f.readlines()

        # query id -> already-formatted output lines, in input order.
        query2matchs = {}
        for line in tqdm(lines):
            items = line.split()
            if not items:
                # Blank (e.g. trailing) lines carry no hit; skip them
                # instead of failing on items[0].
                continue
            query = items[0].split("_")[0]
            target = items[1]
            score = items[2]
            query2matchs.setdefault(query, []).append(
                f"{query} {target} {score}\n"
            )

        for query, hit_lines in query2matchs.items():
            with open(os.path.join(rag_seq_dir, query), "w") as fw:
                fw.writelines(hit_lines)

# Script entry point: runs the conversion for every mode. There is no
# __main__ guard, so this also executes whenever the module is imported.
parse_cath()