import requests, time, random, logging, json, sys
import pandas as pd
from pathlib import Path
from utils import *
from config import *

# Local aliases for settings that are not defined in this file (presumably they
# come from the star-imported `config` module -- confirm).
OUTPUT_DIR = LLM_DIR  # not referenced anywhere else in the visible code
OUT = ANN_PATH  # destination of the tab-separated table written by main()
INPUT = TOP_CANDIDATES  # tab-separated input table read by main()

# Column order of the output table. Declared explicitly so that the header row
# is still written when there is no gene to report.
_OUTPUT_COLUMNS = [
    "name",
    "string_id",
    "alias",
    "Annotation",
    "uniprot_acc",
    "GO_MF",
    "GO_BP",
    "GO_CC",
    "Domains",
    "Pathways",
    "Complexes",
    "Notes",
]


def _collect_gene_names(df, columns):
    """Return every non-empty, whitespace-stripped value found in *columns*.

    Missing cells become the string 'nan' after ``astype(str)`` and are
    dropped together with empty strings. Duplicates are kept so the caller
    can report the total number of records.
    """
    names = []
    for col in columns:
        values = df[col].astype(str).str.strip()
        names.extend(v for v in values if v and v != "nan")
    return names


def _resolve_gene(gene):
    """Resolve *gene* via STRING and UniProt, each lookup being best-effort.

    The two lookups are guarded separately so that a failure of one service
    does not throw away the result of the other.

    Returns:
        A ``(string_info, accession)`` tuple. ``string_info`` is always a
        dict; on failure (or a falsy helper result) it is the empty
        placeholder record. ``accession`` is "" when its lookup raised.
    """
    try:
        sinfo = resolve_string_info(gene, TAXID)
    except Exception as e:  # deliberate: one bad gene must not abort the run
        logging.warning(f"Error on {gene!r}: {e}")
        sinfo = None

    try:
        acc = resolve_accession_by_gene(gene, TAXID)
    except Exception as e:  # deliberate: one bad gene must not abort the run
        logging.warning(f"Error on {gene!r}: {e}")
        acc = ""

    if not sinfo:
        sinfo = {"string_id": "", "preferred_name": "", "annotation": ""}
    return sinfo, acc


def _build_rows(genes, string_cache, acc_cache, annots):
    """Assemble one output record (a dict keyed by column name) per gene."""
    rows = []
    for g in genes:
        sinfo = string_cache.get(g, {})
        acc = acc_cache.get(g, "")
        a = annots.get(acc, {}) if acc else {}
        rows.append({
            "name": g,
            "string_id": sinfo.get("string_id", ""),
            # Fall back to the input name when STRING gave no preferred name.
            "alias": sinfo.get("preferred_name", "") or g,
            "Annotation": sinfo.get("annotation", ""),
            "uniprot_acc": acc or "",
            "GO_MF": a.get("GO_MF", ""),
            "GO_BP": a.get("GO_BP", ""),
            "GO_CC": a.get("GO_CC", ""),
            "Domains": a.get("Domains", ""),
            "Pathways": a.get("Pathways", ""),
            "Complexes": a.get("Complexes", ""),
            "Notes": a.get("Notes", ""),
        })
    return rows


def main():
    """Annotate the genes/proteins listed in INPUT and write the table to OUT.

    Steps:
      1. Read the tab-separated INPUT file and gather names from the
         ``protein1`` and ``similar_protein_name`` columns (whichever exist).
      2. For each unique name, call ``resolve_string_info`` and
         ``resolve_accession_by_gene``, sleeping SLEEP seconds between genes.
      3. Fetch annotations for all resolved accessions in one call to
         ``fetch_annotations_for_accessions``.
      4. Write one row per unique name to OUT as a tab-separated file.

    Exits with status 2 when INPUT does not exist or when none of the
    expected columns is present.
    """
    inp = Path(INPUT)
    if not inp.exists():
        logging.error(f"File not found: {INPUT}")
        sys.exit(2)

    df = pd.read_csv(inp, sep='\t')

    # Use both requested columns: protein1 and similar_protein_name
    TARGET_COLS = ["protein1", "similar_protein_name"]
    available = [c for c in TARGET_COLS if c in df.columns]
    if not available:
        logging.error(f"Columns not found: {TARGET_COLS}. Available columns: {list(df.columns)}")
        sys.exit(2)

    logging.info(f"I will use the columns {available} as the list of genes/proteins")

    genes = _collect_gene_names(df, available)
    n_records = len(genes)
    unique_genes = sorted(set(genes))
    logging.info(f"Found {n_records} total records from {available}, {len(unique_genes)} unique genes")

    # 1) STRING + 2) UniProt accession
    string_cache = {}
    acc_cache = {}
    total = len(unique_genes)
    for i, g in enumerate(unique_genes, 1):
        sinfo, acc = _resolve_gene(g)
        string_cache[g] = sinfo
        acc_cache[g] = acc
        # .get() so an incomplete STRING record cannot crash the progress log.
        logging.info(f"[{i}/{total}] {g} -> string_id={sinfo.get('string_id', '')!r}, acc={acc!r}")
        time.sleep(SLEEP)

    # 3) Annotations per accession (in batch, with caching)
    accs = sorted({a for a in acc_cache.values() if a})
    logging.info(f"Retrieving UniProt annotations for {len(accs)} accessions")
    annots = fetch_annotations_for_accessions(accs) or {}

    # Compose output rows
    rows = _build_rows(unique_genes, string_cache, acc_cache, annots)

    # Fixed column list: an empty result still yields a file with a header.
    out_df = pd.DataFrame(rows, columns=_OUTPUT_COLUMNS)
    # Make sure the destination directory exists so the write cannot fail
    # on a missing folder after all the lookups have been done.
    Path(OUT).parent.mkdir(parents=True, exist_ok=True)
    out_df.to_csv(OUT, sep='\t', index=False)
    print(f"Wrote {OUT} with {len(out_df)} proteins")

# Run main() only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()