import requests, time, random, logging, json, sys
import pandas as pd
from pathlib import Path

# NCBI Taxonomy identifier of the organism of interest; 9606 is Homo sapiens.
# NOTE(review): not referenced in this part of the file -- presumably passed as
# the `taxid` argument of the resolver functions; confirm at the call sites.
TAXID      = "9606"
# Default `sleep` argument: base delay (seconds) of the retry backoff in
# request_with_retry and the pause after each download in
# fetch_annotations_for_accessions.
SLEEP      = 0.3
# Default maximum number of attempts made by request_with_retry.
RETRIES    = 6
# Default `timeout` value that request_with_retry hands to requests.get.
TIMEOUT    = 60
GENE_COL   = None  # If None, try auto-detect. Otherwise e.g. "gene" or "symbol".

# Configures the root logger as soon as this module is loaded: INFO and above,
# each message prefixed with its level name in square brackets.
logging.basicConfig(level=logging.INFO, format="[%(levelname)s] %(message)s")

# Headers that request_with_retry sends with every request; headers supplied
# by the caller are merged on top of (and may override) these.
DEFAULT_HEADERS = {
    "User-Agent": "ppi-debug/1.0 (+https://example.org)",
    "Accept": "application/json",
    "Connection": "keep-alive",
}

# ---------- HTTP with retry/backoff ----------
def _retry_after_seconds(value, default=2.0):
    """Translate a ``Retry-After`` header value into a delay in seconds.

    The header may hold either a whole number of seconds or an HTTP-date.
    A missing or unparseable value yields *default*; the result is never
    negative.
    """
    if value is None:
        return default
    text = str(value).strip()
    try:
        return max(0.0, float(int(text)))
    except (ValueError, OverflowError):
        pass
    # Local imports: only the rare HTTP-date form needs them.
    from datetime import datetime, timezone
    from email.utils import parsedate_to_datetime
    try:
        when = parsedate_to_datetime(text)
    except (TypeError, ValueError, IndexError):
        return default
    if when is None:
        return default
    if when.tzinfo is None:
        # A date without zone information is interpreted as UTC.
        when = when.replace(tzinfo=timezone.utc)
    return max(0.0, (when - datetime.now(timezone.utc)).total_seconds())


def request_with_retry(url, params=None, headers=None, retries=RETRIES, sleep=SLEEP, timeout=TIMEOUT):
    """GET *url*, retrying on network errors and transient HTTP statuses.

    Args:
        url: Address to request.
        params: Optional query parameters passed to ``requests.get``.
        headers: Optional headers merged on top of ``DEFAULT_HEADERS``.
        retries: Maximum number of attempts.
        sleep: Base delay in seconds; doubled on every further attempt.
        timeout: Timeout handed to ``requests.get``.

    Returns:
        The response object of the first attempt answered with status 200.

    Raises:
        RuntimeError: Immediately for a status that is neither 200, 429 nor
            one of 408/500/502/503/504, and after the last attempt when every
            attempt failed.
    """
    merged = dict(DEFAULT_HEADERS)
    if headers:
        merged.update(headers)

    last_err = None   # description of the most recent failure
    last_exc = None   # the exception behind it, when it was a network error
    for i in range(retries):
        try:
            # Keep the try body minimal: only the network call can raise
            # a RequestException.
            r = requests.get(url, params=params, headers=merged, timeout=timeout)
        except requests.RequestException as e:
            last_err = last_exc = e
            wait = sleep * (2 ** i) + 0.2 * i
            note = f"net error: {e}; retry in {wait:.1f}s"
        else:
            ct = r.headers.get("Content-Type", "")
            prefix = (r.text or "")[:200]
            logging.info(f"[HTTP] {r.status_code} {r.url} CT={ct} BODY[:200]={prefix!r}")
            if r.status_code == 200:
                return r
            if r.status_code == 429:
                # Retry-After may be an HTTP-date, so int() alone is not safe.
                wait = _retry_after_seconds(r.headers.get("Retry-After")) + random.uniform(0.2, 0.8)
                note = f"429; wait {wait:.1f}s"
            elif r.status_code in (408, 500, 502, 503, 504):
                wait = sleep * (2 ** i) + random.uniform(0, 0.4)
                note = f"{r.status_code} transient; retry in {wait:.1f}s"
            else:
                raise RuntimeError(f"non-200: {r.status_code} for {r.url} (CT={ct}) body={prefix!r}")
            # Remember the status so the final error is informative even
            # when no exception was ever raised.
            last_err = f"HTTP {r.status_code} for {r.url}"
            last_exc = None
        if i + 1 >= retries:
            # No attempt is left: do not sleep just to fail afterwards.
            break
        logging.warning(note)
        time.sleep(wait)
    raise RuntimeError(f"request failed; last_err={last_err}") from last_exc

# ---------- STRING: resolve string_id + annotation ----------
def resolve_string_info(name, taxid=None):
    """Look up *name* with the STRING ``get_string_ids`` endpoint.

    Args:
        name: Identifier sent as the ``identifiers`` parameter.
        taxid: Optional species identifier; left out of the request when
            not given.

    Returns:
        A dict with the keys ``string_id``, ``preferred_name`` and
        ``annotation`` taken from the first hit. All three are empty strings
        when there is no hit or the payload has an unexpected shape.

    Raises:
        RuntimeError: Propagated from ``request_with_retry``.
    """
    base = "https://version-12-0.string-db.org/api/json/get_string_ids"
    params = {
        "identifiers": name,
        # requests drops parameters whose value is None, so no empty
        # "species=" is sent when the caller gave no taxid.
        "species": taxid or None,
        "limit": 1,
        "echo_query": 1,
        "caller_identity": "ppi-debug/1.0",
    }
    r = request_with_retry(base, params=params)
    empty = {"string_id": "", "preferred_name": "", "annotation": ""}

    items = r.json() or []
    if not items:
        return empty
    first = items[0] if isinstance(items, list) else None
    if not isinstance(first, dict):
        # e.g. an error object instead of the expected list of hits
        logging.warning(f"unexpected STRING payload for {name!r}: {type(items).__name__}")
        return empty
    return {
        "string_id": first.get("stringId", "") or "",
        "preferred_name": first.get("preferredName", "") or "",
        "annotation": first.get("annotation", "") or "",
    }

# ---------- UniProt: resolve accession from gene ----------
def _pick_best_primary_accession(items):
    if not items: return ""
    for it in items:
        if it.get("entryType","" ).upper() == "SWISSPROT" or it.get("reviewed") is True:
            return it.get("primaryAccession") or it.get("uniProtkbId") or ""
    it = items[0]
    return it.get("primaryAccession") or it.get("uniProtkbId") or ""


def uniprot_search(query, fields="accession,reviewed", size=3, organism_id=None):
    """Run a UniProtKB search and return the accession picked from its hits.

    Args:
        query: Query expression; it is wrapped in parentheses before sending.
        fields: Value of the ``fields`` request parameter.
        size: Value of the ``size`` request parameter.
        organism_id: When truthy, the query is AND-ed with an
            ``organism_id:`` clause.

    Returns:
        Whatever ``_pick_best_primary_accession`` selects from the
        ``results`` list of the response (``""`` when there is no hit).
    """
    clauses = [f"({query})"]
    if organism_id:
        clauses.append(f"(organism_id:{organism_id})")

    response = request_with_retry(
        "https://rest.uniprot.org/uniprotkb/search",
        params={
            "query": " AND ".join(clauses),
            "fields": fields,
            "format": "json",
            "size": size,
        },
    )
    payload = response.json() or {}
    hits = payload.get("results") or []
    return _pick_best_primary_accession(hits)


def resolve_accession_by_gene(name, taxid=None):
    """Resolve *name* to a UniProt accession via ``uniprot_search``.

    Queries run from strict to relaxed (``gene_exact``, ``gene``, ``id``,
    ``protein_name``). When *taxid* is given they are first restricted to
    that organism and then repeated without the restriction; without a
    *taxid* only the unrestricted pass runs.

    Args:
        name: Gene or protein name. ``None``, NaN and blank values resolve
            to ``""`` without any request being made.
        taxid: Optional organism identifier for the first pass.

    Returns:
        The first non-empty accession found, or ``""``.
    """
    # A missing value (None, or the float NaN of an empty table cell) or a
    # blank string cannot form a valid query.
    if name is None or (isinstance(name, float) and name != name):
        return ""
    name = str(name).strip()
    if not name:
        return ""

    # Try stricter queries first, then relax them
    queries = [f"gene_exact:{name}", f"gene:{name}", f"id:{name}", f"protein_name:{name}"]
    # Without a taxid the restricted pass would be identical to the
    # unrestricted one, so it is skipped rather than sent twice.
    scopes = [taxid, None] if taxid else [None]
    for organism_id in scopes:
        for q in queries:
            acc = uniprot_search(q, organism_id=organism_id)
            if acc:
                return acc
    return ""


def _xref_values(rec, db):
    """Join the ids and property values of *rec*'s cross-references to *db*."""
    vals = set()
    for xr in rec.get("uniProtKBCrossReferences") or []:
        if xr.get("database") != db:
            continue
        if xr.get("id"):
            vals.add(str(xr["id"]))
        for prop in xr.get("properties") or []:
            for key in ("id", "name", "value"):
                if prop.get(key):
                    vals.add(str(prop[key]))
    return " | ".join(sorted(vals))


def _go_terms(rec):
    """Collect the GO terms of *rec*, grouped by aspect letter (F, P, C)."""
    buckets = {"F": [], "P": [], "C": []}
    for it in rec.get("go") or []:
        aspect = (it.get("aspect") or "").upper()
        term = it.get("term") or it.get("label") or it.get("value")
        if term and aspect in buckets:
            buckets[aspect].append(term)
    if not any(buckets.values()):
        # Fallback: GO cross-references whose property values are prefixed
        # with the aspect letter, e.g. "F:...".
        for xr in rec.get("uniProtKBCrossReferences") or []:
            if xr.get("database") != "GO":
                continue
            for prop in xr.get("properties") or []:
                value = prop.get("value") or ""
                if value.startswith(("F:", "P:", "C:")):
                    buckets[value[0]].append(value[2:].strip())
    return buckets


def _subcellular_notes(rec):
    """Join every subcellular-location string found in *rec*."""
    vals = []
    csl = rec.get("ccSubcellularLocation") or rec.get("cc_subcellular_location")
    if isinstance(csl, list):
        for it in csl:
            if isinstance(it, dict):
                text = it.get("value") or it.get("text") or it.get("note")
                if text:
                    vals.append(str(text))
            elif isinstance(it, str):
                vals.append(it)
    elif isinstance(csl, str) and csl.strip():
        vals.append(csl.strip())

    for cm in rec.get("comments") or []:
        if not (isinstance(cm, dict) and cm.get("commentType") == "SUBCELLULAR LOCATION"):
            continue
        for loc in cm.get("subcellularLocations") or []:
            if not isinstance(loc, dict):
                # tolerate malformed entries instead of failing on .get
                continue
            field = loc.get("location")
            label = field.get("value") if isinstance(field, dict) else field
            if label:
                vals.append(str(label))
        for tx in cm.get("texts") or []:
            if isinstance(tx, dict) and tx.get("value"):
                vals.append(str(tx["value"]))
    return " | ".join(sorted(set(vals)))


def fetch_annotations_for_accessions(accessions, sleep=SLEEP):
    """Download UniProtKB entries and summarise their annotations.

    Args:
        accessions: Iterable of accessions. Falsy values are skipped and an
            accession that occurs more than once is downloaded only once.
        sleep: Pause in seconds after each download.

    Returns:
        A dict mapping each accession to a dict with the keys ``GO_MF``,
        ``GO_BP``, ``GO_CC``, ``Domains`` (InterPro then Pfam), ``Pathways``
        (Reactome), ``Complexes`` (ComplexPortal) and ``Notes`` (subcellular
        location). Every value is a ``" | "``-joined string of sorted,
        de-duplicated items.

    Raises:
        RuntimeError: Propagated from ``request_with_retry``.
    """
    BASE = "https://rest.uniprot.org/uniprotkb"
    result = {}

    for acc in accessions:
        # Skip blanks, and accessions already fetched: repeating the request
        # would only cost time and yield the same record.
        if not acc or acc in result:
            continue
        r = request_with_retry(f"{BASE}/{acc}", params={"format": "json"})
        rec = r.json() or {}

        go = _go_terms(rec)
        interpro = _xref_values(rec, "InterPro")
        pfam = _xref_values(rec, "Pfam")

        result[acc] = {
            "GO_MF": " | ".join(sorted(set(go["F"]))),
            "GO_BP": " | ".join(sorted(set(go["P"]))),
            "GO_CC": " | ".join(sorted(set(go["C"]))),
            "Domains": " | ".join([x for x in [interpro, pfam] if x]),
            "Pathways": _xref_values(rec, "Reactome"),
            "Complexes": _xref_values(rec, "ComplexPortal"),
            "Notes": _subcellular_notes(rec),
        }
        time.sleep(sleep)
    return result

# ---------- Utility: detect the gene column ----------
# Candidate column names in priority order. Matching is case-insensitive, so
# the capitalised variants at the end are redundant but harmless.
POSSIBLE_GENE_COLS = [
    "gene", "symbol", "name", "protein", "candidate", "alias", "preferred_name",
    "Gene", "SYMBOL", "Name"
]


def autodetect_gene_col(cols):
    """Pick the column of *cols* that most likely holds gene names.

    Column labels are compared case-insensitively against
    ``POSSIBLE_GENE_COLS``, in that list's order. Labels that are not strings
    (for example integer headers) are ignored.

    Args:
        cols: Iterable of column labels, or ``None``.

    Returns:
        The matching label exactly as it appears in *cols*, or ``None`` when
        nothing matches.
    """
    if cols is None:
        return None
    # Non-string labels can never equal a candidate name and have no
    # .lower(), so they are left out of the lookup table.
    by_lower = {c.lower(): c for c in cols if isinstance(c, str)}
    for candidate in POSSIBLE_GENE_COLS:
        match = by_lower.get(candidate.lower())
        if match is not None:
            return match
    return None