#!/usr/bin/env python3
"""
Generate an ICLR-style Background section and compile the PDF.
All paths are anonymized via ANON_PROJECT_ROOT.
"""

import re
import json
import tqdm   # type: ignore
import argparse
import requests
import random
import os
from loguru import logger
from concurrent.futures import ThreadPoolExecutor
from create_util import *
from create_cite import *

# ---------- command-line arguments ----------
# ArgParser is not defined in this file; presumably it is provided by one of
# the star imports above.
parser_manager = ArgParser()
args = (
    parser_manager
    .add_topic_args()
    .add_model_args()
    .parse()
)

# The three CLI values every later step of this script relies on.
topic: str = args.topic
topic_description: str = args.topic_description
model_name: str = args.model_name

# ---------- anonymous base directory ----------
# Everything lives under <root>/<topic>/<topic_description>; the root can be
# overridden through the ANON_PROJECT_ROOT environment variable.
BASE_DIR = os.getenv("ANON_PROJECT_ROOT", "./anonymous_root")
WORK_DIR = os.path.join(BASE_DIR, topic, topic_description)
_latex_dir = os.path.join(WORK_DIR, "latex")
_content_dir = os.path.join(_latex_dir, "content")
BACKGROUND_TEX = os.path.join(_content_dir, "background.tex")
BACKGROUND_KEYWORDS_JSON = os.path.join(_content_dir, "background_keywords.json")
BIB_FILE = os.path.join(_latex_dir, "iclr2025_conference.bib")

# ---------- skip if already generated ----------
# A background file larger than 100 bytes is treated as a finished section, so
# the script stops before any model or network call is made.
if os.path.exists(BACKGROUND_TEX) and os.path.getsize(BACKGROUND_TEX) > 100:
    logger.info("Background section already exists; skipping.")
    # exit() is injected by the site module for interactive sessions and is
    # missing under `python -S`; raising SystemExit ends the script with the
    # same zero status without depending on it.
    raise SystemExit(0)

# ---------- load ideas & introduction ----------
# Both inputs are read from WORK_DIR; a missing file raises and aborts the
# script here.
IDEAS_JSON = os.path.join(WORK_DIR, "ideas.json")
INTRO_TEX = os.path.join(WORK_DIR, "latex", "content", "introduction.tex")

with open(IDEAS_JSON, "r", encoding="utf-8") as ideas_file:
    paper_information = json.loads(ideas_file.read())

with open(INTRO_TEX, "r", encoding="utf-8") as intro_file:
    introduction = intro_file.read()

# ---------- generate background outline & keywords ----------
# Prompt sent to the chat model to obtain retrieval keywords.
# - Doubled braces ({{ ... }}) are f-string escapes and render as literal braces.
# - `introduction` is the raw text of introduction.tex.
# - `paper_information` is the object loaded from ideas.json, so it is
#   interpolated through its default string conversion (a Python literal,
#   not re-serialised JSON).
outline_prompt = f"""
Produce a concise background outline (3-4 subsections) plus 4-6 targeted keywords for retrieval.
Return only JSON: {{"keywords": ["kw1", "kw2", ...] }}

###### Introduction BEGIN ######
{introduction}
###### Introduction END ######

###### Paper Information BEGIN ######
{paper_information}
###### Paper Information END ######
"""

# The parsed keyword payload is stored in BACKGROUND_KEYWORDS_JSON so that a
# re-run reuses it instead of querying the model again.
if not os.path.exists(BACKGROUND_KEYWORDS_JSON):
    keywords_data = None
    try_counts = 5
    while try_counts > 0:
        try:
            model = ChatAgent(model_name)
            resp = model.chat({"role": "user", "content": outline_prompt})
            # outline_prompt requests bare JSON, but a reply may also arrive
            # wrapped in a ```json fence; accept either form.
            fenced = re.search(r"```json\n(.*)\n```", resp, re.DOTALL)
            json_part = fenced[1] if fenced else resp.strip()
            keywords_data = json.loads(json_part)
            with open(BACKGROUND_KEYWORDS_JSON, "w", encoding="utf-8") as f:
                json.dump(keywords_data, f, indent=4, ensure_ascii=False)
            break
        except Exception as e:
            # Best-effort retry: any failure (model call, parsing, writing)
            # consumes one attempt; the cause is logged for diagnosis.
            try_counts -= 1
            logger.warning(f"Outline generation retry: {try_counts} ({e!r})")
    if keywords_data is None:
        # Without this guard the script would only fail further down with a
        # NameError on keywords_data, hiding the real cause.
        raise RuntimeError(
            "Could not obtain background keywords from the model after 5 attempts."
        )
else:
    with open(BACKGROUND_KEYWORDS_JSON, "r", encoding="utf-8") as f:
        keywords_data = json.load(f)

# ---------- prepare query list ----------
# A bare list is used as-is; otherwise the first list-valued entry of the
# mapping becomes the query list (empty when no entry is a list).
if isinstance(keywords_data, list):
    query_list = keywords_data
else:
    query_list = []
    for candidate in keywords_data.values():
        if isinstance(candidate, list):
            query_list = candidate
            break

# ---------- semantic similarity filter ----------
# Imported here, after the early-exit check above, rather than at file top.
from sentence_transformers import SentenceTransformer, util

# Cached snippets are reused when the cosine similarity between a new query
# and a cached query exceeds this value.
SIMILARITY_THRESHOLD = 0.75

model_st = SentenceTransformer("all-MiniLM-L6-v2")

# The snippet cache sits under <BASE_DIR>/<topic>/background_pool, i.e. it is
# keyed by topic only and not by topic_description.
BACKGROUND_POOL_DIR = os.path.join(BASE_DIR, topic, "background_pool")
BACKGROUND_CACHE = os.path.join(BACKGROUND_POOL_DIR, "background_cache.json")
os.makedirs(BACKGROUND_POOL_DIR, exist_ok=True)

# Start empty and load the stored entries when a cache file is present.
background_cache = []
if os.path.exists(BACKGROUND_CACHE):
    with open(BACKGROUND_CACHE, "r", encoding="utf-8") as cache_file:
        background_cache = json.load(cache_file)

# Queries of the entries loaded above, in the same order as background_cache.
cached_queries = [entry["query"] for entry in background_cache]
# Snippets collected for the current run.
retrieved_texts = []

# Memoised embeddings of ``cached_queries`` together with the snapshot of the
# queries they were computed from, so a changed list triggers a re-encode.
_cached_query_embeddings = {"queries": None, "embeddings": None}


def max_similarity_score(q):
    """Return the best cosine similarity between ``q`` and the cached queries.

    The embeddings of ``cached_queries`` are computed once and reused across
    calls instead of being re-encoded for every query.

    Args:
        q: Query text to compare against the module-level ``cached_queries``.

    Returns:
        A ``(score, cached_query)`` tuple for the highest-scoring cached
        query, or ``(0.0, "")`` when ``cached_queries`` is empty.
    """
    if not cached_queries:
        return 0.0, ""

    snapshot = tuple(cached_queries)
    if _cached_query_embeddings["queries"] != snapshot:
        _cached_query_embeddings["embeddings"] = model_st.encode(
            cached_queries, convert_to_tensor=True
        )
        _cached_query_embeddings["queries"] = snapshot

    emb_q = model_st.encode(q, convert_to_tensor=True)
    # cos_sim returns one row per query embedding; only a single query is given.
    scores = util.cos_sim(emb_q, _cached_query_embeddings["embeddings"])[0]
    max_idx = scores.argmax().item()
    return scores[max_idx].item(), cached_queries[max_idx]

# For every keyword: reuse a sufficiently similar cached snippet when one
# exists, otherwise search the web and let the model extract a snippet from
# the first relevant page.
for query in query_list:
    max_score, max_query = max_similarity_score(query)
    if max_score > SIMILARITY_THRESHOLD:
        retrieved_texts.append(
            background_cache[cached_queries.index(max_query)]["background"]
        )
        continue

    # ---------- web search ----------
    search_query = f"{query} background theory site:arxiv.org OR site:openreview.net"
    for url in search(search_query, num_results=5):
        # URLs containing "pdf" are skipped; the tag stripping below only
        # makes sense for HTML pages.
        if "pdf" in url:
            continue
        try:
            response = requests.get(url, timeout=10)
            # Treat 4xx/5xx replies as fetch failures instead of passing an
            # error page on to the model.
            response.raise_for_status()
            html = response.text
            text = re.sub(r"<script.*?</script>", "", html, flags=re.DOTALL)
            text = re.sub(r"<.*?>", "", text)
            text = re.sub(r"{.*?}", "", text)
            text = re.sub(r"\s+", " ", text).strip()
            if not text:
                # Nothing left after stripping markup; do not spend a model call.
                continue

            judge_prompt = f"""
Is the following text relevant to "{query}"?
If yes, extract its theoretical background (max 150 words).
If no, reply only "None".

##### Text BEGIN #####
{text}
##### Text END #####
"""
            model = ChatAgent(model_name)
            bg_snippet = model.chat({"role": "user", "content": judge_prompt})
            # Accept close variants of the negative verdict ("none", "None.",
            # quoted "None") and ignore empty replies, so that neither ends
            # up stored as a background snippet.
            verdict = bg_snippet.strip().strip(".\"'").lower()
            if verdict and verdict != "none":
                retrieved_texts.append(bg_snippet)
                background_cache.append({"query": query, "background": bg_snippet})
                # One snippet per keyword is enough; stop trying further URLs.
                break
        except Exception as e:
            # Best-effort retrieval: a failing URL or model call is logged and
            # the next search result is tried.
            logger.debug(f"Web fetch error: {e}")
            continue

# ---------- persist cache ----------
# Serialise before opening the file: opening with "w" truncates immediately,
# so a serialisation error would otherwise wipe the existing cache.
serialized_cache = json.dumps(background_cache, indent=4, ensure_ascii=False)
with open(BACKGROUND_CACHE, "w", encoding="utf-8") as f:
    f.write(serialized_cache)

# ---------- generate background section ----------
# Prompt for the final Background section.
# - `keywords_data` and `retrieved_texts` are interpolated through their
#   default string conversion, so the model sees Python literals.
# - Doubled braces ({{key}}) are f-string escapes rendering as literal {key};
#   the doubled backslashes render as single backslashes (\cite, \citet, \citep).
background_prompt = f"""
Write the Background section in ICLR style based on the outline and retrieved snippets.

Guidelines:
- Academic ancestors: concepts and prior work required to understand our method.
- Use paragraph format (no lists).
- Include formal notation if needed.
- Follow the outline strictly.
- Use \\cite{{key}}, \\citet{{key}}, \\citep{{key}} correctly.

###### Outline BEGIN ######
{keywords_data}
###### Outline END ######

###### Retrieved Snippets BEGIN ######
{retrieved_texts}
###### Retrieved Snippets END ######
"""

# Ask the model for the section until it returns at least 10 characters, but
# give up after a fixed number of attempts instead of looping forever.
latex_output = ""
latex_attempts_left = 5
while len(latex_output) < 10:
    if latex_attempts_left == 0:
        raise RuntimeError(
            "Model did not return a usable Background section after 5 attempts."
        )
    latex_attempts_left -= 1
    model = ChatAgent(model_name)
    # `or ""` keeps len() valid should the helper return None.
    latex_output = model.chat_with_latex_retry(background_prompt) or ""

# ---------- handle citations ----------
# Both helpers are not defined in this file (presumably they come from the
# create_cite star import); their exact effects are not visible here.
get_cite_from_llm_and_arxiv(latex_output, BIB_FILE)
# NOTE(review): the return value is discarded and BACKGROUND_TEX is written
# with latex_output right below, so anything delete_cite writes to that path
# would be overwritten — confirm this is intended.
delete_cite(latex_output, BACKGROUND_TEX)

# Write the generated section to background.tex.
with open(BACKGROUND_TEX, "w", encoding="utf-8") as f:
    f.write(latex_output)

logger.info(f"Background section saved to {BACKGROUND_TEX}")

# ---------- compile PDF ----------
# Imported at this point rather than at the top of the file, so the module is
# only loaded when execution actually reaches the compile step (the early
# exit above skips it).
from create_pdf import create_pdf
create_pdf(topic, topic_description)