# Download papers from OpenReview and summarise their experiments into JSON files
import re
import os
from pypdf import PdfReader
import pandas as pd
import requests
import json
from create_util import *

# Parse the command line through ArgParser (topic + CSV argument groups) and
# keep the two values the rest of the script reads: the topic name and the
# path of the input CSV.
parser_manager = ArgParser()
args = (
    parser_manager
    .add_topic_args()
    .add_csv_args()
    .parse()
)
topic, original_csv_path = args.topic, args.original_csv_path

# -----------------------------------------------------------------------------
# Anonymous base directory – change to your own location or export via ENV
# -----------------------------------------------------------------------------
BASE_DIR = os.getenv("ANON_PROJECT_ROOT", "./anonymous_project_root")

# Paper metadata stored as JSON under <BASE_DIR>/<topic>/humanpaper/.
humanpaper_info = os.path.join(BASE_DIR, topic, "humanpaper", "paper_info.json")
with open(humanpaper_info, "r", encoding="utf-8") as info_file:
    results = json.loads(info_file.read())

# The CSV supplies, per row, the OpenReview id and the topic description that
# names the paper's working directory.
df = pd.read_csv(original_csv_path)
ids, topic_descriptions = (
    df[column].tolist() for column in ("id", "topic_description")
)

# Ensure each topic description has its own "download_paper" directory,
# echoing the description as it is handled.
for description in topic_descriptions:
    print(description)
    download_dir = os.path.join(BASE_DIR, topic, description, "download_paper")
    os.makedirs(download_dir, exist_ok=True)

# -----------------------------------------------------------------------------
# Download PDFs and extract experiment sections
# -----------------------------------------------------------------------------
def _safe_filename(title):
    """Return *title* with spaces, colons and path separators replaced by "_".

    Path separators must go because the result is used as a single file name:
    a "/" inside a paper title would otherwise point into a directory that
    does not exist, and every download attempt for that paper would fail.
    """
    cleaned = title.replace(" ", "_").replace(":", "_")
    for separator in {"/", os.sep}:
        cleaned = cleaned.replace(separator, "_")
    return cleaned


def _download_pdf(url, destination, attempts=10):
    """Stream *url* into *destination*, trying up to *attempts* times.

    The body is first written to "<destination>.part" and only moved to its
    final name after the whole response has been saved, so an interrupted
    transfer never leaves a truncated PDF behind. The response is used as a
    context manager so its connection is released on every code path.

    Returns:
        True when the file was written, False once every attempt has failed.
    """
    partial_path = f"{destination}.part"
    for _ in range(attempts):
        try:
            with requests.get(url, stream=True, timeout=20) as resp:
                resp.raise_for_status()
                with open(partial_path, "wb") as fh:
                    for chunk in resp.iter_content(chunk_size=2048):
                        fh.write(chunk)
            os.replace(partial_path, destination)
            return True
        except requests.exceptions.HTTPError as err:
            print(f"HTTP error: {err}. Retrying...")
        except Exception as err:
            print(f"General error: {err}. Retrying...")
        # Only reached after a failed attempt: discard the incomplete file.
        if os.path.exists(partial_path):
            os.remove(partial_path)
    return False


# The loop pairs entry i of paper_info.json with row i of the CSV, so it can
# only cover positions that exist in both inputs.
titles = df["title"].tolist()
paper_count = min(len(results), len(ids))
if paper_count < len(results):
    print(f"Warning: paper_info.json has {len(results)} entries but the CSV "
          f"has {len(ids)} rows; processing the first {paper_count}.")

for idx in range(paper_count):
    pdf_url = f"https://openreview.net/pdf?id={ids[idx]}"
    title = _safe_filename(titles[idx])
    output_pdf = os.path.join(BASE_DIR, topic, topic_descriptions[idx],
                              "download_paper", f"{title}.pdf")
    print(f"Attempting download from {pdf_url} -> {output_pdf}")

    if not _download_pdf(pdf_url, output_pdf):
        print(f"Failed to download {title} after multiple attempts.")
        continue
    print(f"Downloaded: {title}")

    # -------------------------------------------------------------------------
    # Extract text and generate experiment JSON
    # -------------------------------------------------------------------------
    experiment_json_path = os.path.join(BASE_DIR, topic, topic_descriptions[idx],
                                        "paper_info_add_experiment_setting.json")
    # An existing result file means this description was already processed.
    if os.path.exists(experiment_json_path):
        continue

    # A response that is not a valid PDF (for example an HTML error page)
    # makes pypdf raise; skip that paper instead of aborting the whole batch.
    try:
        reader = PdfReader(output_pdf)
        text = "".join(page.extract_text() or "" for page in reader.pages)
    except Exception as err:
        print(f"Could not read {output_pdf}: {err}. Skipping.")
        continue

    prompt = f"""
You are given a paper about "{topic}".

Identify and summarise the **essential experiments** that the authors performed to support their claims.
Return a JSON array. Each element must be an object with:
- experiment_purpose
- experiment_setting: detailed setup (models, datasets, baselines, hyper-parameters, etc.)
- experiment_result

Paper content:
{text}

Your JSON output:
    """

    # A fresh agent per paper, as before; whether ChatAgent keeps state between
    # calls is not visible here, so it is deliberately not shared across papers.
    model = ChatAgent("deepseek-v3")
    json_output = model.chat_with_json_retry(prompt)

    with open(experiment_json_path, "w", encoding="utf-8") as f:
        json.dump(json_output, f, indent=4, ensure_ascii=False)