import os
from pathlib import Path
from dotenv import load_dotenv

# Directory that contains this file; computed once and reused below.
_MODULE_DIR = Path(__file__).parent

# Load the .env file that sits next to this module.
load_dotenv(_MODULE_DIR / ".env")

# Absolute form of this file's directory, used as the base_dir for project paths.
BASE_DIR = _MODULE_DIR.absolute()

# Model context window sizes (tokens). Used for pre-truncation.
# Add new models as needed. Models that are not listed (and do not start with
# a listed name) fall back to DEFAULT_CONTEXT_WINDOW.
MODEL_CONTEXT_WINDOWS = {
    # OpenAI
    "gpt-4o": 128_000,
    "gpt-4o-mini": 128_000,
    "gpt-4-turbo": 128_000,
    "gpt-4": 8_192,
    "gpt-3.5-turbo": 16_385,
    "o1": 200_000,
    "o1-mini": 128_000,
    "o3-mini": 200_000,
    # Google Gemini
    "gemini-2.5-pro": 1_048_576,
    "gemini-2.5-flash": 1_048_576,
    "gemini-3-pro-preview": 1_048_576,
    "gemini-3-flash-preview": 1_048_576,
    "gemini-2.0-flash": 1_048_576,
    "gemini-1.5-pro": 2_097_152,
    "gemini-1.5-flash": 1_048_576,
    # Anthropic
    "claude-sonnet-4-20250514": 200_000,
    "claude-opus-4-20250514": 200_000,
    "claude-3-5-sonnet-20241022": 200_000,
    # DeepSeek
    "deepseek-chat": 128_000,
    "deepseek-reasoner": 128_000,
}

# Window size (tokens) assumed for any model that cannot be resolved above.
DEFAULT_CONTEXT_WINDOW = 128_000


def get_model_context_window(model_name: str) -> int:
    """Return the context window size (in tokens) for ``model_name``.

    Resolution order:

    1. Exact match against ``MODEL_CONTEXT_WINDOWS``.
    2. The *longest* listed name that is a prefix of ``model_name``
       (e.g. "gpt-4o-2024-08-06" → "gpt-4o"). Choosing the longest prefix
       makes the result independent of the table's insertion order, so a
       dated "o1-mini-..." name resolves to "o1-mini" rather than to the
       shorter "o1" entry that precedes it in the table.
    3. ``DEFAULT_CONTEXT_WINDOW`` when nothing matches.

    Args:
        model_name: Model identifier. An empty string or ``None`` yields the
            default window.

    Returns:
        The context window size in tokens.
    """
    # Nothing to look up; also avoids calling .startswith on None.
    if not model_name:
        return DEFAULT_CONTEXT_WINDOW

    # Exact match
    if model_name in MODEL_CONTEXT_WINDOWS:
        return MODEL_CONTEXT_WINDOWS[model_name]

    # Prefix match: the most specific (longest) listed name wins.
    best_prefix = max(
        (known for known in MODEL_CONTEXT_WINDOWS if model_name.startswith(known)),
        key=len,
        default=None,
    )
    if best_prefix is None:
        return DEFAULT_CONTEXT_WINDOW
    return MODEL_CONTEXT_WINDOWS[best_prefix]


def _build_config() -> dict:
    """Assemble the nested settings dictionary exposed as ``config``.

    Values come from environment variables (with the defaults written below),
    from fixed literals, and from paths derived from ``BASE_DIR``. Integer and
    float settings are parsed with ``int()`` / ``float()``, so a malformed
    environment value raises ``ValueError`` while the module is imported.
    """

    def env_int(name: str, fallback: str) -> int:
        # The fallback stays a string so it goes through the same int() parsing
        # as a value supplied by the environment.
        return int(os.getenv(name, fallback))

    # Settings shared by every OpenAI-compatible client section.
    api_key = os.getenv("OPENAI_API_KEY", "")
    base_url = os.getenv("OPENAI_BASE_URL", "https://api.openai.com/v1")
    chat_model = os.getenv("OPENAI_CHAT_MODEL", "gpt-4o-mini")
    request_timeout_s = env_int("OPENAI_TIMEOUT_S", "30")

    def judge_client(model: str) -> dict:
        # Each call returns a fresh dict so the judge sections stay independent.
        return {
            "api_key": api_key,
            "base_url": base_url,
            "model": model,
            "timeout_s": request_timeout_s,
        }

    # ========== API configuration ==========
    # LLM API (OpenAI-compatible) — used for generation (init, evolution, hypothesis)
    llm_section = {
        "api_key": api_key,
        "base_url": base_url,
        "model": chat_model,
        "temperature": 0.7,
        "max_tokens": 2000,
        "timeout_s": request_timeout_s,
    }

    # Embedding API; its timeout falls back to the general OpenAI timeout.
    embedding_section = {
        "api_key": api_key,
        "base_url": base_url,
        "model": os.getenv("OPENAI_EMBEDDING_MODEL", "text-embedding-3-small"),
        "dimension": 1536,
        "batch_size": 10,
        "timeout_s": env_int(
            "OPENAI_EMBEDDING_TIMEOUT_S",
            os.getenv("OPENAI_TIMEOUT_S", "30"),
        ),
    }

    # arXiv API
    arxiv_section = {
        "base_url": os.getenv("ARXIV_BASE_URL", "https://export.arxiv.org/api/query"),
        "max_results": 100,
        "delay_ms": env_int("ARXIV_DELAY_MS", "3000"),
        "timeout_s": env_int("ARXIV_TIMEOUT_S", "60"),
        "batch_size": env_int("ARXIV_BATCH_SIZE", "100"),
        "min_batch_size": env_int("ARXIV_MIN_BATCH_SIZE", "10"),
        "retries": env_int("ARXIV_RETRIES", "5"),
        "retry_backoff_s": env_int("ARXIV_RETRY_BACKOFF_S", "30"),
        "window_months": env_int("ARXIV_WINDOW_MONTHS", "12"),
        "min_split_days": env_int("ARXIV_MIN_SPLIT_DAYS", "31"),
        "max_split_depth": env_int("ARXIV_MAX_SPLIT_DEPTH", "4"),
        "user_agent": os.getenv("ARXIV_USER_AGENT", "CKM-Eval/1.0"),
    }

    api_section = {
        "llm": llm_section,
        # Judge API — strong model for novelty scoring (few calls per topic)
        "judge": judge_client(os.getenv("OPENAI_JUDGE_MODEL", chat_model)),
        # Judge Lite API — strong model for hit verify final judgment, keyword extract, profiles
        "judge_lite": judge_client(os.getenv("OPENAI_JUDGE_LITE_MODEL", chat_model)),
        # Judge Mini API — cheap pre-filter for hit verify (two-stage screening).
        # Unlike the other judges, its fallback is the literal model name, not the chat model.
        "judge_mini": judge_client(os.getenv("OPENAI_JUDGE_MINI_MODEL", "gpt-4o-mini")),
        # Hit verify pre-filter threshold
        "hit_prefilter_threshold": float(os.getenv("HIT_PREFILTER_THRESHOLD", "5.0")),
        "embedding": embedding_section,
        "arxiv": arxiv_section,
    }

    # ========== Experiment configuration ==========
    # Strings (after lower-casing) that switch a boolean flag on.
    enabled_words = ("1", "true", "yes", "on")
    backfill_flag = os.getenv("EVOLUTION_BACKFILL_ON_EMPTY", "true").lower()

    experiment_section = {
        # Phase 1: Init — time range for building the knowledge baseline
        "init_start": os.getenv("INIT_START", "2016-01-01"),
        "init_end": os.getenv("INIT_END", "2021-01-01"),
        # Phase 2: Evolution — time range and step size (months) for incremental evolution
        "evolution_start": os.getenv("EVOLUTION_START", "2021-01-01"),
        "evolution_end": os.getenv("EVOLUTION_END", "2024-01-01"),
        "evolution_step_months": env_int("EVOLUTION_STEP_MONTHS", "2"),
        "evolution_window_paper_limit": env_int("EVOLUTION_WINDOW_PAPER_LIMIT", "8"),
        "evolution_backfill_on_empty": backfill_flag in enabled_words,
        "hypothesis_max_per_window": env_int("HYPOTHESIS_MAX_PER_WINDOW", "3"),
        "fulltext_timeout_s": env_int("FULLTEXT_TIMEOUT_S", "180"),
        "fulltext_retries": env_int("FULLTEXT_RETRIES", "2"),
        "fulltext_retry_delay_s": env_int("FULLTEXT_RETRY_DELAY_S", "2"),
        # Phase 3: Validation — time range used for validation
        "validation_start": os.getenv("VALIDATION_START", "2024-01-01"),
        "validation_end": os.getenv("VALIDATION_END", "2026-01-01"),
        # Paper search
        "max_papers": env_int("MAX_PAPERS", "300"),
        "init_max_papers": env_int("INIT_MAX_PAPERS", "48"),
        "evolution_max_papers": env_int("EVOLUTION_MAX_PAPERS", "96"),
        "validation_max_papers": env_int("VALIDATION_MAX_PAPERS", "180"),
        "validation_min_papers": env_int("VALIDATION_MIN_PAPERS", "120"),
        "validation_fulltext_concurrency": env_int("VALIDATION_FULLTEXT_CONCURRENCY", "8"),
        "validation_profile_concurrency": env_int("VALIDATION_PROFILE_CONCURRENCY", "4"),
        "validation_judge_concurrency": env_int("VALIDATION_JUDGE_CONCURRENCY", "4"),
        "phase_search_window_months": env_int("PHASE_SEARCH_WINDOW_MONTHS", "6"),
    }

    # ========== Path configuration ==========
    cache_dir = BASE_DIR / ".cache"
    metabolism_dir = BASE_DIR / "metabolism"
    log_dir = metabolism_dir / "log"

    paths_section = {
        "root": BASE_DIR,
        "cache": cache_dir,
        "arxiv_metadata_cache": cache_dir / "arxiv_metadata",
        "metabolism": metabolism_dir,
        "knowledge": metabolism_dir / "knowledge",
        "diffs": metabolism_dir / "diffs",
        "hypotheses": metabolism_dir / "hypotheses",
        "log": log_dir,
        "batch": BASE_DIR / "batch",
        "output": BASE_DIR / "output",
        "reports": BASE_DIR / "reports",
    }

    # ========== Logging configuration ==========
    logging_section = {
        "level": os.getenv("LOG_LEVEL", "info"),
        "format": "json",
        # Same directory as paths["log"].
        "output_dir": log_dir,
    }

    return {
        "api": api_section,
        "experiment": experiment_section,
        "paths": paths_section,
        "logging": logging_section,
    }


# Module-level settings object, built once at import time.
config = _build_config()
