"""
Centralized configuration for paths and dataset settings.

Model configurations are defined in config.sh (single source of truth).
This file provides Python access to paths and dataset metadata.

Usage:
    from config import PathConfig, DatasetConfig

    # Access paths
    hf_cache = PathConfig.HF_CACHE
    output_dir = PathConfig.OUTPUT_BASE

    # Access dataset paths
    aime2025_path = DatasetConfig.get_dataset_path("aime_2025")
"""
import os
from pathlib import Path


class PathConfig:
    """
    Central configuration for all file system paths.

    Values are read from environment variables (set by config.sh).
    Defaults are provided as fallback.

    All attributes are plain strings resolved once, when this class body is
    executed (i.e. when the module is imported); changing an environment
    variable afterwards does not update them.

    To change paths, modify config.sh (the single source of truth).
    """

    # ============================================================
    # BASE DIRECTORIES (read from env vars set by config.sh)
    # ============================================================

    # Base directory for datasets (JSONL files); env var: DATASETS_BASE
    DATASETS_BASE = os.environ.get("DATASETS_BASE", "/path/to/datasets")

    # Base directory for inference results output; env var: OUTPUT_BASE
    OUTPUT_BASE = os.environ.get("OUTPUT_BASE", "/path/to/sampling_credit_results")

    # ============================================================
    # HUGGINGFACE CACHE PATHS
    # ============================================================

    # HuggingFace home directory (for model downloads).
    # Note the env var is HF_HOME, not HF_CACHE.
    HF_CACHE = os.environ.get("HF_HOME", "/path/to/hf_cache")

    # Transformers cache; falls back to HF_CACHE when the TRANSFORMERS_CACHE
    # env var is not set, so the two may or may not be the same directory.
    TRANSFORMERS_CACHE = os.environ.get("TRANSFORMERS_CACHE", HF_CACHE)

    # ============================================================
    # DATASET DIRECTORIES
    # ============================================================

    # Individual dataset directories, all located under DATASETS_BASE
    AIME_2025_DIR = os.path.join(DATASETS_BASE, "aime_2025")
    AIME_2024_DIR = os.path.join(DATASETS_BASE, "aime_2024")
    HMMT_2025_DIR = os.path.join(DATASETS_BASE, "hmmt_feb_2025")
    BRUMO_2025_DIR = os.path.join(DATASETS_BASE, "brumo_2025")

    # ============================================================
    # CONVENIENCE METHODS
    # ============================================================

    @classmethod
    def setup_hf_env(cls) -> None:
        """Set HuggingFace environment variables. Call this at the start of scripts.

        Writes ``HF_CACHE`` to ``HF_HOME`` and ``TRANSFORMERS_CACHE`` to
        ``TRANSFORMERS_CACHE`` in ``os.environ`` of the current process.
        """
        os.environ["HF_HOME"] = cls.HF_CACHE
        os.environ["TRANSFORMERS_CACHE"] = cls.TRANSFORMERS_CACHE

    @classmethod
    def ensure_dirs_exist(cls) -> None:
        """Create all configured directories if they don't exist.

        Covers the dataset and output base directories, both HuggingFace
        cache locations and every per-dataset directory. Directories that
        already exist are left untouched.

        Raises:
            OSError: If a directory cannot be created (propagated from
                ``os.makedirs``).
        """
        dirs = [
            cls.DATASETS_BASE,
            cls.HF_CACHE,
            # May differ from HF_CACHE when the TRANSFORMERS_CACHE env var is
            # set explicitly; when it is the same path, exist_ok makes the
            # second makedirs call a no-op.
            cls.TRANSFORMERS_CACHE,
            cls.OUTPUT_BASE,
            cls.AIME_2025_DIR,
            cls.AIME_2024_DIR,
            cls.HMMT_2025_DIR,
            cls.BRUMO_2025_DIR,
        ]
        for d in dirs:
            os.makedirs(d, exist_ok=True)


class DatasetConfig:
    """
    Configuration for dataset paths and metadata.
    Used by data preparation scripts (scripts/prepare_data/*.py).
    """

    # Dataset definitions: name -> dict with the keys
    #   "dir":      directory holding the dataset (taken from PathConfig)
    #   "filename": name of the JSONL file inside "dir"
    #   "hf_id":    HuggingFace id; a single string, or a list of ids when the
    #               dataset is listed under several ids (see "aime_2024")
    #   "split":    split name (every entry here uses "train")
    DATASETS = {
        "aime_2025": {
            "dir": PathConfig.AIME_2025_DIR,
            "filename": "aime_2025.jsonl",
            "hf_id": "MathArena/aime_2025",
            "split": "train",
        },
        "aime_2024": {
            "dir": PathConfig.AIME_2024_DIR,
            "filename": "aime_2024.jsonl",
            "hf_id": ["MathArena/aime_2024_I", "MathArena/aime_2024_II"],
            "split": "train",
        },
        "hmmt_2025": {
            "dir": PathConfig.HMMT_2025_DIR,
            "filename": "hmmt_feb_2025.jsonl",
            "hf_id": "MathArena/hmmt_feb_2025",
            "split": "train",
        },
        "brumo_2025": {
            "dir": PathConfig.BRUMO_2025_DIR,
            "filename": "brumo_2025.jsonl",
            "hf_id": "MathArena/brumo_2025",
            "split": "train",
        },
    }

    @classmethod
    def _get_entry(cls, dataset_name: str) -> dict:
        """Return the DATASETS entry for *dataset_name*.

        Single place for name validation so every public accessor reports
        unknown names the same way.

        Raises:
            ValueError: If *dataset_name* is not a key of ``DATASETS``; the
                message lists the available names.
        """
        try:
            return cls.DATASETS[dataset_name]
        except KeyError:
            # "from None": the KeyError adds nothing beyond the message below.
            raise ValueError(
                f"Unknown dataset: {dataset_name}. Available: {list(cls.DATASETS)}"
            ) from None

    @classmethod
    def get_dataset_path(cls, dataset_name: str) -> str:
        """Get full path to dataset JSONL file.

        Raises:
            ValueError: If *dataset_name* is not a known dataset.
        """
        ds = cls._get_entry(dataset_name)
        return os.path.join(ds["dir"], ds["filename"])

    @classmethod
    def get_dataset_dir(cls, dataset_name: str) -> str:
        """Get directory for dataset.

        Raises:
            ValueError: If *dataset_name* is not a known dataset.
        """
        return cls._get_entry(dataset_name)["dir"]

    @classmethod
    def list_datasets(cls) -> list:
        """List all available dataset names, in definition order."""
        return list(cls.DATASETS)


# ============================================================
# SHELL SCRIPT HELPER - Export paths as environment variables
# ============================================================

def print_shell_exports():
    """Print shell export commands for use in bash scripts.

    Writes one ``export NAME="value"`` line to stdout for each base path
    (``DATASETS_BASE``, ``OUTPUT_BASE``, ``HF_CACHE``) and one
    ``DATASET_<NAME>`` line per dataset returned by
    ``DatasetConfig.list_datasets()``.

    Values are emitted inside double quotes with the characters that remain
    special there (backslash, double quote, ``$`` and backtick)
    backslash-escaped, so the shell reproduces each path literally instead of
    expanding or executing part of it. Paths without those characters are
    printed exactly as before.

    Note that ``PathConfig`` itself reads ``HF_HOME`` (not ``HF_CACHE``) from
    the environment, so sourcing this output does not feed the ``HF_CACHE``
    value back into ``PathConfig``.
    """
    # Characters the shell still interprets inside double quotes.
    escapes = str.maketrans(
        {"\\": "\\\\", '"': '\\"', "$": "\\$", "`": "\\`"}
    )

    def export_line(var_name, value):
        # Build one export statement with a safely double-quoted value.
        return f'export {var_name}="{str(value).translate(escapes)}"'

    print("# Add these to your shell script or source this output")
    print(export_line("DATASETS_BASE", PathConfig.DATASETS_BASE))
    print(export_line("OUTPUT_BASE", PathConfig.OUTPUT_BASE))
    print(export_line("HF_CACHE", PathConfig.HF_CACHE))
    print()
    print("# Dataset paths")
    for name in DatasetConfig.list_datasets():
        path = DatasetConfig.get_dataset_path(name)
        # Shell variable names cannot contain '-', so map it to '_'.
        var_name = f"DATASET_{name.upper().replace('-', '_')}"
        print(export_line(var_name, path))


if __name__ == "__main__":
    # Executed as a script (not imported): write the shell export commands
    # to stdout so a shell script can source or eval them.
    print_shell_exports()
