"""OpenCompass evaluation tool with vLLM local inference support.

Based on FT-Agent benchmark implementation.
"""

import os
import json
import subprocess
from collections.abc import Sequence
from pathlib import Path
from typing import TYPE_CHECKING, Any, Self

import pandas as pd
import yaml
from jinja2 import Template
from pydantic import Field

from openhands.sdk import Action, Observation, TextContent, ImageContent
from openhands.sdk.tool import ToolDefinition, ToolExecutor

if TYPE_CHECKING:
    from openhands.sdk.conversation import LocalConversation


# ==================== Benchmark Configuration ====================
# Mapping from benchmark_name -> OpenCompass dataset import path.
# The value is rendered into the generated config as
# ``from <path> import *`` inside ``with read_base():``.
# OpenCompassExecutor skips any requested benchmark that is not a key here.
BENCHMARK_CONFIG: dict[str, str] = {
    # Math Reasoning
    "aime24": "opencompass.configs.datasets.aime2024.aime2024_gen_17d799",
    "aime25": "opencompass.configs.datasets.aime2025.aime2025_cascade_eval_gen_5e9f4f",
    "math": "opencompass.configs.datasets.math.math_0shot_gen_393424",
    # General Knowledge
    "mmlu": "opencompass.configs.datasets.mmlu.mmlu_gen",
    # Code Generation
    "humaneval": "opencompass.configs.datasets.humaneval.humaneval_gen",
    "mbpp": "opencompass.configs.datasets.mbpp.mbpp_gen",
    # PANORAMA - Patent Analysis (zero-shot)
    "panorama": "opencompass.configs.datasets.panorama.panorama_gen",
    "panorama_par4pc": "opencompass.configs.datasets.panorama.panorama_par4pc_gen",
    "panorama_pi4pc": "opencompass.configs.datasets.panorama.panorama_pi4pc_gen",
    "panorama_noc4pc": "opencompass.configs.datasets.panorama.panorama_noc4pc_gen",
    # PANORAMA - Patent Analysis (CoT)
    "panorama_par4pc_cot": "opencompass.configs.datasets.panorama.panorama_par4pc_cot_gen",
    "panorama_pi4pc_cot": "opencompass.configs.datasets.panorama.panorama_pi4pc_cot_gen",
    "panorama_noc4pc_cot": "opencompass.configs.datasets.panorama.panorama_noc4pc_cot_gen",
    # ChemCoTBench - Chemistry Reasoning
    "chemcotbench": "opencompass.configs.datasets.chemcotbench.chemcotbench_gen",
    "chemcotbench_mol_und": "opencompass.configs.datasets.chemcotbench.chemcotbench_mol_und_gen",
    "chemcotbench_mol_edit": "opencompass.configs.datasets.chemcotbench.chemcotbench_mol_edit_gen",
    "chemcotbench_mol_opt": "opencompass.configs.datasets.chemcotbench.chemcotbench_mol_opt_gen",
    "chemcotbench_reaction": "opencompass.configs.datasets.chemcotbench.chemcotbench_reaction_gen",
    # TableBench - Table Question Answering
    "tablebench_data_analysis": "opencompass.configs.datasets.tablebench.tablebench_data_analysis_gen",
    "tablebench_fact_checking": "opencompass.configs.datasets.tablebench.tablebench_fact_checking_gen",
    "tablebench_numerical_reasoning": "opencompass.configs.datasets.tablebench.tablebench_numerical_reasoning_gen",
    "tablebench_visualization": "opencompass.configs.datasets.tablebench.tablebench_visualization_gen",
    "tablebench_gen": "opencompass.configs.datasets.tablebench.tablebench_gen",
    # BioProBench
    "bioprobench_gen": "opencompass.configs.datasets.bioprobench.bioprobench_gen",
    "bioprobench_ord": "opencompass.configs.datasets.bioprobench.bioprobench_ord",
    "bioprobench_err": "opencompass.configs.datasets.bioprobench.bioprobench_err",
    "bioprobench_pqa": "opencompass.configs.datasets.bioprobench.bioprobench_pqa",
    # FinanceIQ - Financial QA (requires LLM Judge)
    "financeiq": "opencompass.configs.datasets.FinanceIQ.FinanceIQ_gen_e0e6b5",
}

# ==================== Model Default Configuration ====================
# Baseline inference settings. get_model_config() copies this dict and then
# layers a matching MODEL_CONFIGS entry on top; the result is passed as
# keyword arguments when rendering VLLM_CONFIG_TEMPLATE.
MODEL_DEFAULTS: dict[str, Any] = {
    "temperature": 0.6,
    "top_p": 0.95,
    "top_k": 20,
    "max_seq_len": 32768,
    "max_out_len": 8192,
    "batch_size": 16,
    # Replaced by get_model_config() when gpu_count > 1.
    "tensor_parallel_size": 1,
    "gpu_memory_utilization": 0.9,
    # The vLLM template only emits repetition_penalty when it differs from 1.0.
    "repetition_penalty": 1.0,
    "dtype": "bfloat16",
    # Either flag below makes the vLLM template add the
    # 'extract-non-reasoning-content' prediction postprocessor.
    "enable_thinking": False,
    "use_cot_postprocessor": True,
}

# Model-specific overrides.
# Keys are prefixes tested with str.startswith() against the base model name
# in get_model_config(); only the first matching entry (in dict order) is
# applied on top of MODEL_DEFAULTS.
# NOTE(review): _generate_vllm_config() afterwards overwrites max_out_len,
# max_seq_len, batch_size and use_cot_postprocessor with the EvalAction
# values, so those keys here do not reach the rendered vLLM config.
MODEL_CONFIGS: dict[str, dict[str, Any]] = {
    "Qwen/Qwen3": {
        "max_seq_len": 40960,
        "max_out_len": 38912,
        "enable_thinking": True,
    },
    "Qwen/Qwen2.5": {
        "temperature": 0.0,
        "top_p": 1.0,
        "top_k": 1,
        "use_cot_postprocessor": True,
    },
    "meta-llama/Llama-3": {
        "temperature": 0.7,
        "max_out_len": 4096,
    },
}


# ==================== OpenCompass Config Template (Jinja2) ====================
# Source text of the OpenCompass config used for local vLLM inference.
# Rendered with jinja2.Template in OpenCompassExecutor._generate_vllm_config()
# and written to <benchmark_dir>/config.py before the CLI is invoked.
#
# Template variables: dataset_imports (list of module paths), test_range,
# num_runs, model_abbr, model_path, is_lora, lora_path, work_dir, plus every
# key of the dict returned by get_model_config().
#
# String variables are substituted inside single-quoted Python literals with
# no escaping, so a value containing a quote or backslash changes the
# generated source.
VLLM_CONFIG_TEMPLATE = """from mmengine.config import read_base
from opencompass.models import VLLMwithChatTemplate

# ==================== Dataset Import ====================
with read_base():
{% for dataset_module in dataset_imports %}
    from {{ dataset_module }} import *
{% endfor %}

# Aggregate all dataset variables
datasets = sum([v for k, v in locals().items() if (k == 'datasets' or k.endswith('_datasets')) and isinstance(v, list)], [])

# Apply dataset modifications
for ds in datasets:
{% if test_range %}
    # Apply dataset range
    if 'reader_cfg' not in ds:
        ds['reader_cfg'] = {}
    ds['reader_cfg']['test_range'] = '{{ test_range }}'

    # Sync to evaluator's dataset_cfg
    if 'eval_cfg' in ds and 'evaluator' in ds['eval_cfg']:
        evaluator = ds['eval_cfg']['evaluator']
        if isinstance(evaluator, dict) and 'dataset_cfg' in evaluator:
            if 'reader_cfg' not in evaluator['dataset_cfg']:
                evaluator['dataset_cfg']['reader_cfg'] = {}
            evaluator['dataset_cfg']['reader_cfg']['test_range'] = '{{ test_range }}'
{% endif %}
{% if num_runs and num_runs > 1 %}
    # Multiple runs for averaging
    ds['n'] = {{ num_runs }}
{% endif %}
    pass

# ==================== Model Configuration ====================
models = [
    dict(
        type=VLLMwithChatTemplate,
        abbr='{{ model_abbr }}',
        path='{{ model_path }}',
        model_kwargs=dict(
            tensor_parallel_size={{ tensor_parallel_size }},
            gpu_memory_utilization={{ gpu_memory_utilization }},
            trust_remote_code=True,
            dtype='{{ dtype }}',
            max_model_len={{ max_seq_len }},
{% if is_lora %}
            enable_lora=True,
            max_lora_rank=64,
            max_cpu_loras=1,
{% endif %}
        ),
{% if is_lora %}
        lora_path='{{ lora_path }}',
{% endif %}
        max_seq_len={{ max_seq_len }},
        max_out_len={{ max_out_len }},
        batch_size={{ batch_size }},
        generation_kwargs=dict(
            temperature={{ temperature }},
            top_p={{ top_p }},
            top_k={{ top_k }},
{% if repetition_penalty != 1.0 %}
            repetition_penalty={{ repetition_penalty }},
{% endif %}
        ),
{% if enable_thinking %}
        chat_template_kwargs=dict(enable_thinking=True),
{% endif %}
{% if enable_thinking or use_cot_postprocessor %}
        pred_postprocessor=dict(type='extract-non-reasoning-content'),
{% endif %}
        run_cfg=dict(
            num_gpus={{ tensor_parallel_size }},
            num_procs=1,
        ),
    ),
]

# ==================== Inference Configuration ====================
infer = dict(
    partitioner=dict(type='NaivePartitioner'),
    runner=dict(
        type='LocalRunner',
        max_num_workers=16,
        task=dict(type='OpenICLInferTask'),
    ),
)

# ==================== Evaluation Configuration ====================
eval = dict(
    partitioner=dict(type='NaivePartitioner'),
    runner=dict(
        type='LocalRunner',
        max_num_workers=16,
        task=dict(type='OpenICLEvalTask', dump_details=True),
    ),
)

work_dir = '{{ work_dir }}'
"""

# API model template (for external API evaluation).
# Rendered with jinja2.Template in OpenCompassExecutor._generate_api_config()
# and written to <benchmark_dir>/config.py before the CLI is invoked.
#
# Template variables: dataset_imports (list of module paths), test_range,
# model_abbr, model_name, api_key, api_base, query_per_second, max_out_len,
# max_seq_len, batch_size, max_num_workers, work_dir.
#
# NOTE: api_key is rendered verbatim, so the generated config.py on disk
# contains the key in plain text. String variables are substituted inside
# single-quoted Python literals with no escaping.
API_CONFIG_TEMPLATE = """from mmengine.config import read_base
from opencompass.models import OpenAI

# ==================== Dataset Import ====================
with read_base():
{% for dataset_module in dataset_imports %}
    from {{ dataset_module }} import *
{% endfor %}

# Aggregate all dataset variables
datasets = sum([v for k, v in locals().items() if (k == 'datasets' or k.endswith('_datasets')) and isinstance(v, list)], [])

# Apply dataset modifications
for ds in datasets:
{% if test_range %}
    if 'reader_cfg' not in ds:
        ds['reader_cfg'] = {}
    ds['reader_cfg']['test_range'] = '{{ test_range }}'

    if 'eval_cfg' in ds and 'evaluator' in ds['eval_cfg']:
        evaluator = ds['eval_cfg']['evaluator']
        if isinstance(evaluator, dict) and 'dataset_cfg' in evaluator:
            if 'reader_cfg' not in evaluator['dataset_cfg']:
                evaluator['dataset_cfg']['reader_cfg'] = {}
            evaluator['dataset_cfg']['reader_cfg']['test_range'] = '{{ test_range }}'
{% endif %}
    pass

# ==================== API Model Configuration ====================
api_meta_template = dict(round=[
    dict(role='HUMAN', api_role='HUMAN'),
    dict(role='BOT', api_role='BOT', generate=True),
])

models = [
    dict(
        abbr='{{ model_abbr }}',
        type=OpenAI,
        path='{{ model_name }}',
        key='{{ api_key }}',
        openai_api_base='{{ api_base }}',
        meta_template=api_meta_template,
        query_per_second={{ query_per_second }},
        max_out_len={{ max_out_len }},
        max_seq_len={{ max_seq_len }},
        batch_size={{ batch_size }},
        retry=5,
    ),
]

# ==================== Inference Configuration ====================
infer = dict(
    partitioner=dict(type='NaivePartitioner'),
    runner=dict(
        type='LocalRunner',
        max_num_workers={{ max_num_workers }},
        retry=2,
        task=dict(type='OpenICLInferTask'),
    ),
)

# ==================== Evaluation Configuration ====================
eval = dict(
    partitioner=dict(type='NaivePartitioner'),
    runner=dict(
        type='LocalRunner',
        max_num_workers=4,
        retry=2,
        task=dict(type='OpenICLEvalTask', dump_details=True),
    ),
)

work_dir = '{{ work_dir }}'
"""


def detect_lora_model(model_path: str) -> bool:
    """Return True when *model_path* is a directory holding LoRA adapter files.

    The path counts as an adapter if it exists and contains at least one of
    ``adapter_config.json``, ``adapter_model.bin`` or
    ``adapter_model.safetensors``. A missing path is never an adapter.
    """
    root = Path(model_path)
    if not root.exists():
        return False

    marker_files = (
        "adapter_config.json",
        "adapter_model.bin",
        "adapter_model.safetensors",
    )
    return any((root / marker).exists() for marker in marker_files)


def get_model_config(base_model_name: str, gpu_count: int = 1) -> dict:
    """Build the inference settings for *base_model_name*.

    Starts from a copy of ``MODEL_DEFAULTS``, applies the first
    ``MODEL_CONFIGS`` entry whose key is a prefix of the model name, and,
    when more than one GPU is available, sets ``tensor_parallel_size`` to the
    largest power of two that does not exceed *gpu_count*.
    """
    settings = dict(MODEL_DEFAULTS)

    # First matching prefix wins; later entries are not consulted.
    matched = next(
        (
            overrides
            for prefix, overrides in MODEL_CONFIGS.items()
            if base_model_name.startswith(prefix)
        ),
        None,
    )
    if matched is not None:
        settings.update(matched)

    if gpu_count > 1:
        # Largest power of two <= gpu_count.
        parallel = 1
        while parallel * 2 <= gpu_count:
            parallel *= 2
        settings["tensor_parallel_size"] = parallel

    return settings


def extract_error_samples(results_dir: Path, max_samples: int = 10) -> list[dict]:
    """Collect up to *max_samples* wrong predictions from prediction JSON files.

    Looks for ``results_dir / "predictions"``; if that does not exist, the
    first path named ``predictions`` found anywhere below *results_dir* is
    used. Every ``*.json`` file under it is read in sorted path order.

    A file's top level may be either a list of records or a mapping whose
    values are records. A record is treated as an error when its ``correct``
    field is ``False``, or, when ``correct`` is absent/``None``, when the
    stripped string forms of gold and prediction differ.

    Extraction is best-effort: unreadable or malformed files and records
    that are not JSON objects are skipped rather than raised.

    Args:
        results_dir: Directory produced by an evaluation run.
        max_samples: Upper bound on returned samples; values <= 0 yield ``[]``.

    Returns:
        Dicts with the keys ``question``, ``gold``, ``prediction``, ``source``
        (``source`` is the prediction file name without its suffix).
    """
    error_samples: list[dict] = []
    if max_samples <= 0:
        return error_samples

    # Find prediction files
    predictions_dir = results_dir / "predictions"
    if not predictions_dir.exists():
        # Try alternate location: first nested "predictions" entry, if any.
        predictions_dir = next(results_dir.rglob("predictions"), predictions_dir)

    if not predictions_dir.exists():
        return error_samples

    # Sorted so the selected samples do not depend on directory listing order.
    for pred_file in sorted(predictions_dir.rglob("*.json")):
        try:
            with open(pred_file, encoding="utf-8") as f:
                data = json.load(f)
        except (OSError, ValueError):
            # Unreadable file or invalid JSON / encoding: skip this file only.
            continue

        if isinstance(data, dict):
            # Mapping of id -> record; iterating the dict itself would yield
            # the string keys instead of the records.
            records = data.values()
        elif isinstance(data, list):
            records = data
        else:
            continue

        for item in records:
            if not isinstance(item, dict):
                continue

            gold = item.get("gold", item.get("answer", ""))
            pred = item.get("prediction", item.get("pred", ""))
            correct = item.get("correct")

            # An explicit ``correct`` flag wins; otherwise compare the text.
            is_error = (correct is False) or (
                correct is None and str(gold).strip() != str(pred).strip()
            )
            if not is_error:
                continue

            error_samples.append({
                "question": item.get("question", item.get("input", "")),
                "gold": gold,
                "prediction": pred,
                "source": pred_file.stem,
            })
            if len(error_samples) >= max_samples:
                # Cap reached: no need to open the remaining files.
                return error_samples

    return error_samples


class EvalAction(Action):
    """Action for OpenCompass evaluation.

    Carries everything OpenCompassExecutor needs for one evaluation request:
    which benchmarks to run, where to write results, and the model settings
    for either local vLLM inference or an external API.
    """

    # Common fields
    # Names must be keys of BENCHMARK_CONFIG; unknown names are skipped by the executor.
    benchmarks: list[str] = Field(
        default_factory=lambda: ["chemcotbench_mol_und"],
        description="List of benchmark names to evaluate",
    )
    # When empty/blank the executor falls back, in order, to its configured
    # default_data_range, the OPENCOMPASS_DATA_RANGE env var, then '[:]'.
    data_range: str = Field(
        default="",
        description="Data range expression for dataset slicing. "
        "Examples: '[:100]' (first 100), '[-100:]' (last 100), "
        "'[:min(100, len(index_list)//2)]' (dynamic range). "
        "If not set, will use OPENCOMPASS_DATA_RANGE environment variable as fallback.",
    )
    # Required (no default). Each benchmark gets its own sub-directory here.
    output_dir: str = Field(description="Directory to save evaluation results")
    # In vLLM mode the three values below always replace the corresponding
    # entries of the per-model configuration.
    max_out_len: int = Field(default=8192, description="Maximum output length")
    max_seq_len: int = Field(default=32768, description="Maximum sequence length")
    batch_size: int = Field(default=16, description="Batch size for evaluation")

    # Mode selection
    # The executor takes the vLLM path only for the exact value "vllm";
    # any other value is handled as API mode.
    mode: str = Field(
        default="vllm",
        description="Evaluation mode: 'vllm' for local inference, 'api' for API calls",
    )

    # vLLM mode fields
    # If this path is detected as a LoRA adapter, it is used as lora_path and
    # base_model is loaded as the model path instead.
    model_path: str = Field(
        default="",
        description="Path to model (for vLLM mode). Can be base model or LoRA adapter.",
    )
    # Also used for prefix matching against MODEL_CONFIGS.
    base_model: str = Field(
        default="Qwen/Qwen2.5-7B-Instruct",
        description="Base model name (for vLLM mode with LoRA)",
    )
    # Rounded down to a power of two to obtain tensor_parallel_size.
    gpu_count: int = Field(default=1, description="Number of GPUs for tensor parallelism")
    use_cot_postprocessor: bool = Field(
        default=True,
        description="Enable CoT postprocessor to extract answer from <think>...</think> format",
    )

    # API mode fields
    api_base: str = Field(default="", description="API base URL (for API mode)")
    # Rendered verbatim into the generated config.py.
    api_key: str = Field(default="sk-1234", description="API key (for API mode)")
    model_name: str = Field(default="gpt-4o-mini", description="Model name (for API mode)")
    query_per_second: int = Field(default=100, description="QPS limit (for API mode)")

    # Error sample extraction
    # Applied per benchmark during extraction and again to the combined list.
    max_error_samples: int = Field(
        default=10,
        description="Maximum number of error samples to extract for feedback",
    )

    # GPU configuration
    # Non-empty list is exported to the subprocess as CUDA_VISIBLE_DEVICES.
    gpu_ids: list[int] = Field(default_factory=list, description="GPU IDs to use (empty = auto-select)")

    # LLM Judge configuration
    # Empty values fall back to the OC_JUDGE_MODEL / OC_JUDGE_API_BASE /
    # OC_JUDGE_API_KEY environment variables; non-empty results are exported
    # to the subprocess under those same names.
    judge_model: str = Field(default="", description="LLM Judge model name")
    judge_api_base: str = Field(default="", description="LLM Judge API base URL")
    judge_api_key: str = Field(default="", description="LLM Judge API key")


class EvalObservation(Observation):
    """Observation from evaluation.

    Holds the merged benchmark scores, a sample of wrong predictions, and
    the success/failure status of one evaluation request.
    """

    scores: dict[str, float] = Field(
        default_factory=dict,
        description="Benchmark scores: {dataset_metric: value}",
    )
    metrics: dict[str, str] = Field(
        default_factory=dict,
        description="Metric types: {dataset_metric: metric_name}",
    )
    details_path: str = Field(default="", description="Path to detailed results")
    error_samples: list[dict] = Field(
        default_factory=list,
        description="Error samples for feedback analysis",
    )
    success: bool = Field(default=True, description="Whether evaluation succeeded")
    error_message: str = Field(default="", description="Error message if failed")

    @property
    def to_llm_content(self) -> Sequence[TextContent | ImageContent]:
        """Render the observation as a single text block for the LLM.

        On failure only the error message is reported. On success the text
        lists every score with four decimals, the details path, and at most
        the first three error samples (question truncated to 100 characters).
        """
        if not self.success:
            return [TextContent(text=f"Evaluation failed: {self.error_message}")]

        scores_str = "\n".join(f"  - {k}: {v:.4f}" for k, v in self.scores.items())

        error_parts: list[str] = []
        if self.error_samples:
            error_parts.append(f"\n\nError Samples ({len(self.error_samples)} samples):\n")
            for i, sample in enumerate(self.error_samples[:3], 1):
                # Samples come from parsed JSON, so the question is not
                # guaranteed to be a string; coerce before slicing.
                question = str(sample.get("question", ""))[:100]
                error_parts.append(f"  {i}. Q: {question}...\n")
                error_parts.append(f"     Gold: {sample.get('gold', '')}\n")
                error_parts.append(f"     Pred: {sample.get('prediction', '')}\n")
        error_str = "".join(error_parts)

        return [
            TextContent(
                text=(
                    f"Evaluation completed.\n"
                    f"Scores:\n{scores_str}\n"
                    f"Details: {self.details_path}"
                    f"{error_str}"
                )
            )
        ]


class OpenCompassExecutor(ToolExecutor[EvalAction, EvalObservation]):
    """Executor for OpenCompass evaluation."""

    def __init__(self, working_dir: str | None = None, default_data_range: str | None = None):
        self.working_dir = working_dir or "."
        self.default_data_range = default_data_range

    def _resolve_data_range(self, action: EvalAction) -> str:
        """Resolve data_range with fallback to config/env defaults.
        
        Priority:
        1. Explicit value from action.data_range (if non-empty)
        2. Constructor default_data_range (from config)
        3. Environment variable OPENCOMPASS_DATA_RANGE
        4. Final fallback: '[:]' (full dataset)
        """
        if action.data_range and action.data_range.strip():
            return action.data_range
        
        if self.default_data_range:
            print(f"[OpenCompass] Using configured default data_range: {self.default_data_range}")
            return self.default_data_range
        
        env_range = os.environ.get("OPENCOMPASS_DATA_RANGE", "")
        if env_range:
            print(f"[OpenCompass] Using env OPENCOMPASS_DATA_RANGE: {env_range}")
            return env_range
        
        print("[OpenCompass] WARNING: No data_range specified, using full dataset '[:]'")
        return "[:]"

    def __call__(
        self, action: EvalAction, conversation: "LocalConversation | None" = None
    ) -> EvalObservation:
        """Run each requested benchmark through the ``opencompass`` CLI.

        For every name in ``action.benchmarks`` a config file is rendered to
        ``<output_dir>/<benchmark>/config.py``, ``opencompass`` is run on it
        in ``self.working_dir``, and the summary CSV and prediction files are
        parsed. A benchmark that is unknown, exits non-zero, or times out is
        skipped so that results from the other benchmarks are still reported.
        A combined summary is written to ``<output_dir>/results.json``.

        Args:
            action: Evaluation request.
            conversation: Accepted for the executor call signature; not used.

        Returns:
            An observation with ``success=True`` and the merged scores when
            at least one benchmark produced results; otherwise an observation
            with ``success=False`` and an ``error_message``. Exceptions are
            converted into a failed observation instead of propagating.
        """
        timeout_seconds = 7200  # 2 hours timeout per benchmark

        try:
            # Resolve data_range with fallback
            resolved_data_range = self._resolve_data_range(action)

            # The subprocess runs with cwd=self.working_dir, so every path
            # handed to it must be absolute; a relative output_dir would be
            # written here but looked up relative to a different directory.
            output_dir = Path(action.output_dir).resolve()
            output_dir.mkdir(parents=True, exist_ok=True)

            # The environment does not depend on the benchmark: build it once.
            env = os.environ.copy()

            # GPU configuration
            if action.gpu_ids:
                env["CUDA_VISIBLE_DEVICES"] = ",".join(str(g) for g in action.gpu_ids)

            # LLM Judge configuration (use action params or fallback to env vars)
            judge_settings = {
                "OC_JUDGE_MODEL": action.judge_model,
                "OC_JUDGE_API_BASE": action.judge_api_base,
                "OC_JUDGE_API_KEY": action.judge_api_key,
            }
            for env_name, explicit in judge_settings.items():
                value = explicit or os.environ.get(env_name, "")
                if value:
                    env[env_name] = value

            all_scores = []  # List of {dataset, metric, value}
            all_error_samples = []
            failures = []  # Human-readable reason per skipped benchmark

            for benchmark in action.benchmarks:
                if benchmark not in BENCHMARK_CONFIG:
                    print(f"Unknown benchmark: {benchmark}, skipping...")
                    failures.append(f"{benchmark}: unknown benchmark")
                    continue

                benchmark_dir = output_dir / benchmark
                benchmark_dir.mkdir(parents=True, exist_ok=True)

                # Generate config based on mode
                work_dir = str(benchmark_dir / "results")
                if action.mode == "vllm":
                    config_content = self._generate_vllm_config(action, benchmark, work_dir, resolved_data_range)
                else:
                    config_content = self._generate_api_config(action, benchmark, work_dir, resolved_data_range)

                config_path = benchmark_dir / "config.py"
                with open(config_path, "w", encoding="utf-8") as f:
                    f.write(config_content)

                # Run OpenCompass
                cmd = ["opencompass", str(config_path), "--work-dir", work_dir]

                print(f"Running benchmark: {benchmark}")
                try:
                    result = subprocess.run(
                        cmd,
                        cwd=self.working_dir,
                        capture_output=True,
                        text=True,
                        timeout=timeout_seconds,
                        env=env,
                    )
                except subprocess.TimeoutExpired:
                    # Treat like any other failed benchmark so that scores
                    # already collected from earlier benchmarks are kept.
                    print(f"Benchmark {benchmark} timed out after {timeout_seconds}s")
                    failures.append(f"{benchmark}: timed out after {timeout_seconds}s")
                    continue

                if result.returncode != 0:
                    print(f"Benchmark {benchmark} failed: {result.stderr[-1000:]}")
                    failures.append(f"{benchmark}: exit code {result.returncode}")
                    continue

                # Parse results
                results_root = Path(work_dir)
                scores = self._parse_results(results_root)
                if not scores:
                    failures.append(f"{benchmark}: no scores parsed")
                all_scores.extend(scores)

                # Extract error samples from the most recent run directory
                timestamped_dirs = sorted(results_root.glob("202*_*"), reverse=True)
                if timestamped_dirs:
                    errors = extract_error_samples(
                        timestamped_dirs[0],
                        max_samples=action.max_error_samples,
                    )
                    all_error_samples.extend(errors)

            if not all_scores:
                error_message = "No benchmark results found"
                if failures:
                    error_message += f" ({'; '.join(failures)})"
                return EvalObservation(
                    scores={},
                    metrics={},
                    details_path=str(output_dir),
                    error_samples=[],
                    success=False,
                    error_message=error_message,
                )

            # Generate dict format: {dataset}_{metric} as key to preserve all metrics
            scores_dict = {}
            metrics_dict = {}
            for s in all_scores:
                key = f"{s['dataset']}_{s['metric']}"
                scores_dict[key] = s["value"]
                metrics_dict[key] = s["metric"]

            reported_errors = all_error_samples[:action.max_error_samples]

            # Save summary. ensure_ascii=False emits raw non-ASCII text, so
            # the file encoding must be fixed rather than locale-dependent.
            summary_path = output_dir / "results.json"
            with open(summary_path, "w", encoding="utf-8") as f:
                json.dump(
                    {
                        "scores": scores_dict,  # {dataset_metric: value}
                        "metrics": metrics_dict,  # {dataset_metric: metric_name}
                        "error_samples": reported_errors,
                    },
                    f,
                    indent=2,
                    ensure_ascii=False,
                )

            return EvalObservation(
                scores=scores_dict,
                metrics=metrics_dict,
                details_path=str(summary_path),
                error_samples=reported_errors,
                success=True,
            )

        except Exception as e:
            # Tool boundary: report the failure to the caller as an
            # observation instead of raising.
            return EvalObservation(
                scores={},
                metrics={},
                details_path=str(action.output_dir),
                error_samples=[],
                success=False,
                error_message=str(e),
            )

    def _generate_vllm_config(self, action: EvalAction, benchmark: str, work_dir: str, data_range: str) -> str:
        """Render the OpenCompass config source for local vLLM inference.

        When ``action.model_path`` is detected as a LoRA adapter, the base
        model is loaded from ``action.base_model`` and the adapter path is
        passed as ``lora_path``; otherwise ``action.model_path`` is the model.

        Args:
            action: Evaluation request supplying model and length settings.
            benchmark: Key of ``BENCHMARK_CONFIG`` to evaluate.
            work_dir: Value written to the config's ``work_dir``.
            data_range: Already-resolved dataset range expression.

        Returns:
            The rendered config file content.
        """
        adapter_detected = detect_lora_model(action.model_path)

        settings = get_model_config(action.base_model, action.gpu_count)
        # Action values take precedence over the per-model configuration.
        settings.update(
            use_cot_postprocessor=action.use_cot_postprocessor,
            max_out_len=action.max_out_len,
            max_seq_len=action.max_seq_len,
            batch_size=action.batch_size,
        )

        if adapter_detected:
            weights_path, adapter_path = action.base_model, action.model_path
        else:
            weights_path, adapter_path = action.model_path, ""

        return Template(VLLM_CONFIG_TEMPLATE).render(
            dataset_imports=[BENCHMARK_CONFIG[benchmark]],
            test_range=data_range,
            num_runs=1,
            model_abbr=f"vllm-{benchmark}",
            model_path=weights_path,
            is_lora=adapter_detected,
            lora_path=adapter_path,
            work_dir=work_dir,
            **settings,
        )

    def _generate_api_config(self, action: EvalAction, benchmark: str, work_dir: str, data_range: str) -> str:
        """Render the OpenCompass config source for evaluation via an API.

        Args:
            action: Evaluation request supplying the API endpoint, key,
                model name, rate limit and length settings.
            benchmark: Key of ``BENCHMARK_CONFIG`` to evaluate.
            work_dir: Value written to the config's ``work_dir``.
            data_range: Already-resolved dataset range expression.

        Returns:
            The rendered config file content.
        """
        context = {
            "dataset_imports": [BENCHMARK_CONFIG[benchmark]],
            "test_range": data_range,
            "model_abbr": f"api-{benchmark}",
            "model_name": action.model_name,
            "api_key": action.api_key,
            "api_base": action.api_base,
            "max_out_len": action.max_out_len,
            "max_seq_len": action.max_seq_len,
            "batch_size": action.batch_size,
            "query_per_second": action.query_per_second,
            "max_num_workers": 100,
            "work_dir": work_dir,
        }
        return Template(API_CONFIG_TEMPLATE).render(**context)

    def _parse_results(self, results_dir: Path) -> list[dict]:
        """Parse evaluation results from CSV files.

        Converts CSV to a list of dicts, each containing dataset, metric, and value.
        
        Returns:
            List of dicts: [{"dataset": ..., "metric": ..., "value": ...}, ...]
        """
        scores = []

        # Find latest timestamped directory
        timestamped_dirs = sorted(results_dir.glob("202*_*"), reverse=True)
        if not timestamped_dirs:
            return scores

        result_dir = timestamped_dirs[0]

        # Find CSV files
        csv_files = sorted(result_dir.rglob("summary/*.csv"), reverse=True)
        if not csv_files:
            return scores

        try:
            df = pd.read_csv(csv_files[0])
            score_col = [
                c for c in df.columns
                if c not in ["dataset", "version", "metric", "mode"]
            ][0]

            # Convert each row to a dict
            for _, row in df.iterrows():
                dataset = row.get("dataset", "unknown")
                metric = row.get("metric", "unknown")
                score = row.get(score_col, 0)
                if pd.notna(score):
                    scores.append({
                        "dataset": dataset,
                        "metric": metric,
                        "value": float(score)
                    })

        except Exception as e:
            print(f"Failed to parse CSV: {e}")

        return scores


# Tool description passed as ``description=`` when OpenCompassTool.create()
# builds the tool instance.
_OPENCOMPASS_DESCRIPTION = """OpenCompass evaluation tool for LLM benchmarking.

Supports two modes:
1. vLLM mode: Local inference using vLLM (supports LoRA adapters)
2. API mode: Evaluation via external API

Features:
* Automatic LoRA adapter detection
* CoT postprocessor for <think>...</think> format
* Error sample extraction for feedback
* Multiple benchmarks: Math, Code, Chemistry, Table QA, etc.
"""


class OpenCompassTool(ToolDefinition[EvalAction, EvalObservation]):
    """Tool definition exposing OpenCompass evaluation."""

    name = "opencompass"

    @classmethod
    def create(
        cls,
        conv_state=None,
        working_dir: str | None = None,
        default_data_range: str | None = None,
        **kwargs
    ) -> Sequence[Self]:
        """Build the single OpenCompassTool instance.

        Args:
            conv_state: Conversation state (optional). When it is truthy and
                has a ``workspace`` attribute, the workspace's working
                directory takes precedence over *working_dir*.
            working_dir: Working directory for evaluation.
            default_data_range: Default data range from config (e.g.,
                validation_range or test_range). Used as fallback when the
                Agent doesn't specify data_range.
            **kwargs: Accepted and ignored.

        Returns:
            A one-element list holding the configured tool.
        """
        use_workspace = bool(conv_state) and hasattr(conv_state, "workspace")
        resolved_dir = conv_state.workspace.working_dir if use_workspace else working_dir

        executor = OpenCompassExecutor(
            working_dir=resolved_dir,
            default_data_range=default_data_range,
        )
        tool = cls(
            name="opencompass",
            description=_OPENCOMPASS_DESCRIPTION,
            action_type=EvalAction,
            observation_type=EvalObservation,
            executor=executor,
        )
        return [tool]
