# Auto-generated OpenCompass Config for FT-Agent Benchmark
# DO NOT EDIT MANUALLY - Generated by benchmark.py

# `template` holds the text of a Python config module with Jinja-style
# placeholders ({% ... %} directives and {{ ... }} substitutions). This first
# part emits the imports and builds the combined `datasets` list.
template: |-
    from mmengine.config import read_base
    from opencompass.models import VLLMwithChatTemplate

    # ==================== Dataset Import ====================
    # One star-import is emitted for each entry of `dataset_imports`.
    # NOTE(review): an empty `dataset_imports` renders a `with` statement that
    # has no body, which is not valid Python - confirm the caller always
    # supplies at least one module.
    with read_base():
    {% for dataset_module in dataset_imports %}
        from {{ dataset_module }} import *
    {% endfor %}

    # Aggregate all dataset variables
    # Every list bound to the name `datasets`, or to a name ending in
    # `_datasets`, is concatenated into one flat list.
    # NOTE(review): a module that exports both `datasets` and the `*_datasets`
    # lists it was built from would contribute its entries twice - confirm
    # none of the imported modules do this.
    datasets = sum([v for k, v in locals().items() if (k == 'datasets' or k.endswith('_datasets')) and isinstance(v, list)], [])

    # Apply dataset modifications
    # Each section of the loop body is optional and is emitted only when its
    # template variable is set (`test_range`, `num_runs` > 1, `pass_k`).
    for ds in datasets:
    {% if test_range %}
        # Apply dataset range (e.g., "[:100]" for validation, "[-100:]" for test)
        # `reader_cfg` is created on demand so the assignment below cannot
        # fail on a dataset that lacks it.
        if 'reader_cfg' not in ds:
            ds['reader_cfg'] = {}
        ds['reader_cfg']['test_range'] = '{{ test_range }}'

        # Sync to evaluator's dataset_cfg
        # Applies only when the evaluator is a dict that carries its own
        # `dataset_cfg`; that nested config then receives the same
        # `test_range` as the dataset's own reader_cfg.
        if 'eval_cfg' in ds and 'evaluator' in ds['eval_cfg']:
            evaluator = ds['eval_cfg']['evaluator']
            if isinstance(evaluator, dict) and 'dataset_cfg' in evaluator:
                if 'reader_cfg' not in evaluator['dataset_cfg']:
                    evaluator['dataset_cfg']['reader_cfg'] = {}
                evaluator['dataset_cfg']['reader_cfg']['test_range'] = '{{ test_range }}'
    {% endif %}
    {% if num_runs and num_runs > 1 %}
        # Multiple runs (repeat each sample n times for averaging or pass@k)
        ds['n'] = {{ num_runs }}
    {% endif %}
    {% if pass_k %}
        # Pass@k evaluation
        ds['k'] = {{ pass_k }}
    {% endif %}
        # Unconditional no-op: keeps the loop body non-empty (and the rendered
        # file valid) when none of the optional sections above is emitted.
        pass

    # ==================== Model Configuration ====================
    # A single VLLMwithChatTemplate entry. The optional settings (LoRA,
    # repetition penalty, thinking mode, reasoning post-processor) are emitted
    # only when the corresponding template variable enables them.
    models = [
        dict(
            type=VLLMwithChatTemplate,
            abbr='{{ model_abbr }}',
            path='{{ model_path }}',
            model_kwargs=dict(
                tensor_parallel_size={{ tensor_parallel_size }},
                gpu_memory_utilization={{ gpu_memory_utilization }},
                trust_remote_code=True,
                dtype='{{ dtype }}',
                # The engine context length follows the same `max_seq_len`
                # value as the wrapper-level setting further down.
                max_model_len={{ max_seq_len }},
    {% if is_lora %}
                enable_lora=True,
                # Optional `max_lora_rank` template variable; falls back to 64
                # when it is missing or empty so existing callers are unaffected.
                max_lora_rank={{ max_lora_rank | default(64, true) }},
                max_cpu_loras=1,
    {% endif %}
            ),
    {% if is_lora %}
            lora_path='{{ lora_path }}',
    {% endif %}
            max_seq_len={{ max_seq_len }},
            max_out_len={{ max_out_len }},
            batch_size={{ batch_size }},
            # `repetition_penalty` is emitted only when it is set and differs
            # from 1.0; an unset or null value is skipped instead of rendering
            # an invalid or `None` keyword argument.
            generation_kwargs=dict(
                temperature={{ temperature }},
                top_p={{ top_p }},
                top_k={{ top_k }},
    {% if repetition_penalty is defined and repetition_penalty is not none and repetition_penalty != 1.0 %}
                repetition_penalty={{ repetition_penalty }},
    {% endif %}
            ),
    {% if enable_thinking %}
            chat_template_kwargs=dict(enable_thinking=True),
    {% endif %}
    {% if enable_thinking or use_cot_postprocessor %}
            pred_postprocessor=dict(type='extract-non-reasoning-content'),
    {% endif %}
            # One process that is given as many GPUs as the tensor-parallel size.
            run_cfg=dict(
                num_gpus={{ tensor_parallel_size }},
                num_procs=1,
            ),
        ),
    ]

    # ==================== Inference Configuration ====================
    # Inference stage: naive partitioning, executed by a local runner with up
    # to 16 workers running OpenICLInferTask.
    infer = dict(
        partitioner=dict(type='NaivePartitioner'),
        runner=dict(
            type='LocalRunner',
            max_num_workers=16,
            task=dict(type='OpenICLInferTask'),
        ),
    )

    # ==================== Evaluation Configuration ====================
    # Evaluation stage: naive partitioning, executed by a local runner with up
    # to 16 workers running OpenICLEvalTask with `dump_details` switched on.
    eval = dict(
        partitioner=dict(type='NaivePartitioner'),
        runner=dict(
            type='LocalRunner',
            max_num_workers=16,
            task=dict(type='OpenICLEvalTask', dump_details=True),
        ),
    )

    # ==================== Work Directory ====================
    # The `work_dir` template variable is substituted verbatim between single
    # quotes.
    # NOTE(review): a path containing a single quote or a backslash sequence
    # (e.g. a Windows path) would change the generated string literal -
    # confirm callers only pass plain POSIX-style paths.
    work_dir = '{{ work_dir }}'
