# Three-stage reasoning evaluation configuration with math-specific prompts
# Captioner: OpenGVLab/InternVL3-2B
# Reasoner: JamAndTeaStudios/DeepSeek-R1-Distill-Qwen-32B-FP8-Dynamic
# Template: three_stage_math_v1 (unified adaptive decision format)

evaluation:
  # Reasoning scaffold selected for this run.
  reasoning_approach: 'three_stage'

  # Captioner (VLM) endpoint and model: InternVL3-2B.
  # NOTE(review): framework.vlm_overrides sets a different api_base and a
  # model name ending in "-hf" -- confirm which setting takes effect.
  vlm_api_base: 'http://localhost:8000/v1'
  vlm_api_key: 'EMPTY'
  vlm_model_name: 'OpenGVLab/InternVL3-2B'
  vlm_timeout: 900.0  # presumably seconds -- TODO confirm

  # Reasoner endpoint and model: DeepSeek-R1-Distill.
  # NOTE(review): framework.reasoner_overrides sets a different api_base --
  # confirm which setting takes effect.
  reasoner_type: 'openai'
  reasoner_api_base: 'http://localhost:8001/v1'
  reasoner_api_key: 'EMPTY'
  reasoner_model_name: 'JamAndTeaStudios/DeepSeek-R1-Distill-Qwen-32B-FP8-Dynamic'
  reasoner_timeout: 900.0  # presumably seconds -- TODO confirm

  # Sampling / generation settings.
  # NOTE(review): framework.reasoner_overrides sets max_tokens: 100000 while
  # this section says 50000 -- confirm which limit is applied.
  max_tokens: 50000
  temperature: 0.6
  top_p: 0.95
  top_k: -1  # presumably "top-k disabled" -- verify against the backend

  # Benchmarks to run, the judge model, and run-level settings.
  datasets:
    - 'MathVista_MINI'
    - 'MathVerse_MINI'
  judge: 'gpt-4.1-mini-2025-04-14'
  nproc: 32  # presumably the number of parallel workers -- TODO confirm
  work_dir: 'outputs/'

  # Sample selection.
  dry_run: false
  limit: 256  # presumably a cap on evaluated samples -- TODO confirm scope

  # Where debug data and HTML sample reports are written.
  debug_data_dir: '/scratch/<ANONYMIZED>/debug_data/'
  html_report_dir: '/scratch/<ANONYMIZED>/html_sample_reports/'
  enable_html_reports: true

  # Reproducibility knobs.
  # NOTE(review): a seed is fixed but deterministic is false; confirm that
  # this combination is intended.
  seed: 42
  deterministic: false

# Settings consumed by the reasoning framework
framework:
  # Route model calls through the framework's own model wrappers.
  use_framework_models: true

  # References to named model configurations.
  vlm_config_ref: 'models.vlm.internvl3_2b'
  reasoner_config_ref: 'models.reasoner.custom_api'

  # Name of the prompt template to use.
  prompt_template_ref: 'three_stage_math_v1'

  # Per-run overrides for the InternVL3-2B captioner.
  # NOTE(review): api_base and model_name differ from evaluation.vlm_api_base
  # (localhost) and evaluation.vlm_model_name (no "-hf" suffix) -- confirm
  # which values are actually used and that they match the served model.
  # NOTE(review): top_k: 1 together with top_p: 0.001 presumably makes
  # decoding effectively greedy even at temperature 1.0 -- confirm intent.
  vlm_overrides:
    api_base: 'http://10.0.1.1:8000/v1'
    model_name: 'OpenGVLab/InternVL3-2B-hf'
    temperature: 1.0
    top_p: 0.001
    top_k: 1

  # Per-run overrides for the reasoner.
  # NOTE(review): api_base differs from evaluation.reasoner_api_base
  # (localhost) and max_tokens differs from evaluation.max_tokens (50000) --
  # confirm which values are actually used.
  reasoner_overrides:
    model_type: 'openai'
    api_base: 'http://10.0.1.2:8001/v1'
    model_name: 'JamAndTeaStudios/DeepSeek-R1-Distill-Qwen-32B-FP8-Dynamic'
    api_key: 'EMPTY'
    max_tokens: 100000
    temperature: 0.6
    top_p: 0.95
    top_k: -1  # presumably "top-k disabled" -- verify against the backend

  # Scaffold selection and its options (confidence features switched off).
  scaffold_type: 'three_stage'
  scaffold_config:
    enable_vlm_confidence: false
    use_confidence_in_reasoner: false

# Three-stage specific configuration (no max_iterations - fixed at one decision)
three_stage:
  # The three-stage scaffold allows exactly ONE clarifying question when
  # needed. Because that behavior is fixed, no iterations parameter is
  # exposed here.

  # Confidence estimation experiment flags (disabled by default).
  # NOTE(review): the same two flags also appear under
  # framework.scaffold_config -- confirm which copy the code reads and keep
  # the two in sync.
  enable_vlm_confidence: false
  use_confidence_in_reasoner: false

# Logging setup
logging:
  level: 'INFO'
  # Record layout: timestamp, logger name, level, message
  # (%-style placeholders; presumably passed to Python's logging formatter).
  format: '%(asctime)s - %(name)s - %(levelname)s - %(message)s'

  # Optional MLflow experiment tracking; switched off in this config.
  mlflow:
    enabled: false
    tracking_uri: 'http://localhost:5000'
    experiment_name: 'ThreeStage-InternVL-Framework-Eval'

# Data processing: input preprocessing and sample filtering options
data:
  # Preprocessing settings
  preprocessing:
    resize_images: true
    # Upper bound used when resizing; presumably pixels, dimension order
    # (width/height) not shown here -- TODO confirm.
    max_image_size: [1024, 1024]
    normalize_text: true

  # Quality control
  quality_control:
    # Question length bounds; unit (characters vs. tokens) is not shown
    # here -- TODO confirm against the consumer.
    min_question_length: 10
    max_question_length: 1000
    filter_invalid_images: true