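# Two-stage reasoning evaluation configuration with math-specific prompts
# Captioner: OpenGVLab/InternVL3-2B
# Reasoner: JamAndTeaStudios/DeepSeek-R1-Distill-Qwen-32B-FP8-Dynamic
# Template: two_stage_math_v1 (optimized for mathematical datasets with \boxed{} formatting)
#
# NOTE(review): the settings below do not fully match this header. The VLM is
# wired to vlm_config_ref "models.vlm.qwen2_5_vl" with model_name
# "Qwen7BACRL_step800", and the reasoner is requested under the name
# "deepseek-ai/DeepSeek-R1-Distill-Qwen-32B". Confirm which models are actually
# being served before citing results produced with this config.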

evaluation:
  # Which reasoning pipeline to run.
  reasoning_approach: 'two_stage'

  # Stage 1: VLM endpoint (the file header names InternVL3-2B as captioner).
  # Timeout unit is presumably seconds; confirm against the client code.
  vlm_api_base: 'http://localhost:8001/v1'
  vlm_timeout: 900.0

  # Stage 2: reasoner endpoint (DeepSeek-R1-Distill).
  # The same api_base / model / api_key values are repeated under
  # framework.reasoner_overrides; keep the two places in sync.
  # 'EMPTY' is presumably a placeholder key for an unauthenticated server.
  reasoner_type: 'openai'
  reasoner_api_base: 'http://10.0.1.14:8001/v1'
  reasoner_api_key: 'EMPTY'
  reasoner_model: 'deepseek-ai/DeepSeek-R1-Distill-Qwen-32B'
  reasoner_timeout: 900.0

  # Sampling / generation parameters.
  # top_k: -1 presumably means "top-k disabled"; verify with the serving
  # backend in use.
  max_tokens: 100000
  temperature: 0.6
  top_p: 0.95
  top_k: -1

  # Benchmarks to evaluate, in the order listed.
  datasets:
    - 'MathVista_MINI'              # MathVista, testmini split
    - 'MathVision'                  # MathVision, test split
    - 'MathVision_MINI'             # MathVision, test mini split
    - 'MathVerse_MINI'
    - 'MathVerse_MINI_Vision_Only'  # MathVerse, vision-only variant
    - 'MMMU_DEV_VAL'                # MMMU, val
    - 'WeMath'                      # We-Math; the Strict score is reported
    - 'LogicVista'                  # LogicVista
    - 'DynaMath'                    # DynaMath; worst-case is reported. Very large, run only if time allows

  # Judge model, nproc setting and working directory.
  # nproc is presumably the degree of parallelism; confirm against the runner.
  judge: 'gpt-4.1-mini-2025-04-14'
  nproc: 128
  work_dir: 'outputs/'

  # Sample selection: dry-run switch and per-run sample limit.
  dry_run: false
  limit: 10000000

  # Where debug data and HTML sample reports are written.
  debug_data_dir: '/scratch/<ANONYMIZED>/debug_data/'
  html_report_dir: '/scratch/<ANONYMIZED>/html_sample_reports/'
  enable_html_reports: true

  # Reproducibility knobs.
  seed: 42
  deterministic: false

# Framework-specific settings
framework:
  # Use the model implementations provided by the reasoning framework.
  use_framework_models: true

  # Named base configurations for each stage.
  vlm_config_ref: 'models.vlm.qwen2_5_vl'
  reasoner_config_ref: 'models.reasoner.custom_api'

  # Prompt template to apply.
  prompt_template_ref: 'two_stage_math_v1'

  # Overrides applied to the VLM configuration.
  # NOTE(review): the file header names InternVL3-2B as the captioner, but
  # vlm_config_ref above is qwen2_5_vl and model_name here is a Qwen-style
  # name. Confirm which captioner this config is meant to run.
  # top_k: 1 together with top_p: 0.001 presumably makes decoding effectively
  # greedy even though temperature is 1.0; confirm with the backend.
  vlm_overrides:
    model_name: 'Qwen7BACRL_step800'
    temperature: 1.0
    top_p: 0.001
    top_k: 1

  # Overrides applied to the reasoner configuration.
  # These values mirror the reasoner_* keys and generation parameters in the
  # evaluation section; change both places together.
  reasoner_overrides:
    model_type: 'openai'
    api_base: 'http://10.0.1.14:8001/v1'
    model_name: 'deepseek-ai/DeepSeek-R1-Distill-Qwen-32B'
    api_key: 'EMPTY'
    max_tokens: 100000
    temperature: 0.6
    top_p: 0.95
    top_k: -1

  # Scaffold selection and its options.
  scaffold_type: 'two_stage'
  scaffold_config:
    enable_description_verification: false
    use_enhanced_prompts: true
    use_enhanced_prompts: true

# Two-stage specific configuration
two_stage:
  # Switches specific to the two-stage scaffold.
  enable_verification: false

  # Flags for the confidence-estimation experiment; both are off by default.
  enable_vlm_confidence: false
  use_confidence_in_reasoner: false

# Logging configuration
logging:
  # Log level and record layout (printf-style %(name)s placeholders).
  level: 'INFO'
  format: '%(asctime)s - %(name)s - %(levelname)s - %(message)s'

  # Optional MLflow tracking; disabled in this config.
  # NOTE(review): experiment_name mentions InternVL while the framework
  # section references a Qwen VLM; confirm the name is still appropriate.
  mlflow:
    enabled: false
    tracking_uri: 'http://localhost:5000'
    experiment_name: 'TwoStage-InternVL-Framework-Eval'

# Data processing
data:
  # Input preprocessing options.
  # max_image_size is a two-element list, presumably [width, height] in
  # pixels; confirm the order with the consumer.
  preprocessing:
    resize_images: true
    max_image_size: [1024, 1024]
    normalize_text: true

  # Sample filtering options.
  # Question-length bounds are presumably counted in characters; confirm.
  quality_control:
    min_question_length: 10
    max_question_length: 1000
    filter_invalid_images: true