# Quick Test Configuration V2
# Minimal configuration to test the new structure

# Run identification and output location.
# NOTE(review): batch_run_name presumably labels the results written under
# base_output_dir, and the relative path is presumably resolved against the
# process working directory - confirm against the runner.
batch_run_name: "final_zeroshot"
base_output_dir: "./gsm-agent/eval/results"

# Shared parameters - ALL parameters can be shared and overridden.
# These are the defaults; entries under evaluation_settings may restate any key.
# The one active entry in this file changes only random_seed (456 instead of 42)
# and repeats dataset_type, model_name, model_provider and agent_type unchanged.
shared_parameters:
  # Default model for all test settings
  model_name: "gpt-4o"
  model_provider: "openai"
  agent_type: "react"
  temperature: 0.4
  max_tokens: 4096
  # Stop condition for the agent. The meaning of the kwargs is defined by the
  # agent code, not here - presumably interaction_rounds caps the number of
  # agent turns and answer_judge selects how answers are judged (TODO confirm).
  agent_stop_type: "default"
  agent_stop_kwargs:
    interaction_rounds: 50
    answer_judge: "llm_only"
  # Default dataset settings
  # NOTE(review): dataset_dir is a relative path; presumably resolved against
  # the process working directory - confirm.
  dataset_type: "small"  # Use small dataset for testing
  dataset_dir: "./data"
  num_samples: 2
  
  # Search settings
  search_engine_type: "chromadb"
  chromadb_base_path: "./databases/chroma_db"
  collection_name: "default"
  embedding_model: "default"
  # NOTE(review): 5 matches the "up to 5" / "next 5" wording in
  # prompts.system_prompt - presumably the two must be kept in sync.
  results_per_page: 5
  max_documents: 2
  
  # Test-optimized evaluation settings (faster for testing)
  retry_attempts: 2
  # Unit is not stated in this file (presumably seconds) - confirm.
  retry_delay: 5.0
  chunk_size: 10
  enable_continual_evaluation: true
  
  # Random seed for reproducibility (default; individual settings may override it)
  random_seed: 42

# Top-level key (not part of shared_parameters).
# NOTE(review): presumably the number of parallel workers used for the batch -
# confirm against the runner.
num_workers: 10


# Test-mode controls. test_mode is currently disabled (false), so the
# 3-sample value in test_samples presumably only takes effect once test_mode
# is switched to true - confirm against the runner.
evaluation_control:
  test_mode: false
  test_samples: 3

# Prompt text for the run (presumably sent to the model as its system message -
# confirm against the agent code).
# system_prompt is a literal block scalar (|): line breaks and the extra
# indentation of the example lines are kept verbatim, and lines beginning with
# "####" inside it are prompt text, not YAML comments.
# NOTE(review): the "up to 5" / "next 5" wording mirrors
# shared_parameters.results_per_page (5) - presumably keep the two in sync.
prompts:
  system_prompt: |
    You solve math problems by finding missing facts (premises) with search tools.

    ANSWER FORMAT
    - Provisional (temporary) line starts with ****
      e.g., **** 42
    - Final line starts with #### and contains only the number (or “I don't know”)
      e.g., #### 42
            #### I don't know

    TOOLS
    - search_information: input the query to search for up to 5 relevant premises.
    - next_page: next 5 for the last search.


# List of evaluation settings. Each entry carries a setting_id plus the keys it
# sets for that run (see the "shared and overridden" note on shared_parameters).
# Only one entry is active in this file; every other entry is commented out.
evaluation_settings:
  
  # OpenAI Models - Multiple random seeds
  # GPT-4o: the seed-42 and seed-123 entries (full dataset) are disabled;
  # only the seed-456 entry (small dataset) below is active.
  # - setting_id: "gpt-4o-zeroshot-seed-42"
  #   dataset_type: "full"
  #   model_name: "gpt-4o"
  #   model_provider: "openai"
  #   agent_type: "react"
  #   random_seed: 42
  #   metadata:
  #     purpose: "Test gpt-4o model on full dataset with zeroshot"
  
  # - setting_id: "gpt-4o-zeroshot-seed-123"
  #   dataset_type: "full"
  #   model_name: "gpt-4o"
  #   model_provider: "openai"
  #   agent_type: "react"
  #   random_seed: 123
  #   metadata:
  #     purpose: "Test gpt-4o model on full dataset with zeroshot"
  
  # Active entry. random_seed (456) differs from the shared default (42); the
  # other four keys repeat the values already given in shared_parameters.
  - setting_id: "gpt-4o-zeroshot-seed-456"
    dataset_type: "small"
    model_name: "gpt-4o"
    model_provider: "openai"
    agent_type: "react"
    random_seed: 456
    metadata:
      purpose: "Test gpt-4o model on small dataset with zeroshot"
  
  # GPT-5 with 3 random seeds
  # - setting_id: "gpt-5-zeroshot-seed-42"
  #   dataset_type: "full"
  #   model_name: "gpt-5"
  #   model_provider: "openai"
  #   agent_type: "react"
  #   random_seed: 42
  #   metadata:
  #     purpose: "Test gpt-5 model on full dataset with zeroshot"
  
  # - setting_id: "gpt-5-zeroshot-seed-123"
  #   dataset_type: "full"
  #   model_name: "gpt-5"
  #   model_provider: "openai"
  #   agent_type: "react"
  #   random_seed: 123
  #   metadata:
  #     purpose: "Test gpt-5 model on full dataset with zeroshot"
  
  # - setting_id: "gpt-5-zeroshot-seed-456"
  #   dataset_type: "full"
  #   model_name: "gpt-5"
  #   model_provider: "openai"
  #   agent_type: "react"
  #   random_seed: 456
  #   metadata:
  #     purpose: "Test gpt-5 model on full dataset with zeroshot"
  
  # O3 with 3 random seeds
  # - setting_id: "o3-zeroshot-seed-42"
  #   dataset_type: "full"
  #   temperature: 1.0
  #   max_tokens: 2048
  #   model_name: "o3"
  #   model_provider: "openai"
  #   agent_type: "react"
  #   random_seed: 42
  #   metadata:
  #     purpose: "Test o3 model on full dataset with zeroshot"
  
  # - setting_id: "o3-zeroshot-seed-123"
  #   dataset_type: "full"
  #   temperature: 1.0
  #   max_tokens: 2048
  #   model_name: "o3"
  #   model_provider: "openai"
  #   agent_type: "react"
  #   random_seed: 123
  #   metadata:
  #     purpose: "Test o3 model on full dataset with zeroshot"
  
  # - setting_id: "o3-zeroshot-seed-456"
  #   dataset_type: "full"
  #   temperature: 1.0
  #   max_tokens: 2048
  #   model_name: "o3"
  #   model_provider: "openai"
  #   agent_type: "react"
  #   random_seed: 456
  #   metadata:
  #     purpose: "Test o3 model on full dataset with zeroshot"
  
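  # Anthropic Models - Multiple random seeds
  # Claude-4-Sonnet with 3 random seeds
  # NOTE(review): Anthropic publishes this model as "claude-sonnet-4-20250514";
  # confirm that "claude-4-sonnet-20250514" resolves before re-enabling these entries.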
  # - setting_id: "claude-4-sonnet-zeroshot-seed-42"
  #   dataset_type: "full"
  #   model_name: "claude-4-sonnet-20250514"
  #   model_provider: "anthropic"
  #   agent_type: "react"
  #   random_seed: 42
  #   metadata:
  #     purpose: "Test claude-4-sonnet-20250514 model on full dataset with zeroshot"
  
  # - setting_id: "claude-4-sonnet-zeroshot-seed-123"
  #   dataset_type: "full"
  #   model_name: "claude-4-sonnet-20250514"
  #   model_provider: "anthropic"
  #   agent_type: "react"
  #   random_seed: 123
  #   metadata:
  #     purpose: "Test claude-4-sonnet-20250514 model on full dataset with zeroshot"
  
  # - setting_id: "claude-4-sonnet-zeroshot-seed-456"
  #   dataset_type: "full"
  #   model_name: "claude-4-sonnet-20250514"
  #   model_provider: "anthropic"
  #   agent_type: "react"
  #   random_seed: 456
  #   metadata:
  #     purpose: "Test claude-4-sonnet-20250514 model on full dataset with zeroshot"
  
  # Claude-Opus with 3 random seeds
  # - setting_id: "claude-opus-zeroshot-seed-42"
  #   dataset_type: "full"
  #   model_name: "claude-3-opus-20240229"
  #   model_provider: "anthropic"
  #   agent_type: "react"
  #   random_seed: 42
  #   metadata:
  #     purpose: "Test claude-opus model on full dataset with zeroshot"
  
  # - setting_id: "claude-opus-zeroshot-seed-123"
  #   dataset_type: "full"
  #   model_name: "claude-3-opus-20240229"
  #   model_provider: "anthropic"
  #   agent_type: "react"
  #   random_seed: 123
  #   metadata:
  #     purpose: "Test claude-opus model on full dataset with zeroshot"
  
  # - setting_id: "claude-opus-zeroshot-seed-456"
  #   dataset_type: "full"
  #   model_name: "claude-3-opus-20240229"
  #   model_provider: "anthropic"
  #   agent_type: "react"
  #   random_seed: 456
  #   metadata:
  #     purpose: "Test claude-opus model on full dataset with zeroshot"
  
  # Google Models - Multiple random seeds
  # Gemini-2.5-Pro with 3 random seeds
  # - setting_id: "gemini-2.5-pro-zeroshot-seed-42"
  #   dataset_type: "full"
  #   model_name: "gemini-2.5-pro"
  #   model_provider: "google_genai"
  #   agent_type: "react"
  #   random_seed: 42
  #   metadata:
  #     purpose: "Test gemini-2.5-pro model on full dataset with zeroshot"
  
  # - setting_id: "gemini-2.5-pro-zeroshot-seed-123"
  #   dataset_type: "full"
  #   model_name: "gemini-2.5-pro"
  #   model_provider: "google_genai"
  #   agent_type: "react"
  #   random_seed: 123
  #   metadata:
  #     purpose: "Test gemini-2.5-pro model on full dataset with zeroshot"
  
  # - setting_id: "gemini-2.5-pro-zeroshot-seed-456"
  #   dataset_type: "full"
  #   model_name: "gemini-2.5-pro"
  #   model_provider: "google_genai"
  #   agent_type: "react"
  #   random_seed: 456
  #   metadata:
  #     purpose: "Test gemini-2.5-pro model on full dataset with zeroshot"
  
  # Gemini-2.5-Flash with 3 random seeds
  # - setting_id: "gemini-2.5-flash-zeroshot-seed-42"
  #   dataset_type: "full"
  #   model_name: "gemini-2.5-flash"
  #   model_provider: "google_genai"
  #   agent_type: "react"
  #   random_seed: 42
  #   metadata:
  #     purpose: "Test gemini-2.5-flash model on full dataset with zeroshot"
  
  # - setting_id: "gemini-2.5-flash-zeroshot-seed-123"
  #   dataset_type: "full"
  #   model_name: "gemini-2.5-flash"
  #   model_provider: "google_genai"
  #   agent_type: "react"
  #   random_seed: 123
  #   metadata:
  #     purpose: "Test gemini-2.5-flash model on full dataset with zeroshot"
  
  # - setting_id: "gemini-2.5-flash-zeroshot-seed-456"
  #   dataset_type: "full"
  #   model_name: "gemini-2.5-flash"
  #   model_provider: "google_genai"
  #   agent_type: "react"
  #   random_seed: 456
  #   metadata:
  #     purpose: "Test gemini-2.5-flash model on full dataset with zeroshot"
  
  # xAI Models - Multiple random seeds
  # Grok-4 with 3 random seeds
  # - setting_id: "grok-4-zeroshot-seed-42"
  #   dataset_type: "full"
  #   model_name: "grok-4"
  #   model_provider: "xai"
  #   agent_type: "react"
  #   random_seed: 42
  #   metadata:
  #     purpose: "Test grok-4 model on full dataset with zeroshot"
  
  # - setting_id: "grok-4-zeroshot-seed-123"
  #   dataset_type: "full"
  #   model_name: "grok-4"
  #   model_provider: "xai"
  #   agent_type: "react"
  #   random_seed: 123
  #   metadata:
  #     purpose: "Test grok-4 model on full dataset with zeroshot"
  
  # - setting_id: "grok-4-zeroshot-seed-456"
  #   dataset_type: "full"
  #   model_name: "grok-4"
  #   model_provider: "xai"
  #   agent_type: "react"
  #   random_seed: 456
  #   metadata:
  #     purpose: "Test grok-4 model on full dataset with zeroshot"
  
  # Together AI Models (Open Source)
  # - setting_id: "Qwen3-235B-zeroshot"
  #   dataset_type: "full"
  #   model_name: "Qwen/Qwen3-235B-A22B-Instruct-2507-tput"
  #   model_provider: "together"
  #   agent_type: "react"
  #   metadata:
  #     purpose: "Test Qwen3-235B-A22B-Instruct-2507-tput model on full dataset with zeroshot"
  
  # - setting_id: "Qwen3-235B-Thinking-zeroshot"
  #   dataset_type: "full"
  #   model_name: "Qwen/Qwen3-235B-A22B-Thinking-2507"
  #   model_provider: "together"
  #   agent_type: "react"
  #   metadata:
  #     purpose: "Test Qwen3-235B-A22B-Thinking-2507 model on full dataset with zeroshot"
  
  # - setting_id: "Llama-4-Scout-zeroshot"
  #   dataset_type: "full"
  #   model_name: "meta-llama/Llama-4-Scout-17B-16E-Instruct"
  #   model_provider: "together"
  #   agent_type: "react"
  #   metadata:
  #     purpose: "Test Llama-4-Scout-17B-16E-Instruct model on full dataset with zeroshot"
  
  # - setting_id: "Llama-4-Maverick-zeroshot"
  #   dataset_type: "full"
  #   model_name: "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8"
  #   model_provider: "together"
  #   agent_type: "react"
  #   metadata:
  #     purpose: "Test Llama-4-Maverick-17B-128E-Instruct-FP8 model on full dataset with zeroshot"
  
  # - setting_id: "DeepSeek-R1-zeroshot"
  #   dataset_type: "full"
  #   model_name: "deepseek-ai/DeepSeek-R1"
  #   model_provider: "together"
  #   agent_type: "react"
  #   metadata:
  #     purpose: "Test DeepSeek-R1 model on full dataset with zeroshot"
  
  # - setting_id: "DeepSeek-V3-zeroshot"
  #   dataset_type: "full"
  #   model_name: "deepseek-ai/DeepSeek-V3"
  #   model_provider: "together"
  #   agent_type: "react"
  #   metadata:
  #     purpose: "Test DeepSeek-V3 model on full dataset with zeroshot"
  
  # - setting_id: "Kimi-K2-Instruct-zeroshot"
  #   dataset_type: "full"
  #   model_name: "moonshotai/Kimi-K2-Instruct"
  #   model_provider: "together"
  #   agent_type: "react"
  #   metadata:
  #     purpose: "Test Kimi-K2-Instruct model on full dataset with zeroshot"
