# Configuration for data-constrained scaling law discovery with OpenEvolve
# Top-level run settings.
# NOTE(review): units are not stated in this file. checkpoint_interval is
# presumably counted in iterations (1 = checkpoint after every iteration) --
# confirm against the OpenEvolve config schema.
max_iterations: 50
checkpoint_interval: 1
log_level: "INFO"
# Fixed seed value; which components consume it is not visible from this file.
random_seed: 42
# Weights & Biases settings for this run.
# The literal "{model}" appears in name, group and tags; presumably a
# placeholder that the launcher substitutes with the model name before the
# config is used -- TODO confirm where the substitution happens.
wandb:
  enabled: true
  project: "openevolve"
  name: "data_constrained_scaling_law-{model}"
  group: "single_task/data_constrained_scaling_law/{model}"
  job_type: "single_task"
  tags: ["sldbench", "single_task", "data_constrained_scaling_law", "{model}"]
  mode: "online"

# LLM configuration
# Only the primary model is populated: it carries weight 1.0, while the
# secondary model is null with weight 0.0.
llm:
  primary_model: "gemini-3-flash-preview"
  primary_model_weight: 1.0
  secondary_model: null
  secondary_model_weight: 0.0
  # Host is generativelanguage.googleapis.com and the path ends in /openai/,
  # i.e. the OpenAI-compatible route of that API.
  api_base: "https://generativelanguage.googleapis.com/v1beta/openai/"
  max_tokens: 16384
  # NOTE(review): timeout and retry_delay are presumably in seconds -- confirm
  # against the OpenEvolve config schema.
  timeout: 240
  retries: 10
  retry_delay: 10

# Prompt configuration
prompt:
  system_message: |
    You are an expert in scaling laws and machine learning who specializes in discovering and improving scaling law functions for different LLM training scenarios. Your task is to evolve both the `scaling_law_func` function (currently a naive power law) and the `fit_scaling_law` optimization algorithm (currently a naive BFGS) to better model the relationship between training data characteristics and model loss under data-constrained conditions.

    **IMPORTANT: The scaling law function must use no more than 7 parameters.**

    Focus on mathematical accuracy across different data scales, cross-dataset generalization, parameter efficiency (simple forms that can be fitted with limited data), and numerical/theoretical stability.

    **DATA CHARACTERISTICS (182 total data points):**
    - Features: [unique_tokens, params, tokens] - 3D input
    - Labels: loss - scalar output
    - Dataset size: 161 points (a subset of the 182 total)
    - Parameter range (P): 1.1e8 to 1.1e9 (110M to 1.1B parameters)
    - Token count range (D): 1e9 to 1e12 tokens
    - Unique tokens range: 1e7 to 5e8 unique tokens
    - Loss range: 1.8 to 7.2 (cross-entropy loss)
    - Model architectures: Transformer variants with different parameterizations
    - Data explores scaling under token/unique-token constraints

    The function signatures must remain:

    ```python
    def scaling_law_func(data_points, params):
        # data_points: (N,3) array with columns [unique_tokens, params, tokens]
        # Columns of data_points, in order:
        #   unique_tokens: Array of unique token counts
        #   params (column): Array of model parameter counts
        #   tokens: Array of token counts
        # params (argument): Array of up to 7 scaling law parameters
        # Returns: Predicted loss values

    def fit_scaling_law(data_points, loss_values):
        # data_points: (N,3) array with columns [unique_tokens, params, tokens]
        # loss_values: Array of corresponding loss values
        # Returns: Optimized parameters (up to 7 parameters)
    ```

    Write all improvements between # EVOLVE-BLOCK-START and # EVOLVE-BLOCK-END markers.

    You are not allowed to use input-dependent features in scaling_law_func (e.g., the median, min, or max of the inputs).

  # Remaining prompt options (siblings of system_message under `prompt`).
  # NOTE(review): num_top_programs / num_diverse_programs presumably set how
  # many earlier programs are included in each prompt -- confirm against the
  # OpenEvolve config schema.
  num_top_programs: 3
  num_diverse_programs: 2
  use_template_stochasticity: true

# Database configuration for evolution
database:
  population_size: 100
  archive_size: 50
  num_islands: 5
  # NOTE(review): migration_interval is 25 while max_iterations at the top of
  # this file is 50. If the interval is counted in iterations, very few
  # migrations can happen in one run -- confirm the unit and the intent.
  migration_interval: 25
  migration_rate: 0.1
  elite_selection_ratio: 0.1
  # exploration_ratio (0.2) + exploitation_ratio (0.7) = 0.9.
  # NOTE(review): confirm how the consumer treats the remaining 0.1.
  exploration_ratio: 0.2
  exploitation_ratio: 0.7
  # Three feature dimensions are listed; feature_bins is a single value (10).
  # NOTE(review): presumably the same bin count applies to every dimension --
  # confirm.
  feature_dimensions: ["combined_score", "complexity", "diversity"]
  feature_bins: 10

# Evaluator configuration
evaluator:
  # NOTE(review): timeout is presumably in seconds per evaluation -- confirm.
  timeout: 600
  max_retries: 3
  # cascade_evaluation is false while cascade_thresholds is still set;
  # presumably the thresholds are ignored in this configuration -- confirm.
  cascade_evaluation: false
  cascade_thresholds: [0.3, 0.6]
  parallel_evaluations: 4
  use_llm_feedback: false

# Evolution settings
# NOTE(review): with diff_based_evolution false the model presumably returns
# whole-program rewrites rather than diffs, and max_code_length is presumably
# a character limit on the program -- confirm both against the OpenEvolve
# config schema.
diff_based_evolution: false
max_code_length: 100000
