from dataclasses import dataclass, field
from typing import Optional, Union, List


@dataclass
class EvaluationArguments:
    """Container for the options that configure an evaluation run.

    Every attribute is a dataclass field carrying a ``help`` entry in its
    metadata. Field order is significant: it defines the positional order of
    the generated ``__init__``, so new fields should be appended rather than
    inserted.

    The class performs no validation and no reconciliation between the
    current fields and the legacy ones at the bottom (for example
    ``evaluation_batch_size`` versus ``eval_batch_size``); consumers are
    responsible for deciding which one to honour.
    """

    # --- Core evaluation parameters -------------------------------------
    tasks: list[str] = field(
        default_factory=list,  # fresh list per instance, never shared
        metadata={"help": "The tasks to evaluate the model on."},
    )
    num_fewshot: Optional[int] = field(
        default=None,
        metadata={"help": "The number of few-shot examples to use for evaluation."},
    )
    # Kept as a string so that both "auto" and a numeric value can be passed.
    evaluation_batch_size: Optional[str] = field(
        default=None,
        metadata={
            "help": 'The batch size for evaluation (maps to simple_evaluate batch_size). Can be "auto" or an integer as string.'
        },
    )
    max_batch_size: Optional[int] = field(
        default=None,
        metadata={"help": "Maximum batch size to use."},
    )

    # --- Cache and request management -----------------------------------
    use_cache: Optional[str] = field(
        default=None,
        metadata={"help": "Path to cache directory."},
    )
    cache_requests: bool = field(
        default=False,
        metadata={"help": "Whether to cache requests."},
    )
    rewrite_requests_cache: bool = field(
        default=False,
        metadata={"help": "Whether to rewrite the requests cache."},
    )
    delete_requests_cache: bool = field(
        default=False,
        metadata={"help": "Whether to delete the requests cache."},
    )

    # --- Sampling and bootstrapping -------------------------------------
    limit: Optional[int] = field(
        default=None,
        metadata={"help": "The limit of the number of examples to evaluate on."},
    )
    bootstrap_iters: int = field(
        default=100000,
        metadata={"help": "Number of bootstrap iterations."},
    )

    # --- Output and logging ---------------------------------------------
    check_integrity: bool = field(
        default=False,
        metadata={"help": "Whether to check data integrity."},
    )
    write_out: bool = field(
        default=False,
        metadata={"help": "Whether to write out results."},
    )
    # Note: this is the only non-legacy boolean switch that defaults to True.
    log_samples: bool = field(
        default=True,
        metadata={"help": "Whether to log samples."},
    )

    # --- Chat and instruction formatting --------------------------------
    system_instruction: Optional[str] = field(
        default=None,
        metadata={"help": "System instruction for chat models."},
    )
    apply_chat_template: bool = field(
        default=False,
        metadata={"help": "Whether to apply chat template."},
    )
    fewshot_as_multiturn: bool = field(
        default=False,
        metadata={"help": "Whether to use few-shot as multiturn."},
    )
    # Stored as a raw string; any parsing happens outside this class.
    gen_kwargs: Optional[str] = field(
        default=None,
        metadata={"help": "Generation kwargs as string."},
    )

    # --- Prediction and execution ---------------------------------------
    predict_only: bool = field(
        default=False,
        metadata={"help": "Whether to only predict without evaluation."},
    )
    confirm_run_unsafe_code: bool = field(
        default=False,
        metadata={"help": "Whether to confirm running unsafe code."},
    )

    # --- Random seeds ---------------------------------------------------
    random_seed: int = field(
        default=0,
        metadata={"help": "Random seed for general randomness."},
    )
    numpy_random_seed: int = field(
        default=1234,
        metadata={"help": "Random seed for numpy."},
    )
    torch_random_seed: int = field(
        default=1234,
        metadata={"help": "Random seed for torch."},
    )
    fewshot_random_seed: int = field(
        default=1234,
        metadata={"help": "Random seed for few-shot sampling."},
    )

    # --- Legacy fields, retained for backward compatibility -------------
    eval_batch_size: int = field(
        default=1,
        metadata={"help": "Legacy: The batch size for evaluation (maps to batch_size)."},
    )
    progress_bar: bool = field(
        default=True,
        metadata={"help": "Legacy: Whether to show a progress bar."},
    )
    download_mode: bool = field(
        default=False,
        metadata={"help": "Legacy: Whether to download the model if not already downloaded."},
    )
    no_cache: bool = field(
        default=False,
        metadata={"help": "Legacy: Whether to not use the cache."},
    )