# Model selection (presumably the LLM used by the pipeline - confirm against the consumer)
model_name: "gemini-2.5-flash"
model_type: "gemini"

# Competition identifier and log directory
competition: "titanic"
log_dir: "logs"

# General
need_format_pipeline: false
time_run_minutes: 360    # total pipeline run time, in minutes
timeout: 240              # timeout for an LLM answer (unit not stated here; presumably seconds - confirm)
runtime_error_time: 30   # time limit for running code (unit not stated here - confirm)
top_N_for_running_on_test: 0
need_test_score: false
subset_size_in_percent: 10        # percentage; between 10 and 100
# Kept as a string. NOTE(review): presumably interpreted as 10**5 by the consumer - confirm.
validator_size_threshold: "10^5"
# Quoted so the value is loaded as a string rather than an integer.
CUDA_VISIBLE_DEVICES: "0"
need_pro_validator: false
dynamic_model: false

# Number of attempts
# One integer per agent/stage, keyed as number_of_attempts_<name>.
# NOTE(review): presumably the maximum number of tries before giving up - confirm against the consumer.
number_of_attempts_reader: 1
number_of_attempts_validator: 3
number_of_attempts_scorer: 3
number_of_attempts_baseline: 5
number_of_attempts_insight: 10
number_of_attempts_coder: 5
number_of_attempts_checker: 3
number_of_attempts_install: 5
number_of_attempts_debug: 3
number_of_attempts_debug_submit: 3

# Insight
# Idea counts per category (EDA, data, modelling).
number_of_ideas_eda: 2
number_of_ideas_data: 2
number_of_ideas_modelling: 2
# NOTE(review): unit and exact meaning of the delay are not stated here - confirm.
insight_delay: 5

# Adding
# Other algorithm_type_adding values, kept commented out for reference;
# only the uncommented line below is active.
# algorithm_type_adding: base_random_adding
# algorithm_type_adding: "adding_with_groups_split"
# algorithm_type_adding: "top_n_adding"
# algorithm_type_adding: adding_probability_distribution
algorithm_type_adding: "adding_greedy_epsilon"
# NOTE(review): presumably the epsilon used by the greedy-epsilon algorithm selected above - confirm.
adding_epsilon: 0.3
number_of_selected_node: 1
max_add_idea: 1
top_n_for_eda: 4

# Merging
algorithm_type_merging: "merger_by_parts"
number_of_iterations_parents: 2
number_of_selected_node_merging: 2
number_of_iterations_children: 2
# NOTE(review): presumably plays the same role for merging as adding_epsilon does for adding - confirm.
merging_epsilon: 0.3
max_memory_long: 3
# NOTE(review): presumably the number of iterations tolerated without a new best score - confirm.
max_iteration_without_update_best_score: 3


# Debugger
number_of_attempts_debug_generate: 3
max_count_install_error: 1
max_count_of_identical_errors: 3
number_of_iter_for_code_regeneration: 3
debug_mode: "holistic"
# debug_speed_mode values:
#   'standart' - slower but more accurate debugging
#   'fast'     - faster debugging
# NOTE(review): the spelling 'standart' is kept as written; it may be the exact
# value the consumer compares against - confirm before correcting it.
debug_speed_mode: "fast"

# Scoring model
need_scoring_predict: false
anchor_examples: false
# Limit in minutes, per the key name.
max_minutes_to_run_for_complex_training: 10
# Lower and upper bound on the number of ideas.
number_of_ideas_min: 2
number_of_ideas_max: 3
is_scoring_model_test: false

# RAG agent
use_rag: true
retrieve_n_papers: 3  # How many papers to retrieve for the initial pool of ideas
retrieve_n_competitions: 3  # How many competitions to retrieve for the initial pool of ideas
number_rag_ideas: 5  # How many RAG ideas the LLM will use

# Path to the stored competition data in JSON format.
competitions_path: "kaggle_database/competitions_ideas.json"
# Path to the FAISS index file containing competition embeddings.
vector_index_path: "kaggle_database/ideas_vectorbase.faiss"
# Path to a JSON file mapping FAISS index to (link, description).
metadata_path: "kaggle_database/ideas_metadata.json"

# Memory
memory_size: 5
# Options: random_nodes, distant_nodes, nearest_nodes or None.
# NOTE(review): how "None" must be written (YAML null vs. the string) is not shown here - confirm.
memory_algorithm: "nearest_nodes"


# Phases
# Ordered list of phases; each phase is a three-item list.
# NOTE(review): items look like [description, script file name, short label] -
# confirm against the consumer.
phases:
  - - Model training
    - modeling.py
    - Modeling
  - - Data preparation and feature engineering
    - data_preparation.py
    - FeatureEngineering
  - - Exploratory data analysis
    - eda.py
    - EDA

# Agents whose output must be passed through the checker
agents_for_checker:
  - validator
  - scorer
  - insighter
  - coder