# Default configuration for training experiments
# Can be overridden via command line arguments

# Dataset paths
data:
  # NOTE(review): "/path/to/..." looks like a placeholder - point this at the
  # real directory of .h5ad files, either here or via the command line.
  h5ad_dir: "/path/to/tahoe-100m/h5ad"
  # label_dir is optional. When it is not set, the default location
  # training_experiments/data/mappings/ is used.
  # label_dir: null

# Training parameters
training:
  batch_size: 64
  num_epochs: 1
  # Equal to 1e-5. Keep the plain decimal form: YAML 1.1 loaders such as
  # PyYAML resolve a bare `1e-5` (no decimal point) as a string, not a float.
  learning_rate: 0.00001

# scDataset parameters
scdataset:
  # NOTE(review): presumably a multiplier on training.batch_size that sets how
  # many samples are fetched per read (64 * 256 = 16384 with these defaults) -
  # confirm against the scDataset version in use.
  fetch_factor: 256
  # NOTE(review): presumably the number of data-loading worker processes -
  # confirm against the training script.
  num_workers: 8

# Weight computation for weighted sampling
weights:
  # Baseline count added to every (cell_line, drug) combination count before
  # the balancing weights are computed. This prevents extreme reweighting of
  # rare combinations.
  #
  # Example: one combination has 30 cells and another has 20000 cells.
  #   Without a baseline: 20000 / 30 is ~667:1 (very extreme).
  #   With baseline=1000: (20000 + 1000) / (30 + 1000) is ~20:1
  #   (much more reasonable).
  min_count_baseline: 1000

# Strategy configurations
# Every strategy carries an `enabled` flag.
# NOTE(review): presumably only strategies with enabled: true are run - confirm
# against the training script.
# Strategies 3/5 and 4/6 are unweighted/weighted counterparts that share the
# same block_size (16 and 1 respectively).
strategies:
  # 1. Streaming - Sequential access without shuffling
  streaming:
    enabled: true
    shuffle: false

  # 2. Streaming with Buffer - Sequential access with buffer-level shuffling
  #    (HuggingFace/Ray style)
  streaming_buffer:
    enabled: true
    shuffle: true

  # 3. Block Shuffling - scDataset with block_size=16
  block_shuffling:
    enabled: true
    block_size: 16

  # 4. Random Sampling - Full shuffling (block_size=1)
  random_sampling:
    enabled: true
    block_size: 1

  # 5. Block Weighted Sampling - Weighted sampling with block_size=16
  block_weighted:
    enabled: true
    block_size: 16

  # 6. True Weighted Sampling - Weighted sampling with block_size=1
  true_weighted:
    enabled: true
    block_size: 1

# Output
output:
  # Relative path.
  # NOTE(review): presumably resolved against the directory the training
  # script is launched from - confirm.
  save_dir: "./training_experiments/results/main"
  # NOTE(review): unit is not visible in this file - presumably the number of
  # training steps/batches between log messages; confirm.
  log_interval: 1000
