# Data options: embedding cache, label/sequence columns, dataset splitting
# and subsampling. Values written as ${...} are OmegaConf interpolations;
# several of them read keys (model.*, benchmark.*, data.*, seed) that are
# defined outside this file.
defaults:
  - _self_

# Directory for cached embeddings, rooted at the DATA_HOME environment
# variable (no fallback is given to oc.env here).
# Other values noted by the original author: null, ~/.cache/haipr/embeddings
cache_dir: ${oc.env:DATA_HOME}/haipr/embeddings
# On-disk cache format. Options: hdf5, lmdb
cache_format: lmdb
# Uses model.chain_break_token when it is set, otherwise "|".
# "|" is the default for the ESM3/C family of models; change it to match the
# model's alphabet.
chain_break_token: ${oc.select:model.chain_break_token, "|"}
test_split_idx: null
# Uses the same method as data.split_method by default.
test_split_method: ${data.split_method}
# If the dataset has more than this number of samples, subsample it down to
# this number; 0 means no subsampling.
# Read from the `subsample_threshold` key looked up by oc.select when it is
# set, otherwise 0.
subsample_threshold: ${oc.select:subsample_threshold, 0}
# If set, subsample the training data to this number of samples without
# changing the validation/test data.
# NOTE(review): 0 presumably disables this, mirroring subsample_threshold - confirm.
subsample_train: 0
# If true, focus on the mutated region when generating sequences.
focus: true
# If there are fewer single mutants than this number, contig splits are skipped.
single_mutant_cutoff: 1000
# NOTE(review): presumably forces features to be recomputed instead of being
# read from the cache - confirm against the consumer.
recompute_features: false
# Hold-out set for testing. Example values: null, 0.01, 0.05, 0.1, 0.2, 0.3,
# 0.4, 0.5. A hold-out can also be obtained with split_method ood and
# test_split_idx.
cut_top: null
normalize_labels: false
# Options: cv, lomo, ood, skewed, contig, modulo
split_method: ood
# Not used for lomo.
num_splits: 5
run_single_split: false
# 0 (the default) is for regression tasks.
num_classes: 0
# Required: name of the column containing the labels. Defaults to the first
# entry of benchmark.targets.
label_column: ${benchmark.targets[0]}
# Name of the column containing the sequences, taken from
# benchmark.sequence_column.
sequence_column: ${benchmark.sequence_column}
# Whether to remove rows with constant labels.
remove_constant_labels: false
# Whether to average embeddings; true (the default) gives sequence-level
# embeddings.
average_embeddings: true
# Type of features, taken from model.feature_type.
feature_type: ${model.feature_type}
# Random seed for reproducibility: uses `seed` when it is set, otherwise 42.
random_state: ${oc.select:seed, 42}

# DDP (Distributed Data Parallel) configuration
# Set to true to enable DDP for multi-GPU training.
use_ddp: false