# Example config: FT-Transformer, downstream task, target 3, 100 samples per class, linear head atop a frozen feature extractor.
# NOTE(review): presumably the random seed for this run — confirm against the config loader.
seed = 3

# Dataset and preprocessing choices for this run.
[data]
# NOTE(review): presumably selects how categorical features are encoded — confirm accepted values in the data loader.
cat_policy = "indices"
dset_id = "mimic"
normalization = "quantile"
# NOTE(review): "binclass" presumably means binary classification — confirm.
task = "binclass"

# Model hyperparameters; the header comment of this file names the architecture as FT-Transformer.
[model]
activation = "reglu"
attention_dropout = 0.205171574432684
# NOTE(review): presumably the feed-forward hidden width expressed as a multiple of d_token — confirm against the model code.
d_ffn_factor = 0.682591056974095
d_token = 424
ffn_dropout = 0.4481899309381051
initialization = "kaiming"
n_heads = 8
n_layers = 2
prenormalization = true
residual_dropout = 0.0

# Optimization settings for this run.
[training]
batch_size = 256
eval_batch_size = 256
lr = 0.0001
lr_n_decays = 0
n_epochs = 200
num_batch_warm_up = 0
optimizer = "adamw"
# NOTE(review): far larger than n_epochs (200); if patience is counted in epochs, early stopping can never trigger — confirm this is intended.
patience = 100000.0
weight_decay = 3.559914696970604e-6

# Transfer-learning setup; this config runs the "downstream" stage.
[transfer]
stage = "downstream"
load_checkpoint = true
# Placeholder: replace YOUR_PATH_HERE with the real location of checkpoint.pt.
checkpoint_path = "YOUR_PATH_HERE/checkpoint.pt"
# Despite its name, this argument specifies the downstream MIMIC target (0-11).
pretrain_proportion = 3
pretrain_subsample = false
downstream_samples_per_class = 100
# The feature extractor is frozen and only the head is listed for fine-tuning;
# with use_mlp_head = false this matches the "linear head" described in the file header.
freeze_feature_extractor = true
layers_to_fine_tune = ["head"]
use_mlp_head = false
head_lr = 0.0001
epochs_warm_up_head = 0
