[wandb]
# Toggle for experiment tracking; currently disabled.
# NOTE(review): presumably gates Weights & Biases logging — confirm against the loader.
use_wandb = false

[benchmark]
# NOTE(review): "atom" presumably selects the [atom_config] table below; an
# [egno_config] table also exists for the alternative model — confirm how
# the loader maps model_type to a config table.
model_type = "atom"
benchmark_name = "atom_std_att"
compile = true
compile_trace = false
# NOTE(review): presumably the number of repeated runs of this benchmark — confirm.
runs = 3
log_weights = false

[dataloader]
# Selects between the single-task and multitask parameter groups below.
multitask = false
# NOTE(review): units of delta_T are not stated here — confirm (e.g. MD steps vs. time).
delta_T = 3000
dataset = "md17"
num_timesteps = 8

# Single-task dataloader parameters
molecule_type = "aspirin"  # Molecule used for single-task learning (a single name here; NOTE(review): confirm whether a list is also accepted)

# Multitask dataloader parameters
# NOTE(review): presumably ignored while multitask = false — confirm.
train_molecules = ["aspirin", "benzene", "ethanol", "toluene", "uracil", "salicylic", "malonaldehyde"]
validation_molecules = ["aspirin", "benzene", "ethanol", "toluene", "uracil", "salicylic", "malonaldehyde"]
test_molecules = ["naphthalene"]

# Other dataloader parameters
explicit_hydrogen = false
explicit_hydrogen_gradients = false
# NOTE(review): units of the radius threshold are not stated here — confirm.
radius_graph_threshold = 1.6
rrwp_length = 0  # 0 for no RRWPs
time_lag_mode = "uniform"

normalize_z = false
# The four keys below share names with PyTorch DataLoader arguments.
# NOTE(review): if they are forwarded to torch.utils.data.DataLoader,
# persistent_workers and prefetch_factor are only valid when num_workers > 0.
persistent_workers = true
num_workers = 4
pin_memory = true
prefetch_factor = 2
# NOTE(review): presumably forces the processed dataset to be rebuilt on every
# run instead of reusing a cache — confirm this is intended outside debugging.
force_regenerate = true

[training]
device = "cuda"
seed = 42
batch_size = 100
epochs = 1000
# Mixed precision is disabled.
# NOTE(review): amp_dtype presumably only takes effect when use_amp = true — confirm.
use_amp = false
amp_dtype = "bfloat16"
# NOTE(review): presumably the gradient-norm clipping threshold — confirm.
max_grad_norm = 1.0
# NOTE(review): presumably the standard deviation of noise added to training
# labels; units/normalization not stated here — confirm.
label_noise_std = 0.1

[optimizer]
type = "adamw"
learning_rate = 1e-3
weight_decay = 1e-5
adam_betas = [0.9, 0.999]
# NOTE(review): 1e-10 is smaller than the 1e-8 epsilon commonly used as the
# default for Adam-style optimizers — confirm this is intentional.
adam_eps = 1e-10

[scheduler]
# The string "none" is a sentinel value (TOML has no null type).
# NOTE(review): presumably disables learning-rate scheduling — confirm the
# loader compares against this exact string.
type = "none"

[atom_config]
# NOTE(review): presumably read when [benchmark].model_type = "atom" — confirm.
# Architecture parameters
num_layers = 5
num_heads = 8
lifting_dim = 128
# Output parameters
output_heads = 1
delta_update = false
# Attention parameters
# NOTE(review): the key is spelled "heterogenous" (not "heterogeneous"); it must
# match whatever name the loader reads, so do not rename it here alone.
heterogenous_attention_type = "self"
positional_encoding = "trope"
# NOTE(review): rope_base / rope_tau presumably parameterize the positional
# encoding selected above — confirm which encodings use them.
rope_base = 1000
rope_tau = 10000
learnable_attention_denom = false
# Feature parameters
lifting_type = "quasi_equivariant_tp"
projection_type = "equivariant"
# Layer parameters
norm = "rms"
activation = "swiglu"
value_residual_type = "learnable"

[egno_config]
# NOTE(review): presumably unused while [benchmark].model_type = "atom" — confirm.
num_layers = 5
lifting_dim = 64
activation = "silu"
# NOTE(review): British spelling "normalise" here vs. "normalize_z" in
# [dataloader]; keys must match what the loader reads, so keep as-is unless
# the loader is updated too.
normalise_scalars = true
use_time_conv = true
num_fourier_modes = 2
time_embed_dim = 32