# Run identity and distributed / batch settings.
scheme: pcqm.pretrain
# NOTE(review): "cyclceloss" looks like a typo for "cycleloss". Left untouched
# because this string may be used for checkpoint/log paths — confirm before renaming.
model_name: tgt_at_dp_rdkit_24l_1440b_1e-4r_600ks_10dist_cgn5_nowd_cyclceloss_torsion_linangle_vn_dbt10-1-2_angle2dist
model_prefix: tgt_at_200m
distributed: true           # In our distributed setting we had 32 GPUs (V100)
# NOTE(review): 180 does not match the "64 per GPU * 32 GPUs = 2048 total" figure
# previously quoted for the 32-GPU setup. The "1440b" tag in model_name suggests a
# total of 1440 (= 180 * 8) — TODO confirm whether batch_size is per device and
# how many devices this config actually targets.
batch_size: 180
# Model size settings (depth, per-channel widths, head count).
# NOTE(review): meanings are inferred from key names only — confirm against the model code.
model_height: 24                # cf. the "24l" tag in model_name
node_width: 768
edge_width: 256
angle_width: 256
torsion_angle_width: 256
num_heads: 64
# Learning-rate schedule, gradient clipping and dropout-style settings.
# NOTE(review): model_name carries a "1e-4r" tag while max_lr is 0.0015 —
# confirm which of the two reflects the intended learning rate.
max_lr: 0.0015
clip_grad_norm: 5.0             # cf. the "cgn5" tag in model_name
source_dropout: 0.3
drop_path: 0.2
node_act_dropout: 0.1
edge_act_dropout: 0.1
lr_warmup_steps: 30000          # 5% of lr_total_steps
lr_total_steps: 600000          # cf. the "600ks" tag in model_name
# Feed-forward sizing, degree scaling, hop limit and activation choice.
# NOTE(review): semantics are inferred from key names only — verify in the model code.
node_ffn_multiplier: 1.0
edge_ffn_multiplier: 1.0
scale_degree: true
upto_hop: 32
activation: gelu
# Evaluation / prediction sampling, data loading and optimizer choice.
evaluation_samples: 10
prediction_samples: 10
prediction_bmult: 2             # presumably a batch-size multiplier for prediction — TODO confirm
dataloader_workers: 2
evaluation_type: validation
# Keep this as "Adam" when apex is not installed.
# NOTE(review): the apex-backed alternative value is not shown in this file —
# check the training code for the accepted optimizer names.
optimizer: Adam
# Coordinate-noise and distance-binning settings.
# NOTE(review): units and exact semantics are not stated in this file — confirm in the training code.
coords_noise: 0.2
coords_noise_smooth: 1.0
num_dist_bins: 512
range_dist_bins: 10             # cf. the "10dist" tag in model_name
dist_loss_weight: 0.1
# Validation cadence, precision, training split and triplet-interaction settings.
validation_frequency: 10        # NOTE(review): unit (epochs vs. steps) is not stated here — confirm
mixed_precision: true
train_split: train              # Set it to "train-3d" for meaningful validation
triplet_type: attention         # Set to "aggregate" for triplet aggregation
triplet_heads: 16
