###
# Casanovo configuration.
# Blank entries are interpreted as "None".
###

###
# The following parameters can be modified when running inference or when
# fine-tuning an existing Casanovo model.
###

# Precursor m/z tolerance, in ppm: the largest absolute difference from the
# observed precursor m/z that is allowed. Predictions that fall outside the
# tolerance range are assigned a negative peptide score.
precursor_mass_tol: 50
# Isotopes taken into account when the predicted precursor m/z is compared
# with the observed one.
isotope_error_range:
  - 0
  - 1
# Shortest peptide length that may be predicted.
min_peptide_len: 6
# Size of one inference batch, in spectra.
predict_batch_size: 1024
# How many beams beam search uses.
n_beams: 1
# How many PSMs are reported per spectrum.
top_match: 1
# Hardware accelerator to run on. Must be one of:
# "cpu", "gpu", "tpu", "ipu", "hpu", "mps", or "auto".
accelerator: 'auto'
# Devices to use: either a positive integer, or -1 to use all available
# devices. When left as null (or empty), an appropriate number is selected
# automatically based on the chosen accelerator.
devices: null

###
# The following parameters should only be modified if you are training a new
# Casanovo model from scratch.
###

# Seed for random number generation, fixed so that results are reproducible.
random_seed: 454

# OUTPUT OPTIONS
# Logging frequency in training steps.
n_log: 100
# Tensorboard directory to use for keeping track of training metrics.
tb_summarywriter: "Training/tb/open_GAN_MTL_scalar"
# Number of best model checkpoints to keep during training: -1 keeps all of
# them, and null (or an empty value) keeps none.
save_top_k: 1
# Path to saved checkpoints.
model_save_folder_path: "weights/open_GAN_MTL_scalar_1e6_param_sim_spectra"
# Directory in which lance instances are saved.
lance_dir: null
# Model validation and checkpointing frequency in training steps.
# Written without a digit separator: a form such as 50_000 is an integer only
# under YAML 1.1, while YAML 1.2 core-schema loaders read it as a string.
val_check_interval: 50000

# Paths to the index files that specify the split to use when only a single
# lance file is passed.
train_split_indices_path: "mskb_casanovo_data/combined/train_indices_combined_Train-Cfix_Mox_Val-Mox.pkl"
val_split_indices_path: "mskb_casanovo_data/combined/val_indices_combined_Val-Mox-Cfix.pkl"
test_split_indices_path: null

# SPECTRUM PROCESSING OPTIONS
# Keep only this many of the most intense peaks; all remaining peaks are
# discarded.
n_peaks: 300
# Lower m/z bound for peaks; peaks with a smaller m/z are discarded.
min_mz: 50.5
# Upper m/z bound for peaks; peaks with a larger m/z are discarded.
max_mz: 4500.0
# Lower intensity bound for peaks; less intense peaks are discarded.
min_intensity: 0.0
# Largest absolute m/z difference, in Da, allowed when removing the precursor
# peak.
remove_precursor_tol: 2.0
# Highest precursor charge allowed; spectra with a larger charge are skipped.
max_charge: 10

# MODEL ARCHITECTURE OPTIONS
# Dimensionality of latent representations, i.e. peak embeddings.
dim_model: 512
# Number of attention heads.
n_head: 8
# Dimensionality of fully connected layers.
dim_feedforward: 1024
# Number of transformer layers in spectrum encoder and peptide decoder.
n_layers: 9
# Dropout rate for model weights.
dropout: 0.18
# Number of dimensions to use for encoding peak intensity.
# Projected up to `dim_model` by default and summed with the peak m/z encoding.
dim_intensity: null
# Max decoded peptide length.
max_length: 100
# NOTE: the integers below are written without `_` digit separators, because
# separators are only recognised as part of an integer by YAML 1.1 loaders;
# YAML 1.2 core-schema loaders would read such values as strings.
# The number of iterations for the linear warm-up of the learning rate.
warmup_iters: 600000
# The number of iterations for the cosine half period of the learning rate.
cosine_schedule_period_iters: 600000  # 445447
# Learning rate for weight updates during training.
# Written with an explicit decimal point: YAML 1.1 loaders (e.g. PyYAML) only
# resolve exponent notation as a float when the mantissa contains a ".", so a
# bare 4e-4 would be loaded as a string.
learning_rate: 4.0e-4
# Regularization term for weight updates (decimal point kept for the same
# reason as learning_rate).
weight_decay: 1.0e-5
# Amount of label smoothing when computing the training loss.
train_label_smoothing: 0.01
# Shuffle dataset during training.
# A buffer of size buffer_size is filled and examples from this buffer are
# randomly sampled.
shuffle: true
buffer_size: 100000

# TRAINING/INFERENCE OPTIONS
# Size of one training batch, in spectra.
train_batch_size: 64
# Upper limit on the number of training epochs.
max_epochs: 2
# How many validation steps are run before training begins.
num_sanity_val_steps: 1
# Whether to calculate peptide and amino acid precision during training.
# This is expensive, so it is recommended to leave it off.
calculate_precision: false

# Additional PyTorch Lightning trainer flags.
accumulate_grad_batches: 1
gradient_clip_val: null
gradient_clip_algorithm: null
# Listed values: '16-true', '16-mixed', 'bf16-true', 'bf16-mixed', '32-true',
# '64-true', '64', '32', '16', 'bf16'.
precision: '32-true'

# Resuming training and early stopping.
# Listed values: 'last', 'best', 'path' (the last one is presumably a
# checkpoint path -- confirm against the consuming code).
resume_training_from: 'last'
early_stopping_patience: null

# Substitute L for I in peptide sequences.
replace_isoleucine_with_leucine: true
# Reverse peptide sequences.
reverse_peptides: true
# Use the mskb tokenizer; otherwise proforma syntax is used.
mskb_tokenizer: true

# AMINO ACID AND MODIFICATION VOCABULARY
# Maps each token (amino acid, modified amino acid, or N-terminal
# modification) to a mass value.
# NOTE(review): the masses are presumably in Da -- confirm.
# NOTE(review): entry order may matter to the consumer (e.g. token indices);
# do not reorder without checking.
residues:
  "G": 57.021464
  "A": 71.037114
  "S": 87.032028
  "P": 97.052764
  "V": 99.068414
  "T": 101.047670
  "C[Carbamidomethyl]": 160.030649 # 103.009185 + 57.021464
  # I and L are listed with the same mass value.
  "I": 113.084064
  "L": 113.084064
  "N": 114.042927
  "D": 115.026943
  "Q": 128.058578
  "K": 128.094963
  "E": 129.042593
  "M": 131.040485
  "H": 137.058912
  "F": 147.068414
  "R": 156.101111
  "Y": 163.063329
  "W": 186.079313
  # Amino acid modifications.
  # Each value is the unmodified residue mass plus the modification delta
  # shown in the inline comment. The deamidated N and Q values are equal to
  # the D and E values listed above.
  "M[Oxidation]": 147.035400 # Met oxidation:   131.040485 + 15.994915
  "N[Deamidated]": 115.026943 # Asn deamidation: 114.042927 +  0.984016
  "Q[Deamidated]": 129.042593 # Gln deamidation: 128.058578 +  0.984016
  # N-terminal modifications.
  # These values are mass deltas on their own (note the negative NH3 loss).
  "[Acetyl]-": 42.010565 # Acetylation
  "[Carbamyl]-": 43.005814 # Carbamylation "+43.006"
  "[Ammonia-loss]-": -17.026549 # NH3 loss
  # NOTE(review): 43.005814 + (-17.026549) = 25.979265, which differs from the
  # value below (and from the number embedded in its key) by 0.001. Confirm
  # whether this is intentional before changing it, since the key is a token.
  "[+25.980265]-": 25.980265 # Carbamylation and NH3 loss

# Tokens that are unseen during training are specified here.
expanded_residues: null

# Calibration factor for calibrated stacking during inference:
# Score = Score * (factor if token seen during training else 1)
calibration_factor: null

# Path to a pkl file containing a np.array of all residue occurrences in the
# PSMs. Format: (map, matrix), where map is a map between the AA string and
# its position in the matrix.
residue_counts_path: null

# Residue distributions for every episode of training. The numbers will be
# normalized, i.e. divided by their sum.
episodic_distributions: null

# Proportion of real data during training (between 0 and 1); the rest will be
# simulated data.
proportion_real_data: 0.5
