# Dataset and Model Configuration

# Dataset used for training and evaluation.
# Choices: 'imdb', 'imdb_long', 'imdb_lra', 'cifar10', 'listops',
#          'pathfinder32', 'aan'
# Notes:
#   - pathfinder32: max_seq_len is 1024 (32x32 image).
#   - imdb_long: uses the canine-c tokenizer with data from Hugging Face.
#   - imdb_lra: uses a custom ASCII tokenizer with downloaded data.
dataset: 'imdb_lra'

# Pretrained model to use, or to import the tokenizer from.
# Choices: 'bert-base-uncased', 'bert-large-uncased', 'bert-base-cased',
#          'bert-large-cased', 'roberta-base', 'roberta-large',
#          'google/canine-c', 'custom'
model_name: 'google/canine-c'

# Training Parameters

# Batch size for training.
batch_size: 64
# Maximum sequence length for input texts.
max_seq_len: 4096
# Number of training epochs.
num_epochs: 100
# Learning rate. Written with an explicit mantissa dot so that YAML 1.1
# loaders (e.g. PyYAML) resolve it as a float; a bare 5e-5 or a quoted
# '5e-5' is loaded as a string by such loaders.
learning_rate: 5.0e-5
# Weight decay factor.
weight_decay: 0.1
# Use the default scheduler.
use_scheduler: true

# Model Architecture

# Number of attention heads.
num_heads: 6
# Dimension of the feedforward network model.
hidden_dim: 2048
# Number of transformer encoder layers.
num_layers: 1
# Dropout factor of the model.
dropout: 0.1
# Type of attention to use. Choices: 'astro', 'softmax'
attention_type: 'astro'

# Memory Replay Parameters

# Use memory replay backpropagation.
memory_replay_backprop: true
# Number of segments the input sequences are split into for memory replay
# backpropagation.
num_segments: 8
# Number of memory tokens.
num_memory_tokens: 1
# Use the Astrocytic memory.
astro_mem: true
# Use the summed memory from all the segments.
mem_sum: false

# AstroAttention Parameters

# Scaling factor for AstroAttention.
scaleD: 100
# Alpha parameter for AstroAttention.
alpha: 0.25
# Add Hrel/scaling_factor.
add_Hrel: true
# Apply a sigmoid nonlinearity over H_neuron and H_astro.
astro_sigmoid_nonlinearity: false
# Clip value for the relative positional encoding.
clip: 100

# Pooling Method

# Pooling method. Choices: 'cls', 'average'
pooling: 'average'

# Dataset Sampling

# Percentage of the dataset to use for training
# (only for imdb and imdb_long).
sample_percentage: 100

# Pretrained Model Handling

# Freeze the pretrained model parameters.
freeze_pretrained: false
# Use only the embeddings from the pretrained model.
use_only_embeddings: true

# Attention Replacement Parameters

# Replace attention layers with AstroAttention
# (only set to true when use_only_embeddings is false).
replace_attention: false
# Comma-separated list of layer indices whose attention is replaced,
# e.g. '0,1,2'. If null (None), all layers are replaced.
layers_to_replace: null

# Logging and Saving

# Use Weights & Biases for logging
# (set to true and configure wandb_run_name to enable).
wandb: false
# Name for the Weights & Biases run.
wandb_run_name: 'rmaat_run'
# Path to save the model and plots.
model_save_path: './models'

# Seed for Reproducibility

# Seed value for reproducibility.
seed: 42

# Pathfinder Dataset Parameters
# pathfinder_diff_levels: 'curv_baseline,curv_contour_length_9,curv_contour_length_14'  # Comma-separated list of difficulty levels to use for the Pathfinder dataset. If None, defaults to curv_contour_length_14.
# curriculum_learning: False  # Enable curriculum learning for the Pathfinder dataset (trains from easy to hard diff_levels). Not implemented yet.
