### CONFIGURATION FOR INSPIRATION RETRIEVAL SFT ###
### R1-Distill Native Format: No system prompt, reasoning starts directly ###
### Dataset: 168,880 samples, max tokens: ~9,784, mean: ~7,300 tokens ###

# MODIFY THIS: Point to your base model.
# Replace the <YOUR_HF_MODELS_DIR> placeholder with the directory that holds
# the deepseek-ai/DeepSeek-R1-Distill-Qwen-7B weights.
model_name_or_path: <YOUR_HF_MODELS_DIR>/deepseek-ai/DeepSeek-R1-Distill-Qwen-7B
template: deepseekr1  # Template for R1-Distill models (adds <think> to the prompt)

# MODIFY THIS: Point to your dataset directory.
# This should contain your IR SFT data (and presumably the dataset_info.json
# that resolves the dataset names below — confirm for your setup).
# Data flow: IR_SFT_DATA_DIR from main.sh
dataset_dir: <YOUR_IR_SFT_DATA_DIR>/
dataset: train  # Dataset name; mapped to the actual file in dataset_info.json
eval_dataset: eval  # Dataset name; mapped to the actual file in dataset_info.json

### RESUME TRAINING CONFIGURATION ###
# To resume from a checkpoint: uncomment the `resume_from_checkpoint` line below and set it to the checkpoint path
# To start fresh training: keep the `resume_from_checkpoint` line commented out
# resume_from_checkpoint: <YOUR_CHECKPOINT_DIR>/lora_training_inspiration_retrieval/checkpoint-XXX
# ^ COMMENT OUT THE LINE ABOVE IF YOU WANT TO START TRAINING FROM SCRATCH

# LoRA adapter settings (high-quality configuration)
finetuning_type: lora
lora_target: all
# A deliberately high rank, chosen for complex reasoning
lora_rank: 256
# Alpha is kept at twice the rank (2 * 256)
lora_alpha: 512
lora_dropout: 0.1

# Training Setup
stage: sft
do_train: true
# presumably masks earlier turns so that only the last turn contributes to the
# loss — TODO confirm against the trainer's data arguments
mask_history: true
# NOTE(review): `false` leaves training-set shuffling ENABLED, so the original
# data order is NOT preserved. Set to `true` if the original order must be kept.
disable_shuffling: false  # Shuffling stays enabled with this value
seed: 42  # Random seed for reproducibility (Hitchhiker's Guide reference)

# Batching and sequence length
# The target effective batch size is 128: retune gradient_accumulation_steps
# whenever the GPU count changes.
gradient_accumulation_steps: 8
# A per-device batch of 2 works with the 16K context length
per_device_train_batch_size: 2
packing: false
# Sufficient for ALL 168,880 samples (max: ~9,784 tokens, mean: ~7,300 tokens)
cutoff_len: 16384

# Preprocessing (Parallel Tokenization)
preprocessing_num_workers: 32  # Use 32 CPU cores for parallel tokenization
# MODIFY THIS: Point to your tokenized cache directory (optional, speeds up repeated runs)
# NOTE(review): presumably an existing cache at this path is reused as-is, so
# clear it after changing the dataset, template, or cutoff_len — confirm.
tokenized_path: <YOUR_TOKENIZED_CACHE_DIR>/inspiration_retrieval_lora

# Optimizer and learning-rate schedule
num_train_epochs: 1
learning_rate: 0.00005
weight_decay: 0.1
# Cosine schedule with a warmup ratio of 0.1
lr_scheduler_type: cosine
warmup_ratio: 0.1
# Gradient clipping for training stability. DeepSpeed also has this setting;
# it is repeated here for consistency.
max_grad_norm: 1.0

# Precision, attention backend, and memory
flash_attn: sdpa  # PyTorch built-in SDPA (no GLIBC dependency)
bf16: true
gradient_checkpointing: true

# DeepSpeed config file, used for memory efficiency (helps with OOM on longer sequences)
deepspeed: SFT/deepspeed_zero3.json

# Evaluation Settings
do_eval: true
eval_strategy: steps
eval_steps: 132  # Evaluate every ~10% of epoch (1,319 total steps / 10)
per_device_eval_batch_size: 2  # Reduced from 4 due to longer sequences
# Number of evaluation steps whose outputs are accumulated before being moved
# to the CPU (per Hugging Face TrainingArguments). It does not change the
# evaluation batch size.
eval_accumulation_steps: 4
# Best-checkpoint selection: the lowest eval_loss wins. eval_steps matches
# save_steps (both 132), so every evaluation has a corresponding checkpoint.
load_best_model_at_end: true  # Ensures final saved model is the best one
metric_for_best_model: eval_loss
greater_is_better: false

# Note: With 168,880 training samples and effective batch size of 128:
# Effective batch size = per_device_train_batch_size * gradient_accumulation_steps * num_GPUs
# For 8 GPUs:   gradient_accumulation_steps = 8  → 2 * 8 * 8 = 128
# For 64 GPUs:  gradient_accumulation_steps = 1  → 2 * 1 * 64 = 128
# For 128 GPUs: gradient_accumulation_steps = 1  → 2 * 1 * 128 = 256 (exceeds the 128 target; set per_device_train_batch_size to 1 → 1 * 1 * 128 = 128)
# Total steps = 168,880 / 128 = ~1,319 steps per epoch
# Memory optimization: Using DeepSpeed ZeRO-3 (per the config filename SFT/deepspeed_zero3.json) to reduce optimizer/gradient/parameter memory, allowing batch_size=2
# Token statistics: mean=7,300, median=7,261, max=9,784 (all covered by cutoff_len=16384)

# Logging and checkpointing
report_to: tensorboard
logging_steps: 20  # Log ~66 times per epoch for good monitoring

# Checkpoint every 132 steps: ~10 saves per epoch (1,319 steps / 10)
save_strategy: steps
save_steps: 132
save_total_limit: 3

# MODIFY THIS: Point to your checkpoint output directory
output_dir: <YOUR_CHECKPOINT_DIR>/lora_training_inspiration_retrieval
