### CONFIGURATION FOR FULL PARAMETER TRAINING ###
### Key fixes: ZeRO-3 + packing disabled + lower max_grad_norm (0.5) to stabilize loss ###
### R1-Distill Native Format: No system prompt, reasoning starts directly ###
### Dataset: 114,548 train samples (mixed normal + bounded x1, 1 epoch recommended) ###
### Previous: 132,217 train samples (mixed normal + bounded x2, 1 epoch recommended) ###

# Base checkpoint (local path) and the chat template used to format prompts.
# deepseekr1 is the template intended for R1-Distill models; it adds <think>
# to the prompt.
template: deepseekr1
model_name_or_path: /pfs/training-data/hf/models/deepseek-ai/DeepSeek-R1-Distill-Qwen-7B

# MODIFY THIS: directory holding the mixed-composition SFT data.
# Data flow: produced by main.sh under HC_GENERATION_DIR/mixed_composition_sft/.
dataset_dir: <YOUR_HC_SFT_DATA_DIR>/
# Name of the training split to load from dataset_dir.
dataset: train
# No eval split: the data comes from rejection sampling, so none is configured.
# eval_dataset: eval

# Fine-tuning mode: update all parameters (nothing is frozen).
finetuning_type: full

# Run setup: supervised fine-tuning with the training loop enabled.
do_train: true
stage: sft
seed: 42                           # Fixed RNG seed
disable_shuffling: false           # Training-set shuffling stays enabled
# NOTE(review): presumably restricts the loss to the final turn of each
# conversation - confirm against the training framework's documentation.
mask_history: true

# Sequence length and batch geometry. Packing stays off to reduce memory pressure.
# Effective batch on 64 GPUs: 1 (per device) * 2 (accumulation) * 64 (GPUs) = 128.
per_device_train_batch_size: 1     # Kept conservative for full-parameter training under ZeRO-3
gradient_accumulation_steps: 2
cutoff_len: 8192
packing: false                     # Off: packing combined with ZeRO-3 caused OOM / illegal memory access

# Preprocessing: tokenization runs in parallel across worker processes.
# MODIFY THIS: tokenized-cache directory (optional; speeds up repeated runs).
tokenized_path: <YOUR_TOKENIZED_CACHE_DIR>/hypothesis_composition
preprocessing_num_workers: 32


# Optimizer and schedule - deliberately conservative for full-parameter stability.
num_train_epochs: 1                # Single epoch to avoid overfitting
learning_rate: 1.0e-5              # Standard value for full-parameter training
lr_scheduler_type: cosine
warmup_ratio: 0.05                 # First 5% of steps are warmup
weight_decay: 0.01                 # Slightly higher, for regularization
max_grad_norm: 0.5                 # Lowered clipping threshold to keep aggressive gradients from exploding

# Numerics and memory: train in bfloat16 (fp16 explicitly off) with
# gradient checkpointing enabled.
fp16: false
bf16: true
gradient_checkpointing: true
# Attention backend: PyTorch's built-in SDPA, chosen because it has no GLIBC dependency.
flash_attn: sdpa

# DeepSpeed config (ZeRO stage 3) - found more stable than ZeRO-2 for
# full-parameter training.
# NOTE(review): this is a relative path; presumably resolved against the
# directory the launcher is started from - confirm before running elsewhere.
deepspeed: SFT/deepspeed_zero3.json

# Evaluation is turned off: the rejection-sampling data has no eval set.
# The settings below are kept commented out for when an eval split exists.
# eval_strategy: steps
# eval_steps: 65
# per_device_eval_batch_size: 2
# eval_accumulation_steps: 4
# load_best_model_at_end: true
# metric_for_best_model: eval_loss
# greater_is_better: false
do_eval: false

# Note: With 114,548 training samples and effective batch size of 128:
# For 64 GPUs: 1 (per_device) * 2 (gradient_accumulation) * 64 (GPUs) = 128
# Steps per epoch = 114,548 / 128 = ~895 steps
# Total steps (1 epoch) = ~895 steps
# Previous (132k 2x): Steps per epoch = 132,217 / 128 = ~1032 steps

# Saving and Logging
# Checkpoint cadence targets ~10 saves per epoch. The current dataset
# (114,548 samples at an effective batch of 128) gives ~895 optimizer steps
# per epoch, so save_steps = 895 / 10 ~= 90.
# Values used with other dataset sizes, kept for reference:
# save_steps: 76   # 757 steps / 10
# save_steps: 103  # 1032 steps / 10 (previous 132,217-sample dataset)
save_strategy: steps
save_steps: 90

save_total_limit: 3  # Cap on the number of checkpoints retained
logging_steps: 1     # Log the loss at every step, for debugging
report_to: tensorboard

# MODIFY THIS: directory where training checkpoints are written.
output_dir: <YOUR_CHECKPOINT_DIR>/full_param_training_hypothesis_composition
