### CONFIGURATION FOR HYPOTHESIS COMPOSITION TRAINING ###
### R1-Distill Native Format: No system prompt, reasoning starts directly ###
### Dataset: 167,691 train samples (rejection sampling with reranker score >= 0.7) ###

# Base model checkpoint (local path) and the chat template applied to it.
model_name_or_path: /pfs/training-data/hf/models/deepseek-ai/DeepSeek-R1-Distill-Qwen-7B
template: deepseekr1  # Template for R1-Distill models (adds <think> to the prompt)

# MODIFY THIS: point dataset_dir at your dataset directory.
# It should contain the HC SFT data produced by rejection sampling
# (data flow: HC_GENERATION_DIR/*/sft_data from main.sh).
dataset_dir: <YOUR_HC_SFT_DATA_DIR>/
dataset: train  # Mapped to sft_llama_factory_*.jsonl in dataset_info.json
# eval_dataset: eval  # Left disabled: no eval set for rejection sampling data

# LoRA Configuration (tuned for a complex reasoning task)
finetuning_type: lora
# Alternative (disabled): same as the active values below except dropout 0.1.
# lora_rank: 256  # High rank for complex reasoning
# lora_alpha: 512  # 2x rank
# lora_dropout: 0.1
# lora_target: all
# Alternative (disabled): smaller, attention-only adapter.
# lora_rank: 128  # Reduced rank to prevent overfitting
# lora_alpha: 256
# lora_dropout: 0.1
# lora_target: ["q_proj", "v_proj", "k_proj", "o_proj"]  # Target attention layers only
# Active settings:
lora_rank: 256  # High rank for complex structured reasoning
lora_alpha: 512  # 2x the rank
lora_dropout: 0.05  # Lower dropout given the large dataset (167,691 train samples)
lora_target: all  # Include MLP layers for knowledge adaptation

# Training Setup
stage: sft
do_train: true
# NOTE(review): presumably restricts the loss to the final turn of each
# conversation; verify against the trainer's documentation.
mask_history: true
# false leaves shuffling enabled; set to true to keep the original data order.
# NOTE(review): confirm which behavior is intended for this run.
disable_shuffling: false
seed: 42  # Fixed random seed for reproducibility

# Sequence and Batching
# Effective batch = per_device_train_batch_size * gradient_accumulation_steps * num_GPUs.
cutoff_len: 8192  # Intended to cover every sample; re-verify the max token length when the dataset changes
packing: false
per_device_train_batch_size: 1  # Reduced from 2 to prevent OOM/SIGSEGV
gradient_accumulation_steps: 2  # 128 GPUs: 1 * 2 * 128 = 256 effective batch (use 4 on 64 GPUs)

# Preprocessing (parallel tokenization)
preprocessing_num_workers: 32  # Number of worker processes (CPU cores) used for tokenization
# MODIFY THIS: point to your tokenized cache directory (optional; speeds up repeated runs)
tokenized_path: <YOUR_TOKENIZED_CACHE_DIR>/hypothesis_composition_lora

# Optimization
learning_rate: 0.00003  # 3e-5: balanced for LoRA with a large batch
lr_scheduler_type: cosine
warmup_ratio: 0.1  # 10% of training; roughly 65 of the ~655 steps at effective batch 256
# No weight decay for the single-epoch run.
weight_decay: 0.0
# Single epoch to limit overfitting; the same data is also used for DPO.
num_train_epochs: 1
max_grad_norm: 1.0  # Gradient-norm clipping threshold

# Precision and memory settings
bf16: true
gradient_checkpointing: true
flash_attn: sdpa  # Use PyTorch's built-in SDPA attention (no GLIBC dependency)

# DeepSpeed configuration file, used for memory efficiency.
# NOTE(review): the path is relative, presumably to the directory the job is
# launched from; verify before running from a different location.
deepspeed: SFT/deepspeed_zero2.json

# Evaluation Settings (disabled: there is no eval set for rejection sampling data)
do_eval: false
# To enable evaluation, set do_eval to true, provide an eval_dataset, and
# uncomment the settings below.
# eval_strategy: steps
# eval_steps: 65
# per_device_eval_batch_size: 4
# eval_accumulation_steps: 2
# load_best_model_at_end: true
# metric_for_best_model: eval_loss
# greater_is_better: false

# Note: With 167,691 training samples and effective batch size of 256:
# For 128 GPUs: 1 (per_device) * 2 (gradient_accumulation) * 128 (GPUs) = 256
# For 64 GPUs:  1 (per_device) * 4 (gradient_accumulation) * 64 (GPUs) = 256
# Steps per epoch = 167,691 / 256 = ~655 steps
# Total steps (1 epoch) = ~655 steps

# Saving and Logging
save_strategy: steps
save_steps: 65  # ~10 checkpoints per epoch (~655 steps / 10)
save_total_limit: 3  # Keep only the 3 most recent checkpoints to save space
logging_steps: 20  # Log every 20 steps
report_to: tensorboard

# MODIFY THIS: point to the directory where checkpoints should be written
output_dir: <YOUR_CHECKPOINT_DIR>/lora_training_hypothesis_composition
