# Configuration for SteerCLR Demo
# This is a lightweight configuration suitable for demonstration and quick experimentation
# For production runs, increase n_training_steps, n_vectors, and use larger datasets

# Model and hook-point settings
# Small model for the demo; can be swapped for a larger one
model_name: "microsoft/DialoGPT-small"
# Layer and submodule where steering is injected (MLP projection)
source_layer: 2
source_submodule: "mlp.c_proj"
# Layer and submodule where activations are captured (MLP projection);
# the layer index is adjusted for the small model
target_layer: 8
target_submodule: "mlp.c_proj"

# Training hyperparameters (reduced for demo)
n_training_steps: 500   # Quick demo training
# Written with an explicit "1.0" mantissa: YAML 1.1 loaders such as PyYAML
# resolve a bare `1e-3` (no decimal point) as a string rather than a float.
learning_rate: 1.0e-3
n_vectors: 16           # Small number of vectors for demo
# NOTE(review): presumably the fraction of n_training_steps spent on warmup;
# confirm against the trainer.
warmup_steps_ratio: 0.1

# Numerical stability
# Written with an explicit "1.0" mantissa: YAML 1.1 loaders such as PyYAML
# resolve a bare `1e-6` (no decimal point) as a string rather than a float.
epsilon: 1.0e-6
token_idxs: -3          # Mean over last 3 tokens

# Loss-term weights and related settings
# Magnitude loss is switched off for the demo
alpha: 0.0
alpha_p: 2.0
alpha_q: 2.0
# The demo focuses on the diversity term
beta: 1.0
diversity_loss_type: "ntxent"
# Orthogonality loss is switched off for the demo
lambda_: 0.0
tau: 0.1
orthogonality_style: "cosine_offdiag"

# Constraints applied to the steering vectors
# Radius is kept smaller for the demo
radius: 10.0
normalize_steering_vectors: true

# Seed used for reproducibility
seed: 42

# Data settings
# Training text files (labelled as the full datasets).
# NOTE(review): the file header suggests larger datasets are for production
# runs, yet the full set is listed here; confirm this is intended for the demo.
train_texts_files:
- "data/train/generate_refusal.json"
- "data/train/generate_sycophancy.json"
- "data/train/generate_coordinate-other-ais.json"
- "data/train/generate_corrigible-neutral-HHH.json"
- "data/train/generate_hallucination.json"
- "data/train/generate_myopic-reward.json"
- "data/train/generate_survival-instinct.json"
# Batch size is kept small for the demo
batch_size: 4
num_vectors_per_batch: 4
max_length: 128

# Output settings
output_dir: "./outputs/demo"

# Device settings
# "auto" auto-detects the best available device
device: "auto"

# Validation settings
val_texts_files:
- "data/validation/refusal_test_dataset_open_ended.json"
# Validation runs frequently in the demo
val_frequency: 100
# Keeps the validation set small
val_num_samples: 5
val_subset_seed: 42
save_generations: true
generate_during_training: true

# Parameters for generation (val_* keys)
val_max_new_tokens: 64
val_do_sample: true
val_temperature: 0.7
val_top_p: 0.95
val_vectors_per_call: 8

# Logging to Weights & Biases
wandb_project: "steerclr-demo"
wandb_entity: null
wandb_run_name: null
wandb_tags:
- "demo"
- "steerclr"
- "unsupervised"