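# Demo configuration for Qwen2.5-7B-Instruct
# This configuration targets the Qwen2.5-7B-Instruct model; several settings
# are reduced for demo runs (see the "Reduced for demo" notes below).
# Requires significant computational resources (a GPU with at least ~16 GB of
# VRAM is recommended).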

# Model configuration
# Identifier of the model this run steers.
model_name: "Qwen/Qwen2.5-7B-Instruct"
# Layer whose activations are captured.
target_layer: 18
# Layer at which the steering vector is applied.
source_layer: 4
# Injection point inside the source layer: the MLP down projection.
source_submodule: "mlp.down_proj"
# Capture point inside the target layer: the MLP down projection.
target_submodule: "mlp.down_proj"

# Training hyperparameters
# Number of optimisation steps. Reduced for demo (original: 5000).
n_training_steps: 2000
# Written with an explicit mantissa dot on purpose: YAML 1.1 loaders such as
# PyYAML resolve a bare `1e-3` to the string "1e-3" instead of a float, while
# `1.0e-3` is a float under both YAML 1.1 and YAML 1.2.
learning_rate: 1.0e-3
# Number of steering vectors. Reduced for demo (original: 256).
n_vectors: 64
# NOTE(review): presumably the fraction of n_training_steps spent in learning
# rate warmup; confirm against the trainer.
warmup_steps_ratio: 0.1

# Numerical stability
# Written with an explicit mantissa dot on purpose: YAML 1.1 loaders such as
# PyYAML resolve a bare `1e-6` to the string "1e-6" instead of a float, while
# `1.0e-6` is a float under both YAML 1.1 and YAML 1.2.
epsilon: 1.0e-6
# Mean over last 5 tokens.
# NOTE(review): a negative integer is often read as a single position (5th
# from the end); confirm the consumer treats -5 as "the last 5 tokens".
token_idxs: -5

# Loss function weights
# Weight for the magnitude loss (set to 0.0 in this configuration).
alpha: 0.0
# NOTE(review): alpha_p and alpha_q are not documented in this file;
# presumably they parameterise the magnitude loss. Confirm against the loss
# implementation.
alpha_p: 2.0
alpha_q: 2.0
# Weight for the diversity loss.
beta: 1.0
# Which diversity loss variant to use.
diversity_loss_type: "ntxent"
# Weight for the orthogonality loss (set to 0.0 in this configuration).
lambda_: 0.0
# Temperature for the diversity loss.
tau: 0.1
# Which orthogonality penalty variant to use.
orthogonality_style: "cosine_offdiag"

# Vector constraints
# Maximum L2 norm for steering vectors.
radius: 50.0
# NOTE(review): how normalisation interacts with `radius` is not visible in
# this file; confirm against the training code.
normalize_steering_vectors: true

# Reproducibility
# Random seed for the run.
seed: 42

# Data configuration
# Training text files, one JSON file per entry.
train_texts_files:
- "data/train/generate_refusal.json"
- "data/train/generate_sycophancy.json"
- "data/train/generate_coordinate-other-ais.json"
- "data/train/generate_corrigible-neutral-HHH.json"
- "data/train/generate_hallucination.json"
- "data/train/generate_myopic-reward.json"
- "data/train/generate_survival-instinct.json"
# Reduced for demo (original: 24).
batch_size: 8
# Reduced for demo (original: 12).
num_vectors_per_batch: 4
# NOTE(review): presumably the maximum tokenised length per text; confirm
# against the data loader.
max_length: 128

# Output configuration
# Directory that receives this run's outputs (path relative to the working
# directory).
output_dir: "./outputs/qwen_demo"

# Device configuration
# The Qwen model requires CUDA.
device: "cuda"

# Validation configuration
# Validation text files, one JSON file per entry.
val_texts_files:
- "data/validation/refusal_test_dataset_open_ended.json"
# Validate every 250 steps.
val_frequency: 250
# Small validation subset.
val_num_samples: 8
# NOTE(review): presumably seeds the selection of the validation subset;
# confirm against the validation code.
val_subset_seed: 42
save_generations: true
generate_during_training: true

# Generation parameters
val_max_new_tokens: 128
# NOTE(review): with val_do_sample set to false below, val_temperature and
# val_top_p presumably have no effect (sampling-only knobs in most generation
# APIs); confirm how the validation code forwards them.
val_temperature: 0.7
val_top_p: 0.95
# Deterministic for reproducibility.
val_do_sample: false
val_vectors_per_call: 16

# Weights & Biases logging
wandb_project: "steerclr-qwen-demo"
# Entity and run name are left unset (null) in this configuration.
wandb_entity: null
wandb_run_name: null
# Tags attached to the run.
wandb_tags:
- "qwen2.5-7b"
- "steerclr"
- "demo"
