# Joint Training Configuration
# Combines all 4 experiences (apibench, mllm, hugging-bench-1, hugging-bench-2)
# into one dataset for upper bound baseline in continual learning
# Run identity and output location.
experience_name: joint  # Will be set to "joint" automatically in joint training mode
variant_name: joint_training_upper_bound
extra_info: ''
output_root: cco/experiments
# Base model, retrieval, and prompt formatting.
repo_id: huggyllama/llama-7b
retriever: null
system_prompt: ''
system_prompt_format: gorilla_prompt
# Core training-loop settings.
epochs: 5
# NOTE(review): batch_size 16 x grad_accum 8 = 128 examples per optimizer step
# if batch_size is per device on a single device — confirm against the trainer.
batch_size: 16
grad_accum: 8
lr: 0.0005
max_length: 550
max_grad_norm: 1.0
# Batching and loss-shaping options.
packing: false
group_by_length: true
completion_only_loss: true
label_smoothing: 0.05
# LoRA adapter settings (lora_alpha is 2x lora_r here).
lora_r: 32
lora_alpha: 64
lora_dropout: 0.05
# Module names the adapters are attached to.
target_modules:
- q_proj
- k_proj
- v_proj
- o_proj
- gate_proj
- down_proj
- up_proj
# No checkpoint to resume from and no pre-existing adapters listed.
resume_from: null
lora_adapters: []
# Early stopping and validation.
early_stopping_patience: 3
early_stopping_threshold: 0.01
# NOTE(review): `true` presumably turns validation off, yet the intent recorded
# for this key was to keep validation for joint training, and the early
# stopping settings above plus metric_for_best_model: eval_loss below look
# like they need an eval pass — confirm whether `false` was intended.
no_validation: true
hyperparameters_search: false
eval_at_step0: false
# Optimizer and learning-rate schedule.
weight_decay: 0.001
warmup_steps: 10
lr_scheduler_type: linear
optim: adamw_torch
# Logging and checkpointing.
logging_steps: 5
save_strategy: epoch
save_total_limit: 1
# Lower eval_loss is treated as better (greater_is_better: false).
metric_for_best_model: eval_loss
greater_is_better: false
# Memory-related switches.
activation_checkpointing: true
low_memory_mode: true
use_quantization: false
# Preference-training settings; preference_mode is false, so the values below
# are presumably unused in this run — confirm.
preference_mode: false
negative_sampling_strategy: mixed
num_rejections_per_example: 2
orpo_beta: 0.1
max_prompt_length: 512

# Joint training mode - combines all 4 experiences
joint_training: true

# Replay is disabled in joint training (all data is available).
# NOTE(review): no explicit replay on/off switch is visible in this file; the
# replay_* values below are presumably ignored when joint_training is true —
# confirm against the training code.
replay_strategy: random
replay_min_per_domain: 5
replay_max_per_domain: null
replay_max_per_model: 3
replay_embedding_source: flagembedding

# Neighbor consistency disabled for joint training (not needed).
# The neighbor_* values below are kept but presumably inactive while
# use_neighbor_consistency is false — confirm.
use_neighbor_consistency: false
neighbor_k: 3
neighbor_consistency_weight: 0.1
neighbor_consistency_temperature: 1.0
neighbor_source: apibench
neighbor_domain_filter_mode: strict
neighbor_domain_bias: 0.2
neighbor_min_same_domain: 1
neighbor_replay_only: false
neighbor_replay_similarity_threshold: 0.95
neighbor_max_consistency_samples: 4
neighbor_consistency_num_tokens: 5
neighbor_embedding_model: flagembedding
# Neighbor contrastive loss, also switched off.
use_neighbor_contrastive: false
neighbor_contrastive_weight: 0.1
neighbor_contrastive_k: 3
neighbor_contrastive_num_negatives: 3
neighbor_contrastive_max_anchors_per_batch: 4
neighbor_contrastive_loss_type: softplus
neighbor_contrastive_margin: 0
neighbor_contrastive_apply_to: replay_only
# X-CLR settings. use_xclr is false, so the xclr_* values below are presumably
# inactive in this run — confirm.
use_xclr: false
xclr_weight: 0.2
xclr_tau: 0.1
xclr_tau_target: 0.1
xclr_graph_mode: taxonomy
xclr_domain_alpha: 0.5
xclr_apply_to: replay_only
xclr_proj_dim: 256
xclr_max_anchors_per_batch: 8
xclr_log_every: 100
xclr_modelcard_encoder: all-mpnet-base-v2
xclr_modelcard_blend: 0.0
# NOTE(review): debug output is enabled even though use_xclr is false.
xclr_debug: true
xclr_debug_log_every: 10
xclr_debug_max_print: 1
# Candidate sampling and target graph.
xclr_num_candidates: 31
xclr_min_pos: 1
xclr_candidate_sampling: prompt_similarity
xclr_target_graph: prompt_similarity
xclr_promptsim_k: 3
xclr_promptsim_retriever_type: flagembedding
xclr_promptsim_exclude_self: true
xclr_promptsim_retrieve_over_all: true
# Queue settings (xclr_queue_use is false).
xclr_queue_size: 2048
xclr_queue_use: false
xclr_queue_device: cpu
xclr_num_queue_candidates: 256
xclr_force_positive_from_queue: false
xclr_use_two_stream_sampler: false
xclr_replay_per_batch: 4
xclr_proj_lr: null
xclr_stopgrad_base: false
xclr_stopgrad_warmup_steps: 20
# Loss selection and weighting for the router objective.
loss_mode: router
router_loss_weight: 1.0
lm_loss_weight: 1.0
router_embedding_dim: 1024
router_tau: 0.08
router_pooling: last_token
# Learning rates carry an explicit decimal point: YAML 1.1 loaders such as
# PyYAML resolve a bare `3e-4` as the string "3e-4" rather than a float.
router_proj_lr: 3.0e-4
router_embedding_lr: 5.0e-5
# Router candidate-set sizes.
# NOTE(review): K_semantic + K_far + K_hard = 38 + 10 + 15 = 63, one less than
# router_K_total (64); presumably the remaining slot is the positive — confirm.
router_K_total: 64
router_K_semantic: 38
router_K_far: 10
router_K_hard: 15
router_mine_every_steps: 100
router_K_hard_pool: 50
# Candidate pool construction.
router_semantic_pool_size: 1024
router_max_pool_size: 2048
router_semantic_pool_mode: parent_group
router_semantic_pool_max_domains: 8
router_semantic_pool_depth: 1
# Soft targets are enabled.
router_use_soft_targets: true
router_soft_target_eps: 0.02
router_soft_target_k_neighbors: 10
# Label-graph regularization is switched off; its parameters are kept.
router_use_label_graph_reg: false
router_label_graph_lambda: 0.1
router_label_graph_tau: 0.07
router_label_graph_tau_target: 0.1
router_label_graph_max_models: 256
router_label_graph_alpha_domain: 0.5
# Batch composition.
semantic_batching: true
domains_per_batch: 2
# Registry starts fresh with no path to an existing one.
router_registry_path: null
router_registry_init_mode: fresh  # Use "fresh" for joint training (no previous registry)
router_registry_base_path: null  # No base registry for joint training

# Two-phase training disabled for joint training (not needed).
# The phase-1 values below are kept but presumably inactive while
# router_two_phase_enable is false — confirm.
router_two_phase_enable: false
router_phase1_frac: 0.4
router_phase1_loss_mode: router
router_phase1_replay_ratio: null
router_phase1_router_loss_weight: 1.0
router_phase1_lm_loss_weight: 0.0
# Learning rates carry an explicit decimal point: YAML 1.1 loaders such as
# PyYAML resolve a bare `1e-4` as the string "1e-4" rather than a float.
router_phase1_proj_lr: 1.0e-4
router_phase1_embedding_lr: 5.0e-5
router_phase1_use_soft_targets: false
router_phase1_soft_target_eps: 0.02
router_replay_loss_multiplier: 5.0
router_exp1_preservation_enable: false
router_exp1_preservation_M_old: null

# Embedding anchoring: off for joint training; parameters are kept as-is.
router_anchor_enable: false
router_anchor_lambda: 10000.0
router_anchor_mode: normalized
router_anchor_apply_phase: phase1
router_anchor_scope: all_old
router_anchor_M_old: null

# Projection anchoring: likewise off for joint training.
router_proj_anchor_enable: false
router_proj_anchor_lambda: 50000.0
router_proj_anchor_apply_phase: phase1

router_freeze_lm: true
