{
    "model": {
        "base_model": "Qwen/Qwen3-1.7B",
        "teacher_model": "Jacaranda/UlizaLlama3",
        "include_response": false,
        "is_do_alignment": true,
        "alignment_strategy": "longest",
        "projector": {
            "type": "AllInOneProjector",
            "params": {
                "hidden_dim": 1024,
                "num_layers": 3,
                "dropout": 0.1,
                "activation": "gelu",
                "use_layer_norm": true,
                "use_residual": true,
                "use_swiglu": true,
                "use_concat": true,
                "gate_granularity": "scalar",
                "gate_depends_on_input": false,
                "gate_input_features": "target_key",
                "gate_init_value": 0.0,
                "weight_granularity": "head",
                "weight_depends_on_input": true,
                "weight_input_features": "target_projected_key",
                "weight_init_value": 0.0,
                "use_gumbel": true,
                "initial_temperature": 1.0,
                "final_temperature": 0.001,
                "preserve_target_weight": false,
                "anneal_steps": 149,
                "scalar_temperature": 1.0,
                "max_sequence_length": 8192
            }
        },
        "mapping": "last_aligned"
    },
    "training": {
        "learning_rate": 3e-4,
        "weight_decay": 0.01,
        "num_epochs": 1,
        "max_length": 32768,
        "device": "cuda",
        "scheduler_type": "linear",
        "warmup_ratio": 0.1,
        "max_grad_norm": 1.0,
        "gradient_accumulation_steps": 2,
        "per_device_train_batch_size": 8,
        "num_processes": 8,
        "freeze": ["teacher", "base"],
        "seed": 42
    },
    "output": {
        "output_dir": "local/checkpoints/multi_lingual_mmmlu",
        "save_steps": 6000,
        "eval_steps": 6000,
        "wandb_config": {
            "project": "Rosetta",
            "mode": "online",
            "run_name": "multi_lingual_0.7_qwen3_1.7B_ulizallama3"
        }
    },
    "data": {
        "type": "MMMLUChatDataset",
        "kwargs": {
            "subset": "SW_KE",
            "split": "train",
            "num_samples": null,
            "max_word_count": 512,
            "data_path": "local/teacher_datasets/MMMLU"
        },
        "train_ratio": 1.0
    }
}
