{
    "dataset": "slim_6b-all",
    "train_domains": "arxiv,book,cc,c4,github,stackexchange,wikipedia",
    "tgt_domains": "arxiv,book,cc,c4,github,stackexchange,wikipedia",
    "train_dw": "0.1428,0.1428,0.1428,0.1428,0.1428,0.1428,0.1428,0",
    "val_dw": "0.1428,0.1428,0.1428,0.1428,0.1428,0.1428,0.1428,0",
    "max_train_samples": null,
    "max_eval_samples": 20000,
    "max_downstream_samples": null,
    "max_token_length": 512,
    "seed": 42,
    "preprocessing_num_workers": 1,
    "model_name_or_path": null,
    "model_type": "gpt2",
    "config_overrides": "n_positions=512,n_embd=768,n_layer=36,n_head=24",
    "run_name": "BASE_82M",
    "output_dir": "/fs/scratch/PAS1687/GPT2_checkpoints/gpt2_large/extremes",
    "do_train": true,
    "do_eval": true,
    "do_predict": false,
    "learning_rate": 5e-4,
    "weight_decay": 1e-2,
    "reweight_domains": false,
    "doremi": false,
    "ref_model": null,
    "lr_scheduler_name": "linear_warmup_cosine",
    "lr_end": 1e-4,
    "reweight_eps": 0.0,
    "mu": 0.01,
    "dw_max": 5.0,
    "dw_min": 0.0,
    "max_grad_norm": 1.0,
    "per_device_train_batch_size": 12,
    "warmup_ratio": 0.05,
    "warmup_steps": 500,
    "max_steps": 20000,
    "save_steps": 2000,
    "eval_steps": 2000,
    "gradient_accumulation_steps": 1,
    "save_strategy": "steps",
    "evaluation_strategy": "steps",
    "logging_steps": 10,
    "save_total_limit": 10,
    "ddp_find_unused_parameters": false,
    "downstream_num_shots": 5,
    "downstream_datasets": null,
    "eval_all_checkpoints": false,
    "skip_perplexity_eval": false,
    "use_cpu": false,
    "ddp_backend": "nccl",
    "compute_pertoken_losses": false,
    "overwrite_output_dir": true,
    "local_rank": -1,
    "domain_update_per_iter": null,
    "reweight_samples": true,
    "kl_reg": 0.4,
    "reweight_strategy": "extremes",
    "burnout_steps": 1000
}