{
  # Run configuration for "obf_ds_large" (the name used in save, log_dir and
  # wandb_run_name below).
  # NOTE(review): the '#' comments and the trailing comma after the last entry
  # mean this file is not strict JSON; it needs a loader that tolerates both
  # (for example a YAML parser) - confirm which loader consumes it.

  # input data, tokenizer files and output locations
  "data_path": "/Inputs/obf_ds_content_document",
  # presumably relative train/valid/test weights, matching the three do_* flags
  # below - confirm
  "split": "950,40,10",
  "vocab_file": "/Inputs/vocab.json",
  "merge_file": "/Inputs/merges.txt",
  "tokenizer_type": "GPT2BPETokenizer",
  "save": "/Outputs/obf_ds_large/artifacts",
  "log_dir": "/Outputs/obf_ds_large/logs",
  "data_impl": "mmap",
  "num_workers": 4,
  "do_train": true,
  "do_valid": true,
  "do_test": true,

  # parallelism layout
  "pipe_parallel_size": 0,
  "model_parallel_size": 1,
  "make_vocab_size_divisible_by": 1,

  # model settings
  # hidden_size / num_attention_heads = 3072 / 24 = 128
  # seq_length and max_position_embeddings are both 2048
  "num_layers": 22,
  "hidden_size": 3072,
  "num_attention_heads": 24,
  "seq_length": 2048,
  "max_position_embeddings": 2048,
  "pos_emb": "rotary",
  "rotary_pct": 1,
  "rotary_emb_base": 1000000,
  "no_weight_tying": true,
  "gpt_j_residual": false,
  "output_layer_parallelism": "column",
  "norm": "rmsnorm",
  "rms_norm_epsilon": 1.0e-6,
  # optimizer and regularization
  "optimizer": {
    "type": "Adam",
    "params": {
      "lr": 0.0005,
      "betas": [0.9, 0.95],
      "eps": 1.0e-8
    }
  },
  # min_lr is 5% of optimizer.params.lr (0.0005 * 0.05 = 0.000025)
  "min_lr": 0.000025,
  "seed": 99246,
  "gradient_clipping": 1.0,
  "weight_decay": 0.1,
  # the count 22 equals num_layers above; presumably selects "flash" attention
  # for every layer - confirm
  "attention_config": [[["flash"], 22]],

  # bf16 is enabled here and "precision" further down is "bfloat16"
  "bf16": {
    "enabled": true
  },

  # training length, learning-rate schedule, checkpoint / eval / logging cadence
  # train_iters and lr_decay_iters are both 520000
  # checkpoint_factor and eval_interval are both 40000
  # log_interval and steps_per_print are both 1000
  "train_iters": 520000,
  "lr_decay_iters": 520000,
  "distributed_backend": "nccl",
  "lr_decay_style": "cosine",
  "use_checkpoint_lr_scheduler": true,
  # presumably a fraction of the schedule rather than an iteration count -
  # confirm
  "warmup": 0.05,
  "checkpoint_scale": "linear",
  "checkpoint_factor": 40000,
  "extra_save_iters": [5000, 10000, 20000],
  "eval_interval": 40000,
  "eval_iters": 400,
  "log_interval": 1000,
  "steps_per_print": 1000,

  # fusion flags and layer variants
  # NOTE(review): bias_gelu_fusion is true while activation is "silu" - confirm
  # the fusion flag has the intended effect (or any effect) with this activation
  "scaled_upper_triang_masked_softmax_fusion": true,
  "bias_gelu_fusion": true,
  "rope_fusion": true,
  "use_bias_in_norms": false,
  "use_bias_in_attn_linear": false,
  "mlp_type": "llama",
  "activation": "silu",

  # batch sizing and device topology
  # 4 (train_micro_batch_size_per_gpu) * 8 (gradient_accumulation_steps)
  #   * 8 (num_gpus) = 256 = train_batch_size
  # num_gpus * num_nodes = 8 * 1 = 8 = world_size = global_num_gpus
  "train_micro_batch_size_per_gpu": 4,
  "train_batch_size": 256,
  "gradient_accumulation_steps": 8,
  "master_port": 30712,
  "num_gpus": 8,
  "num_nodes": 1,
  "dump_state": true,
  "world_size": 8,
  "global_num_gpus": 8,
  "precision": "bfloat16",
  "fp32_allreduce": true,
  "attention_softmax_in_fp32": true,

  # ZeRO settings (stage 2); the key names look like a DeepSpeed
  # zero_optimization section - confirm
  # allgather_bucket_size and reduce_bucket_size are both 1500000000
  "zero_optimization": {
    "stage": 2,
    "allgather_partitions": true,
    "allgather_bucket_size": 1500000000,
    "overlap_comm": true,
    "reduce_scatter": true,
    "reduce_bucket_size": 1500000000,
    "contiguous_gradients": true,
    "cpu_offload": false,
    "round_robin_gradients": true
  },

  # experiment tracking and extra logged statistics
  # NOTE(review): "WS_NAME" looks like a placeholder - confirm it is replaced
  # with a real team name before launch
  "use_wandb": true,
  "wandb_group": "obf_ds",
  "wandb_run_name": "obf_ds_large",
  "wandb_team": "WS_NAME",
  "wandb_project": "Obscura",
  "log_grad_pct_zeros": true,
  "log_param_norm": true,
  "log_grad_norm": true,
}
