# Model section: the network, its optimizer and its LR scheduler.
# Each `target` is a dotted class path and `params` holds its arguments
# (presumably instantiated by the project's config loader; confirm).
model:
  target: ltm.models.Autoregressive
  params:
    # torch_compile: true
    # Network to train: the project-local InternLM3 causal LM, configured by
    # the nested `config` entry.
    model_config:
      # target: transformers.LlamaForCausalLM
      target: ltm.modules.transformers.internlm3.modeling_internlm3.InternLM3ForCausalLM
      params:
        config:
          target: ltm.modules.transformers.internlm3.configuration_internlm3.InternLM3Config
          # We follow MDM's Tiny model configuration, except for
          # max_position_embeddings:
          # https://github.com/HKUNLP/diffusion-vs-ar/blob/main/model_config_tiny/config.json
          # 78.4 M parameters
          # NOTE(review): the parameter count above cannot be derived from this
          # file alone; re-verify it against the values below.
          params:
            vocab_size: 8000
            # NOTE(review): the original inline note here read "5M", which
            # presumably refers to the 5_000_000 `max_length` under
            # data.params rather than to this value; confirm.
            max_position_embeddings: 32768
            hidden_size: 384
            num_attention_heads: 12
            num_hidden_layers: 3
            # BOS, EOS and PAD all share token id 0.
            bos_token_id: 0
            eos_token_id: 0
            pad_token_id: 0
            rope_theta: 50000000
            # Rotary position embedding scaling: type `dynamic`, factor 6.0.
            rope_scaling:
              factor: 6.0
              rope_type: dynamic
            # Presumably selects the scaled-dot-product attention backend; confirm.
            _attn_implementation: sdpa
    optimizer_config:
      target: torch.optim.AdamW
      params:
        # Keep the `.` in the mantissa of the float literals below: YAML 1.1
        # loaders such as PyYAML read `5e-4` (no dot) as a string, not a float.
        lr: 5.e-4
        betas: [0.9, 0.999]
        eps: 1.e-08
        weight_decay: 0.01
    scheduler_config:
      name: cosine
      # Warmup is commented out, so no `num_warmup_steps` is passed from here.
      # num_warmup_steps: 2000
# Data section: data module, tokenizer and per-sample transform.
data:
  target: ltm.data.cd.CountDownDataModule
  params:
    train_split: cd4_train
    test_split: cd4_test
    # Underscores are digit separators. YAML 1.1 loaders (e.g. PyYAML) resolve
    # this as the integer 5000000; a strict YAML 1.2 core-schema loader would
    # read it as a string. Confirm which loader the project uses.
    max_length: 5_000_000
    # Presumably the fraction of the training split held out for validation;
    # verify against CountDownDataModule.
    val_size: 0.1
    num_workers: 8
    batch_size: 1
    tokenizer_config:
      target: transformers.AutoTokenizer
      params:
        # Placeholder-looking absolute path: point it at the local `cd-8k`
        # tokenizer directory before running.
        pretrained_model_name_or_path: /path/to/home/workspace/code/lltm/training/data/ltm_tokenizers/cd-8k
        pad_token: "<|endoftext|>" # set pad token to eos token
        # Same digit-separator caveat as `max_length` above.
        model_max_length: 1_000_000_000
    transform_config:
      target: ltm.data.transforms.PytracifyTransformForCD
      params:
        # Truncation is disabled for this transform.
        do_truncation: false
        solver_name_or_path: cd
        trace_formatter: numeric_depth
# Lightning section: distributed strategy, checkpointing and Trainer arguments.
lightning:
  strategy:
    target: lightning.pytorch.strategies.DDPStrategy
  # Checkpoint policy: track `val/loss` (lower is better), keep the two best
  # checkpoints and also save the most recent one.
  modelcheckpoint:
    monitor: val/loss
    mode: min
    save_top_k: 2
    save_last: true
    verbose: true
  trainer:
    # This plain scalar loads as the single string "0,1,2,3,4,5,6,7", not as a
    # YAML list. Presumably the Trainer parses it into eight device indices;
    # confirm.
    devices: 0,1,2,3,4,5,6,7
    max_epochs: 100
    precision: 16-mixed
    gradient_clip_val: 1.0
    accumulate_grad_batches: 1
