# Repository for Approximately Normalized Transformer

### Installation

Requires CUDA 12.3+ and an NVIDIA GPU (Ampere or later).

```bash
bash install_env.sh
```

### Data Preparation

```bash
mkdir data
python anGPT/data/preprocess_data.py
```

### Run 32M experiments

Run the following command to run the GPT+ 32M experiment. The results will be saved in the `experiments` directory. We
expect one node with 4 A100 40GB GPUs. The runtime is under 30 minutes.

```bash
mkdir experiments
accelerate launch --config_file scripts/accelerate.yaml --main_process_ip $MASTER_ADDR  train_model.py -c default_config.yaml "experiment.session_name=NeurIPS25_OWT_32M
experiment.experiment_name=GPT+_32M_OWT_0B6_bs64_adamW_wd01_lr-0.01
train.max_steps=10000
optim.lr=0.01
optim.betas=(0.9,0.99)
optim.scheduler.num_warmup_steps=2000
optim.weight_decay=0.1
model.mode=GPT2
model.n_layer=6
model.n_head=4
model.n_embd=256
model.base_scale=0.02
model.scaled_projection=true
model.explicit_norm=false
model.explicit_norm_bounded=false
model.alpha_correction=true
model.qk_norm=true
data_base.batch_size=16"
```

Run the following command to run the GPT+ 32M experiment without QK norm.

```bash
accelerate launch --config_file scripts/accelerate.yaml --main_process_ip $MASTER_ADDR  train_model.py -c default_config.yaml "experiment.session_name=NeurIPS25_OWT_32M
experiment.experiment_name=GPT+_NoQKnorm_32M_OWT_0B6_bs64_adamW_wd01_lr-0.01
train.max_steps=10000
optim.lr=0.01
optim.betas=(0.9,0.99)
optim.scheduler.num_warmup_steps=2000
optim.weight_decay=0.1
model.mode=GPT2
model.n_layer=6
model.n_head=4
model.n_embd=256
model.base_scale=0.02
model.scaled_projection=true
model.explicit_norm=false
model.explicit_norm_bounded=false
model.alpha_correction=true
model.qk_norm=false
model.post_norm=true
data_base.batch_size=16"
```

Run the following command to run the nGPT 32M experiment.

```bash
accelerate launch --config_file scripts/accelerate.yaml --main_process_ip $MASTER_ADDR  train_model.py -c default_config.yaml "experiment.session_name=NeurIPS25_OWT_32M
experiment.experiment_name=nGPT_32M_OWT_0B6_bs64_adamW_wd01_lr-0.0046
train.max_steps=10000
optim.lr=0.0046
optim.betas=(0.9,0.99)
optim.scheduler.num_warmup_steps=1
optim.weight_decay=0.0
model.mode=nGPT
model.n_layer=6
model.n_head=4
model.n_embd=256
model.base_scale=0.0625
model.scaled_projection=true
model.explicit_norm=true
model.explicit_norm_bounded=false
model.out_norm_dim_0=true
model.alpha_correction=false
model.qk_norm=true
model.post_norm=true
data_base.batch_size=16"
```

Run the following command to run the anGPT 32M experiment.

```bash
accelerate launch --config_file scripts/accelerate.yaml --main_process_ip $MASTER_ADDR  train_model.py -c default_config.yaml "experiment.session_name=NeurIPS25_OWT_32M
experiment.experiment_name=anGPT_32M_OWT_0B6_bs64_adamW_wd01_lr-0.0046
train.max_steps=10000
optim.lr=0.0046
optim.betas=(0.9,0.99)
optim.scheduler.num_warmup_steps=1
optim.weight_decay=0.0
model.mode=aGPT
model.n_layer=6
model.n_head=4
model.n_embd=256
model.base_scale=0.01
model.scaled_projection=false
model.explicit_norm=true
model.explicit_norm_bounded=true
model.aGPT_attn_in_scale=scalar
model.alpha_correction=true
model.qk_norm=true
model.post_norm=true
data_base.batch_size=16"
```

### Run 0.5B sanity check experiments on a Slurm cluster

Run the following command to run a GPT+ 0.5B experiment. We expect 8 nodes with 4 A100 40GB GPUs each. You may need to
adapt the template Slurm script `slurm_launch.sh` to your cluster. The runtime is between 1h and 1.5h.

```bash
sbatch --nodes=8 --time=2:00:00 slurm_launch.sh "experiment.session_name=NeurIPS25_OWT_0B5
experiment.experiment_name=GPT+_0B5_OWT_10B_bs512_adamW_wd01_lr-0.0025
train.grad_accumulation=2
train.max_steps=10000
optim.lr=0.0025
optim.scheduler.num_warmup_steps=2000
optim.weight_decay=0.1
model.mode=GPT2
model.n_layer=24
model.n_head=16
model.n_embd=1024
model.base_scale=0.02
model.scaled_projection=true
model.explicit_norm=false
model.explicit_norm_bounded=false
model.h_in_norm=false
model.alpha_correction=true
model.qk_norm=true
model.qk_norm_scale_init=20"
```

Run the following command to run the GPT+ 0.5B experiment without QK norm.

```bash
sbatch --nodes=8 --time=2:00:00 slurm_launch.sh "experiment.session_name=NeurIPS25_OWT_0B5
experiment.experiment_name=GPT+_NoQKnorm_0B5_OWT_10B_bs512_adamW_wd01_lr-0.0025
train.grad_accumulation=2
train.max_steps=10000
optim.lr=0.0025
optim.scheduler.num_warmup_steps=2000
optim.weight_decay=0.1
model.mode=GPT2
model.n_layer=24
model.n_head=16
model.n_embd=1024
model.base_scale=0.02
model.scaled_projection=true
model.explicit_norm=false
model.explicit_norm_bounded=false
model.alpha_correction=true
model.qk_norm=false
model.post_norm=true"
```

Run the following command to run the nGPT 0.5B experiment.

```bash
sbatch --nodes=8 --time=2:00:00 slurm_launch.sh "experiment.session_name=NeurIPS25_OWT_0B5
experiment.experiment_name=nGPT_0B5_OWT_10B_bs512_adamW_wd01_lr-0.002
train.grad_accumulation=2
train.max_steps=10000
optim.lr=0.002
optim.scheduler.num_warmup_steps=1
optim.weight_decay=0.0
model.mode=nGPT
model.n_layer=24
model.n_head=16
model.n_embd=1024
model.base_scale=0.031
model.scaled_projection=true
model.explicit_norm=true
model.explicit_norm_bounded=false
model.out_norm_dim_0=true
model.alpha_correction=false
model.qk_norm=true
model.post_norm=true"
```

Run the following command to run the anGPT 0.5B experiment.

```bash
sbatch --nodes=8 --time=2:00:00 slurm_launch.sh "experiment.session_name=NeurIPS25_OWT_0B5
experiment.experiment_name=anGPT_0B5_OWT_10B_bs512_adamW_wd01_lr-0.002
train.grad_accumulation=2
train.max_steps=10000
optim.lr=0.002
optim.scheduler.num_warmup_steps=1
optim.weight_decay=0.0
model.mode=aGPT
model.n_layer=24
model.n_head=16
model.n_embd=1024
model.base_scale=0.01
model.scaled_projection=false
model.explicit_norm=true
model.explicit_norm_bounded=true
model.aGPT_attn_in_scale=scalar
model.alpha_correction=true
model.qk_norm=true
model.qk_norm_scale_init=20
model.post_norm=true"
```
