"""
This training script can be run both on a single gpu in debug mode,
and also in a larger training run with distributed data parallel (ddp).

To run on a single GPU, example:
$ python train.py --batch_size=32 --compile=False

To run with DDP on 4 gpus on 1 node, example:
$ torchrun --standalone --nproc_per_node=4 train.py

To run with DDP on 16 gpus across 2 nodes (8 gpus per node), example:
- Run on the first (master) node with example IP 123.456.123.456:
$ torchrun --nproc_per_node=8 --nnodes=2 --node_rank=0 --master_addr=123.456.123.456 --master_port=1234 train.py
- Run on the worker node:
$ torchrun --nproc_per_node=8 --nnodes=2 --node_rank=1 --master_addr=123.456.123.456 --master_port=1234 train.py
(If your cluster does not have Infiniband interconnect prepend NCCL_IB_DISABLE=1)
"""

import os
import time
import math
import random
import pickle
from contextlib import nullcontext
import logging

import numpy as np
import torch
import torch.nn.functional as F
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.distributed import init_process_group, destroy_process_group

from src.model.model_llama import build_llama_model
from src.utils.optimizer import get_optimizer
from src.utils.logger import get_logger, setup_experiment_environment, save_training_monitor, save_gradients

# -----------------------------------------------------------------------------
# default config values for the "170M" LLaMA-style model on the finewebedu dataset.
# Token-denominated settings are written as multiples of 491520, the number of
# tokens consumed by one optimizer step in the first accumulation stage
# (5 * 8 micro-batches * batch_size 12 * block_size 1024).
# I/O
out_dir = 'out'
eval_interval = 256 * 491520 # in tokens; the loop evaluates when token_num % eval_interval == 0
log_interval = 16 * 491520 # in tokens
save_interval = 1024 * 491520 # in tokens
eval_iters = 100 # number of batches averaged per split in estimate_loss
eval_only = False # if True, script exits right after the first eval
save_grad = False # if True, save gradients at specified intervals
# wandb logging
wandb_log = True # wandb logging is enabled by default
wandb_project = 'llama_finewebedu'
wandb_run_name = '170M_constant_lr' # 'run' + str(time.time())
experiment_name = '170M_constant_lr'
# data
# NOTE: values that default to None (data_root, resume_path) are not picked up
# by config_keys below, because NoneType is not in its isinstance tuple.
data_root = None # must be overridden; joined with `dataset` to locate train.bin / val.bin
dataset = 'finewebedu'
# gradient accumulation schedule: stage i (total steps across all processes) is
# used while token_num < change_grad_accum_at_tokens_list[i]
gradient_accumulation_steps_list = [5 * 8, 20 * 8, 80 * 8]
change_grad_accum_at_tokens_list = [24576 * 491520, 36864 * 491520, 40960 * 491520]
batch_size = 12 # if gradient_accumulation_steps > 1, this is the micro-batch size
block_size = 1024
# model
model_name = "170M"
# SGD or AdamW
optimizer_name = "adamw"
learning_rate = 2 ** (-10) # max learning rate
weight_decay = 0.1 # default for AdamW and SGD
momentum = 0.95 # default for SGD
beta1 = 0.95 # default for AdamW
beta2 = 0.95 # default for AdamW
grad_clip = 1.0 # clip gradients at this value, or disable if == 0.0
# learning rate schedule settings
max_tokens = 40960 * 491520 # total number of training tokens
warmup_iters = 1024 # how many steps to warm up for
lr_decay_iters = 1e10 # effectively "never": the schedule stays in its post-warmup branch
min_lr = learning_rate # equal to learning_rate on purpose: get_lr asserts min_lr == max_lr (constant LR)
# DDP settings
backend = 'nccl' # 'nccl', 'gloo', etc.
# resume
resume_path = None  # the path of the experiment directory to resume from
resume_iter = 0 # the iteration whose saved training monitor is loaded on resume
# system
seed = 41
device = 'cuda' # examples: 'cpu', 'cuda', 'cuda:0', 'cuda:1' etc., or try 'mps' on macbooks
# 'float32', 'bfloat16', or 'float16'.
# NOTE(review): no GradScaler is created anywhere in the visible script, so a
# float16 run would use autocast without loss scaling -- confirm before relying on it.
dtype = 'bfloat16' if torch.cuda.is_available() and torch.cuda.is_bf16_supported() else 'float16'
compile = False # use PyTorch 2.0 to compile the model to be faster
# -----------------------------------------------------------------------------
# snapshot the overridable names BEFORE running the configurator, so only the
# int/float/bool/str/list settings defined above end up in `config`
config_keys = [k for k,v in globals().items() if not k.startswith('_') and isinstance(v, (int, float, bool, str, list))]
# overrides from command line or config file; the context manager closes the
# file handle that the previous bare open() left to the garbage collector
with open('configurator.py') as _configurator_file:
    exec(_configurator_file.read())
config = {k: globals()[k] for k in config_keys} # will be useful for logging
# -----------------------------------------------------------------------------

# initialize logger
logger = get_logger(__name__)

# load resume state if specified
if resume_path:
    logger.info(f"Resuming training from {resume_path} with iter {resume_iter}")
    resume_pkl = os.path.join(resume_path, 'train_monitor', f'training_monitor_iter_{resume_iter}.pkl')
    # SECURITY NOTE: pickle.load can execute arbitrary code embedded in the file;
    # only resume from checkpoints that this project wrote itself.
    with open(resume_pkl, 'rb') as f:
        state = pickle.load(f)
    # Restore the saved run's hyperparameters into both the module globals and
    # the logged config. Keys describing THIS launch are skipped: the saved
    # config carries the original run's resume_iter (an int, so it is part of
    # config), which previously overwrote the iteration requested for this
    # resume. The older names 'resume'/'resume_monitor' are kept in case
    # checkpoints written by earlier versions contain them.
    launch_specific_keys = {
        'resume', 'resume_monitor', 'resume_path', 'resume_iter',
        'device', 'dtype', 'ddp',
    }
    saved_config = state['config']
    for key, value in saved_config.items():
        if key in launch_specific_keys:
            continue
        globals()[key] = value
        config[key] = value
        
# various inits, derived attributes, I/O setup
ddp = int(os.environ.get('RANK', -1)) != -1 # is this a ddp run?
if ddp:
    init_process_group(backend=backend)
    ddp_rank = int(os.environ['RANK'])
    ddp_local_rank = int(os.environ['LOCAL_RANK'])
    ddp_world_size = int(os.environ['WORLD_SIZE'])
    device = f'cuda:{ddp_local_rank}'
    torch.cuda.set_device(device)
    master_process = ddp_rank == 0 # this process will do logging, checkpointing etc.
    # NOTE(review): seed_offset is not consumed anywhere in the visible script;
    # set_seed is called with the same seed on every rank.
    seed_offset = ddp_rank
    # world_size number of processes will be training simultaneously, so we can scale
    # down the desired gradient accumulation iterations per process proportionally
    indivisible_steps = [s for s in gradient_accumulation_steps_list if s % ddp_world_size != 0]
    if indivisible_steps:
        raise ValueError(
            f"gradient accumulation steps {indivisible_steps} are not divisible "
            f"by the DDP world size {ddp_world_size}"
        )
    # Build a NEW list instead of dividing in place: `config` holds a reference
    # to the original list, and an in-place division would store per-process
    # values in every checkpointed config, which a later DDP resume would then
    # divide by the world size a second time.
    gradient_accumulation_steps_list = [
        s // ddp_world_size for s in gradient_accumulation_steps_list
    ]
else:
    # if not ddp, we are running on a single gpu, and one process
    master_process = True
    seed_offset = 0
    # get_batch indexes its per-rank draws with ddp_rank, so it must exist here too
    ddp_rank = 0
    ddp_local_rank = 0
    ddp_world_size = 1

# create the directory to save the experiment
if not master_process:
    # Non-master DDP ranks never write experiment artifacts; they only need the
    # name to exist, so it is bound to an empty mapping.
    experiment_paths = {}
else:
    # The master process is the single owner of the on-disk experiment layout.
    logger.info("Setting up experiment environment...")
    experiment_paths = setup_experiment_environment(
        base_output_dir=out_dir,
        experiment_name=experiment_name,
        config=config,
        resume_path=resume_path,
        resume_iter=resume_iter,
    )
    # Point wandb at this experiment's own directory.
    os.environ['WANDB_DIR'] = experiment_paths['wandb_dir']

def set_seed(seed):
    """
    Seed every random number generator used by this script.

    Applies `seed` to Python's `random`, NumPy's global generator, and the
    torch CPU and CUDA generators, then sets the cuDNN deterministic flag.
    """
    for seed_fn in (torch.manual_seed, torch.cuda.manual_seed, np.random.seed, random.seed):
        seed_fn(seed)
    torch.backends.cudnn.deterministic = True

# Every rank is seeded identically (no per-rank offset); get_batch relies on
# this by drawing one index set per rank and keeping only its own.
# A resumed run uses seed+100 instead of seed.
set_seed(seed if not resume_path else seed+100)
torch.backends.cuda.matmul.allow_tf32 = True # allow tf32 on matmul
torch.backends.cudnn.allow_tf32 = True # allow tf32 on cudnn
device_type = 'cuda' if 'cuda' in device else 'cpu' # for later use in torch.autocast
# NOTE(review): no GradScaler is created in the visible script, so dtype='float16'
# runs autocast without loss scaling -- confirm this is intended.
ptdtype = {'float32': torch.float32, 'bfloat16': torch.bfloat16, 'float16': torch.float16}[dtype]
ctx = nullcontext() if device_type == 'cpu' else torch.amp.autocast(device_type=device_type, dtype=ptdtype)

# poor man's data loader
# data_root defaults to None; fail with an actionable message instead of the
# opaque TypeError that os.path.join(None, ...) would raise.
if data_root is None:
    raise ValueError(
        "data_root is not set; override it (e.g. through configurator.py) with the "
        "directory that contains the dataset folder"
    )
data_dir = os.path.join(data_root, dataset)
# token ids are read as uint16 from flat binary files; memmap keeps them on disk
trainset = np.memmap(os.path.join(data_dir, 'train.bin'), dtype=np.uint16, mode='r')
valset = np.memmap(os.path.join(data_dir, 'val.bin'), dtype=np.uint16, mode='r')

def get_batch(split):
    """
    Sample one micro-batch of (input, target) token windows for this process.

    Args:
        split: 'train' selects the training memmap; any other value selects
            the validation memmap.

    Returns:
        (x, y): two int64 tensors of shape (batch_size, block_size) on
        `device`, where y is x shifted one token to the right.
    """
    data = trainset if split == 'train' else valset
    # ddp_rank is only assigned on the DDP code path; a single-process run
    # owns slot 0. (Reading ddp_rank unconditionally raised NameError there.)
    rank = ddp_rank if ddp else 0
    # One index draw per rank, of which only this rank's is used. The number and
    # order of draws is deliberately unchanged, so the RNG stream is preserved.
    ix_list = [
        torch.randint(len(data) - block_size, (batch_size,))
        for _ in range(ddp_world_size)
    ]
    ix = ix_list[rank]
    x = torch.stack([torch.from_numpy((data[i:i+block_size]).astype(np.int64)) for i in ix])
    y = torch.stack([torch.from_numpy((data[i+1:i+1+block_size]).astype(np.int64)) for i in ix])
    if device_type == 'cuda':
        # pin arrays x,y, which allows us to move them to GPU asynchronously (non_blocking=True)
        x, y = x.pin_memory().to(device, non_blocking=True), y.pin_memory().to(device, non_blocking=True)
    else:
        x, y = x.to(device), y.to(device)
    return x, y

# init a new model from scratch
# NOTE(review): this message is logged even when resume_path is set; in that
# case the freshly built weights are replaced by the checkpoint further below.
logger.info("Initializing a new model from scratch")
model = build_llama_model(model_name)
model.to(device)

# optimizer
# The lr passed here is only the initial value: the training loop overwrites
# the lr of every param group at the start of each iteration.
optimizer = get_optimizer(
    optimizer_name, 
    model, 
    lr=learning_rate, 
    wd=weight_decay,
    momentum=momentum,
    beta1=beta1,
    beta2=beta2,
)

# load resume state if specified
# `state` is the pickle loaded near the top of the script when resume_path is set.
# Loading happens on the plain module, before any compile/DDP wrapping below.
if resume_path:
    model.load_state_dict(state['model_state_dict'])
    optimizer.load_state_dict(state['optimizer_state_dict'])

# compile the model
if compile:
    logger.info("compiling the model... (takes a ~minute)")
    # handle to the uncompiled module; not referenced again in the visible code
    unoptimized_model = model
    model = torch.compile(model) # requires PyTorch 2.0

# wrap model into DDP container
# ddp_local_rank is only defined when ddp is True (set during DDP init)
if ddp:
    model = DDP(model, device_ids=[ddp_local_rank])

# helps estimate an arbitrarily accurate loss over either split using many batches
@torch.no_grad()
def estimate_loss():
    """
    Average the cross-entropy loss over `eval_iters` random batches per split.

    The model is switched to eval mode for the measurement and back to train
    mode before returning.

    Returns:
        dict mapping 'train' and 'val' to a 0-dim tensor holding the mean loss.
    """
    model.eval()
    mean_loss_by_split = {}
    for split_name in ('train', 'val'):
        batch_losses = torch.zeros(eval_iters)
        for batch_idx in range(eval_iters):
            inputs, targets = get_batch(split_name)
            with ctx:
                # flatten to (tokens, vocab) so it lines up with the flat targets
                flat_logits = model(inputs).logits.view(batch_size*block_size, -1)
            batch_loss = F.cross_entropy(flat_logits, targets.view(-1), ignore_index=-1)
            batch_losses[batch_idx] = batch_loss.item()
        mean_loss_by_split[split_name] = batch_losses.mean()
    model.train()
    return mean_loss_by_split

# learning rate scheduler: linear warmup, then a constant rate
# (the cosine branch is kept, but the assert below pins min_lr == max_lr)
def get_lr(it, min_lr, max_lr, warmup_iters, lr_decay_iters):
    """
    Return the learning rate for iteration `it`.

    Args:
        it: zero-based iteration index.
        min_lr: floor of the schedule; must equal `max_lr`.
        max_lr: peak learning rate reached at the end of warmup.
        warmup_iters: number of linear warmup iterations.
        lr_decay_iters: iteration from which `min_lr` is returned.

    Returns:
        The learning rate as a float.

    Raises:
        AssertionError: if `max_lr != min_lr`.
    """
    assert max_lr == min_lr, "max_lr and min_lr must be the same"
    # 1) linear warmup for warmup_iters steps
    if it < warmup_iters:
        return max_lr * (it + 1) / (warmup_iters + 1)
    # 2) from lr_decay_iters onwards, return min learning rate.
    # `>=` rather than `>`: at it == lr_decay_iters the cosine coefficient is
    # exactly 0, so the value is unchanged, and it avoids a 0/0 division
    # below when lr_decay_iters == warmup_iters.
    if it >= lr_decay_iters:
        return min_lr
    # 3) in between, use cosine decay down to min learning rate
    decay_ratio = (it - warmup_iters) / (lr_decay_iters - warmup_iters)
    assert 0 <= decay_ratio <= 1
    coeff = 0.5 * (1.0 + math.cos(math.pi * decay_ratio)) # coeff ranges 0..1
    return min_lr + coeff * (max_lr - min_lr)

# batch size scheduler
def get_grad_accum_step(token_num, change_grad_accum_at_tokens_list, gradient_accumulation_steps_list):
    """
    Look up the gradient accumulation steps for the current token count.

    Stage i applies while `token_num` is below `change_grad_accum_at_tokens_list[i]`
    (the first such i wins); once every boundary has been reached the last
    entry of `gradient_accumulation_steps_list` is used.

    Returns:
        (grad_accum, tokens_per_iter): the accumulation steps for this process
        and the number of tokens one optimizer step consumes across all processes.
    """
    assert len(gradient_accumulation_steps_list) == len(change_grad_accum_at_tokens_list), \
        "gradient_accumulation_steps_list and change_grad_accum_at_tokens_list must have the same length"
    # advance past every boundary that token_num has already reached
    stage = 0
    num_stages = len(change_grad_accum_at_tokens_list)
    while stage < num_stages and not (token_num < change_grad_accum_at_tokens_list[stage]):
        stage += 1
    # past the final boundary, keep using the last configured value
    if stage == num_stages:
        stage = -1
    grad_accum = gradient_accumulation_steps_list[stage]

    # tokens consumed by one optimizer step under the current accumulation setting
    tokens_per_iter = grad_accum * ddp_world_size * batch_size * block_size
    return grad_accum, tokens_per_iter

# logging
# Only the master process writes the log file and talks to wandb.
if master_process:
    # Create log file in experiment directory
    os.makedirs(experiment_paths['logs_dir'], exist_ok=True)
    log_file_path = os.path.join(experiment_paths['logs_dir'], "training.log")
    file_handler = logging.FileHandler(log_file_path)
    file_handler.setFormatter(logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
    ))
    logger.addHandler(file_handler)
    logger.info(f"Training log will be saved to: {log_file_path}")

    if wandb_log:
        # imported lazily so wandb is only required when logging is enabled
        import wandb
        if resume_path:
            # Name resumed runs after the resumed experiment directory. Trailing
            # slashes are stripped first; otherwise "path/to/exp/" would yield
            # an empty directory component in the run name.
            resumed_dir_name = resume_path.rstrip('/').split('/')[-1]
            wandb_run_name = f"resume_{resumed_dir_name}_{resume_iter}"
        wandb.init(project=wandb_project, name=wandb_run_name, config=config)
    
# training loop
# Progress is tracked in tokens (token_num); the eval/save/log intervals are
# token counts and fire when token_num is an exact multiple of the interval.
X, Y = get_batch('train') # fetch the very first batch
token_num = 0 if not resume_path else state['token_num']
iter_num = 0 if not resume_path else state['iter_num']
losses = None if not resume_path else state['losses']
t0 = time.time()
raw_model = model.module if ddp else model # unwrap DDP container if needed
while True:
    # determine and set the learning rate for this iteration
    lr = get_lr(iter_num, min_lr, learning_rate, warmup_iters, lr_decay_iters)
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr

    # determine and set the gradient accumulation step for this iteration
    gradient_accumulation_steps, tokens_per_iter = get_grad_accum_step(
        token_num, change_grad_accum_at_tokens_list, gradient_accumulation_steps_list
    )

    # evaluate the loss on train/val sets and write checkpoints
    if token_num % eval_interval == 0 and master_process:
        losses = estimate_loss()
        logger.info(f"Token {token_num}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")
        if wandb_log:
            wandb.log({
                "iter": iter_num,
                "train/loss": losses['train'],
                "val/loss": losses['val'],
                "lr": lr,
            }, step=token_num)

    # save monitor
    if token_num % save_interval == 0 and master_process:
        logger.info(f"Saving training monitor for iteration {iter_num}...")
        save_training_monitor(
            save_path = experiment_paths['monitor_dir'],
            config = config,
            model = raw_model,
            optimizer = optimizer,
            lr = lr,
            losses = losses,
            token_num = token_num,
            iter_num = iter_num,
        )

    if iter_num == 0 and eval_only:
        break

    # forward backward update, with optional gradient accumulation to simulate larger batch size
    # NOTE(review): no GradScaler is used anywhere in this loop, so dtype='float16'
    # would train with unscaled autocast gradients.
    for micro_step in range(gradient_accumulation_steps):
        # Under DDP, only the last micro-step needs the gradient all-reduce:
        # earlier micro-steps just accumulate locally inside no_sync(), and the
        # final backward reduces the accumulated sum. Syncing on every micro-step
        # gives the same averaged gradient but multiplies communication by
        # gradient_accumulation_steps. The forward pass must sit inside the
        # no_sync() context for it to take effect.
        is_last_micro_step = micro_step == gradient_accumulation_steps - 1
        sync_ctx = model.no_sync() if (ddp and not is_last_micro_step) else nullcontext()
        with sync_ctx:
            with ctx:
                logits = model(X).logits.view(batch_size*block_size, -1)
            loss = F.cross_entropy(logits, Y.view(-1), ignore_index=-1)
            loss = loss / gradient_accumulation_steps # scale the loss to account for gradient accumulation
            # immediately async prefetch next batch while model is doing the forward pass on the GPU
            X, Y = get_batch('train')
            # backward pass
            loss.backward()

    # save gradients at specified intervals (before clipping, so raw gradients are stored)
    if save_grad and token_num % save_interval == 0 and master_process:
        logger.info(f"Saving gradients at token {token_num}...")
        save_gradients(
            save_path = os.path.join(experiment_paths['monitor_dir'], f'gradients'),
            model = raw_model,
            iter_num = iter_num
        )

    # clip the gradient
    if grad_clip != 0.0:
        torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip)
    # step the optimizer
    optimizer.step()
    # flush the gradients as soon as we can, no need for this memory anymore
    optimizer.zero_grad(set_to_none=True)

    # timing and logging
    t1 = time.time()
    dt = t1 - t0
    t0 = t1
    if token_num % log_interval == 0 and master_process:
        # get loss as float. note: this is a CPU-GPU sync point
        # scale up to undo the division above; this reports the LAST micro-batch's
        # loss only, as an approximation of the loss over the whole step
        lossf = loss.item() * gradient_accumulation_steps
        logger.info(f"Token {token_num}: loss {lossf:.4f}, time {dt*1000:.2f}ms")
    token_num += tokens_per_iter
    iter_num += 1

    # termination conditions
    # strict `>`: the iteration at token_num == max_tokens still runs, so the
    # eval/save checks at the top of the loop fire once at exactly max_tokens
    if token_num > max_tokens:
        break

if ddp:
    destroy_process_group()