import os
import time
import json
import random
import argparse
import numpy as np

import torch
import torch.nn as nn
import torch.utils.data
import torch.distributed as dist
import wandb
import transformers
from transformers import AutoConfig, AutoTokenizer, AutoModelForCausalLM
from transformers import LlamaForCausalLM as HF_LlamaForCausalLM
from transformers import GPT2LMHeadModel, GPT2Config
from transformers import DebertaV2ForMaskedLM, DebertaV2Config

# Optional model classes: prefer the implementation shipped with the installed
# `transformers`; fall back to the vendored copy in `peft_pretraining` when the
# installed version does not provide (or cannot load) the class.
# `except Exception` (rather than a bare `except`) keeps the fallback for any
# import-time failure while letting KeyboardInterrupt / SystemExit propagate.
try:
    from transformers import Qwen2ForCausalLM
except Exception:
    from peft_pretraining.modeling_qwen2 import Qwen2ForCausalLM

try:
    from transformers import GemmaForCausalLM
except Exception:
    from peft_pretraining.modeling_gemma import GemmaForCausalLM

try:
    from transformers import MistralForCausalLM
except Exception:
    from peft_pretraining.modeling_mistral import MistralForCausalLM

from optimizer import configure_optimizer
import datasets
import datasets.distributed
# import wandb

from tqdm import tqdm
from loguru import logger

from peft_pretraining import training_utils, args_utils
from peft_pretraining.dataloader import PreprocessedIterableDataset
from peft_pretraining.modeling_llama import LlamaForCausalLM

import bitsandbytes as bnb

# Restrict the transformers library's own logging to error-level messages.
transformers.logging.set_verbosity_error()

# Set WANDB_MODE for every run started from this module; this happens at
# import time, before wandb.init() is called in main().
os.environ["WANDB_MODE"] = "offline"

def parse_args(args):
    """Define the command-line interface and parse ``args``.

    Args:
        args: Sequence of argument strings, or ``None`` to let argparse read
            ``sys.argv``.

    Returns:
        The parsed namespace after it has been passed through
        ``args_utils.check_args_torchrun_main``.
    """
    parser = argparse.ArgumentParser()
    add = parser.add_argument

    # --- model / data / optimisation schedule ---
    add("--model_config", type=str, required=True)
    add(
        "--model_type",
        type=str,
        default="llama",
        choices=["llama", "qwen", "gpt2", "deberta", "gemma", "mistral"],
        help="Type of model to train: llama, qwen, gpt2, deberta, gemma, or mistral",
    )
    add("--use_hf_model", default=False, action="store_true")
    add("--continue_from", type=str, default=None)
    add("--batch_size", type=int, required=True)
    add("--gradient_accumulation", type=int, default=None)
    add(
        "--max_train_tokens",
        type=training_utils.max_train_tokens_to_number,
        default=None,
        help="Number of tokens to train on. Overwrites num_training_steps. "
             "You can use M and B suffixes, e.g. 100M or 1B.",
    )
    add("--total_batch_size", type=int, default=None)
    add("--max_length", type=int, default=256)
    add("--optimizer", default="Adam")
    add("--lr", type=float, default=1e-4)
    add(
        "--scheduler",
        type=str,
        default="cosine",
        choices=["linear", "cosine", "cosine_restarts"],
    )
    add("--warmup_ratio", type=float, default=0.1)
    add("--min_lr_ratio", type=float, default=0.1)
    add("--activation_checkpointing", action="store_true")
    add("--weight_decay", type=float, default=0.0)
    add("--eval_every", type=int, default=9999999999)
    add(
        "--num_training_steps",
        type=int,
        default=10_000,
        help="Number of **update steps** to train for. "
             "Notice that gradient accumulation is taken into account.",
    )

    # --- checkpointing / runtime ---
    add("--save_every", type=int, default=10_000)
    add("--save_dir", type=str, default='saved')
    add("--tags", type=str, default=None)
    # bfloat16 is the default only when the CUDA device reports support for it.
    fallback_dtype = "bfloat16" if torch.cuda.is_bf16_supported() else "float32"
    add("--dtype", type=str, default=fallback_dtype)
    add("--workers", type=int, default=8)
    add("--seed", type=int, default=0)
    add("--grad_clipping", type=float, default=0.0)
    add("--epoch", type=int, default=10)

    # --- GaLore parameters ---
    add("--rank", type=int, default=128)
    add("--update_proj_gap", type=int, default=50)
    add("--scale", type=float, default=1.0)
    add("--proj_type", type=str, default="std")

    # --- Wave parameters ---
    add("--level", type=int, default=2)
    add("--n_steps", type=int, default=5)
    add("--beta1", type=float, default=0.9)
    add("--beta2", type=float, default=0.999)
    add("--momentum", type=float, default=0.0)

    # --- disable DDP and run on a single GPU ---
    add("--single_gpu", default=False, action="store_true")

    parsed = parser.parse_args(args)
    return args_utils.check_args_torchrun_main(parsed)


def create_model(args, model_config):
    """Instantiate a freshly initialised model for ``model_config``.

    ``args.use_hf_model`` takes precedence and defers to
    ``AutoModelForCausalLM.from_config``. Otherwise ``args.model_type`` picks
    the class; a config path containing "qwen" also selects Qwen2, and the
    project's Llama implementation is the fallback.
    """
    if args.use_hf_model:
        return AutoModelForCausalLM.from_config(model_config)

    kind = args.model_type
    if kind == "gpt2":
        # Standard HuggingFace GPT-2 implementation.
        return GPT2LMHeadModel(model_config)
    if kind == "deberta":
        # Masked-LM head, used for MLM pretraining.
        return DebertaV2ForMaskedLM(model_config)
    if kind == "gemma":
        return GemmaForCausalLM(model_config)
    if kind == "mistral":
        return MistralForCausalLM(model_config)
    if kind == "qwen" or "qwen" in args.model_config:
        return Qwen2ForCausalLM(model_config)

    # llama or default
    return LlamaForCausalLM(model_config)


def get_tokenizer_for_model(args):
    """Return the shared "t5-base" tokenizer used for every model type.

    The tokenizer's ``model_max_length`` is set to ``args.max_length``; if it
    has no pad token id, id 0 is assigned so padding is always valid.
    """
    tok = AutoTokenizer.from_pretrained("t5-base", model_max_length=args.max_length)
    if tok.pad_token_id is not None:
        return tok

    # Ensure pad_token_id is valid.
    tok.pad_token_id = 0
    return tok


def compute_loss(model, batch, labels, args):
    """Run a forward pass and return the training loss for the model type.

    For ``args.model_type == "deberta"`` a simplified masked-LM objective is
    built on the fly: roughly 15% of the non-padding tokens are replaced by a
    stand-in mask id (the last vocabulary id) and the loss is computed only on
    those positions; the ``labels`` argument is ignored in that case. Every
    other model type is treated as a causal LM and receives ``labels`` as is.

    Args:
        model: The model, either bare or wrapped in a container exposing the
            underlying model as ``.module`` (e.g. DistributedDataParallel).
        batch: Mapping of keyword tensors for the forward pass; must contain
            ``"input_ids"``. The mapping and its tensors are not modified.
        labels: Label tensor for causal LMs (ignored for DeBERTa).
        args: Namespace providing ``model_type``.

    Returns:
        The ``.loss`` attribute of the model output.
    """
    if args.model_type != "deberta":
        # Causal language models (GPT-2, Llama, Qwen, ...).
        return model(**batch, labels=labels).loss

    # The config lives on the underlying model; unwrap only when a wrapper
    # (DDP) is present so that single-GPU runs, which skip DDP, also work.
    config = getattr(model, "module", model).config

    mask_prob = 0.15
    mask_token_id = config.vocab_size - 1  # use the last token id as the mask

    # Padding id as configured on the model; fall back to 0 when unset.
    pad_token_id = getattr(config, "pad_token_id", None)
    if pad_token_id is None:
        pad_token_id = 0

    original_ids = batch["input_ids"]
    rand = torch.rand(original_ids.shape, device=original_ids.device)
    mask_arr = (rand < mask_prob) & (original_ids != pad_token_id)  # never mask padding

    # Labels keep the original token only at masked positions.
    mlm_labels = original_ids.clone()
    mlm_labels[~mask_arr] = -100

    masked_ids = original_ids.clone()
    masked_ids[mask_arr] = mask_token_id

    # Build a new kwargs dict instead of overwriting the caller's batch, so
    # callers that read batch["input_ids"] afterwards see the unmasked ids.
    model_inputs = dict(batch)
    model_inputs["input_ids"] = masked_ids
    return model(**model_inputs, labels=mlm_labels).loss


@torch.no_grad()
def evaluate_model(model, preprocess_batched, pad_idx, global_rank, world_size, device, batch_size, args):
    """Evaluate ``model`` on the streamed C4 validation split.

    Batches are consumed until more than ``target_eval_tokens`` tokens
    (estimated across all ranks) have been evaluated. The model is put in
    eval mode for the duration of the call and its previous train/eval mode
    is restored afterwards.

    Args:
        model: Model to evaluate (bare or DDP-wrapped).
        preprocess_batched: Batched tokenisation function mapped over the
            raw dataset.
        pad_idx: Padding token id; padded positions are excluded from the
            loss and from the token count.
        global_rank: Rank of this process, used to shard the dataset.
        world_size: Number of processes; also scales the token count.
        device: Device the batches are moved to.
        batch_size: Per-process evaluation batch size.
        args: Namespace providing ``single_gpu`` and ``model_type``.

    Returns:
        Tuple ``(mean_loss, evaluated_on_tokens)`` where ``mean_loss`` is the
        per-rank mean batch loss averaged over all ranks.
    """
    was_training = model.training
    # Disable dropout etc. so the reported loss is deterministic w.r.t. the model.
    model.eval()
    try:
        _time = time.time()
        val_data = datasets.load_dataset("/datasets/C4/c4", "en", split="validation", streaming=True) #DGX
        val_data = val_data.shuffle(seed=42)
        logger.info(f"Loaded validation dataset in {time.time() - _time:.2f} seconds")

        if not args.single_gpu:
            val_data = datasets.distributed.split_dataset_by_node(val_data, rank=global_rank, world_size=world_size)

        val_data_mapped = val_data.map(
            preprocess_batched,
            batched=True,
            remove_columns=["text", "timestamp", "url"],
        )

        target_eval_tokens = 10_000_000
        evaluated_on_tokens = 0
        total_loss = torch.tensor(0.0).to(device)
        # Count only batches that were actually evaluated, so the mean below
        # is not biased low by an extra phantom batch.
        total_batches = 0
        logger.info(f"Eval set prepared in {time.time() - _time:.2f} seconds")

        for batch in training_utils.batch_fn(val_data_mapped, batch_size):
            if evaluated_on_tokens > target_eval_tokens:
                break
            total_batches += 1

            batch = {k: v.to(device) for k, v in batch.items()}
            labels = batch["input_ids"].clone()
            labels[labels == pad_idx] = -100

            loss = compute_loss(model, batch, labels, args)
            total_loss += loss.detach()

            # Each rank sees a shard; scale by world_size to estimate the global count.
            evaluated_on_tokens += (batch["input_ids"] != pad_idx).sum().item() * world_size

        # max(..., 1) keeps the division safe when the stream yields no batch.
        total_loss = total_loss / max(total_batches, 1)

        # Gather losses across all GPUs
        gathered_losses = [torch.zeros_like(total_loss) for _ in range(world_size)]
        dist.all_gather(gathered_losses, total_loss)
        total_loss = sum([t.item() for t in gathered_losses]) / world_size

        return total_loss, evaluated_on_tokens
    finally:
        model.train(was_training)


def main(args):
    """Pre-train a language model on streamed C4 and run a final evaluation.

    The function expects to be launched by torchrun (it reads ``RANK``,
    ``LOCAL_RANK`` and ``WORLD_SIZE`` from the environment). It
      1. seeds the RNGs and initialises the NCCL process group,
      2. derives the gradient-accumulation factor from the batch sizes,
      3. builds the streamed dataset, tokenizer, model, optimizer and scheduler,
      4. optionally restores weights and step counters from ``args.continue_from``,
      5. trains for ``args.num_training_steps`` optimizer updates, with
         periodic checkpoints and evaluations,
      6. writes a final checkpoint (rank 0) and runs a final evaluation.

    Args:
        args: Namespace produced by ``parse_args``.
    """
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)
    random.seed(args.seed)

    eval_loss_list = []
    eval_tokens_list = []
    ckpt_path = os.path.join('checkpoints', args.save_dir)

    logger.info(f"Using check point path {ckpt_path}")
    assert "LOCAL_RANK" in os.environ, "torchrun should set LOCAL_RANK"
    global_rank = int(os.environ['RANK'])
    local_rank = int(os.environ["LOCAL_RANK"])
    world_size = int(os.environ["WORLD_SIZE"])
    torch.cuda.set_device(local_rank)

    logger.info(f"Global rank {global_rank}, local rank {local_rank}, device: {torch.cuda.current_device()}")

    dist.init_process_group(backend="nccl", rank=global_rank, world_size=world_size)
    torch.cuda.empty_cache()
    dist.barrier()

    logger.info("Process group initialized")
    device = f"cuda:{local_rank}"

    # Derive the accumulation factor when only the total batch size was given.
    if args.total_batch_size is not None:
        if args.gradient_accumulation is None:
            assert args.total_batch_size % world_size == 0, "total_batch_size must be divisible by world_size"
            args.gradient_accumulation = args.total_batch_size // (args.batch_size * world_size)
            assert args.gradient_accumulation > 0, "gradient_accumulation must be greater than 0"

    assert args.gradient_accumulation * args.batch_size * world_size == args.total_batch_size, \
        "gradient_accumulation * batch_size * world_size must be equal to total_batch_size"

    # turn off logger on every rank except 0
    if global_rank != 0: logger.remove()

    # initialize wandb without config (it is passed later)
    if global_rank == 0:
        wandb.init(project="c4")

    logger.info(f"Using dist with rank {global_rank} (only rank 0 will log)")
    logger.info("*" * 40)
    logger.info(f"Starting training with the arguments")
    for k, v in vars(args).items():
        logger.info(f"{k:30} {v}")
    logger.info("*" * 40)

    data = datasets.load_dataset("/datasets/C4/c4", "en", split="train", streaming=True)

    seed_for_shuffle = 42

    logger.info(f"Shuffling data with seed {seed_for_shuffle}")
    data: datasets.Dataset = data.shuffle(seed=seed_for_shuffle)
    if not args.single_gpu:
        data = datasets.distributed.split_dataset_by_node(
            data, rank=global_rank, world_size=world_size,
        )

    # The same tokenizer is used for every model type.
    tokenizer = get_tokenizer_for_model(args)

    def preprocess_batched(batch):
        # Tokenise a batch of raw texts to fixed-length tensors (used for evaluation).
        batch = tokenizer(
            batch["text"],
            max_length=args.max_length,
            truncation=True,
            padding="max_length",
            return_tensors="pt",
        )
        return batch

    dataset = PreprocessedIterableDataset(data, tokenizer, batch_size=args.batch_size, max_length=args.max_length)
    # batch_size=None: the dataset is constructed with batch_size and is
    # expected to yield ready-made batches.
    dataloader = torch.utils.data.DataLoader(dataset, batch_size=None, num_workers=args.workers)

    model_config = AutoConfig.from_pretrained(args.model_config)

    # Create model based on model_type
    model = create_model(args, model_config)

    model.config.pad_token_id = tokenizer.pad_token_id
    if hasattr(model, "generation_config") and args.model_type != "deberta":
        model.generation_config.pad_token_id = tokenizer.pad_token_id

    if args.activation_checkpointing:
        model.gradient_checkpointing_enable()

    global_step = 0
    update_step = 0
    beginning_step = 0
    tokens_seen = 0
    tokens_seen_before = 0

    if args.continue_from is not None:
        logger.info("*" * 40)
        logger.info(f"Loading model from {args.continue_from}")
        checkpoint_path = os.path.join(args.continue_from, "pytorch_model.bin")
        model.load_state_dict(torch.load(checkpoint_path, map_location="cpu"), strict=True)
        logger.info(f"Model successfully loaded (strict=True policy)")

        if os.path.exists(os.path.join(args.continue_from, "training_state.json")):
            logger.info(f"Loading training state like global_step, update_step, and tokens_seen from {args.continue_from}")
            with open(os.path.join(args.continue_from, "training_state.json")) as f:
                _old_state = json.load(f)
            global_step = _old_state["global_step"]
            update_step = _old_state["update_step"]
            tokens_seen = _old_state["tokens_seen"]
            tokens_seen_before = _old_state["tokens_seen_before"]
            logger.info(f"global_step       : {global_step}")
            logger.info(f"update_step       : {update_step}")
            logger.info(f"tokens_seen       : {tokens_seen}")
            logger.info(f"tokens_seen_before: {tokens_seen_before}")
            logger.info(f"Will train for {args.num_training_steps - update_step} update steps")
        else:
            logger.warning(f"Did not find training state in {args.continue_from}, global step will start from zero")
        logger.info("*" * 40)


    if args.dtype in ["bf16", "bfloat16"]:
        model = model.to(device=device, dtype=torch.bfloat16)
    else:
        model = model.to(device=device)

    n_total_params = sum(p.numel() for p in model.parameters())
    trainable_params = [p for p in model.parameters() if p.requires_grad]
    # Initialize wandb
    run_config = dict(vars(args))
    run_config.update({
        "max_lr": run_config.pop("lr"),  # rename lr to max_lr to avoid conflicts with scheduler
        "total_params_M": n_total_params / 1_000_000,
        "dataset": 'c4',
        "model": model_config.to_dict(),
        "world_size": world_size,
        "device": str(device),
    })

    if global_rank == 0:
        wandb.config.update(run_config, allow_val_change=True)
        wandb.save(os.path.abspath(__file__), policy="now") # save current script
        # fix tqdm visual length to 80 so that the progress bar
        # doesn't jump around when changing from external display to laptop
        pbar = tqdm(total=args.num_training_steps - update_step, desc="Update steps", ncols=80)

    logger.info(f"Saving model to {ckpt_path} every {args.save_every} update steps")

    layer_wise_flag = False

    optimizer = configure_optimizer(args, logger, model, model_config)


    if not layer_wise_flag:
        scheduler = training_utils.get_scheculer(
            optimizer=optimizer,
            scheduler_type=args.scheduler,
            num_training_steps=args.num_training_steps,
            warmup_ratio=args.warmup_ratio,
            min_lr_ratio=args.min_lr_ratio,
        )

    # Keep a handle on the bare model: checkpoints are written through it, and
    # with --single_gpu the model is never wrapped, so `model.module` would
    # not exist.
    raw_model = model

    if not args.single_gpu:
        model = torch.nn.parallel.DistributedDataParallel(
            model,
            device_ids=[local_rank],
            output_device=local_rank,
            broadcast_buffers=False,
        )

    # global steps and others are defined above
    pad_idx = tokenizer.pad_token_id
    update_time = time.time()
    local_step = 0  # when continue_from is used, local_step != global_step

    # Defined up front so the cleanup after the loop works even when no
    # training step runs (e.g. resuming a run that is already complete).
    loss = None

    # ##############################
    # TRAINING LOOP
    # The outer loop restarts the data stream up to args.epoch times; training
    # normally ends through the update-step limit below.
    # ##############################

    flag = False
    for epoch in np.arange(args.epoch):
        for batch_idx, batch in enumerate(dataloader):
            # Stop after exactly num_training_steps updates. The check runs
            # before the counters are bumped so that the saved global_step
            # only counts micro-batches that were actually processed.
            if update_step >= args.num_training_steps:
                logger.info(f"Reached max number of update steps ({args.num_training_steps}). Stopping training.")
                print(f"Rank {global_rank} stopping training.")
                flag = True
                break

            global_step += 1
            local_step += 1

            batch = {k: v.to(device) for k, v in batch.items()}
            labels = batch["input_ids"].clone()
            labels[labels == pad_idx] = -100
            # Local non-pad token count scaled by world_size as a global estimate.
            tokens_seen += (batch["input_ids"] != pad_idx).sum().item() * world_size

            loss = compute_loss(model, batch, labels, args)
            scaled_loss = loss / args.gradient_accumulation
            scaled_loss.backward()

            if global_step % args.gradient_accumulation != 0:
                continue

            # The below code is only executed during the update step

            # add grad clipping
            if args.grad_clipping != 0.0: torch.nn.utils.clip_grad_norm_(trainable_params, args.grad_clipping)

            if global_rank == 0: pbar.update(1)

            if not layer_wise_flag:
                optimizer.step()
                scheduler.step()
                optimizer.zero_grad()
            else:
                lr = optimizer.step()
                optimizer.zero_grad()

            update_step += 1
            # update_time held the start timestamp; it now holds the elapsed seconds.
            update_time = time.time() - update_time

            # save checkpoint by save_every
            if local_step > args.gradient_accumulation and update_step % args.save_every == 0 and global_rank == 0:
                current_model_directory = f"{ckpt_path}/model_{update_step}"
                logger.info(f"Saving model and optimizer to {current_model_directory}, update step {update_step}")
                os.makedirs(ckpt_path, exist_ok=True)
                raw_model.save_pretrained(current_model_directory, max_shard_size='100GB')

                optimizer_checkpoint = {
                    "optimizer": optimizer.state_dict(),
                    "scheduler": scheduler.state_dict(),
                    "update_step": update_step,
                    "global_step": global_step,
                    "config": run_config,
                    # "wandb": wandb.run.dir,
                    "dtype": args.dtype,
                }
                torch.save(optimizer_checkpoint, f"{current_model_directory}/optimizer.pt")

                training_state_checkpoint = {
                    "global_step": global_step,
                    "update_step": update_step,
                    "tokens_seen": tokens_seen,
                    "tokens_seen_before": tokens_seen_before,
                    "update_time": update_time,
                }
                with open(f"{current_model_directory}/training_state.json", "w") as f:
                    json.dump(training_state_checkpoint, f, indent=4)

                # save wandb related info
                wandb_info = {
                    "wandb_id": wandb.run.id,
                }
                with open(f"{ckpt_path}/wandb.json", "w") as f:
                    json.dump(wandb_info, f, indent=4)

            # evaluation
            if update_step % args.eval_every == 0:
                logger.info(f"Performing evaluation at step {update_step}")
                total_loss, evaluated_on_tokens = evaluate_model(
                    model, preprocess_batched, pad_idx, global_rank, world_size, device, args.batch_size, args
                )
                if global_rank == 0:
                    wandb.log({
                        "final_eval_loss": total_loss,
                        "final_eval_tokens": evaluated_on_tokens,
                        },
                        step=global_step,
                    )
                logger.info(f"Eval loss at step {update_step}: {total_loss}")

                logger.info(f"Eval tokens at step {update_step}: {evaluated_on_tokens}")
                eval_loss_list.append(total_loss)
                eval_tokens_list.append(evaluated_on_tokens)

            lr = optimizer.param_groups[0]["lr"]
            tokens_in_update = tokens_seen - tokens_seen_before
            tokens_seen_before = tokens_seen
            batches_in_update = args.gradient_accumulation * world_size

            if global_rank == 0:
                wandb.log({
                    "loss": loss.item(),
                    "lr": lr,
                    "update_step": update_step,
                    "tokens_seen": tokens_seen,
                    "throughput_tokens": tokens_in_update / update_time,
                    "throughput_examples": args.total_batch_size / update_time,
                    "throughput_batches": batches_in_update / update_time,
                    },
                    step=global_step,
                )
                logger.info({
                    "loss": loss.item(),
                    "update_step": update_step,
                    "lr": lr,
                    })
            # Restart the timer for the next update.
            update_time = time.time()

        if flag:
            break

    # ##############################
    # END of training loop
    # ##############################
    logger.info(f"Token seen: {tokens_seen}")
    logger.info("Training finished")
    if global_rank == 0: pbar.close()

    current_model_directory = f"{ckpt_path}/model_{update_step}"
    logger.info(f"Eval loss list:{eval_loss_list}")

    # Final checkpoint, unless the periodic save already wrote this step.
    if global_rank == 0 and not os.path.exists(current_model_directory):
        logger.info(f"Saving model and optimizer to {current_model_directory}, update step {update_step}")
        os.makedirs(ckpt_path, exist_ok=True)
        raw_model.save_pretrained(current_model_directory)

        optimizer_checkpoint = {
            "optimizer": optimizer.state_dict(),
            "scheduler": scheduler.state_dict(),
            "update_step": update_step,
            "global_step": global_step,
            "config": run_config,
            "wandb": wandb.run.dir,
            "dtype": args.dtype,
        }
        torch.save(optimizer_checkpoint, f"{current_model_directory}/optimizer.pt")

        training_state_checkpoint = {
            "global_step": global_step,
            "update_step": update_step,
            "tokens_seen": tokens_seen,
            "tokens_seen_before": tokens_seen_before,
            "update_time": update_time,
        }
        with open(f"{current_model_directory}/training_state.json", "w") as f:
            json.dump(training_state_checkpoint, f, indent=4)

    # Final evaluation
    logger.info("Running final evaluation")
    model.eval()
    # Drop training-only objects so their GPU memory can be released before evaluating.
    del loss, optimizer, scheduler
    import gc; gc.collect()
    torch.cuda.empty_cache()

    total_loss, evaluated_on_tokens = evaluate_model(
        model, preprocess_batched, pad_idx, global_rank, world_size, device, args.batch_size, args
    )

    if global_rank == 0:
        wandb.log({
            "final_eval_loss": total_loss,
            "final_eval_tokens": evaluated_on_tokens,
            },
            step=global_step,
        )
        logger.info({
            "final_eval_loss": total_loss,
            "final_eval_tokens": evaluated_on_tokens,
            })
        logger.info(f"Final eval loss: {total_loss}")

    logger.info("Script finished successfully")

    print(f"Rank {global_rank} finished successfully")


if __name__ == "__main__":
    # Script entry point: parse sys.argv (parse_args(None)) and start training.
    print("Starting script")
    main(parse_args(None))