import os
import time
import json
import random
import argparse
import numpy as np
import re
import pickle
import glob

import torch
import torch.nn as nn
import torch.utils.data
import torch.distributed as dist

import transformers
from transformers import AutoConfig, AutoTokenizer, AutoModelForCausalLM
from transformers import LlamaForCausalLM as HF_LlamaForCausalLM

import datasets
import datasets.distributed
import wandb

from tqdm import tqdm
from loguru import logger
import fnmatch
from peft_pretraining import training_utils, args_utils
from peft_pretraining.dataloader import PreprocessedIterableDataset
from peft_pretraining.modeling_llama import LlamaForCausalLM

# import bitsandbytes as bnb
from galore_torch import GaLoreAdamW, GaLoreAdamW8bit, GaLoreAdafactor

from peft import LoraConfig, get_peft_model
from utils import *
import itertools
import sys
import traceback
from datetime import datetime
import hashlib

from dst_scheduler import DSTScheduler
class DualWriter:
    """Minimal file-like object that mirrors written text to the console and a file.

    Only ``write`` and ``flush`` are provided, which is what ``print`` and
    stream-based logging sinks call on a text stream.
    """

    def __init__(self, file):
        """Store the log file and the interpreter's original stdout.

        Args:
            file: An open, writable text file object that receives a copy of
                every message.
        """
        self.file = file
        # sys.__stdout__ rather than sys.stdout: the writer keeps reaching the
        # real console even when it is itself installed as sys.stdout.
        self.console = sys.__stdout__

    def write(self, message):
        """Write ``message`` to the console and, while it is open, to the file."""
        self.console.write(message)
        # Same guard as flush(): once the log file has been closed, keep
        # echoing to the console instead of raising ValueError.
        if not self.file.closed:
            self.file.write(message)
            self.file.flush()  # push every message to disk immediately

    def flush(self):
        """Flush the console and, while it is open, the file."""
        self.console.flush()
        if not self.file.closed:
            self.file.flush()

#################################################
def completed(run_name, steps, mapping=None):
    """Report whether a finished run matching ``run_name`` is logged under ``tmp/``.

    A log counts as finished when it contains the final-evaluation line
    ``update_step: <steps + 1>, {'final_eval_loss': <float>,``.

    Args:
        run_name: Substring identifying the run. It is matched against the
            log file names (``mapping is None``) or against the keys of
            ``mapping``.
        steps: Number of update steps the run was configured for; the
            final-evaluation line is expected at ``steps + 1``.
        mapping: Optional dict mapping a descriptive log name to the stem of
            the file that actually holds the log (``tmp/<stem>.txt``). When
            ``None``, every ``.txt`` file in ``tmp/`` whose name contains
            ``run_name`` is inspected instead.

    Returns:
        ``True`` if at least one matching log contains the final-evaluation
        line, otherwise ``False`` (including when ``tmp/`` does not exist).
    """
    # One pattern for any step count: "update_step: N+1, {'final_eval_loss': <float>,"
    pattern = r"update_step: %d, \{'final_eval_loss': (\d+\.\d+)," % (int(steps) + 1)

    if mapping is None:
        try:
            filenames = os.listdir("./tmp")
        except (FileNotFoundError, NotADirectoryError):
            # No log directory means nothing has been run yet.
            return False
        candidates = [
            f"tmp/{filename}"
            for filename in filenames
            if filename.endswith('.txt') and (run_name in filename)
        ]
    else:
        candidates = [
            f"tmp/{hash_name}.txt"
            for filename, hash_name in mapping.items()
            if run_name in filename
        ]

    for full_path in candidates:
        if not os.path.isfile(full_path):
            continue
        with open(full_path, 'r', encoding='utf-8') as f:
            if re.search(pattern, f.read()):
                return True

    return False
#################################################


def eval_zero_shot(model_name, task_list=["qqp","rte","mnli","mrpc","sst2","cola", "qnli", "stsb"], 
        num_fewshot=0, use_accelerate=True, add_special_tokens=False):
    """Evaluate ``model_name`` with lm_eval's ``simple_evaluate`` and print a table.

    Args:
        model_name: Model identifier placed into the ``pretrained=`` model
            argument. Names containing "70b" or "65b" are evaluated with a
            limit of 2000.
        task_list: fnmatch-style patterns matched against ``tasks.ALL_TASKS``.
        num_fewshot: Forwarded to ``simple_evaluate``.
        use_accelerate: When true, the model arguments request accelerate with
            an "auto" device map (and do not include ``cache_dir``); otherwise
            they point ``cache_dir`` at ``./llm_weights``.
        add_special_tokens: Accepted but not used by this function.

    Returns:
        The results object returned by ``evaluator.simple_evaluate``.
    """
    from lm_eval import tasks, evaluator

    # Expand the patterns into the set of concrete task names they match.
    matched = {
        hit
        for pattern in task_list
        for hit in fnmatch.filter(tasks.ALL_TASKS, pattern)
    }
    task_names = list(matched)

    if use_accelerate:
        model_args = f"pretrained={model_name},use_accelerate=True,device_map_option=\"auto\""
    else:
        model_args = f"pretrained={model_name},cache_dir=./llm_weights"

    is_very_large = "70b" in model_name or "65b" in model_name
    limit = 2000 if is_very_large else None

    eval_kwargs = dict(
        model="hf-causal-experimental",
        model_args=model_args,
        tasks=task_names,
        num_fewshot=num_fewshot,
        batch_size=None,
        max_batch_size=None,
        device=None,
        no_cache=True,
        limit=limit,
        description_dict={},
        decontamination_ngrams_path=None,
        check_integrity=False,
        write_out=False,
        output_base_path=None,
    )
    results = evaluator.simple_evaluate(**eval_kwargs)

    for line in ("********************************", "zero_shot evaluation results"):
        print(line)
    print(evaluator.make_table(results))
    return results

# Custom function to log uncaught exceptions globally
def log_uncaught_exceptions(exc_type, exc_value, exc_traceback):
    """Print an exception's formatted traceback and log it as an error.

    Takes the same three arguments as ``sys.excepthook``.

    Args:
        exc_type: Exception class.
        exc_value: Exception instance.
        exc_traceback: Traceback object associated with the exception.
    """
    formatted_lines = traceback.format_exception(exc_type, exc_value, exc_traceback)
    report = "Uncaught Exception:\n" + "".join(formatted_lines)

    print(report)         # stack trace on stdout
    logger.error(report)  # same stack trace through the loguru logger


# Restrict the transformers library's own log output to error level.
transformers.logging.set_verbosity_error()

def _run_name_suffix(args):
    """Return the hyper-parameter suffix that ``parse_args`` appends to ``args.run_name``.

    The function only reads ``args``; in particular ``args.remove_method`` is
    never modified (the "soft" temperature range is added to a local label).
    """
    parts = [
        f"_step_{args.num_training_steps}_lr_{args.lr}_ui_{args.update_interval}_s_{args.sparsity}_z_{args.zeta}_scheduler_{args.scheduler}"
    ]

    if args.dst_scheduler:
        if args.static_dst:
            parts.append("_staticdst")
        else:
            if args.WS:
                parts.append(f"_WS_{args.ws_beta}")
            if args.WS3:
                parts.append(f"_WS3_{args.delta}_{args.degree_dist}")

            # "soft" removal methods carry their temperature range in the name.
            remove_label = args.remove_method
            if "soft" in remove_label:
                remove_label += f"start_T_{args.start_T}_end_T_{args.end_T}"

            if args.gmp:
                parts.append(f"_gmp_{args.granet_init_sparsity}_{args.sparsity_distribution}")
            else:
                if args.granet:
                    parts.append(
                        f"_granet_{args.granet_init_sparsity}_{args.sparsity_distribution}_{args.pruning_method}_{args.pruning_scheduler}_{args.pruning_T_end}"
                    )
                parts.append(f"_rg_{args.regrow_method}_rm_{remove_label}_cr_{args.chain_removal}")

                if args.EM_S:
                    parts.append("_EM_S")
                elif args.adaptive_zeta:
                    parts.append("_az")

                if args.adaptive_shield:
                    parts.append(f"_adaptive_shield_thre_{args.shield_threshold}")
                elif args.save_new:
                    parts.append(f"_save_new_steps_{args.saving_steps}")

    if args.history_weights:
        parts.append("_new_his" if args.new_history_weights else "_his")

    if args.sltrain:
        parts.append(f"_sltrain_{args.rank}_{args.lora_alpha}")
        if args.cola:
            parts.append("_cola")
        if args.communication:
            parts.append("_commu")
        if args.act_lora:
            parts.append("_actlora")
        if args.evolution:
            parts.append("_evol_" + args.evolution_strategy + "_ee" + str(args.evolution_every))
        if args.evolution_initialization:
            parts.append("_einit_" + args.evolution_strategy)
        if args.aloss is not None:
            parts.append(f"_aloss_{args.aloss}_alw{args.aloss_weight}")

    if args.onebitlowranktrain:
        parts.append(f"_1bitlowranktrain_{args.rank}_{args.lora_alpha}")

    return "".join(parts)


def parse_args(args):
    """Parse the command line, validate option combinations and build the run name.

    Args:
        args: Sequence of argument strings handed to
            ``ArgumentParser.parse_args`` (``None`` makes argparse read
            ``sys.argv``).

    Returns:
        The namespace returned by ``args_utils.check_args_torchrun_main``.
        Before that call, ``lora_alpha`` falls back to ``rank`` when it is
        unset or non-positive, and ``run_name`` is extended with a suffix
        describing the selected options (the final name is printed).

    Raises:
        SystemExit: If ``--communication`` or ``--evolution`` is given
            without ``--sltrain`` (after printing a message), or on argparse
            errors.
    """
    parser = argparse.ArgumentParser()

    parser.add_argument('--run_name', type=str, required=True)
    parser.add_argument("--model_config", type=str, required=True)
    parser.add_argument("--dataset_name", type=str, required=True)
    parser.add_argument("--use_hf_model", default=False, action="store_true")
    parser.add_argument("--continue_from", type=str, default=None)
    parser.add_argument("--batch_size", type=int, required=True)
    parser.add_argument("--gradient_accumulation", type=int, default=None)
    parser.add_argument("--total_batch_size", type=int, default=None)
    parser.add_argument("--max_length", type=int, default=256)
    parser.add_argument("--optimizer", default="Adam")
    parser.add_argument("--lr", type=float, default=1e-4)
    parser.add_argument("--min_lr_ratio", type=float, default=0.1)
    parser.add_argument("--activation_checkpointing", action="store_true")
    parser.add_argument("--weight_decay", type=float, default=0.0)
    parser.add_argument("--warmup_steps", type=int, default=1_000)
    parser.add_argument("--eval_every", type=int, default=5_000)
    parser.add_argument("--num_training_steps", type=int, default=10_000,
                        help="Number of **update steps** to train for. "
                             "Notice that gradient accumulation is taken into account.")
    parser.add_argument("--max_train_tokens", type=training_utils.max_train_tokens_to_number, default=None,
                        help="Number of tokens to train on. Overwrites num_training_steps. "
                             "You can use M and B suffixes, e.g. 100M or 1B.")
    parser.add_argument("--save_every", type=int, default=10_000)
    parser.add_argument("--save_dir", type=str, default=None)
    parser.add_argument("--tags", type=str, default=None)
    parser.add_argument("--dtype", type=str, default="bfloat16" if torch.cuda.is_bf16_supported() else "float32")
    parser.add_argument("--workers", type=int, default=8)
    parser.add_argument("--seed", type=int, default=0)
    parser.add_argument("--name", type=str, default="test")
    parser.add_argument("--grad_clipping", type=float, default=0.0)
    parser.add_argument("--scheduler", type=str, default="cosine_restarts", choices=["linear", "cosine", "cosine_restarts"])
    # beta1 for adafactor
    parser.add_argument("--beta1", type=float, default=0.0)

    # DST parameters
    parser.add_argument("--dst_scheduler", default=False, action="store_true")
    parser.add_argument("--zeta", type=float, default=0.3)
    parser.add_argument("--update_interval", type=int, default=200)
    parser.add_argument("--sparsity", type=float, default=0.99)
    parser.add_argument("--remove_method", type=str, default="weight_magnitude")
    parser.add_argument("--regrow_method", type=str, default="random")
    parser.add_argument("--init_mode", type=str, default="xavier")
    parser.add_argument("--chain_removal", action="store_true")
    parser.add_argument("--T_decay", default="no_decay", type=str)
    parser.add_argument("--adaptive_zeta", action="store_true")
    parser.add_argument("--WS", action="store_true")
    parser.add_argument("--ws_beta", default=0.25, type=float)
    parser.add_argument("--itop", action="store_true", help="activate ITOP analysis")
    parser.add_argument("--EM_S", action="store_true", help="EM_S solution")
    parser.add_argument("--factor", type=float, default=0.01)
    parser.add_argument("--granet", action="store_true", help="granet solution")
    parser.add_argument("--granet_init_sparsity", default=0.9, type=float)
    parser.add_argument("--granet_init_step", default=0, type=int)
    parser.add_argument("--ssam", action="store_true", help="ssam solution")
    parser.add_argument("--history_weights", action="store_true", help="save the weights of the model after each evolutionary epoch")
    parser.add_argument("--new_history_weights", action="store_true", help="save the weights of the model after each evolutionary epoch")
    parser.add_argument("--gmp", action="store_true")
    parser.add_argument("--pruning_scheduler", type=str, default="none", help="none, linear, granet, s_shape")
    parser.add_argument("--pruning_method", type=str, default="none", help="ri, weight_magnitude, MEST")
    parser.add_argument("--sparsity_distribution", type=str, default="uniform", help="uniform, non-uniform")
    parser.add_argument("--early_stop", action="store_true")
    parser.add_argument("--pruning_T_end", default=None, type=float, help="pruning T end")
    parser.add_argument("--save_new", action="store_true")
    parser.add_argument("--saving_steps", default=10, type=float)
    parser.add_argument("--adaptive_shield",  action="store_true")
    parser.add_argument("--shield_threshold", default=0.05, type=float)
    parser.add_argument("--start_T", default=1, type=float)
    parser.add_argument("--end_T", default=1, type=float)

    parser.add_argument("--sltrain", action="store_true")
    parser.add_argument("--onebitlowranktrain", action="store_true")
    parser.add_argument("--lora_alpha", type=float, default=None)
    parser.add_argument("--lora_dropout", type=float, default=0.)
    parser.add_argument("--train_scaling", default=False, action="store_true")
    parser.add_argument("--rank", type=int, default=128)

    ####################################
    parser.add_argument("--communication", default=False, action="store_true")
    parser.add_argument("--evolution", default=False, action="store_true")
    parser.add_argument("--evolution_initialization", default=False, action="store_true")
    parser.add_argument("--evolution_strategy", type=str, default="sparse", choices=["sparse", "lora"], help="Strategy for evolution step")
    parser.add_argument("--evolution_every", type=int, default=1000, help="How often to perform evolution step")
    parser.add_argument("--aloss", type=str, default=None, choices=["orthogonal_w", "orthogonal_wx", "alignment_wx"])
    parser.add_argument("--aloss_weight", type=float, default=1, help="Weight for additional loss")
    parser.add_argument("--act_lora", default=False, action="store_true", help="Use activation for lora part in sllinear")

    parser.add_argument("--static_dst", default=False, action="store_true")
    parser.add_argument("--no_compute_similarity", default=False, action="store_true")
    parser.add_argument("--cola", default=False, action="store_true")
    ####################################

    # disable ddp, single_gpu
    parser.add_argument("--single_gpu", default=False, action="store_true")
    parser.add_argument("--no_log", default=False, action="store_true")
    parser.add_argument("--no_decay", default=False, action="store_true")
    parser.add_argument("--iterative_warmup_steps", type=int, default=0)
    parser.add_argument("--log_to_file", default=False, action="store_true")
    parser.add_argument("--only_save_last", default=False, action="store_true")
    parser.add_argument("--save_path", type=str, default=None)

    # DNM
    parser.add_argument("--DNM", action="store_true")
    parser.add_argument("--cross", action="store_true")
    # parser.add_argument("--BA", action="store_true")
    parser.add_argument("--random_rewiring", type=float, default=1.0)
    parser.add_argument("--M", type=int, help="number of dendrites")
    #parser.add_argument("--M_std", type=int, help="standard deviation of the gaussian distribution in case M_dist=gaussian or M_dist=spatial_gaussian or M_dist=spatial_inversegaussian")
    #parser.add_argument("--M_spread", type=int, help="spread of the uniform distribution of the number of dendrites in case M_dist=uniform")
    #parser.add_argument("--gamma_std", type=int, help="standard deviation of the distribution of window sizes in case gamma_dist=gaussian or gamma_dist=spatial_gaussian or gamma_dist=spatial_inversegaussian")
    #parser.add_argument("--degree_std", type=int, help="standard deviation of the distribution of output nodes degrees")
    #parser.add_argument("--degree_spread", type=int, help="spread of the degree of output nodes")
    #parser.add_argument("--synaptic_std", type=int, help="standard deviation of the distribution of synapses across sendrites")
    #parser.add_argument("--synaptic_spread", type=int, help="spread of the distribution of synapses across sendrites")
    parser.add_argument("--M_dist", type=str, help="Distribution of the number of dendrites among neurons", choices=["fixed", "gaussian", "uniform", "spatial_gaussian", "spatial_inversegaussian"], default="fixed")
    parser.add_argument("--gamma", type=float, help="dendritic spreading parameter")
    parser.add_argument("--gamma_dist", type=str, default = "fixed", choices=["fixed", "gaussian", "uniform", "spatial_gaussian", "spatial_inversegaussian"])
    parser.add_argument("--degree_dist", type=str, default = "fixed", choices=["fixed", "gaussian", "uniform", "spatial_gaussian", "spatial_inversegaussian"])
    parser.add_argument("--synaptic_dist", type=str, help="distribution of the nodes in the synapses", default="fixed", choices=["fixed", "gaussian", "uniform", "spatial_gaussian", "spatial_inversegaussian"])
    parser.add_argument("--BHI", action="store_true", help="Bipartite Hyperbolic Initialisation")
    parser.add_argument("--CWS", action="store_true", help="Cannistraci Watts Strogatz")
    parser.add_argument("--WS1", action="store_true", help="First variation of WS initialisation")
    parser.add_argument("--WS2", action="store_true", help="Second variation of WS initialisation")
    parser.add_argument("--WS3", action="store_true", help="Third variation of WS initialisation")
    parser.add_argument("--delta", type=float, help="δ locality parameter (0 ≤ δ ≤ 1)")
    parser.add_argument("--delta_dist", type=str, help="distribution of δ over the neurons", default="fixed")
    parser.add_argument("--sigma_x", type=float, help="standard deviation over the x")
    parser.add_argument("--sigma_y", type=float, help="standard deviation over the y")
    parser.add_argument("--rho", type=float, help="correlation")
    parser.add_argument("--QHI", action="store_true", help="General Hyperbolic Initialisation")
    parser.add_argument("--BHI_T", type=float, default=0.0, help="nPSO temperature. If 0: purely greedy by hyperbolic distance; >0: probabilistic sampling.")
    parser.add_argument("--BHI_gamma", type=float, default=2.0, help="Power-law exponent gamma controlling the radial coordinate dynamics in nPSO.")
    parser.add_argument("--BHI_distr", type=int, default=0, help=(
        "angular coordinate distribution:"
        "  0                uniform on [0,2π) (PSO model)"
        "  C>0              integer # of equidistant GaussianMixture components"
        "  GaussianMixture  sklearn GaussianMixture object for custom GM"
        "  (angles, probs, centers)  3-tuple of lists for fully custom mixture"
        )
    )
    parser.add_argument("--rewire_mode", type=str, choices=["none", "uniform", "random"], default="none", help=(
        "Optional bipartite rewiring after nPSO wiring: "
        "'none' = keep original edges; "
        "'uniform' = redistribute B-endpoint degrees as evenly as possible; "
        "'random' = reassign B endpoints at random (preserving A-degrees)."
        )
    )
    parser.add_argument("--degree_allocation", action="store_true", help= "Enable the spatial-sorting degree allocation strategy: for each bipartite block, compute each neuron's degree and then reorder them so that the highest-degree neuron sits at the center, the next two occupy the centers of the two halves, the next four the centers of the four quarters, and so on. For the third sandwich only the input side is sorted (the final output layer remains unconstrained).")

    args = parser.parse_args(args)

    # --communication and --evolution are only implemented on top of --sltrain.
    if args.communication and (not args.sltrain):
        print("Communication is only supported for sltrain")
        sys.exit()
    if args.evolution and (not args.sltrain):
        print("Evolution is only supported for sltrain")
        sys.exit()
    if args.lora_alpha is None or args.lora_alpha <= 0:
        args.lora_alpha = args.rank

    # The suffix is computed without touching args.remove_method, so no
    # save/restore of the original value is needed afterwards.
    args.run_name += _run_name_suffix(args)

    print(args.run_name)
    args = args_utils.check_args_torchrun_main(args)
    return args


@torch.no_grad()
def evaluate_model(model, preprocess_batched, pad_idx, global_rank, world_size, device, batch_size, dataset_name, train_valid_data):
    """Compute the mean validation loss of ``model`` across all ranks.

    NOTE: the dataset and the single-GPU switch are read from the
    module-level ``args`` (``args.dataset_name``, ``args.single_gpu``); the
    ``dataset_name`` parameter is currently not consulted.

    Args:
        model: Model called as ``model(**batch, labels=labels)``; its ``.loss``
            is accumulated.
        preprocess_batched: Batched map function applied to the raw
            validation examples.
        pad_idx: Token id treated as padding; those label positions are set
            to -100 and excluded from the token count.
        global_rank: Rank of this process, used to shard the validation data.
        world_size: Number of processes; used for sharding, for scaling the
            token count and for averaging the gathered losses.
        device: Device the batches and the loss accumulator are moved to.
        batch_size: Batch size handed to ``training_utils.batch_fn``.
        dataset_name: Unused (see note above).
        train_valid_data: Mapping whose ``'test'`` split is used when the
            dataset is "openwebtext".

    Returns:
        ``(total_loss, evaluated_on_tokens)``: the loss averaged over batches
        and ranks as a Python float, and the local non-pad token count
        multiplied by ``world_size``.

    Raises:
        ValueError: If ``args.dataset_name`` is not one of "openwebtext",
            "c4" or "c4_ant".
    """
    _time = time.time()
    if args.dataset_name == "openwebtext":
        val_data = train_valid_data['test']
        remove_columns = ['text']
    elif args.dataset_name == "c4":
        # Retry until the streaming dataset can be opened.
        while True:
            try:
                val_data = datasets.load_dataset("c4", "en", split="validation", streaming=True, trust_remote_code=True)
                break
            except Exception as e:
                print(f"Error loading validation dataset: {e}")
                time.sleep(5)
        remove_columns = ["text", "timestamp", "url"]
    elif args.dataset_name == "c4_ant":
        val_files = glob.glob(f"/modelopsnas/modelops/463248/hf_cache/datasets/c4_json/c4/en/*validation*.json.gz")
        val_data = datasets.load_dataset("json", data_files=val_files, split="train", streaming=False)
        remove_columns = ["text", "timestamp", "url"]
    else:
        # Previously this fell through and failed later on an unbound val_data.
        raise ValueError(f"Unsupported dataset for evaluation: {args.dataset_name}")

    val_data = val_data.shuffle(seed=42)
    logger.info(f"Loaded validation dataset in {time.time() - _time:.2f} seconds")

    if not args.single_gpu:
        val_data = datasets.distributed.split_dataset_by_node(val_data, rank=global_rank, world_size=world_size)

    val_data_mapped = val_data.map(
        preprocess_batched,
        batched=True,
        remove_columns=remove_columns,
    )

    target_eval_tokens = 10_000_000
    evaluated_on_tokens = 0
    total_loss = torch.tensor(0.0).to(device)
    # NOTE(review): the counter starts at 1, so the mean below divides by
    # (number of batches + 1). Left unchanged so that reported losses stay
    # comparable with earlier runs — confirm before changing.
    total_batches = 1
    logger.info(f"Eval set prepared in {time.time() - _time:.2f} seconds")

    for batch in training_utils.batch_fn(val_data_mapped, batch_size):
        if evaluated_on_tokens > target_eval_tokens:
            break
        total_batches += 1

        batch = {k: v.to(device) for k, v in batch.items()}
        labels = batch["input_ids"].clone()
        labels[labels == pad_idx] = -100  # padding does not contribute to the loss
        loss = model(**batch, labels=labels).loss
        total_loss += loss.detach()

        # Local non-pad tokens scaled by world_size.
        evaluated_on_tokens += (batch["input_ids"] != pad_idx).sum().item() * world_size

    total_loss = total_loss / total_batches

    # Gather the per-rank means and average them.
    gathered_losses = [torch.zeros_like(total_loss) for _ in range(world_size)]
    dist.all_gather(gathered_losses, total_loss)
    total_loss = sum(t.item() for t in gathered_losses) / world_size

    return total_loss, evaluated_on_tokens


def main(args):
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)
    random.seed(args.seed)

    assert "LOCAL_RANK" in os.environ, "torchrun should set LOCAL_RANK"
    global_rank = int(os.environ['RANK'])
    local_rank = int(os.environ["LOCAL_RANK"])
    world_size = int(os.environ["WORLD_SIZE"])
    torch.cuda.set_device(local_rank)
    logger.info(f"world_size: {world_size}")
    logger.info(f"Global rank {global_rank}, local rank {local_rank}, device: {torch.cuda.current_device()}")

    dist.init_process_group(backend="nccl", rank=global_rank, world_size=world_size)

    logger.info("Process group initialized")
    device = f"cuda:{local_rank}"

    if args.total_batch_size is not None:
        if args.gradient_accumulation is None:
            assert args.total_batch_size % world_size == 0, "total_batch_size must be divisible by world_size"
            args.gradient_accumulation = args.total_batch_size // (args.batch_size * world_size)
            assert args.gradient_accumulation > 0, "gradient_accumulation must be greater than 0"

    assert args.gradient_accumulation * args.batch_size * world_size == args.total_batch_size, \
        "gradient_accumulation * batch_size * world_size must be equal to total_batch_size"

    # turn off logger
    if global_rank != 0: logger.remove()

    model_name = os.path.splitext(os.path.basename(args.model_config))[0]
    
    #################################################
    mapping_file = "tmp/mapping.json"
    if os.path.exists(mapping_file):
        with open(mapping_file, 'r') as f:
            mapping = json.load(f)
    else:
        mapping = {}
        with open(mapping_file, 'w') as f:
            json.dump(mapping, f)
    if completed(f"galore-{args.dataset_name}-{model_name}_{args.run_name}_2025-", args.num_training_steps, mapping):
        print("Runned before")
        exit(0)
    #################################################
        
    # initialize wandb without config (it is passed later)
    if global_rank == 0:
        wandb.init(project=f"galore-{args.dataset_name}", name=f"{model_name}_{args.run_name}", mode="disabled" if args.no_log else "online")

    if args.log_to_file is True and global_rank == 0:
        now = datetime.now()
        date_time_str = now.strftime("%Y-%m-%d_%H-%M-%S")
        log_file_name = f'tmp/galore-{args.dataset_name}-{model_name}_{args.run_name}_{date_time_str}'

        ###############################################################
        real_log_file_name = f'galore-{args.dataset_name}-{model_name}_{args.run_name}_{date_time_str}'
        hash_name = hashlib.md5(real_log_file_name.encode()).hexdigest()
        mapping[real_log_file_name] = f'{date_time_str}_{hash_name}'
        with open(mapping_file, 'w') as f:
            json.dump(mapping, f)
        log_file_name = f'tmp/{date_time_str}_{hash_name}'
        ###############################################################

        log_file = open(f"{log_file_name}.txt", 'w')
        sys.stdout = DualWriter(log_file)
        logger.remove()  # Remove default handler
        logger.add(sys.stdout, format="{time} {level} {message}", level="INFO")  # Add logging to console
        logger.add(f"{log_file_name}.log", format="{time} {level} {message}", level="INFO",
                   rotation="10 MB")  # Add logging to file
        sys.excepthook = log_uncaught_exceptions
        
    logger.info(f"Using dist with rank {global_rank} (only rank 0 will log)")
    logger.info("*" * 40)
    logger.info(f"Starting training with the arguments")
    for k, v in vars(args).items():
        logger.info(f"{k:30} {v}")
    logger.info("*" * 40)

    train_valid_data = None

    if args.dataset_name == "openwebtext":
        train_valid_data = datasets.load_dataset('openwebtext', split='train', trust_remote_code=True)
        # train_valid_data = datasets.load_dataset('../../dataset/openwebtext', split='train')

        train_valid_data = train_valid_data.train_test_split(test_size=0.05, seed=args.seed)
        data = train_valid_data["train"]
    elif args.dataset_name == "c4":
        data = datasets.load_dataset("c4", "en", split="train", trust_remote_code=True)
    #################################################################
    elif args.dataset_name == "c4_ant":
        train_files = glob.glob(f"/modelopsnas/modelops/463248/hf_cache/datasets/c4_json/c4/en/*train*.json.gz")
        data = datasets.load_dataset("json", data_files=train_files, split="train", streaming=False)
    #################################################################
    seed_for_shuffle = 42
    
    logger.info(f"Shuffling data with seed {seed_for_shuffle}")
    data: datasets.Dataset = data.shuffle(seed=seed_for_shuffle)
    if not args.single_gpu:
        data = datasets.distributed.split_dataset_by_node(
            data, rank=global_rank, world_size=world_size,
        )

    # it doesn't matter which tokenizer we use, because we train from scratch
    # T5 tokenizer was trained on C4 and we are also training on C4, so it's a good choice

    while True:
        try:
            tokenizer = AutoTokenizer.from_pretrained("t5-base", model_max_length=args.max_length, streaming=True, trust_remote_code=True)
            break
        except Exception as e:
            print(f"Error loading tokenizer: {e}")
            time.sleep(5)
    # tokenizer = AutoTokenizer.from_pretrained("t5-base", model_max_length=args.max_length, streaming=True, trust_remote_code=True)

    def preprocess_batched(batch):
        batch = tokenizer(
            batch["text"],
            max_length=args.max_length,
            truncation=True,
            padding="max_length",
            return_tensors="pt",
        )
        return batch


    dataset = PreprocessedIterableDataset(data, tokenizer, batch_size=args.batch_size, max_length=args.max_length)
    dataloader = torch.utils.data.DataLoader(dataset, batch_size=None, num_workers=args.workers)

    model_config = AutoConfig.from_pretrained(args.model_config)
    if args.use_hf_model:
        model: HF_LlamaForCausalLM = AutoModelForCausalLM.from_config(model_config)
    else:
        model = LlamaForCausalLM(model_config)


    # if args.activation_checkpointing:
    #     model.gradient_checkpointing_enable()

    global_step = 0
    update_step = 0
    beginning_step = 0
    tokens_seen = 0
    tokens_seen_before = 0


    if args.sltrain or args.onebitlowranktrain:
        model = build_slmodel(model, args)
    
    if args.dtype in ["bf16", "bfloat16"]:
        model = model.to(device=device, dtype=torch.bfloat16)
    else:
        model = model.to(device=device)




    n_total_params = sum(p.numel() for p in model.parameters())
    trainable_params = [p for p in model.parameters() if p.requires_grad]
    # Initialize wandb
    run_config = dict(vars(args))
    run_config.update({
        "max_lr": run_config.pop("lr"),  # rename lr to max_lr to avoid conflicts with scheduler
        "total_params_M": n_total_params / 1_000_000,
        "dataset": args.dataset_name,
        "model": model_config.to_dict(),
        "world_size": world_size,
        "device": str(device),
        "a_screen_id": os.environ.get('STY', 'Not in a screen session')
    })
    
    if global_rank == 0:
        wandb.config.update(run_config, allow_val_change=True)
        wandb.save(os.path.abspath(__file__), policy="now") # save current script
        # fix tqdm visual length to 80 so that the progress bar
        # doesn't jump around when changing from external display to laptop
        pbar = tqdm(total=args.num_training_steps - update_step, desc="Update steps", ncols=80)
    

    # print params and trainable params
    logger.info(f"\n{model}\n")
    logger.info(f"Total params: {sum(p.numel() for p in model.parameters()) / 1_000_000:.2f}M")
    logger.info(f"Trainable params: {sum(p.numel() for p in model.parameters() if p.requires_grad) / 1_000_000:.2f}M")
    
    if args.optimizer.lower() == "adam":
        optimizer = torch.optim.Adam(trainable_params, lr=args.lr, weight_decay=args.weight_decay)
    # implement sgd
    elif args.optimizer.lower() == "sgd":
        optimizer = torch.optim.SGD(trainable_params, lr=args.lr, weight_decay=args.weight_decay, momentum=args.beta1)
    
    # # 8-bit Adam
    # elif args.optimizer.lower() == "adam8bit":
    #     optimizer = bnb.optim.Adam8bit(trainable_params, lr=args.lr, weight_decay=args.weight_decay)
        
    else:
        raise ValueError(f"Optimizer {args.optimizer} not supported")


    scheduler = training_utils.get_scheculer(
        optimizer=optimizer,
        scheduler_type=args.scheduler,
        num_training_steps=args.num_training_steps,
        warmup_steps=args.warmup_steps,
        min_lr_ratio=args.min_lr_ratio,
        restart_warmup_steps=args.iterative_warmup_steps,
        cycle_length=args.update_interval,
        no_decay=args.no_decay,
    )


    T_end = int(args.num_training_steps*0.75)
    if args.pruning_T_end is None:
        args.pruning_T_end = T_end

    if args.dst_scheduler:
        pruner = DSTScheduler(model, optimizer, alpha=args.zeta, delta=args.update_interval, static_topo=args.static_dst, T_end=int(args.num_training_steps*0.75), ignore_linear_layers=False, sparsity_distribution=args.sparsity_distribution, grad_accumulation_n=args.gradient_accumulation, args=args)

    

    if args.continue_from is not None:
        logger.info("*" * 40)
        logger.info(f"Loading model from {args.continue_from}")
        checkpoint_path = os.path.join(args.continue_from, "all_model.pt")
        state_dict = torch.load(checkpoint_path, map_location="cpu")
        prefix = "module."
        state_dict = {key.replace(prefix, ""): value for key, value in state_dict.items()}
        model.load_state_dict(state_dict, strict=True)
        logger.info(f"Model successfully loaded (strict=True policy)")
        for n, m in model.named_modules():
            if isinstance(m, LoraLayer):
                print(f"{n}")
                print_lora(m, "reload")

        if os.path.exists(os.path.join(args.continue_from, "training_state.json")):
            logger.info(
                f"Loading training state like global_step, update_step, and tokens_seen from {args.continue_from}")
            with open(os.path.join(args.continue_from, "training_state.json")) as f:
                _old_state = json.load(f)
            global_step = _old_state["global_step"]
            update_step = _old_state["update_step"]
            tokens_seen = _old_state["tokens_seen"]
            tokens_seen_before = _old_state["tokens_seen_before"]
            logger.info(f"global_step       : {global_step}")
            logger.info(f"update_step       : {update_step}")
            logger.info(f"tokens_seen       : {tokens_seen}")
            logger.info(f"tokens_seen_before: {tokens_seen_before}")
            logger.info(f"Will train for {args.num_training_steps - update_step} update steps")
        else:
            logger.warning(f"Did not find training state in {args.continue_from}, global step will start from zero")

        optimizer_checkpoint = torch.load(os.path.join(args.continue_from, "optimizer.pt"), map_location="cpu")
        optimizer.load_state_dict(optimizer_checkpoint["optimizer"])
        pruner.load_state_dict(optimizer_checkpoint["dst_scheduler"])
        # update_step = optimizer_checkpoint["update_step"]
        # global_step = optimizer_checkpoint["global_step"]
        logger.info(f"Optimizer and scheduler restored from {args.continue_from}")
        logger.info("*" * 40)

    if not args.single_gpu:
        model: LlamaForCausalLM = torch.nn.parallel.DistributedDataParallel(
            model,
            device_ids=[local_rank],
            output_device=local_rank,
            broadcast_buffers=False,
        )

    # global steps and others are defined above
    pad_idx = tokenizer.pad_token_id
    update_time = time.time()
    model.module.generation_config.pad_token_id = pad_idx
    local_step = 0  # when continue_from is used, local_step != global_step
    
    if args.sltrain and args.evolution_initialization:
        print("Performing evolution step for initialization")
        pruner.evolution_step(args.evolution_strategy)
        print(f"Evolution step done: {args.evolution_strategy}")

    # ##############################
    # TRAINING LOOP
    # we'll never go through all the data, so no need for epochs
    # ##############################

    # @retry(stop=stop_after_attempt(10), wait=wait_exponential(multiplier=1, min=4, max=10))
    sim_dic = {"cos": {}, "mse": {}}
    for batch_idx, batch in enumerate(dataloader):

        global_step += 1
        local_step += 1

        if update_step > args.num_training_steps:
            logger.info(f"Reached max number of update steps (f{args.num_training_steps}). Stopping training.")
            print(f"Rank {global_rank} stopping training.")
            break

        batch = {k: v.to(device) for k, v in batch.items()}
        labels = batch["input_ids"].clone()
        labels[labels == pad_idx] = -100
        tokens_seen += (batch["input_ids"] != pad_idx).sum().item() * world_size

        loss = model(**batch, labels=labels).loss
        ####################################################
        if args.sltrain and (args.aloss is not None):
            if args.aloss == "orthogonal_w":
                new_loss = pruner.orthogonal_loss_w() * args.aloss_weight
                loss += new_loss
                print(f"Orthogonal loss (W): {new_loss.item()}")
            elif args.aloss == "orthogonal_wx":
                new_loss = pruner.orthogonal_loss_wx() * args.aloss_weight
                loss += new_loss
                print(f"Orthogonal loss (Wx): {new_loss.item()}")
            elif args.aloss == "alignment_wx":
                new_loss = pruner.alignment_loss_wx() * args.aloss_weight
                loss += new_loss
                print(f"Alignment loss (Wx): {new_loss.item()}")
            else:
                raise ValueError(f"Unknown additional loss {args.aloss}")
        ####################################################
        scaled_loss = loss / args.gradient_accumulation
        scaled_loss.backward()

        if global_step % args.gradient_accumulation != 0:
            continue


        # The below code is only executed during the update step
        
        # add grad clipping
        if args.grad_clipping != 0.0: torch.nn.utils.clip_grad_norm_(trainable_params, args.grad_clipping)

        if global_rank == 0: pbar.update(1)
        
        if args.dst_scheduler:
            if pruner():
                optimizer.step()
        else:
            # for l, m in model.named_modules():
            #     if isinstance(m, onebitlowranklinear):
            #         m._update_weight()
            optimizer.step()
        scheduler.step()
        optimizer.zero_grad()



        update_step += 1
        update_time = time.time() - update_time
        
        
        if global_rank == 0 and (args.no_compute_similarity is False):
            compute_similarity(model, sim_dic)

        # save checkpoint by save_every
        if local_step > args.gradient_accumulation and update_step % args.save_every == 0 and global_rank == 0:
            if args.only_save_last is True:
                current_model_directory = args.save_dir
            else:
                current_model_directory = f"{args.save_dir}/model_{update_step}"
            logger.info(f"Saving model and optimizer to {current_model_directory}, update step {update_step}")
            os.makedirs(args.save_dir, exist_ok=True)

            torch.save(model.state_dict(), os.path.join(current_model_directory, "all_model.pt"))
            # model.module.save_pretrained(current_model_directory, max_shard_size='100GB')

            optimizer_checkpoint = {
                "optimizer": optimizer.state_dict(),
                "update_step": update_step,
                "global_step": global_step,
                "config": run_config,
                "wandb": wandb.run.dir,
                "dtype": args.dtype,
            }
            torch.save(optimizer_checkpoint, f"{current_model_directory}/optimizer.pt")

            training_state_checkpoint = {
                "global_step": global_step,
                "update_step": update_step,
                "tokens_seen": tokens_seen,
                "tokens_seen_before": tokens_seen_before,
                "update_time": update_time,
            }
            with open(f"{current_model_directory}/training_state.json", "w") as f:
                json.dump(training_state_checkpoint, f, indent=4)
                
            # save wandb related info
            wandb_info = {
                "wandb_id": wandb.run.id,
            }
            with open(f"{args.save_dir}/wandb.json", "w") as f:
                json.dump(wandb_info, f, indent=4)

        # evaluation
        if update_step % args.eval_every == 0:
            logger.info(f"Performing evaluation at step {update_step}")
            total_loss, evaluated_on_tokens = evaluate_model(
                model, preprocess_batched, pad_idx, global_rank, world_size, device, args.batch_size, args.dataset_name, train_valid_data
            )
            if global_rank == 0:
                wandb.log({
                    "final_eval_loss": total_loss,
                    "final_eval_tokens": evaluated_on_tokens,
                    },
                    step=update_step,
                )
                if args.no_log:
                    print(f"update_step: {update_step},", {
                        "final_eval_loss": total_loss,
                        "final_eval_tokens": evaluated_on_tokens,
                        })
            logger.info(f"Eval loss at step {update_step}: {total_loss}")

        lr = optimizer.param_groups[0]["lr"]

        tokens_in_update = tokens_seen - tokens_seen_before
        tokens_seen_before = tokens_seen
        batches_in_update = args.gradient_accumulation * world_size

        if global_rank == 0:
            wandb.log({
                "loss": loss.item(),
                "lr": lr,
                "update_step": update_step,
                "tokens_seen": tokens_seen,
                "throughput_tokens": tokens_in_update / update_time,
                "throughput_examples": args.total_batch_size / update_time,
                "throughput_batches": batches_in_update / update_time,
                },
                step=update_step,
            )
            if args.no_log:
                print(f"update_step: {update_step},", {
                    "loss": loss.item(),
                    "lr": lr,
                    "update_step": update_step,
                    "tokens_seen": tokens_seen,
                    "throughput_tokens": tokens_in_update / update_time,
                    "throughput_examples": args.total_batch_size / update_time,
                    "throughput_batches": batches_in_update / update_time,
                    })
        update_time = time.time()
        
        ################################################
        if args.sltrain and args.evolution and update_step % args.evolution_every == 0:
            pruner.evolution_step(args.evolution_strategy)
            print(f"Evolution step performed, Strategy {args.evolution_strategy}")
        ################################################

    # ##############################
    # END of training loop
    # ##############################
    
    if args.log_to_file is True and global_rank == 0:
        with open(f"{log_file_name}.pkl", "wb") as f:
            pickle.dump(sim_dic, f)
    
    logger.info("Training finished")
    if global_rank == 0: pbar.close()

    if args.only_save_last is True:
        current_model_directory = f"{args.save_dir}/"
    else:
        current_model_directory = f"{args.save_dir}/model_{update_step}"
    if global_rank == 0:
        # logger.info(f"Saving model and optimizer to {current_model_directory}, update step {update_step}")
        # os.makedirs(args.save_dir, exist_ok=True)
        model.module.save_pretrained(f"trained_model/{args.run_name}")
        tokenizer.save_pretrained(f"trained_model/{args.run_name}")
        # torch.save(model.state_dict(), os.path.join(current_model_directory, "all_model.pt"))

        # optimizer_checkpoint = {
        #     "optimizer": optimizer.state_dict(),
        #     "update_step": update_step,
        #     "global_step": global_step,
        #     "config": run_config,
        #     "wandb": wandb.run.dir,
        #     "dtype": args.dtype,
        # }
        # torch.save(optimizer_checkpoint, f"{current_model_directory}/optimizer.pt")

        training_state_checkpoint = {
            "global_step": global_step,
            "update_step": update_step,
            "tokens_seen": tokens_seen,
            "tokens_seen_before": tokens_seen_before,
            "update_time": update_time,
        }
        # with open(f"{current_model_directory}/training_state.json", "w") as f:
        #     json.dump(training_state_checkpoint, f, indent=4)

    # Final evaluation
    logger.info("Running final evaluation")
    model.eval()
    
    del loss, optimizer
    import gc; gc.collect()
    torch.cuda.empty_cache()

    total_loss, evaluated_on_tokens = evaluate_model(
        model, preprocess_batched, pad_idx, global_rank, world_size, device, args.batch_size, args.dataset_name, train_valid_data
    )

    if global_rank == 0:
        wandb.log({
            "final_eval_loss": total_loss,
            "final_eval_tokens": evaluated_on_tokens,
            },
            step=update_step,
        )
        if args.no_log:
            print(f"update_step: {update_step},", {
            "final_eval_loss": total_loss,
            "final_eval_tokens": evaluated_on_tokens,
            })
        logger.info(f"Final eval loss: {total_loss}")

    logger.info("Script finished successfully")
    print(f"Rank {global_rank} finished successfully")

    
    if args.log_to_file is True and global_rank == 0:
        sys.stdout = sys.__stdout__

        # Close the file
        log_file.close()


    
if __name__ == "__main__":
    # Script entry point: runs only when the file is executed directly,
    # not when it is imported as a module.
    print("Starting script")
    # Build the run configuration. `args` is bound at module scope here.
    # NOTE(review): passing None presumably makes parse_args read sys.argv --
    # confirm against the parse_args definition.
    args = parse_args(None)
    # Hand the parsed arguments to the training routine.
    main(args)
