# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import random
from collections import defaultdict
import copy
import json
import os
from os.path import exists, join, isdir
from dataclasses import dataclass, field
import sys
from typing import Optional, Dict, Sequence
import numpy as np
from tqdm import tqdm
import logging
import datasets
# import bitsandbytes as bnb
import pandas as pd
import importlib
from packaging import version
from read_json import sort_dict_by_value, print_dict, divide_dict
from packaging.version import parse

import torch
import transformers
from transformers.models.llama.modeling_llama import LlamaRMSNorm
from transformers.utils.import_utils import is_sagemaker_mp_enabled
from torch.nn.utils.rnn import pad_sequence
import argparse
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    set_seed,
    Seq2SeqTrainer,
    BitsAndBytesConfig,
    LlamaTokenizer, TrainerState, TrainerControl,
)
from transformers.modeling_utils import unwrap_model
from transformers.utils import is_peft_available
from transformers.models.auto.modeling_auto import MODEL_FOR_CAUSAL_LM_MAPPING_NAMES

from datasets import load_dataset, Dataset, load_from_disk
import evaluate

from peft import (
    prepare_model_for_kbit_training,
    LoraConfig,
    AdaLoraConfig,
    get_peft_model,
    PeftModel
)
from peft.tuners.lora import LoraLayer
from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR

def init_wandb(project_name):
    """Select the Weights & Biases project by exporting ``WANDB_PROJECT``.

    Args:
        project_name: Value stored in the ``WANDB_PROJECT`` environment
            variable of the current process.
    """
    os.environ.update({"WANDB_PROJECT": project_name})

def is_ipex_available():
    """Return True when a usable Intel Extension for PyTorch is installed.

    "Usable" means the ``intel_extension_for_pytorch`` package can be found,
    its distribution metadata is readable, and its ``major.minor`` version
    equals the ``major.minor`` version of the installed ``torch``.

    Returns:
        bool: ``False`` if the extension is missing or its version does not
        match torch (a warning is emitted in the mismatch case), else ``True``.
    """
    # Imported locally: ``import importlib`` alone does not guarantee that the
    # ``metadata`` / ``util`` submodules are loaded, and ``warnings`` is not
    # imported at module level.
    import importlib.metadata
    import importlib.util
    import warnings

    def get_major_and_minor_from_version(full_version):
        parsed = version.parse(full_version)
        return f"{parsed.major}.{parsed.minor}"

    # Cheap check first: without the package there is nothing to compare.
    if importlib.util.find_spec("intel_extension_for_pytorch") is None:
        return False
    try:
        _ipex_version = importlib.metadata.version("intel_extension_for_pytorch")
    except importlib.metadata.PackageNotFoundError:
        return False

    _torch_version = importlib.metadata.version("torch")
    torch_major_and_minor = get_major_and_minor_from_version(_torch_version)
    ipex_major_and_minor = get_major_and_minor_from_version(_ipex_version)
    if torch_major_and_minor != ipex_major_and_minor:
        warnings.warn(
            f"Intel Extension for PyTorch {ipex_major_and_minor} needs to work with PyTorch {ipex_major_and_minor}.*,"
            f" but PyTorch {_torch_version} is found. Please switch to the matching version and run again."
        )
        return False
    return True
    

# Allow TF32 for CUDA matmuls whenever a CUDA device is available.
if torch.cuda.is_available():
    torch.backends.cuda.matmul.allow_tf32 = True

# Module-level logger named after this module.
logger = logging.getLogger(__name__)

# NOTE(review): presumably the label id that the loss skips (-100 is the
# default ``ignore_index`` of PyTorch's cross-entropy) — confirm at use site.
IGNORE_INDEX = -100
# NOTE(review): presumably the pad token added to tokenizers that lack one —
# confirm at use site.
DEFAULT_PAD_TOKEN = "[PAD]"

@dataclass
class ModelArguments:
    """Command-line options that identify the base model to load."""

    # Optional[str] is equivalent to Union[str, None].
    model_name_or_path: Optional[str] = field(default="EleutherAI/pythia-12b")
    trust_remote_code: Optional[bool] = field(
        metadata={
            "help": "Enable unpickling of arbitrary code in AutoModelForCausalLM#from_pretrained."
        },
        default=False,
    )
    use_auth_token: Optional[bool] = field(
        metadata={
            "help": "Enables using Huggingface auth token from Git Credentials."
        },
        default=False,
    )

@dataclass
class DataArguments:
    """Command-line options describing the dataset and sequence-length limits."""

    eval_dataset_size: int = field(
        default=1024,
        metadata={"help": "Size of validation dataset."},
    )
    max_train_samples: Optional[int] = field(
        default=None,
        metadata={
            "help": (
                "For debugging purposes or quicker training, "
                "truncate the number of training examples to this value if set."
            )
        },
    )
    max_eval_samples: Optional[int] = field(
        default=None,
        metadata={
            "help": (
                "For debugging purposes or quicker training, "
                "truncate the number of evaluation examples to this value if set."
            )
        },
    )
    source_max_len: int = field(
        default=1024,
        metadata={
            "help": "Maximum source sequence length. Sequences will be right padded (and possibly truncated)."
        },
    )
    target_max_len: int = field(
        default=256,
        metadata={
            "help": "Maximum target sequence length. Sequences will be right padded (and possibly truncated)."
        },
    )
    dataset: str = field(
        default="alpaca",
        metadata={"help": "Which dataset to finetune on. See datamodule for options."},
    )
    dataset_format: Optional[str] = field(
        default=None,
        metadata={"help": "Which dataset format is used. [alpaca|chip2|self-instruct|hh-rlhf]"},
    )

@dataclass
class TrainingArguments(transformers.Seq2SeqTrainingArguments):
    """Training options layered on top of ``transformers.Seq2SeqTrainingArguments``.

    Adds MMLU-evaluation, quantization, LoRA/AdaLoRA and localization flags,
    and re-declares several inherited fields with the defaults listed below.
    """

    cache_dir: Optional[str] = field(default=None)
    train_on_source: Optional[bool] = field(
        default=False,
        metadata={"help": "Whether to train on the input in addition to the target text."},
    )

    # --- MMLU evaluation ---
    mmlu_split: Optional[str] = field(
        default="eval",
        metadata={"help": "The MMLU split to run on"},
    )
    mmlu_dataset: Optional[str] = field(
        default="mmlu-fs",
        metadata={"help": "MMLU dataset to use: options are `mmlu-zs` for zero-shot or `mmlu-fs` for few shot."},
    )
    do_mmlu_eval: Optional[bool] = field(
        default=False,
        metadata={"help": "Whether to run the MMLU evaluation."},
    )
    max_mmlu_samples: Optional[int] = field(
        default=None,
        metadata={"help": "If set, only evaluates on `max_mmlu_samples` of the MMMLU dataset."},
    )
    mmlu_source_max_len: int = field(
        default=2048,
        metadata={"help": "Maximum source sequence length for mmlu."},
    )

    # --- Finetuning mode, quantization and LoRA shape ---
    full_finetune: bool = field(
        default=False,
        metadata={"help": "Finetune the entire model without adapters."},
    )
    adam8bit: bool = field(default=False, metadata={"help": "Use 8-bit adam."})
    double_quant: bool = field(
        default=True,
        metadata={"help": "Compress the quantization statistics through double quantization."},
    )
    quant_type: str = field(
        default="nf4",
        metadata={"help": "Quantization data type to use. Should be one of `fp4` or `nf4`."},
    )
    bits: int = field(default=32, metadata={"help": "How many bits to use."})
    lora_r: int = field(default=64, metadata={"help": "Lora R dimension."})
    target_r: int = field(default=32, metadata={"help": "Target R dimension for AdaLoRA."})
    lora_alpha: float = field(default=16, metadata={"help": " Lora alpha."})
    lora_dropout: float = field(default=0.0, metadata={"help": "Lora dropout."})
    max_memory_MB: int = field(default=80000, metadata={"help": "Free memory per gpu."})
    report_to: str = field(
        default="none",
        metadata={"help": "To use wandb or something else for reporting."},
    )
    run_name: str = field(
        default="none",
        metadata={"help": "To use wandb, and the running name."},
    )
    whether_quantize: bool = field(
        default=False,
        metadata={"help": "Whether to quantize the model."},
    )
    gamma_learning_ratio: float = field(
        default=1,
        metadata={"help": "The learning rate for the gamma parameters"},
    )
    weight_learning_ratio: float = field(
        default=1.0,
        metadata={"help": "The learning rate for the weight parameters"},
    )

    # --- Optimizer, schedule, logging and checkpointing ---
    output_dir: str = field(
        default="./output",
        metadata={"help": "The output dir for logs and checkpoints"},
    )
    optim: str = field(
        default="paged_adamw_32bit",
        metadata={"help": "The optimizer to be used"},
    )
    per_device_train_batch_size: int = field(
        default=1,
        metadata={"help": "The training batch size per GPU. Increase for better speed."},
    )
    per_device_eval_batch_size: int = field(
        default=1,
        metadata={"help": "The evaluation batch size per GPU. Increase for better speed."},
    )
    gradient_accumulation_steps: int = field(
        default=16,
        metadata={"help": "How many gradients to accumulate before to perform an optimizer step"},
    )
    max_steps: int = field(
        default=10000,
        metadata={"help": "How many optimizer update steps to take"},
    )
    total_steps: int = field(
        default=10000,
        metadata={"help": "How many optimizer update steps to take"},
    )
    # use lora dropout instead for regularization if needed
    weight_decay: float = field(
        default=0.0,
        metadata={"help": "The L2 weight decay rate of AdamW"},
    )
    learning_rate: float = field(default=0.0002, metadata={"help": "The learnign rate"})
    lora_learning_rate: float = field(
        default=0.0002,
        metadata={"help": "The learnign rate for lora"},
    )
    remove_unused_columns: bool = field(
        default=False,
        metadata={"help": "Removed unused columns. Needed to make this codebase work."},
    )
    max_grad_norm: float = field(
        default=0.3,
        metadata={"help": "Gradient clipping max norm. This is tuned and works well for all models tested."},
    )
    gradient_checkpointing: bool = field(
        default=True,
        metadata={"help": "Use gradient checkpointing. You want to use this."},
    )
    do_train: bool = field(
        default=True,
        metadata={"help": "To train or not to train, that is the question?"},
    )
    lr_scheduler_type: str = field(
        default="constant",
        metadata={"help": "Learning rate schedule. Constant a bit better than cosine, and has advantage for analysis"},
    )
    warmup_ratio: float = field(
        default=0.03,
        metadata={"help": "Fraction of steps to do a warmup for"},
    )
    logging_steps: int = field(
        default=10,
        metadata={"help": "The frequency of update steps after which to log the loss"},
    )
    group_by_length: bool = field(
        default=True,
        metadata={"help": "Group sequences into batches with same length. Saves memory and speeds up training considerably."},
    )
    save_strategy: str = field(
        default="steps",
        metadata={"help": "When to save checkpoints"},
    )
    save_steps: int = field(default=250, metadata={"help": "How often to save a model"})
    save_total_limit: int = field(
        default=40,
        metadata={"help": "How many checkpoints to save before the oldest is overwritten"},
    )

    # --- Localization / LoRA-weight experiments ---
    whether_localization: Optional[bool] = field(
        default=False,
        metadata={"help": "Whether to run localization."},
    )
    block_wise: Optional[bool] = field(
        default=False,
        metadata={"help": "Whether to use block-wise for localization."},
    )
    add_weight: Optional[bool] = field(
        default=False,
        metadata={"help": "Whether to add weight."},
    )
    freeze_lora: Optional[bool] = field(
        default=False,
        metadata={"help": "Whether to freeze lora."},
    )
    freeze_lora_weights: Optional[bool] = field(
        default=False,
        metadata={"help": "Whether to freeze lora weights."},
    )
    freeze_out_dir: Optional[str] = field(
        default=None,
        metadata={"help": "Freeze out dir."},
    )
    peft_path: Optional[str] = field(
        default=None,
        metadata={"help": "Remote PEFT path."},
    )
    lw_init_value: Optional[float] = field(
        default=1.0,
        metadata={"help": "Initial value for lora weight."},
    )
    further_tune_lora: Optional[bool] = field(
        default=False,
        metadata={"help": "Whether to further tune lora."},
    )
    further_tune_gamma: Optional[bool] = field(
        default=False,
        metadata={"help": "Whether to further tune lora gamma."},
    )
    reset_low_score_layers: Optional[bool] = field(
        default=False,
        metadata={"help": "Whether to freeze low score layers."},
    )

    # --- PEFT flavour and AdaLoRA schedule ---
    peft_type: Optional[str] = field(default="lora", metadata={"help": "PEFT type."})
    orth_reg_weight: Optional[float] = field(
        default=0.5,
        metadata={"help": "Orthogonal regularization weight."},
    )
    deltaT: Optional[int] = field(default=1, metadata={"help": "DeltaT for AdaLoRA."})
    tinit: Optional[int] = field(default=500, metadata={"help": "Tinit for AdaLoRA."})
    tfinal: Optional[int] = field(default=8000, metadata={"help": "Tfinal for AdaLoRA."})

@dataclass
class GenerationArguments:
    """Text-generation hyperparameters exposed on the command line."""

    # For more hyperparameters check:
    # https://huggingface.co/docs/transformers/main_classes/text_generation#transformers.GenerationConfig
    # Length arguments
    max_new_tokens: Optional[int] = field(
        default=256,
        # The trailing space keeps the two concatenated literals from fusing
        # into "loopsif" in the rendered help text.
        metadata={"help": "Maximum number of new tokens to be generated in evaluation or prediction loops "
                          "if predict_with_generate is set."}
    )
    min_new_tokens: Optional[int] = field(
        default=None,
        metadata={"help": "Minimum number of new tokens to generate."}
    )

    # Generation strategy
    do_sample: Optional[bool] = field(default=False)
    num_beams: Optional[int] = field(default=1)
    num_beam_groups: Optional[int] = field(default=1)
    penalty_alpha: Optional[float] = field(default=None)
    use_cache: Optional[bool] = field(default=True)

    # Hyperparameters for logit manipulation
    temperature: Optional[float] = field(default=1.0)
    top_k: Optional[int] = field(default=50)
    top_p: Optional[float] = field(default=1.0)
    typical_p: Optional[float] = field(default=1.0)
    diversity_penalty: Optional[float] = field(default=0.0)
    repetition_penalty: Optional[float] = field(default=1.0)
    length_penalty: Optional[float] = field(default=1.0)
    no_repeat_ngram_size: Optional[int] = field(default=0)

@dataclass
class ExtraArguments:
    """Experiment-specific options: W&B project, MMLU cadence and localization knobs."""

    project_name: Optional[str] = field(
        default='PEFT'
    )
    mmlu_freq: Optional[int] = field(
        default=200,
        metadata={"help": "Frequency of MMLU evaluation."}
    )
    lora_drop_prob: Optional[float] = field(
        default=0.0,
        metadata={"help": "Stocastic drop LoRA Layers."}
    )

    layer_json_file: Optional[str] = field(
        default=None,
        metadata={"help": "Layer json file for localization."}
    )

    layer_score_file: Optional[str] = field(
        default=None,
        # Previously identical to the ``layer_json_file`` help, which made the
        # two options indistinguishable in ``--help`` output.
        metadata={"help": "Layer score json file for localization."}
    )
    only_permutation: Optional[bool] = field(
        default=True,
        metadata={"help": "Whether to use only permutation."}
    )
    desired_std_ratio: Optional[float] = field(
        default=1.0,
        metadata={"help": "Desired std for lora score."}
    )
    max_learning_rate_ratio: Optional[float] = field(
        default=1.2,
        metadata={"help": "Max learning rate ratio for lora score."}
    )
    min_learning_rate_ratio: Optional[float] = field(
        default=0.5,
        # Help text used to say "Max", copy-pasted from the field above.
        metadata={"help": "Min learning rate ratio for lora score."}
    )
    desired_mean: Optional[float] = field(
        default=1.0,
        metadata={"help": "Desired mean for lora score."}
    )
    dynamic_lr: Optional[bool] = field(
        default=False,
        metadata={"help": "Whether to use dynamic learning rate."}
    )

    topK: Optional[int] = field(
        default=0,
        metadata={"help": "TopK for localization."}
    )
    divide_ratio: Optional[float] = field(
        default=0.0,
        metadata={"help": "Divide ratio for localization. If 0.0, use topK."}
    )
    # ``total_steps`` is declared on TrainingArguments, not here.
    whether_unfreeze_normal: Optional[bool] = field(
        default=False,
        # Help text used to say "freeze", contradicting the option name.
        metadata={"help": "Whether to unfreeze normal layers."}
    )
    interval: Optional[int] = field(
        default=400,
        metadata={"help": "Interval for training lora"}
    )
    lora_gamma_interval: Optional[int] = field(
        default=1,
        metadata={"help": "Interval for training lora gamma."}
    )
    upperbound: Optional[float] = field(
        default=1.0
    )
    default_localization: Optional[bool] = field(
        default=False,
        metadata={"help": "Whether to use default localization."}
    )
    Random: Optional[bool] = field(
        default=False,
        metadata={"help": "Whether to use random Selection task-specific layers."}
    )
    start_iter: Optional[int] = field(
        default=0,
        metadata={"help": "Start Iteration for training lora"}
    )
    all_layers: Optional[str] = field(
        default='ALL',
        metadata={"help": "Whether to use all layers. ALL, ATT, FFN"}
    )
    ratio: Optional[float] = field(
        default=32.0,
        metadata={"help": "Ratio for Norm Lora layers."}
    )
    use_sigmoid: Optional[bool] = field(
        default=False,
        metadata={"help": "Whether to use sigmoid."}
    )
    sigmoid_init_value: Optional[float] = field(
        default=5.0,
        metadata={"help": "Sigmoid init value."}
    )
    seperate_training: Optional[bool] = field(
        default=False,
        metadata={"help": "Whether to use seperate training."}
    )

def find_all_linear_names(args, model, include_lm_head=False):
    """Collect the leaf attribute names of every ``torch.nn.Linear`` in ``model``.

    Args:
        args: Unused; kept for call-site compatibility.
        model: Module whose ``named_modules()`` are scanned.
        include_lm_head: When False, the name ``'lm_head'`` is dropped from
            the result.

    Returns:
        list: Unique last path components (e.g. ``'q_proj'``), in arbitrary
        order.
    """
    linear_cls = torch.nn.Linear
    # cls = bnb.nn.Linear4bit if args.bits == 4 else (bnb.nn.Linear8bitLt if args.bits == 8 else torch.nn.Linear)
    leaf_names = {
        qualified_name.split('.')[-1]
        for qualified_name, submodule in model.named_modules()
        if isinstance(submodule, linear_cls)
    }

    if not include_lm_head:
        leaf_names.discard('lm_head')  # needed for 16-bit

    return list(leaf_names)

def find_all_norm_layers_for_llama(args, model):
    """Collect the leaf attribute names of every ``LlamaRMSNorm`` in ``model``.

    Each matching module's full dotted name is printed as it is found.

    Args:
        args: Unused; kept for call-site compatibility.
        model: Module whose ``named_modules()`` are scanned.

    Returns:
        list: Unique last path components of the matching module names, in
        arbitrary order.
    """
    norm_leaf_names = set()
    for qualified_name, submodule in model.named_modules():
        if not isinstance(submodule, LlamaRMSNorm):
            continue
        print(qualified_name)
        norm_leaf_names.add(qualified_name.split('.')[-1])

    return list(norm_leaf_names)

def unfreeze_all_normal_layers(model):
    """Mark ``weight``/``bias`` of every module whose name contains ``'norm'`` as trainable.

    Args:
        model: Module whose ``named_modules()`` are scanned; matching
            parameters get ``requires_grad = True`` in place.
    """
    for name, module in model.named_modules():
        if 'norm' not in name:
            continue
        for attr in ('weight', 'bias'):
            # ``hasattr`` is not enough: a module can register the parameter
            # as None (e.g. LayerNorm(elementwise_affine=False)), in which
            # case there is nothing to unfreeze.
            param = getattr(module, attr, None)
            if param is not None:
                param.requires_grad = True

class SavePeftModelCallback(transformers.TrainerCallback):
    """Trainer callback that saves the PEFT adapter and score/diagnostic JSON files.

    A save happens at the end of every epoch and once more when training
    ends; both are performed only when ``state.is_world_process_zero`` is true.
    """

    # Per-module attributes exported (under a key of the same name) whenever a
    # module exposes ``lora_score``.
    _LORA_SCORE_ATTRS = (
        'lora_score',
        'diff_up',
        'diff_down',
        'upperbound_up_times',
        'lowerbound_down_times',
    )

    def __init__(self, train_dataset=None):
        # Stored only; no method of this class reads it.
        self.train_dataset = train_dataset

    @classmethod
    def _collect_localization_scores(cls, model, block_wise):
        """Gather localization statistics from ``model``'s modules into a JSON-ready dict."""
        # Key order matters for the dumped file: 'block_score' (block-wise
        # only) first, then the LoRA score tables.
        file_content = {'block_score': {}} if block_wise else {}
        for key in cls._LORA_SCORE_ATTRS + ('lora_gamma_value_list',):
            file_content[key] = {}

        if block_wise:
            for name, module in model.named_modules():
                if hasattr(module, 'block_score'):
                    file_content['block_score'][name] = module.block_score
                    file_content['lowerbound_down_times'][name] = module.lowerbound_down_times

        for name, module in model.named_modules():
            if hasattr(module, 'lora_score'):
                for attr in cls._LORA_SCORE_ATTRS:
                    file_content[attr][name] = getattr(module, attr)
            if hasattr(module, 'lora_gamma_value_list'):
                file_content['lora_gamma_value_list'][name] = module.lora_gamma_value_list
        return file_content

    def save_model(self, args, state, kwargs, suffix=None):
        """Save the adapter and any enabled score files.

        Args:
            args: Training arguments. Reads ``output_dir``, ``freeze_lora``,
                ``freeze_lora_weights``, ``freeze_out_dir``, ``add_weight``,
                ``whether_localization``, ``block_wise`` and ``results_dir``.
            state: Trainer state. Reads ``best_model_checkpoint``,
                ``global_step`` and ``epoch``.
            kwargs: Callback keyword arguments; must contain ``"model"``.
            suffix: Checkpoint folder suffix; defaults to the global step.

        Returns:
            str: Directory the adapter was written to.
        """
        model = kwargs["model"]

        if state.best_model_checkpoint is not None:
            # NOTE(review): combined with the join below this yields
            # ".../adapter_model/adapter_model" — confirm that is intended.
            checkpoint_folder = os.path.join(state.best_model_checkpoint, "adapter_model")
        else:
            if suffix is None:
                suffix = f"{state.global_step}"
            if args.freeze_lora or args.freeze_lora_weights:
                base_dir = args.freeze_out_dir
            else:
                base_dir = args.output_dir
            # Interpolate the suffix: the literal text "-suffix" used before
            # made every save overwrite the same folder.
            checkpoint_folder = os.path.join(base_dir, f"{PREFIX_CHECKPOINT_DIR}-{suffix}")

        peft_model_path = os.path.join(checkpoint_folder, "adapter_model")
        model.save_pretrained(peft_model_path)

        # Remove a full-model dump from the checkpoint folder if one exists.
        pytorch_model_path = os.path.join(checkpoint_folder, "pytorch_model.bin")
        if os.path.exists(pytorch_model_path):
            os.remove(pytorch_model_path)

        if args.add_weight:
            weight_scores = {
                name: module.lora_weight['default'].item()
                for name, module in model.named_modules()
                if hasattr(module, 'lora_weight')
            }
            with open(os.path.join(args.output_dir, f'weight_score_{state.epoch}.json'), 'w') as fout:
                json.dump({'weight_score': weight_scores}, fout)

        if args.whether_localization:
            file_content = self._collect_localization_scores(model, args.block_wise)

            # NOTE(review): ``results_dir`` is not a field of the
            # TrainingArguments dataclass in this file; presumably it is
            # attached to ``args`` elsewhere — confirm.
            with open(os.path.join(args.results_dir, f'lora_score_{state.global_step}.json'), 'w') as fout:
                json.dump(file_content, fout)

            with open(os.path.join(args.results_dir, f'lowerbound_{state.global_step}.json'), 'w') as fout:
                json.dump(file_content['lowerbound_down_times'], fout)

        return peft_model_path

    def on_epoch_end(self, args, state, control, **kwargs):
        """Save at the end of each epoch; returns ``control`` on the world-zero process only."""
        if state.is_world_process_zero:
            self.save_model(args, state, kwargs)
            return control

    def on_train_end(self, args, state, control, **kwargs):
        """Create/refresh the ``completed`` marker file and write the final save."""
        if state.is_world_process_zero:
            completed_marker = join(args.output_dir, 'completed')
            # Create the file if missing and bump its timestamps.
            with open(completed_marker, 'a'):
                os.utime(completed_marker, None)

            self.save_model(args, state, kwargs, suffix='final_v2')

class AdaLoRACallback(transformers.TrainerCallback):
    """Trainer callback that calls the model's ``update_and_allocate`` after each step."""

    def on_step_end(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, **kwargs):
        """Pass the current global step to ``kwargs["model"].update_and_allocate``."""
        current_step = state.global_step
        kwargs["model"].update_and_allocate(current_step)

def get_accelerate_model(args, checkpoint_dir):
    """Load the causal LM and its tokenizer and, when requested, attach PEFT adapters.

    Args:
        args: Namespace of run options. Fields read here: ``max_memory_MB``,
            ``deepspeed``, ``full_finetune``, ``bits``, ``fp16``, ``bf16``,
            ``whether_quantize``, ``model_name_or_path``, ``cache_dir``,
            ``double_quant``, ``quant_type``, ``do_train``,
            ``gradient_checkpointing``, ``peft_path``, ``all_layers``,
            ``peft_type``, ``target_r``, ``lora_r``, ``tinit``, ``tfinal``,
            ``total_steps``, ``orth_reg_weight``, ``deltaT``, ``lora_alpha``
            and ``lora_dropout``.
        checkpoint_dir: Directory of a previous checkpoint, or ``None``. When
            training with adapters, ``args.peft_path`` takes precedence over it.

    Returns:
        A ``(model, tokenizer)`` tuple.

    Raises:
        ValueError: If ``args.all_layers`` or ``args.peft_type`` is not one of
            the values handled below.
    """
    # Start from zero so that hosts with neither CUDA nor XPU do not hit an
    # unbound ``n_gpus`` when the per-device memory map is built.
    n_gpus = 0
    if torch.cuda.is_available():
        n_gpus = torch.cuda.device_count()
    if is_ipex_available() and torch.xpu.is_available():
        n_gpus = torch.xpu.device_count()

    per_device_memory = f'{args.max_memory_MB}MB'
    max_memory = {i: per_device_memory for i in range(n_gpus)}

    local_rank = int(os.environ.get('LOCAL_RANK', '0'))
    device_map = {'': local_rank}
    using_device_map = True
    if args.deepspeed is None:
        device_map = "auto"
    else:
        # With a DeepSpeed config, ZeRO stage 3 is the one case where no
        # device_map is handed to from_pretrained.
        with open(args.deepspeed, 'r') as f:
            deepspeed_config = json.load(f)
        if deepspeed_config['zero_optimization']['stage'] == 3:
            using_device_map = False

    if args.full_finetune: assert args.bits in [16, 32]

    compute_dtype = (torch.float16 if args.fp16 else (torch.bfloat16 if args.bf16 else torch.float32))
    # dtype used for the quantized load and recorded on the model config:
    # fp16 runs deliberately map to float32 here.
    config_dtype = (torch.float32 if args.fp16 else (torch.bfloat16 if args.bf16 else torch.float32))

    if args.whether_quantize:
        # The 4-bit/8-bit switches live in the BitsAndBytesConfig only; passing
        # them again as from_pretrained kwargs is redundant and is rejected by
        # newer transformers releases.
        model = AutoModelForCausalLM.from_pretrained(
            args.model_name_or_path,
            cache_dir=args.cache_dir,
            device_map=device_map,
            max_memory=max_memory,
            quantization_config=BitsAndBytesConfig(
                load_in_4bit=args.bits == 4,
                load_in_8bit=args.bits == 8,
                llm_int8_threshold=6.0,
                llm_int8_has_fp16_weight=False,
                bnb_4bit_compute_dtype=compute_dtype,
                bnb_4bit_use_double_quant=args.double_quant,
                bnb_4bit_quant_type=args.quant_type,
            ),
            torch_dtype=config_dtype,
        )
    else:
        load_kwargs = {'cache_dir': args.cache_dir}
        if using_device_map:
            load_kwargs['device_map'] = device_map
        if args.full_finetune and not args.do_train and checkpoint_dir is not None:
            # Evaluating a fully fine-tuned model: load the saved weights
            # directly, without forcing a torch_dtype.
            model_path = checkpoint_dir
        else:
            model_path = args.model_name_or_path
            load_kwargs['torch_dtype'] = compute_dtype
        model = AutoModelForCausalLM.from_pretrained(model_path, **load_kwargs)

    if compute_dtype == torch.float16 and args.bits == 4:
        if torch.cuda.is_bf16_supported():
            print('='*80)
            print('Your GPU supports bfloat16, you can accelerate training with the argument --bf16')
            print('='*80)

    if compute_dtype == torch.float16 and (is_ipex_available() and torch.xpu.is_available()):
        # NOTE(review): this reassignment happens after the model has been
        # loaded and compute_dtype is not read again below, so it only affects
        # the message printed here.
        compute_dtype = torch.bfloat16
        print('Intel XPU does not support float16 yet, so switching to bfloat16')

    if args.deepspeed is None:
        setattr(model, 'model_parallel', True)
        setattr(model, 'is_parallelizable', True)

    model.config.torch_dtype = config_dtype

    # Tokenizer
    tokenizer = AutoTokenizer.from_pretrained(
        args.model_name_or_path,
        cache_dir=args.cache_dir,
        padding_side="right",
        use_fast=False,
        add_eos_token=True,
        add_bos_token=True,
    )

    if tokenizer._pad_token is None:
        # Reuse the unknown token for padding instead of growing the vocabulary.
        tokenizer.add_special_tokens(
            {"pad_token": tokenizer.unk_token}
        )
        model.config.pad_token_id = tokenizer.unk_token_id

    if args.do_train:
        if not args.full_finetune:
            model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=args.gradient_checkpointing)

            # An explicitly supplied adapter path wins over the discovered checkpoint.
            if args.peft_path is not None:
                checkpoint_dir = args.peft_path

            if checkpoint_dir is not None:
                print("Loading adapters from checkpoint.")
                model = PeftModel.from_pretrained(model, checkpoint_dir, is_trainable=True)
            else:
                print('adding LoRA modules...')

                if args.all_layers == 'ALL':
                    modules = find_all_linear_names(args, model)
                elif args.all_layers == 'ALL+NORM':
                    modules = find_all_linear_names(args, model) + find_all_norm_layers_for_llama(args, model)
                elif args.all_layers == 'ALL+NORM+LM':
                    modules = find_all_linear_names(args, model, include_lm_head=True) + find_all_norm_layers_for_llama(args, model)
                elif args.all_layers == 'ATT':
                    modules = ['k_proj', 'v_proj', 'q_proj', 'o_proj']
                elif args.all_layers == 'FFN':
                    modules = ['down_proj', 'up_proj', 'gate_proj']
                else:
                    raise ValueError(f'Invalid all_layers: {args.all_layers}')

                if args.peft_type == 'adalora':
                    config = AdaLoraConfig(
                        target_r=args.target_r,
                        init_r=args.lora_r,
                        tinit=args.tinit,
                        tfinal=args.tfinal,
                        total_step=args.total_steps,
                        orth_reg_weight=args.orth_reg_weight,
                        deltaT=args.deltaT,
                        lora_alpha=args.lora_alpha,
                        target_modules=modules,
                        lora_dropout=args.lora_dropout,
                    )
                else:
                    raise ValueError(f'Invalid peft type: {args.peft_type}')

                model = get_peft_model(model, config)
        else:
            print('Full finetune, no adapters.')
    else:
        if not args.full_finetune and checkpoint_dir is not None:
            print("Loading adapters from checkpoint.")
            model = PeftModel.from_pretrained(model, join(checkpoint_dir, 'adapter_model'), is_trainable=False)

    # Set the adapter layers to the correct dtype (only bf16 runs are cast).
    for name, module in model.named_modules():
        if isinstance(module, LoraLayer):
            if args.bf16:
                module.to(torch.bfloat16)

    # Dump the per-parameter dtypes once, from the first local rank only.
    if local_rank == 0:
        for name, param in model.named_parameters():
            print(f'{name}: {param.dtype}')

    return model, tokenizer


# def create_optimizer(args, model):
#     """
#     Setup the optimizer.
#
#     We provide a reasonable default that works well. If you want to use something else, you can pass a tuple in the
#     Trainer's init through `optimizers`, or subclass and override this method in a subclass.
#     """
#     basic_parameters = []
#     lora_gamma_parameters = []
#     lora_weight_parameters = []
#     basic_parameters_name = []
#
#     for n, p in model.named_parameters():
#         if p.requires_grad:
#             if 'lora_gamma' in n:
#                 lora_gamma_parameters.append(p)
#             elif 'lora_weight' in n:
#                 lora_weight_parameters.append(p)
#             else:
#                 basic_parameters_name.append(n)
#                 basic_parameters.append(p)
#
#     optimizer_grouped_parameters = [
#         {
#             "params": basic_parameters,
#             "weight_decay": args.weight_decay,
#             "lr": args.learning_rate,
#         },
#         {
#             "params": lora_gamma_parameters,
#             "weight_decay": args.weight_decay,
#             "lr": args.learning_rate * args.gamma_learning_ratio,
#         },
#         {
#             "params": lora_weight_parameters,
#             "weight_decay": args.weight_decay,
#             "lr": args.learning_rate * args.weight_learning_ratio,
#         },
#     ]
#
#     optimizer = torch.optim.AdamW(optimizer_grouped_parameters, lr=args.learning_rate, weight_decay=args.weight_decay)
#
#     return optimizer

def read_layer_score_file(args):
    """Load the layer scores stored in ``args.layer_score_file``, largest first.

    Args:
        args: Options object. ``layer_score_file`` is the path of a JSON file
            (or ``None``); ``block_wise`` selects the ``'block_score'`` entry
            of that file instead of ``'lora_score'``.

    Returns:
        The selected entry passed through
        ``sort_dict_by_value(..., largest_first=True)``, or ``None`` when no
        score file is configured (previously this case failed with an
        unbound local variable).
    """
    if args.layer_score_file is None:
        return None

    key = 'block_score' if args.block_wise else 'lora_score'
    with open(args.layer_score_file, 'r') as f:
        layer_json = json.load(f)
    return sort_dict_by_value(layer_json[key], largest_first=True)


def print_trainable_parameters(args, model):
    """
    Report the trainable parameter count, the total count and their ratio.

    The trainable count is halved before printing when ``args.bits`` is 4.
    """
    parameters = [param for _, param in model.named_parameters()]
    all_param = sum(param.numel() for param in parameters)
    trainable_params = sum(param.numel() for param in parameters if param.requires_grad)

    if args.bits == 4:
        trainable_params = trainable_params / 2

    percentage = 100 * trainable_params / all_param
    summary = " || ".join([
        f"trainable params: {trainable_params}",
        f"all params: {all_param}",
        f"trainable: {percentage}",
    ])
    print(summary)

def smart_tokenizer_and_embedding_resize(
    special_tokens_dict: Dict,
    tokenizer: transformers.PreTrainedTokenizer,
    model: transformers.PreTrainedModel,
):
    """Add special tokens to the tokenizer and grow the model embeddings to match.

    Rows created for the new tokens, in both the input and the output
    embedding matrices, are set to the mean of the rows that existed before.
    Finally the model config's ``pad_token_id`` is synced with the tokenizer.

    Note: This is the unoptimized version that may make your embedding size not be divisible by 64.
    """
    added_count = tokenizer.add_special_tokens(special_tokens_dict)
    model.resize_token_embeddings(len(tokenizer))

    if added_count > 0:
        for embedding in (model.get_input_embeddings(), model.get_output_embeddings()):
            weights = embedding.weight.data
            # Average over the pre-existing rows only, then write it into the new rows.
            weights[-added_count:] = weights[:-added_count].mean(dim=0, keepdim=True)

    model.config.pad_token_id = tokenizer.pad_token_id

@dataclass
class DataCollatorForCausalLM(object):
    """Tokenize and pad ``input``/``output`` text pairs into causal-LM batches."""

    tokenizer: transformers.PreTrainedTokenizer
    source_max_len: int
    target_max_len: int
    train_on_source: bool
    predict_with_generate: bool

    def __call__(self, instances: Sequence[Dict]) -> Dict[str, torch.Tensor]:
        """Build ``input_ids``, ``attention_mask`` and (when training) ``labels``.

        Each instance must provide ``'input'`` and ``'output'`` entries. With
        ``predict_with_generate`` only the source tokens are batched and no
        labels are produced; otherwise source and target tokens are
        concatenated, and the source part of the labels is masked with
        ``IGNORE_INDEX`` unless ``train_on_source`` is set.
        """
        source_texts = [instance['input'] for instance in instances]
        target_texts = [instance['output'] for instance in instances]

        # Special tokens are not added here; texts are truncated to the configured lengths.
        source_encoding = self.tokenizer(
            source_texts,
            max_length=self.source_max_len,
            truncation=True,
            add_special_tokens=False,
        )
        target_encoding = self.tokenizer(
            target_texts,
            max_length=self.target_max_len,
            truncation=True,
            add_special_tokens=False,
        )

        input_ids = []
        labels = []
        token_pairs = zip(source_encoding['input_ids'], target_encoding['input_ids'])
        for source_ids, target_ids in token_pairs:
            if self.predict_with_generate:
                # Generation only needs the prompt tokens.
                input_ids.append(torch.tensor(source_ids))
                continue

            input_ids.append(torch.tensor(source_ids + target_ids))
            if self.train_on_source:
                label_ids = source_ids + target_ids
            else:
                label_ids = [IGNORE_INDEX] * len(source_ids) + target_ids
            labels.append(torch.tensor(label_ids))

        pad_id = self.tokenizer.pad_token_id
        input_ids = pad_sequence(input_ids, batch_first=True, padding_value=pad_id)

        data_dict = {
            'input_ids': input_ids,
            'attention_mask': input_ids.ne(pad_id),
        }
        if not self.predict_with_generate:
            data_dict['labels'] = pad_sequence(labels, batch_first=True, padding_value=IGNORE_INDEX)
        return data_dict

# Prompt template for Unnatural Instructions examples. The single entry is a
# str.format template with two placeholders: {instruction} and {input}.
UNNATURAL_INSTRUCTIONS_PROMPT_DICT = {
    "prompt_input": (
        "### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Response: "
    )
}

def extract_unnatural_instructions_data(examples):
    """Turn one Unnatural Instructions record into an ``input``/``output`` pair.

    ``input`` is ``'<s>'`` plus the formatted prompt and ``output`` is the
    instance's answer plus ``'</s>'``. Every entry of ``examples['instances']``
    overwrites the previous one, so the last instance wins; with no instances
    both values stay ``None``. Reformulations are not used.
    """
    prompt_text = None
    answer_text = None
    for instance in examples['instances']:
        template = UNNATURAL_INSTRUCTIONS_PROMPT_DICT["prompt_input"]
        rendered = template.format(
            instruction=examples['instruction'],
            input=instance['input'],
        )
        prompt_text = '<s>' + rendered
        answer_text = instance['output'] + '</s>'
    return {'input': prompt_text, 'output': answer_text}

def extract_unnatural_instructions_data_v2(examples):
    """Variant that puts the whole prompt-plus-answer text into ``output``.

    ``input`` becomes the empty string and ``output`` is ``'<s>'``, the
    formatted prompt, the instance's answer and ``'</s>'`` concatenated. Every
    entry of ``examples['instances']`` overwrites the previous one, so the last
    instance wins; with no instances both values stay ``None``.
    """
    prompt_text = None
    full_text = None
    for instance in examples['instances']:
        template = UNNATURAL_INSTRUCTIONS_PROMPT_DICT["prompt_input"]
        rendered = template.format(
            instruction=examples['instruction'],
            input=instance['input'],
        )
        prompt_text = ''
        full_text = '<s>' + rendered + instance['output'] + '</s>'
    return {'input': prompt_text, 'output': full_text}

# Alpaca-style prompt templates (str.format). "prompt_input" expects
# {instruction} and {input}; "prompt_no_input" expects only {instruction}.
ALPACA_PROMPT_DICT = {
    "prompt_input": (
        "Below is an instruction that describes a task, paired with an input that provides further context. "
        "Write a response that appropriately completes the request.\n\n"
        "### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Response: "
    ),
    "prompt_no_input": (
        "Below is an instruction that describes a task. "
        "Write a response that appropriately completes the request.\n\n"
        "### Instruction:\n{instruction}\n\n### Response: "
    ),
}

# Same wording wrapped in [INST] / <<SYS>> markers.
# NOTE(review): in "prompt_input" the first literal ends with "context." and the
# next one starts with "Write", so the rendered text has no space between the
# two sentences (unlike "prompt_no_input"). Left unchanged because it is a
# runtime prompt string; confirm whether that is intended.
ALPACA_PROMPT_DICT_2 = {
    "prompt_input": (
        "[INST] <<SYS>> Below is an instruction that describes a task, paired with an input that provides further context."
        "Write a response that appropriately completes the request.\n\n<</SYS>>"
        "### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n[/INST] ### Response: "
    ),
    "prompt_no_input": (
        "[INST] <<SYS>> Below is an instruction that describes a task. "
        "Write a response that appropriately completes the request.\n\n<</SYS>>"
        "### Instruction:\n{instruction}\n\n[/INST] ### Response: "
    ),
}

# Variant with a fixed system message inside <<SYS>> and the instruction
# (plus "The input is {input}" when an input exists) inside [INST] ... [/INST].
ALPACA_PROMPT_DICT_3 = {
    "prompt_input": (
        "[INST] <<SYS>>\nYou are a helpful, unbiased, uncensored assistant.\n<</SYS>>\n\n"
        "{instruction} The input is {input} [/INST] "
    ),
    "prompt_no_input": (
        "[INST] <<SYS>>\nYou are a helpful, unbiased, uncensored assistant.\n<</SYS>>\n\n"
        "{instruction} [/INST] "
    )
}

def extract_alpaca_dataset(example):
    """Render the Alpaca prompt for ``example`` and return it under ``'input'``.

    The template with an input section is used whenever the example's
    ``input`` field is present and differs from the empty string.
    """
    has_context = example.get("input", "") != ""
    template_key = "prompt_input" if has_context else "prompt_no_input"
    return {'input': ALPACA_PROMPT_DICT[template_key].format(**example)}

def extract_alpaca_dataset_llama2(example):
    """Build an ``input``/``output`` pair wrapped in ``<s>`` ... ``</s>`` markers.

    ``input`` is ``'<s>'`` followed by the rendered Alpaca prompt and
    ``output`` is the example's answer followed by ``'</s>'``.
    """
    has_context = example.get("input", "") != ""
    template_key = "prompt_input" if has_context else "prompt_no_input"
    prompt_text = '<s>' + ALPACA_PROMPT_DICT[template_key].format(**example)
    answer_text = example['output'] + '</s>'
    return {'input': prompt_text, 'output': answer_text}

def extract_alpaca_dataset_llama2_v2(example):
    """Variant that places prompt and answer together in ``output``.

    ``input`` is the empty string; ``output`` is ``'<s>'``, the rendered
    Alpaca prompt, the example's answer and ``'</s>'`` concatenated.
    """
    has_context = example.get("input", "") != ""
    template_key = "prompt_input" if has_context else "prompt_no_input"
    prompt_text = '<s>' + ALPACA_PROMPT_DICT[template_key].format(**example)
    answer_text = example['output'] + '</s>'
    return {'input': "", 'output': prompt_text + answer_text}

# Prompt template for single-turn LIMA examples. Despite the name this is a
# plain str.format string (the parentheses only group the literal), with one
# placeholder: {instruction}.
LIMA_PROMPT_DICT = (
    "[INST] {instruction} [/INST] "
)

# Earlier [INST]-style variant of the No Robots templates, kept for reference.
# NORobots_PROMPT_DICT = {
#     "prompt_system": (
#         "<s> [INST] <<SYS>>\n{system}\n<</SYS>>\n\n"
#         "{user} [/INST] {assistant} </s>"
#     ),
#     "prompt_no_system": (
#         "<s> [INST] {user} [/INST] {assistant} </s>"
#     )
# }

# Templates for the first exchange of a No Robots conversation.
# "prompt_system" expects {system}, {user} and {assistant};
# "prompt_no_system" expects {user} and {assistant}.
NORobots_PROMPT_DICT = {
    "prompt_system": (
        "<s>{system}\n\n"
        "User: {user} Assistant: {assistant} </s>"
    ),
    "prompt_no_system": (
        "<s> User: {user} Assistant: {assistant} </s>"
    )
}

def extract_norobots_data(examples):
    """Flatten a No Robots conversation into a single ``output`` string.

    The first exchange (optionally preceded by a system message) is rendered
    with ``NORobots_PROMPT_DICT``; every following user/assistant pair is
    appended as ``<s>User: ... Assistant: ... </s>``. A trailing unpaired
    message fills only its own side of the last segment. ``input`` is always
    the empty string.

    Raises:
        ValueError: If the first message, or a trailing unpaired message, has
            a role other than the ones handled.
        AssertionError: If paired messages are not in user/assistant order.
    """
    turns = examples['messages']
    opening_role = turns[0]['role']

    if opening_role == 'system':
        fields = {
            'system': turns[0]['content'],
            'user': turns[1]['content'],
            'assistant': turns[2]['content'],
        }
        assert turns[1]['role'] == 'user'
        assert turns[2]['role'] == 'assistant'
        template_key, next_index = "prompt_system", 3
    elif opening_role == 'user':
        fields = {
            'user': turns[0]['content'],
            'assistant': turns[1]['content'],
        }
        assert turns[1]['role'] == 'assistant'
        template_key, next_index = "prompt_no_system", 2
    else:
        raise ValueError(f"Invalid role: {opening_role}")

    segments = [NORobots_PROMPT_DICT[template_key].format(**fields)]

    total = len(turns)
    for position in range(next_index, total, 2):
        current = turns[position]
        if position + 1 == total:
            # Unpaired last message: keep it and leave the other side empty.
            if current['role'] == 'user':
                user_text, assistant_text = current['content'], ''
            elif current['role'] == 'assistant':
                user_text, assistant_text = '', current['content']
            else:
                raise ValueError(f"Invalid role: {current['role']}")
        else:
            reply = turns[position + 1]
            user_text, assistant_text = current['content'], reply['content']
            assert current['role'] == 'user'
            assert reply['role'] == 'assistant'

        segments.append(f'<s>User: {user_text} Assistant: {assistant_text} </s>')

    return {'input': '', 'output': ''.join(segments)}

def extract_lima_data(examples):
    """Convert a LIMA conversation into an ``input``/``output`` pair.

    A single-message conversation becomes a prompt in ``input`` with an empty
    ``output``. Otherwise consecutive message pairs are rendered as
    ``<s>[INST] question [/INST] answer </s>`` and concatenated into
    ``output`` (an unpaired final message is dropped) with an empty ``input``.
    """
    turns = examples['conversations']

    if len(turns) == 1:
        prompt = LIMA_PROMPT_DICT.format(instruction=turns[0])
        return {'input': '<s>' + prompt, 'output': ''}

    dialogue = ''.join(
        f'<s>[INST] {question} [/INST] {answer} </s>'
        for question, answer in zip(turns[0::2], turns[1::2])
    )
    return {'input': '', 'output': dialogue}

def local_dataset(dataset_name):
    """Load a local JSON/JSONL, CSV or TSV file and split it into train/test.

    The file type is chosen from the file name's extension; 10% of the rows
    go to the test split.

    Raises:
        ValueError: If the extension is not one of the supported ones.
    """
    if dataset_name.endswith(('.json', '.jsonl')):
        loaded = Dataset.from_json(path_or_paths=dataset_name)
    elif dataset_name.endswith('.csv'):
        loaded = Dataset.from_pandas(pd.read_csv(dataset_name))
    elif dataset_name.endswith('.tsv'):
        loaded = Dataset.from_pandas(pd.read_csv(dataset_name, delimiter='\t'))
    else:
        raise ValueError(f"Unsupported dataset format: {dataset_name}")

    return loaded.train_test_split(test_size=0.1)

def make_data_module(tokenizer: transformers.PreTrainedTokenizer, args) -> Dict:
    """
    Make dataset and collator for supervised fine-tuning.
    Datasets are expected to have the following columns: { `input`, `output` }

    Available datasets to be selected with `dataset` argument:
        - alpaca, 52002 examples
        - alpaca cleaned, 51942 examples
        - chip2 (OIG), 210289 examples
        - self-instruct, 82612 examples
        - hh-rlhf (Anthropic), 160800 examples
        - longform, 23.7k examples
        - oasst1 (OpenAssistant) primary message tree only, 9,846 examples

    Coming soon:
        - unnatural instructions core, 66010 examples
        - unnatural instructions full, 240670 examples
        - alpaca-gpt4, 52002 examples
        - unnatural-instructions-gpt4, 9000 examples
        - supernatural-instructions, 69624 examples (same as paper with 100 ex/task more can be used)
        - flan (FLAN v2), up to 20M examples available
        - vicuna

    Args:
        tokenizer: Tokenizer handed to the data collator.
        args: Run options. Fields read here: ``dataset``, ``dataset_format``,
            ``train_with_input``, ``do_train``, ``do_eval``, ``do_predict``,
            ``eval_dataset_size``, ``max_eval_samples``, ``max_train_samples``,
            ``group_by_length``, ``source_max_len``, ``target_max_len``,
            ``train_on_source`` and ``predict_with_generate``. For a local
            file, ``dataset_format`` is set to ``"input-output"`` when empty.

    Returns:
        Dict with ``train_dataset``, ``eval_dataset``, ``predict_dataset``
        (each ``None`` when the matching ``do_*`` flag is off) and
        ``data_collator``.

    Raises:
        ValueError: If a local dataset file cannot be loaded.
        NotImplementedError: If the dataset name is unknown (or is ``vicuna``).
    """
    def load_data(dataset_name):
        """Fetch the raw dataset for a known name or an existing local path."""
        if dataset_name == 'alpaca':
            return load_dataset("tatsu-lab/alpaca")
        elif dataset_name == 'alpaca-gpt4':
            return load_from_disk('./dataset/alpaca-gpt4')
        elif dataset_name == 'alpaca-clean':
            return load_dataset("yahma/alpaca-cleaned")
        elif dataset_name == 'chip2':
            return load_dataset("laion/OIG", data_files='unified_chip2.jsonl')
        elif dataset_name == 'self-instruct':
            return load_dataset("yizhongw/self_instruct", name='self_instruct')
        elif dataset_name == 'hh-rlhf':
            return load_dataset("Anthropic/hh-rlhf")
        elif dataset_name == 'longform':
            return load_dataset("akoksal/LongForm")
        elif dataset_name == 'oasst1':
            return load_from_disk("./dataset/openassistant-guanaco")
        elif dataset_name == 'unnatural-instructions-core':
            return load_from_disk("./dataset/unnatural-instructions-core")
        elif dataset_name == 'LIMA':
            return load_from_disk("./dataset/lima")
        elif dataset_name == 'no_robots':
            return load_from_disk("./dataset/no_robots")
        elif dataset_name == 'vicuna':
            raise NotImplementedError("Vicuna data was not released.")

        if not os.path.exists(dataset_name):
            raise NotImplementedError(f"Dataset {dataset_name} not implemented yet.")

        # Local files default to the pass-through "input-output" format.
        args.dataset_format = args.dataset_format if args.dataset_format else "input-output"
        try:
            return local_dataset(dataset_name)
        except Exception as err:
            # Only wrap ordinary errors (not KeyboardInterrupt/SystemExit) and
            # keep the original failure attached as the cause.
            raise ValueError(f"Error loading dataset from {dataset_name}") from err

    def format_dataset(dataset, dataset_format):
        """Map the raw columns to `input`/`output` and drop every other column."""
        if (
            dataset_format == 'alpaca' or dataset_format == 'alpaca-clean' or dataset_format == 'alpaca-gpt4' or
            (dataset_format is None and args.dataset in ['alpaca', 'alpaca-clean', 'alpaca-gpt4'])
        ):
            if args.train_with_input:
                dataset = dataset.map(extract_alpaca_dataset_llama2_v2, remove_columns=['instruction'])
            else:
                dataset = dataset.map(extract_alpaca_dataset_llama2, remove_columns=['instruction'])

        elif dataset_format == 'chip2' or (dataset_format is None and args.dataset == 'chip2'):
            dataset = dataset.map(lambda x: {
                'input': x['text'].split('\n<bot>: ')[0].replace('<human>: ', ''),
                'output': x['text'].split('\n<bot>: ')[1],
            })
        elif dataset_format == 'self-instruct' or (dataset_format is None and args.dataset == 'self-instruct'):
            for old, new in [["prompt", "input"], ["completion", "output"]]:
                dataset = dataset.rename_column(old, new)
        elif dataset_format == 'hh-rlhf' or (dataset_format is None and args.dataset == 'hh-rlhf'):
            dataset = dataset.map(lambda x: {
                'input': '',
                'output': x['chosen']
            })
        elif dataset_format == 'LIMA' or (dataset_format is None and args.dataset == 'LIMA'):
            dataset = dataset.map(extract_lima_data)
        elif dataset_format == 'no_robots' or (dataset_format is None and args.dataset == 'no_robots'):
            dataset = dataset.map(extract_norobots_data)
            # Expose the *_sft splits under the names the rest of the code reads.
            dataset['train'] = dataset['train_sft']
            dataset['test'] = dataset['test_sft']
        elif dataset_format == 'oasst1' or (dataset_format is None and args.dataset == 'oasst1'):
            dataset = dataset.map(lambda x: {
                'input': '',
                'output': '<s>' + x['text'] + '</s>',
            })
        elif dataset_format == 'unnatural-instructions-core' or (dataset_format is None and args.dataset == 'unnatural-instructions-core'):
            dataset = dataset.map(extract_unnatural_instructions_data)
        elif dataset_format == 'input-output':
            # leave as is
            pass
        # Remove unused columns.
        dataset = dataset.remove_columns(
            [col for col in dataset.column_names['train'] if col not in ['input', 'output']]
        )
        return dataset

    # Load dataset.
    dataset = load_data(args.dataset)
    dataset = format_dataset(dataset, args.dataset_format)

    train_dataset = None
    eval_dataset = None

    # Split train/eval, reduce size
    if args.do_eval or args.do_predict:
        if 'eval' in dataset:
            eval_dataset = dataset['eval']
        else:
            print('Splitting train dataset in train and validation according to `eval_dataset_size`')
            dataset = dataset["train"].train_test_split(
                test_size=args.eval_dataset_size, shuffle=True, seed=42
            )
            eval_dataset = dataset['test']
        if args.max_eval_samples is not None and len(eval_dataset) > args.max_eval_samples:
            eval_dataset = eval_dataset.select(range(args.max_eval_samples))
        if args.group_by_length:
            # Character length of input + output, stored in a `length` column.
            eval_dataset = eval_dataset.map(lambda x: {'length': len(x['input']) + len(x['output'])})
    if args.do_train:
        train_dataset = dataset['train']
        if args.max_train_samples is not None and len(train_dataset) > args.max_train_samples:
            train_dataset = train_dataset.select(range(args.max_train_samples))
        if args.group_by_length:
            train_dataset = train_dataset.map(lambda x: {'length': len(x['input']) + len(x['output'])})

    data_collator = DataCollatorForCausalLM(
        tokenizer=tokenizer,
        source_max_len=args.source_max_len,
        target_max_len=args.target_max_len,
        train_on_source=args.train_on_source,
        predict_with_generate=args.predict_with_generate,
    )
    return dict(
        train_dataset=train_dataset if args.do_train else None,
        eval_dataset=eval_dataset if args.do_eval else None,
        predict_dataset=eval_dataset if args.do_predict else None,
        data_collator=data_collator
    )

def get_last_checkpoint(checkpoint_dir):
    """Find the highest-numbered ``checkpoint-<step>`` directory in ``checkpoint_dir``.

    Args:
        checkpoint_dir: Output directory that may hold checkpoint sub-directories
            and a ``completed`` marker file.

    Returns:
        A ``(path, is_completed)`` tuple. ``path`` is the checkpoint directory
        with the largest step, or ``None`` when ``checkpoint_dir`` does not
        exist or contains no numbered checkpoint. ``is_completed`` tells
        whether the ``completed`` marker exists (always ``False`` when the
        directory itself is missing).
    """
    if not isdir(checkpoint_dir):
        return None, False  # first training

    is_completed = exists(join(checkpoint_dir, 'completed'))
    prefix = 'checkpoint-'
    max_step = 0
    for filename in os.listdir(checkpoint_dir):
        if not filename.startswith(prefix) or not isdir(join(checkpoint_dir, filename)):
            continue
        try:
            step = int(filename[len(prefix):])
        except ValueError:
            # Directories such as "checkpoint-final" carry no step number; skip
            # them instead of aborting the whole lookup.
            continue
        max_step = max(max_step, step)

    if max_step == 0:
        return None, is_completed  # training started, but no checkpoint

    checkpoint_dir = join(checkpoint_dir, f'checkpoint-{max_step}')
    print(f"Found a previous checkpoint at: {checkpoint_dir}")
    return checkpoint_dir, is_completed  # checkpoint found!

class AdaLoraTrainer(Seq2SeqTrainer):
    """Seq2SeqTrainer whose loss includes an orthogonality penalty on the LoRA factors."""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    def compute_loss(self, model, inputs, return_outputs=False):
        """Run the model and return its loss plus the orthogonality penalty.

        For every parameter of the trainable adapter whose name contains
        ``lora_A`` or ``lora_B``, the Frobenius norm of ``P P^T - I`` (or
        ``P^T P - I`` for ``lora_B``) is computed; the mean of these norms,
        scaled by the adapter's ``orth_reg_weight``, is added in place to the
        loss returned by the model.

        When label smoothing is active the labels are withheld from the model,
        so the model returns no loss; in that case the penalty is skipped and
        the label-smoothed loss is returned as is (previously this path failed
        while trying to add the penalty to a missing loss).

        Args:
            model: The PEFT-wrapped model; must expose ``peft_config`` and
                ``trainable_adapter_name``.
            inputs: Batch dict passed to the model; ``labels`` is popped from
                it when a label smoother is configured.
            return_outputs: When true, return ``(loss, outputs)``.

        Returns:
            The loss, or ``(loss, outputs)`` when ``return_outputs`` is set.

        Raises:
            ValueError: If the model returns a dict without a ``loss`` entry
                and no label smoothing is used.
        """
        if self.label_smoother is not None and "labels" in inputs:
            labels = inputs.pop("labels")
        else:
            labels = None
        outputs = model(**inputs)

        orth_reg_weight = model.peft_config[model.trainable_adapter_name].orth_reg_weight
        assert orth_reg_weight > 0

        # Only regularize when the model actually produced a loss; output
        # objects expose a `loss` attribute that is None when no labels were fed.
        if getattr(outputs, "loss", None) is not None:
            regu_loss = 0
            num_param = 0
            for n, p in model.named_parameters():
                if ("lora_A" in n or "lora_B" in n) and model.trainable_adapter_name in n:
                    para_cov = p @ p.T if "lora_A" in n else p.T @ p
                    I = torch.eye(*para_cov.size(), out=torch.empty_like(para_cov))
                    I.requires_grad = False
                    num_param += 1
                    # Move each term to the loss device before accumulating.
                    regu_loss += torch.norm(para_cov - I, p="fro").to(outputs.loss.device)
            if num_param > 0:
                regu_loss = regu_loss / num_param
            else:
                regu_loss = 0
            outputs.loss += orth_reg_weight * regu_loss

        # Save past state if it exists
        # TODO: this needs to be fixed and made cleaner later.
        if self.args.past_index >= 0:
            self._past = outputs[self.args.past_index]

        if labels is not None:
            unwrapped_model = unwrap_model(model)
            if is_peft_available() and isinstance(unwrapped_model, PeftModel):
                model_name = unwrapped_model.base_model.model._get_name()
            else:
                model_name = unwrapped_model._get_name()
            # Causal LMs need the labels shifted by one position for smoothing.
            if model_name in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES.values():
                loss = self.label_smoother(outputs, labels, shift_labels=True)
            else:
                loss = self.label_smoother(outputs, labels)
        else:
            if isinstance(outputs, dict) and "loss" not in outputs:
                raise ValueError(
                    "The model did not return a loss from the inputs, only the following keys: "
                    f"{','.join(outputs.keys())}. For reference, the inputs it received are {','.join(inputs.keys())}."
                )
            # We don't use .loss here since the model may return tuples instead of ModelOutput.
            loss = outputs["loss"] if isinstance(outputs, dict) else outputs[0]

        return (loss, outputs) if return_outputs else loss

def train():
    """Parse CLI arguments, build the model and trainer, and run train/eval/predict.

    Steps, in order:
      1. Parse the command line into the argument dataclasses and merge them
         into a single ``argparse.Namespace`` (``args``).
      2. Configure logging verbosity for this process.
      3. Resolve the checkpoint directory, load the model and tokenizer with
         ``get_accelerate_model`` and build the datasets with
         ``make_data_module``.
      4. Create an ``AdaLoraTrainer``, register callbacks, and run whichever
         of ``do_train`` / ``do_eval`` / ``do_predict`` are enabled.
      5. Write the collected metrics to ``<output_dir>/metrics.json``.

    Raises:
        ValueError: if ``get_last_checkpoint`` reports that training already
            completed while ``do_train`` is set, or if ``peft_type`` is not
            ``'adalora'``.
    """
    hfparser = transformers.HfArgumentParser((
        ModelArguments, DataArguments, TrainingArguments, GenerationArguments, ExtraArguments
    ))
    # With return_remaining_strings=True the parser returns one object per
    # dataclass plus a trailing list of unparsed strings, so ``project_args``
    # is the ExtraArguments instance and ``extra_args`` holds the leftovers.
    model_args, data_args, training_args, generation_args, project_args, extra_args = \
        hfparser.parse_args_into_dataclasses(return_remaining_strings=True)
    training_args.generation_config = transformers.GenerationConfig(**vars(generation_args))
    # Flatten everything into one namespace; a field name shared by two
    # dataclasses would raise TypeError here (duplicate keyword argument).
    args = argparse.Namespace(
        **vars(model_args), **vars(data_args), **vars(training_args), **vars(project_args)
    )
    print(args)

    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        handlers=[logging.StreamHandler(sys.stdout)],
    )
    if training_args.should_log:
        transformers.utils.logging.set_verbosity_info()
    log_level = training_args.get_process_log_level()
    logger.setLevel(log_level)
    datasets.utils.logging.set_verbosity(log_level)
    transformers.utils.logging.set_verbosity(log_level)
    transformers.utils.logging.enable_default_handler()
    transformers.utils.logging.enable_explicit_format()
    # The two halves are concatenated, so the first must end with a separator;
    # otherwise the message reads "n_gpu: 1distributed training: ...".
    logger.warning(
        f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, "
        + f"distributed training: {bool(training_args.local_rank != -1)}, bfloat16 training: {training_args.bf16}"
    )

    if 'wandb' in args.report_to:
        init_wandb(args.project_name)

    # An output_dir that already names a checkpoint is used directly;
    # otherwise look up the latest checkpoint inside output_dir.
    if 'checkpoint-' in args.output_dir:
        checkpoint_dir = args.output_dir
    else:
        checkpoint_dir, completed_training = get_last_checkpoint(args.output_dir)
        if completed_training and args.do_train:
            raise ValueError('Training already completed. Please delete the output folder to start a new training.')

    model, tokenizer = get_accelerate_model(args, checkpoint_dir)

    model.config.use_cache = False
    print('loaded model')
    set_seed(args.seed)

    data_module = make_data_module(tokenizer=tokenizer, args=args)

    # NOTE: custom optimizer construction is currently disabled.
    # if args.dynamic_lr:
    #     optimzer = create_optimizer_based_on_lora_score(args, model)
    # else:
    #     optimzer = create_optimizer(args, model)

    # The predict dataset is passed to trainer.predict() explicitly below, so
    # it is excluded from the Trainer constructor kwargs.
    trainer = AdaLoraTrainer(
        model=model,
        tokenizer=tokenizer,
        args=training_args,
        **{k: v for k, v in data_module.items() if k != 'predict_dataset'},
    )

    # Give every module that exposes a ``trainer`` attribute a reference to
    # the trainer that was just created.
    for _, module in model.named_modules():
        if hasattr(module, 'trainer'):
            module.trainer = trainer

    # Explicit check instead of ``assert`` so it still runs under ``python -O``.
    if args.peft_type not in ['adalora']:
        raise ValueError(
            f"Unsupported peft_type {args.peft_type!r}; only 'adalora' is supported."
        )

    # Callbacks
    if not args.full_finetune:
        trainer.add_callback(SavePeftModelCallback)
    if args.peft_type == 'adalora':
        trainer.add_callback(AdaLoRACallback)

    # Verifying the datatypes and parameter counts before training.
    if training_args.local_rank == 0:
        print_trainable_parameters(args, model)
        # Element count per parameter dtype, then each dtype's share of the total.
        dtypes = {}
        for _, p in model.named_parameters():
            dtypes[p.dtype] = dtypes.get(p.dtype, 0) + p.numel()
        total = sum(dtypes.values())
        for k, v in dtypes.items():
            print(k, v, v/total)

    all_metrics = {"run_name": args.run_name}
    # Training
    if args.do_train:
        logger.info("*** Train ***")
        # Note: `resume_from_checkpoint` not supported for adapter checkpoints by HF.
        # Currently adapter checkpoint is reloaded as expected but optimizer/scheduler states are not.
        train_result = trainer.train()
        # save_model is called on every process; there is no rank guard here.
        trainer.save_model(os.path.join(args.output_dir, f"{PREFIX_CHECKPOINT_DIR}-final"))
        trainer.save_state()
        metrics = train_result.metrics
        trainer.log_metrics("train", metrics)
        trainer.save_metrics("train", metrics)
        all_metrics.update(metrics)

    # Evaluation
    if args.do_eval:
        logger.info("*** Evaluate ***")
        metrics = trainer.evaluate(metric_key_prefix="eval")
        trainer.log_metrics("eval", metrics)
        trainer.save_metrics("eval", metrics)
        all_metrics.update(metrics)

    # Prediction
    if args.do_predict:
        logger.info("*** Predict ***")
        prediction_output = trainer.predict(test_dataset=data_module['predict_dataset'], metric_key_prefix="predict")
        prediction_metrics = prediction_output.metrics
        predictions = prediction_output.predictions
        # -100 entries are swapped for the pad token id so the tokenizer can
        # decode the sequences.
        predictions = np.where(predictions != -100, predictions, tokenizer.pad_token_id)
        predictions = tokenizer.batch_decode(
            predictions, skip_special_tokens=True, clean_up_tokenization_spaces=True
        )
        with open(os.path.join(args.output_dir, 'predictions.jsonl'), 'w', encoding='utf-8') as fout:
            for i, example in enumerate(data_module['predict_dataset']):
                example['prediction_with_input'] = predictions[i].strip()
                # Remove the prompt text from the decoded output to keep only
                # the generated continuation.
                example['prediction'] = predictions[i].replace(example['input'], '').strip()
                fout.write(json.dumps(example) + '\n')
        print(prediction_metrics)
        trainer.log_metrics("predict", prediction_metrics)
        trainer.save_metrics("predict", prediction_metrics)
        all_metrics.update(prediction_metrics)

    if (args.do_train or args.do_eval or args.do_predict):
        with open(os.path.join(args.output_dir, "metrics.json"), "w", encoding="utf-8") as fout:
            fout.write(json.dumps(all_metrics))

# Script entry point: run the pipeline only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    train()
