import os
import sys
import argparse
from typing import List
from pathlib import Path

import torch
import transformers
from datasets import load_dataset

from peft import (
    LoraConfig,
    get_peft_model,
    get_peft_model_state_dict,
    prepare_model_for_int8_training,
    set_peft_model_state_dict,
)
from utils.prompter import Prompter, ZeroPrompter  # 请确保您有这些模块

# Default compute device: use the GPU when torch can see one, else the CPU.
device = "cpu"
if torch.cuda.is_available():
    device = "cuda"

# Wanda-style sparsification helper (scores entries by weight magnitude only)
def wanda_sparsify(weight, sparsity_ratio):
    """Zero out the smallest-magnitude entries of ``weight``.

    Only ``abs(weight)`` is used as the pruning score; no activation
    statistics are involved.

    Args:
        weight: Tensor to sparsify. It is not modified.
        sparsity_ratio: Fraction in ``[0, 1]`` of entries to prune.
            ``int(sparsity_ratio * weight.numel())`` entries are targeted.

    Returns:
        A new tensor equal to ``weight`` with the pruned entries set to zero.
        Because the mask is a float32 tensor, half-precision inputs come back
        promoted to float32. Entries whose magnitude ties with the pruning
        threshold are pruned as well, so slightly more than the requested
        fraction can be removed.

    Raises:
        ValueError: If ``sparsity_ratio`` is outside ``[0, 1]``.
    """
    if not 0.0 <= sparsity_ratio <= 1.0:
        raise ValueError(
            f"sparsity_ratio must be within [0, 1], got {sparsity_ratio!r}"
        )

    # Score in float32 for half-precision inputs: the upcast is exact, so the
    # selected entries are unchanged, and it avoids relying on half-precision
    # CPU kernels for abs/topk.
    scores = weight
    if scores.dtype in (torch.float16, torch.bfloat16):
        scores = scores.float()
    magnitudes = torch.abs(scores)

    k = int(sparsity_ratio * magnitudes.numel())
    if k == 0:
        # Nothing to prune (also covers empty tensors); topk(...).max() would
        # fail on an empty selection.
        keep_mask = torch.ones_like(magnitudes, dtype=torch.bool)
    else:
        # reshape (not view) so non-contiguous weights, e.g. transposed
        # matrices, are accepted.
        threshold = torch.topk(
            magnitudes.reshape(-1), k, largest=False
        ).values.max()
        keep_mask = magnitudes > threshold

    return weight * keep_mask.float()

# Compute the pruned-away part of the weight and factor it with a truncated SVD
def initialize_lora_with_wanda_svd(weight, sparsity_ratio, reduced_rank):
    """Factor the pruned-away part of ``weight`` into two low-rank matrices.

    The weight is sparsified with ``wanda_sparsify``; the residual
    ``weight - sparsified`` (i.e. the entries that were zeroed) is decomposed
    with a reduced SVD, and the leading ``reduced_rank`` singular triplets are
    split symmetrically between the two factors so that ``L @ R`` is the
    rank-``reduced_rank`` truncation of the residual.

    Args:
        weight: 2-D weight matrix of shape ``(out_features, in_features)``.
        sparsity_ratio: Fraction of entries pruned by ``wanda_sparsify``.
        reduced_rank: Number of singular components to keep.

    Returns:
        Tuple ``(L, R)`` with ``L`` of shape ``(out_features, k)`` and ``R`` of
        shape ``(k, in_features)``, where
        ``k = min(reduced_rank, out_features, in_features)``. In LoRA terms
        ``L`` has the layout of the up-projection (``lora_B``) and ``R`` that
        of the down-projection (``lora_A``).
    """
    kept = wanda_sparsify(weight, sparsity_ratio)
    pruned_part = weight - kept

    # torch.linalg.svd has no half-precision kernels; upcast explicitly rather
    # than relying on dtype promotion happening inside wanda_sparsify.
    if pruned_part.dtype in (torch.float16, torch.bfloat16):
        pruned_part = pruned_part.float()

    U, S, Vh = torch.linalg.svd(pruned_part, full_matrices=False)

    # Split each singular value evenly: sqrt(S) scales the columns of U and the
    # rows of Vh. Broadcasting avoids materialising dense diagonal matrices.
    root_s = torch.sqrt(S[:reduced_rank])
    L = U[:, :reduced_rank] * root_s
    R = root_s.unsqueeze(1) * Vh[:reduced_rank, :]
    return L, R

# Helper accessors for reading and writing LoRA adapter weights
def get_lora_weight(module_lora):
    """Return the raw weight tensor (``.weight.data``) of a LoRA sub-module.

    ``module_lora`` may be either a layer exposing ``.weight`` directly or a
    ``torch.nn.ModuleDict`` of such layers, in which case the entry stored
    under the ``'default'`` key is used.
    """
    is_adapter_dict = isinstance(module_lora, torch.nn.ModuleDict)
    layer = module_lora['default'] if is_adapter_dict else module_lora
    return layer.weight.data

def set_lora_weight(module_lora, new_weight):
    """Overwrite the weight of a LoRA sub-module with ``new_weight``.

    Args:
        module_lora: Either a layer exposing ``.weight`` or a
            ``torch.nn.ModuleDict`` of such layers, in which case the entry
            stored under the ``'default'`` key is updated.
        new_weight: Tensor with exactly the same shape as the current weight.
            It is converted to the current weight's dtype and device before
            being installed, so the layer stays consistent with the rest of
            the model.

    Raises:
        ValueError: If ``new_weight`` does not have the shape of the existing
            weight. Installing a mismatched tensor would silently corrupt the
            layer and only fail later, inside the forward pass.
    """
    if isinstance(module_lora, torch.nn.ModuleDict):
        layer = module_lora['default']
    else:
        layer = module_lora

    current = layer.weight
    if tuple(new_weight.shape) != tuple(current.shape):
        raise ValueError(
            "LoRA weight shape mismatch: expected "
            f"{tuple(current.shape)}, got {tuple(new_weight.shape)}"
        )

    layer.weight.data = new_weight.detach().to(
        device=current.device, dtype=current.dtype
    )

def main(args):
    """Fine-tune a causal language model with LoRA adapters.

    Loads the tokenizer and model named by ``args.base_model``, wraps the model
    with PEFT LoRA adapters, optionally re-initialises the adapter matrices
    from a truncated SVD of the pruned-away part of every target weight
    (``args.use_sparse_init``), tokenises the instruction dataset at
    ``args.data_path``, trains with ``transformers.Trainer`` and finally saves
    the adapter to ``args.output_dir``.

    Args:
        args: Parsed command-line namespace (see the argument parser in the
            ``__main__`` section of this file for the fields that are read).

    Raises:
        NotImplementedError: While mapping the dataset, if ``args.data_path``
            contains neither ``lamini`` nor ``alpaca``.
        ValueError: If ``args.extra_val_dataset`` names a dataset other than
            ``wikitext2`` or ``ptb``.
    """
    # Configure the WandB project
    os.environ["WANDB_PROJECT"] = args.wandb_project

    # Load the base model and tokenizer
    tokenizer = transformers.AutoTokenizer.from_pretrained(args.base_model)
    model = transformers.AutoModelForCausalLM.from_pretrained(args.base_model)

    # Prepare the model for LoRA / int8 training
    model = prepare_model_for_int8_training(model)
    config = LoraConfig(
        r=args.lora_r,
        lora_alpha=args.lora_alpha,
        target_modules=args.lora_target_modules.split(","),
        lora_dropout=args.lora_dropout,
        bias="none",
        task_type="CAUSAL_LM",
    )
    model = get_peft_model(model, config)
    model.print_trainable_parameters()

    gradient_accumulation_steps = args.batch_size // args.micro_batch_size
    if not args.no_instruction:
        prompter = Prompter(args.prompt_template_name)
    else:
        prompter = ZeroPrompter()

    world_size = int(os.environ.get("WORLD_SIZE", 1))
    ddp = world_size != 1
    if ddp:
        gradient_accumulation_steps = gradient_accumulation_steps // world_size
    # The integer divisions above floor to 0 when batch_size is smaller than
    # micro_batch_size * world_size; always accumulate at least one step.
    gradient_accumulation_steps = max(1, gradient_accumulation_steps)

    if device == 'cuda':
        model.half()
        # half() also converts the freshly created LoRA parameters. fp16 mixed
        # precision training expects the trainable parameters in fp32 (the
        # gradient scaler refuses to unscale fp16 gradients), so restore them.
        for param in model.parameters():
            if param.requires_grad:
                param.data = param.data.float()

    tokenizer.pad_token_id = 0
    tokenizer.padding_side = "left"

    def tokenize(prompt, add_eos_token=True):
        """Tokenise ``prompt`` (truncated to ``cutoff_len``) and add labels."""
        result = tokenizer(
            prompt,
            truncation=True,
            max_length=args.cutoff_len,
            padding=False,
            return_tensors=None,
        )
        # Append EOS only when requested, not already present, and when the
        # sequence was not cut off at the length limit.
        if (
            result["input_ids"][-1] != tokenizer.eos_token_id
            and len(result["input_ids"]) < args.cutoff_len
            and add_eos_token
        ):
            result["input_ids"].append(tokenizer.eos_token_id)
            result["attention_mask"].append(1)

        result["labels"] = result["input_ids"].copy()

        return result

    def generate_and_tokenize_prompt(data_point):
        """Build the prompt for one dataset row and tokenise it."""
        if 'lamini' in args.data_path.lower():
            full_prompt = prompter.generate_prompt(
                data_point["instruction"],
                None,
                data_point["response"],
            )
        elif 'alpaca' in args.data_path.lower():
            full_prompt = prompter.generate_prompt(
                data_point["instruction"],
                data_point.get("input", None),
                data_point["output"],
            )
        else:
            raise NotImplementedError

        tokenized_full_prompt = tokenize(full_prompt)
        if not args.train_on_inputs:
            # Mask the prompt tokens with -100 so the loss only covers the
            # response part of the sequence.
            user_prompt = prompter.generate_prompt(
                data_point["instruction"], data_point.get("input", None)
            )
            tokenized_user_prompt = tokenize(
                user_prompt, add_eos_token=args.add_eos_token
            )
            user_prompt_len = len(tokenized_user_prompt["input_ids"])

            if args.add_eos_token:
                user_prompt_len -= 1

            tokenized_full_prompt["labels"] = (
                [-100] * user_prompt_len + tokenized_full_prompt["labels"][user_prompt_len:]
            )
        return tokenized_full_prompt

    def split_and_tokenizer(test_data, tokenizer, seq_len, field_name):
        """Tokenise a text corpus and cut it into ``seq_len``-token samples."""
        test_ids = tokenizer("\n\n".join(test_data[field_name]), return_tensors='pt').input_ids[0]
        nsamples = test_ids.numel() // seq_len

        test_set = []
        for i in range(nsamples):
            batch = test_ids[(i * seq_len):((i + 1) * seq_len)]
            test_set.append({
                'input_ids': batch,
                'labels': batch
            })
        return test_set

    # Sparsification-based LoRA initialisation
    sparsity_ratio = 0.5  # adjust the sparsity level as needed

    if args.use_sparse_init:
        reduced_rank = config.r
        for name, module in model.named_modules():
            # Only LoRA-wrapped layers carry lora_A / lora_B. The plain
            # nn.Linear sub-modules inside an adapter (e.g.
            # "...q_proj.lora_A.default") also contain the target name but
            # must not be processed.
            if not (hasattr(module, "lora_A") and hasattr(module, "lora_B")):
                continue
            if not any(target in name for target in config.target_modules):
                continue
            lora_down, lora_up = module.lora_A, module.lora_B
            if isinstance(lora_down, torch.nn.ModuleDict) and 'default' not in lora_down:
                continue
            if isinstance(lora_up, torch.nn.ModuleDict) and 'default' not in lora_up:
                continue

            base_weight = getattr(module, "weight", None)
            if base_weight is None or base_weight.dim() != 2:
                continue
            weight = base_weight.data
            if weight.dtype in (torch.float16, torch.bfloat16):
                # SVD needs full precision; the upcast is exact.
                weight = weight.float()

            # The helper returns (L, R) with L: (out, r) and R: (r, in).
            # lora_A is the down-projection with weight (r, in) and lora_B the
            # up-projection with weight (out, r), so R -> lora_A, L -> lora_B.
            # NOTE(review): the base weight is left untouched, so the adapted
            # layer no longer starts out equal to the base layer — confirm
            # this is the intended initialisation.
            up_factor, down_factor = initialize_lora_with_wanda_svd(
                weight, sparsity_ratio, reduced_rank
            )
            set_lora_weight(lora_down, down_factor)
            set_lora_weight(lora_up, up_factor)

    # Load the training and validation datasets
    cache_file = f'datasets/cache/{args.data_path}.bin'
    if args.cache_dataset and os.path.exists(cache_file):
        # NOTE: torch.load unpickles arbitrary objects; only point this at
        # cache files produced by this script.
        preprocess_data = torch.load(cache_file)
        train_data, val_data = preprocess_data['train'], preprocess_data['val']
    else:
        # Only hit the dataset hub when there is no usable cache.
        data = load_dataset(args.data_path)
        train_val = data["train"].train_test_split(
            test_size=args.val_set_size, shuffle=True, seed=42
        )
        train_data = (
            train_val["train"].shuffle().map(generate_and_tokenize_prompt)
        )
        val_data = {
            args.data_path: train_val["test"].shuffle().map(generate_and_tokenize_prompt),
        }
        # Write the cache from a single process: local rank 0 under a
        # distributed launcher, or -1 when running as a plain single process.
        local_rank = int(os.environ.get("LOCAL_RANK", args.local_rank))
        if args.cache_dataset and local_rank in (-1, 0):
            cache_dir = os.path.dirname(cache_file)
            Path(cache_dir).mkdir(parents=True, exist_ok=True)

            torch.save({
                'train': train_data, 'val': val_data
            }, cache_file)

    # Load extra validation datasets (if provided)
    if args.extra_val_dataset:
        seq_len = 128
        for extra_dataset in args.extra_val_dataset.split(','):
            if 'wikitext2' in extra_dataset:
                test_data = load_dataset('wikitext', 'wikitext-2-raw-v1', split='test')
                test_data = split_and_tokenizer(test_data, tokenizer, seq_len, field_name='text')
            elif 'ptb' in extra_dataset:
                test_data = load_dataset('ptb_text_only', 'penn_treebank', split='test')
                test_data = split_and_tokenizer(test_data, tokenizer, seq_len, field_name='sentence')
            else:
                # Previously this fell through to an undefined (or stale)
                # test_data; fail loudly instead.
                raise ValueError(
                    f"Unsupported extra validation dataset: {extra_dataset!r}"
                )
            val_data[extra_dataset] = test_data

    trainer = transformers.Trainer(
        model=model,
        train_dataset=train_data,
        eval_dataset=val_data,
        args=transformers.TrainingArguments(
            per_device_train_batch_size=args.micro_batch_size,
            gradient_accumulation_steps=gradient_accumulation_steps,
            warmup_steps=100,
            num_train_epochs=args.num_epochs,
            learning_rate=args.learning_rate,
            # fp16 mixed precision is only requested when a GPU is in use.
            fp16=(device == 'cuda'),
            logging_steps=10,
            logging_first_step=True,
            optim="adamw_torch",
            evaluation_strategy="steps",
            save_strategy="steps",
            eval_steps=100,
            save_steps=200,
            output_dir=args.output_dir,
            save_total_limit=20,
            load_best_model_at_end=True,
            ddp_find_unused_parameters=None,
            group_by_length=args.group_by_length,
            report_to="wandb",
            run_name=os.path.basename(args.output_dir),
            metric_for_best_model=f"{args.data_path}_loss",
        ),
        data_collator=transformers.DataCollatorForSeq2Seq(
            tokenizer, pad_to_multiple_of=8, return_tensors="pt", padding=True
        ),
    )

    model.config.use_cache = False
    # While training, make state_dict() return only the adapter weights (via
    # get_peft_model_state_dict); the original method is restored before the
    # final save.
    old_state_dict = model.state_dict
    model.state_dict = (
        lambda self, *_, **__: get_peft_model_state_dict(
            self, old_state_dict()
        )
    ).__get__(model, type(model))

    trainer.train(resume_from_checkpoint=args.resume_from_checkpoint)

    model.state_dict = old_state_dict
    model.save_pretrained(args.output_dir)

if __name__ == "__main__":
    cli = argparse.ArgumentParser(description='Tuning LLM with LoRA')

    # Every command-line option, in registration order, as (flag, kwargs).
    option_table = [
        # Model and data locations
        ('--base_model', dict(type=str, default="baffo32/decapoda-research-llama-7B-hf", help='基础模型名称')),
        ('--prune_model', dict(type=str, help='剪枝模型名称')),
        ('--data_path', dict(type=str, default="yahma/alpaca-cleaned", help='数据路径')),
        ('--cache_dataset', dict(action="store_true", default=False)),
        ('--extra_val_dataset', dict(type=str, default=None, help='额外的验证数据集，用逗号分隔')),
        ('--output_dir', dict(type=str, default="./lora-alpaca", help='输出目录')),

        # Training hyper-parameters
        ('--batch_size', dict(type=int, default=128, help='批量大小')),
        ('--micro_batch_size', dict(type=int, default=4, help='微批量大小')),
        ('--num_epochs', dict(type=int, default=5, help='训练轮数')),
        ('--learning_rate', dict(type=float, default=3e-4, help='学习率')),
        ('--cutoff_len', dict(type=int, default=256, help='截断长度')),
        ('--val_set_size', dict(type=int, default=2000, help='验证集大小')),
        ('--prompt_template_name', dict(type=str, default="alpaca", help="使用的提示模板，将默认为 alpaca。")),
        ('--no_instruction', dict(action='store_true', default=False, help="是否不使用指令模板。")),

        # LoRA configuration
        ('--lora_r', dict(type=int, default=8, help='LoRA rank')),
        ('--lora_alpha', dict(type=int, default=16, help='LoRA alpha')),
        ('--lora_dropout', dict(type=float, default=0.05, help='LoRA dropout')),
        ('--lora_target_modules', dict(type=str, default="q_proj,k_proj,v_proj,o_proj,gate_proj,down_proj,up_proj", help='LoRA 目标模块')),
        ('--use_sparse_init', dict(action='store_true', help='使用稀疏部分初始化 LoRA 权重')),

        # LLM hyper-parameters
        ('--train_on_inputs', dict(default=False, action="store_true", help='在输入上训练。如果为 False，则在损失中屏蔽输入。')),
        ('--add_eos_token', dict(default=False, action="store_true")),
        ('--group_by_length', dict(default=False, action="store_true", help="更快，但会产生奇怪的训练损失曲线")),

        # WandB and checkpointing
        ('--wandb_project', dict(type=str, default="")),
        ('--resume_from_checkpoint', dict(type=str, help="训练检查点或最终适配器")),

        # DDP
        ('--local_rank', dict(type=int, default=-1)),
    ]
    for flag, spec in option_table:
        cli.add_argument(flag, **spec)

    main(cli.parse_args())
