import time

import torch
import torch.nn as nn

from optq import *

from modelutils import *
from quant import *
from datetime import datetime, timedelta


from gptqmodel.utils. importer import hf_select_quant_linear
from gptqmodel.quantization.config import QuantizeConfig, FORMAT


def pack_model(model, quantizers, bits=4, group_size=64):
    """Swap the quantized ``nn.Linear`` layers of ``model`` for packed QuantLinear modules.

    For every module whose qualified name is a key of ``quantizers`` and which
    is an ``nn.Linear``, the weight is re-quantized on the grid described by the
    matching quantizer (``scale``, ``zero``, ``maxq``), packed into a
    ``QuantLinear`` obtained from ``hf_select_quant_linear`` and spliced into
    the model in place of the original layer.

    Args:
        model: Model whose sub-modules are replaced in place.
        quantizers: Mapping from qualified module name to a quantizer object
            exposing ``scale``, ``zero`` and ``maxq``.
            NOTE(review): ``scale``/``zero`` are assumed to hold one entry per
            (output row, group) so that ``reshape(-1, 1)`` lines up with the
            grouped weight - confirm against the quantizer implementation.
        bits: Bit width forwarded to the QuantLinear kernel.
        group_size: Number of input features sharing one scale/zero pair.
            ``-1`` is treated as "one group spanning the whole row".

    Returns:
        The same ``model`` object, with the matching layers replaced.
    """
    QuantLinear = hf_select_quant_linear(
        bits=bits,
        group_size=group_size,
        desc_act=False,
        sym=False,
        checkpoint_format="safetensors",
        device_map="cuda"
    )

    def replace_module(root, target_name, new_module):
        # Walk the dotted path down to the parent, then overwrite the leaf.
        *parent_path, attr = target_name.split('.')
        parent = root
        for part in parent_path:
            parent = parent._modules[part]
        parent._modules[attr] = new_module

    # Snapshot the targets first: the module tree is mutated below, and
    # mutating it while ``named_modules()`` is still walking is fragile.
    targets = [
        (name, layer)
        for name, layer in model.named_modules()
        if name in quantizers and isinstance(layer, nn.Linear)
    ]

    for name, layer in targets:
        quantizer = quantizers[name]
        has_bias = layer.bias is not None
        quant_linear = QuantLinear(
            in_features=layer.in_features,
            out_features=layer.out_features,
            bias=has_bias,
            bits=bits,
            group_size=group_size,
            desc_act=False,
            sym=False
        )

        weight = layer.weight.data.clone().cuda()
        out_features, in_features = weight.shape

        # group_size == -1 means "no grouping"; using it directly in the floor
        # division would yield a negative row count and break the reshape.
        effective_group = in_features if group_size == -1 else group_size

        # Keep the quantization grid on the same device as the weight.
        scale = quantizer.scale.reshape(-1, 1).to(weight.device)
        zero = quantizer.zero.reshape(-1, 1).to(weight.device)

        weight_grouped = weight.reshape(
            out_features * (in_features // effective_group), effective_group
        )
        q = torch.clamp(torch.round(weight_grouped / scale) + zero, 0, quantizer.maxq)
        fake_quant_weight = (scale * (q - zero)).reshape(out_features, in_features)

        # Temporary dense layer holding the de-quantized weights for ``pack``.
        # It carries the original bias so that a biased layer is not silently
        # packed as bias-free.
        fake_quant_layer = nn.Linear(
            in_features=in_features, out_features=out_features, bias=has_bias
        )
        fake_quant_layer.weight.data = fake_quant_weight
        if has_bias:
            fake_quant_layer.bias.data = layer.bias.data.clone()

        quant_linear.pack(fake_quant_layer.cpu(), quantizer.scale.cpu(), quantizer.zero.cpu())
        quant_linear.post_init()
        replace_module(model, name, quant_linear)

        # Drop the GPU temporaries before the next layer is cloned.
        del weight, weight_grouped, q, fake_quant_weight

    for quantized_module_name, quantized_module in model.named_modules():
        if hasattr(quantized_module, "qweight"): # QuantLinear layer
            quantized_module.wf_unsqueeze_zero = quantized_module.wf_unsqueeze_zero.cuda()
            quantized_module.wf_unsqueeze_neg_one = quantized_module.wf_unsqueeze_neg_one.cuda()
    return model


def get_llama(model):
    """Load a Llama causal-LM checkpoint with torch weight initializers disabled.

    ``torch.nn.init.kaiming_uniform_``, ``uniform_`` and ``normal_`` are
    replaced process-wide by a no-op before the checkpoint is loaded; the
    patch is not undone afterwards.

    Args:
        model: Checkpoint identifier or path handed to
            ``LlamaForCausalLM.from_pretrained``.

    Returns:
        The loaded model, with an extra ``seqlen`` attribute set to 2048.
    """
    import torch

    def _noop(*args, **kwargs):
        pass

    # Patch the initializers *before* transformers is imported, exactly as the
    # statement order requires.
    for init_name in ('kaiming_uniform_', 'uniform_', 'normal_'):
        setattr(torch.nn.init, init_name, _noop)

    from transformers import LlamaForCausalLM
    loaded = LlamaForCausalLM.from_pretrained(model, torch_dtype='auto')
    loaded.seqlen = 2048
    return loaded

@torch.no_grad()
def llama_sequential(model, dataloader, dev):
    """Quantize the decoder layers of ``model`` one at a time on ``dev``.

    The inputs reaching the first decoder layer are captured for every
    calibration batch; each layer is then moved to ``dev``, its linear
    sub-layers are quantized with ``GPTQ``/``Quantizer``, the layer is re-run
    to produce the inputs of the next layer, and it is moved back to the CPU.

    Settings are read from the module-level ``args`` namespace (``nsamples``,
    ``wbits``, ``groupsize``, ``true_sequential``, ``sym``, ``percdamp``,
    ``act_order``, ``static_groups``, ``magr``), so ``args`` must exist before
    this function is called.

    Args:
        model: Llama model exposing ``model.layers``, ``model.embed_tokens``,
            ``model.norm``, ``config`` and ``seqlen``.
        dataloader: Iterable of calibration batches; element ``0`` of each
            batch is fed to the model.
        dev: Device on which the layers are processed.

    Returns:
        Dict mapping ``'model.layers.<i>.<name>'`` to the quantizer used for
        that sub-layer.
    """
    print('Starting ...')

    use_cache = model.config.use_cache
    model.config.use_cache = False
    layers = model.model.layers

    model.model.embed_tokens = model.model.embed_tokens.to(dev)
    model.model.norm = model.model.norm.to(dev)
    layers[0] = layers[0].to(dev)

    dtype = next(iter(model.parameters())).dtype
    inps = torch.zeros(
        (args.nsamples, model.seqlen, model.config.hidden_size), dtype=dtype, device=dev
    )
    cache = {'i': 0, 'attention_mask': None}

    class _InputCaptured(Exception):
        """Raised by ``Catcher`` to abort the forward pass once inputs are stored."""

    class Catcher(nn.Module):
        """Stand-in for the first layer that records its inputs and aborts."""

        def __init__(self, module):
            super().__init__()
            self.module = module

        def forward(self, inp, **kwargs):
            inps[cache['i']] = inp
            cache['i'] += 1
            cache['attention_mask'] = kwargs['attention_mask']
            # Handle both old and new position embeddings format
            if 'position_ids' in kwargs:
                cache['position_ids'] = kwargs['position_ids']
            cache['rotary_emb'] = kwargs.get('position_embeddings', None)
            raise _InputCaptured

    layers[0] = Catcher(layers[0])
    try:
        for batch in dataloader:
            try:
                model(batch[0].to(dev))
            except _InputCaptured:
                # Expected: only the private sentinel is swallowed, so a genuine
                # error raised by the model is no longer hidden.
                pass
    finally:
        # Always unwrap the first layer, even if capturing failed.
        layers[0] = layers[0].module

    layers[0] = layers[0].cpu()
    model.model.embed_tokens = model.model.embed_tokens.cpu()
    model.model.norm = model.model.norm.cpu()
    torch.cuda.empty_cache()

    outs = torch.zeros_like(inps)
    attention_mask = cache['attention_mask']
    position_ids = cache.get('position_ids', None)
    rotary_emb = cache.get('rotary_emb', None)

    # The per-layer call arguments never change after capture, so build them once.
    # Handle both old and new position embeddings format
    forward_kwargs = {'attention_mask': attention_mask}
    if position_ids is not None:
        forward_kwargs['position_ids'] = position_ids
    if rotary_emb is not None:
        forward_kwargs['position_embeddings'] = rotary_emb
    # Add return_dict=False and use_cache=False for transformers 4.48+
    forward_kwargs['return_dict'] = False
    forward_kwargs['use_cache'] = False

    print('Ready.')
    start_time = datetime.now()

    beta = 1
    CD_iter = 1

    if args.wbits == 2:
        beta = 0.8
        CD_iter = 30
    elif args.wbits == 3:
        beta = 0.9

    # NOTE: a group size of 128 or 64 overrides the bit-width based beta above.
    if args.groupsize == 128:
        beta = 0.95
    elif args.groupsize == 64:
        beta = 1

    quantizers = {}
    for i in range(len(layers)):
        layer = layers[i].to(dev)
        full = find_layers(layer)

        if args.true_sequential:
            sequential = [
                ['self_attn.k_proj', 'self_attn.v_proj', 'self_attn.q_proj'],
                ['self_attn.o_proj'],
                ['mlp.up_proj', 'mlp.gate_proj'],
                ['mlp.down_proj']
            ]
        else:
            sequential = [list(full.keys())]

        for names in sequential:
            subset = {n: full[n] for n in names}

            gptq = {}
            for name in subset:
                gptq[name] = GPTQ(subset[name])
                gptq[name].quantizer = Quantizer()
                gptq[name].quantizer.configure(
                    args.wbits, perchannel=True, sym=args.sym, mse=False, beta=beta,
                )

            def add_batch(name):
                # Forward hook feeding the sub-layer's input/output to its GPTQ state.
                def tmp(_, inp, out):
                    gptq[name].add_batch(inp[0].data, out.data)
                return tmp

            handles = [
                subset[name].register_forward_hook(add_batch(name)) for name in subset
            ]
            # Run the calibration samples through the layer so the hooks fire.
            for j in range(args.nsamples):
                outs[j] = layer(inps[j].unsqueeze(0), **forward_kwargs)[0]
            for h in handles:
                h.remove()

            for name in subset:
                print(i, name)
                print('Quantizing ...')
                gptq[name].fasterquant(
                    percdamp=args.percdamp, groupsize=args.groupsize, actorder=args.act_order, static_groups=args.static_groups, magr=args.magr, CD_iter=CD_iter
                )
                quantizers['model.layers.%d.%s' % (i, name)] = gptq[name].quantizer
                gptq[name].free()

        # Re-run the layer after its sub-layers were quantized; the outputs
        # become the inputs of the next layer.
        for j in range(args.nsamples):
            outs[j] = layer(inps[j].unsqueeze(0), **forward_kwargs)[0]

        layers[i] = layer.cpu()
        del layer
        del gptq
        torch.cuda.empty_cache()

        inps, outs = outs, inps

    model.config.use_cache = use_cache
    end_time = datetime.now()

    # The message says "evaluation", but the interval measured here is the
    # quantization loop above; the text is kept unchanged for log compatibility.
    print(f'\nTime used for evaluation: {end_time - start_time}\n')

    return quantizers

@torch.no_grad()
def llama_eval(model, testenc, dev):
    """Compute, print and return the perplexity of ``model`` on ``testenc``.

    The token stream is cut into ``testenc.input_ids.numel() // model.seqlen``
    segments of ``model.seqlen`` tokens. The decoder layers are executed one at
    a time on ``dev``, then the final norm and LM head are applied to obtain
    the loss per segment. When the module-level ``args.nearest`` flag is set,
    the weights of every sub-layer returned by ``find_layers`` are overwritten
    with their round-to-nearest quantized values (``args.wbits`` bits) just
    before the layer is evaluated.

    Args:
        model: Llama model exposing ``model.layers``, ``model.embed_tokens``,
            ``model.norm``, ``lm_head``, ``config`` and ``seqlen``.
        testenc: Tokenized evaluation data with an ``input_ids`` tensor.
        dev: Device on which the evaluation runs.

    Returns:
        The perplexity as a Python float (the same value that is printed).
    """
    print('Evaluating ...')

    testenc = testenc.input_ids
    nsamples = testenc.numel() // model.seqlen

    use_cache = model.config.use_cache
    model.config.use_cache = False
    layers = model.model.layers

    model.model.embed_tokens = model.model.embed_tokens.to(dev)
    layers[0] = layers[0].to(dev)

    dtype = next(iter(model.parameters())).dtype
    inps = torch.zeros(
        (nsamples, model.seqlen, model.config.hidden_size), dtype=dtype, device=dev
    )
    cache = {'i': 0, 'attention_mask': None}

    class _InputCaptured(Exception):
        """Raised by ``Catcher`` to abort the forward pass once inputs are stored."""

    class Catcher(nn.Module):
        """Stand-in for the first layer that records its inputs and aborts."""

        def __init__(self, module):
            super().__init__()
            self.module = module

        def forward(self, inp, **kwargs):
            inps[cache['i']] = inp
            cache['i'] += 1
            cache['attention_mask'] = kwargs['attention_mask']
            if 'position_ids' in kwargs:
                cache['position_ids'] = kwargs['position_ids']
            cache['rotary_emb'] = kwargs.get('position_embeddings', None)
            raise _InputCaptured

    layers[0] = Catcher(layers[0])
    try:
        for i in range(nsamples):
            batch = testenc[:, (i * model.seqlen):((i + 1) * model.seqlen)].to(dev)
            try:
                model(batch)
            except _InputCaptured:
                # Expected: only the private sentinel is swallowed, so a genuine
                # error raised by the model is no longer hidden.
                pass
    finally:
        # Always unwrap the first layer, even if capturing failed.
        layers[0] = layers[0].module

    layers[0] = layers[0].cpu()
    model.model.embed_tokens = model.model.embed_tokens.cpu()
    torch.cuda.empty_cache()

    outs = torch.zeros_like(inps)
    attention_mask = cache['attention_mask']
    position_ids = cache.get('position_ids', None)
    rotary_emb = cache.get('rotary_emb', None)

    # The per-layer call arguments never change after capture, so build them once.
    # Handle both old and new position embeddings format
    forward_kwargs = {'attention_mask': attention_mask}
    if position_ids is not None:
        forward_kwargs['position_ids'] = position_ids
    if rotary_emb is not None:
        forward_kwargs['position_embeddings'] = rotary_emb
    # Add return_dict=False and use_cache=False for transformers 4.48+
    forward_kwargs['return_dict'] = False
    forward_kwargs['use_cache'] = False

    for i in range(len(layers)):
        print(i)
        layer = layers[i].to(dev)

        if args.nearest:
            # Round-to-nearest baseline: quantize the weights in place.
            subset = find_layers(layer)
            for name in subset:
                quantizer = Quantizer()
                quantizer.configure(
                    args.wbits, perchannel=True, sym=False, mse=False
                )
                W = subset[name].weight.data
                quantizer.find_params(W, weight=True)
                subset[name].weight.data = quantize(
                    W, quantizer.scale, quantizer.zero, quantizer.maxq
                ).to(next(iter(layer.parameters())).dtype)

        for j in range(nsamples):
            outs[j] = layer(inps[j].unsqueeze(0), **forward_kwargs)[0]

        layers[i] = layer.cpu()
        del layer
        torch.cuda.empty_cache()
        # This layer's outputs are the next layer's inputs.
        inps, outs = outs, inps

    if model.model.norm is not None:
        model.model.norm = model.model.norm.to(dev)
    model.lm_head = model.lm_head.to(dev)

    testenc = testenc.to(dev)
    loss_fct = nn.CrossEntropyLoss()
    nlls = []
    for i in range(nsamples):
        hidden_states = inps[i].unsqueeze(0)
        if model.model.norm is not None:
            hidden_states = model.model.norm(hidden_states)
        lm_logits = model.lm_head(hidden_states)
        # Position t predicts token t + 1, hence the one-step shift.
        shift_logits = lm_logits[:, :-1, :].contiguous()
        shift_labels = testenc[
            :, (i * model.seqlen):((i + 1) * model.seqlen)
        ][:, 1:]
        loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
        neg_log_likelihood = loss.float() * model.seqlen
        nlls.append(neg_log_likelihood)
    ppl = torch.exp(torch.stack(nlls).sum() / (nsamples * model.seqlen))
    ppl_value = ppl.item()
    print(ppl_value)

    model.config.use_cache = use_cache
    return ppl_value

def llama_pack3(model, quantizers):
    """Convert the layers named in ``quantizers`` to ``Quant3Linear`` and pack them.

    Every quantizer in ``quantizers`` that belongs to a converted layer is
    replaced by the result of its ``.cpu()`` call, so the caller's dict is
    modified.

    Args:
        model: Model handed to ``make_quant3``.
        quantizers: Mapping from layer name to a quantizer exposing ``cpu()``,
            ``scale`` and ``zero``.

    Returns:
        The same ``model`` object.
    """
    found = find_layers(model)
    print(found)
    # Keep references to the original layers; they are the source for packing.
    originals = {}
    for key in quantizers:
        originals[key] = found[key]

    make_quant3(model, quantizers)
    packed_layers = find_layers(model, [Quant3Linear])

    print('Packing ...')
    for key in packed_layers:
        cpu_quantizer = quantizers[key].cpu()
        quantizers[key] = cpu_quantizer
        packed_layers[key].pack(originals[key], cpu_quantizer.scale, cpu_quantizer.zero)
    print('Done.')
    return model


if __name__ == '__main__':
    # Command-line entry point: load the model, optionally quantize it with
    # llama_sequential, evaluate perplexity, and optionally pack, save and
    # re-evaluate the packed model.
    import argparse
    from datautils import *

    parser = argparse.ArgumentParser()

    parser.add_argument(
        'model', type=str,
        help='Llama model to load; pass location of hugginface converted checkpoint.'
    )
    parser.add_argument(
        '--dataset', type=str, default="wikitext2", choices=['wikitext2', 'ptb', 'c4'],
        help='Where to extract calibration data from.'
    )
    parser.add_argument(
        '--seed',
        type=int, default=0, help='Seed for sampling the calibration data.'
    )
    parser.add_argument(
        '--nsamples', type=int, default=128,
        help='Number of calibration data samples.'
    )
    parser.add_argument(
        '--percdamp', type=float, default=.01,
        help='Percent of the average Hessian diagonal to use for dampening.'
    )
    parser.add_argument(
        '--nearest', action='store_true',
        help='Whether to run the RTN baseline.'
    )
    parser.add_argument(
        '--wbits', type=int, default=4, choices=[2, 3, 4, 8, 16],
        help='#bits to use for quantization; use 16 for evaluating base model.'
    )
    parser.add_argument(
        '--groupsize', type=int, default=-1,
        help='Groupsize to use for quantization; default uses full row.'
    )
    parser.add_argument(
        '--sym', action='store_true',
        help='Whether to perform symmetric quantization.'
    )
    parser.add_argument(
        '--save', type=str, default='',
        help='Save quantized checkpoint under this name.'
    )
    parser.add_argument(
        '--new-eval', action='store_true',
        help='Whether to use the new PTB and C4 eval.'
    )
    parser.add_argument(
        '--act-order', action='store_true',
        help='Whether to apply the activation order GPTQ heuristic'
    )

    parser.add_argument(
        '--true-sequential', action='store_true',
        help='Whether to run in true sequential model.'
    )
    parser.add_argument(
        '--static-groups', action='store_true',
        help='Whether to use static groups; recommended when using `--act-order` for more efficient inference.'
    )
    parser.add_argument(
        '--magr', action='store_true',
        help='Whether to apply the MagR process.'
    )

    # ``args`` and ``DEV`` stay module-level names: the functions above read
    # ``args`` as a global.
    args = parser.parse_args()

    # Quantizers are produced only on the GPTQ path. Packing needs them, so
    # reject an impossible --save up front instead of failing with a NameError
    # after the whole run.
    run_gptq = args.wbits < 16 and not args.nearest
    if args.save and not run_gptq:
        parser.error('--save requires GPTQ quantization (--wbits below 16 and no --nearest).')

    # Add DEV variable definition
    DEV = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    model = get_llama(args.model)

    model.eval()

    dataloader, testloader = get_loaders(
        args.dataset, nsamples=args.nsamples, seed=args.seed, model=args.model, seqlen=model.seqlen
    )

    if run_gptq:
        tick = time.time()
        quantizers = llama_sequential(model, dataloader, DEV)
        print(time.time() - tick)

    def evaluate_all():
        # Print the perplexity of the current model on each evaluation dataset.
        datasets = ['wikitext2', 'c4']
        if args.new_eval:
            datasets = ['wikitext2', 'c4-new']
        for dataset in datasets:
            _, eval_loader = get_loaders(
                dataset, seed=args.seed, model=args.model, seqlen=model.seqlen
            )
            print(dataset)
            llama_eval(model, eval_loader, DEV)

    evaluate_all()

    if args.save:
        pack_model(model, quantizers, bits=args.wbits, group_size=args.groupsize)
        gptq_config = QuantizeConfig(
            bits=args.wbits,
            group_size=args.groupsize,
            sym=args.sym,
            format=FORMAT.GPTQ_V2,
        )
        model.config.quantization_config = gptq_config
        model.save_pretrained(args.save)

        # Evaluate again only when the model was actually packed; without
        # packing the model is unchanged and a second pass would merely repeat
        # the numbers printed above.
        evaluate_all()