import random
import os
import numpy as np
import torch
from datasets import load_dataset, load_from_disk
from transformers import AutoTokenizer, LlamaTokenizer

# On-disk dataset locations read by the loader functions in this module.
# The C4 directories double as a cache: get_c4 downloads the shard and saves
# it there when the directory is empty.
c4_traindataset_path = "/data/jiangcheng1/workspace/sparsegpt-master/datasets/train/c4"
c4_testdataset_path = "/data/jiangcheng1/workspace/sparsegpt-master/datasets/test/c4"
# PTB and WikiText directories are only read with load_from_disk.
ptb_traindataset_path = "/data/jiangcheng1/workspace/sparsegpt-master/datasets/train/ptb"
ptb_testdataset_path = "/data/jiangcheng1/workspace/sparsegpt-master/datasets/test/ptb"
wikitext_traindataset_path = "/data/jiangcheng1/workspace/sparsegpt-master/datasets/train/wikitext"
wikitext_testdataset_path = "/data/jiangcheng1/workspace/sparsegpt-master/datasets/test/wikitext"


def set_seed(seed):
    """Seed the Python, NumPy and PyTorch random number generators.

    Args:
        seed: Integer seed applied to all three generators.
    """
    # The stdlib generator is seeded as well because the loaders in this
    # module sample their windows with ``random.randint``.
    random.seed(seed)
    np.random.seed(seed)
    torch.random.manual_seed(seed)

def get_tokenizer(model):
    """Return a slow (``use_fast=False``) tokenizer for ``model``.

    Names containing "llama" (case-insensitive) are loaded through
    ``LlamaTokenizer``; every other name goes through ``AutoTokenizer``.

    Args:
        model: Model name or path handed to ``from_pretrained``.

    Returns:
        The loaded tokenizer.
    """
    if "llama" not in model.lower():
        return AutoTokenizer.from_pretrained(model, use_fast=False)

    llama_tok = LlamaTokenizer.from_pretrained(model, use_fast=False)
    # fix for transformer 4.28.0.dev0 compatibility: force BOS/EOS ids to
    # 1 and 2 whenever the loaded tokenizer reports different values.
    ids_already_set = llama_tok.bos_token_id == 1 and llama_tok.eos_token_id == 2
    if not ids_already_set:
        try:
            llama_tok.bos_token_id = 1
            llama_tok.eos_token_id = 2
        except AttributeError:
            # Best effort: keep the tokenizer's own ids if they cannot be set.
            pass
    return llama_tok

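# Earlier version that fetched WikiText-2 with load_dataset; superseded by the
# load_from_disk variant of get_wikitext2 defined later in this file.
# def get_wikitext2(nsamples, seed, seqlen, model, tokenizer):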
    
#     traindata = load_dataset('wikitext', 'wikitext-2-raw-v1', split='train')
#     testdata = load_dataset('wikitext', 'wikitext-2-raw-v1', split='test')

#     trainenc = tokenizer(" ".join(traindata['text']), return_tensors='pt')
#     testenc = tokenizer("\n\n".join(testdata['text']), return_tensors='pt')

#     random.seed(seed)
#     trainloader = []
#     for _ in range(nsamples):
#         i = random.randint(0, trainenc.input_ids.shape[1] - seqlen - 1)
#         j = i + seqlen
#         inp = trainenc.input_ids[:, i:j]
#         tar = inp.clone()
#         tar[:, :-1] = -100
#         trainloader.append((inp, tar))
#     return trainloader, testenc

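# Earlier version that fetched PTB with load_dataset; superseded by the
# load_from_disk variant of get_ptb defined later in this file.
# def get_ptb(nsamples, seed, seqlen, model, tokenizer):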
#     traindata = load_dataset('ptb_text_only', 'penn_treebank', split='train')
#     testdata = load_dataset('ptb_text_only', 'penn_treebank', split='test')

#     trainenc = tokenizer(" ".join(traindata['sentence']), return_tensors='pt')
#     testenc = tokenizer(" ".join(testdata['sentence']), return_tensors='pt')

#     random.seed(seed)
#     trainloader = []
#     for _ in range(nsamples):
#         i = random.randint(0, trainenc.input_ids.shape[1] - seqlen - 1)
#         j = i + seqlen
#         inp = trainenc.input_ids[:, i:j]
#         tar = inp.clone()
#         tar[:, :-1] = -100
#         trainloader.append((inp, tar))
#     return trainloader, testenc

def get_ptb(nsamples, seed, seqlen, model, tokenizer):
    """Build PTB calibration samples and the tokenized PTB test text.

    Both splits are read from disk, their ``'sentence'`` entries joined with
    single spaces, and the joined text tokenized in one call each.

    Args:
        nsamples: Number of training windows to draw.
        seed: Seed for the stdlib ``random`` generator used for sampling.
        seqlen: Length of each window along dimension 1 of ``input_ids``.
        model: Unused here; kept for a uniform loader signature.
        tokenizer: Callable returning an object with ``input_ids``.

    Returns:
        ``(trainloader, testenc)``: a list of ``(inp, tar)`` pairs, where
        ``tar`` is a copy of ``inp`` with every position but the last set to
        -100, and the tokenizer output for the test text.
    """
    train_split = load_from_disk(ptb_traindataset_path)
    test_split = load_from_disk(ptb_testdataset_path)

    trainenc = tokenizer(" ".join(train_split['sentence']), return_tensors='pt')
    testenc = tokenizer(" ".join(test_split['sentence']), return_tensors='pt')

    random.seed(seed)
    samples = []
    for _ in range(nsamples):
        token_ids = trainenc.input_ids
        start = random.randint(0, token_ids.shape[1] - seqlen - 1)
        window = token_ids[:, start:start + seqlen]
        labels = window.clone()
        # Keep only the final position as a target.
        labels[:, :-1] = -100
        samples.append((window, labels))
    return samples, testenc

def get_wikitext2(nsamples, seed, seqlen, model, tokenizer):
    """Build WikiText-2 calibration samples and the tokenized test text.

    The train split's ``'text'`` entries are joined with single spaces and
    the test split's with blank lines (``"\\n\\n"``) before tokenization.

    Args:
        nsamples: Number of training windows to draw.
        seed: Seed for the stdlib ``random`` generator used for sampling.
        seqlen: Length of each window along dimension 1 of ``input_ids``.
        model: Unused here; kept for a uniform loader signature.
        tokenizer: Callable returning an object with ``input_ids``.

    Returns:
        ``(trainloader, testenc)``: a list of ``(inp, tar)`` pairs, where
        ``tar`` is a copy of ``inp`` with every position but the last set to
        -100, and the tokenizer output for the test text.
    """
    train_split = load_from_disk(wikitext_traindataset_path)
    test_split = load_from_disk(wikitext_testdataset_path)

    train_text = " ".join(train_split['text'])
    trainenc = tokenizer(train_text, return_tensors='pt')
    test_text = "\n\n".join(test_split['text'])
    testenc = tokenizer(test_text, return_tensors='pt')

    random.seed(seed)
    samples = []
    for _ in range(nsamples):
        begin = random.randint(0, trainenc.input_ids.shape[1] - seqlen - 1)
        end = begin + seqlen
        window = trainenc.input_ids[:, begin:end]
        labels = window.clone()
        # Keep only the final position as a target.
        labels[:, :-1] = -100
        samples.append((window, labels))
    return samples, testenc

def get_c4(nsamples, seed, seqlen, model, tokenizer):
    """Build C4 calibration samples and a truncated C4 validation tensor.

    Each split is loaded from its on-disk directory when that directory
    exists and is non-empty; otherwise the shard is downloaded with
    ``load_dataset`` and saved there for later runs.

    Args:
        nsamples: Number of training windows to draw.
        seed: Seed for the stdlib ``random`` generator used for sampling.
        seqlen: Length of each window along dimension 1 of ``input_ids``.
        model: Unused here; kept for a uniform loader signature.
        tokenizer: Callable returning an object with ``input_ids``.

    Returns:
        ``(trainloader, valenc)``: a list of ``(inp, tar)`` pairs, where
        ``tar`` is a copy of ``inp`` with every position but the last set to
        -100, and a wrapper object whose ``input_ids`` attribute holds the
        first ``256 * seqlen`` validation token ids.
    """
    def _is_cached(path):
        # os.listdir raises FileNotFoundError for a missing directory, which
        # is exactly the first-run case, so check existence first.
        return os.path.isdir(path) and bool(os.listdir(path))

    if _is_cached(c4_traindataset_path):
        traindata = load_from_disk(c4_traindataset_path)
    else:
        traindata = load_dataset(
            'allenai/c4', 'allenai--c4', data_files={'train': 'en/c4-train.00000-of-01024.json.gz'}, split='train',
        )
        traindata.save_to_disk(c4_traindataset_path)

    if _is_cached(c4_testdataset_path):
        valdata = load_from_disk(c4_testdataset_path)
    else:
        valdata = load_dataset(
            'allenai/c4', 'allenai--c4', data_files={'validation': 'en/c4-validation.00000-of-00008.json.gz'}, split='validation'
        )
        valdata.save_to_disk(c4_testdataset_path)

    random.seed(seed)
    trainloader = []
    for _ in range(nsamples):
        # Redraw documents until one is long enough to cut a window from.
        while True:
            i = random.randint(0, len(traindata) - 1)
            trainenc = tokenizer(traindata[i]['text'], return_tensors='pt')
            # Strictly longer than seqlen: a document of exactly seqlen
            # tokens would make the randint range below empty (0..-1) and
            # raise ValueError.
            if trainenc.input_ids.shape[1] > seqlen:
                break
        i = random.randint(0, trainenc.input_ids.shape[1] - seqlen - 1)
        j = i + seqlen
        inp = trainenc.input_ids[:, i:j]
        tar = inp.clone()
        # Keep only the final position as a target.
        tar[:, :-1] = -100
        trainloader.append((inp, tar))

    # Tokenize the first 1100 validation documents as one text and keep at
    # most 256 * seqlen token ids.
    valenc = tokenizer(' '.join(valdata[:1100]['text']), return_tensors='pt')
    valenc = valenc.input_ids[:, :(256 * seqlen)]

    class TokenizerWrapper:
        """Expose a bare tensor through an ``input_ids`` attribute."""

        def __init__(self, input_ids):
            self.input_ids = input_ids
    valenc = TokenizerWrapper(valenc)

    return trainloader, valenc

def get_loaders(name, nsamples=128, seed=0, seqlen=2048, model=''):
    """Return calibration samples and evaluation data for a named dataset.

    The dataset is chosen by substring match on ``name``, checked in the
    order ``'wikitext2'``, ``'ptb'``, ``'c4'``.

    Args:
        name: Dataset identifier containing one of the substrings above.
        nsamples: Number of training windows to draw.
        seed: Seed for the sampling random generator.
        seqlen: Length of each sampled window.
        model: Model name or path used to load the tokenizer.

    Returns:
        The ``(trainloader, testenc)`` pair produced by the selected loader.

    Raises:
        ValueError: If ``name`` matches none of the supported datasets.
    """
    # Resolve the loader before touching the tokenizer so that a bad name
    # fails fast instead of loading a tokenizer and returning None.
    if 'wikitext2' in name:
        loader = get_wikitext2
    elif 'ptb' in name:
        loader = get_ptb
    elif 'c4' in name:
        loader = get_c4
    else:
        raise ValueError(
            f"Unknown dataset name {name!r}; expected it to contain "
            "'wikitext2', 'ptb' or 'c4'."
        )

    tokenizer = get_tokenizer(model)
    return loader(nsamples, seed, seqlen, model, tokenizer)
