import numpy as np
import argparse
import json
import logging
import math
import os
import random
from pathlib import Path

import pandas as pd
import datasets
import torch
from datasets import load_dataset
from torch.utils.data import DataLoader
from tqdm.auto import tqdm

import evaluate
import transformers
from accelerate import Accelerator
from accelerate.logging import get_logger
from accelerate.utils import set_seed
from huggingface_hub import Repository
from transformers import (
    AutoConfig,
    AutoModelForSequenceClassification,
    AutoTokenizer,
    DataCollatorWithPadding,
    PretrainedConfig,
    SchedulerType,
    default_data_collator,
    get_scheduler,
)
from transformers.utils import check_min_version, get_full_repo_name, send_example_telemetry
from transformers.utils.versions import require_version

from bert2 import Bert
import time


# Dependency guards: both calls raise when the installed package is older than the
# version named here. Remove them at your own risk.
check_min_version("4.26.0.dev0")
require_version(
    "datasets>=1.8.0",
    "To fix: pip install -r examples/pytorch/text-classification/requirements.txt",
)

# Module-level logger obtained through accelerate's `get_logger`.
logger = get_logger(__name__)

# Wall-clock reference point; the elapsed time since `start` is printed after each model.
start = time.time()

# Per-dimension lookup table keyed by model dimension (16-128).
# NOTE(review): not referenced anywhere in the visible part of this script — confirm
# whether another module still needs it.
nmodels = {
    128: [219, 59, 15, 4],
    64: [59, 15, 4],
    32: [15, 4],
    16: [4],
}

# Highest model id available for each model dimension; used as the upper bound of the
# model-id loop at the bottom of the script.
total_models = {
    8: 468,
    16: 266,
    32: 150,
    64: 90,
    128: 70,
}


def parse_args():
    """Parse the command-line options and echo every parsed value to stdout.

    Returns:
        argparse.Namespace: carries ``task_name``, ``model_name_or_path``,
        ``n_layer``, ``n_head``, ``d_model``, ``model_random`` and
        ``ignore_mismatched_sizes``. ``model_random`` is a *string* option
        (default ``"False"``), not a boolean flag.
    """
    parser = argparse.ArgumentParser(description="Random model on a text classification task")

    # Task / checkpoint selection.
    parser.add_argument(
        "--task_name", type=str, default="mnli", help="The name of the glue task to train on."
    )
    parser.add_argument(
        "--model_name_or_path",
        type=str,
        default="bert-base-cased",
        help="Path to pretrained model or model identifier from huggingface.co/models.",
    )

    # Model geometry.
    parser.add_argument("--n_layer", type=int, default=4, help="number of total layers")
    parser.add_argument("--n_head", type=int, default=4, help="number of heads")
    parser.add_argument("--d_model", type=int, default=128, help="model dimension")

    # Kept as a string so existing invocations (`--model_random True`) keep working.
    parser.add_argument("--model_random", default="False", type=str, help="Model Random")
    parser.add_argument(
        "--ignore_mismatched_sizes",
        action="store_true",
        help="Whether or not to enable to load a pretrained model whose head dimensions are different.",
    )

    args = parser.parse_args()

    # Echo the effective configuration, one "name  :  value" line per option.
    for name, value in vars(args).items():
        print(name, " : ", value)

    return args



# Create the accelerator up front and let it handle device placement for everything that
# is later passed through `accelerator.prepare`.
accelerator = Accelerator()

# Logging / verbosity configuration is currently disabled:
#logging.basicConfig(
#    format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
#    datefmt="%m/%d/%Y %H:%M:%S",
#    level=logging.INFO,
#)

#logger.info(accelerator.state, main_process_only=False)

#if accelerator.is_local_main_process:
#    datasets.utils.logging.set_verbosity_warning()
#    transformers.utils.logging.set_verbosity_info()
#else:
#    datasets.utils.logging.set_verbosity_error()
#    transformers.utils.logging.set_verbosity_error()

# Fixed seed (0) for the whole run, applied through accelerate and then torch directly.
set_seed(0)
torch.manual_seed(0)

# GLUE / MNLI is hard-coded here.
raw_datasets = load_dataset("glue", "mnli")

# The label names come from the training split's feature metadata.
label_list = raw_datasets["train"].features["label"].names
num_labels = len(label_list)

# Config and tokenizer of the pretrained checkpoint.
# In distributed training, the .from_pretrained methods guarantee that only one local
# process can concurrently download model & vocab.
config = AutoConfig.from_pretrained("bert-base-cased", num_labels=num_labels, finetuning_task="mnli")
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased", use_fast=True)

# Dataset columns holding the two sentences of each example.
sentence1_key = "premise"
sentence2_key = "hypothesis"

# No label remapping: the dataset's own label ids are used unchanged.
label_to_id = None

# Padding is not applied at tokenization time; the data collator pads each batch instead.
padding = False


def preprocess_function(examples):
    """Tokenize a batch of examples and attach its labels under the ``labels`` key.

    Reads the module-level ``tokenizer``, ``sentence1_key``, ``sentence2_key``,
    ``padding`` and ``label_to_id``.

    Args:
        examples: mapping from column name to a list of values, one entry per example.

    Returns:
        The tokenizer output (inputs truncated to at most 128 tokens), with a
        ``labels`` entry added whenever ``examples`` has a ``label`` column.
    """
    # Single-sentence tasks pass one text column, sentence-pair tasks pass two.
    if sentence2_key is None:
        texts = (examples[sentence1_key],)
    else:
        texts = (examples[sentence1_key], examples[sentence2_key])

    result = tokenizer(*texts, padding=padding, max_length=128, truncation=True)

    # Unlabeled batches: nothing more to add.
    if "label" not in examples:
        return result

    if label_to_id is None:
        # The column is renamed to `labels` because the model will expect that name.
        result["labels"] = examples["label"]
    else:
        # Translate the dataset labels to ids (not necessary for GLUE tasks).
        result["labels"] = [label_to_id[label] for label in examples["label"]]
    return result

# Tokenize every split inside `main_process_first`; the raw text columns are dropped so
# only the tokenizer outputs and `labels` remain.
with accelerator.main_process_first():
    processed_datasets = raw_datasets.map(
        preprocess_function,
        batched=True,
        remove_columns=raw_datasets["train"].column_names,
        desc="Running tokenizer on dataset",
    )

train_dataset = processed_datasets["train"]
# Optional 50k-example random subsample of the training set (disabled):
#random.seed(0)
#indices=random.sample(range(len(train_dataset)), 50000)
#train_dataset = train_dataset.select(indices)
#print("randomly sampled indices: {v}".format(v=indices[:10]))

# Features are evaluated on the matched validation split.
eval_dataset = processed_datasets["validation_matched"]

# Log a few random samples from the training set:
for index in random.sample(range(len(train_dataset)), 3):
    logger.info(f"Sample {index} of the training set: {train_dataset[index]}.")

# DataLoaders creation:
# Tokenization above does not pad, so `DataCollatorWithPadding` applies dynamic padding
# (padding to the maximum length of the samples in each batch). When using mixed precision,
# we add `pad_to_multiple_of=8` to pad all tensors to multiples of 8, which enables the use
# of Tensor Cores on NVIDIA hardware with compute capability >= 7.5 (Volta).
#
# `Accelerator.use_fp16` is deprecated and no longer available in newer accelerate releases;
# `mixed_precision` is the supported way to ask the same question ("no" means full precision).
pad_to_multiple_of = 8 if accelerator.mixed_precision != "no" else None
data_collator = DataCollatorWithPadding(tokenizer, pad_to_multiple_of=pad_to_multiple_of)

# Shuffling is turned off for both loaders so the extracted rows follow dataset order.
train_dataloader = DataLoader(
    train_dataset, shuffle=False, collate_fn=data_collator, batch_size=32
)
eval_dataloader = DataLoader(eval_dataset, shuffle=False, collate_fn=data_collator, batch_size=32)


# Prepare everything with our `accelerator`.
train_dataloader, eval_dataloader = accelerator.prepare(train_dataloader, eval_dataloader)


args = parse_args()

# Fail with a readable message instead of a bare KeyError from the `total_models` lookup below.
if args.d_model not in total_models:
    raise ValueError(
        f"--d_model must be one of {sorted(total_models)}, got {args.d_model}"
    )

# `--model_random` is a string option; only the literal "True" selects a freshly
# initialised model, anything else loads a saved one.
use_random_model = args.model_random == "True"

# Configure root logging once; repeating the call on every iteration has no further effect.
logging.basicConfig(level=logging.INFO)


def _extract_features(model, dataloader):
    """Run ``model.get_features`` over ``dataloader`` and collect the results as NumPy arrays.

    The model is put in eval mode and gradients are disabled. Each batch's features are
    moved to the CPU right away so they do not pile up on the accelerator device.

    Args:
        model: object exposing ``eval()`` and ``get_features(batch)``.
        dataloader: iterable of batches; every batch must contain a ``"labels"`` tensor.

    Returns:
        Tuple ``(features, labels)`` where ``features`` is the concatenation of all
        per-batch feature tensors along dim 0 and ``labels`` is an ``(N, 1)`` column.
        NOTE(review): the caller concatenates both along axis 1, so ``get_features`` is
        assumed to return a 2-D (batch, hidden) tensor — confirm against `bert2.Bert`.
    """
    feature_chunks = []
    label_chunks = []
    model.eval()
    with torch.no_grad():
        for batch in dataloader:
            feature_chunks.append(model.get_features(batch).detach().cpu())
            label_chunks.append(batch["labels"].cpu())
    features = torch.cat(feature_chunks, dim=0).numpy()
    labels = torch.cat(label_chunks, dim=0).numpy().reshape(-1, 1)
    return features, labels


for dmodel_size in [args.d_model]:
    # Model ids 1-10 and 31..total_models[dmodel_size] are processed.
    # NOTE(review): ids 11-30 are skipped here — confirm that this is intentional.
    # (For a quick single-model debug run, substitute e.g. `range(63, 64)`.)
    model_ids = [*range(1, 11), *range(31, total_models[dmodel_size] + 1)]
    for se in model_ids:

        # Load the model.
        if use_random_model:
            # Seed with the model id so every random model is reproducible but distinct.
            torch.manual_seed(se)
            # NOTE(review): layer and head counts are hard-coded to 4 in this call even
            # though `--n_layer` / `--n_head` exist; `args` is also passed through, so
            # confirm which of the two `Bert` actually honours.
            model = Bert(
                args=args,
                num_labels=3,
                task_name="mnli",
                num_hidden_layers=4,
                hidden_size=dmodel_size,
                num_attention_heads=4,
                dropout=0.1,
            )
            model = accelerator.prepare(model)
            print("Random model generated.")
        else:
            path = f"saved_models/{dmodel_size}/result_trained_{se}"
            # SECURITY: `torch.load` unpickles arbitrary Python objects — only point this
            # at checkpoints you trust.
            # NOTE(review): the loaded model is not passed through `accelerator.prepare`,
            # so it stays on whatever device it was saved from — confirm that this
            # matches the device of the prepared dataloaders.
            model = torch.load(path)
            logger.info(path)

        # Training split first, then the evaluation split (same order as the CSVs below).
        X_training, target_training = _extract_features(model, train_dataloader)
        X_test, target_test = _extract_features(model, eval_dataloader)

        logging.info(np.shape(X_test))
        logging.info(np.shape(X_training))
        logging.info(np.shape(target_test))
        logging.info(np.shape(target_training))

        # CSV layout: column 0 is the label, the remaining columns are the features.
        training_data = np.concatenate((target_training, X_training), axis=1)
        test_data = np.concatenate((target_test, X_test), axis=1)
        print(training_data)

        # Output directory: <cwd>/features/<d_model>/<model id>
        curr_dir = os.path.join(os.getcwd(), "features", str(dmodel_size), str(se))
        # `exist_ok=True` avoids the check-then-create race of `exists()` + `makedirs()`.
        os.makedirs(curr_dir, exist_ok=True)

        # Random models write random_training.csv / random_test.csv,
        # trained models write training.csv / test.csv.
        prefix = "random_" if use_random_model else ""
        training_csv = os.path.join(curr_dir, prefix + "training.csv")
        test_csv = os.path.join(curr_dir, prefix + "test.csv")

        pd.DataFrame(training_data).to_csv(training_csv, header=False, index=False)
        pd.DataFrame(test_data).to_csv(test_csv, header=False, index=False)

        # Read the test file back and print it as a sanity check of what was written.
        df = pd.read_csv(test_csv, header=None, index_col=False)
        print(df.to_numpy())

        print("TIME:::::::::::::")
        print(time.time() - start)
        print("::::::::::::::::::::::")

   
    
    

       


