import torch
from torch.utils.data import DataLoader
from transformers import RobertaTokenizer, RobertaForSequenceClassification, AdamW
from datasets import load_dataset
from torch.utils.data import Dataset, DataLoader, Subset
from transformers import (
    GPT2Tokenizer,
    GPT2LMHeadModel,
    AdamW,
    get_linear_schedule_with_warmup,
)
import numpy as np
import pandas as pd
from peft import get_peft_model, LoraConfig, TaskType
import transformers
from typing import Optional, Dict, Sequence, List, Literal
import copy
from dataclasses import dataclass, field

IGNORE_INDEX = -100
PROMPT = (
    "Below is an instruction that describes a task. "
    "Write a response that appropriately completes the request.\n\n"
    "### Instruction:\n{instruction}\n\n### Response:"
)

def load_and_preprocess_data(task, tokenizer, args):

    if "mnli" in task:
        dataset = load_dataset("glue", "mnli")
    else:
        dataset = load_dataset("glue", task)

    def tokenize_function(examples):

        # Handle different input formats
        if "premise" in examples and "hypothesis" in examples:
            # MNLI and similar tasks
            return tokenizer(
                examples["premise"],
                examples["hypothesis"],
                truncation=True,
                padding="max_length",
                max_length=args.max_seq_length,
            )
        elif "question" in examples and "sentence" in examples:
            # QNLI and similar tasks
            return tokenizer(
                examples["question"],
                examples["sentence"],
                truncation=True,
                padding="max_length",
                max_length=args.max_seq_length,
            )
        elif "sentence1" in examples and "sentence2" in examples:
            # MRPC, STS-B
            return tokenizer(
                examples["sentence1"],
                examples["sentence2"],
                truncation=True,
                padding="max_length",
                max_length=args.max_seq_length,
            )
        elif "question1" in examples and "question2" in examples:
            # QQP
            return tokenizer(
                examples["question1"],
                examples["question2"],
                truncation=True,
                padding="max_length",
                max_length=args.max_seq_length,
            )
        elif "sentence" in examples:
            # CoLA, SST-2
            return tokenizer(
                examples["sentence"],
                truncation=True,
                padding="max_length",
                max_length=args.max_seq_length,
            )
        else:
            raise ValueError(f"Unexpected format for task {task}")

    tokenized_datasets = dataset.map(tokenize_function, batched=True)

    if task == "cola":
        tokenized_datasets = tokenized_datasets.remove_columns(["sentence", "idx"])
    elif task == "sst2":
        tokenized_datasets = tokenized_datasets.remove_columns(["sentence", "idx"])
    elif task == "mrpc":
        tokenized_datasets = tokenized_datasets.remove_columns(
            ["sentence1", "sentence2", "idx"]
        )
    elif task == "qqp":
        tokenized_datasets = tokenized_datasets.remove_columns(
            ["question1", "question2", "idx"]
        )
    elif task == "stsb":
        tokenized_datasets = tokenized_datasets.remove_columns(
            ["sentence1", "sentence2", "idx"]
        )
    elif task == "qnli":
        tokenized_datasets = tokenized_datasets.remove_columns(
            ["question", "sentence", "idx"]
        )
    elif task == "rte":
        tokenized_datasets = tokenized_datasets.remove_columns(
            ["sentence1", "sentence2", "idx"]
        )
    elif task == "wnli":
        tokenized_datasets = tokenized_datasets.remove_columns(
            ["sentence1", "sentence2", "idx"]
        )
    elif task == "mnli_matched" or task == "mnli_mismatched" or task == "mnli":
        tokenized_datasets = tokenized_datasets.remove_columns(
            ["premise", "hypothesis", "idx"]
        )
    else:
        raise ValueError(f"Unexpected task {task}")

    tokenized_datasets = tokenized_datasets.rename_column("label", "labels")
    tokenized_datasets.set_format("torch")

    if (
        task == "cola"
        or task == "sst2"
        or task == "mrpc"
        or task == "qqp"
        or task == "stsb"
        or task == "qnli"
        or task == "rte"
        or task == "wnli"
    ):
        train_dataset = tokenized_datasets["train"]
        val_dataset = tokenized_datasets["validation"]
        test_dataset = tokenized_datasets["test"]
    elif task == "mnli_matched":
        train_dataset = tokenized_datasets["train"]
        val_dataset = tokenized_datasets["validation_matched"]
        test_dataset = tokenized_datasets["test_matched"]
    elif task == "mnli_mismatched":
        train_dataset = tokenized_datasets["train"]
        val_dataset = tokenized_datasets["validation_mismatched"]
        test_dataset = tokenized_datasets["test_mismatched"]

    return train_dataset, val_dataset, test_dataset


def create_dataloader(dataset, args, shuffle=True):
    return DataLoader(dataset, batch_size=args.batch_size, shuffle=shuffle)

@dataclass
class DataCollatorForSupervisedDataset(object):
    """Collate examples for supervised fine-tuning."""

    tokenizer: transformers.PreTrainedTokenizer

    def __call__(self, instances: Sequence[Dict]) -> Dict[str, torch.Tensor]:
        input_ids, labels = tuple([instance[key] for instance in instances] for key in ("input_ids", "labels"))
        input_ids = [torch.tensor(x) for x in input_ids]
        input_ids = torch.nn.utils.rnn.pad_sequence(
            input_ids, batch_first=True, padding_value=self.tokenizer.pad_token_id
        )
        labels = [torch.tensor(x) for x in labels]
        labels = torch.nn.utils.rnn.pad_sequence(labels, batch_first=True, padding_value=IGNORE_INDEX)
        return dict(
            input_ids=input_ids,
            labels=labels,
            attention_mask=input_ids.ne(self.tokenizer.pad_token_id),
        )

def _tokenize_fn(strings: Sequence[str], tokenizer: transformers.PreTrainedTokenizer) -> Dict:
    """Tokenize a list of strings."""
    tokenized_list = [
        tokenizer(
            text,
            return_tensors="pt",
            padding="longest",
            max_length=tokenizer.model_max_length,
            truncation=True,
        )
        for text in strings
    ]
    input_ids = labels = [tokenized.input_ids[0] for tokenized in tokenized_list]
    input_ids_lens = labels_lens = [
        tokenized.input_ids.ne(tokenizer.pad_token_id).sum().item() for tokenized in tokenized_list
    ]
    return dict(
        input_ids=input_ids,
        labels=labels,
        input_ids_lens=input_ids_lens,
        labels_lens=labels_lens,
    )
import torch
from torch.nn.utils.rnn import pad_sequence

class DataCollatorForSupervisedDataset_dp:
    def __init__(self, tokenizer):
        self.tokenizer = tokenizer
        # Use the tokenizer’s pad token id (or another padding value if desired)
        self.padding_value = tokenizer.pad_token_id if tokenizer.pad_token_id is not None else 0

    def __call__(self, features):
        # If features is empty, return an empty batch.
        if len(features) == 0:
            return {"input_ids": torch.empty(0, dtype=torch.long),
                    "attention_mask": torch.empty(0, dtype=torch.long)}

        # Process input_ids: filter out any example that might be missing input_ids.
        input_ids_list = []
        for feature in features:
            # If "input_ids" is missing or empty, skip it.
            if "input_ids" not in feature or len(feature["input_ids"]) == 0:
                continue
            # Convert to tensor (if not already)
            if not torch.is_tensor(feature["input_ids"]):
                input_ids_list.append(torch.tensor(feature["input_ids"], dtype=torch.long))
            else:
                input_ids_list.append(feature["input_ids"])

        # If there are no valid input_ids, return empty tensors.
        if len(input_ids_list) == 0:
            padded_input_ids = torch.empty(0, dtype=torch.long)
        else:
            padded_input_ids = pad_sequence(input_ids_list, batch_first=True, padding_value=self.padding_value)

        # Process attention_mask similarly (if available)
        if "attention_mask" in features[0]:
            attention_mask_list = []
            for feature in features:
                if "attention_mask" not in feature or len(feature["attention_mask"]) == 0:
                    continue
                if not torch.is_tensor(feature["attention_mask"]):
                    attention_mask_list.append(torch.tensor(feature["attention_mask"], dtype=torch.long))
                else:
                    attention_mask_list.append(feature["attention_mask"])
            if len(attention_mask_list) == 0:
                padded_attention_mask = torch.empty(0, dtype=torch.long)
            else:
                padded_attention_mask = pad_sequence(attention_mask_list, batch_first=True, padding_value=0)
        else:
            padded_attention_mask = None

        batch = {"input_ids": padded_input_ids}
        if padded_attention_mask is not None:
            batch["attention_mask"] = padded_attention_mask

        # (Optionally, process other fields in a similar way.)
        return batch


def preprocess(
        sources: Sequence[str],
        targets: Sequence[str],
        tokenizer: transformers.PreTrainedTokenizer,
) -> Dict:
    """Preprocess the data by tokenizing."""
    examples = [s + t for s, t in zip(sources, targets)]
    examples_tokenized, sources_tokenized = [_tokenize_fn(strings, tokenizer) for strings in (examples, sources)]
    input_ids = examples_tokenized["input_ids"]
    labels = copy.deepcopy(input_ids)
    for label, source_len in zip(labels, sources_tokenized["input_ids_lens"]):
        label[:source_len] = IGNORE_INDEX
    return dict(input_ids=input_ids, labels=labels)

def train_tokenize_function(examples, tokenizer, query, response):
    sources = [PROMPT.format_map(dict(instruction=instruction)) for instruction in examples[query]]
    targets = [f"{output}{tokenizer.eos_token}" for output in examples[response]]
    data_dict = preprocess(sources, targets, tokenizer)
    return data_dict

def load_and_preprocess_it(tokenizer, args):

    raw_train_datasets = load_dataset(
        args.data_path, 
        split=args.dataset_split)

    train_dataset = raw_train_datasets.map(
        train_tokenize_function,
        batched=True,
        batch_size=3000,
        num_proc=32,
        remove_columns=raw_train_datasets.column_names,
        load_from_cache_file=True,
        desc="Running tokenizer on train dataset",
        fn_kwargs={
            "tokenizer": tokenizer, 
            "query": args.dataset_field[0],
            "response": args.dataset_field[1]}
    )

    return train_dataset


def load_and_preprocess_cr(tokenizer, args):
    """Load and preprocess the dataset."""
    if args.data_path.endswith(".json"):
        data = load_dataset("json", data_files=args.data_path)
    else:
        data = load_dataset(args.data_path)

    # Create a wrapper function that includes all necessary arguments
    def generate_and_tokenize_prompt_wrapper(data_point):
        return generate_and_tokenize_prompt_cr(data_point, tokenizer, args)

    train_dataset = data["train"].shuffle().map(
        generate_and_tokenize_prompt_wrapper,
        num_proc=8,
        remove_columns=data["train"].column_names  # Remove original columns
    )
    
    return train_dataset
def load_and_preprocess_it_dp(tokenizer, args):

    # Load raw dataset
    raw_train_datasets = load_dataset(
        args.data_path, 
        split=args.dataset_split)

    # Tokenize the dataset.
    train_dataset = raw_train_datasets.map(
        train_tokenize_function,
        batched=True,
        batch_size=3000,
        num_proc=32,
        remove_columns=raw_train_datasets.column_names,
        load_from_cache_file=True,
        desc="Running tokenizer on train dataset",
        fn_kwargs={
            "tokenizer": tokenizer, 
            "query": args.dataset_field[0],
            "response": args.dataset_field[1]
        }
    )

    # Convert token lists to PyTorch tensors.
    # Adjust the keys if your tokenization returns different field names.
    def convert_to_tensor(example):
        if "input_ids" in example:
            example["input_ids"] = torch.tensor(example["input_ids"], dtype=torch.long)
        if "attention_mask" in example:
            example["attention_mask"] = torch.tensor(example["attention_mask"], dtype=torch.long)
        # Add conversion for any other fields as needed.
        return example

    train_dataset = train_dataset.map(
        convert_to_tensor,
        batched=False,  # Process each example individually.
        desc="Converting token lists to tensors"
    )

    return train_dataset
def generate_prompt_cr(data_point):
    """Generate prompt from data point."""
    if data_point.get("input", ""):  # Using get() with default empty string
        return f"""Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

            ### Instruction:
            {data_point["instruction"]}

            ### Input:
            {data_point["input"]}

            ### Response:
            {data_point["output"]}"""
    else:
        return f"""Below is an instruction that describes a task. Write a response that appropriately completes the request.

            ### Instruction:
            {data_point["instruction"]}

            ### Response:
            {data_point["output"]}"""

def tokenize_cr(prompt, tokenizer, args, add_eos_token=True):
    """Tokenize the prompt."""
    result = tokenizer(
        prompt,
        truncation=True,
        max_length=args.max_seq_length,
        padding=False,
        return_tensors=None,
    )

    if (
        result["input_ids"][-1] != tokenizer.eos_token_id
        and len(result["input_ids"]) < args.max_seq_length
        and add_eos_token
    ):
        result["input_ids"].append(tokenizer.eos_token_id)
        if "chatglm" not in args.model:
            result["attention_mask"].append(1)

    result["labels"] = result["input_ids"].copy()

    if "chatglm" in args.model:
        return {"input_ids": result["input_ids"], "labels": result["labels"]}
    else:
        return result

def generate_and_tokenize_prompt_cr(data_point, tokenizer, args):
    """Generate and tokenize prompt with proper labels."""
    full_prompt = generate_prompt_cr(data_point)
    tokenized_full_prompt = tokenize_cr(full_prompt, tokenizer, args)
    
    if not args.train_on_inputs:
        # Create a user prompt without the response
        user_prompt = generate_prompt_cr({
            "instruction": data_point["instruction"],
            "input": data_point.get("input", ""),
            "output": ""
        })
        tokenized_user_prompt = tokenize_cr(
            user_prompt, 
            tokenizer, 
            args, 
            add_eos_token=False
        )
        user_prompt_len = len(tokenized_user_prompt["input_ids"])

        # Replace labels with -100 for non-response tokens
        tokenized_full_prompt["labels"] = (
            [-100] * user_prompt_len + 
            tokenized_full_prompt["labels"][user_prompt_len:]
        )

    return tokenized_full_prompt