import torch
from datasets import load_dataset
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments

def preprocess_data(examples, tokenizer, max_length=1024):
    """Tokenize a batch of QA examples for causal language-model fine-tuning.

    Every example is turned into a single sequence
    ``"Context: <context> Question: <question>" + " <answer>" + EOS`` and the
    labels are a position-aligned copy of that sequence in which the prompt
    and the padding are replaced by ``-100`` (the value the loss ignores).
    Only the answer tokens and the closing EOS therefore contribute to the
    loss.

    Args:
        examples: Batch mapping with parallel ``'context'``, ``'question'``
            and ``'answer'`` lists.
        tokenizer: Callable that maps a list of strings to a mapping holding
            ``"input_ids"``; it must expose ``eos_token_id`` and
            ``pad_token_id`` attributes (either may be ``None``).
        max_length: Exact length every returned sequence is truncated or
            right-padded to.

    Returns:
        A dict with ``"input_ids"``, ``"attention_mask"`` and ``"labels"``;
        each value is a list with one ``max_length``-long list of ints per
        example.

    Raises:
        ValueError: If ``max_length`` is smaller than 1, or if the tokenizer
            has neither a pad token id nor an EOS token id to pad with.
    """
    if max_length < 1:
        raise ValueError("max_length must be at least 1")

    # NOTE(review): if `context` is a nested structure rather than a plain
    # string, the f-string embeds its repr — confirm against the dataset schema.
    prompts = [
        f"Context: {context} Question: {question}"
        for context, question in zip(examples['context'], examples['question'])
    ]
    # The leading space separates the answer from the last word of the question.
    answers = [f" {answer}" for answer in examples['answer']]

    eos_id = tokenizer.eos_token_id
    pad_id = tokenizer.pad_token_id
    if pad_id is None:
        pad_id = eos_id
    if pad_id is None:
        raise ValueError("tokenizer needs a pad token or an EOS token for padding")
    eos_tail = [] if eos_id is None else [eos_id]

    # Padding is done by hand below because the prompt and the answer have to
    # share one fixed-length window.
    prompt_batch = tokenizer(prompts, max_length=max_length, truncation=True)["input_ids"]
    answer_batch = tokenizer(
        answers, max_length=max_length, truncation=True, add_special_tokens=False
    )["input_ids"]

    input_ids, attention_mask, labels = [], [], []
    for prompt_ids, answer_ids in zip(prompt_batch, answer_batch):
        # Reserve room for the answer first, and always keep its EOS.
        target = list(answer_ids)[:max_length - len(eos_tail)] + eos_tail
        # NOTE(review): the prompt is cut on the right, so a very long context
        # can push the question out of the window.
        prompt_ids = list(prompt_ids)[:max_length - len(target)]
        used = len(prompt_ids) + len(target)
        n_pad = max_length - used

        input_ids.append(prompt_ids + target + [pad_id] * n_pad)
        attention_mask.append([1] * used + [0] * n_pad)
        # Mask by position, not by token value: when the pad token is aliased
        # to EOS a value-based mask would also hide the real end-of-answer EOS.
        labels.append([-100] * len(prompt_ids) + target + [-100] * n_pad)

    return {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "labels": labels,
    }

def get_qa_trainer(model, tokenizer):
    """Build a ``Trainer`` for fine-tuning ``model`` on a slice of HotpotQA.

    Loads the ``"distractor"`` configuration of ``hotpot_qa``, splits its
    ``"train"`` split 90/10 and keeps only the 10% portion as training data.
    No seed is passed to the split here. The slice is tokenized with
    ``preprocess_data`` before being handed to the ``Trainer``.

    Side effect: when ``tokenizer`` has no pad token, its ``pad_token`` is set
    to its ``eos_token`` in place.

    Args:
        model: Model passed straight through to ``Trainer``.
        tokenizer: Tokenizer forwarded to ``preprocess_data``.

    Returns:
        A configured ``Trainer``. Training is NOT started here; the caller is
        expected to invoke ``.train()`` on the result.
    """
    hotpot = load_dataset("hotpot_qa", "distractor")

    # Padding needs a pad token; fall back to EOS when none is defined.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    # The "test" side of a 90/10 split is used as a cheap 1/10th subsample.
    subset = dataset_tenth = hotpot["train"].train_test_split(test_size=0.1)["test"]

    encoded = dataset_tenth.map(
        preprocess_data,
        batched=True,
        fn_kwargs={"tokenizer": tokenizer},
    )

    config = TrainingArguments(
        output_dir='./results',
        per_device_train_batch_size=1,
    )

    return Trainer(model=model, args=config, train_dataset=encoded)
