#########################################################
######### Script for TCR-T5 Model No Pretrain ###########
#########################################################

from datasets import load_dataset, Dataset
from transformers import T5Config, T5ForConditionalGeneration, T5Tokenizer, Trainer, TrainingArguments 
from accelerate import Accelerator
import wandb
import os

### Set up logging via Weights & Biases
#wandb.init(offline=True)

def _load_parallel_split(source_file, target_file):
    """Load one line-aligned source/target file pair and merge it into a Dataset.

    Args:
        source_file: Path to the text file holding the source sequences.
        target_file: Path to the text file holding the target sequences.

    Returns:
        A ``(source, target, merged)`` tuple: the two objects returned by
        ``load_dataset`` and a ``Dataset`` with ``src_texts`` / ``tgt_texts``
        columns built from their ``'text'`` fields.

    Raises:
        ValueError: If the two files do not yield the same number of examples.
    """
    source = load_dataset('text', data_files=source_file)
    target = load_dataset('text', data_files=target_file)

    # The loaded examples are read from the 'train' key for every file,
    # including the validation files.
    n_source = len(source['train'])
    n_target = len(target['train'])

    # Explicit exception instead of `assert`: asserts are stripped under
    # `python -O`, and a misaligned pair must never reach training.
    if n_source != n_target:
        raise ValueError(
            f"Source/target size mismatch: {source_file} has {n_source} "
            f"examples but {target_file} has {n_target}."
        )

    merged = Dataset.from_dict({
        'src_texts': [example['text'] for example in source['train']],
        'tgt_texts': [example['text'] for example in target['train']],
    })
    return source, target, merged


# Load and merge the training source/target files
source_dataset, target_dataset, dataset = _load_parallel_split(
    '../data/pmhc_stringent_split/train_source.txt',
    '../data/pmhc_stringent_split/train_target.txt',
)

# Load and merge the validation source/target files
val_source, val_target, val_dataset = _load_parallel_split(
    '../data/pmhc_stringent_split/val_source.txt',
    '../data/pmhc_stringent_split/val_target.txt',
)


# Special-token strings handed to the tokenizer, keyed by their keyword argument.
_special_tokens = {
    'bos_token': '[SOS]',
    'eos_token': '[EOS]',
    'sep_token': '[SEP]',
    'cls_token': '[CLS]',
    'unk_token': '[UNK]',
    'pad_token': '[PAD]',
    'mask_token': '[MASK]',
}

# Build the tokenizer from the local vocabulary model file.
tokenizer = T5Tokenizer(vocab_file='prott5_tokenizer.model', **_special_tokens)


def labeled_tokenize_function(example, src_max_len=51, trg_max_len=23):
    """Tokenize a batch of source/target texts into model inputs and labels.

    Each source text is split on its first space; the two parts (called
    ``pep`` and ``pseudo`` in the original code) are joined with the
    tokenizer's separator token and tokenized as a single sequence. Target
    texts are tokenized as-is and returned as ``labels``.

    Args:
        example: Batch mapping with ``"src_texts"`` and ``"tgt_texts"`` lists
            of strings (as produced by ``Dataset.map(..., batched=True)``).
        src_max_len: Length every source sequence is padded/truncated to.
        trg_max_len: Length every target sequence is padded/truncated to.

    Returns:
        Dict with ``input_ids`` and ``attention_mask`` for the source and
        ``labels`` for the target. Padded label positions are set to -100 so
        that the loss ignores them.
    """
    # Split on the first space only; a text without a space gets an empty
    # second part, so the sequence still ends with the separator token.
    sequences = []
    for text in example["src_texts"]:
        first_part, _, second_part = text.partition(" ")
        sequences.append(f'{first_part}{tokenizer.sep_token}{second_part}')

    # padding="max_length" together with truncation=True already yields
    # tensors that are exactly max_length wide, so no extra slicing is needed.
    source_tokens = tokenizer(
        sequences,
        padding="max_length",
        return_tensors='pt',
        truncation=True,
        max_length=src_max_len,
    )
    target_tokens = tokenizer(
        example['tgt_texts'],
        padding="max_length",
        return_tensors='pt',
        truncation=True,
        max_length=trg_max_len,
    )

    # Mask padded target positions with -100 (the ignore index of the loss)
    # so padding does not contribute to the training signal. The attention
    # mask is used rather than the pad id so only true padding is masked.
    labels = target_tokens["input_ids"].masked_fill(
        target_tokens["attention_mask"] == 0, -100
    )

    return {
        "input_ids": source_tokens["input_ids"],
        "attention_mask": source_tokens["attention_mask"],
        "labels": labels,
    }

# Columns exposed as torch tensors once tokenization is done.
_model_columns = ['input_ids', 'attention_mask', 'labels']

# Tokenize the training and validation sets in batches.
tokenized_dataset = dataset.map(labeled_tokenize_function, batched=True)
tokenized_val = val_dataset.map(labeled_tokenize_function, batched=True)

# Switch both tokenized sets to torch output, restricted to the model columns.
for _tokenized_split in (tokenized_dataset, tokenized_val):
    _tokenized_split.set_format("torch", columns=_model_columns)


### Instantiating the Model and Trainer Classes

# Model configuration for a T5 model built from scratch.
# NOTE(review): vocab_size and the special-token ids are hard-coded here;
# confirm they match the ids assigned by the tokenizer loaded from
# 'prott5_tokenizer.model'.
config = T5Config(
    vocab_size=128,
    max_position_embeddings=512,
    d_model=768,
    pad_token_id=0,
    bos_token_id=1,
    eos_token_id=2,
    sep_token_id=4,
    decoder_start_token_id=1,
    # T5Config names its depth parameters `num_layers` (encoder) and
    # `num_decoder_layers`. `encoder_layers` / `decoder_layers` are not
    # T5Config parameters and were only stored as unused extra attributes,
    # so the requested depth of 6 is now set through the real parameters.
    num_layers=6,
    num_decoder_layers=6,
    output_hidden_states=True,
    output_scores=True,
    output_attentions=True,
    add_cross_attention=True,
    top_k=3
)

# Initialize the model from the config alone (no pretrained weights are loaded)
model = T5ForConditionalGeneration(config)

# Training arguments: checkpoints are saved every 1000 steps (at most 10 kept),
# evaluation and logging both run every 100 steps.
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` -- confirm against the installed version before changing it.
training_args = TrainingArguments(
    output_dir="../model_checkpoints/no_pretraining_t5_model_high_lr",
    overwrite_output_dir=True,
    num_train_epochs=100,
    per_device_train_batch_size=128,
    save_steps=1000,
    do_eval=True,
    evaluation_strategy='steps',
    eval_steps=100,
    learning_rate=3e-04,
    logging_steps=100,
    save_total_limit=10,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    eval_dataset=tokenized_val,
    tokenizer=tokenizer,
)

# The Trainer is deliberately not passed through `Accelerator.prepare`:
# `prepare` only wraps models, optimizers, dataloaders and schedulers and
# hands any other object back unchanged, and Trainer already sets up
# device placement / distributed execution through Accelerate itself.
trainer.train()

# Close the Weights & Biases run once training has completed.
wandb.finish()
