# -*- coding: utf-8 -*-
"""
Created on Mon Sep 23 19:27:55 2024
"""

import logging
import time

import numpy as np
import matplotlib.pyplot as plt

import tensorflow_datasets as tfds
import tensorflow as tf

import tensorflow_text

#%% Fix random seed
import random
seed = 0
tf.random.set_seed(seed)
random.seed(seed)
np.random.seed(seed)

#%% Data handling
# Hebrew (he) to Portuguese (pt)
examples, metadata = tfds.load('ted_hrlr_translate/he_to_pt',
                               with_info=True,
                               as_supervised=True)

train_examples, val_examples = examples['train'], examples['validation']

for he_examples, pt_examples in train_examples.batch(3).take(1):
  print('> Examples in Hebrew:')
  for he in he_examples.numpy():
    print(he.decode('utf-8'))
  print()

  print('> Examples in Portuguese:')
  for pt in pt_examples.numpy():
    print(pt.decode('utf-8'))
    
# model_name = 'ted_hrlr_translate_he_pt_converter'
# tf.keras.utils.get_file(
#     f'{model_name}.zip',
#     f'https://storage.googleapis.com/download.tensorflow.org/models/{model_name}.zip',
#     cache_dir='.', cache_subdir='', extract=True
# )

# tokenizers = tf.saved_model.load(model_name)

model_name = 'ted_hrlr_translate_he_pt_converter'
tokenizers = tf.saved_model.load(model_name)

[item for item in dir(tokenizers.pt) if not item.startswith('_')]

print('> This is a batch of strings:')
for pt in pt_examples.numpy():
  print(pt.decode('utf-8'))
  
encoded = tokenizers.pt.tokenize(pt_examples)

print('> This is a padded-batch of token IDs:')
for row in encoded.to_list():
  print(row)
  
round_trip = tokenizers.pt.detokenize(encoded)

print('> This is human-readable text:')
for line in round_trip.numpy():
  print(line.decode('utf-8'))
  
print('> This is the text split into tokens:')
tokens = tokenizers.pt.lookup(encoded)
tokens

lengths = []

for he_examples, pt_examples in train_examples.batch(1024):
  he_tokens = tokenizers.he.tokenize(he_examples)
  lengths.append(he_tokens.row_lengths())

  pt_tokens = tokenizers.pt.tokenize(pt_examples)
  lengths.append(pt_tokens.row_lengths())
  print('.', end='', flush=True)
  
all_lengths = np.concatenate(lengths)

plt.hist(all_lengths, np.linspace(0, 500, 101))
plt.ylim(plt.ylim())
max_length = max(all_lengths)
plt.plot([max_length, max_length], plt.ylim())
plt.title(f'Maximum tokens per example: {max_length}');
plt.savefig('MaxToken.jpg')
plt.show()

MAX_TOKENS=128
def prepare_batch(he, pt):
    he = tokenizers.he.tokenize(he)      # Output is ragged.
    he = he[:, :MAX_TOKENS]    # Trim to MAX_TOKENS.
    he = he.to_tensor()  # Convert to 0-padded dense Tensor

    pt = tokenizers.pt.tokenize(pt)
    pt = pt[:, :(MAX_TOKENS+1)]
    pt_inputs = pt[:, :-1].to_tensor()  # Drop the [ptD] tokens
    pt_labels = pt[:, 1:].to_tensor()   # Drop the [START] tokens

    return (he, pt_inputs), pt_labels
    
BUFFER_SIZE = 20000
BATCH_SIZE = 64

def make_batches(ds):
  return (
      ds
      .shuffle(BUFFER_SIZE)
      .batch(BATCH_SIZE)
      .map(prepare_batch, tf.data.AUTOTUNE)
      .prefetch(buffer_size=tf.data.AUTOTUNE))
      
#%% Test the Dataset

# Create training and validation set batches.
train_batches = make_batches(train_examples)
val_batches = make_batches(val_examples)

for (he, pt), pt_labels in train_batches.take(1):
  break

print(he.shape)
print(pt.shape)
print(pt_labels.shape)

print(pt[0][:10])
print(pt_labels[0][:10])

#%% Define the components

def positional_encoding(length, deheh):
  deheh = deheh/2

  positions = np.arange(length)[:, np.newaxis]     # (seq, 1)
  dehehs = np.arange(deheh)[np.newaxis, :]/deheh   # (1, deheh)

  angle_rates = 1 / (10000**dehehs)         # (1, deheh)
  angle_rads = positions * angle_rates      # (pos, deheh)

  pos_encoding = np.concatenate(
      [np.sin(angle_rads), np.cos(angle_rads)],
      axis=-1) # I need to change here

  return tf.cast(pos_encoding, dtype=tf.float32)
  
pos_encoding = positional_encoding(length=2048, deheh=512)

# Check the shape.
print(pos_encoding.shape)

# Plot the dimensions.
plt.pcolormesh(pos_encoding.numpy().T, cmap='RdBu')
plt.ylabel('Deheh')
plt.xlabel('Position')
plt.colorbar()
plt.savefig('PosEnc.jpg')
plt.show()

pos_encoding/=tf.norm(pos_encoding, axis=1, keepdims=True)
p = pos_encoding[1000]
dots = tf.einsum('pd,d -> p', pos_encoding, p)
plt.subplot(2,1,1)
plt.plot(dots)
plt.ylim([0,1])
plt.plot([950, 950, float('nan'), 1050, 1050],
         [0,1,float('nan'),0,1], color='k', label='Zoom')
plt.legend()
plt.subplot(2,1,2)
plt.plot(dots)
plt.xlim([950, 1050])
plt.ylim([0,1])
plt.savefig('EncVec.jpg')
plt.show()

class PositionalEmbedding(tf.keras.layers.Layer):
  def __init__(self, vocab_size, d_model):
    super().__init__()
    self.d_model = d_model
    self.embedding = tf.keras.layers.Embedding(vocab_size, d_model, mask_zero=True) 
    self.pos_encoding = positional_encoding(length=2048, deheh=d_model)

  def compute_mask(self, *args, **kwargs):
    return self.embedding.compute_mask(*args, **kwargs)

  def call(self, x):
    length = tf.shape(x)[1]
    x = self.embedding(x)
    # This factor sets the relative scale of the embedding and positonal_encoding.
    x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
    x = x + self.pos_encoding[tf.newaxis, :length, :]
    return x
    
embed_he = PositionalEmbedding(vocab_size=tokenizers.he.get_vocab_size().numpy(), d_model=512)
embed_pt = PositionalEmbedding(vocab_size=tokenizers.pt.get_vocab_size().numpy(), d_model=512)

he_emb = embed_he(he)
pt_emb = embed_pt(pt)

pt_emb._keras_mask

class BaseAttention(tf.keras.layers.Layer):
  def __init__(self, **kwargs):
    super().__init__()
    self.mha = tf.keras.layers.MultiHeadAttention(**kwargs)
    self.layernorm = tf.keras.layers.LayerNormalization()
    self.add = tf.keras.layers.Add()
    
class CrossAttention(BaseAttention):
  def call(self, x, context):
    attn_output, attn_scores = self.mha(
        query=x,
        key=context,
        value=context,
        return_attention_scores=True)

    # Cache the attention scores for plotting later.
    self.last_attn_scores = attn_scores

    x = self.add([x, attn_output])
    x = self.layernorm(x)

    return x
    
sample_ca = CrossAttention(num_heads=2, key_dim=512)

print(he_emb.shape)
print(pt_emb.shape)
print(sample_ca(pt_emb, he_emb).shape)

class GlobalSelfAttention(BaseAttention):
  def call(self, x):
    attn_output = self.mha(
        query=x,
        value=x,
        key=x)
    x = self.add([x, attn_output])
    x = self.layernorm(x)
    return x
    
sample_gsa = GlobalSelfAttention(num_heads=2, key_dim=512)

print(he_emb.shape)
print(sample_gsa(he_emb).shape)

class CausalSelfAttention(BaseAttention):
  def call(self, x):
    attn_output = self.mha(
        query=x,
        value=x,
        key=x,
        use_causal_mask = True)
    x = self.add([x, attn_output])
    x = self.layernorm(x)
    return x
    
sample_csa = CausalSelfAttention(num_heads=2, key_dim=512)

print(pt_emb.shape)
print(sample_csa(pt_emb).shape)

out1 = sample_csa(embed_pt(pt[:, :3])) 
out2 = sample_csa(embed_pt(pt))[:, :3]

tf.reduce_max(abs(out1 - out2)).numpy()

class FeedForward(tf.keras.layers.Layer):
  def __init__(self, d_model, dff, dropout_rate=0.1):
    super().__init__()
    self.seq = tf.keras.Sequential([
      tf.keras.layers.Dense(dff, activation='relu'),
      tf.keras.layers.Dense(d_model),
      tf.keras.layers.Dropout(dropout_rate)
    ])
    self.add = tf.keras.layers.Add()
    self.layer_norm = tf.keras.layers.LayerNormalization()

  def call(self, x):
    x = self.add([x, self.seq(x)])
    x = self.layer_norm(x) 
    return x
    
sample_ffn = FeedForward(512, 2048)

print(pt_emb.shape)
print(sample_ffn(pt_emb).shape)

class EncoderLayer(tf.keras.layers.Layer):
  def __init__(self,*, d_model, num_heads, dff, dropout_rate=0.1):
    super().__init__()

    self.self_attention = GlobalSelfAttention(
        num_heads=num_heads,
        key_dim=d_model,
        value_dim=d_model,
        dropout=dropout_rate)

    self.ffn = FeedForward(d_model, dff)

  def call(self, x):
    x = self.self_attention(x)
    x = self.ffn(x)
    return x
    
sample_encoder_layer = EncoderLayer(d_model=512, num_heads=8, dff=2048)

print(he_emb.shape)
print(sample_encoder_layer(he_emb).shape)

class Encoder(tf.keras.layers.Layer):
  def __init__(self, *, num_layers, d_model, num_heads,
               dff, vocab_size, dropout_rate=0.1):
    super().__init__()

    self.d_model = d_model
    self.num_layers = num_layers

    self.pos_embedding = PositionalEmbedding(
        vocab_size=vocab_size, d_model=d_model)

    self.enc_layers = [
        EncoderLayer(d_model=d_model,
                     num_heads=num_heads,
                     dff=dff,
                     dropout_rate=dropout_rate)
        for _ in range(num_layers)]
    self.dropout = tf.keras.layers.Dropout(dropout_rate)

  def call(self, x):
    # `x` is token-IDs shape: (batch, seq_len)
    x = self.pos_embedding(x)  # Shape `(batch_size, seq_len, d_model)`.

    # Add dropout.
    x = self.dropout(x)

    for i in range(self.num_layers):
      x = self.enc_layers[i](x)

    return x  # Shape `(batch_size, seq_len, d_model)`.
    
# Instantiate the encoder.
sample_encoder = Encoder(num_layers=4,
                         d_model=512,
                         num_heads=8,
                         dff=2048,
                         vocab_size=8500)

sample_encoder_output = sample_encoder(he, training=False)

# Print the shape.
print(he.shape)
print(sample_encoder_output.shape)  # Shape `(batch_size, input_seq_len, d_model)`.

class DecoderLayer(tf.keras.layers.Layer):
  def __init__(self,
               *,
               d_model,
               num_heads,
               dff,
               dropout_rate=0.1):
    super(DecoderLayer, self).__init__()

    self.causal_self_attention = CausalSelfAttention(
        num_heads=num_heads,
        key_dim=d_model,
        value_dim=d_model,
        dropout=dropout_rate)

    self.cross_attention = CrossAttention(
        num_heads=num_heads,
        key_dim=d_model,
        value_dim=d_model,
        dropout=dropout_rate)

    self.ffn = FeedForward(d_model, dff)

  def call(self, x, context):
    x = self.causal_self_attention(x=x)
    x = self.cross_attention(x=x, context=context)

    # Cache the last attention scores for plotting later
    self.last_attn_scores = self.cross_attention.last_attn_scores

    x = self.ffn(x)  # Shape `(batch_size, seq_len, d_model)`.
    return x
    
sample_decoder_layer = DecoderLayer(d_model=512, num_heads=8, dff=2048)

sample_decoder_layer_output = sample_decoder_layer(
    x=pt_emb, context=he_emb)

print(pt_emb.shape)
print(he_emb.shape)
print(sample_decoder_layer_output.shape)  # `(batch_size, seq_len, d_model)`

class Decoder(tf.keras.layers.Layer):
  def __init__(self, *, num_layers, d_model, num_heads, dff, vocab_size,
               dropout_rate=0.1):
    super(Decoder, self).__init__()

    self.d_model = d_model
    self.num_layers = num_layers

    self.pos_embedding = PositionalEmbedding(vocab_size=vocab_size,
                                             d_model=d_model)
    self.dropout = tf.keras.layers.Dropout(dropout_rate)
    self.dec_layers = [
        DecoderLayer(d_model=d_model, num_heads=num_heads,
                     dff=dff, dropout_rate=dropout_rate)
        for _ in range(num_layers)]

    self.last_attn_scores = None

  def call(self, x, context):
    # `x` is token-IDs shape (batch, target_seq_len)
    x = self.pos_embedding(x)  # (batch_size, target_seq_len, d_model)

    x = self.dropout(x)

    for i in range(self.num_layers):
      x  = self.dec_layers[i](x, context)

    self.last_attn_scores = self.dec_layers[-1].last_attn_scores

    # The shape of x is (batch_size, target_seq_len, d_model).
    return x
    
# Instantiate the decoder.
sample_decoder = Decoder(num_layers=4,
                         d_model=512,
                         num_heads=8,
                         dff=2048,
                         vocab_size=8000)

output = sample_decoder(
    x=pt,
    context=he_emb)

# Print the shapes.
print(pt.shape)
print(he_emb.shape)
print(output.shape)

sample_decoder.last_attn_scores.shape  # (batch, heads, target_seq, input_seq)

#%% The Transformer

class Transformer(tf.keras.Model):
  def __init__(self, *, num_layers, d_model, num_heads, dff,
               input_vocab_size, target_vocab_size, dropout_rate=0.1):
    super().__init__()
    self.encoder = Encoder(num_layers=num_layers, d_model=d_model,
                           num_heads=num_heads, dff=dff,
                           vocab_size=input_vocab_size,
                           dropout_rate=dropout_rate)

    self.decoder = Decoder(num_layers=num_layers, d_model=d_model,
                           num_heads=num_heads, dff=dff,
                           vocab_size=target_vocab_size,
                           dropout_rate=dropout_rate)

    self.final_layer = tf.keras.layers.Dense(target_vocab_size)

  def call(self, inputs):
    # To use a Keras model with `.fit` you must pass all your inputs in the
    # first argument.
    context, x  = inputs

    context = self.encoder(context)  # (batch_size, context_len, d_model)

    x = self.decoder(x, context)  # (batch_size, target_len, d_model)

    # Final linear layer output.
    logits = self.final_layer(x)  # (batch_size, target_len, target_vocab_size)

    try:
      # Drop the keras mask, so it doesn't scale the losses/metrics.
      # b/250038731
      del logits._keras_mask
    except AttributeError:
      pass

    # Return the final output and the attention weights.
    return logits
    
num_layers = 4
d_model = 128
dff = 512
num_heads = 8
dropout_rate = 0.1

transformer = Transformer(
    num_layers=num_layers,
    d_model=d_model,
    num_heads=num_heads,
    dff=dff,
    input_vocab_size=tokenizers.he.get_vocab_size().numpy(),
    target_vocab_size=tokenizers.pt.get_vocab_size().numpy(),
    dropout_rate=dropout_rate)
    
output = transformer((he, pt))

print(pt.shape)
print(he.shape)
print(output.shape)

attn_scores = transformer.decoder.dec_layers[-1].last_attn_scores
print(attn_scores.shape)  # (batch, heads, target_seq, input_seq)

# transformer.summary()

#%% Training

class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
  def __init__(self, d_model, warmup_steps=4000):
    super().__init__()

    self.d_model = d_model
    self.d_model = tf.cast(self.d_model, tf.float32)

    self.warmup_steps = warmup_steps

  def __call__(self, step):
    step = tf.cast(step, dtype=tf.float32)
    arg1 = tf.math.rsqrt(step)
    arg2 = step * (self.warmup_steps ** -1.5)

    return tf.math.rsqrt(self.d_model) * tf.math.minimum(arg1, arg2)
    
learning_rate = CustomSchedule(d_model)

optimizer = tf.keras.optimizers.Adam(learning_rate, beta_1=0.9, beta_2=0.98,
                                     epsilon=1e-9)
                                     
plt.plot(learning_rate(tf.range(40000, dtype=tf.float32)))
plt.ylabel('Learning Rate')
plt.xlabel('Train Step')
plt.savefig('lr.jpg')
plt.show()

def masked_loss(label, pred):
  mask = label != 0
  loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True, reduction='none')
  loss = loss_object(label, pred)

  mask = tf.cast(mask, dtype=loss.dtype)
  loss *= mask

  loss = tf.reduce_sum(loss)/tf.reduce_sum(mask)
  return loss


def masked_accuracy(label, pred):
  pred = tf.argmax(pred, axis=2)
  label = tf.cast(label, pred.dtype)
  match = label == pred

  mask = label != 0

  match = match & mask

  match = tf.cast(match, dtype=tf.float32)
  mask = tf.cast(mask, dtype=tf.float32)
  return tf.reduce_sum(match)/tf.reduce_sum(mask)
  
transformer.compile(
    loss=masked_loss,
    optimizer=optimizer,
    metrics=[masked_accuracy])
    
history = transformer.fit(train_batches,
                epochs=10,
                validation_data=val_batches)
                
train_loss = history.history['loss']
val_loss = history.history['val_loss']

np.save('train_loss.npy', train_loss)
np.save('val_loss.npy', val_loss)

train_accuracy = history.history.get('accuracy')  # Use 'acc' for older versions
val_accuracy = history.history.get('val_accuracy')  # Use 'val_acc' for older versions

import matplotlib.pyplot as plt

plt.figure(figsize=(12, 6))

# Plot training & validation loss values
plt.plot(train_loss, label='Training Loss', linewidth=3)  # Increase line thickness
plt.plot(val_loss, label='Validation Loss', linewidth=3)  # Increase line thickness
plt.title('Model Loss', fontsize=30)  # Increase title font size
plt.xlabel('Epoch', fontsize=25)      # Increase xlabel font size
plt.ylabel('Loss', fontsize=25)       # Increase ylabel font size
plt.legend(loc='upper right', fontsize=20)  # Increase legend font size
plt.grid(True)
plt.xticks(fontsize=20)
plt.yticks(fontsize=20)
plt.savefig('loss.jpg', bbox_inches='tight')
plt.show()

if train_accuracy and val_accuracy:
    plt.figure(figsize=(12, 6))

    # Plot training & validation accuracy values
    plt.plot(train_accuracy, label='Training Accuracy')
    plt.plot(val_accuracy, label='Validation Accuracy')
    plt.title('Model Accuracy')
    plt.xlabel('Epoch')
    plt.ylabel('Accuracy')
    plt.legend(loc='lower right')
    plt.grid(True)
    plt.savefig('accuracy.jpg')
    plt.show()