import time
import json
import torch
import random
import numpy as np
from transformers import AutoTokenizer, AutoModel

def main():
    """Benchmark an embedding forward pass plus a matrix product on the GPU.

    For each of 10 seeds, a synthetic 50-word sentence is tokenized, moved to
    the GPU and run through ``sentence-transformers/all-mpnet-base-v2``. The
    hidden state at position 0 is then multiplied by a random
    ``[n, hidden_size]`` matrix.

    Only the forward pass and the matrix product are timed. Tokenization,
    host-to-device copies, the warm-up forward pass and the random-matrix
    creation all happen before the clock starts, and the device is
    synchronized on both sides of the timed region so that asynchronously
    queued work is neither leaked into nor dropped from the measurement.

    Prints mean/std/min/max of the timings (ms) and writes the per-run
    timings to ``times.json`` in the current working directory.

    Raises:
        RuntimeError: If CUDA is not available.
    """
    # torch.device("cuda") alone does not fail on a CPU-only machine; the
    # error would only surface later at .to(device), after the model has
    # already been loaded. Fail early and clearly instead.
    if not torch.cuda.is_available():
        raise RuntimeError("CUDA is required for this benchmark but is not available.")
    device = torch.device("cuda")

    # We use mpnet-base from Hugging Face
    model_name = "sentence-transformers/all-mpnet-base-v2"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name).to(device)
    model.eval()

    # We run 10 benchmarks with different seeds
    seeds = range(10)
    times_ms = []

    # We pick random words from a small list for our synthetic text
    word_list = ["foo", "bar", "baz", "qux", "lorem", "ipsum", "dolor", "sit", "amet", "test"]

    # Number of words sampled for each synthetic sentence
    num_tokens = 50

    # For matrix multiplication: n x d
    n = 1000

    for seed in seeds:
        random.seed(seed)
        np.random.seed(seed)
        torch.manual_seed(seed)

        # --- Prepare random text (50 words) ---
        tokens = random.choices(word_list, k=num_tokens)
        text = " ".join(tokens)

        # --- Tokenize on CPU (not timed) ---
        encoded = tokenizer(text, return_tensors='pt', padding=True, truncation=True)

        # Move input tensors to GPU *before* we start timing
        input_ids = encoded["input_ids"].to(device)
        attention_mask = encoded["attention_mask"].to(device)

        # --- Warm-up forward pass (not timed) ---
        # It tells us hidden_size, and it also runs the model once on these
        # exact inputs before the measured pass.
        with torch.no_grad():
            quick_out = model(input_ids, attention_mask=attention_mask)
        hidden_size = quick_out.last_hidden_state.shape[-1]

        # Build the random matrix of shape [n, hidden_size] on GPU, once per
        # seed/trial so it is reproducible for that seed (not timed).
        random_matrix = torch.randn(n, hidden_size, device=device)

        # ---- MEASURE ONLY GPU EXECUTION TIME ----
        # CUDA work is queued asynchronously: wait for the transfers, the
        # warm-up pass and the randn fill to finish before starting the clock.
        torch.cuda.synchronize()
        start_t = time.perf_counter()

        with torch.no_grad():
            # 1) Forward pass
            outputs = model(input_ids, attention_mask=attention_mask)
            # Use the vector at position 0 as the embedding
            # shape: [batch_size, hidden_size]
            embedding = outputs.last_hidden_state[:, 0, :]

            # 2) Multiply embedding [1, hidden_size] by random_matrix^T
            #    [hidden_size, n]; the value is discarded, only its cost matters.
            result = embedding @ random_matrix.t()  # shape: [1, n]

        # Synchronize so the queued GPU ops have completed before stopping the clock
        torch.cuda.synchronize()
        end_t = time.perf_counter()

        elapsed_ms = (end_t - start_t) * 1000.0
        times_ms.append(elapsed_ms)

    # Summarize times
    times_array = np.array(times_ms)
    mean_time = times_array.mean()
    std_time = times_array.std()
    min_time = times_array.min()
    max_time = times_array.max()

    print(f"Benchmark results (GPU kernel time only) over {len(times_ms)} runs (ms):")
    print(f"Mean: {mean_time:.2f} ms")
    print(f"Std : {std_time:.2f} ms")
    print(f"Min : {min_time:.2f} ms")
    print(f"Max : {max_time:.2f} ms")

    # Save the list of all per-run measurements to JSON
    with open("times.json", "w") as f:
        json.dump(times_ms, f, indent=2)

# Run the benchmark only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
