import torch
import time
from transformers import BertModel, BertTokenizer

# Checkpoint name shared by the encoder weights and the tokenizer vocabulary
model_name = 'bert-base-uncased'

# Run on CUDA when PyTorch can see a GPU, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Fetch the pretrained encoder and its matching tokenizer
model = BertModel.from_pretrained(model_name)
tokenizer = BertTokenizer.from_pretrained(model_name)

# Place the weights on the chosen device and switch to evaluation mode
# (both calls hand back the module itself, so they can be chained)
model = model.to(device).eval()

# Sequence lengths to benchmark: the powers of two from 8 up to 512
sequence_lengths = [2 ** exponent for exponent in range(3, 10)]

# Fixed sentence that is tokenized (padded or truncated) for every benchmark run
dummy_text = "This is a test sentence to evaluate the model's performance on different sequence lengths."

# Function to measure latency for a given sequence length
def measure_latency(seq_len: int, repetitions: int = 10) -> float:
    """Return the mean wall-clock time, in seconds, of one forward pass.

    The module-level ``dummy_text`` is tokenized with ``max_length=seq_len``,
    ``padding="max_length"`` and ``truncation=True``, moved to the module-level
    ``device``, and fed through the module-level ``model`` under
    ``torch.no_grad()``. Five untimed warm-up passes run first, then
    ``repetitions`` timed passes.

    Args:
        seq_len: Token length the input is padded or truncated to.
        repetitions: Number of timed forward passes to average over.

    Returns:
        Total elapsed time of the timed passes divided by ``repetitions``.

    Raises:
        ValueError: If ``repetitions`` is less than 1.
    """
    if repetitions < 1:
        raise ValueError(f"repetitions must be a positive integer, got {repetitions}")

    inputs = tokenizer(dummy_text, return_tensors="pt", max_length=seq_len, padding="max_length", truncation=True)
    input_ids = inputs['input_ids'].to(device)
    attention_mask = inputs['attention_mask'].to(device)

    # CUDA kernels are launched asynchronously: without an explicit
    # synchronize the host clock would only capture kernel launch overhead.
    on_cuda = device.type == "cuda"

    with torch.no_grad():
        # Warm-up
        for _ in range(5):
            model(input_ids, attention_mask=attention_mask)

        # Drain pending warm-up work so it is not billed to the timed section.
        if on_cuda:
            torch.cuda.synchronize(device)

        # Measure latency; perf_counter is monotonic and high-resolution,
        # unlike time.time() which can jump with system clock adjustments.
        start_time = time.perf_counter()
        for _ in range(repetitions):
            model(input_ids, attention_mask=attention_mask)

        # Wait for every timed kernel to finish before reading the clock.
        if on_cuda:
            torch.cuda.synchronize(device)
        end_time = time.perf_counter()

    avg_latency = (end_time - start_time) / repetitions
    return avg_latency

# Benchmark each configured sequence length in order and report its result.
# map() is lazy, so every measurement runs right before its line is printed.
timings = map(measure_latency, sequence_lengths)
for seq_len, latency in zip(sequence_lengths, timings):
    print(f"Average latency for sequence length {seq_len}: {latency:.6f} seconds")