import random
import torch
from transformers import AutoProcessor, LlavaForConditionalGeneration, AutoTokenizer


import os
import time
# import requests
# from PIL import Image


print("finish import")
# NOTE(review): `transformers` is already imported at the top of this file, so
# setting this variable here is probably too late to change the library's
# default cache location for this process -- TODO confirm against the installed
# transformers version. Every `from_pretrained` call in this script passes
# `cache_dir` explicitly, so the loads do not depend on it.
os.environ['TRANSFORMERS_CACHE'] = "/XXXX-5/home-XXXX-3/workspace/MLLMSD/profile_cache"


print("set env")

# Seed BOTH generators: `random` feeds `random_sequence`, while the actual
# benchmark input below is drawn with `torch.randint`, which has its own RNG.
# Without the torch seed the benchmark input changes on every run.
random.seed(67)
torch.manual_seed(67)

# 2048 random token ids on GPU 0. Not referenced by the rest of the visible
# script; kept so the module-level names stay unchanged.
random_sequence = torch.tensor([random.randint(14, 30562) for _ in range(2048)]).cuda(0)

# Benchmark input: one random prompt of `seq_len` token ids, shared by every
# row of the batch (`expand` returns a view, so no extra memory is allocated).
batch_size = 8  # largest batch size the input tensor can serve
seq_len = 200  # prompt length in tokens
random_sequences = torch.randint(3, 30562, (1, seq_len)).cuda(0)
random_sequences = random_sequences.expand(batch_size, -1)

print("set tokenizers")

# Shared arguments for every `from_pretrained` call below.
# NOTE(review): the access token is hard-coded in source; consider reading it
# from the environment instead.
MODEL_ID = "XXXX-2/lvlm68m"
CACHE_DIR = "/XXXX-5/home-XXXX-3/workspace/MLLMSD/profile_cache"
HF_TOKEN = "XXXX-1"

# Loaded for parity with the original script; not used by the visible
# text-only benchmark below.
processor = AutoProcessor.from_pretrained(MODEL_ID, cache_dir=CACHE_DIR, token=HF_TOKEN)

# Load a tokenizer compatible with the model. A tokenizer has no tensor dtype,
# so no `torch_dtype` is passed here.
tokenizer = AutoTokenizer.from_pretrained(
    MODEL_ID,
    cache_dir=CACHE_DIR,
    token=HF_TOKEN,
)

# `add_special_tokens` both registers '[PAD]' in the vocabulary and links it to
# `tokenizer.pad_token`; `tokenizer.pad_token_id` is derived from that, so no
# further assignment is needed.
# NOTE(review): if '[PAD]' is a new token, its id may lie outside the model's
# embedding table -- harmless here only as long as the pad id is never fed to
# the model; verify before reusing this setup for padded inputs.
tokenizer.add_special_tokens({'pad_token': '[PAD]'})
tokenizer.padding_side = "left"

# Load the draft model in full precision and move it to GPU 0.
draft_model = LlavaForConditionalGeneration.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.float32,
    cache_dir=CACHE_DIR,
    token=HF_TOKEN,
).cuda(0)

print("Finished loading models.")

# Warm-up: run a fixed-size generation (batch of 1, exactly 100 new tokens)
# `warmup_step` times so one-time costs are paid before anything is timed.
warmup_step = 50
warmup_input = random_sequences[:1, :seq_len]  # loop-invariant, sliced once
for _ in range(warmup_step):
    draft_model.generate(
        warmup_input,
        max_new_tokens=100,
        min_new_tokens=100,  # min == max forces exactly 100 generated tokens
        pad_token_id=tokenizer.pad_token_id,
    )
# CUDA kernels are launched asynchronously; wait for all warm-up work to
# finish so none of it is attributed to the first timed measurement.
torch.cuda.synchronize()

        
# Set up timing measurement.
# time_list[k][j] holds the average wall-clock seconds of one `generate` call
# for batch size batch_sizes[k] and the j-th new-token count.
time_list = []
batch_sizes = [1, 2, 4, 8]
seq_len = 200
repeat = 3  # Repeat inference to get stable timing
measure_step = 200

for idx, bs in enumerate(batch_sizes):
    time_list.append([])
    batch_input = random_sequences[:bs, :seq_len]  # invariant for this batch size
    for i in range(1, measure_step, 2):  # odd new-token counts 1, 3, ..., measure_step - 1
        # CUDA work is asynchronous: drain the device before starting the
        # clock and again before stopping it, so the interval covers exactly
        # the GPU work issued by these `repeat` calls.
        torch.cuda.synchronize()
        start_time = time.perf_counter()  # monotonic, high-resolution clock
        for _ in range(repeat):
            draft_model.generate(
                batch_input,
                max_new_tokens=i,
                min_new_tokens=i,  # min == max forces exactly i generated tokens
                pad_token_id=tokenizer.pad_token_id,
            )
        torch.cuda.synchronize()
        end_time = time.perf_counter()
        print(f"Generate {i} tokens {repeat} times")
        # Append average time taken per generation
        time_list[idx].append((end_time - start_time) / repeat)
    print(time_list)
    # Rewrite the log with the cumulative results after every batch size, so
    # the measurements gathered so far are on disk if a later run fails.
    with open("log_time_list.txt", "w") as f:
        f.write(str(time_list))
        f.write("\n")
print("Finished timing measurements.")

import matplotlib.pyplot as plt

# Draw one per-step-latency curve per batch size and save the figure as a PDF.
plt.figure(figsize=(8, 6))
batch_sizes = [1, 2, 4, 8]
# X axis: the new-token counts 1, 3, ..., measure_step - 1.
token_counts = range(1, measure_step, 2)
for row, bs in enumerate(batch_sizes):
    # Convert each average generation time (seconds) into milliseconds per
    # auto-regression step by dividing by the number of generated tokens.
    latencies_ms = []
    for elapsed, n_tokens in zip(time_list[row], token_counts):
        latencies_ms.append(elapsed * 1000 / n_tokens)

    plt.plot(token_counts, latencies_ms, label=f'b={bs}')

plt.title('Per-step Latency for Different Batch Sizes')
plt.xlabel('Auto-regression Steps')
plt.ylabel('Per-step Latency (ms)')
plt.grid(True)
plt.legend()
plt.savefig("vlm_scaling_plot.pdf")
plt.show()
