from transformers import AutoProcessor, IdeficsForVisionText2Text
from transformers.generation import GenerationConfig
import torch
import random
from PIL import Image

# Seed torch's RNG so that sampling in generate() starts from a fixed state.
torch.manual_seed(1234)


# Hub checkpoint shared by the processor, the weights and the generation config.
_CHECKPOINT = "HuggingFaceM4/idefics-9b-instruct"

processor = AutoProcessor.from_pretrained(_CHECKPOINT)

# Load the weights in bfloat16 with device_map="cuda", then switch the module
# to evaluation mode.
model = IdeficsForVisionText2Text.from_pretrained(
    _CHECKPOINT,
    torch_dtype=torch.bfloat16,
    device_map="cuda",
)
model.eval()

# Explicitly attach the generation config published with the checkpoint.
model.generation_config = GenerationConfig.from_pretrained(_CHECKPOINT)


def call_model(image_path, text_prompt: str) -> str:
    """Ask the module-level IDEFICS model about one image and return its reply.

    A single-turn prompt (``"user:"``, the image, ``text_prompt``,
    ``"Assistant:"``) is run through the module-level ``processor`` and a
    completion is sampled from the module-level ``model``.

    Args:
        image_path: Location of the image; passed straight to
            ``PIL.Image.open``.
        text_prompt: Instruction or question about the image.

    Returns:
        The newly generated text only (the prompt is not echoed back), with
        special tokens removed and surrounding whitespace stripped.
    """
    # Close the underlying file as soon as the pixels have been read;
    # convert() hands back an independent copy of the image.
    with Image.open(image_path) as raw_image:
        image = raw_image.convert("RGB")

    prompts = ["user:", image, text_prompt, "Assistant:"]

    # Put the inputs wherever the model lives instead of hard-coding "cuda".
    inputs = processor(prompts, return_tensors="pt").to(model.device)

    tokenizer = processor.tokenizer
    # Stop generating once the model closes its utterance.
    exit_condition = tokenizer(
        "<end_of_utterance>", add_special_tokens=False
    ).input_ids
    # Forbid the image placeholder tokens from appearing in the output.
    bad_words_ids = tokenizer(
        ["<image>", "<fake_token_around_image>"], add_special_tokens=False
    ).input_ids

    generated_ids = model.generate(
        **inputs,
        eos_token_id=exit_condition,
        bad_words_ids=bad_words_ids,
        do_sample=True,
        # NOTE: max_length bounds prompt + generated tokens together.
        max_length=512,
    )

    # generate() returns the prompt tokens followed by the continuation. Drop
    # the prompt by position rather than splitting the decoded text on
    # "Assistant:", which would cut off any reply that contains that string.
    prompt_length = inputs["input_ids"].shape[1]
    new_token_ids = generated_ids[:, prompt_length:]
    decoded = processor.batch_decode(new_token_ids, skip_special_tokens=True)

    return decoded[0].strip()

