from modelscope import Qwen2_5_VLForConditionalGeneration, AutoTokenizer, AutoProcessor, AutoModelForCausalLM
from qwen_vl_utils import process_vision_info
import torch
import base64
import requests
from tqdm import tqdm
import glob
import os
from PIL import Image
# from openai import OpenAI
import numpy as np
import subprocess
import re
import json

# Load the causal language model and its tokenizer once, at import time, from
# the local "./Qwen2.5-32B-Instruct/" checkpoint directory. Both objects are
# module-level globals that generate_questions() reads directly.
model = AutoModelForCausalLM.from_pretrained(
    "./Qwen2.5-32B-Instruct/",
    # dtype selection and device placement are both delegated to the loader.
    torch_dtype="auto",
    device_map="auto"
)
tokenizer = AutoTokenizer.from_pretrained("./Qwen2.5-32B-Instruct/")

def get_json_files(output_dir):
    """Return the extension-less base names of the ``*.json`` files in *output_dir*.

    A path that does not exist yields an empty list. Matching is delegated to
    ``glob.glob``, so the result comes back in whatever order it reports.
    """
    if os.path.exists(output_dir):
        pattern = os.path.join(output_dir, "*.json")
        stems = []
        for match in glob.glob(pattern):
            # Drop the directory part first, then the ".json" suffix.
            stem, _suffix = os.path.splitext(os.path.basename(match))
            stems.append(stem)
        return stems

    return []

def read_json_file(file_path):
    """Parse the UTF-8 encoded JSON file at *file_path* and return its content."""
    with open(file_path, mode='r', encoding='utf-8') as handle:
        parsed = json.load(handle)
    return parsed


def generate_questions(model_name, json_data, task_name, task_description, file_name, output_dir):
    """Query the module-level model once per question and save every reply.

    For each entry of *json_data* a two-message chat (system + user) is built
    from the protein sequence and the answer options, the module-level
    ``model``/``tokenizer`` generate a completion, and the prompt/response pair
    is written to ``<output_dir>/<protein_name>.json``.

    Args:
        model_name: Not used by this function; kept so existing callers work.
        json_data: Iterable of question dicts. Each must provide
            ``"Protein Image"`` (a list whose first element is a ``/``-separated
            path; its second-to-last component is used as the protein name),
            ``"Protein Sequence"`` and ``"Multiple Choices"`` (a mapping from
            option key to option text).
        task_name: Task title interpolated into the system prompt.
        task_description: Task description interpolated into the system prompt.
        file_name: Not used by this function; kept so existing callers work.
        output_dir: Directory that receives the per-protein JSON files; it must
            already exist.

    Returns:
        A list with one ``{"response": ..., "input": ...}`` dict per processed
        item, in input order (the same objects that are written to disk).
    """
    # The system prompt depends only on the task, so it is built once rather
    # than on every loop iteration.
    system_prompt = f"""You are an excellent scientist. Your should analyze the provided protein-related task carefully and choose the correct answer form the multiple-choice options.

                        The current task is about {task_name}, which {task_description}. The inputs provided by the user for this task include:

                        * Protein Sequence: The amino acid sequence of the protein.
                        * Multiple Choices: Options for the answer.

                        Please think step-by-step about this problem:
                        1. Analyze the protein sequence carefully
                        2. Consider the biological context and function
                        3. Evaluate each multiple choice option
                        4. Provide your reasoning process
                        5. Finally, give your answer

                        Provide your response in following format:
                        1. reasoning: [Your detailed reasoning here]
                        2. answer: Your final answer, which should be \"A\", \"B\", \"C\" or \"D\".
                        """

    results = []
    for item in json_data:
        # The protein identifier is the directory component that directly
        # contains the first image path.
        protein_name = item["Protein Image"][0].split('/')[-2]
        protein_sequence = item["Protein Sequence"]

        choices = item["Multiple Choices"]
        choices_text = "\n".join(f"{key}. {value}" for key, value in choices.items())

        task_specific_user_prompt = f"""
                                    [Protein Sequence]
                                    {protein_sequence}

                                    [Multiple Choices]
                                    {choices_text}
                         """

        messages_content = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": task_specific_user_prompt}
        ]
        text = tokenizer.apply_chat_template(
            messages_content,
            tokenize=False,
            add_generation_prompt=True
        )
        model_inputs = tokenizer([text], return_tensors="pt").to(model.device)

        generated_ids = model.generate(
            **model_inputs,
            max_new_tokens=10240
        )
        # Each generated sequence starts with the prompt tokens; slice them off
        # so that only the newly generated completion is decoded.
        generated_ids = [
            output_ids[len(input_ids):]
            for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
        ]

        response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]

        detailed_result = {
            "response": response,
            "input": messages_content,
        }
        output_path = os.path.join(output_dir, f"{protein_name}.json")
        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(detailed_result, f, indent=2, ensure_ascii=False)

        # Collect the result so the caller receives every response instead of
        # an always-empty list.
        results.append(detailed_result)

    return results

def get_json_files_in_directory(directory_path):
    """List the entries of *directory_path* whose name ends in ``.json``, minus extension.

    Returns an empty list when the path does not exist. Names are reported in
    the order ``os.listdir`` yields them.
    """
    if os.path.exists(directory_path):
        return [
            os.path.splitext(entry)[0]
            for entry in os.listdir(directory_path)
            if entry.endswith('.json')
        ]

    return []

def read_prompt_json(file_path):
    """Load a JSON file on a best-effort basis.

    Args:
        file_path: Path of the UTF-8 encoded JSON file to read.

    Returns:
        The decoded JSON content, or ``None`` when the file cannot be opened
        or does not hold valid UTF-8 JSON. The failure reason is logged as a
        warning instead of being dropped silently.
    """
    import logging  # local import: keeps this function self-contained

    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            return json.load(f)
    except (OSError, ValueError) as e:
        # OSError covers missing/unreadable files; ValueError covers both
        # json.JSONDecodeError and UnicodeDecodeError. Anything else is a
        # programming error and is allowed to propagate.
        logging.getLogger(__name__).warning(
            "Could not read JSON file %s: %s", file_path, e
        )
        return None

def main():
    """Run every task's questions through the model and write the collected results.

    Task metadata is read from ``./prompt.json``; the questions of each task
    come from ``./questions/<task>.json``. Outputs go to
    ``./responses/<task>/<model_name>/``, where the list returned by
    ``generate_questions`` is additionally dumped to
    ``<task>_all_questions.json``.

    Raises:
        RuntimeError: If ``./prompt.json`` could not be loaded.
        KeyError: If one of the tasks has no entry in ``./prompt.json``.
    """
    prompt_data = read_prompt_json("./prompt.json")
    if prompt_data is None:
        # read_prompt_json signals failure with None; stop with a clear message
        # instead of failing on the iteration below.
        raise RuntimeError("Could not load task definitions from ./prompt.json")

    task_dict = {task['task']: task for task in prompt_data}
    tasks = ['EC_number', 'pathway', 'transmembrane', 'ph', 'temperature']
    model_name = 'Qwen2.5-32B-Instruct'

    # Verify all task metadata up front so a missing entry is reported before
    # any generation has been run for the earlier tasks.
    missing = [name for name in tasks if name not in task_dict]
    if missing:
        raise KeyError(f"Tasks missing from ./prompt.json: {missing}")

    for file_name in tasks:
        input_file = os.path.join("./questions", f"{file_name}.json")
        output_dir = os.path.join("./responses", file_name, model_name)
        task_name = task_dict[file_name]['task_name']
        task_description = task_dict[file_name]['task_description']
        json_data = read_json_file(input_file)
        os.makedirs(output_dir, exist_ok=True)
        results = generate_questions(
            model_name, json_data, task_name, task_description, file_name, output_dir
        )

        all_output_path = os.path.join(output_dir, f"{file_name}_all_questions.json")
        with open(all_output_path, 'w', encoding='utf-8') as f:
            json.dump(results, f, indent=2, ensure_ascii=False)

# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()