# import json
# import torch
# from transformers import AutoModelForCausalLM
# from deepseek_vl2.models import DeepseekVLV2Processor, DeepseekVLV2ForCausalLM
# from deepseek_vl2.utils.io import load_pil_images
# from tqdm import tqdm

# # 初始化模型和处理器
# model_path = "deepseek-ai/deepseek-vl2-small"
# vl_chat_processor: DeepseekVLV2Processor = DeepseekVLV2Processor.from_pretrained(model_path)
# tokenizer = vl_chat_processor.tokenizer
# vl_gpt: DeepseekVLV2ForCausalLM = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True)
# vl_gpt = vl_gpt.to(torch.bfloat16).cuda().eval()

# # 读取输入 JSON 文件
# input_path = "VQA_Data.json"
# output_path = "VQA_Data_text_only_Deepseek_Small.json"

# with open(input_path, "r", encoding="utf-8") as f:
#     data = json.load(f)

# results = []

# # 遍历每一个样本
# for item in tqdm(data):
#     # mcqid = item["MCQID"]
#     # image_path = item["image_path"]
#     # prompt = "Return only the letter of the correct option. " +item["prompt"]
#     question = item.get("Rephrased_Question", "").strip()
#     options = [
#         f"A. {str(item.get('Option1', '')).strip()}",
#         f"B. {str(item.get('Option2', '')).strip()}",
#         f"C. {str(item.get('Option3', '')).strip()}",
#         f"D. {str(item.get('Option4', '')).strip()}",
#     ]
#     prompt = "请回答以下与中国文化相关的问题：\n"+question + "\n" + "\n".join(options)+"\n这是一个多选题，请先返回所有可能的选项字母，再用中文解释你的选择。"
#     image_path = item['Image_path']  # <-- 请将这里替换为你的本地图片路径
#     conversation = [
#     {
#         "role": "<|User|>",
#         "content": f"\n<|ref|>{prompt}<|/ref|>.",
#         "images": []
        
#     },
#     {
#         "role": "<|Assistant|>",
#         "content": ""
#     }
#     ]
#     prepare_inputs = vl_chat_processor(
#             conversations=conversation,
#             images=[],
#             force_batchify=True,
#             system_prompt="",
#         ).to(dtype=torch.bfloat16,device=vl_gpt.device)
#     inputs_embeds = vl_gpt.prepare_inputs_embeds(**prepare_inputs) 
#     outputs = vl_gpt.generate(
#         inputs_embeds=inputs_embeds,
#         attention_mask=prepare_inputs.attention_mask,
#         pad_token_id=tokenizer.eos_token_id,
#         bos_token_id=tokenizer.bos_token_id,
#         eos_token_id=tokenizer.eos_token_id,
#         max_new_tokens=250,
#         do_sample=False,
#         use_cache=True
#     )

#     # 解码输出
#     answer = tokenizer.decode(outputs[0].cpu().tolist(), skip_special_tokens=True)


#     item["GPT4o_Answer"] = answer

# with open(output_path, "w", encoding="utf-8") as f:
#     json.dump(data, f, indent=2, ensure_ascii=False)

# print(f"✅ 处理完成，已将带有预测结果的数据保存至 {output_path}")


import os
import json
import torch
from transformers import AutoModelForCausalLM
from deepseek_vl2.models import DeepseekVLV2Processor, DeepseekVLV2ForCausalLM
# from deepseek_vl2.utils.io import load_pil_images # Text-only, not needed
from tqdm import tqdm

# --- 1. Load the model and its processor ---
# Identifier of the DeepSeek-VL2 checkpoint passed to `from_pretrained`.
model_path = "deepseek-ai/deepseek-vl2-small"

print(f"正在加载模型: {model_path}...")

# The tokenizer is taken from the processor so both share one vocabulary.
vl_chat_processor: DeepseekVLV2Processor = DeepseekVLV2Processor.from_pretrained(model_path)
tokenizer = vl_chat_processor.tokenizer

# Load the weights first, then cast to bfloat16, move to the CUDA device and
# switch to evaluation mode, one step at a time.
vl_gpt: DeepseekVLV2ForCausalLM = AutoModelForCausalLM.from_pretrained(
    model_path,
    trust_remote_code=True,
)
vl_gpt = vl_gpt.to(torch.bfloat16)
vl_gpt = vl_gpt.cuda()
vl_gpt = vl_gpt.eval()

print("模型加载完成。")


# --- 2. Paths and run configuration ---
# Base directory that contains one sub-folder per country (adjust as needed).
base_data_dir = "."

# Sub-folders (country names) handled by this run.
# Alternative batch: ["Indonesian", "Korea", "Mongolia", "Vitnamese", "Singapore"]
folders_to_process = [
    "India",
    "Japan",
]

# Directory that receives the answered JSON files; created when missing.
output_dir = "Output_Json_Deepseek"
os.makedirs(name=output_dir, exist_ok=True)


# --- 3. Dynamic prompt templates ---
# Maps folder (country) name -> language name -> prompt template.
# Every template carries two `str.format` placeholders: `{question}` and
# `{options}`. The folder keys must match the directory names on disk, which is
# why the "Vitnamese" spelling is kept as-is.
PROMPT_TEMPLATES = {
    "China": {
        "Chinese": "请回答以下与中国文化相关的问题：\n{question}\n{options}\n这是一个多选题，请先返回所有可能的选项字母，再用中文解释你的选择。",
        "English": "Please answer the following question related to Chinese Culture.\n{question}\n{options}\nThis is a multiple-choice question. Please first return all possible option letters, then explain your choice in English."
    },
    "Indonesian": {
        "Indonesian": "Silakan jawab pertanyaan berikut terkait Budaya Indonesia.\n{question}\n{options}\nIni adalah pertanyaan pilihan ganda. Harap kembalikan semua kemungkinan huruf opsi terlebih dahulu, lalu jelaskan pilihan Anda dalam Bahasa Indonesia.",
        "English": "Please answer the following question related to Indonesian Culture.\n{question}\n{options}\nThis is a multiple-choice question. Please first return all possible option letters, then explain your choice in English."
    },
    "Korea": {
        "Korean": "다음은 한국 문화와 관련된 질문입니다. 답변해 주세요.\n{question}\n{options}\n이것은 객관식 문제입니다. 먼저 가능한 모든 옵션 문자를 반환한 다음, 한국어로 당신의 선택을 설명해 주세요.",
        "English": "Please answer the following question related to Korean Culture.\n{question}\n{options}\nThis is a multiple-choice question. Please first return all possible option letters, then explain your choice in English."
    },
    "Mongolia": {
        "Mongolian": "Монголын соёлтой холбоотой дараах асуултад хариулна уу.\n{question}\n{options}\nЭнэ бол олон сонголттой асуулт юм. Эхлээд боломжит бүх сонголтын үсгийг буцааж, дараа нь сонголтоо монгол хэлээр тайлбарлана уу.",
        "English": "Please answer the following question related to Mongolian Culture.\n{question}\n{options}\nThis is a multiple-choice question. Please first return all possible option letters, then explain your choice in English."
    },
    "Singapore": {
        "English": "Please answer the following question related to Singaporean Culture.\n{question}\n{options}\nThis is a multiple-choice question. Please first return all possible option letters, then explain your choice in English.",
        "Malay": "Sila jawab soalan berikut yang berkaitan dengan Budaya Singapura.\n{question}\n{options}\nIni adalah soalan pilihan berganda. Sila kembalikan semua huruf pilihan yang mungkin terlebih dahulu, kemudian jelaskan pilihan anda dalam Bahasa Inggeris.",
        "Chinese": "请回答以下与新加坡文化相关的问题。\n{question}\n{options}\n这是一个多选题，请先返回所有可能的选项字母，再用英文解释你的选择。",
        # Fixed: this template previously asked about *Indian* culture
        # ("இந்திய"), contradicting the other Singapore templates.
        "Tamil": "சிங்கப்பூர் கலாச்சாரத்தைச் சார்ந்த கீழ்க்கண்ட கேள்விக்கு பதிலளிக்கவும்.\n{question}\n{options}\nஇது ஒரு பன்முகத் தேர்வு கேள்வி. முதலில் சாத்தியமான அனைத்து விருப்ப எழுத்துகளையும் கொடுத்து, பின்னர் உங்கள் தேர்வை ஆங்கிலத்தில் விளக்கவும்."
    },
    "Vitnamese": {
        "Vietnamese": "Vui lòng trả lời câu hỏi sau đây liên quan đến Văn hóa Việt Nam.\n{question}\n{options}\nĐây là một câu hỏi trắc nghiệm. Vui lòng trả về tất cả các chữ cái tùy chọn có thể có trước, sau đó giải thích lựa chọn của bạn bằng tiếng Việt.",
        "English": "Please answer the following question related to Vietnamese Culture.\n{question}\n{options}\nThis is a multiple-choice question. Please first return all possible option letters, then explain your choice in English."
    },
    "Japan": {
        "Japanese": "日本文化に関する次の質問に答えてください。\n{question}\n{options}\nこれは多肢選択式の質問です。まずすべての可能な選択肢のアルファベットを返し、その後にあなたの選択を英語で説明してください。",
        "English": "Please answer the following question related to Japanese Culture.\n{question}\n{options}\nThis is a multiple-choice question. Please first return all possible option letters, then explain your choice in English."
    },
    "India": {
        "Hindi": "कृपया भारतीय संस्कृति से संबंधित निम्नलिखित प्रश्न का उत्तर दें।\n{question}\n{options}\nयह एक बहुविकल्पीय प्रश्न है। कृपया पहले सभी संभावित विकल्प अक्षरों को लौटाएँ, फिर अंग्रेज़ी में अपने चयन की व्याख्या करें।  ",
        "English": "Please answer the following question related to Indian Culture.\n{question}\n{options}\nThis is a multiple-choice question. Please first return all possible option letters, then explain your choice in English."
    }
}

# Native language(s) per folder, reused unchanged from the Qwen script.
# A folder with several native languages stores them as one comma-separated
# string; the processing loop splits it and matches each name against the
# file name.
_SINGAPORE_NATIVE_LANGUAGES = ("Malay", "Chinese", "Tamil")

NATIVE_LANGUAGE_MAP = {
    "Indonesian": "Indonesian",
    "Korea": "Korean",
    "Mongolia": "Mongolian",
    "Singapore": ",".join(_SINGAPORE_NATIVE_LANGUAGES),
    "Vitnamese": "Vietnamese",
    "China": "Chinese",
    "Japan": "Japanese",
    "India": "Hindi",
}


# --- 4. Walk the folders and answer every matching file ---
# For each configured folder: pick the JSON files that match the name filter,
# build a text-only prompt per item, run generation, store the decoded answer
# on the item under "deepseek_vl2_small_answer", and write the updated data to
# `output_dir`.
for folder_name in folders_to_process:
    current_folder_path = os.path.join(base_data_dir, folder_name)

    # Skip folders that are missing on disk instead of failing the whole run.
    if not os.path.isdir(current_folder_path):
        print(f"⚠️  警告: 文件夹 '{current_folder_path}' 不存在，已跳过。")
        continue

    print(f"\n📁 开始处理文件夹: {current_folder_path}")

    # Templates registered for this folder. An unknown folder yields an empty
    # mapping so the warn-and-skip path below handles it (no KeyError).
    folder_templates = PROMPT_TEMPLATES.get(folder_name, {})

    # Sorted so that the processing order is reproducible between runs.
    for filename in sorted(os.listdir(current_folder_path)):
        # Only English, non-"Text_Only" JSON files are processed.
        is_target_file = (
            "Text_Only" not in filename
            and "English" in filename
            and filename.endswith(".json")
        )
        if not is_target_file:
            continue

        input_path = os.path.join(current_folder_path, filename)
        print(f"  ➡️  正在处理文件: {filename}")

        # --- Load the data ---
        try:
            with open(input_path, "r", encoding="utf-8") as f:
                data = json.load(f)
        except Exception as e:
            print(f"    ❌ 读取文件失败: {input_path}, 错误: {e}")
            continue  # skip this file

        # --- Pick the language and the prompt template ---
        # NOTE: with the file filter above, "English" is always in the name, so
        # the native-language fallback only matters if that filter is relaxed.
        if "English" in filename:
            language = "English"
        else:
            language = NATIVE_LANGUAGE_MAP.get(folder_name, "English")

        language_candidates = language.split(",")
        if len(language_candidates) == 1:
            prompt_template = folder_templates.get(language, "")
        else:
            # Several native languages: use the first one named in the file name.
            prompt_template = next(
                (
                    folder_templates.get(candidate, "")
                    for candidate in language_candidates
                    if candidate in filename
                ),
                "",
            )

        if not prompt_template:
            print(f"    ⚠️ 警告: 未能为文件 {filename} 找到匹配的提示词模板，将跳过此文件。")
            continue

        # --- Process the items one by one ---
        failed_items = 0
        for item in tqdm(data, desc=f"  Processing items in {filename}", leave=False):
            try:
                # Question and the four options come from the raw data item.
                question = item.get("Rephrased_Question", "").strip()
                options = [
                    f"A. {str(item.get('Option1', '')).strip()}",
                    f"B. {str(item.get('Option2', '')).strip()}",
                    f"C. {str(item.get('Option3', '')).strip()}",
                    f"D. {str(item.get('Option4', '')).strip()}",
                ]

                # Fill the template with the actual question and options.
                text_prompt = prompt_template.format(
                    question=question,
                    options="\n".join(options)
                )

                # DeepSeek-VL2 conversation format. The task is text-only, so
                # the "images" list stays empty.
                conversation = [
                    {
                        "role": "<|User|>",
                        "content": f"\n<|ref|>{text_prompt}<|/ref|>.",
                        "images": []
                    },
                    {
                        "role": "<|Assistant|>",
                        "content": ""
                    }
                ]

                # Pre-process the conversation into model inputs.
                prepare_inputs = vl_chat_processor(
                    conversations=conversation,
                    images=[],  # explicitly pass an empty list
                    force_batchify=True,
                    system_prompt="",
                ).to(dtype=torch.bfloat16, device=vl_gpt.device)

                # Inference only: build the embeddings and generate without
                # recording an autograd graph.
                with torch.no_grad():
                    inputs_embeds = vl_gpt.prepare_inputs_embeds(**prepare_inputs)
                    outputs = vl_gpt.generate(
                        inputs_embeds=inputs_embeds,
                        attention_mask=prepare_inputs.attention_mask,
                        pad_token_id=tokenizer.eos_token_id,
                        bos_token_id=tokenizer.bos_token_id,
                        eos_token_id=tokenizer.eos_token_id,
                        max_new_tokens=1024,  # kept consistent with the Qwen script
                        do_sample=False,
                        use_cache=True
                    )

                # Decode the first (only) sequence into the final answer text.
                output_text = tokenizer.decode(
                    outputs[0].cpu().tolist(),
                    skip_special_tokens=True
                ).strip()

                # --- Add the new field to the original item ---
                # The field name reflects the model that produced the answer.
                item["deepseek_vl2_small_answer"] = output_text

            except Exception as e:
                # Best effort: record the failure on the item and keep going.
                failed_items += 1
                item["deepseek_vl2_small_answer"] = f"Error: {str(e)}"

        # Surface failures on the console; otherwise they would only be
        # visible by inspecting the output file.
        if failed_items:
            print(f"    ⚠️ 警告: 文件 {filename} 中有 {failed_items} 条数据处理失败，错误信息已写入对应字段。")

        # --- Save the updated data ---
        base_filename = os.path.splitext(filename)[0]
        output_filename = f"{base_filename}_deepseekVL2s_answered.json"
        output_path = os.path.join(output_dir, output_filename)

        with open(output_path, "w", encoding="utf-8") as f:
            json.dump(data, f, indent=2, ensure_ascii=False)

        print(f"    ✅ 处理完成，结果已保存至: {output_path}")

print(f"\n🎉 所有文件夹处理完毕！所有输出文件已保存到 '{output_dir}' 文件夹中。")
