import os
import json
import torch
from transformers import AutoProcessor, Glm4vForConditionalGeneration
from tqdm import tqdm

# --- 1. Load the GLM-4.1V model and its processor ---
# Hugging Face identifier of the checkpoint to load.
model_id = "THUDM/GLM-4.1V-9B-Thinking"

print(f"正在加载模型: {model_id}...")

# Keyword arguments forwarded to from_pretrained().  According to the original
# author's note, bfloat16 is the officially recommended dtype and helps with
# both speed and memory use.
_model_load_kwargs = {
    "torch_dtype": torch.bfloat16,
    "device_map": "auto",
}

# GLM-4.1V has its own model class; the generic AutoProcessor resolves the
# matching processor (use_fast=True is, per the original note, the officially
# recommended setting).
model = Glm4vForConditionalGeneration.from_pretrained(model_id, **_model_load_kwargs)
processor = AutoProcessor.from_pretrained(model_id, use_fast=True)

print("模型加载完成。")

# --- 2. Paths and run configuration ---
# Nothing in this section depends on which model is used.

# Directory that contains one sub-folder per country/culture.
base_data_dir = "."

# Results go to a GLM-specific directory so they are not confused with the
# output of other models.
output_dir = "Output_Json_GLM"

# Sub-folders of base_data_dir to process, in this order.  The spelling
# "Vitnamese" is kept as-is: it is also the key used by PROMPT_TEMPLATES and
# NATIVE_LANGUAGE_MAP.
folders_to_process = [
    "Indonesian",
    "Korea",
    "Mongolia",
    "Vitnamese",
    "Singapore",
]

# Create the output directory up front; an already existing one is fine.
os.makedirs(output_dir, exist_ok=True)

# --- 3. Per-culture prompt templates ---
# Independent of the model.  Outer key: data folder name; inner key: prompt
# language.  Every template contains the {question} and {options} placeholders
# that are later filled in with str.format().


def _english_prompt(culture):
    """Return the English prompt template for the culture adjective *culture*."""
    return (
        "Please answer the following question related to " + culture + " Culture."
        "\n{question}\n{options}\n"
        "This is a multiple-choice question. Please first return all possible "
        "option letters, then explain your choice in English."
    )


PROMPT_TEMPLATES = {
    "China": {
        "Chinese": "请回答以下与中国文化相关的问题：\n{question}\n{options}\n这是一个多选题，请先返回所有可能的选项字母，再用中文解释你的选择。",
        "English": _english_prompt("Chinese"),
    },
    "Indonesian": {
        "Indonesian": "Silakan jawab pertanyaan berikut terkait Budaya Indonesia.\n{question}\n{options}\nIni adalah pertanyaan pilihan ganda. Harap kembalikan semua kemungkinan huruf opsi terlebih dahulu, lalu jelaskan pilihan Anda dalam Bahasa Indonesia.",
        "English": _english_prompt("Indonesian"),
    },
    "Korea": {
        "Korean": "다음은 한국 문화와 관련된 질문입니다. 답변해 주세요.\n{question}\n{options}\n이것은 객관식 문제입니다. 먼저 가능한 모든 옵션 문자를 반환한 다음, 한국어로 당신의 선택을 설명해 주세요.",
        "English": _english_prompt("Korean"),
    },
    "Mongolia": {
        "Mongolian": "Монголын соёлтой холбоотой дараах асуултад хариулна уу.\n{question}\n{options}\nЭнэ бол олон сонголттой асуулт юм. Эхлээд боломжит бүх сонголтын үсгийг буцааж, дараа нь сонголтоо монгол хэлээр тайлбарлана уу.",
        "English": _english_prompt("Mongolian"),
    },
    # NOTE(review): the Malay and Chinese Singapore prompts both ask for the
    # explanation in English, unlike the other native-language prompts, which
    # ask for it in their own language — confirm this is intended.
    "Singapore": {
        "English": _english_prompt("Singaporean"),
        "Malay": "Sila jawab soalan berikut yang berkaitan dengan Budaya Singapura.\n{question}\n{options}\nIni adalah soalan pilihan berganda. Sila kembalikan semua huruf pilihan yang mungkin terlebih dahulu, kemudian jelaskan pilihan anda dalam Bahasa Inggeris.",
        "Chinese": "请回答以下与新加坡文化相关的问题。\n{question}\n{options}\n这是一个多选题，请先返回所有可能的选项字母，再用英文解释你的选择。",
    },
    "Vitnamese": {
        "Vietnamese": "Vui lòng trả lời câu hỏi sau đây liên quan đến Văn hóa Việt Nam.\n{question}\n{options}\nĐây là một câu hỏi trắc nghiệm. Vui lòng trả về tất cả các chữ cái tùy chọn có thể có trước, sau đó giải thích lựa chọn của bạn bằng tiếng Việt.",
        "English": _english_prompt("Vietnamese"),
    },
}

# Folder name -> prompt language used for that folder's files whose name does
# not contain "English".  When a folder has several such languages they are
# joined with commas; the processing loop splits the value on "," and picks
# the language whose name appears in the file name.
NATIVE_LANGUAGE_MAP = dict(
    Indonesian="Indonesian",
    Korea="Korean",
    Mongolia="Mongolian",
    Singapore="Malay,Chinese",
    Vitnamese="Vietnamese",
)


# --- 4. Walk every folder and file, query the model, and save the answers ---

# Field added to every processed item; the name records which model answered.
ANSWER_FIELD = "glm_4_1_answer"

# Upper bound on newly generated tokens per item.  Kept at 1024 to stay in
# line with the Qwen version of this script (per the original author's note).
# NOTE(review): model_id names a "-Thinking" checkpoint; if it emits its
# reasoning before the final answer, 1024 tokens may cut the answer off —
# confirm against the model card.
MAX_NEW_TOKENS = 1024


def _select_prompt_template(folder_name, filename):
    """Return the prompt template for one data file, or None if there is none.

    Files whose name contains "English" use the English template.  Other files
    use the folder's native language from NATIVE_LANGUAGE_MAP; when a folder
    lists several native languages (comma-separated), the first one whose name
    appears in *filename* wins.  If that lookup yields nothing, the folder's
    English template is used as a fallback.
    """
    templates = PROMPT_TEMPLATES.get(folder_name, {})

    if "English" in filename:
        candidates = ["English"]
    else:
        candidates = NATIVE_LANGUAGE_MAP.get(folder_name, "English").split(",")

    if len(candidates) == 1:
        chosen = candidates[0]
    else:
        # Several native languages share the folder: the file name says which.
        chosen = next((lang for lang in candidates if lang in filename), None)

    return templates.get(chosen) or templates.get("English")


def _as_text(value):
    """Return *value* as stripped text; None (missing or JSON null) becomes ""."""
    return "" if value is None else str(value).strip()


def _build_prompt(item, prompt_template):
    """Fill *prompt_template* with the question and the four options of *item*."""
    question = _as_text(item.get("Rephrased_Question"))
    option_keys = ("Option1", "Option2", "Option3", "Option4")
    options = "\n".join(
        f"{letter}. {_as_text(item.get(key))}"
        for letter, key in zip("ABCD", option_keys)
    )
    return prompt_template.format(question=question, options=options)


def _generate_answer(text_prompt):
    """Run the model on a text-only prompt and return the decoded answer."""
    # Chat format expected by the processor; the content list holds a single
    # "text" entry because no image is attached.
    messages = [
        {
            "role": "user",
            "content": [{"type": "text", "text": text_prompt}],
        }
    ]

    # apply_chat_template both renders the chat template and tokenizes,
    # returning a dict-like batch that is moved to the model's device.
    inputs = processor.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,
        return_dict=True,
        return_tensors="pt",
    ).to(model.device)

    with torch.no_grad():
        generated_ids = model.generate(**inputs, max_new_tokens=MAX_NEW_TOKENS)

    # The generated sequence starts with the prompt tokens; drop them so only
    # the newly generated part is decoded.
    prompt_length = inputs["input_ids"].shape[1]
    return processor.decode(
        generated_ids[0][prompt_length:],
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False,
    ).strip()


def _answer_items(data, prompt_template, filename):
    """Answer every item of *data* in place.

    Each dict item receives an ANSWER_FIELD entry holding either the model
    output or "Error: <message>" when processing that item failed.  Items that
    are not dicts cannot carry an answer and are left untouched.

    Returns a ``(failed, skipped)`` tuple: the number of items whose processing
    raised, and the number of non-dict items that were skipped.
    """
    failed = 0
    skipped = 0
    for item in tqdm(data, desc=f"  Processing items in {filename}", leave=False):
        if not isinstance(item, dict):
            skipped += 1
            continue
        try:
            item[ANSWER_FIELD] = _generate_answer(_build_prompt(item, prompt_template))
        except Exception as e:
            # Deliberate best effort: one bad item must not abort the file, so
            # the error is stored in the item and counted for the summary.
            item[ANSWER_FIELD] = f"Error: {str(e)}"
            failed += 1
    return failed, skipped


for folder_name in folders_to_process:
    current_folder_path = os.path.join(base_data_dir, folder_name)

    if not os.path.isdir(current_folder_path):
        print(f"⚠️  警告: 文件夹 '{current_folder_path}' 不存在，已跳过。")
        continue

    print(f"\n📁 开始处理文件夹: {current_folder_path}")

    # Sorted so that the processing order is the same on every run.
    for filename in sorted(os.listdir(current_folder_path)):
        # Only JSON files that are not "Text_Only" variants are processed.
        if "Text_Only" in filename or not filename.endswith(".json"):
            continue

        input_path = os.path.join(current_folder_path, filename)
        print(f"  ➡️  正在处理文件: {filename}")

        try:
            with open(input_path, "r", encoding="utf-8") as f:
                data = json.load(f)
        except (OSError, ValueError) as e:
            # ValueError covers both invalid JSON and invalid UTF-8.
            print(f"    ❌ 读取文件失败: {input_path}, 错误: {e}")
            continue

        # The loop below expects a list of question dicts; anything else
        # cannot be annotated with answers.
        if not isinstance(data, list):
            print(f"    ⚠️  警告: 文件 '{input_path}' 的顶层结构不是列表，已跳过。")
            continue

        prompt_template = _select_prompt_template(folder_name, filename)
        if prompt_template is None:
            # Never send a placeholder error string to the model as a prompt.
            print(f"    ⚠️  警告: 未找到 '{filename}' 对应的提示词模板，已跳过。")
            continue

        failed_count, skipped_count = _answer_items(data, prompt_template, filename)
        if failed_count:
            print(f"    ⚠️  警告: {failed_count} 条数据处理失败，错误信息已写入 '{ANSWER_FIELD}' 字段。")
        if skipped_count:
            print(f"    ⚠️  警告: {skipped_count} 条数据不是字典对象，已跳过。")

        # Save the annotated data under a file name that identifies the model.
        base_filename = os.path.splitext(filename)[0]
        output_filename = f"{base_filename}_glm4.1_answered.json"
        output_path = os.path.join(output_dir, output_filename)

        with open(output_path, "w", encoding="utf-8") as f:
            json.dump(data, f, indent=2, ensure_ascii=False)

        print(f"    ✅ 处理完成，结果已保存至: {output_path}")

print(f"\n🎉 所有文件夹处理完毕！所有输出文件已保存到 '{output_dir}' 文件夹中。")
