import os
import json
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from tqdm import tqdm

# --- 1. Model and tokenizer loading ---
# Identifier handed to from_pretrained(); this run targets
# Qwen3-30B-A3B-Thinking-2507.
model_id = "Qwen/Qwen3-30B-A3B-Thinking-2507"

print(f"正在加载模型: {model_id}...")
# The model is text-only, so the plain AutoTokenizer / AutoModelForCausalLM
# pair is used. Both the dtype and the device placement are left on "auto".
model_load_options = {"torch_dtype": "auto", "device_map": "auto"}
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, **model_load_options)
print("模型加载完成。")

# --- 2. Paths and run configuration (kept the same as the original script) ---
# Directory that receives the annotated JSON files; created up front so the
# save step later in the script can write into it.
output_dir = "Output_Json_Qwen3-30B-A3B-Thinking-2507_for_remain_tamil"
os.makedirs(output_dir, exist_ok=True)

# Root directory containing one sub-folder per entry of folders_to_process.
base_data_dir = "."
# Sub-folders handled in this run. An earlier configuration also listed
# "Japan": ["Singapore", "Japan"].
folders_to_process = ["Singapore"]

# --- 3. Prompt templates (kept the same as the original script) ---
# Outer key: data folder name. Inner key: prompt language. Every template has
# a {question} and an {options} placeholder that are filled per item.
PROMPT_TEMPLATES = {
    "China": {
        "Chinese": "请回答以下与中国文化相关的问题：\n{question}\n{options}\n这是一个多选题，请先返回所有可能的选项字母，再用中文解释你的选择。",
        "English": "Please answer the following question related to Chinese Culture.\n{question}\n{options}\nThis is a multiple-choice question. Please first return all possible option letters, then explain your choice in English."
    },
    "Indonesian": {
        "Indonesian": "Silakan jawab pertanyaan berikut terkait Budaya Indonesia.\n{question}\n{options}\nIni adalah pertanyaan pilihan ganda. Harap kembalikan semua kemungkinan huruf opsi terlebih dahulu, lalu jelaskan pilihan Anda dalam Bahasa Indonesia.",
        "English": "Please answer the following question related to Indonesian Culture.\n{question}\n{options}\nThis is a multiple-choice question. Please first return all possible option letters, then explain your choice in English."
    },
    "Korea": {
        "Korean": "다음은 한국 문화와 관련된 질문입니다. 답변해 주세요.\n{question}\n{options}\n이것은 객관식 문제입니다. 먼저 가능한 모든 옵션 문자를 반환한 다음, 한국어로 당신의 선택을 설명해 주세요.",
        "English": "Please answer the following question related to Korean Culture.\n{question}\n{options}\nThis is a multiple-choice question. Please first return all possible option letters, then explain your choice in English."
    },
    "Mongolia": {
        "Mongolian": "Монголын соёлтой холбоотой дараах асуултад хариулна уу.\n{question}\n{options}\nЭнэ бол олон сонголттой асуулт юм. Эхлээд боломжит бүх сонголтын үсгийг буцааж, дараа нь сонголтоо монгол хэлээр тайлбарлана уу.",
        "English": "Please answer the following question related to Mongolian Culture.\n{question}\n{options}\nThis is a multiple-choice question. Please first return all possible option letters, then explain your choice in English."
    },
    "Singapore": {
        "English": "Please answer the following question related to Singaporean Culture.\n{question}\n{options}\nThis is a multiple-choice question. Please first return all possible option letters, then explain your choice in English.",
        "Malay": "Sila jawab soalan berikut yang berkaitan dengan Budaya Singapura.\n{question}\n{options}\nIni adalah soalan pilihan berganda. Sila kembalikan semua huruf pilihan yang mungkin terlebih dahulu, kemudian jelaskan pilihan anda dalam Bahasa Inggeris.",
        "Chinese": "请回答以下与新加坡文化相关的问题。\n{question}\n{options}\n这是一个多选题，请先返回所有可能的选项字母，再用英文解释你的选择。",
        # NOTE(review): this Tamil prompt opens with "Indian culture"
        # (இந்திய கலாச்சாரம்) whereas the other three Singapore prompts say
        # "Singaporean culture". Left untouched so results stay comparable
        # with earlier runs -- confirm whether this wording is intended.
        "Tamil": "இந்திய கலாச்சாரத்தைச் சார்ந்த கீழ்க்கண்ட கேள்விக்கு பதிலளிக்கவும்.\n{question}\n{options}\nஇது ஒரு பன்முகத் தேர்வு கேள்வி. முதலில் சாத்தியமான அனைத்து விருப்ப எழுத்துகளையும் கொடுத்து, பின்னர் உங்கள் தேர்வை ஆங்கிலத்தில் விளக்கவும்."
    },
    "Vitnamese": {  # spelling deliberately matches the folder name on disk
        "Vietnamese": "Vui lòng trả lời câu hỏi sau đây liên quan đến Văn hóa Việt Nam.\n{question}\n{options}\nĐây là một câu hỏi trắc nghiệm. Vui lòng trả về tất cả các chữ cái tùy chọn có thể có trước, sau đó giải thích lựa chọn của bạn bằng tiếng Việt.",
        "English": "Please answer the following question related to Vietnamese Culture.\n{question}\n{options}\nThis is a multiple-choice question. Please first return all possible option letters, then explain your choice in English."
    },
    "Japan": {
        "Japanese": "日本文化に関する次の質問に答えてください。\n{question}\n{options}\nこれは多肢選択式の質問です。まずすべての可能な選択肢のアルファベットを返し、その後にあなたの選択を英語で説明してください。",
        "English": "Please answer the following question related to Japanese Culture.\n{question}\n{options}\nThis is a multiple-choice question. Please first return all possible option letters, then explain your choice in English."
    },
}

# Folder name -> native prompt language(s) for files that are not English.
# A comma-separated value means the folder holds files in several native
# languages; the processing loop picks the one whose name occurs in the
# filename. Folders missing from this map fall back to "English", so every
# folder in PROMPT_TEMPLATES needs an entry here ("China" was missing before,
# which sent Chinese-language files through the English prompt).
NATIVE_LANGUAGE_MAP = {
    "China": "Chinese",
    "Indonesian": "Indonesian",
    "Korea": "Korean",
    "Mongolia": "Mongolian",
    "Singapore": "Malay,Chinese,Tamil",
    "Vitnamese": "Vietnamese",
    "Japan": "Japanese"
}

# --- 4. Walk the folders and files, query the model, and save the results ---

# Token id of "</think>" for this tokenizer (value carried over from the
# original script). Everything up to and including its last occurrence in the
# generated ids is treated as the reasoning trace.
THINK_END_TOKEN_ID = 151668

# Generation budget per item.
# NOTE(review): the original inline comment suggested 2048 is usually enough,
# but the value in use is 1024. If generation stops before "</think>" is
# produced, the whole output is stored as the answer and the thinking field
# stays empty -- confirm this budget is what you want.
MAX_NEW_TOKENS = 1024


def _should_process(filename):
    """Return True if *filename* is a non-English Text_Only JSON file to run."""
    if "Text_Only" not in filename or not filename.endswith(".json"):
        return False
    if "English" in filename:
        return False
    # For Singapore ("SG") files only the Tamil and Malay variants are run.
    # The previous condition joined the two "not in" tests with `or`, which is
    # true for every SG file and therefore skipped Tamil and Malay as well.
    if "SG" in filename and "Tamil" not in filename and "Malay" not in filename:
        return False
    return True


def _select_prompt_template(folder_name, filename):
    """Return the native-language template for *filename*, or None if absent."""
    templates = PROMPT_TEMPLATES.get(folder_name, {})
    if "English" in filename:
        # Unreachable while _should_process() rejects English files; kept so
        # the selection logic stays complete on its own.
        return templates.get("English")

    languages = NATIVE_LANGUAGE_MAP.get(folder_name, "English").split(",")
    if len(languages) == 1:
        return templates.get(languages[0])
    # Several native languages share the folder: use the first one whose name
    # appears in the filename.
    for lang_part in languages:
        if lang_part in filename:
            return templates.get(lang_part)
    return None


def _build_prompt(item, prompt_template):
    """Fill *prompt_template* with the item's question and options A-D."""
    question = item.get("Question", "").strip()
    option_lines = []
    for number, letter in enumerate("ABCD", start=1):
        option_text = str(item.get(f"Option{number}", "")).strip()
        option_lines.append(f"{letter}. {option_text}")
    return prompt_template.format(question=question, options="\n".join(option_lines))


def _split_thinking(output_ids):
    """Split generated token ids into (thinking_text, answer_text)."""
    try:
        # Searching the reversed list finds the LAST "</think>"; split_at is
        # the index just past it.
        split_at = len(output_ids) - output_ids[::-1].index(THINK_END_TOKEN_ID)
    except ValueError:
        # No "</think>" in the output: treat everything as the final answer.
        return "", tokenizer.decode(output_ids, skip_special_tokens=True).strip()
    thinking_text = tokenizer.decode(output_ids[:split_at], skip_special_tokens=True).strip()
    answer_text = tokenizer.decode(output_ids[split_at:], skip_special_tokens=True).strip()
    return thinking_text, answer_text


def _generate_answer(text_prompt):
    """Run one single-turn chat generation and return (thinking, answer)."""
    messages = [{"role": "user", "content": text_prompt}]
    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )
    model_inputs = tokenizer([text], return_tensors="pt").to(model.device)

    with torch.no_grad():
        generated_ids = model.generate(**model_inputs, max_new_tokens=MAX_NEW_TOKENS)

    # Drop the prompt tokens so only the newly generated ids remain.
    prompt_length = len(model_inputs.input_ids[0])
    output_ids = generated_ids[0][prompt_length:].tolist()
    return _split_thinking(output_ids)


for folder_name in folders_to_process:
    current_folder_path = os.path.join(base_data_dir, folder_name)

    if not os.path.isdir(current_folder_path):
        print(f"⚠️  警告: 文件夹 '{current_folder_path}' 不存在，已跳过。")
        continue

    print(f"\n📁 开始处理文件夹: {current_folder_path}")

    # Sorted so files are processed in a reproducible order across runs.
    for filename in sorted(os.listdir(current_folder_path)):
        if not _should_process(filename):
            continue

        input_path = os.path.join(current_folder_path, filename)
        print(f"  ➡️  正在处理文件: {filename}")

        try:
            with open(input_path, "r", encoding="utf-8") as f:
                data = json.load(f)
        except (OSError, ValueError) as e:
            # ValueError covers both invalid JSON and undecodable bytes.
            print(f"    ❌ 读取文件失败: {input_path}, 错误: {e}")
            continue

        prompt_template = _select_prompt_template(folder_name, filename)
        if prompt_template is None:
            print(f"    ⚠️ 警告: 未找到文件 '{filename}' 对应的Prompt模板，将使用默认英语模板。")
            prompt_template = PROMPT_TEMPLATES.get(folder_name, {}).get("English")
        if prompt_template is None:
            # Previously an "Error: ..." placeholder string was sent to the
            # model as if it were a prompt; skipping the file is safer.
            print(f"    ❌ 未找到文件夹 '{folder_name}' 的任何Prompt模板，已跳过文件: {filename}")
            continue

        for item in tqdm(data, desc=f"  Processing items in {filename}", leave=False):
            try:
                thinking_content, content = _generate_answer(
                    _build_prompt(item, prompt_template)
                )
            except Exception as e:
                # Best effort: record the failure on the item and carry on
                # with the rest of the file.
                thinking_content = content = f"Error: {str(e)}"

            # Field names reflect the model that produced them.
            item["qwen3_30b_thinking"] = thinking_content
            item["qwen3_30b_answer"] = content

        # --- Save the annotated data; the output name reflects the model ---
        base_filename = os.path.splitext(filename)[0]
        output_filename = f"{base_filename}_qwen3_30b_answered.json"
        output_path = os.path.join(output_dir, output_filename)

        with open(output_path, "w", encoding="utf-8") as f:
            json.dump(data, f, indent=2, ensure_ascii=False)

        print(f"    ✅ 处理完成，结果已保存至: {output_path}")

print(f"\n🎉 所有文件夹处理完毕！所有输出文件已保存到 '{output_dir}' 文件夹中。")
