import json
import random
import string
from datasets import load_dataset

def process_sciq_dataset(split='test', output_questions_file='processed_sciq.json', output_answers_file='sciq_answer_keys.json', *, dataset=None, seed=None):
    """Turn SciQ entries into shuffled multiple-choice questions and an answer key.

    For every entry the three distractors and the correct answer are shuffled,
    labelled "A", "B", ... and joined into one newline-separated string. Two
    JSON files are written: a list of ``{"question", "options"}`` objects and
    a parallel list holding the label of the correct option of each question.

    Args:
        split: Split name passed to ``load_dataset`` when ``dataset`` is None.
        output_questions_file: Path of the JSON file receiving the questions.
        output_answers_file: Path of the JSON file receiving the answer labels.
        dataset: Optional iterable of already-loaded entries, each a mapping
            with the keys ``question``, ``correct_answer``, ``distractor1``,
            ``distractor2`` and ``distractor3``. When given, ``load_dataset``
            is not called and ``split`` is ignored.
        seed: Optional seed for the option shuffle. With a seed, repeated runs
            write identical files; with None the module-level ``random``
            state is used, exactly as before this parameter existed.
    """
    if dataset is None:
        # Load the requested split of the SciQ dataset.
        dataset = load_dataset("allenai/sciq", split=split)

    # A private generator makes seeded runs reproducible without disturbing
    # the global random state; without a seed, keep using the shared one.
    rng = random if seed is None else random.Random(seed)

    processed_data = []
    answer_keys = []

    for entry in dataset:
        correct_answer = entry['correct_answer']

        # Merge all options and shuffle them so the answer position varies.
        options = [entry['distractor1'], entry['distractor2'], entry['distractor3'], correct_answer]
        rng.shuffle(options)

        # Render the options as "A. text" lines.
        option_labels = string.ascii_uppercase[:len(options)]
        options_str = '\n'.join(f"{label}. {text}" for label, text in zip(option_labels, options))

        processed_data.append({
            'question': entry['question'],
            'options': options_str,
        })

        # Record the label of the first option whose text equals the answer.
        answer_keys.append(option_labels[options.index(correct_answer)])

    # Save the processed questions.
    with open(output_questions_file, 'w', encoding='utf-8') as f:
        json.dump(processed_data, f, indent=4, ensure_ascii=False)

    # Save the labels of the correct options, in question order.
    with open(output_answers_file, 'w', encoding='utf-8') as f:
        json.dump(answer_keys, f, indent=4, ensure_ascii=False)

    print(f"处理后的问题数据已保存到 {output_questions_file}")
    print(f"正确答案的选项标号已保存到 {output_answers_file}")

# Example usage: build the question and answer-key files for the test split.
# Guarded so that importing this module does not trigger a dataset download
# or write any files; running the file as a script behaves as before.
if __name__ == "__main__":
    process_sciq_dataset(split='test')
