import os
import random
import re

import jsonlines
from datasets import load_dataset


def contains_chinese(text):
    """Return True if *text* has at least one character in U+4E00-U+9FFF.

    The check is a single regex search over the string; only the
    U+4E00-U+9FFF range is tested, so characters outside it (for example
    punctuation or other scripts) never count as a match.
    """
    # Match any single character in the U+4E00-U+9FFF range.
    hit = re.search('[\u4e00-\u9fff]', text)
    if hit is None:
        return False
    return True


# Build a random subset of OpenR1-Math-220k and write it out as JSON Lines.
#
# Pipeline: load the train split, drop multiple-choice items and items whose
# problem text contains Chinese characters, shuffle, keep at most SAMPLE_SIZE
# records, and write them to OUTPUT_PATH.
OUTPUT_PATH = 'critic_bench/openr1_math.jsonl'
SAMPLE_SIZE = 10000

# Create the output directory up front: otherwise a missing directory is only
# discovered when the writer opens the file, after the whole dataset has
# already been loaded and filtered.
_output_dir = os.path.dirname(OUTPUT_PATH)
if _output_dir:
    os.makedirs(_output_dir, exist_ok=True)

ds = load_dataset('open-r1/OpenR1-Math-220k', split='train')

# Keep only non-MCQ items whose problem statement has no Chinese characters.
# 'level' is always written as None here -- presumably a placeholder so the
# records share a schema with other datasets; TODO confirm with consumers.
output_data = [
    {
        'problem': item['problem'],
        'solution': item['solution'],
        'answer': item['answer'],
        'level': None,
    }
    for item in ds
    if item['question_type'] != 'MCQ' and not contains_chinese(item['problem'])
]

# NOTE: no seed is set, so the selected subset differs from run to run.
random.shuffle(output_data)
# Slicing tolerates fewer than SAMPLE_SIZE records (it simply keeps them all).
output_data = output_data[:SAMPLE_SIZE]

with jsonlines.open(OUTPUT_PATH, 'w') as writer:
    writer.write_all(output_data)
