import json
import re
from datasets import load_dataset



# Load the 'main' configuration of the GSM8K dataset; its 'train' split is
# exported below.
dataset = load_dataset('openai/gsm8k', 'main')

# Captures the number that follows the '####' marker in a solution string:
# optional minus sign, digits with thousands separators (commas are kept
# verbatim in the output), and an optional decimal part.
ANSWER_PATTERN = re.compile(r'####\s*(-?[\d,]+(?:\.\d+)?)')

# Write the train split as JSON Lines, one record per line.  The file is
# created/truncated here; the encoding is pinned to UTF-8 because records are
# dumped with ensure_ascii=False and must not depend on the platform default.
with open('processed_gsm8k_comma.jsonl', 'w', encoding='utf-8') as f:
    for idx, item in enumerate(dataset['train']):
        original_answer = item['answer']

        answer_match = ANSWER_PATTERN.search(original_answer)
        # Assign on every iteration: a record without a '####' answer gets
        # None instead of raising NameError or inheriting the previous
        # record's answer.
        extracted_answer = answer_match.group(1) if answer_match else None

        processed_item = {
            'id': idx,
            'en': item['question'],
            'answer': extracted_answer,
            'original_answer': original_answer,
        }

        json.dump(processed_item, f, ensure_ascii=False)
        f.write("\n")

# Captures the number that follows the '####' marker in a solution string:
# optional minus sign, digits with thousands separators (commas are kept
# verbatim in the output), and an optional decimal part.  A digits-only
# pattern would cut "1,234" down to "1".
ANSWER_PATTERN = re.compile(r'####\s*(-?[\d,]+(?:\.\d+)?)')

# Append the test split to the same JSON Lines file.  Ids continue after the
# train split's ids so they stay unique across the whole file, and every
# record is tagged with 'split': 'test'.
with open('processed_gsm8k_comma.jsonl', 'a', encoding='utf-8') as f:
    for idx, item in enumerate(dataset['test'], start=len(dataset['train'])):
        original_answer = item['answer']

        answer_match = ANSWER_PATTERN.search(original_answer)
        # Records without a '####' answer are kept, with a null answer.
        extracted_answer = answer_match.group(1) if answer_match else None

        processed_item = {
            'id': idx,
            'split': 'test',
            'en': item['question'],
            'answer': extracted_answer,
            'original_answer': original_answer,
        }

        json.dump(processed_item, f, ensure_ascii=False)
        f.write("\n")

print("Finished processing，saved as 'processed_gsm8k_comma.jsonl'")
