import re
import os
import random
from datasets import Dataset, load_dataset
from random import randint, seed, choice
from typing import List, Tuple
from tqdm import tqdm
from verl.utils.hdfs_io import copy, makedirs
import argparse
import json

# Input files, relative to the current working directory.
CODE_SAMPLES_PATH = "sat_code_5-5_1clique_0hop.jsonl"
STORY_PARQUET_FILES = (
    "writingprompts/data/train-00000-of-00002-105e07cb0d199464.parquet",
    "writingprompts/data/train-00001-of-00002-4fdb982c11056472.parquet",
)


def load_jsonl_samples(path):
    """Read a JSON Lines file and return its records as a list.

    Blank lines are skipped so that a trailing newline (or an accidental
    empty line) does not abort the whole run with a JSON decode error.
    """
    samples = []
    with open(path, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if line:
                samples.append(json.loads(line))
    return samples


def load_story_backgrounds(parquet_files):
    """Collect the "story" field of every row of the given parquet files."""
    stories = []
    for data_file in parquet_files:
        dataset = load_dataset('parquet', data_files=data_file)
        for example in dataset['train']:
            if "story" in example:
                stories.append(example["story"])
    return stories


def build_prompt(code, background):
    """Return the user prompt that embeds the Z3 code and a story background."""
    return f'''Z3 Code:
{code}

Background:
{background}

Integrate all information from the Z3 code into the Background to generate a challenging natural language content. Do not refer to or quote the code directly, and do not use symbolic identifiers (e.g., "A1", "C5") in the narrative.
Begin with a straightforward version in natural language, then progressively refine it to be more complex, either by using more sophisticated vocabulary, crafting a more intricate or abstract setting, or adding layers of conceptual difficulty.
Ensure that each constraint encoded in the Z3 code is explicitly represented in the final version of the natural language content, each constraint should be clearly reflected one by one, while the final solution must remain undisclosed.
After that, provide natural language definitions for each variable used in the code. Each line formatted as: "[Variable name]: [Definition in the natural language content]".

Conclude your response with following format:
Natural Language Content:
[content]

Definitions:
[definitions]'''


def build_samples(code_samples, backgrounds, rng=random):
    """Pair each usable code sample with a randomly chosen background.

    Args:
        code_samples: iterable of dicts; entries lacking a "problem" or an
            "answer" key are skipped.
        backgrounds: non-empty sequence of background texts to sample from.
        rng: object providing ``choice`` (the ``random`` module by default,
            or a seeded ``random.Random`` for reproducible output).

    Returns:
        A list of dataset rows with the keys "data_source", "prompt",
        "ability" and "reward_model".

    Raises:
        ValueError: if ``backgrounds`` is empty.
    """
    if not backgrounds:
        raise ValueError("no background stories available to sample from")

    rows = []
    for code_sample in code_samples:
        # Make sure the keys we need are present.
        if "problem" not in code_sample or "answer" not in code_sample:
            continue
        background = rng.choice(backgrounds)
        rows.append({
            "data_source": 'fl2nl-translation',
            "prompt": [{
                "role": "user",
                "content": build_prompt(code_sample["problem"], background),
            }],
            "ability": "translation",
            "reward_model": {
                "style": "rule",
                "ground_truth": {
                    "code": code_sample["problem"],
                    "answer": code_sample["answer"],
                    "background": background,
                },
            },
        })
    return rows


def split_samples(samples):
    """Split ``samples`` into ``(train, test)`` lists.

    With at least 3500 samples the first 3000 are used for training and the
    next 500 for testing. Otherwise the first ``min(1000, 80%)`` samples are
    used for training and all remaining samples for testing.
    """
    if len(samples) >= 3500:
        return samples[:3000], samples[3000:3500]

    # Not enough data: fall back to an 80/20 split.
    # NOTE(review): the 1000 cap means that for 1250..3499 samples the train
    # share drops below 80% (e.g. 3000 samples -> 1000 train / 2000 test).
    # Kept as-is to preserve existing behaviour; confirm whether it is intended.
    train_size = min(1000, len(samples) * 8 // 10)
    return samples[:train_size], samples[train_size:]


def main():
    """Build the train/test parquet files and optionally copy them to HDFS."""
    parser = argparse.ArgumentParser()
    parser.add_argument('--local_dir', default='')
    parser.add_argument('--hdfs_dir', default=None)
    parser.add_argument('--seed', type=int, default=None,
                        help='seed for background sampling (default: unseeded)')
    args = parser.parse_args()

    # A private generator keeps runs reproducible when --seed is given;
    # Random(None) seeds itself from the OS, matching the unseeded default.
    rng = random.Random(args.seed)

    # Read the raw code samples and the story backgrounds.
    train_samples = load_jsonl_samples(CODE_SAMPLES_PATH)
    example_contents = load_story_backgrounds(STORY_PARQUET_FILES)

    # Create the new training samples.
    enhanced_samples = build_samples(train_samples, example_contents, rng)
    if not enhanced_samples:
        raise ValueError(
            f'no sample in {CODE_SAMPLES_PATH} has both "problem" and "answer"'
        )

    print(enhanced_samples[0])

    train_rows, test_rows = split_samples(enhanced_samples)
    train_dataset = Dataset.from_list(train_rows)
    test_dataset = Dataset.from_list(test_rows)

    print(f"Created training dataset with {len(train_dataset)} samples")
    print(f"Created test dataset with {len(test_dataset)} samples")

    # os.makedirs('') raises FileNotFoundError, so an empty --local_dir
    # (the default) is mapped to the current directory.
    local_dir = os.path.expanduser(args.local_dir) or '.'
    os.makedirs(local_dir, exist_ok=True)

    # Save the datasets.
    train_dataset.to_parquet(os.path.join(local_dir, 'train.parquet'))
    test_dataset.to_parquet(os.path.join(local_dir, 'test.parquet'))

    if args.hdfs_dir is not None:
        makedirs(args.hdfs_dir)
        copy(src=local_dir, dst=args.hdfs_dir)


if __name__ == '__main__':
    main()