import json
import random

# Build a JSONL file of composite prompts by joining randomly chosen prompt
# components read from another JSONL file.
#
# Each input line is a JSON object with a "prompt" and a "category" key.
# Each output line is a JSON object whose "prompt" is the selected component
# prompts joined with " /// " and whose "category" is the list of the
# corresponding categories, in the same order.

# Fixed seed so that repeated runs over the same input give the same output.
random.seed(42)

input_file = "../../data/system_prompts/generated_prompt_components_20250315_zh.jsonl"
output_file = input_file.replace("prompt_components", "prompt")
if output_file == input_file:
    # The replace() above found nothing to replace; writing would clobber the
    # component file that is being read.
    raise ValueError(
        f"output path equals input path ({input_file!r}); "
        "the input file name must contain 'prompt_components'"
    )
num_combinations = 10000
max_length = 30
# Heuristic stop condition: this many rejected draws in a row is taken to mean
# the pool of distinct combinations is exhausted (see the sampling loop).
max_consecutive_duplicates = 10000

components = []
with open(input_file, "r", encoding="utf-8") as infile:
    for line in infile:
        line = line.strip()
        if not line:
            # Tolerate blank lines instead of failing in json.loads("").
            continue
        data = json.loads(line)
        components.append((data["prompt"], data["category"]))

if not components:
    # With nothing to sample from, only the empty combination exists and the
    # sampling loop could never reach num_combinations.
    raise ValueError(f"no prompt components found in {input_file!r}")

index_list = list(range(len(components)))

# Candidate combination lengths and their weights. The weight i ** -0.8
# decreases with i, so shorter combinations are drawn more often. Both are
# loop-invariant and therefore built once.
lengths = range(1, max_length + 1)
length_weights = [1 / (i ** 0.8) for i in lengths]

seen_combinations = set()
generated_data = []
consecutive_duplicates = 0

while len(generated_data) < num_combinations:
    length = random.choices(lengths, weights=length_weights)[0]

    # Sample without replacement; the order of the sample is kept, so the same
    # components in a different order count as a different combination.
    selected_indices = random.sample(index_list, min(length, len(components)))

    combined_prompt = " /// ".join([components[i][0] for i in selected_indices])
    combined_category = [components[i][1] for i in selected_indices]

    combination_tuple = (tuple(combined_category), combined_prompt)

    if combination_tuple in seen_combinations:
        consecutive_duplicates += 1
        if consecutive_duplicates >= max_consecutive_duplicates:
            # Fail loudly rather than spin forever when the input cannot
            # supply the requested number of distinct combinations.
            raise RuntimeError(
                f"only {len(generated_data)} unique combinations generated "
                f"(wanted {num_combinations}); the last "
                f"{max_consecutive_duplicates} draws were all duplicates, "
                f"so {len(components)} components appear to be too few"
            )
        continue

    consecutive_duplicates = 0
    seen_combinations.add(combination_tuple)
    generated_data.append({"prompt": combined_prompt, "category": combined_category})

with open(output_file, "w", encoding="utf-8") as outfile:
    for item in generated_data:
        # ensure_ascii=False keeps non-ASCII text readable in the output file.
        outfile.write(json.dumps(item, ensure_ascii=False) + "\n")

