import json
import os

# Directory the candidate JSONL files are read from.
DATA_DIR = "./data"

# Directory the filtered, truncated copies are written to.
SAMPLE_DATA_DIR = "./data_python_high_quality"

# File names processed by sample_data(); each is read from DATA_DIR and
# written under the same name into SAMPLE_DATA_DIR.
data_files = [
    "completion_candidates.jsonl",
    "construct_candidates.jsonl",
    "task_candidates.jsonl",
]

# Maximum number of records kept per file.
SAMPLE_NUM = 150_000

# Create the destination up front so the output files can be opened for writing.
os.makedirs(SAMPLE_DATA_DIR, exist_ok=True)

def sample_data(*, data_dir=None, sample_dir=None, files=None, sample_num=None):
    """Write the first ``sample_num`` Python records of each JSONL file.

    For every file name in ``files`` the file ``{data_dir}/{name}`` is read
    line by line. Blank lines are skipped and every other line is parsed as
    one JSON object. Records whose ``item["patch_analysis"]["language"]``
    equals ``"python"`` (case-insensitively) are selected, and the first
    ``sample_num`` of them, in file order, are written as JSON Lines to
    ``{sample_dir}/{name}``. No random sampling takes place.

    Both files are handled as UTF-8: the output is serialized with
    ``ensure_ascii=False``, so relying on the platform default encoding
    could fail or corrupt non-ASCII text.

    Args:
        data_dir: Input directory. Defaults to the module-level ``DATA_DIR``.
        sample_dir: Output directory, created if missing. Defaults to the
            module-level ``SAMPLE_DATA_DIR``.
        files: Iterable of file names to process. Defaults to the
            module-level ``data_files``.
        sample_num: Maximum number of records kept per file. Defaults to
            the module-level ``SAMPLE_NUM``.

    Raises:
        ValueError: If ``sample_num`` is negative.
        OSError: If an input file cannot be read or an output file cannot
            be written.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record lacks ``patch_analysis`` or its ``language``.
    """
    # Fall back to the module-level configuration for anything not overridden.
    data_dir = DATA_DIR if data_dir is None else data_dir
    sample_dir = SAMPLE_DATA_DIR if sample_dir is None else sample_dir
    files = data_files if files is None else files
    sample_num = SAMPLE_NUM if sample_num is None else sample_num

    if sample_num < 0:
        raise ValueError(f"sample_num must be non-negative, got {sample_num}")

    os.makedirs(sample_dir, exist_ok=True)

    for data_file in files:
        data_path = f"{data_dir}/{data_file}"
        sample_data_path = f"{sample_dir}/{data_file}"

        total_count = 0
        python_count = 0
        sampled_items = []

        # Stream the input: only the records that will be written are kept in
        # memory, while the remaining ones are merely counted for the report.
        with open(data_path, 'r', encoding='utf-8') as f:
            for line in f:
                if not line.strip():
                    continue
                item = json.loads(line)
                total_count += 1
                if item["patch_analysis"]["language"].lower() != "python":
                    continue
                python_count += 1
                if len(sampled_items) < sample_num:
                    sampled_items.append(item)

        print(f"Total items in {data_file}: {total_count}"
              f" | Python items: {python_count}")

        # The output is opened only after the input was parsed completely, so
        # a malformed input never leaves a partially written sample behind.
        with open(sample_data_path, 'w', encoding='utf-8') as f:
            f.writelines(
                json.dumps(item, ensure_ascii=False) + '\n'
                for item in sampled_items
            )

        print(f"Sampled {len(sampled_items)} lines from {data_file} and saved to {sample_data_path}")

# Runs the sampling at module top level: this call executes whenever the file
# is run or imported, since there is no `__main__` guard around it.
sample_data()
