import pdb
import pandas as pd
import numpy as np
import json

# Load data from JSONL file
def load_data_from_jsonl(file_path):
    """Load a JSON Lines file into a DataFrame.

    Each non-blank line of the file is parsed as one JSON document and
    becomes one row. Blank or whitespace-only lines (for example a trailing
    newline at the end of the file) are skipped instead of aborting the load.

    Args:
        file_path: Path of the UTF-8 encoded JSONL file to read.

    Returns:
        A ``pandas.DataFrame`` built from the parsed records, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    records = []
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            line = line.strip()
            # json.loads('') raises, so tolerate empty separator lines.
            if not line:
                continue
            records.append(json.loads(line))
    return pd.DataFrame(records)

# Main function to perform sampling
def sample_data(file_path, n_samples=200, random_state=None):
    """Draw a sample that mirrors the file's 'difficulty' distribution.

    The file is loaded with ``load_data_from_jsonl`` and ``n_samples`` rows
    are drawn without replacement, split across the difficulty levels in
    proportion to how often each level occurs. Rows whose difficulty is
    missing are never sampled and do not count towards the proportions.

    Per-level quotas are computed with the largest-remainder method on exact
    integer arithmetic, so they always add up to exactly ``n_samples`` and a
    quota never exceeds the number of rows available for its level.

    Args:
        file_path: Path of the JSONL file to sample from.
        n_samples: Total number of rows to return.
        random_state: Optional integer seed for reproducible sampling.
            ``None`` (the default) uses pandas' default random source.

    Returns:
        A ``pandas.DataFrame`` with ``n_samples`` rows and a fresh 0..n-1
        index, grouped by difficulty level in descending order of frequency.
        If ``n_samples`` is 0 the frame is empty but keeps the input columns.

    Raises:
        ValueError: If ``n_samples`` is negative, if the data has no
            'difficulty' column, or if ``n_samples`` exceeds the number of
            rows that have a difficulty value.
    """
    if n_samples < 0:
        raise ValueError("n_samples must be non-negative.")

    # Load the data
    df = load_data_from_jsonl(file_path)

    # Check if 'difficulty' column exists
    if 'difficulty' not in df.columns:
        raise ValueError("The DataFrame must contain a 'difficulty' column.")

    # Absolute row counts per level (missing values are excluded),
    # ordered from the most to the least frequent level.
    level_counts = df['difficulty'].value_counts()
    total = int(level_counts.sum())

    if n_samples > total:
        raise ValueError(
            f"Cannot draw {n_samples} samples without replacement from "
            f"{total} rows that have a 'difficulty' value."
        )
    if n_samples == 0:
        return df.iloc[:0].reset_index(drop=True)

    # Largest-remainder allocation: give every level the floor of its exact
    # share, then hand the leftover slots to the levels with the biggest
    # fractional parts. Integer math avoids float rounding surprises.
    scaled = level_counts * n_samples
    allocation = scaled // total
    shortfall = n_samples - int(allocation.sum())
    if shortfall > 0:
        remainders = (scaled % total).to_numpy()
        # Stable sort keeps the frequency order as the tie-breaker.
        bump = np.argsort(-remainders, kind='stable')[:shortfall]
        extra = np.zeros(len(allocation), dtype=int)
        extra[bump] = 1
        allocation = allocation + extra

    # One shared generator so the per-level draws are independent of each
    # other while the overall result stays reproducible for a given seed.
    rng = None if random_state is None else np.random.RandomState(random_state)

    # Sample each level and concatenate once (cheaper than growing a frame).
    parts = [
        df[df['difficulty'] == level].sample(n=int(count), random_state=rng)
        for level, count in allocation.items()
        if count > 0
    ]

    # Reset index for the final sampled DataFrame
    return pd.concat(parts).reset_index(drop=True)

# Specify the path to your JSONL file
file_path = ''

# Run the sampling only when executed as a script, so importing this module
# for its functions has no side effects.
if __name__ == '__main__':
    if not file_path:
        raise SystemExit("Set 'file_path' to the input JSONL file before running.")

    # Perform sampling and print the result
    sampled_df = sample_data(file_path)
    print(sampled_df)

    # Save the sampled data to a new JSONL file
    sampled_df.to_json('sampled_data.jsonl', orient='records', lines=True, force_ascii=False)