# Eval.sh saves the evaluation results of the generated test cases in the response file, so they can be retrieved from the metadata stored in that file.

import random
import pandas as pd
import numpy as np


# Pickled evaluation output to analyse.
# NOTE: unpickling can execute arbitrary code; only point this at files that
# come from a trusted source.
response_path = "/path/to/folder/data/lcb_tiny_rl_180steps.pkl"  # Eval Result

response_dataset = pd.read_pickle(response_path)

# The pickle may hold a plain list of records instead of a DataFrame;
# normalise it so rows can be addressed positionally with .iloc below.
if isinstance(response_dataset, list):
    response_dataset = pd.DataFrame(response_dataset)

# Number of random trials
n_trials = 100

# Flag to enable/disable test case validation
validate_test_cases = False

# Lists to store results for each trial (one pass count appended per trial)
plain_results = []
public_results = []
generate_results = []


def get_valid_test_cases(solutions, metadata):
    """
    Returns a set of valid test case indices that are passed by at least
    half of the solutions that pass public test cases.

    Args:
        solutions: Indices into ``metadata`` of the solutions to poll.
        metadata: Per-solution records; the test outcomes of solution ``k``
            are read from ``metadata[k]['results'][0][0]``.

    Returns:
        The set of test case positions ``t`` for which the number of polled
        solutions with ``results[t] == True`` is at least
        ``len(solutions) / 2``. Empty when ``solutions`` is empty.
    """
    # Fetch each solution's result list once. Indices past the end of
    # ``metadata`` are skipped here too (previously only the counting loop
    # guarded against them, so sizing the test range raised IndexError).
    per_solution_results = [
        metadata[sol_idx]['results'][0][0]
        for sol_idx in solutions
        if sol_idx < len(metadata)
    ]

    # Solutions may report different numbers of test cases; cover the longest.
    max_test_cases = max(
        (len(results) for results in per_solution_results), default=0)

    # Skipped indices still count toward the denominator, so the threshold
    # stays relative to the number of solutions that were requested.
    threshold = len(solutions) / 2

    valid_tests = set()
    for test_idx in range(max_test_cases):
        # Compare with ``== True`` rather than truthiness so that truthy
        # entries that do not equal True are not counted as a pass.
        passing_count = sum(
            1 for results in per_solution_results
            if test_idx < len(results) and results[test_idx] == True
        )

        # Valid if at least half of solutions pass it
        if passing_count >= threshold:
            valid_tests.add(test_idx)

    return valid_tests


# Pre-compute the public_case_true_idx and max_idx_list for each problem
problem_indices = []
# Counts problems whose top-scoring set differs from the public-passing set.
vaild_filter = 0
num_problems = len(response_dataset)
for i in range(num_problems):
    response_problem = response_dataset.iloc[i]
    # Pull the per-solution fields out of the row once instead of
    # re-indexing the row inside the loops below.
    problem_metadata = response_problem['metadata']
    private_scores = response_problem['private_scores']
    public_scores = response_problem['public_scores']

    # Find solutions that pass public and private test cases.
    # ``== True`` is kept deliberately: truthy values that do not equal True
    # must not be treated as a pass.
    solution_ids = range(len(problem_metadata))
    private_case_true_idx = [
        j for j in solution_ids if private_scores[j] == True]
    public_case_true_idx = [
        j for j in solution_ids if public_scores[j] == True]

    # Pre-compute valid test cases once for this problem
    valid_test_cases = None
    if validate_test_cases and len(public_case_true_idx) > 1:
        valid_test_cases = get_valid_test_cases(
            public_case_true_idx, problem_metadata)

    # Score each solution that passes public tests and keep every solution
    # tied for the best score. max_score starts below any possible count so
    # the first candidate always seeds the list.
    max_idx_list = []
    max_score = -1
    for j in public_case_true_idx:
        all_results = problem_metadata[j]['results'][0][0]

        # Calculate generate_score based on validation setting
        if validate_test_cases and valid_test_cases is not None:
            # Count only valid test cases
            generate_score = sum(
                1 for test_idx, result in enumerate(all_results)
                if result == True and test_idx in valid_test_cases)
        else:
            # Original logic: count all passing test cases
            generate_score = sum(1 for result in all_results if result == True)

        if generate_score > max_score:
            max_score = generate_score
            max_idx_list = [j]
        elif generate_score == max_score:
            max_idx_list.append(j)

    # max_idx_list only ever holds public-passing indices, so the two sets
    # differ exactly when scoring ruled out some public-passing solution.
    top_scoring = set(max_idx_list)
    if set(public_case_true_idx) != top_scoring:
        print(f"i: {i}, public_case_true_idx: {public_case_true_idx}, private_case_true_idx: {private_case_true_idx}, max_idx_list: {max_idx_list}")
        # Report privately-correct solutions that did not make the top set
        private_not_in_max = [
            idx for idx in private_case_true_idx if idx not in top_scoring]
        if private_not_in_max:
            print(
                f"  private_case_true_idx elements not in max_idx_list: {private_not_in_max}")
        vaild_filter += 1

    problem_indices.append({
        'public_case_true_idx': public_case_true_idx,
        'private_case_true_idx': private_case_true_idx,
        'max_idx_list': max_idx_list
    })

# Guard the ratio so an empty dataset reports 0.0 instead of raising
# ZeroDivisionError.
filter_rate = vaild_filter / num_problems if num_problems else 0.0
print(
    f"vaild_filter: {vaild_filter}, vaild_filter/len(response_dataset): {filter_rate}")

# Run multiple trials
# The per-problem inputs do not change between trials, so read them out of
# the DataFrame once rather than once per (trial, problem) pair.
trial_inputs = [
    (response_dataset.iloc[i]['private_scores'], problem_indices[i])
    for i in range(len(response_dataset))
]

for trial in range(n_trials):
    passn_plain = 0
    passn_public = 0
    passn_generate = 0

    for private_scores, indices in trial_inputs:
        public_case_true_idx = indices['public_case_true_idx']
        max_idx_list = indices['max_idx_list']

        # Plain random selection: draw uniformly over however many solutions
        # this problem actually has. This replaces a hard-coded
        # randint(0, 15), which raised IndexError with fewer than 16
        # solutions and never picked any solution past the 16th.
        num_solutions = len(private_scores)
        if num_solutions > 0:
            if private_scores[random.randrange(num_solutions)] == True:
                passn_plain += 1

        # Public test case based selection
        if len(public_case_true_idx) >= 1:
            selected_idx = random.choice(public_case_true_idx)
            if private_scores[selected_idx] == True:
                passn_public += 1

        # Generate score based selection
        if len(max_idx_list) >= 1:
            selected_idx = random.choice(max_idx_list)
            if private_scores[selected_idx] == True:
                passn_generate += 1

    # Store results for this trial (number of problems solved per strategy)
    plain_results.append(passn_plain)
    public_results.append(passn_public)
    generate_results.append(passn_generate)

# Calculate statistics
def _trial_stats(counts, total):
    """Summarise per-trial pass counts as min/max/mean, raw and divided by ``total``."""
    lowest = min(counts)
    highest = max(counts)
    average = np.mean(counts)
    return {
        'min': lowest,
        'max': highest,
        'mean': average,
        'rate_min': lowest / total,
        'rate_max': highest / total,
        'rate_mean': average / total,
    }


# Every strategy is normalised by the same number of problems.
dataset_size = len(response_dataset)

plain_stats = _trial_stats(plain_results, dataset_size)
public_stats = _trial_stats(public_results, dataset_size)
generate_stats = _trial_stats(generate_results, dataset_size)

# Report the run configuration followed by one summary line per strategy.
print(f"n_trials: {n_trials}, validate_test_cases: {validate_test_cases}")
print(f"Plain random selection - min: {plain_stats['min']}, max: {plain_stats['max']}, mean: {plain_stats['mean']:.2f}, rate min: {plain_stats['rate_min']:.4f}, rate max: {plain_stats['rate_max']:.4f}, rate mean: {plain_stats['rate_mean']:.4f}")
print(f"Public test selection - min: {public_stats['min']}, max: {public_stats['max']}, mean: {public_stats['mean']:.2f}, rate min: {public_stats['rate_min']:.4f}, rate max: {public_stats['rate_max']:.4f}, rate mean: {public_stats['rate_mean']:.4f}")
print(f"Generate score selection - min: {generate_stats['min']}, max: {generate_stats['max']}, mean: {generate_stats['mean']:.2f}, rate min: {generate_stats['rate_min']:.4f}, rate max: {generate_stats['rate_max']:.4f}, rate mean: {generate_stats['rate_mean']:.4f}")
