import numpy as np
from itertools import combinations

# 1. Input data from the TeX table
#
# Earlier 9-model version of the table, kept for reference only (it lacks
# Kimina-Prover-72B and was presumably paired with tail_offset = 4):
#   models = [
#       "DP-V1", "DP-V1.5-SFT", "DP-V1.5-RL", "GP-SFT", "GP-DPO",
#       "GP-V2-8B", "GP-V2-32B", "DP-V2-7B", "DP-V2-671B"
#   ]
#   raw_A = np.array([38.3, 54.1, 57.2, 62.9, 59.9, 75.7, 75.7, 87.0, 100.0]) # ZSCT
#   raw_B = np.array([46.1, 48.2, 50.0, 57.6, 60.3, 84.6, 88.1, 75.6,  82.4]) # miniF2F
models = [
    "DeepSeek-Prover-V1", "DeepSeek-Prover-V1.5-SFT", "DeepSeek-Prover-V1.5-RL",
    "Goedel-Prover-SFT", "Goedel-Prover-DPO", "Goedel-Prover-V2-8B",
    "Goedel-Prover-V2-32B", "DeepSeek-Prover-V2-7B", "DeepSeek-Prover-V2-671B",
    "Kimina-Prover-72B",
]
raw_A = np.array([38.3, 54.1, 57.2, 62.9, 59.9, 75.7, 75.7, 87.0, 100.0, 98.9])  # ZSCT
raw_B = np.array([46.1, 48.2, 50.0, 57.6, 60.3, 84.6, 88.1, 75.6,  82.4, 84.0])  # miniF2F

# Number of top-ranked miniF2F models that make up the "hard tail".
tail_offset = 5

# Fail early on inconsistent inputs: the three tables are indexed in parallel,
# and a tail size of 0 would make the `[-tail_offset:]` slice below select
# *every* model, while a tail larger than the table yields no combinations.
if not (len(models) == len(raw_A) == len(raw_B)):
    raise ValueError("models, raw_A and raw_B must have the same length")
if not 0 < tail_offset <= len(models):
    raise ValueError("tail_offset must be between 1 and the number of models")

# Adjust A for chance level (50%): rescale so that 50 maps to 0 and 100 maps
# to 100, clamping below-chance scores to 0.
chance_level = 50.0
adj_A = np.maximum(0, (raw_A - chance_level) / (100 - chance_level) * 100)

# 2. Identify who is in the "Hard Tail" of Benchmark B
# The tail is defined by rank, not by a fixed threshold: it is the
# `tail_offset` models with the highest miniF2F score (listed in ascending
# score order). NOTE: if scores tie exactly at the tail boundary, which of the
# tied models is selected is decided by sort order, not by the data.
tail_indices = np.argsort(raw_B, kind="stable")[-tail_offset:]
print("Models in the Hard Tail of miniF2F:", [models[i] for i in tail_indices])

# 3. Calculate the Observed Capability Metric
# We sum the Adjusted ZSCT scores of the models that made it into the hard tail.
# If the tail requires high ZSCT, this sum will be exceptionally large.
observed_sum = np.sum(adj_A[tail_indices])

# 4. Exact Combinatorics
# Enumerate every way to pick `tail_offset` models out of all models and
# record the Adjusted ZSCT sum of each grouping. With the current table
# (10 models, tail_offset = 5) that is 10 choose 5 = 252 groupings.
all_possible_sums = [
    np.sum(adj_A[list(combo)])
    for combo in combinations(range(len(adj_A)), tail_offset)
]

# Count how many groupings yield a ZSCT sum >= our observed sum.
# The observed grouping is summed in miniF2F-rank order while the enumerated
# groupings are summed in index order, so mathematically equal sums can differ
# in the last floating-point bits. A small tolerance keeps such ties (and the
# observed grouping itself) counted; the scores carry one decimal, so distinct
# sums are many orders of magnitude further apart than the tolerance.
sum_tolerance = 1e-9
greater_or_equal = int(
    np.count_nonzero(np.asarray(all_possible_sums) >= observed_sum - sum_tolerance)
)
# The observed grouping is always one of the enumerated ones, so the smallest
# attainable p-value is 1 / len(all_possible_sums).
p_value = greater_or_equal / len(all_possible_sums)

print("\n--- Tail Dependency Analysis ---")
print(f"Observed ZSCT Sum in Hard Tail: {observed_sum:.2f}")
print(f"Total Possible Combinations:    {len(all_possible_sums)}")
print(f"Combinations >= Observed:       {greater_or_equal}")
print(f"Exact Empirical p-value:        {p_value:.5f}")

if p_value < 0.05:
    print("\nCONCLUSION: Statistically Significant!")
    print("The models that conquer the hard tail of miniF2F possess a cluster of high ZSCT scores")
    print("that is mathematically virtually impossible to achieve by random coincidence.")
