import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import permutation_test

# 1. Populating the completed model pairs from Table 1
# `models`, `raw_A` and `raw_B` are index-aligned: position i in each of them
# describes the same model.
# The earlier nine-model version of this analysis used exactly the first nine
# entries below (everything except "Kimina-Prover-72B").
models = [
    "DeepSeek-Prover-V1", "DeepSeek-Prover-V1.5-SFT", "DeepSeek-Prover-V1.5-RL",
    "Goedel-Prover-SFT", "Goedel-Prover-DPO", "Goedel-Prover-V2-8B",
    "Goedel-Prover-V2-32B", "DeepSeek-Prover-V2-7B", "DeepSeek-Prover-V2-671B",
    "Kimina-Prover-72B",
]
raw_A = np.array([38.3, 54.1, 57.2, 62.9, 59.9, 75.7, 75.7, 87.0, 100.0, 98.9])  # ZSCT
raw_B = np.array([46.1, 48.2, 50.0, 57.6, 60.3, 84.6, 88.1, 75.6, 82.4, 84.0])  # miniF2F

# Fail loudly if an edit to one sequence forgets the others, instead of
# silently pairing a model with another model's scores further down.
if not (len(models) == len(raw_A) == len(raw_B)):
    raise ValueError(
        f"Mismatched data lengths: {len(models)} models, "
        f"{len(raw_A)} ZSCT scores, {len(raw_B)} miniF2F scores"
    )


# 2. Baseline Adjustment
# ZSCT (A) has a chance level of 50: rescale so that 50 maps to 0 and 100
# stays at 100, flooring anything below chance at 0.
adj_A = np.clip((raw_A - 50) / (100 - 50) * 100, 0, None)
# miniF2F (B) has a chance level of 0, so its 0-100 scores are used unchanged
# (maximum observed is ~88.1). adj_B is the same array object as raw_B.
adj_B = raw_B

# 3. Define the Prerequisite Metric (Top-Left Empty Space)
def calculate_empty_space(y_shuffled, x_fixed):
    """Score how empty the top-left (low-x, high-y) region of the scatter is.

    Both inputs are divided by 100 before being compared element-wise. Each
    point whose rescaled ``y_shuffled`` value exceeds its rescaled ``x_fixed``
    value contributes that excess to a total penalty; points with y at or
    below x contribute nothing.

    Returns the negated penalty, so the best possible score is 0 (no point
    above the diagonal) and a more populated top-left region gives a more
    negative score.
    """
    y_fraction = y_shuffled / 100.0
    x_fraction = x_fixed / 100.0
    # Only the part of y that overshoots x counts against the hypothesis.
    overshoot = np.clip(y_fraction - x_fraction, 0, None)
    return -overshoot.sum()

# 4. Permutation Test (randomized, not exact)
# SciPy only enumerates every pairing when the number of distinct pairings
# (N!, i.e. 362,880 for N=9 and 3,628,800 for N=10) does not exceed
# `n_resamples`. With n_resamples=9999 (SciPy's default, spelled out here)
# that is not the case for this data, so the p-value is estimated from random
# pairings rather than computed exactly.
# The generator is seeded so that repeated runs report the same p-value.
# NOTE(review): `random_state` is the keyword this call relies on; newer SciPy
# releases also offer `rng` -- confirm against the installed version.
# Passing n_resamples=np.inf would request the exact test, at the cost of
# evaluating the statistic N! times.
res = permutation_test(
    (adj_B,),
    lambda y: calculate_empty_space(y, adj_A),
    permutation_type='pairings',
    vectorized=False,
    n_resamples=9999,
    alternative='greater',
    random_state=np.random.default_rng(0),
)

# 5. Output Results
# Print the sample size, the statistic for the real (unshuffled) pairing and
# the p-value of the permutation test, followed by a verdict at the 0.05 level.
observed_value = calculate_empty_space(adj_B, adj_A)
p_value = res.pvalue

print(f"--- Bottleneck Hypothesis Test (N = {len(adj_A)}) ---")
print(f"Observed Metric Value: {observed_value:.4f}")
print(f"Empirical p-value:    {p_value:.5f}")

is_significant = p_value < 0.05
if is_significant:
    verdict_lines = (
        "\nCONCLUSION: Statistically Significant. The top-left corner is uniquely empty.",
        "Benchmark A (ZSCT) acts as a structural bottleneck/prerequisite for Benchmark B (miniF2F).",
    )
else:
    verdict_lines = (
        "\nCONCLUSION: Not Significant. The triangular distribution could be due to random chance.",
    )

for verdict_line in verdict_lines:
    print(verdict_line)

# 6. Generate Diagnostic Scatter Plot
plt.figure(figsize=(7, 5))
plt.scatter(adj_A, adj_B, color='darkblue', edgecolors='black', s=60, zorder=3, label='Models')

# Dashed reference line from (0, 0) to (100, 90), shown as the prerequisite boundary
plt.plot([0, 100], [0, 90], 'r--', alpha=0.8, label='Prerequisite Boundary')

# Add labels to points for easy diagnostic tracking; the two long family
# prefixes are abbreviated to keep the annotations compact.
for model_name, x_val, y_val in zip(models, adj_A, adj_B):
    short_name = model_name.replace("DeepSeek-Prover-", "DP-").replace("Goedel-Prover-", "GP-")
    plt.annotate(short_name, (x_val, y_val), textcoords="offset points",
                 xytext=(0, 6), ha='center', fontsize=7, alpha=0.7)

plt.xlim(-5, 105)
plt.ylim(-5, 105)
plt.xlabel('Adjusted Benchmark A: ZSCT Ability Above Chance (%)')
plt.ylabel('Benchmark B: miniF2F Test Accuracy (%)')
# Take N from the data so the title cannot drift from the plotted points
# when models are added or removed.
plt.title(f'Prerequisite Testing Boundary Map (N={len(models)})')
plt.grid(True, linestyle=':', alpha=0.6)
plt.legend(loc='lower right')
plt.tight_layout()
# The figure is written to disk rather than shown interactively.
# bbox_inches='tight' ensures that labels or titles don't get cut off at the edges
plt.savefig('prerequisite_boundary_plot.pdf', format='pdf', bbox_inches='tight')

# Close the figure to free its memory (matters if this code is run in a loop)
plt.close()
print("Plot successfully saved as 'prerequisite_boundary_plot.pdf'")
