import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt
from scipy.linalg import eigh, norm
# We evaluate the sharpness of Theorem 2.1 by comparing its bound with the actual error and with the EYM bound (Section 4).
# A = Census (n = 69), A = KDD-Cup (n = 416), and A = Adult (n = 6).
# E = Rademacher Noise.
# ------------------------------------------------------------------
# 1.  Load and preprocess the 1990 US Census dataset
# ------------------------------------------------------------------
def preprocess_dataset(path: str):
    """
    Build the data matrix M and the matrix A = M^T M from a header-less CSV.

    Every cell of the file at `path` is coerced to a number (cells that
    cannot be parsed become NaN). Columns that are NaN throughout are
    dropped and the remaining NaNs are replaced by their column mean.
    The columns are then min-max scaled, the whole matrix is divided by
    its largest row L2 norm (so no row has norm above 1 at that point),
    and finally every column is centered by subtracting its mean.

    Returns the tuple (M, A).
    """
    # Parse the file with no header row; low_memory=False is passed to
    # avoid mixed-type warnings.
    raw = pd.read_csv(path, header=None, low_memory=False)

    # Coerce to numbers, discard columns without any numeric content,
    # then impute whatever is still missing with the column mean.
    numeric = raw.apply(pd.to_numeric, errors='coerce').dropna(axis=1, how='all')
    numeric = numeric.fillna(numeric.mean())

    # Min-max scale every column to [0, 1].
    M = MinMaxScaler().fit_transform(numeric.values)

    # One global rescale by the largest row norm; skipped if that norm is 0
    # so an all-zero matrix does not trigger a division by zero.
    norms = np.linalg.norm(M, axis=1)
    largest = norms.max() if norms.size else 1.0
    if largest > 0:
        M = M / largest

    # Center the columns.
    M = M - M.mean(axis=0, keepdims=True)

    return M, M.T @ M

# ------------------------------------------------------------------
# Run preprocessing
# ------------------------------------------------------------------
# Dataset to analyse -- change this path to switch datasets.
csv_file = "USCensus1990.data.txt"
M, A = preprocess_dataset(csv_file)
# Report the shapes of the data matrix M and of A = M^T M.
print(
    f"Processed M shape: {M.shape}",
    f"Matrix A shape: {A.shape}",
    sep="\n",
)

# ------------------------------------------------------------------
# Eigendecomposition and rank selection
# ------------------------------------------------------------------
n = A.shape[0]

# eigh returns eigenvalues in ascending order; reverse both outputs so that
# lam[0] >= lam[1] >= ... and column i of Q still pairs with lam[i].
lam, Q = eigh(A)
lam, Q = lam[::-1], Q[:, ::-1]

# Compute p as the smallest number of leading eigenvalues whose squares
# account for 99% of sum(lam**2) (the squared Frobenius norm of symmetric A).
lam_sq    = lam**2
prefix_sq = np.cumsum(lam_sq)
tot_sq    = prefix_sq[-1]
idx       = np.searchsorted(prefix_sq, 0.99 * tot_sq)
computed_p = int(idx) + 1
manual_p  = None   # set to integer to override
p = manual_p if manual_p is not None else computed_p

# Reject an out-of-range rank up front: p = 0 would silently read lam[-1]
# (the smallest eigenvalue) as lambda_p, and p > n would only fail further
# down with an unrelated IndexError.
if not 1 <= p <= n:
    raise ValueError(f"rank p must satisfy 1 <= p <= n = {n}, got {p}")
print(f"Chosen rank p = {p} (computed {computed_p})")

# Spectral quantities used by the bounds: the p-th eigenvalue, the
# (p+1)-th eigenvalue (taken as 0 when p = n, since none exists) and the
# gap between them.
lambda_p  = lam[p-1]
lambda_p1 = lam[p] if p < n else 0.0
delta_p   = lambda_p - lambda_p1
print(f"λ_p     = {lambda_p:.6e}")
print(f"δ_p     = {delta_p:.6e}")
if p < n:
    print(f"λ_{{p+1}} = {lambda_p1:.6e}")
else:
    print(f"λ_{{p+1}} = N/A (p=n)")

# Rank-p approximation of A built from its p leading eigenpairs:
# Ap = Q_p diag(lam_1..lam_p) Q_p^T.
Ap = Q[:, :p] @ (lam[:p, None] * Q[:, :p].T)


# ------------------------------------------------------------------
# Monte-Carlo experiment with bounds
# ------------------------------------------------------------------
rng          = np.random.default_rng(42)   # fixed seed for reproducible runs
noise_levels = np.linspace(0.0, 1.0, 20)
n_trials     = 100

# Per-noise-level mean and sample standard deviation of: the actual
# spectral error, the Theorem 2.1 bound, and the EYM bound.
spec_mean, spec_std = [], []
bound_mean, bound_std = [], []
eym_mean, eym_std = [], []

for sigma in noise_levels:
    errs, bnds, eyms = [], [], []
    for _ in range(n_trials):
        # Symmetric Rademacher noise with every entry E_ij = ±sigma.
        # The upper triangle (diagonal included) of a random sign matrix is
        # mirrored below the diagonal. Averaging R with R.T instead would
        # zero out every off-diagonal entry where the two signs disagree,
        # so the entries would no longer be ±sigma.
        R = rng.choice([-1.0, 1.0], size=(n, n))
        E = sigma * (np.triu(R) + np.triu(R, k=1).T)

        # Eigen-decomposition of the noisy matrix A+E. eigh returns the
        # eigenvalues in ascending order, so reversing sorts them descending.
        # NOTE(review): components are ranked by signed eigenvalue, not by
        # magnitude -- confirm this matches the intended definition of (A+E)_p.
        lam_noisy, Q_noisy = eigh(A + E)
        lam_noisy, Q_noisy = lam_noisy[::-1], Q_noisy[:, ::-1]

        # Rank-p approximation of A+E
        Ap_prime = Q_noisy[:, :p] @ (lam_noisy[:p, None] * Q_noisy[:, :p].T)

        # Actual spectral error: ||(A+E)_p - A_p||_2
        errs.append(norm(Ap_prime - Ap, 2))

        # ||E||_2 feeds both bounds; compute it once per trial.
        E_norm = norm(E, 2)

        # Our bound - Theorem 2.1: 7 * ||E|| * lambda_p / delta_p.
        # Reported as infinity when the gap delta_p is zero.
        bnds.append(7.0 * E_norm * lambda_p / delta_p if delta_p != 0 else np.inf)

        # EYM bound: 2 * (||E|| + lambda_{p+1}), where lambda_{p+1} is the
        # (p+1)-th eigenvalue of A (0 when p = n).
        eyms.append(2.0 * (E_norm + lambda_p1))

    errs = np.array(errs)
    bnds = np.array(bnds)
    eyms = np.array(eyms)

    # ddof=1 gives the sample standard deviation over the trials.
    spec_mean.append(errs.mean())
    spec_std.append(errs.std(ddof=1))
    bound_mean.append(bnds.mean())
    bound_std.append(bnds.std(ddof=1))
    eym_mean.append(eyms.mean())
    eym_std.append(eyms.std(ddof=1))

# Convert the per-level statistics to arrays so they can be sliced below.
spec_mean  = np.array(spec_mean)
spec_std   = np.array(spec_std)
bound_mean = np.array(bound_mean)
bound_std  = np.array(bound_std)
eym_mean   = np.array(eym_mean)
eym_std    = np.array(eym_std)

# Plot (skip sigma=0): the first noise level is dropped from every series
# before drawing on the logarithmic y-axis.
xs      = noise_levels[1:]
sm, ss  = spec_mean[1:], spec_std[1:]
bm, bs  = bound_mean[1:], bound_std[1:]
em, es  = eym_mean[1:], eym_std[1:]

plt.figure(figsize=(6,5))
# Mean curves with one-standard-deviation error bars. The legend labels are
# plain text: a lone trailing '$' is not a math delimiter and would be
# drawn literally in the legend.
plt.errorbar(xs, sm, yerr=ss, fmt='o-', capsize=3, label='Actual Error')
plt.errorbar(xs, bm, yerr=bs, fmt='s--', capsize=3, label='Our bound')
plt.errorbar(xs, em, yerr=es, fmt='d-.', capsize=3, label='EYM bound')
plt.yscale('log')
plt.xlabel('Noise level σ')
plt.ylabel('Error  (log scale)')
plt.title('Census- Rademacher Noise')
plt.grid(alpha=0.3)
plt.legend()
plt.tight_layout()
plt.show()