import numpy as np
import matplotlib.pyplot as plt
from scipy.linalg import svd, norm
# We perform Monte Carlo experiments on real-world covariance matrices (US Census and KDD-Cup), each truncated to rank p = 5 and perturbed by symmetrized Gaussian noise.
#
# 0. Preprocess
def file_to_matrix(path: str) -> np.ndarray:
    """Read a comma-separated file of integers into a zero-padded 2D float array.

    The first line is treated as a header and discarded.  Every remaining
    non-blank line becomes one row: each comma-separated token is parsed with
    ``int`` and any token that does not parse (empty field, text, a decimal
    such as ``"1.5"``) is stored as 0.  Rows shorter than the longest row are
    right-padded with zeros so the result is rectangular.

    Blank or whitespace-only lines are skipped; they carry no data and would
    otherwise turn into all-zero rows that inflate the row count.

    Args:
        path: Location of the text file to read.

    Returns:
        A ``float64`` array of shape ``(number_of_data_rows, longest_row_length)``.

    Raises:
        ValueError: If the file contains no data rows after the header.
    """
    rows = []
    with open(path, "r") as fh:
        # Drop the header line; the default keeps an empty file from raising
        # StopIteration here (it is reported as "no data rows" below instead).
        next(fh, None)
        for line in fh:
            stripped = line.strip()
            if not stripped:
                continue
            row = []
            for tok in stripped.split(","):
                try:
                    row.append(int(tok))
                except ValueError:
                    # Unparseable field: keep the column position, store 0.
                    row.append(0)
            rows.append(row)

    if not rows:
        raise ValueError(f"no data rows found in {path!r}")

    max_len = max(len(r) for r in rows)
    for r in rows:
        r.extend([0] * (max_len - len(r)))
    return np.asarray(rows, dtype=np.float64)

# ------------------------------------------------------------------
# 1.  Load the data and form the square matrix A = MᵀM / (#rows of M)
#     (original note: 68 × 68 for the census file — not checked here)
# ------------------------------------------------------------------
# The columns of M are used as-is (no mean is subtracted), so A is the
# uncentred second-moment matrix of the data.
M = file_to_matrix("USCensus1990.data.txt")  # point this at another data file to switch datasets
A = (M.T @ M) / len(M)   # len(M) is the number of data rows
n = len(A)               # side length of the square matrix A

# ------------------------------------------------------------------
# 2.  Experiment settings and the unperturbed rank-p reference
# ------------------------------------------------------------------
p = 5                                    # rank kept by the truncation
num_trials = 20                          # random draws per noise level
noise_levels = np.linspace(0, 2.0, 20)   # 20 evenly spaced sigma values in [0, 2]

# Keep the p leading singular values of A together with their left singular
# vectors.  The right singular vectors are discarded and U is reused on both
# sides of the product.
U, S, _ = svd(A, full_matrices=False)
A_p = (U[:, :p] * S[:p]) @ U[:, :p].T

# Spectral-norm error of the reference truncation, printed for the record.
base_residual = norm(A - A_p, ord=2)
print(f"‖A - A_p‖₂ = {base_residual:.6e}")

# ------------------------------------------------------------------
# 3.  Monte-Carlo loop
# ------------------------------------------------------------------
# For every noise level sigma, draw num_trials symmetric Gaussian
# perturbations E, truncate A_tilde = A + E to rank p, and record
#   * ‖A_tilde_p - A_p‖ in spectral norm,
#   * ‖A_tilde_p - A_p‖ in Frobenius norm,
#   * | ‖A - A_p‖₂ - ‖A - A_tilde_p‖₂ |  (change in approximation error).
# The per-level mean and standard deviation over the trials are kept.
spec_m, fro_m, diff_m = [], [], []
spec_sd, fro_sd, diff_sd = [], [], []

for sigma in noise_levels:
    trial_spec, trial_fro, trial_delta = [], [], []

    for _ in range(num_trials):
        # i.i.d. N(0, sigma²) entries, then symmetrised; averaging with the
        # transpose leaves the diagonal at std sigma and brings the
        # off-diagonal entries down to std sigma/√2.
        E       = np.random.normal(0, sigma, size=(n, n))
        E       = 0.5 * (E + E.T)
        A_tilde = A + E

        # A_tilde is symmetric but not necessarily positive semidefinite.
        # For a negative eigenvalue the left and right singular vectors
        # differ in sign, so the truncation must use the right singular
        # vectors Vh_t; reusing U_t on both sides would flip the sign of
        # that component and no longer give the best rank-p approximation.
        U_t, S_t, Vh_t = svd(A_tilde, full_matrices=False)
        A_tilde_p      = (U_t[:, :p] * S_t[:p]) @ Vh_t[:p, :]

        trial_spec.append(norm(A_tilde_p - A_p, 2))      # spectral norm
        trial_fro.append(norm(A_tilde_p - A_p, 'fro'))   # Frobenius norm

        pert_residual = norm(A - A_tilde_p, 2)
        trial_delta.append(abs(base_residual - pert_residual))  # change in error

    # Mean over the trials at this noise level.
    spec_m.append(np.mean(trial_spec))
    fro_m.append(np.mean(trial_fro))
    diff_m.append(np.mean(trial_delta))

    # Spread over the trials (np.std default: population std, ddof=0).
    spec_sd.append(np.std(trial_spec))
    fro_sd.append(np.std(trial_fro))
    diff_sd.append(np.std(trial_delta))

# Convert to arrays for plotting.
spec_m   = np.array(spec_m)
fro_m    = np.array(fro_m)
diff_m   = np.array(diff_m)
spec_sd  = np.array(spec_sd)
fro_sd   = np.array(fro_sd)
diff_sd  = np.array(diff_sd)

# ------------------------------------------------------------------
# 4.  Plot
# ------------------------------------------------------------------
# Mean error versus noise level for the three measures; the error bars show
# one standard deviation over the trials.
plt.figure(figsize=(6, 5))
plt.errorbar(noise_levels, spec_m,  yerr=spec_sd,  fmt='o-', lw=2, ms=5,
             capsize=3, label=r'$\|\tilde A_p - A_p\|_2$')
plt.errorbar(noise_levels, fro_m,   yerr=fro_sd,   fmt='s--', lw=2, ms=5,
             capsize=3, label=r'$\|\tilde A_p - A_p\|_F$')
plt.errorbar(noise_levels, diff_m,  yerr=diff_sd,  fmt='d-.', lw=2, ms=5,
             capsize=3, label=r'$|\|A-A_p\| - \|A-\tilde A_p\||$')

# Raw string: "\s" is not a valid escape sequence in a normal string literal
# and triggers a SyntaxWarning on recent Python versions.
plt.xlabel(r"Noise level $\sigma$")
plt.ylabel("Error")
plt.legend()   # the label= arguments above are only displayed once a legend is drawn
plt.grid(True)
plt.tight_layout()
plt.show()
