import numpy as np
from scipy.linalg import eigh
#Compute the Maximal Allowable Variance Proxy - Subsection 4.1 for A = the 1990 US Census covariance matrix.

#Step 1: Preprocess the 1990 US Census dataset and compute its covariance matrix.
def file_to_matrix(file_path):
    """Read a comma-separated file of integers into a rectangular matrix.

    The first line of the file is treated as a header and discarded. Every
    remaining non-blank line becomes one row: each comma-separated field is
    parsed with ``int``, and a field that is not a valid integer is replaced
    by 0. Rows shorter than the longest row are right-padded with 0 so that
    all rows end up with the same length.

    NOTE: no column is dropped -- the first field of every line is kept as
    the first entry of its row.

    Args:
        file_path: Path to the input file.

    Returns:
        A 2D list (list of equal-length lists of int). The list is empty
        when the file is empty or contains nothing but a header and blank
        lines.
    """
    matrix = []
    with open(file_path, 'r') as file:
        # Skip the header; the default keeps an empty file from raising
        # StopIteration.
        next(file, None)
        for line in file:
            stripped = line.strip()
            if not stripped:
                # ''.split(',') yields [''], which would otherwise turn a
                # blank line into a spurious all-zero row.
                continue
            row = []
            for field in stripped.split(','):
                try:
                    row.append(int(field))
                except ValueError:
                    # Replace non-numeric values with 0
                    row.append(0)
            matrix.append(row)

    # Longest row length; default=0 covers the "no data rows" case.
    max_len = max((len(row) for row in matrix), default=0)

    # Pad shorter rows with 0s to make all rows the same length
    # (extending by an empty list is a no-op for full-length rows).
    for row in matrix:
        row.extend([0] * (max_len - len(row)))

    return matrix

# Example usage:
file_path = 'USCensus1990.data.txt'  # Replace with your file path
matrix = file_to_matrix(file_path)

# Convert the matrix (a list of equal-length integer rows) to a NumPy array
matrix_np = np.array(matrix)

# Only the singular values are needed below, so request them alone.
# compute_uv=False skips building U and Vh altogether; full_matrices=False
# merely shrinks them and still materialises a (rows x columns) U, which is
# wasted time and memory when the data set has many rows.
# np.linalg.svd returns the singular values sorted in descending order, so
# s[-1] is the smallest one.
s = np.linalg.svd(matrix_np, compute_uv=False)

# r = sum_i (s_min / s_i)^2: the smallest squared singular value divided by
# each squared singular value, summed over all of them.
# NOTE(review): this is printed as "Stable rank" but differs from the usual
# stable rank ||M||_F^2 / ||M||_2^2 -- confirm this is the intended quantity.
r = sum(s[-1]**2 / sv**2 for sv in s)

print(f"Stable rank = {r}")



# Calculate the matrix A = M^T * M (uncentered: no column mean is subtracted)
A = np.dot(matrix_np.T, matrix_np)
#Step 2: Compute the low-rank parameter p such that the spectral tail < 0.05.
# Evaluate \Delta^{\max} for each 1 \leq p' \leq p as described in Section 4.1
def analyze_matrix(A):
    """Print the low-rank parameter p and the eigenvalue-gap quantities of A.

    Eigenvalues are labelled λ_1 >= ... >= λ_n, so λ_n is the smallest one.
    Internally they are held in ascending order, i.e. ``eigvals[i]`` is
    λ_{n-i}. Three reports are written to stdout:

    * Task 1: the smallest p in 1..n-1 with λ_n / λ_{n-p} < 0.05. If no such
      p exists, a message is printed and the function returns without
      running Tasks 2 and 3.
    * Task 2: for 1 <= i <= p, the gap ``eigvals[i] - eigvals[i-1]`` between
      consecutive eigenvalues (the i = 0 entry is reported as undefined).
    * Task 3: for 1 <= i <= p, ``min(λ_n, gap_i) / (8 * sqrt(n))`` (the
      i = 0 entry is reported as undefined).

    NOTE(review): the printed labels in Task 2 show the array indices
    (``λ_{i} - λ_{i-1}``) rather than the λ_{n-i} notation, and the i = 0
    line names λ_{n-1} as the missing eigenvalue; the strings are kept
    unchanged here -- confirm the intended wording against Section 4.1.

    Args:
        A: Square matrix, assumed symmetric (it is handed to
            ``scipy.linalg.eigh``); n is taken from ``A.shape[0]``.

    Returns:
        None. All results are printed.
    """
    n = A.shape[0]

    # eigh already returns the eigenvalues in ascending order, so no extra
    # sort is needed: eigvals[0] = λ_n, ..., eigvals[n-1] = λ_1.
    eigvals = eigh(A, eigvals_only=True)
    lambda_n = eigvals[0]

    # Task 1: smallest p such that λ_n / λ_{n-p} < 0.05
    p = next(
        (i for i in range(1, n) if lambda_n / eigvals[i] < 0.05),
        None,
    )

    if p is None:
        print("Task 1: No such p found where λ_n / λ_{n-p} < 0.05")
        return

    print(f"Task 1: Smallest p such that λ_n / λ_(n-p) < 0.05 is p = {p}")

    # gaps[i-1] = eigvals[i] - eigvals[i-1] for 1 <= i <= p; computed once
    # and shared by Tasks 2 and 3.
    gaps = np.diff(eigvals[:p + 1])

    # Task 2: gaps between consecutive eigenvalues
    print("\nTask 2: Gaps δ_{n-i} for 0 ≤ i ≤ p:")
    print("δ_{n-0} = undefined (no λ_{n-1})")
    for i, delta in enumerate(gaps, start=1):
        print(f"δ_{{n-{i}}} = λ_{{{i}}} - λ_{{{i-1}}} = {delta:.4e}")

    # Task 3: min{λ_n, δ_{n-i}} / (8 √n)
    print("\nTask 3: min{λ_n, δ_{n-i}} / (8√n) for 0 ≤ i ≤ p:")
    print("Ratio_{n-0} = undefined (no δ_{n-0})")
    denominator = 8 * np.sqrt(n)  # loop-invariant
    for i, delta in enumerate(gaps, start=1):
        val = min(lambda_n, delta) / denominator
        print(f"Ratio_{{n-{i}}} = min(λ_n, δ_{{n-{i}}}) / (8√n) = {val:.4e}")
# Run the analysis on the module-level matrix A; all results are printed.
analyze_matrix(A)