import numpy as np
import pandas as pd
import PCAMethods

# A feature column is kept only if it has more than this many non-zero entries.
ZERO_CUTOFF = 100

# Read the tab-separated FPKM table. The first two columns are skipped
# (presumably identifier/annotation columns -- confirm against the file), the
# remaining ones are cast to float, and the result is transposed so that the
# table's rows become the columns of XX.
rs = pd.read_csv("GENDX000068_GeneMatrix_FPKM.txt", sep="\t")
numeric_part = rs.values[:, 2:].astype(float)
XX = np.transpose(numeric_part)

# Remove null features: keep only the columns of XX that have more than
# ZERO_CUTOFF non-zero entries. The non-zero counts of all columns are taken
# in a single vectorized call instead of one Python-level iteration per column.
nonzero_counts = np.count_nonzero(XX, axis=0)
# Column indices that pass the cutoff, in ascending order, as a plain list.
nonzero_genes = np.flatnonzero(nonzero_counts > ZERO_CUTOFF).tolist()
XX = XX[:, nonzero_genes]

# Standardize: shift every column of XX (one gene per column at this point) to
# zero mean and scale it by its standard deviation (NumPy's default, ddof=0),
# handling all columns at once through broadcasting.
gene_means = XX.mean(axis=0)
gene_stds = XX.std(axis=0)
# A column whose standard deviation is exactly zero would evaluate 0/0 and turn
# that gene into NaN; dividing by 1 instead leaves it as an all-zero column.
gene_stds[gene_stds == 0] = 1.0
XX = (XX - gene_means) / gene_stds

# Transpose so that rows index genes and columns index observations.
XX = XX.T
p, n = XX.shape

# Split-half experiment: 20 times, randomly divide the n observations (columns
# of XX) into two halves, take the first column returned by
# PCAMethods.Our_method on each half, and record how much the two disagree.
res = []
half = n // 2
for trial in range(1, 21):
    # Random permutation of the observation indices, then cut it in two.
    order = list(range(n))
    np.random.shuffle(order)
    shuffled = XX[:, order]
    first_half, second_half = shuffled[:, :half], shuffled[:, half:]

    # The generator is consumed left to right, so the first half is processed
    # before the second half.
    vec1, vec2 = (
        PCAMethods.Our_method(part, R=500, P=30, N=30)[:, 0]
        for part in (first_half, second_half)
    )

    # One minus the squared inner product, so the sign of either vector does
    # not matter. NOTE(review): this reads as a squared sine of the angle only
    # if Our_method returns unit-norm columns -- confirm.
    split_error = 1 - np.dot(vec1, vec2) ** 2
    res.append(split_error)

    print("Iterate", trial)
    print("Split-half error:", split_error)

    # Show the four largest absolute entries of each vector, largest first, to
    # check that the vectors are not close to coordinate vectors.
    top1 = sorted(np.abs(vec1))[::-1][:4]
    top2 = sorted(np.abs(vec2))[::-1][:4]
    print("Largest entries of vec1:", top1)
    print("Largest entries of vec2:", top2)

    print("\n")

# Summary over all splits (np.std uses its default ddof=0).
print("Average split-half error:", np.mean(res))
print("St. dev. of split-half errors:", np.std(res))
