import numpy as np
import pandas as pd
import scipy.sparse as sp
from math import exp, log, sqrt
import cupy as cp
import accelib1 as accelib
import time

# Wall-clock reference used by the elapsed-time print further down.
start = time.time()

# Load the training split and the complete rating table (YELP2018).
# The test split is currently not used:
#   df_test = pd.read_csv("./data/ML20M/ratings_processed_test.csv")
#   arr2 = df_test.to_numpy()
df_whole = pd.read_csv("./data/YELP2018/ratings_processed.csv")
df_train = pd.read_csv("./data/YELP2018/ratings_processed_train.csv")

arr1 = df_train.to_numpy()
arr3 = df_whole.to_numpy()

# Number of rating records in each table.
num1, num3 = arr1.shape[0], arr3.shape[0]

# Each table is read as (row index, column index, value) triplets taken from
# columns 0, 1 and 2 -- presumably (user, item, rating); confirm against the
# preprocessing script.
train_rows = np.squeeze(arr1[:, 0].astype(int))
train_cols = np.squeeze(arr1[:, 1].astype(int))
train_vals = np.squeeze(arr1[:, 2])

whole_rows = np.squeeze(arr3[:, 0].astype(int))
whole_cols = np.squeeze(arr3[:, 1].astype(int))
whole_vals = np.squeeze(arr3[:, 2])

# Explicit shape of the training matrix, per dataset:
#   ML20M1   : (117718, 26744)
#   NETFLIX  : (408160, 17770)
#   YELP2018 : (769365, 40000)
#   MSD      : (865284, 40000)
train_shape = (769365, 40000)

# Dense float32 training matrix.
s1 = sp.csr_matrix(
    (train_vals, (train_rows, train_cols)),
    shape = train_shape,
    dtype = "float32",
).todense()

# Dense float32 matrix of the complete table; no shape is passed, so it is
# inferred from the largest indices present in the data.
s3 = sp.csr_matrix(
    (whole_vals, (whole_rows, whole_cols)),
    dtype = "float32",
).todense()

#s1 = s1[:, :2000]
#s3 = s3[:, :2000]

#print(s3.dtype)

# Load the trained weight matrix (YELP2018 model).
# Alternative checkpoint: "./model/ease_train.npy"
W = np.load("./model/ease_train_YELP2018.npy")

# Number of rows of the training matrix.
m = s1.shape[0]

# Standard deviation and its square (variance); 0.01 was also tried for sigma.
sigma = 0.001
sigma2 = sigma ** 2

# Identity matrix with as many rows/columns as W has rows.
I = np.identity(W.shape[0])

# NOTE: the triple-quoted string below is a bare expression statement, so none
# of the code inside it runs. It holds a disabled experiment that compared
# mean-corrected second-moment matrices of s1 and s3 (spectral norm `tau`).
'''
mu = np.sum(s1, axis = 0) / m
s1_mu = s1 - mu
Cov = accelib.gpu_block_matmul(s1_mu) / m
#print(Cov)
#print(mu.T @ mu)
Cov += mu.T @ mu
#print(Cov)

mu1 = np.sum(s3, axis = 0) / s3.shape[0]
s3_mu = s3 - mu1
Cov1 = accelib.gpu_block_matmul(s3_mu) / s3.shape[0]
Cov1 += mu1.T @ mu1

Cov_diff = Cov1 - Cov
tau = np.linalg.norm(Cov_diff, ord = 2)
print(tau)
norm_IW = np.linalg.norm(I - W)
#L, V = np.linalg.eig(Cov_diff)
#print(L, S)
#print(S.shape)
#print(S @ np.diag(L) @ S.T)


#Q = I - W - W.T + accelib.gpu_block_matmul(W.T)
'''

# Matrices derived from the training data (s1) and the complete data (s3),
# each divided by its own row count.
# NOTE(review): accelib.gpu_block_matmul(X) presumably returns X.T @ X --
# confirm against accelib.
# M and T are float64. To get float32 instead, divide by float(m) and
# float(s3.shape[0]). The precision of T affects the eigendecomposition:
# Phi was observed as 1213 with float64 versus 1281 with float32.
M = accelib.gpu_block_matmul(s1) / m
T = accelib.gpu_block_matmul(s3) / s3.shape[0]

# Default CuPy device memory pool; used later only to report usage and to
# release cached blocks.
mempool = cp.get_default_memory_pool()

# Eigendecomposition of T on the GPU (eigenvalues L_gpu, eigenvectors S_gpu).
T_gpu = cp.asarray(T)
L_gpu, S_gpu = cp.linalg.eigh(T_gpu)
del T_gpu

I_gpu = cp.asarray(I)
W_gpu = cp.asarray(W)

# Clamp negative eigenvalues to zero with a single masked assignment instead
# of testing every element from Python, which forces one device-to-host
# synchronisation per eigenvalue.
L_gpu[L_gpu < 0] = 0

# sum_B[i] is the squared Euclidean norm of row i of S.T @ (I - W).
B_gpu = S_gpu.T @ (I_gpu - W_gpu)
B_gpu = cp.multiply(B_gpu, B_gpu)
sum_B_gpu = cp.sum(B_gpu, axis = 1)

# Keep only the two small vectors on the host, then drop every GPU array.
L = cp.asnumpy(L_gpu)
sum_B = cp.asnumpy(sum_B_gpu)
del B_gpu
del I_gpu
del W_gpu
del S_gpu
del L_gpu
del sum_B_gpu

# Seconds elapsed since the script started.
print(time.time() - start)

# Per-column quantity of s1 used in the variance term of the main loop.
# NOTE(review): presumably the squared column norms of s1 -- confirm against
# accelib.gpu_block_colnorm.
s1_norm = accelib.gpu_block_colnorm(s1)

# Evaluate the bound for lambda = 1, 2, 4, ..., 2**9 and keep every result.
res = []
lmd = 1

# Loop-invariant values, computed once.
dim = W.shape[0]
# Row-chunk size used to stream s1 through the GPU in about 24 pieces.
partition = int(s1.shape[0] / 24 + 1)
pinned_mempool = cp.get_default_pinned_memory_pool()

for t in range(10):

    # Calculate Variance
    s = 1 / ( 2 * lmd / m * s1_norm + 1 / sigma2 )
    s *= (dim - 1)     # Revision (undone again before the KL term)

    # Calculate Expectation
    tp = 1 / (2 * lmd * sigma2)
    W1 = tp * W + M
    M1 = M + tp * I

    inv = accelib.gpu_block_inv(M1)
    inv_gpu = cp.asarray(inv)
    print(mempool.used_bytes())
    W1_gpu = cp.asarray(W1)
    U_gpu = inv_gpu @ W1_gpu
    del W1_gpu
    # Scale column j of inv by U[j, j] / inv[j, j] and subtract it from U;
    # this makes the diagonal of U exactly zero.
    Temp_gpu = cp.divide( cp.diag(U_gpu), cp.diag(inv_gpu) )
    inv_gpu = inv_gpu * Temp_gpu
    U_gpu = U_gpu - inv_gpu
    del Temp_gpu
    del inv_gpu
    print(mempool.used_bytes())

    # Q = (I - U) @ (I - U).T
    I_gpu = cp.asarray(I)
    I_U_gpu = I_gpu - U_gpu
    del I_gpu
    Q_gpu = I_U_gpu @ I_U_gpu.T
    del I_U_gpu
    print(mempool.used_bytes())
    U = cp.asnumpy(U_gpu)
    del U_gpu

    print(mempool.used_bytes())

    # Calculate Phi (vectorised over all eigenvalues of T)
    denom = 1 - 2 * lmd * L * sigma2
    phi = np.sum(lmd * L * sum_B / denom - dim / 2 * np.log(denom))   # Revision
    print("Psi: ", phi)

    # Calculate Bound
    # Add s to the diagonal of Q (equivalent to Q + diag(s)).
    s_gpu = cp.asarray(s)
    Q_diag_gpu = cp.diagonal(Q_gpu)
    Q_diag_gpu = Q_diag_gpu + s_gpu
    cp.fill_diagonal(Q_gpu, Q_diag_gpu)
    del s_gpu
    del Q_diag_gpu

    print(mempool.used_bytes())

    # Round-trip Q through host memory so that all cached device and pinned
    # blocks can be released before the eigendecomposition.
    Q = cp.asnumpy(Q_gpu)
    del Q_gpu
    print(Q.shape)
    print(Q.dtype)

    print(mempool.used_bytes())
    mempool.free_all_blocks()
    print(mempool.used_bytes())
    pinned_mempool.free_all_blocks()
    Q_gpu = cp.asarray(Q)

    L1_gpu, S1_gpu = cp.linalg.eigh(Q_gpu)
    del Q_gpu

    # Clamp negative eigenvalues to zero in one masked assignment (avoids a
    # device-to-host synchronisation per element).
    L1_gpu[L1_gpu < 0] = 0

    # Symmetric square root of Q from its eigendecomposition:
    # S1 @ diag(sqrt(L1)) @ S1.T, written with broadcasting.
    L1_h_gpu = cp.sqrt(L1_gpu)
    Q_h_gpu = (S1_gpu * L1_h_gpu) @ S1_gpu.T
    # NaN counts as a sanity check on the decomposition.
    print(cp.asnumpy(cp.sum(cp.isnan(L1_h_gpu))))
    print(cp.asnumpy(cp.sum(cp.isnan(S1_gpu))))
    del L1_gpu
    del S1_gpu
    del L1_h_gpu
    print(cp.asnumpy(cp.sum(cp.isnan(Q_h_gpu))))

    # KL term
    s /= (dim - 1)     # Revision (restores the unscaled variance)
    U_gpu = cp.asarray(U)
    W_gpu = cp.asarray(W)
    U_W_norm2 = cp.asnumpy(cp.linalg.norm(U_gpu - W_gpu) ** 2)
    KL = 0.5 * ( dim ** 2 * ( 2 * log(sigma) - 1) - dim * np.sum(np.log(s) - s / sigma2) + U_W_norm2 / sigma2 )
    print("KL: ", KL)
    del U_gpu
    del W_gpu

    print(mempool.used_bytes())

    # Empirical term: squared Frobenius norm of s1 @ Q_h divided by m,
    # accumulated over row chunks of s1. A dedicated loop variable is used so
    # the global timer `start` is not overwritten.
    sum_norm2 = 0
    for row_start in range(0, s1.shape[0], partition):
        row_end = min(row_start + partition, s1.shape[0])
        s1_part_gpu = cp.asarray(s1[row_start:row_end, :])
        sum_norm2 += cp.asnumpy(cp.linalg.norm(s1_part_gpu @ Q_h_gpu) ** 2) / m
        del s1_part_gpu

    del Q_h_gpu

    print(mempool.used_bytes())

    print("R_emp: ", sum_norm2)

    # NOTE(review): log(10 / 0.01) looks like a confidence term over the 10
    # lambda values with delta = 0.01 -- confirm against the derivation.
    res0 = sum_norm2 + (KL + np.log(10 / 0.01) + phi) / lmd
    res.append(res0)

    print("iter: ", t, "  lambda: ", lmd, "  result: ", res0)

    lmd *= 2


print(res)
print(min(res))


