import h5py
import scipy.io
import scipy.sparse
import time
import numpy as np
from tqdm import tqdm, trange
import hdf5storage
import os

directory = "graph_datasets/"
datasets = {
    "wiki-topcats": "wiki-topcats.mat",
    "com-LiveJournal": "com-LiveJournal.mat",
    "com-Friendster": "com-Friendster.mat",
}

# Datasets that are symmetrized (A + A^T, clamped to 1) before normalization.
_SYMMETRIZE = ("wiki-topcats",)


def _load_adjacency(path):
    """Read the square sparse matrix stored under ``Problem/A`` in *path*.

    The ``data``, ``ir`` and ``jc`` datasets are read fully into memory and
    used as the CSC value / row-index / column-pointer arrays.  The HDF5 file
    is closed before the matrix is built, so the handle is not held open
    during the (long) preprocessing that follows.

    Returns a float32 ``csc_matrix`` with duplicates summed and sorted indices.
    """
    with h5py.File(path, 'r') as f:
        group = f["Problem"]["A"]
        values = group["data"][:].astype(np.float32)
        row_idx = group["ir"][:].astype(np.uint64)
        col_ptr = group["jc"][:].astype(np.uint64)

    # The matrix is assumed square: its size is taken from the column pointer.
    n = col_ptr.shape[0] - 1
    adjacency = scipy.sparse.csc_matrix(
        (values, row_idx, col_ptr), shape=(n, n), dtype=np.float32
    )
    # Canonicalize (the former LIL round trip did this implicitly).
    adjacency.sum_duplicates()
    return adjacency


def _normalize_in_place(adjacency, inv_degree):
    """Scale CSC *adjacency* in place to ``diag(w) @ A @ diag(w)``.

    *inv_degree* holds one factor per row/column (any shape that flattens to
    length ``n``).  Each stored entry (r, c) is multiplied by ``w[r] * w[c]``,
    so existing edge weights are respected instead of being overwritten; for a
    0/1 adjacency matrix the result is the same as writing ``w[r] * w[c]``.
    """
    w = np.asarray(inv_degree).ravel()
    # Row factor: `indices` holds the row of every stored entry in CSC.
    adjacency.data *= w[adjacency.indices]
    # Column factor: repeat each column's weight once per stored entry.
    adjacency.data *= np.repeat(w, np.diff(adjacency.indptr))


def _save_atomically(save_path, payload):
    """Write *payload* to *save_path* via a temporary file and a rename.

    An interrupted run therefore never leaves a partial file at *save_path*
    that the "already preprocessed" check would later mistake for a finished
    result.
    """
    # Keep the ".mat" suffix so savemat does not append another one.
    tmp_path = os.path.splitext(save_path)[0] + ".tmp.mat"
    if os.path.exists(tmp_path):
        os.remove(tmp_path)  # stale leftover from an interrupted run

    try:
        # NOTE(review): `do_compression` may not be an option hdf5storage
        # accepts - confirm; the plain fallback below is kept from the
        # original script.
        hdf5storage.savemat(tmp_path, payload, do_compression=True)
    except Exception as err:  # not bare: let KeyboardInterrupt/SystemExit through
        print("compressed save failed, retrying with defaults: ", err)
        if os.path.exists(tmp_path):
            os.remove(tmp_path)
        hdf5storage.savemat(tmp_path, payload)

    os.replace(tmp_path, save_path)


for key, filename in datasets.items():

    path = os.path.join(directory, filename)
    save_path = os.path.join(directory, f"{key}_preprocessed.mat")
    if os.path.exists(save_path):
        print("already preprocessed ", key)
        continue

    print("#"*20)
    print("loading ", key)
    t0 = time.time()

    A = _load_adjacency(path)
    print("here")

    if key in _SYMMETRIZE:
        # Symmetrize and clamp entries present in both directions back to 1.
        A = (A + A.T).tocsc()
        np.minimum(A.data, 1.0, out=A.data)

    # Add self-loops.  Done directly on the compressed matrix (scipy may emit
    # a SparseEfficiencyWarning) to avoid materializing a LIL matrix, which
    # stores every nonzero as a Python object.
    A.setdiag(1.0)
    A.sort_indices()
    print("after")

    # Row sums; every row has at least the unit diagonal entry.
    W = A.sum(axis=1)
    W_inv = 1.0 / W

    # preprocess A to get K = W_inv * A * W_inv in place
    _normalize_in_place(A, W_inv)

    print("saving to ", save_path)
    _save_atomically(
        save_path,
        {
            "data": A.data,
            "indices": A.indices,
            "indptr": A.indptr,
            "W": W,
            "W_inv": W_inv,
        },
    )
    print(f"finished {key} in {time.time() - t0:.1f}s")





