import scipy.sparse as spp
import numpy as np
import json
import os
from ExtractFeatures import extract_features, get_review_matrix
from GenerateInfo import generate_info
import pandas as pd

def load_sparse_matrix(source_prefix='../source/ml-25m'):
    """Build a binary user-by-movie matrix from ``movies.csv`` and ``ratings.csv``.

    Both files are read from ``source_prefix`` and their columns are accessed
    by position: in ``movies.csv`` column 0 is the movie id and column 2 the
    ``|``-separated genre string; in ``ratings.csv`` columns 0, 1 and 2 are
    the user id, the movie id and the rating.

    Args:
        source_prefix: Directory that contains the two CSV files.

    Returns:
        A tuple ``(matrix, category_list)``.

        ``matrix`` is a ``scipy.sparse.csr_matrix`` with one row per distinct
        user and one column per distinct rated movie, both numbered in order
        of first appearance in ``ratings.csv``. An entry is 1 when the rating
        is greater than 3 and 0 otherwise (zeros are stored explicitly; if the
        same user/movie pair occurs more than once scipy sums the entries).

        ``category_list[j]`` is the list of whitespace-stripped genres of the
        movie in column ``j``, or ``None`` when the genre cell is missing.

    Raises:
        KeyError: If a movie id in ``ratings.csv`` is absent from
            ``movies.csv``.
    """
    # key: movie id, item: list of genres, or None when the cell is empty
    movies = pd.read_csv(os.path.join(source_prefix, 'movies.csv'))
    categories = {}
    movie_ids = movies.iloc[:, 0].tolist()
    genre_cells = movies.iloc[:, 2].tolist()
    for movie_id, genres in zip(movie_ids, genre_cells):
        if isinstance(genres, str):
            categories[movie_id] = [g.strip() for g in genres.split('|')]
        else:
            # pandas yields NaN (a float) for an empty cell; there is nothing
            # to split, so record the absence instead of crashing.
            categories[movie_id] = None

    ratings = pd.read_csv(os.path.join(source_prefix, 'ratings.csv'))

    # factorize assigns dense 0-based codes in order of first appearance,
    # which is the same numbering a "len(dict) on first sight" loop produces,
    # without iterating over every rating in Python.
    rows, users = pd.factorize(ratings.iloc[:, 0])
    cols, rated_movies = pd.factorize(ratings.iloc[:, 1])

    # binarize: 1 for ratings strictly above 3, else 0 (NaN compares False)
    data = (ratings.iloc[:, 2].astype(float) > 3).to_numpy().astype(int)

    # genres in the order of the movie column index
    category_list = [categories[movie_id] for movie_id in rated_movies.tolist()]

    print(len(data), len(rows), len(cols))
    print(len(users), len(rated_movies), len(category_list))

    # The codes are dense, so this is exactly the shape scipy would infer;
    # stating it makes the row/column meaning explicit.
    shape = (len(users), len(rated_movies))
    return spp.csr_matrix((data, (rows, cols)), shape=shape), category_list


if __name__ == '__main__':
    # Script entry point: load the rating matrix, reduce it with the project
    # helpers, factorize it and write the results under `target_prefix`.
    target_prefix = 'ml_25m'
    # NOTE(review): these three values are passed straight to the helpers
    # below; presumably the number of users / movies to keep and the latent
    # dimension - confirm against ExtractFeatures.
    user_num = 2000
    business_num = 25000
    d = 50

    # Create the output directory from the same variable that is used for
    # saving, so changing `target_prefix` cannot leave the two out of sync;
    # exist_ok avoids the separate existence check.
    os.makedirs(target_prefix, exist_ok=True)

    M, category_list = load_sparse_matrix()
    M, top_business = get_review_matrix(user_num, business_num, M)
    U, V = extract_features(user_num, business_num, d, M)
    generate_info(U, V, user_num, top_business, category_list, target_prefix)
    np.save(os.path.join(target_prefix, 'user_item.npy'), M)
