#!/usr/bin/env python
# coding: utf-8

# In[53]:


import pandas as pd
import numpy as np
import os
import argparse

# In[54]:
# Command-line options for this preprocessing run.
parser = argparse.ArgumentParser(description='PyTorch Neural Collaborative Filtering')
parser.add_argument('--dataset', type=str, default='douban',
                    help='name of the dataset directory under /home/data')
parser.add_argument('--cold_percent', type=int, default=50,
                    help='cold-start split percentage (used here to name the output directory)')
parser.add_argument('--train_percent', type=float, default=0.5,
                    help="fraction of each user's interactions sampled into the training split")
args = parser.parse_args()
# dataset = 'yelp_AZ'
dataset = args.dataset

# Directory holding raw.csv and the pre-computed train_half/test_half splits.
dataset_path = '/home/data/{}'.format(dataset)


# In[55]:


# Raw interactions; mapping() below expects at least 'user' and 'item' columns.
df_filter = pd.read_csv(os.path.join(dataset_path, 'raw.csv'))
cold_percent = args.cold_percent
training_percent = args.train_percent
# In[56]:


def mapping(df_filter):
    """Attach dense integer ``userID`` / ``itemID`` columns to an interaction table.

    Every distinct value of the ``user`` column is numbered 0..n_users-1 and
    every distinct value of the ``item`` column 0..n_items-1, following the
    sorted order produced by ``groupby``. The numbers are joined back onto the
    rows with a left merge, so the input row order is preserved.

    Args:
        df_filter: DataFrame with ``user`` and ``item`` columns. It must not
            already contain ``userID`` / ``itemID`` columns.

    Returns:
        Tuple ``(df, user_map, item_map)`` where ``df`` is the input with the
        two ID columns appended, ``user_map`` has columns ``['user', 'userID']``
        and ``item_map`` has columns ``['item', 'itemID']``.
    """
    def _dense_lookup(frame, raw_col, id_col):
        # The group index lists each distinct raw value once, in sorted order.
        distinct = frame.groupby(raw_col).size().index
        lookup = distinct.to_frame(index=False)
        lookup[id_col] = range(len(lookup))
        return lookup

    user_map = _dense_lookup(df_filter, 'user', 'userID')
    with_users = df_filter.merge(user_map, how='left', on='user')

    item_map = _dense_lookup(with_users, 'item', 'itemID')
    with_items = with_users.merge(item_map, how='left', on='item')

    return with_items, user_map, item_map

# Re-index the raw interactions densely; the lookup tables are not needed here.
df_mapped = mapping(df_filter)[0]


# In[57]:


# Number of distinct users / items in the re-indexed data.
n_users = df_mapped['userID'].nunique()
n_items = df_mapped['itemID'].nunique()

# train_users = np.random.choice(range(n_users), int(n_users*(cold_percent/100)), replace=False)
# train_items = np.random.choice(range(n_items), int(n_items*(cold_percent/100)), replace=False)
#
# train_df = df_mapped[df_mapped['userID'].isin(train_users)]
# train_df = train_df[train_df['itemID'].isin(train_items)]
# train_df.reset_index(drop=True, inplace=True)
#
# test_df = df_mapped[~df_mapped['userID'].isin(train_users)]
# test_df = test_df[~test_df['itemID'].isin(train_items)]
# test_df.reset_index(drop=True, inplace=True)


# In[58]:


# Pre-computed halves of the data, read from the dataset directory.
# NOTE(review): their userID/itemID values are assumed to live in the same ID
# space as df_mapped -- confirm against whatever produced these files.
train_df, test_df = (
    pd.read_csv(os.path.join(dataset_path, split_name))
    for split_name in ('train_half.csv', 'test_half.csv')
)

# "w" = IDs seen in train_half.csv, "c" = IDs seen in test_half.csv
# (presumably warm / cold).
wu, wi = train_df['userID'].unique(), train_df['itemID'].unique()
cu, ci = test_df['userID'].unique(), test_df['itemID'].unique()

# Membership masks over the full interaction table.
user_in_w = df_mapped['userID'].isin(wu)
user_in_c = df_mapped['userID'].isin(cu)
item_in_w = df_mapped['itemID'].isin(wi)
item_in_c = df_mapped['itemID'].isin(ci)

# Four user-group x item-group quadrants, named <user group>_<item group>.
w_c = df_mapped[user_in_w & item_in_c]
c_w = df_mapped[user_in_c & item_in_w]
w_w = df_mapped[user_in_w & item_in_w]
c_c = df_mapped[user_in_c & item_in_c]


# In[65]:


def filter(df, n_core=5):
    """Reduce an interaction table to its ``n_core``-core.

    Duplicate (userID, itemID) pairs are dropped first (keeping the first
    occurrence). Users and then items with fewer than ``n_core`` remaining
    interactions are removed repeatedly until every surviving user and item
    has at least ``n_core`` interactions, or until nothing is left.

    The name shadows the ``filter`` builtin; it is kept because callers in
    this file use it.

    Args:
        df: DataFrame with ``userID``, ``itemID``, ``user`` and ``item``
            columns (the last two are only used for the progress printout).
        n_core: minimum number of interactions per user and per item.

    Returns:
        The filtered DataFrame, keeping the original index labels. It is
        empty when no ``n_core``-core exists.
    """
    df_filter = df.drop_duplicates(keep='first', subset=['userID', 'itemID'])
    print('# users:{}, # items:{}'.format(df_filter['user'].nunique(), df_filter['item'].nunique()))

    while True:
        # Vectorised per-row group sizes instead of a Python lambda per group.
        user_counts = df_filter['userID'].map(df_filter['userID'].value_counts())
        df_filter = df_filter[user_counts >= n_core]
        item_counts = df_filter['itemID'].map(df_filter['itemID'].value_counts())
        df_filter = df_filter[item_counts >= n_core]
        print('# users:{}, # items:{}'.format(df_filter['user'].nunique(), df_filter['item'].nunique()))

        # Nothing survived: stop here. The minimum of an empty count series is
        # NaN, which never compares >= n_core, so the loop would not terminate.
        if df_filter.empty:
            break
        # Items all satisfy n_core right after the item pass; only users can
        # have dropped below the threshold because of the removed items.
        if df_filter['userID'].value_counts().min() >= n_core:
            break

    return df_filter

# Keep only the 5-core (the default n_core) of the w_w and c_c quadrants.
w_w = filter(w_w)
c_c = filter(c_c)

# Keep just the current IDs and expose them under the raw column names, so
# that mapping() can number each quadrant densely from zero again.
ids_as_raw = {'userID': 'user', 'itemID': 'item'}
w_w = w_w[['userID', 'itemID']].reset_index(drop=True).rename(columns=ids_as_raw)
c_c = c_c[['userID', 'itemID']].reset_index(drop=True).rename(columns=ids_as_raw)

w_w_mapped = mapping(w_w)[0]
c_c_mapped = mapping(c_c)[0]

wu = w_w_mapped['userID'].unique()
wi = w_w_mapped['itemID'].unique()
print(c_c_mapped.head())
print(max(wu))
# Shift the c_c IDs past the w_w ID ranges so the two ID spaces do not overlap.
c_c_mapped['userID'] = c_c_mapped['userID'] + len(wu)
c_c_mapped['itemID'] = c_c_mapped['itemID'] + len(wi)
print(c_c_mapped.head())

cu = c_c_mapped['userID'].unique()
ci = c_c_mapped['itemID'].unique()

# Size report: pre-computed splits first, then the full interaction table.
print(len(train_df), len(test_df))
print(len(df_mapped))


# In[74]:


def get_neg_dict(raw, item_list, n_samples=3500):
    """Sample a fixed-size pool of negative items for every user.

    For each user in ``raw`` the candidate negatives are the entries of
    ``item_list`` that the user has no interaction with. ``n_samples`` of them
    are drawn with replacement via ``np.random.choice``.

    Args:
        raw: DataFrame with ``userID`` and ``itemID`` columns.
        item_list: iterable of candidate item IDs.
        n_samples: number of negatives drawn per user (default 3500).

    Returns:
        Dict mapping each userID to a 1-D array of ``n_samples`` item IDs.
        Users appear in order of first occurrence in ``raw``.

    Raises:
        ValueError: if a user has interacted with every item in ``item_list``,
            so there is nothing to sample from.
    """
    candidates = np.unique(np.asarray(list(item_list)))
    neg_dict = {}
    # One grouped pass collects each user's items, instead of scanning the
    # whole frame once per user.
    for user, seen_items in raw.groupby('userID', sort=False)['itemID']:
        neg_pool = np.setdiff1d(candidates, seen_items.to_numpy())
        if neg_pool.size == 0:
            raise ValueError(
                'user {} has interacted with every candidate item; '
                'no negatives to sample'.format(user)
            )
        neg_dict[user] = np.random.choice(neg_pool, n_samples)
    return neg_dict


# In[75]:


# Per-user negative pools, each restricted to the item IDs of its own quadrant.
w_neg_dict = get_neg_dict(w_w_mapped, wi)
c_neg_dict = get_neg_dict(c_c_mapped, ci)


# In[78]:


# Fraction of each user's held-out rows that goes to validation, keyed by the
# training fraction. Any training fraction not listed falls back to 0.25.
val_fraction_by_train = {
    0.5: 0.2,
    0.4: 0.17,
    0.7: 0.34,
    0.8: 0.5,
    0.3: 0.14,
    0.2: 0.13,
}
val_percent = val_fraction_by_train.get(training_percent, 0.25)

num_test = 5

# In[79]:


def sample_neg(df_filter, neg_dict, raw, n_neg=99):
    """Attach sampled negatives to every row of an evaluation frame.

    Three columns are added per row:

    * ``neg_user`` -- a user drawn uniformly from the users of ``raw``;
    * ``neg_item`` -- one item drawn from that random user's negative pool;
    * ``curr_neg`` -- an array of ``n_neg`` items drawn (with replacement)
      from the negative pool of the row's own ``userID``.

    The input frame is left untouched. The result is a new frame with a fresh
    0..n-1 index.

    Args:
        df_filter: DataFrame with a ``userID`` column.
        neg_dict: dict mapping userID to an array of negative item IDs; it
            must contain every user of ``raw`` and of ``df_filter``.
        raw: DataFrame whose ``userID`` column defines the pool of users that
            ``neg_user`` is drawn from.
        n_neg: number of negatives stored in ``curr_neg`` (default 99).

    Returns:
        A copy of ``df_filter`` with the three columns above appended.
    """
    # Loop-invariant: build the user pool once instead of on every row.
    user_pool = raw['userID'].unique()
    # reset_index returns a new frame, so the caller's object (often a slice
    # of a larger frame) is never mutated.
    sampled = df_filter.reset_index(drop=True)

    neg_item = []
    neg_user = []
    curr_neg = []
    for index, user in enumerate(sampled['userID'].to_numpy()):
        curr_user = np.random.choice(user_pool)
        neg_user.append(curr_user)
        neg_item.append(np.random.choice(neg_dict[curr_user]))
        curr_neg.append(np.random.choice(neg_dict[user], n_neg))
        if index % 50000 == 0:
            print("Sampled: ", index)

    sampled['neg_item'] = neg_item
    sampled['neg_user'] = neg_user
    sampled['curr_neg'] = curr_neg
    return sampled


# In[80]:


# w_w_sampled = sample_neg(w_w_mapped, w_neg_dict, w_w_mapped)


# In[81]:


def _sample_labels_per_user(df, fraction):
    """Return index labels of a random ``int(fraction * n)`` rows for each user."""
    labels = []
    for _, user_rows in df.groupby('userID'):
        n_take = int(fraction * len(user_rows))
        labels.extend(user_rows.sample(n=n_take).index)
    return labels


def get_data(df_ui, neg_dict):
    """Split interactions per user into training, validation and test frames.

    For every user, ``int(training_percent * n)`` rows are sampled for
    training. The remaining rows get sampled negatives via ``sample_neg``, and
    of those ``int(val_percent * n_eval)`` rows per user go to validation and
    the rest to test. ``training_percent`` and ``val_percent`` are read from
    module scope.

    The per-user sampling is done explicitly rather than with
    ``groupby.apply``: applying over the grouping column is deprecated in
    pandas 2.2, and the shape of the resulting index is not guaranteed when
    no rows are sampled.

    Args:
        df_ui: DataFrame with a ``userID`` column and unique index labels.
        neg_dict: dict mapping userID to candidate negative items, passed on
            to ``sample_neg``.

    Returns:
        Tuple ``(df_training, df_val, df_test)``. ``df_training`` has a fresh
        0..n-1 index and no negative columns; ``df_val`` and ``df_test``
        carry the ``neg_item``, ``neg_user`` and ``curr_neg`` columns.
    """
    train_labels = _sample_labels_per_user(df_ui, training_percent)
    df_training = df_ui.loc[train_labels].reset_index(drop=True)
    # Build the held-out frame as a new object rather than resetting the index
    # of a slice in place.
    df_eval = df_ui[~df_ui.index.isin(train_labels)].reset_index(drop=True)
    print(len(df_training), len(df_eval), len(df_ui))
    df_eval = sample_neg(df_eval, neg_dict, df_ui)

    val_labels = _sample_labels_per_user(df_eval, val_percent)
    df_val = df_eval.loc[val_labels]
    df_test = df_eval[~df_eval.index.isin(val_labels)]
    # Sanity check: the three parts partition the input rows.
    assert len(df_training) + len(df_val) + len(df_test) == len(df_ui)
    return df_training, df_val, df_test


# In[82]:


# Train / validation / test split of the w_w interactions.
w_w_train, w_w_val, w_w_test = get_data(w_w_mapped, w_neg_dict)

# w_w_train, w_w_val, w_w_test = get_data(w_w_sampled, w_neg_dict)


# In[83]:


output_path = '/home/data/{}_{}_{}_new'.format(dataset, cold_percent, training_percent)
print(output_path)
# exist_ok avoids the check-then-create race of isdir() followed by mkdir().
os.makedirs(output_path, exist_ok=True)


# In[84]:


# Training files hold only the ID pair; the evaluation files also carry the
# sampled negatives.
file_column = ['userID', 'itemID']
w_w_train.to_csv(os.path.join(output_path, 'w_w_train.csv'), index=False, columns=file_column)
w_w_val.to_csv(os.path.join(output_path, 'w_w_val.csv'), index=False, columns=file_column + ['neg_item', 'neg_user', "curr_neg"])
w_w_test.to_csv(os.path.join(output_path, 'w_w_test.csv'), index=False, columns=file_column + ['neg_item', 'neg_user', "curr_neg"])


# In[85]:


# c_c = sample_neg(c_c_mapped, c_neg_dict, c_c_mapped)
# From here on c_c refers to the re-indexed, ID-shifted c_c interactions.
c_c = c_c_mapped


# In[86]:


def get_data_cc(df_ui, neg_dict):
    """Split interactions per user into a training and an evaluation frame.

    For every user, ``int(training_percent * n)`` rows are sampled for
    training (``training_percent`` is read from module scope). All remaining
    rows form the evaluation frame and get sampled negatives via
    ``sample_neg``.

    The per-user sampling is done explicitly rather than with
    ``groupby.apply``: applying over the grouping column is deprecated in
    pandas 2.2, and the shape of the resulting index is not guaranteed when
    no rows are sampled.

    Args:
        df_ui: DataFrame with a ``userID`` column and unique index labels.
        neg_dict: dict mapping userID to candidate negative items, passed on
            to ``sample_neg``.

    Returns:
        Tuple ``(df_training, df_eval)``. ``df_training`` has a fresh 0..n-1
        index; ``df_eval`` carries the ``neg_item``, ``neg_user`` and
        ``curr_neg`` columns.
    """
    train_labels = []
    for _, user_rows in df_ui.groupby('userID'):
        n_train = int(training_percent * len(user_rows))
        train_labels.extend(user_rows.sample(n=n_train).index)

    df_training = df_ui.loc[train_labels].reset_index(drop=True)
    # Build the held-out frame as a new object rather than resetting the index
    # of a slice in place.
    df_eval = df_ui[~df_ui.index.isin(train_labels)].reset_index(drop=True)
    df_eval = sample_neg(df_eval, neg_dict, df_ui)
    print(len(df_training), len(df_eval), len(df_ui))

    return df_training, df_eval


# In[ ]:


# Split the c_c interactions per user into a training part and an evaluation
# part; get_data_cc attaches negatives sampled from c_neg_dict to the
# evaluation rows.
c_c_train, c_c_test = get_data_cc(c_c, c_neg_dict)


# In[ ]:

# In[ ]:


def write_file(df_training, df_val, df_test, output_path):
    """Write train/val/test frames to ``train.csv``, ``val.csv`` and ``test.csv``.

    The training file holds ``userID``, ``itemID`` and ``rating``. The
    ``rating`` column is left out when the module-level ``dataset`` is one of
    'epinions', 'gowalla' or 'yelp_AZ'. The validation and test files also
    carry the ``neg_item``, ``neg_user`` and ``curr_neg`` columns.

    Side effect: the indices of ``df_val`` and ``df_test`` are reset in place.

    Args:
        df_training: training interactions.
        df_val: validation interactions with negative-sample columns.
        df_test: test interactions with negative-sample columns.
        output_path: existing directory the three CSV files are written to.
    """
    datasets_without_rating = ('epinions', 'gowalla', 'yelp_AZ')
    if dataset in datasets_without_rating:
        base_columns = ['userID', 'itemID']
    else:
        base_columns = ['userID', 'itemID', 'rating']
    eval_columns = base_columns + ['neg_item', 'neg_user', "curr_neg"]

    for frame in (df_val, df_test):
        frame.reset_index(drop=True, inplace=True)

    df_training.to_csv(os.path.join(output_path, 'train.csv'), index=False, columns=base_columns)
    for frame, filename in ((df_val, 'val.csv'), (df_test, 'test.csv')):
        frame.to_csv(os.path.join(output_path, filename), index=False, columns=eval_columns)


# In[ ]:



# Write the full c_c table and its train split with the ID columns only; the
# evaluation split also carries the sampled negatives.
eval_file_column = file_column + ['neg_item', 'neg_user', "curr_neg"]
for frame, filename, columns in (
    (c_c, 'c_c.csv', file_column),
    (c_c_train, 'c_c_train.csv', file_column),
    (c_c_test, 'c_c_test.csv', eval_file_column),
):
    frame.to_csv(os.path.join(output_path, filename), index=False, columns=columns)


# # In[ ]:
#
#
# # c_w = sample_neg(c_w, w_neg_dict, c_w)
# # w_c = sample_neg(w_c, c_neg_dict, w_c)
# # c_w.to_csv(os.path.join(output_path, 'c_w.csv'), index=False, columns=file_column + ['neg_item', 'neg_user', "curr_neg"])
# # w_c.to_csv(os.path.join(output_path, 'w_c.csv'), index=False, columns=file_column + ['neg_item', 'neg_user', "curr_neg"])
# def mapping2(df_filter):
#     user_map = df_filter.groupby('user1').count()
#     user_map.insert(0, 'userID', range(len(user_map)))
#     user_map['user1'] = user_map.index
#     user_map = user_map[['user1', 'userID']]
#     user_map.reset_index(drop=True, inplace=True)
#     df_filter = df_filter.merge(user_map, how='left', on='user1')
#
#     item_map = df_filter.groupby('item1').count()
#     item_map.insert(0, 'itemID', range(len(item_map)))
#     item_map['item1'] = item_map.index
#     item_map = item_map[['item1', 'itemID']]
#     item_map.reset_index(drop=True, inplace=True)
#     df_filter = df_filter.merge(item_map, how='left', on='item1')
#
#     return df_filter, user_map, item_map
#
#
# # In[ ]:
#
#
# train_df = pd.read_csv(os.path.join(output_path, 'w_w_train.csv'))
# test_df = pd.read_csv(os.path.join(output_path, 'c_c_train.csv'))
#
#
# # In[ ]:
#
#
# train_filtered, user_map_tr, item_map_tr = mapping2(train_df.rename({'userID': 'user1', 'itemID': 'item1'}, axis='columns'))
#
#
# # In[ ]:
#
#
# import scipy.sparse as sp
# from scipy import sparse
# def get_adj(df_ui):
#     n_users = df_ui.userID.max() + 1
#     n_items = df_ui.itemID.max() + 1
#     rows_ui, cols_ui = df_ui['userID'], df_ui['itemID']
#     UserItemNet = sparse.csr_matrix((np.ones_like(rows_ui),
#                                  (rows_ui, cols_ui)), dtype='float32',
#                                 shape=(n_users, n_items))
#
#     adj_mat = sp.dok_matrix((n_users + n_items, n_users + n_items), dtype=np.float32)
#     adj_mat = adj_mat.tolil()
#     R = UserItemNet.tolil()
#     adj_mat[:n_users, n_users:] = R
#     adj_mat[n_users:, :n_users] = R.T
#     adj_mat = adj_mat.toarray()
#     return adj_mat
#
#
# # In[ ]:
#
#
# train_adj = get_adj(train_filtered)
#
#
# # In[ ]:
#
#
# test_filtered, user_map_te, item_map_te = mapping2(test_df.rename({'userID': 'user1', 'itemID': 'item1'}, axis='columns'))
# test_adj = get_adj(test_filtered)
#
#
# # In[ ]:
#
#
#
# with open(os.path.join(output_path, 'train_half_adj.npy'), 'wb') as f:
#     np.save(f, train_adj)
# with open(os.path.join(output_path, 'test_half_adj.npy'), 'wb') as f:
#     np.save(f, test_adj)
#


# In[ ]:





# In[ ]:





# In[ ]:





# In[ ]:





# In[ ]:




