import os
os.environ["CUDA_VISIBLE_DEVICES"] = ""
# import sys
# sys.path.append("mypath")

import numpy as np
import pandas as pd

data_dir = '/home/Project_LazyGraph/data/'


def read_data_graph(csv_path=None):
    """Load the interaction CSV into a DataFrame with fixed column names.

    The first line of the file is skipped and the columns are named
    ``idx, user_id, item_id, timestamp, label, feature`` regardless of what
    the file's own header says.

    Args:
        csv_path: Path of the CSV file to read.  When omitted, the file
            ``ml_wikipedia.csv`` inside ``data_dir`` is used.

    Returns:
        pandas.DataFrame holding one row per CSV data line.
    """
    if csv_path is None:
        csv_path = os.path.join(data_dir, 'ml_wikipedia.csv')
    names = ['idx', 'user_id', 'item_id', 'timestamp', 'label', 'feature']
    # The separator must be passed by keyword: pandas >= 2.0 accepts only the
    # path positionally and raises TypeError for a positional separator.
    return pd.read_csv(csv_path, sep=',', names=names,
                       engine='python', skiprows=[0])

# Load the interaction table once; this section only inspects and prints it.
data = read_data_graph()

### count ui valid numbers
# Number of distinct user ids and distinct item ids present in the table.
N_U = len(data['user_id'].unique())
N_I = len(data['item_id'].unique())
print(N_U, N_I)

### timestamp to a valid range
### map features
# Map_Type = {"atk":1.0, "clk":0.0}
# data.feature = data.feature.map(Map_Type)
# The feature column is overwritten with a constant 0 for every row.
data['feature'] = 0
print(data)

# Writing the modified table back is currently disabled:
# data.to_csv(os.path.join(data_dir, 'ml_wikipedia.csv'),
#             header=False,
#             index=False)

"""================ ROUND 1 =================="""
from csv import reader
import pickle

# ROUND 1 scans every data row once and records, per item, the sequence of
# users that interacted with it.  Only the id mappings and `history` are
# built here; the per-row output lists are created by the ROUND 2 pass below,
# so they are not accumulated in this pass.
usernames = {}   # raw user id   -> dense user index (first-seen order)
itemnames = {}   # raw item id   -> dense item index (first-seen order)
itemkeys = {}    # dense item index -> raw item id
usercount = 0
itemcount = 0
history = {}     # dense item index -> dense user indices, in file order

# newline='' is the mode the csv module documents for files handed to reader().
with open(os.path.join(data_dir, 'ml_wikipedia.csv'), 'r', newline='') as read_obj:
    csv_reader = reader(read_obj)
    next(csv_reader, None)  # skip the header line (no-op on an empty file)
    for row_count, e in enumerate(csv_reader, start=1):
        if row_count % 10000 == 0:
            # Progress report: running row number and the raw row.
            print(row_count)
            print(e)
        # Column 0 is the row index written by the upstream export; the user
        # and item ids follow it.
        u = int(e[1])
        i = int(e[2])

        if u not in usernames:
            usernames[u] = usercount
            usercount += 1
        uid = usernames[u]

        if i in itemnames:
            history[itemnames[i]].append(uid)
        else:
            # NOTE(review): the interaction that introduces an item is NOT
            # added to its history (the list starts empty).  Kept as-is so the
            # item selection below yields the same result; confirm intent.
            itemnames[i] = itemcount
            itemkeys[itemcount] = i
            history[itemcount] = []
            itemcount += 1

from collections import Counter


def _select_remain_items(history, itemkeys, max_items=1000, min_users=10,
                         recent=3):
    """Pick the raw ids of items whose most frequent user is not a recent one.

    Args:
        history: Mapping from dense item index (0, 1, 2, ...) to the list of
            user indices recorded for that item, in order.
        itemkeys: Mapping from dense item index to raw item id.
        max_items: Only item indices below this value are examined.
        min_users: An item needs strictly more than this many distinct users
            in its history to be considered.
        recent: Number of trailing history entries (positive) in which the
            most frequent user must NOT appear.

    Returns:
        List of raw item ids, in increasing dense-index order.
    """
    selected = []
    # Cap at the number of known items so a dataset with fewer than
    # `max_items` items does not raise KeyError.
    for iid in range(min(max_items, len(history))):
        users = history[iid]
        distinct = set(users)
        if len(distinct) <= min_users:
            continue
        # Count once instead of calling list.count for every distinct user.
        # Taking max over the same set keeps the original tie-breaking.
        counts = Counter(users)
        top_user = max(distinct, key=counts.__getitem__)
        if top_user not in users[-recent:]:
            selected.append(itemkeys[iid])
    return selected


remain_items = _select_remain_items(history, itemkeys)
print(len(remain_items), remain_items)
# exit()
"""================ ROUND 2 =================="""
from csv import reader
import pickle

# ROUND 2 re-reads the CSV, keeps only rows whose raw item id is listed in
# `remain_items`, and re-indexes users and items densely over the kept rows.
clean = []       # not used in this section; kept so the name stays defined
usernames = {}   # raw user id -> dense user index among kept rows
itemnames = {}   # raw item id -> dense item index among kept rows
usercount = 0
itemcount = 0
history = {}     # dense item index -> dense user indices (first one omitted)
u_list, i_list, ts_list, label_list = [], [], [], []
feat_l = []
idx_list = []

# A set gives O(1) membership tests; scanning the list for every row is O(n).
remain_lookup = set(remain_items)

# newline='' is the mode the csv module documents for files handed to reader().
with open(os.path.join(data_dir, 'ml_wikipedia.csv'), 'r', newline='') as read_obj:
    csv_reader = reader(read_obj)
    # row_count is 0 for an empty file, otherwise 1 + number of rows kept so
    # far (the header line accounts for the initial 1).
    row_count = 0 if next(csv_reader, None) is None else 1
    for e in csv_reader:
        # Column 0 is the exported row index; ids and values follow it.
        i = int(e[2])
        if i not in remain_lookup:
            continue
        u = int(e[1])
        # Report progress only for kept rows.  Checking before the filter
        # re-printed on every skipped row while row_count sat on a multiple
        # of 10000.
        if row_count % 10000 == 0:
            print(row_count)
            print(e)
        ts = float(e[3])
        label = float(e[4])  # int(e[4])
        feat = np.array([float(x) for x in e[5:]])

        if u not in usernames:
            usernames[u] = usercount
            usercount += 1
        uid = usernames[u]

        if i in itemnames:
            iid = itemnames[i]
            history[iid].append(uid)
        else:
            iid = itemcount
            itemnames[i] = itemcount
            itemcount += 1
            history[iid] = []

        u_list.append(uid)
        i_list.append(iid)
        ts_list.append(ts)
        label_list.append(label)
        idx_list.append(row_count)
        feat_l.append(feat)
        row_count += 1

# print("WARNING! SAVING FEATURE IS NOT COMPLETED YET. TBI.")
# Only the first feature value of every kept row is exported.  Picking it per
# row (instead of slicing a stacked 2-D array) also works when no row was
# kept or when rows carry different numbers of features.
first_feats = np.array([feat[0] for feat in feat_l], dtype=float)
data_final = pd.DataFrame({'u': u_list,
                           'i': i_list,
                           'ts': ts_list,
                           'label': label_list,
                           'feat': first_feats})

# Summary: number of kept rows, distinct users, distinct items.
print(row_count - 1, usercount, itemcount)
print(data_final)

# Written without header and without the DataFrame index column.
data_final.to_csv(os.path.join(data_dir, 'wikipediasubsample.csv'),
                  header=False,
                  index=False)