# -*- coding: utf-8 -*-
"""5.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1RVcvuSEmKICm5gPlp3h60-iSmLJx8Pn_
"""

import math
import io
import os
import time
import numba
import random
import numpy as np
import pandas as pd
from numba import njit
from random import sample
import multiprocessing as mp
from multiprocessing import Pool, freeze_support
from concurrent.futures import ThreadPoolExecutor
from multiprocessing import Pool, cpu_count

# Seed both random sources used by this script (NumPy's global generator and
# the stdlib `random` module) so that runs are repeatable.
np.random.seed(100)
random.seed(113)

'''Local data'''

# Read the training ratings; the columns used below are userid, movieid, rating.
df = pd.read_csv("ml1m_train.csv")
df.reset_index(drop=True, inplace=True)

# Latent dimensions
K = 20

# Pivot into a dense user x item rating matrix; 0 marks "not rated".
df = df.pivot(index='userid', columns='movieid', values='rating').fillna(0)
R = np.array(df)

# Client ids, taken from the pivot's own index so that clients[i] is
# guaranteed to be the user whose ratings are in R[i]. (Building the list
# through set() left the order up to the set's iteration order.)
clients = np.array(df.index)

# Store user and item count
N, M = df.shape
print("No. of users: ",N)
print("No. of items: ",M)

'''Server Data'''

# The server holds a random quarter of the rows of the user x item matrix
# (drawn with pandas' sampler, no explicit random_state).
server_df = df.sample(frac = .25)

# User ids of the sampled rows, in the same order as the rows themselves.
users = np.array(server_df.index)

# Record the dimensions, then keep only the raw rating values.
N_server, M_server = server_df.shape
server_df = np.array(server_df)
print("No. of users at server: ",N_server)
print("No. of items at server: ",M_server)

'''For sampling'''

# tau_f is the percentage of clients used per communication round;
# tau is the corresponding number of clients, truncated to an integer.
tau_f = 35
tau = int(N * tau_f / 100)
print("No. of users permitted for 1 communication round: ",tau)

'''Demographic split'''

# pd.read_csv already consumes the header line, so every remaining row is
# data. Promoting the first data row to a header (as was done before)
# discarded that row's user id, so the ids are now read straight from the
# 'userid' column.

#under 18
u_df = pd.read_csv('ml1m_train_under18.csv')
u_list = u_df['userid'].unique().tolist()

#above 18
a_df = pd.read_csv('ml1m_train_above18.csv')
a_list = a_df['userid'].unique().tolist()


#store list of the count
u_count = len(u_list)
print("Under 18: ",u_count)
a_count = len(a_list)
print("Above 18: ",a_count)

# Indicator per candidate id 1..N: 0 if the id is in the under-18 list,
# 1 otherwise. A set makes each membership test O(1).
# NOTE(review): this assumes user ids run from 1 to N -- confirm against the data.
under18_ids = set(u_list)
l = [0 if uid in under18_ids else 1 for uid in range(1, N + 1)]

'''Demographic stats at 3 levels'''

# Sets give O(1) membership tests; scanning the id lists for every user
# made each of the loops below quadratic.
under18_ids = set(u_list)
above18_ids = set(a_list)

#users in training dataset
print("#(Users) under 18: ",u_count)
print("#(Users) above 18: ",a_count)

#Local Clients
print("\n")
client_u18 = sum(1 for x in clients if x in under18_ids)
client_a18 = sum(1 for x in clients if x in above18_ids)
print("#(Local Clients) under 18: ",client_u18)
print("#(Local Clients) above 18: ",client_a18)

#Users at server
print("\n")
server_u18 = sum(1 for x in users if x in under18_ids)
server_a18 = sum(1 for x in users if x in above18_ids)
print("#(Users @ Server) under 18: ",server_u18)
print("#(Users @ Server) above 18: ",server_a18)

#Indicator array for users at server, indexed by *user id* (not row number):
#1 if the server user is not in the under-18 list, 0 if the user is under 18
#or the id is not held by the server at all.
l_server = np.zeros(int(users.max()) + 1, dtype=int)
for x in users:
    if x not in under18_ids:
        l_server[x] = 1

# Per-user latent vectors keyed by user id; written by ClientBatch and
# refreshed by the training loop after every round.
U = {}

eta   = 5      # weight of the (V - V_fair) term in ClientBatch's item gradient
gamma = 0.5    # initial step size; the training loop multiplies it by 0.9 each round
lamda = 0.01   # L2 regularisation coefficient used in ClientBatch's gradients



def ClientFilling(Q,Uu,u,t):
    """Return the virtual rating for a user: the mean of their non-zero ratings.

    Args:
        Q: unused; kept so existing callers keep working.
        Uu: unused; kept so existing callers keep working.
        u: iterable of the user's ratings, where 0 means "not rated".
        t: unused; kept so existing callers keep working.

    Returns:
        The arithmetic mean of the non-zero entries of ``u``, or 0.0 when
        ``u`` has no non-zero entry (this case used to raise
        ZeroDivisionError).
    """
    total = 0.0
    n_rated = 0
    for rating in u:
        if rating != 0:
            total += rating
            n_rated += 1

    # No real rating to average: fall back to the "not rated" value.
    if n_rated == 0:
        return 0.0
    return total / n_rated

'''ClientBatch'''

def ClientBatch(V, V_fair, u, t, Uu, i, i_d,gamma):
    """Run one local update for a single client and return its gradients.

    The client augments its real ratings with up to three "virtual" ratings
    (randomly drawn unrated items, each given the value returned by
    ``ClientFilling``), takes one gradient step on its own latent vector and
    then computes a gradient for every item it rated, really or virtually.

    Reads the module-level ``lamda``, ``eta`` and ``u_list`` and stores the
    updated user vector in the module-level ``U`` dict under ``i_d``.

    Args:
        V: item-factor matrix, indexed as ``V[item]``.
        V_fair: item factors of the server-side fair model, same indexing.
        u: this client's rating row; 0 means "not rated".
        t: communication round; predictions are only produced when it is 19.
        Uu: this client's latent vector, shape (1, K).
        i: not used by the computation; kept for call compatibility.
        i_d: the client's user id.
        gamma: step size for the user-vector update.

    Returns:
        A tuple ``(RMSE, V_UA_EF, Uu, demographic, mse, pred_v)`` where
        ``V_UA_EF`` maps item index -> item gradient, ``Uu`` is the updated
        user vector, ``demographic`` is 0 if ``i_d`` is in ``u_list`` and 1
        otherwise, ``mse`` is the mean squared error over real + virtual
        ratings (``RMSE`` is its square root) and ``pred_v`` is the list of
        predicted ratings for every item (empty unless ``t == 19``).
    """
    no_samples = 3
    n_items = len(u)

    # Unrated items are the candidates for a virtual rating. Cap the draw so
    # a client with fewer than `no_samples` unrated items does not make
    # sample() raise ValueError.
    unrated = [j for j in range(n_items) if u[j] == 0]
    n_virtual = min(no_samples, len(unrated))
    virtual_items = set(sample(unrated, n_virtual))
    n_itemsrated = np.count_nonzero(u) + n_virtual

    # Virtual rating value shared by all sampled items.
    rui_prime = ClientFilling(V, Uu, u, t)

    # --- Gradient of the loss w.r.t. the user vector ---
    grad = np.zeros((1, Uu.shape[-1]))
    error_sum = 0
    for j in range(n_items):
        if u[j] != 0:
            target = u[j]
        elif j in virtual_items:
            target = rui_prime
        else:
            continue
        error = np.dot(Uu, V[j]) - target
        error_sum += pow(error, 2)
        grad = np.add(grad, (error * V[j]) + (lamda * Uu))

    # --- Update the user vector ---
    grad = grad / n_itemsrated
    Uu = Uu - (gamma * grad)
    U[i_d] = Uu

    # --- Item gradients, computed with the *updated* user vector ---
    V_UA_EF = {}
    for j, rating in enumerate(u):
        if rating != 0:
            target = rating
        elif j in virtual_items:
            target = rui_prime
        else:
            continue
        pred = np.dot(Uu, V[j])
        V_UA_EF[j] = ((pred - target) * Uu) + (lamda * V[j]) + eta * (V[j] - V_fair[j])

    # RMSE of this one user
    mse = error_sum / n_itemsrated
    RMSE = np.sqrt(mse)

    # Predicted rating for every item, only on the final round. Each entry
    # uses that item's own factor V[j]; the loop previously indexed V with a
    # stale variable, so every prediction was for the same item.
    pred_v = []
    if t == 19:
        pred_v = [np.dot(Uu, V[j])[0] for j in range(n_items)]

    # 0 = disadvantaged group (user id listed in u_list), 1 = advantaged group
    demographic = 0 if i_d in u_list else 1
    return RMSE, V_UA_EF, Uu, demographic, mse, pred_v

@njit(parallel=False)
def dot_product(P,Q,R,s):
    """Evaluate the factorisation P x Q against the server ratings R.

    For every server user, the squared error of each rated item (R > 0) is
    rounded to 3 decimals and averaged (the average is rounded as well).
    Reads the module-level N_server, M_server, users, l_server, server_u18
    and server_a18.

    Args:
        P: user factors, one row per server user.
        Q: item factors, one column per item.
        R: server rating matrix; entries > 0 count as rated.
        s: passed through unchanged.

    Returns:
        (RMSE, bias, s): the mean over users of the square root of their
        mean squared error, the absolute difference between the two
        demographic groups' average per-user mean squared error, and ``s``.
    """
    item_factors = Q.T
    rmse_total = 0.0
    loss_under18 = 0.0
    loss_above18 = 0.0

    for row in range(N_server):
        # The user's factor row does not change inside the item loop.
        p_row = np.ascontiguousarray(P[row, : ])
        n_rated = 0
        sq_err_sum = 0.0

        for col in range(M_server):
            # Only items with an actual rating contribute to the error.
            if R[row][col] > 0:
                q_row = np.ascontiguousarray(item_factors[col, : ])
                sq_err_sum += np.round(pow(R[row][col] - np.dot(p_row, q_row), 2) , 3)
                n_rated += 1

        mean_sq_err = np.round( (sq_err_sum/n_rated) , 3)
        rmse_total += np.sqrt(mean_sq_err)

        # l_server is keyed by user id, so translate the row to its id first.
        if l_server[users[row]] == 0:
            loss_under18 += mean_sq_err
        else:
            loss_above18 += mean_sq_err

    mean_rmse = rmse_total / N_server
    bias = abs((loss_under18/server_u18) - (loss_above18/server_a18))

    return mean_rmse , bias , s

@njit(parallel=False)
def fair_mf(R, P, Q, K, steps, alpha = 0.012, l=0.05, l_ =1.5):
    """Fairness-regularised matrix factorisation on the server's ratings.

    Runs ``steps`` passes of per-rating gradient updates. At the start of
    each pass ``dot_product`` supplies the current RMSE and demographic bias;
    the bias scales the error term down for under-18 users and up for the
    others. Reads the module-level N_server, M_server, users, l_server,
    server_u18 and server_a18.

    ``P`` and ``Q`` are updated IN PLACE; pass copies if the caller needs the
    original values afterwards.

    Args:
        R: server rating matrix; entries > 0 count as rated.
        P: user factors, shape (N_server, K).
        Q: item factors, shape (K, M_server).
        K: number of latent features to update.
        steps: number of passes over R.
        alpha: step size.
        l: L2 regularisation coefficient.
        l_: weight of the bias term.

    Returns:
        ``Q`` (the same array that was passed in, after the updates).
    """
    for step in range(steps):
        RMSE , bias, s = dot_product(P,Q,R,0)

        for i in range(N_server): #for every row
            items_rated = np.count_nonzero(R[i])
            if items_rated == 0:
                # No entry of this row can be > 0, so there is nothing to update.
                continue

            # l_server is indexed by user id, so look the row's id up first.
            # (Indexing it with the row number, as before, read the flag of
            # an unrelated user; dot_product already uses users[i].)
            if l_server[users[i]] == 0:
                fair_scale = 1 - (2*l_*bias)/((server_u18)*(items_rated))
            else:
                fair_scale = 1 + (2*l_*bias)/((server_a18)*(items_rated))

            for j in range(M_server): #for every column

                if R[i][j] > 0: #if user has rated the item
                    t1 = np.ascontiguousarray(P[i])
                    t2 = Q.T
                    t22 = np.ascontiguousarray(t2[j])
                    eij = R[i][j] - np.dot( t1,t22) #error = actual rating - predicted rating

                    for k in range(K): #for kth feature
                        # P is updated first; the Q update then uses the new P value.
                        P[i, k] = P[i, k] + 2 * alpha * (  ((eij * Q[k, j]) * fair_scale ) - l*(P[i, k]))
                        Q[k, j] = Q[k, j] + 2 * alpha * (  ((eij * P[i, k]) * fair_scale ) - l*(Q[k, j]))

        print(RMSE , bias, s)
    return Q

# Randomly initialised factors for a first server-side run of fair_mf.
user_vec = np.random.uniform(low=0.1, high=0.9, size=(N_server,K))
item_vec = np.random.uniform(low=0.1, high=0.9, size=(K,M_server))

# Pass K rather than a literal 20: the arrays above are sized with K, and a
# mismatch would make fair_mf index past them.
# Note that fair_mf returns its item-factor argument, so `P` here holds the
# trained item factors (the same array as item_vec).
P = fair_mf(server_df , user_vec , item_vec , K , 25)

'''Batch FedRec'''

# Number of communication rounds. ClientBatch only builds its prediction
# vector when t == 19, so FINAL_ROUND must stay 19 unless that is changed too.
NUM_ROUNDS  = 20
FINAL_ROUND = NUM_ROUNDS - 1

# NOTE(review): the worker pool below is created at module level. That works
# with the "fork" start method; under "spawn" the workers re-import this
# script, so it would need an `if __name__ == "__main__":` guard.

V = np.random.uniform(low=0.1, high=0.9, size=(M,K))
localloss_list = []
dloss_list     = []
aloss_list     = []
bias_list      = []

for t in range(NUM_ROUNDS):

    print("                             t=",t)
    local_loss = 0
    local_data = []
    i_grad     = {}
    d_loss     = 0
    d_sqloss   = 0
    a_loss     = 0
    a_sqloss   = 0
    pred_v     = {}

    # Randomly sub-sample tau clients; only their item gradients are aggregated.
    sample_users = np.random.choice(clients, size=tau, replace=False)

    # Server-side fair model. fair_mf updates its item-factor argument in
    # place, so it is given a COPY of V.T: passing the view V.T made V_fair
    # share memory with V, which turned the eta * (V - V_fair) term in
    # ClientBatch into a constant zero.
    user_vec = np.round(np.random.uniform(low=0.1, high =0.9, size = (N_server,K)), 2)
    V_fair = fair_mf(server_df , user_vec , V.T.copy() , K , 15).T

    # Build one argument tuple per client (item vectors + the client's data).
    for i in range(N):
        i_d = clients[i]

        if t == 0:
            Uu = np.random.uniform(low=0.1, high=0.9, size=(1,K))
        else:
            Uu = U[i_d]
        local_data.append((V , V_fair, R[i,:], t, Uu, i, i_d, gamma))

    # Train clients locally, in parallel.
    with mp.Pool() as pool:
        ret = pool.starmap(ClientBatch, local_data)

    # Retrieve item gradients and losses of each local client.
    for i, (RMSE, grad, Uu, demographic, sq_loss, pred) in enumerate(ret):
        i_d         = clients[i]
        U[i_d]      = Uu
        i_grad[i_d] = grad
        local_loss += RMSE

        # Predicted vector is only filled in on the final round.
        if t == FINAL_ROUND:
            pred_v[i] = pred

        if demographic == 0:      # disadvantaged group
            d_loss   += RMSE
            d_sqloss += sq_loss
        elif demographic == 1:    # advantaged group
            a_loss   += RMSE
            a_sqloss += sq_loss

    # Store the per-round losses ...
    localloss_list.append(local_loss / N)
    dloss_list.append(d_loss / u_count)
    aloss_list.append(a_loss / a_count)
    bias_list.append(abs((d_sqloss/u_count)-(a_sqloss/a_count)))

    # ... and rewrite the CSV so partial results survive an interrupted run.
    output_df = pd.DataFrame(list(zip(localloss_list , dloss_list , aloss_list , bias_list)),
               columns =['  RMSE  ', '  RMSE_Disadv  ' , '  RMSE_adv. ' , '  Demographic bias  '])
    output_df.to_csv('5.csv')

    # Store the predicted matrix after the final round.
    if t == FINAL_ROUND:
        with open("5_pred.txt" , "w") as f:
            for key,value in pred_v.items():
                f.write("%s:%s\n"%(key,value))

    # Gradient aggregation at server: average each item's gradient over the
    # sampled clients that reported one. A single pass over the clients'
    # gradient dicts replaces scanning every dict once per item; per item the
    # gradients are still added in sample_users order.
    grad_sum   = np.zeros((M, K))
    grad_count = np.zeros(M, dtype=int)
    for j in sample_users:
        for item, item_grad in i_grad[j].items():
            grad_sum[item]   += np.array(item_grad).reshape(-1)
            grad_count[item] += 1

    for item in np.nonzero(grad_count)[0]:
        grad_vi = grad_sum[item] / grad_count[item]
        V[item] = V[item] - (gamma*grad_vi)

    # Decay the step size for the next round.
    gamma = 0.9*gamma



