import pandas as pd
from collections import defaultdict

# Specify the file paths for the dataset files. The paths are relative, so
# they resolve against the current working directory.
# (presumably the MovieLens 1M files, judging by the "ml-1m" folder - confirm)
users_path = "../datasets/ml-1m/users.dat"
ratings_path = "../datasets/ml-1m/ratings.dat"
movies_path = "../datasets/ml-1m/movies.dat"


# Define column names for each dataset. The files are read with header=None,
# so these names are assigned positionally.
users_cols = ['user_id', 'gender', 'age', 'occupation', 'zip_code']
ratings_cols = ['user_id', 'movie_id', 'rating', 'timestamp']
movies_cols = ['movie_id', 'title', 'genres']

# Load data into Pandas DataFrames.
# '::' is a multi-character separator; pandas treats it as a regular
# expression, which is why the (slower) Python parsing engine is requested.
users_df = pd.read_csv(users_path, sep='::', header=None, names=users_cols, encoding='latin-1', engine='python')
ratings_df = pd.read_csv(ratings_path, sep='::', header=None, names=ratings_cols, encoding='latin-1', engine='python')
movies_df = pd.read_csv(movies_path, sep='::', header=None, names=movies_cols, encoding='latin-1', engine='python')

# Optionally, convert DataFrames to NumPy arrays/matrices.
# These snapshots are taken BEFORE ratings_df is merged and re-keyed below.
# NOTE(review): none of the three arrays is referenced again in the visible
# part of this file.
users_array = users_df.values
ratings_array = ratings_df.values
movies_array = movies_df.values

# Attach movie titles to every rating. No `on=` is given, so the join uses the
# columns the two frames share (here only 'movie_id'); the original movie_id
# column is then dropped by the column selection.
ratings_df = pd.merge(ratings_df, movies_df)[['user_id', 'title', 'rating', 'timestamp']]
ratings_df["user_id"] = ratings_df["user_id"].astype(str)
# Contiguous 1-based integer ids, numbered in order of first appearance.
user_lookup = {v: i+1 for i, v in enumerate(ratings_df['user_id'].unique())}
movie_lookup = {v: i+1 for i, v in enumerate(ratings_df['title'].unique())}
# 'movie_id' is re-created from the title, so it is NOT the id from the file.
# NOTE(review): two distinct movies sharing a title would collapse into one id.
ratings_df['movie_id'] = ratings_df['title'].map(movie_lookup)
ratings_df['user_int'] = ratings_df['user_id'].map(user_lookup)
# Number of ratings per user / per (re-keyed) movie.
ratings_per_user = ratings_df.groupby('user_id').rating.count()
ratings_per_item = ratings_df.groupby('movie_id').rating.count()
# Most-rated movies first; indexed by the re-keyed 1-based movie_id.
sorted_ratings_per_item = ratings_per_item.sort_values(ascending=False)
# NOTE(review): not referenced again in the visible part of this file.
user_item_rating_tuples = ratings_df[['user_int', 'movie_id', 'rating']].values.tolist()

from surprise import Dataset, Reader
from surprise.model_selection import cross_validate
from surprise import SVD
from surprise import accuracy

# Build a Surprise dataset from every rating (no hold-out).
# Reader() is used with its defaults (Surprise documents a 1-5 rating scale as
# the default - confirm it matches the data).
reader = Reader()
data = Dataset.load_from_df(ratings_df[['user_int', 'movie_id', 'rating']], reader)

# Retrieve the trainset (all ratings are used for training)
trainset = data.build_full_trainset()

from surprise import dump

# Relative path: resolved against the current working directory.
file_path = 'surprise_model_full.dump'

# Load the saved model. dump.load returns a (predictions, algorithm) pair; the
# predictions are discarded.
# NOTE(review): Surprise dumps are pickle-based - only load files you trust.
# NOTE(review): assumes the dumped model was fitted on data with the same id
# numbering as `trainset` - confirm.
_, model = dump.load(file_path)
def get_last_n_ratings_by_user(
    df, n, min_ratings_per_user=1, user_colname="user_id", timestamp_colname="timestamp"
):
    """
    Return each user's ``n`` most recent ratings.

    :param df: a DataFrame containing user item ratings
    :param n: how many of the latest rows (by timestamp) to keep per user
    :param min_ratings_per_user: users with fewer rows than this are dropped
        entirely (the threshold itself is inclusive)
    :param user_colname: the name of the column containing user ids
    :param timestamp_colname: the name of the column containing the timestamps
    :return: a new DataFrame holding the selected rows, ordered by user id and
        keeping the index labels of ``df``
    """

    def has_enough_ratings(user_rows):
        return len(user_rows) >= min_ratings_per_user

    # 1. discard users that do not meet the minimum-activity threshold
    eligible = df.groupby(user_colname).filter(has_enough_ratings)
    # 2. order oldest -> newest so that tail() picks the most recent rows
    chronological = eligible.sort_values(timestamp_colname)
    newest_per_user = chronological.groupby(user_colname).tail(n)
    # 3. present the result grouped by user
    return newest_per_user.sort_values(user_colname)
def mark_last_n_ratings_as_validation_set(
    df, n, min_ratings=1, user_colname="user_id", timestamp_colname="timestamp"
):
    """
    Flag every user's chronologically last ``n`` ratings as validation rows.

    ``df`` is modified in place: an ``is_valid`` column is created (or reset)
    to False and then set to True for the selected rows.

    :param df: a DataFrame containing user item ratings
    :param n: the number of most recent ratings per user to flag
    :param min_ratings: only users with at least this many ratings contribute
        validation rows
    :param user_colname: the name of the column containing user ids
    :param timestamp_colname: the name of the column containing the timestamps
    :return: the same df with the additional 'is_valid' column added
    """
    # Reset first so that a previous marking never leaks into this one.
    df["is_valid"] = False

    held_out_rows = get_last_n_ratings_by_user(
        df,
        n,
        min_ratings,
        user_colname=user_colname,
        timestamp_colname=timestamp_colname,
    )
    # The selection keeps df's index labels, so they address the same rows.
    df.loc[held_out_rows.index, "is_valid"] = True

    return df
# Leave-last-1 split: flag each user's most recent rating as validation.
# The call mutates ratings_df in place (adds/resets the 'is_valid' column).
mark_last_n_ratings_as_validation_set(ratings_df, 1)
# Boolean-mask selection produces new frames, so re-marking ratings_df later
# does not alter train_df / valid_df.
train_df = ratings_df[ratings_df.is_valid==False]
valid_df = ratings_df[ratings_df.is_valid==True]
from surprise import Dataset, Reader
from surprise.model_selection import cross_validate
from surprise import SVD
from surprise import accuracy

# `reader` and `data` are rebound here; the full-data objects are replaced.
reader = Reader()
data = Dataset.load_from_df(train_df[['user_int', 'movie_id', 'rating']], reader)
# Retrieve the trainset (training rows of the leave-last-1 split only)
trainset1 = data.build_full_trainset()

from surprise import dump

file_path = 'surprise_model_last_item.dump'

# Load the saved model; the predictions half of the returned pair is discarded.
# NOTE(review): presumably this model was fitted on the leave-last-1 training
# rows - the file name suggests so, but nothing here verifies it.
_, model_last_item = dump.load(file_path)
def get_last_n_ratings_by_user(
    df, n, min_ratings_per_user=1, user_colname="user_id", timestamp_colname="timestamp"
):
    """
    Return each user's ``n`` most recent ratings.

    This re-declares a function that is already defined earlier in this file.

    :param df: a DataFrame containing user item ratings
    :param n: how many of the latest rows (by timestamp) to keep per user
    :param min_ratings_per_user: users with fewer rows than this are dropped
        entirely (the threshold itself is inclusive)
    :param user_colname: the name of the column containing user ids
    :param timestamp_colname: the name of the column containing the timestamps
    :return: a new DataFrame holding the selected rows, ordered by user id and
        keeping the index labels of ``df``
    """

    def has_enough_ratings(user_rows):
        return len(user_rows) >= min_ratings_per_user

    # 1. discard users that do not meet the minimum-activity threshold
    eligible = df.groupby(user_colname).filter(has_enough_ratings)
    # 2. order oldest -> newest so that tail() picks the most recent rows
    chronological = eligible.sort_values(timestamp_colname)
    newest_per_user = chronological.groupby(user_colname).tail(n)
    # 3. present the result grouped by user
    return newest_per_user.sort_values(user_colname)
def mark_last_n_ratings_as_validation_set(
    df, n, min_ratings=1, user_colname="user_id", timestamp_colname="timestamp"
):
    """
    Flag every user's chronologically last ``n`` ratings as validation rows.

    This re-declares a function that is already defined earlier in this file.
    ``df`` is modified in place: an ``is_valid`` column is created (or reset)
    to False and then set to True for the selected rows.

    :param df: a DataFrame containing user item ratings
    :param n: the number of most recent ratings per user to flag
    :param min_ratings: only users with at least this many ratings contribute
        validation rows
    :param user_colname: the name of the column containing user ids
    :param timestamp_colname: the name of the column containing the timestamps
    :return: the same df with the additional 'is_valid' column added
    """
    # Reset first so that a previous marking never leaks into this one.
    df["is_valid"] = False

    held_out_rows = get_last_n_ratings_by_user(
        df,
        n,
        min_ratings,
        user_colname=user_colname,
        timestamp_colname=timestamp_colname,
    )
    # The selection keeps df's index labels, so they address the same rows.
    df.loc[held_out_rows.index, "is_valid"] = True

    return df
# Leave-last-5 split: 'is_valid' on ratings_df is reset and re-computed, now
# flagging each user's five most recent ratings.
mark_last_n_ratings_as_validation_set(ratings_df, 5)
train_df_5 = ratings_df[ratings_df.is_valid==False]
valid_df_5 = ratings_df[ratings_df.is_valid==True]
from surprise import Dataset, Reader
from surprise.model_selection import cross_validate
from surprise import SVD
from surprise import accuracy

# `reader` and `data` are rebound once more for this split.
reader = Reader()
data = Dataset.load_from_df(train_df_5[['user_int', 'movie_id', 'rating']], reader)

# Retrieve the trainset (training rows of the leave-last-5 split only)
trainset5 = data.build_full_trainset()

from surprise import dump

file_path = 'surprise_model_last_five.dump'

# Load the saved model; the predictions half of the returned pair is discarded.
# NOTE(review): presumably this model was fitted on the leave-last-5 training
# rows - the file name suggests so, but nothing here verifies it.
_, model_last_five = dump.load(file_path)
# Per-user rating lists of the three trainsets. Surprise documents `ur` as a
# mapping from inner user id to a list of (inner item id, rating) pairs.
user_item_matrix = trainset.ur
user_item_matrix_1 = trainset1.ur
user_item_matrix_5 = trainset5.ur
# Convert to a dictionary of dictionaries: user -> {item: rating}
user_item_rating_dict = defaultdict(dict)

# Populate the user_item_rating_dict (full data).
# past_user_item_reachability later reads AND rewrites entries of this dict.
for user, items_ratings in user_item_matrix.items():
    items_dict = {item: rating for item, rating in items_ratings}
    user_item_rating_dict[user] = items_dict
import torch
# user -> float tensor of shape (n_ratings, 1, 2) holding [item_id, rating]
# NOTE(review): not referenced again in the visible part of this file.
user_item_rating_tensor = {}

# Iterate over each user_id and their item-rating dictionary
for user_id, item_rating_dict in user_item_rating_dict.items():
    # Convert the item-rating dictionary to a list of tuples
    item_rating_list = list(item_rating_dict.items())
    
    # Convert each tuple to its own (1, 2) tensor; item ids are stored as floats
    item_rating_tensors = [torch.tensor([[item_id, rating]], dtype=torch.float) for item_id, rating in item_rating_list]
    
    # Stack the list of tensors along a new leading dimension -> (n_ratings, 1, 2)
    item_rating_tensor = torch.stack(item_rating_tensors)
    
    # Store the item-rating tensor in the converted dictionary with the user_id as the key
    user_item_rating_tensor[user_id] = item_rating_tensor

# Same two steps for the leave-last-1 training data.
# Convert to a dictionary of dictionaries
user_item_rating_dict1 = defaultdict(dict)

# Populate the user_item_rating_dict1
for user, items_ratings in user_item_matrix_1.items():
    items_dict = {item: rating for item, rating in items_ratings}
    user_item_rating_dict1[user] = items_dict

import torch
# NOTE(review): not referenced again in the visible part of this file.
user_item_rating_tensor1 = {}

# Iterate over each user_id and their item-rating dictionary
for user_id, item_rating_dict in user_item_rating_dict1.items():
    # Convert the item-rating dictionary to a list of tuples
    item_rating_list = list(item_rating_dict.items())
    
    # Convert each tuple to its own (1, 2) tensor
    item_rating_tensors = [torch.tensor([[item_id, rating]], dtype=torch.float) for item_id, rating in item_rating_list]
    
    # Stack the list of tensors along a new leading dimension -> (n_ratings, 1, 2)
    item_rating_tensor = torch.stack(item_rating_tensors)
    
    # Store the item-rating tensor in the converted dictionary with the user_id as the key
    user_item_rating_tensor1[user_id] = item_rating_tensor

# Same two steps for the leave-last-5 training data.
# Convert to a dictionary of dictionaries
user_item_rating_dict5 = defaultdict(dict)

# Populate the user_item_rating_dict5
for user, items_ratings in user_item_matrix_5.items():
    items_dict = {item: rating for item, rating in items_ratings}
    user_item_rating_dict5[user] = items_dict
    
import torch
# NOTE(review): not referenced again in the visible part of this file.
user_item_rating_tensor5 = {}

# Iterate over each user_id and their item-rating dictionary
for user_id, item_rating_dict in user_item_rating_dict5.items():
    # Convert the item-rating dictionary to a list of tuples
    item_rating_list = list(item_rating_dict.items())
    
    # Convert each tuple to its own (1, 2) tensor
    item_rating_tensors = [torch.tensor([[item_id, rating]], dtype=torch.float) for item_id, rating in item_rating_list]
    
    # Stack the list of tensors along a new leading dimension -> (n_ratings, 1, 2)
    item_rating_tensor = torch.stack(item_rating_tensors)
    
    # Store the item-rating tensor in the converted dictionary with the user_id as the key
    user_item_rating_tensor5[user_id] = item_rating_tensor
    
import numpy as np
import torch
from tqdm import tqdm

def solve_user_factors(item_factors, items, ratings):
    """Return the least-squares user factor vector for a set of rated items.

    With ``Q`` the rows of ``item_factors`` selected by ``items``, this solves
    ``min_p ||Q @ p - ratings||``. When ``Q`` has full column rank the result
    equals ``inv(Q.T @ Q) @ Q.T @ ratings``. The pseudo-inverse additionally
    yields the minimum-norm solution when fewer items than latent factors are
    available, where the normal-equation matrix is singular.

    :param item_factors: array-like of shape (n_items, n_factors)
    :param items: sequence of row indices into ``item_factors``
    :param ratings: 1-D tensor with one rating per entry of ``items``;
        gradients flow through it
    :return: float64 tensor of shape (n_factors,)
    """
    # as_tensor avoids a copy for float64 arrays and the slow
    # list-of-ndarrays conversion.
    all_factors = torch.as_tensor(item_factors, dtype=torch.float64)
    item_index = torch.as_tensor(items, dtype=torch.long)
    Q = all_factors[item_index]
    ratings = torch.as_tensor(ratings).to(Q.dtype)
    return torch.linalg.pinv(Q) @ ratings


def update_user_tensor_single(user_vector, items, ratings):
    """Re-estimate a user vector against ``model_last_item``'s item factors.

    :param user_vector: unused; kept so existing call sites keep working
    :param items: item indices into ``model_last_item.qi``
    :param ratings: 1-D tensor of ratings aligned with ``items``
    :return: least-squares user factor vector (see ``solve_user_factors``)
    """
    return solve_user_factors(model_last_item.qi, items, ratings)


def update_user_tensor(user_vector, items, ratings):
    """Re-estimate a user vector against the full-data ``model``'s item factors.

    :param user_vector: unused; kept so existing call sites keep working
    :param items: item indices into ``model.qi``
    :param ratings: 1-D tensor of ratings aligned with ``items``
    :return: least-squares user factor vector (see ``solve_user_factors``)
    """
    return solve_user_factors(model.qi, items, ratings)


def update_user_tensor_five(user_vector, items, ratings):
    """Re-estimate a user vector against ``model_last_five``'s item factors.

    :param user_vector: unused; kept so existing call sites keep working
    :param items: item indices into ``model_last_five.qi``
    :param ratings: 1-D tensor of ratings aligned with ``items``
    :return: least-squares user factor vector (see ``solve_user_factors``)
    """
    return solve_user_factors(model_last_five.qi, items, ratings)

def get_all_recommendation_scores_stochastic(user_vector, sample_size, type_, beta = 0.8, item_factors=None):
    """Sample items with probability proportional to exp(beta * predicted score).

    The predicted score of an item is the dot product of ``user_vector`` with
    that item's factor vector.

    :param user_vector: 1-D tensor of user factors
    :param sample_size: number of distinct items to draw (without replacement)
    :param type_: which loaded model supplies the item factors when
        ``item_factors`` is not given: 'keepall' (``model``), 'single'
        (``model_last_item``) or 'five' (``model_last_five``)
    :param beta: inverse temperature of the softmax
    :param item_factors: optional (n_items, n_factors) array-like that
        overrides the model selected by ``type_``
    :return: ``(sampled_items, sampled_scores)``: item indices as ints and the
        matching predicted scores as 0-dim tensors
    :raises ValueError: if ``type_`` is unknown and ``item_factors`` is None
    """
    if item_factors is None:
        if type_ == 'keepall':
            item_factors = model.qi
        elif type_ == 'single':
            item_factors = model_last_item.qi
        elif type_ == 'five':
            item_factors = model_last_five.qi
        else:
            raise ValueError(
                f"unknown type_ {type_!r}; expected 'keepall', 'single' or 'five'"
            )

    factors = torch.as_tensor(
        item_factors, dtype=user_vector.dtype, device=user_vector.device
    )
    # One matrix-vector product scores every item at once.
    predicted_scores = factors @ user_vector

    # Probabilities proportional to exp(beta * score). torch.softmax is used
    # because torch.nn.functional is never imported as `F` in this file.
    probabilities = torch.softmax(
        beta * predicted_scores.detach().to(torch.float), dim=0
    )

    # Draw distinct item indices according to that distribution.
    sampled_indices = torch.multinomial(probabilities, sample_size, replacement=False)

    sampled_items = sampled_indices.tolist()
    sampled_scores = [predicted_scores[idx] for idx in sampled_items]

    return sampled_items, sampled_scores

# Finalized version of the experiment script.
# NOTE(review): "What I compare with is the first value" - presumably the
# objective recorded at the first optimisation epoch (init_val in
# past_user_item_reachability) is the baseline for the reported ratio; confirm.
# Print tensors with 7 decimal places.
torch.set_printoptions(precision=7)
print("Starting now...")

def past_user_item_reachability(user_id, item_id, past_time):
    """
    Optimise a user's most recent ratings to raise one item's softmax share.

    The last ``past_time`` ratings stored for ``user_id`` in ``trainset.ur``
    become free variables (clamped to [1, 5] before use). For 19 Adam steps
    (lr=3) the user vector is re-estimated with ``update_user_tensor_five``
    and the negative softmax share (beta = 0.8) of ``item_id`` over all rows
    of ``model_last_five.qi`` is minimised.

    NOTE(review): user/item ids are taken from ``trainset`` (all ratings) but
    index ``model_last_five.pu`` / ``.qi``; this assumes both share one id
    numbering - confirm.

    :param user_id: key into ``trainset.ur`` and ``user_item_rating_dict``,
        and index into ``model_last_five.pu``
    :param item_id: index into ``model_last_five.qi`` of the target item
    :param past_time: number of trailing ratings that are optimised
    :return: ``(final_rating, objective)``: the last epoch's objective divided
        by the first epoch's (0 if the first was exactly 0), and the last
        epoch's objective itself (a negative probability)
    """
    item_to_be_reached = item_id
    # Last `past_time` (item, rating) pairs of this user.
    # NOTE(review): for past_time == 0 the slice [-0:] returns the WHOLE list
    # while [:-0] further down returns an empty one.
    item_and_rating =  trainset.ur[user_id][-past_time:]
    chosen_items = [i[0] for i in item_and_rating]
    chosen_ratings = [i[1] for i in item_and_rating]
    # The ratings themselves are the parameters being optimised.
    user_action = torch.tensor(chosen_ratings, requires_grad=True, dtype=torch.float64)
    optimizer = torch.optim.Adam([user_action], lr=3)
    # reach_probabilities = []
    # NOTE(review): ratings_dict1 IS the module-level dict entry (no copy), so
    # this loop permanently replaces the user's stored ratings with 0-dim
    # float64 tensors; later calls for the same user re-wrap those tensors.
    ratings_dict1 = user_item_rating_dict[user_id]
    for j in user_item_rating_dict[user_id]:
        ratings_dict1[j]=torch.tensor(ratings_dict1[j], dtype=torch.float64)
    ratings_dict = ratings_dict1
    #print(final_rating/total_sum)   
    
    final_rating=0
    init_val = 0
    # range(1, 20) -> 19 optimisation steps; `epoch` starts at 1.
    for epoch in tqdm(range(1, 20)):
        # Keep the ratings that enter the model inside the valid 1..5 range.
        user_action_clamped = user_action.clamp(1, 5)
        # Everything down to `ratings = torch.cat(...)` does not depend on
        # user_action and is rebuilt unchanged on every epoch.
        rating_tensor = torch.tensor(list(ratings_dict.values()))
        # As defined in this file, update_user_tensor_five does not read its
        # first argument, so this initial vector has no effect on the result.
        user_vector_initial = torch.tensor(model_last_five.pu[user_id])
        user_vector = user_vector_initial
        time_max = past_time
        # History without the trailing `past_time` entries (dict insertion order).
        already_rated = list(user_item_rating_dict[user_id].keys())
        already_rated = already_rated[:-past_time]
        ratings_old = rating_tensor[:-past_time]
        n = len(ratings_old)
        zeros_to_add = torch.zeros(time_max)
        ratings = torch.cat((ratings_old, zeros_to_add), dim=0)
        # Replay the optimised ratings one at a time. user_vector is overwritten
        # on each pass, so only the final estimate (using all ratings) is used
        # below.
        for timestep in range(0,time_max):
            curr_item = chosen_items[timestep]
            ratings[n+timestep] = user_action_clamped[timestep]
            already_rated.append(curr_item)
            user_vector = update_user_tensor_five(user_vector, already_rated, ratings[:n+timestep+1])
        # Softmax denominator: sum of exp(0.8 * score) over every item.
        total_sum = 0
        for item in range(len(model_last_five.qi)):
            total_sum += torch.exp(0.8 * torch.matmul(user_vector, torch.tensor(model_last_five.qi[item])))
            #print(final_rating/total_sum)  
        # Negated so that minimising it maximises the target item's probability.
        item_rating = -torch.exp(0.8*torch.matmul(user_vector, torch.tensor(model_last_five.qi[item_to_be_reached])))/total_sum
        #print(item_rating)
        # Baseline: objective before any optimisation step has been applied.
        if epoch == 1:
            init_val = item_rating.item()
        item_rating.backward()
        optimizer.step()
        optimizer.zero_grad()
        #user_action = user_action.clamp(1, 5) 
        #print(init_val)
    # item_rating was computed BEFORE the last optimizer.step(), so the values
    # returned below do not reflect the final parameter update.
    if init_val!=0:
        final_rating = item_rating.item()/init_val
    return final_rating, item_rating.item()

# Number of user / item factor rows held by the leave-last-5 model.
n_users = len(model_last_five.pu)
n_items = len(model_last_five.qi)

import random

def generate_random_lists(n_items, n_users, num_samples=30):
    """Draw random item ids and random user ids (with replacement).

    :param n_items: item ids are drawn uniformly from 0 .. n_items - 1
    :param n_users: user ids are drawn uniformly from 0 .. n_users - 1
    :param num_samples: how many ids to draw for each list
    :return: ``(item_list, user_list)``, two lists of ``num_samples`` ints;
        all item ids are drawn before any user id
    """

    def draw_ids(population_size):
        highest_id = population_size - 1
        return [random.randint(0, highest_id) for _ in range(num_samples)]

    sampled_items = draw_ids(n_items)
    sampled_users = draw_ids(n_users)
    return sampled_items, sampled_users

# Build the user and item groups for the experiments: heavy raters vs.
# mid-ranked raters, and most-rated vs. mid-ranked movies.
ratings_count = {}
for user_id, ratings1 in trainset.ur.items():
    # Count the number of ratings for the user and store it in ratings_count
    ratings_count[user_id] = len(ratings1)

# Sort the ratings_count dictionary by the number of ratings in descending order
sorted_ratings_count = sorted(ratings_count.items(), key=lambda x: x[1], reverse=True)

# Get the top 20 users by number of ratings (keys of trainset.ur)
top_influential_users = [user_id for user_id, _ in sorted_ratings_count[:20]]

# Users ranked 201-220 by number of ratings
mid_users = [i for i, _ in sorted_ratings_count[200:220]]

# The 20 most-rated movies, then the movies ranked 201-220.
# NOTE(review): these are values of ratings_df['movie_id'] (the 1-based ids
# from movie_lookup), whereas past_user_item_reachability uses item ids as row
# indices into model_last_five.qi. Confirm both use the same numbering -
# Surprise inner ids normally start at 0.
top_influential_items = [i for i in sorted_ratings_per_item[:20].keys()]

mid_items = [i for i in sorted_ratings_per_item[200:220].keys()]


# Fixed samples of 20 items and 20 users, hard-coded so runs are repeatable.
# NOTE(review): presumably produced once by generate_random_lists (see the
# commented-out call); the trailing comments hold further, unused ids.
#items, users = generate_random_lists(n_items, n_users)
items = [3417, 2001, 239, 1642, 2584, 3400, 2172, 3699, 535, 2447, 1087, 561, 698, 2531, 3440, 3164, 1326, 447, 3176, 451]# 276, 3208, 1109, 2385, 2675, 1212, 2654, 3051, 6, 305]
users = [485, 920, 196, 5, 773, 714, 537, 665, 192, 920, 964, 709, 766, 173, 97, 119, 947, 270, 623, 569]#, 3, 733, 532, 632, 173, 211, 467, 285, 82, 175]
# Number of trailing ratings per user that the optimisation may change.
time_till = 5

def save_list_to_file(lst, filename):
    """Write every element of ``lst`` to ``filename``, one per line.

    Each element is converted with ``str()`` and followed by a newline. An
    existing file is overwritten, and missing parent directories are created
    instead of failing with FileNotFoundError.

    :param lst: iterable of values to write
    :param filename: path of the output file
    """
    import os

    parent_dir = os.path.dirname(filename)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    with open(filename, 'w', encoding='utf-8') as file:
        # One buffered writelines call instead of one write per element.
        file.writelines(str(item) + '\n' for item in lst)

def collect_reachabilities(user_ids, item_ids, past_time):
    """Run past_user_item_reachability for every (user, item) combination.

    :param user_ids: users to evaluate (outer loop, shown in a progress bar)
    :param item_ids: target items evaluated for each user
    :param past_time: number of trailing ratings the optimisation may change
    :return: ``(ratios, finals)``: the ratio values and the matching final
        objective values; pairs whose ratio is exactly 0 are skipped
    """
    ratios = []
    finals = []
    for p_user in tqdm(user_ids):
        for p_item in item_ids:
            ratio_val, reach_val = past_user_item_reachability(p_user, p_item, past_time)
            if ratio_val != 0:
                ratios.append(ratio_val)
                finals.append(reach_val)
    return ratios, finals


# Experiment 1: heaviest raters x fixed item sample
all_reachabilities1, final_reachabilities1 = collect_reachabilities(
    top_influential_users, items, time_till
)

# Experiment 2: mid-ranked raters x fixed item sample
all_reachabilities2, final_reachabilities2 = collect_reachabilities(
    mid_users, items, time_till
)

# Experiment 3: fixed user sample x most-rated items
all_reachabilities3, final_reachabilities3 = collect_reachabilities(
    users, top_influential_items, time_till
)

# Experiment 4: fixed user sample x mid-ranked items
all_reachabilities4, final_reachabilities4 = collect_reachabilities(
    users, mid_items, time_till
)

import statistics
import math

def calculate_mean_std(lst):
    """Print the mean and standard deviation of ``lst``, computed two ways.

    The ``statistics`` figures use the sample standard deviation (n - 1
    denominator); the manual figures use the population form (n denominator),
    so the two deviations differ by design.

    Empty input prints a short notice instead of raising. A single value
    reports ``nan`` for the sample standard deviation, which is undefined for
    fewer than two points.

    :param lst: sequence of numbers
    :return: None; results are printed
    """
    count = len(lst)
    if count == 0:
        print("No values to summarise.")
        return

    # Using the statistics module
    mean = statistics.mean(lst)
    std_dev = statistics.stdev(lst) if count > 1 else float("nan")

    # Calculating manually
    manual_mean = sum(lst) / count
    manual_std_dev = math.sqrt(sum((x - manual_mean) ** 2 for x in lst) / count)

    print(f"Mean (using statistics module): {mean}")
    print(f"Standard Deviation (using statistics module): {std_dev}")
    print(f"Mean (calculated manually): {manual_mean}")
    print(f"Standard Deviation (calculated manually): {manual_std_dev}")

# For each experiment: print summary statistics of the ratio values, then
# write both result lists under aggregate/ (relative to the working directory).
# NOTE(review): the "freachability" files hold the final objective values -
# the "f" presumably stands for "final"; confirm before renaming.

# Experiment 1: top users x fixed item sample
calculate_mean_std(all_reachabilities1)

save_list_to_file(all_reachabilities1, 'aggregate/past_reachability_top_users_all_items.txt')  
save_list_to_file(final_reachabilities1, 'aggregate/past_freachability_top_users_all_items.txt')    

# Experiment 2: mid-ranked users x fixed item sample
calculate_mean_std(all_reachabilities2)

save_list_to_file(all_reachabilities2, 'aggregate/past_reachability_mid_users_all_items.txt')  
save_list_to_file(final_reachabilities2, 'aggregate/past_freachability_mid_users_all_items.txt')    

# Experiment 3: fixed user sample x top items
calculate_mean_std(all_reachabilities3)

save_list_to_file(all_reachabilities3, 'aggregate/past_reachability_all_users_top_items.txt')  
save_list_to_file(final_reachabilities3, 'aggregate/past_freachability_all_users_top_items.txt')    

# Experiment 4: fixed user sample x mid-ranked items
calculate_mean_std(all_reachabilities4)

save_list_to_file(all_reachabilities4, 'aggregate/past_reachability_all_users_mid_items.txt')  
save_list_to_file(final_reachabilities4, 'aggregate/past_freachability_all_users_mid_items.txt')          



