import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
from scipy.sparse.linalg import svds
import warnings

# Installs a blanket "ignore" filter: every warning raised through the
# `warnings` module after this point is silenced for the whole process,
# not just for this module.
warnings.filterwarnings('ignore')

def _sigmoid(z):
    """Evaluate the logistic function ``1 / (1 + exp(-z))``.

    Works on anything ``np.exp`` accepts after negation (scalars or
    NumPy arrays); arrays are processed elementwise.
    """
    denominator = 1 + np.exp(-z)
    return 1 / denominator

def _logistic_variance(z):
    """Return ``s * (1 - s)`` where ``s = _sigmoid(z)``.

    This is the variance of a Bernoulli variable whose success
    probability is the logistic function evaluated at ``z``.
    """
    prob = _sigmoid(z)
    complement = 1 - prob
    return prob * complement

def _precompute_optimal_lists(U, V, theta_star, K, d):
    """Find, for every user, the K items with the largest bilinear logit.

    The logit of user row ``u`` and item row ``v`` is ``u @ Theta @ v``,
    where ``Theta`` is ``theta_star`` reshaped to ``(d, d)``.

    Args:
        U: Array of shape (n_users, d); one feature row per user.
        V: Array of shape (n_items, d); one feature row per item.
        theta_star: Flat parameter vector with ``d * d`` entries.
        K: Number of items per list; must satisfy ``0 <= K <= n_items``.
        d: Feature dimension used to reshape ``theta_star``.

    Returns:
        Integer array of shape (n_users, K). Row ``i`` holds the indices
        of the K highest-logit items for user ``i``. The indices are NOT
        sorted by logit: ``np.argpartition`` only guarantees membership
        in the top K, not their relative order.

    Raises:
        ValueError: If ``K`` is negative or larger than the number of items.
    """
    n_users = len(U)
    n_items = len(V)

    if not 0 <= K <= n_items:
        raise ValueError(
            f"K must be between 0 and the number of items ({n_items}), got {K}"
        )

    optimal_lists = np.zeros((n_users, K), dtype=int)
    if K == 0:
        # `[-0:]` would select every index instead of none, so the slice
        # below cannot be used for an empty list; return the empty result.
        return optimal_lists

    theta_mat = theta_star.reshape(d, d)

    for u_idx in range(n_users):
        # Project the user through Theta once, then score all items at once.
        user_projection = np.dot(U[u_idx], theta_mat)
        logits = np.dot(V, user_projection)

        # Partial selection: the last K positions hold the K largest logits.
        optimal_lists[u_idx] = np.argpartition(logits, -K)[-K:]

    return optimal_lists

def preprocess(seed, K, d, B, data_path='./ml-100k/u.data'):
    """Build a synthetic bilinear-logistic environment from MovieLens-100k.

    Steps:
      1. Load the tab-separated ratings file and binarize ratings
         (rating >= 4 -> 1, otherwise 0).
      2. Pivot into a user x item matrix (missing pairs filled with 0) and
         take a rank-``d`` truncated SVD to obtain user and item features.
      3. Normalize every feature row to (approximately) unit length, then
         rescale each row by a random factor drawn from ``uniform(0.5, 1.0)``.
      4. Draw ``theta_star`` as a flattened noisy identity matrix rescaled to
         a norm drawn from ``uniform(B - 1, B)``.
      5. Compute ``kappa`` and the per-user top-K item lists.

    All random draws made by this function come from generators seeded with
    ``seed``, ``seed + 1`` and ``seed + 2`` (user scales, item scales, and
    theta respectively), so they do not depend on the global NumPy RNG state.

    Args:
        seed: Integer base seed for the random generators.
        K: Number of items in each user's optimal list.
        d: Latent dimension (number of singular vectors kept).
        B: Upper bound for the Euclidean norm of ``theta_star``.
        data_path: Path to the tab-separated ratings file with columns
            user_id, item_id, rating, timestamp (no header row).

    Returns:
        Tuple ``(R, U, V, theta_star, kappa, optimal_lists)``:
          * R: pandas DataFrame, users x items, binarized ratings.
          * U: array of shape (n_users, d), scaled user features.
          * V: array of shape (n_items, d), scaled item features.
          * theta_star: flat array with ``d * d`` entries.
          * kappa: logistic variance evaluated at the bound
            ``max||u|| * max||v|| * ||theta_star||``.
          * optimal_lists: integer array of shape (n_users, K) with each
            user's top-K item indices.
    """
    column_names = ['user_id', 'item_id', 'rating', 'timestamp']
    df = pd.read_csv(data_path, sep='\t', names=column_names)

    # Ratings of 4 and above count as positive feedback.
    df['binary_rating'] = (df['rating'] >= 4).astype(int)

    R = df.pivot(index='user_id', columns='item_id', values='binary_rating').fillna(0)
    R_sparse = csr_matrix(R.values).astype(float)

    # NOTE(review): depending on the SciPy version, svds may start from a
    # random vector; pass an explicit `v0` if bit-exact factors are needed.
    U, _, Vt = svds(R_sparse, k=d)

    # Row-normalize; the epsilon guards against division by zero for all-zero rows.
    U = U / (np.linalg.norm(U, axis=1, keepdims=True) + 1e-9)
    V_matrix = Vt.T
    V = V_matrix / (np.linalg.norm(V_matrix, axis=1, keepdims=True) + 1e-9)

    # Independent, seeded streams. Distinct seeds keep the user and item
    # scale factors from being copies of the same random sequence.
    rng_u = np.random.RandomState(seed)
    rng_v = np.random.RandomState(seed + 1)
    rng_theta = np.random.RandomState(seed + 2)

    u_scale = rng_u.uniform(0.5, 1.0, size=(U.shape[0], 1))
    v_scale = rng_v.uniform(0.5, 1.0, size=(V.shape[0], 1))

    U = U * u_scale
    V = V * v_scale

    # Noisy identity; the noise is drawn from the seeded generator so that
    # theta_star is reproducible for a given seed.
    theta_matrix = np.eye(d) + rng_theta.normal(0, 0.3, (d, d))
    raw_theta = theta_matrix.flatten()

    # Rescale theta to a norm in [B - 1, B).
    target_norm = rng_theta.uniform(B - 1, B)
    current_norm = np.linalg.norm(raw_theta)
    theta_star = (raw_theta / current_norm) * target_norm

    # Largest feature norms give an upper bound on |u @ Theta @ v|.
    max_u = np.max(np.linalg.norm(U, axis=1))
    max_v = np.max(np.linalg.norm(V, axis=1))
    max_x_norm = max_u * max_v

    max_logit_norm = max_x_norm * np.linalg.norm(theta_star)

    kappa = _logistic_variance(max_logit_norm)

    optimal_lists = _precompute_optimal_lists(U, V, theta_star, K, d)

    return R, U, V, theta_star, kappa, optimal_lists
