import numpy as np
import urllib.request
from sklearn.datasets import load_svmlight_file
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import csv
import os

# Fix NumPy's global RNG seed so repeated runs draw the same random numbers
np.random.seed(43)

# Data source: the "mushrooms" binary-classification file from the LIBSVM
# dataset collection. To run on German Credit instead, use
#   url = "https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/german.numer"
#   file_path = "german.numer"
# and convert labels from {-1, 1} to {0, 1} (y[y == -1] = 0) further down.
url = "https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/mushrooms"

# Local file name under which the downloaded dataset is cached
file_path = "mushrooms"

# Fetch the file only when no copy exists in the working directory
if not os.path.isfile(file_path):
    urllib.request.urlretrieve(url, file_path)

# Parse the svmlight-format file; features arrive as a sparse matrix
X, y = load_svmlight_file(file_path)
X = X.toarray()  # densify so plain NumPy indexing and matmul work below

# Map label 2 to 0 so the targets are in {0, 1} (the file is expected to use
# labels {1, 2}); the cross-entropy loss assumes {0, 1} targets.
y[y == 2] = 0


# Hold out 20% of the samples for testing and report both split sizes
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
print(X_train.shape[0])
print(X_test.shape[0])

# Standardize features using statistics of the training split only, then
# apply the same transform to both splits.
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Define the sigmoid (logistic) function
def sigmoid(x):
    """Return the logistic function 1 / (1 + exp(-x)), element-wise.

    Computed as exp(-log(1 + exp(-x))) using ``np.logaddexp``, which never
    evaluates exp() on a large positive argument. The naive formula
    overflows in ``np.exp(-x)`` for large negative ``x`` (RuntimeWarning,
    or FloatingPointError under ``np.errstate(over="raise")``).

    Args:
        x: Scalar or array-like of real values.

    Returns:
        Value(s) in [0, 1] with the same shape as ``x``.
    """
    # logaddexp(0, -x) == log(exp(0) + exp(-x)) == log(1 + exp(-x))
    return np.exp(-np.logaddexp(0.0, -x))


def non_convex_loss(X, y, w, lambd=0.5):
    """Return mean logistic cross-entropy plus a non-convex regularizer.

    The loss is

        mean_i[ -y_i*log(s(z_i)) - (1 - y_i)*log(1 - s(z_i)) ]
            + lambd * sum_j w_j^2 / (1 + w_j^2),

    with z = X @ w and s the sigmoid. The cross-entropy term is evaluated
    in the algebraically equivalent form log(1 + exp(z_i)) - y_i*z_i, so a
    saturated sigmoid cannot produce log(0) and the result stays finite
    (the naive form yields nan/inf there).

    Args:
        X: Feature matrix of shape (n_samples, n_features).
        y: Targets in {0, 1}, shape (n_samples,).
        w: Weight vector of shape (n_features,).
        lambd: Weight of the regularizer.

    Returns:
        The scalar total loss.
    """
    z = X @ w

    # log(1 + e^z) - y*z  ==  -[y*log(s(z)) + (1 - y)*log(1 - s(z))]
    cross_entropy_loss = np.mean(np.logaddexp(0.0, z) - y * z)

    # Non-convex regularizer: each term w_j^2 / (1 + w_j^2) is bounded by 1
    w_squared = w ** 2
    regularization = lambd * np.sum(w_squared / (1.0 + w_squared))

    return cross_entropy_loss + regularization

# Define the zeroth-order gradient estimate function
def zeroth_order_gradient_estimate(X, y, w, epsilon=1e-5, loss_fn=None):
    """Estimate the loss gradient at ``w`` by coordinate-wise forward differences.

    For each coordinate i the estimate is
    (loss(w + epsilon*e_i) - loss(w)) / epsilon, using only loss values.

    Args:
        X: Feature matrix passed through to the loss.
        y: Targets passed through to the loss.
        w: 1-D weight vector; converted to float so an integer array does
            not silently truncate the perturbation.
        epsilon: Forward-difference step size.
        loss_fn: Callable ``loss_fn(X, y, w) -> float``. Defaults to
            ``non_convex_loss``.

    Returns:
        Float array with the same shape as ``w`` holding the estimate.
    """
    if loss_fn is None:
        loss_fn = non_convex_loss

    w = np.asarray(w, dtype=float)

    # The unperturbed loss is the same for every coordinate: evaluate it once
    # instead of once per coordinate.
    base_loss = loss_fn(X, y, w)

    g = np.zeros_like(w)
    w_perturbed = w.copy()  # scratch copy; the caller's w is never modified
    for i in range(len(w)):
        saved = w_perturbed[i]
        w_perturbed[i] = saved + epsilon
        g[i] = (loss_fn(X, y, w_perturbed) - base_loss) / epsilon
        w_perturbed[i] = saved  # restore before moving to the next coordinate
    return g

# Define the ZO-SGD gradient estimate function (mini-batch version of the estimator above)
def zeroth_order_sgd_gradient_estimate(X, y, w, batch_size=1, epsilon=1e-5):
    """Return a zeroth-order gradient estimate computed on a random mini-batch.

    Draws ``batch_size`` distinct sample indices from NumPy's global RNG and
    evaluates the finite-difference estimator on just those rows.

    Args:
        X: Feature matrix, indexable by an integer index array along rows.
        y: Targets aligned with the rows of ``X``.
        w: Weight vector at which to estimate the gradient.
        batch_size: Number of samples drawn without replacement.
        epsilon: Finite-difference step forwarded to the estimator.

    Returns:
        The gradient estimate, same shape as ``w``.
    """
    n_samples = len(y)
    picked = np.random.choice(n_samples, batch_size, replace=False)
    return zeroth_order_gradient_estimate(X[picked], y[picked], w, epsilon)



############################## ZO_GD #################################

# Training parameters
learning_rate = 1e-3
num_iterations = 2000
method = 'ZO-GD'  # 'ZO-GD' (full training set) or 'ZO-SGD' (random mini-batch)

# Start from all-zero weights (np.random.randn(X_train.shape[1]) would give a
# random start instead). w_initial is kept so the later sections can restart
# from the same point.
w_initial = np.zeros(X_train.shape[1])
w = w_initial.copy()

# Per-iteration loss history
training_losses_gd = []
test_losses_gd = []

# Create the output directory before training: open(..., 'w') does not create
# missing parent directories, and failing only after the loop would discard
# the whole run.
os.makedirs('logistic_regression', exist_ok=True)

# Optimization loop
for i in range(num_iterations):
    # Compute the gradient estimate for the selected method
    if method == 'ZO-GD':
        g = zeroth_order_gradient_estimate(X_train, y_train, w)
    elif method == 'ZO-SGD':
        g = zeroth_order_sgd_gradient_estimate(X_train, y_train, w)
    else:
        # Without this branch an unknown method would leave g undefined
        raise ValueError(f"Unknown method: {method!r}")

    # Gradient step, then evaluate the updated weights on both splits
    w = w - learning_rate * g
    train_loss = non_convex_loss(X_train, y_train, w)
    test_loss = non_convex_loss(X_test, y_test, w)
    if (i + 1) % 100 == 0:
        print(f"Epoch {i + 1}/{num_iterations}, Train Loss: {train_loss}, Test Loss: {test_loss}")

    # Record losses
    training_losses_gd.append(train_loss)
    test_losses_gd.append(test_loss)


# Save the loss histories, one value per CSV row
with open('logistic_regression/train_loss_gd.csv', 'w', newline='') as f1:
    writer = csv.writer(f1)
    writer.writerows([[val] for val in training_losses_gd])

with open('logistic_regression/test_loss_gd.csv', 'w', newline='') as f2:
    writer = csv.writer(f2)
    writer.writerows([[val] for val in test_losses_gd])


############################## ZO_SGD #################################


# Training parameters
learning_rate = 1e-3  # multiplied by 0.9 every 100 iterations in the loop below
num_iterations = 2000
method = 'ZO-SGD'  # 'ZO-GD' (full training set) or 'ZO-SGD' (random mini-batch)

# Restart from the shared initial weights
w = w_initial.copy()

# Per-iteration loss history
training_losses_sgd = []
test_losses_sgd = []

# Create the output directory before training: open(..., 'w') does not create
# missing parent directories, and failing only after the loop would discard
# the whole run.
os.makedirs('logistic_regression', exist_ok=True)

# Optimization loop
for i in range(num_iterations):
    # Compute the gradient estimate for the selected method
    if method == 'ZO-GD':
        g = zeroth_order_gradient_estimate(X_train, y_train, w)
    elif method == 'ZO-SGD':
        g = zeroth_order_sgd_gradient_estimate(X_train, y_train, w)
    else:
        # Without this branch an unknown method would silently reuse a stale g
        raise ValueError(f"Unknown method: {method!r}")

    # Gradient step, then evaluate the updated weights on both splits
    w = w - learning_rate * g
    train_loss = non_convex_loss(X_train, y_train, w)
    test_loss = non_convex_loss(X_test, y_test, w)
    if (i + 1) % 100 == 0:
        # Step-size decay: shrink the learning rate every 100 iterations
        learning_rate *= 0.9
        print(f"Epoch {i + 1}/{num_iterations}, Train Loss: {train_loss}, Test Loss: {test_loss}")

    # Record losses
    training_losses_sgd.append(train_loss)
    test_losses_sgd.append(test_loss)


# ZO-SGD: save the loss histories, one value per CSV row
with open('logistic_regression/train_loss_sgd.csv', 'w', newline='') as f1:
    writer = csv.writer(f1)
    writer.writerows([[val] for val in training_losses_sgd])

with open('logistic_regression/test_loss_sgd.csv', 'w', newline='') as f2:
    writer = csv.writer(f2)
    writer.writerows([[val] for val in test_losses_sgd])


########################### ZO-SVRG ######################


# Define the ZO-SVRG gradient estimate function
def zeroth_order_svrg_gradient_estimate(X, y, w, w_old, full_grad, batch_size=1, epsilon=1e-5):
    """Return a variance-reduced (SVRG-style) zeroth-order gradient estimate.

    The same random mini-batch is used to estimate the gradient at the
    current weights and at the snapshot weights; their difference corrects
    the snapshot's full-data gradient estimate.

    Args:
        X: Feature matrix, indexable by an integer index array along rows.
        y: Targets aligned with the rows of ``X``.
        w: Current weight vector.
        w_old: Snapshot weight vector at which ``full_grad`` was computed.
        full_grad: Gradient estimate at ``w_old``.
        batch_size: Number of samples drawn without replacement from NumPy's
            global RNG.
        epsilon: Finite-difference step forwarded to the estimator.

    Returns:
        ``grad(w) - grad(w_old) + full_grad`` with both mini-batch gradients
        taken on the same batch.
    """
    rows = np.random.choice(len(y), batch_size, replace=False)
    batch_X, batch_y = X[rows], y[rows]

    grad_current = zeroth_order_gradient_estimate(batch_X, batch_y, w, epsilon)
    grad_snapshot = zeroth_order_gradient_estimate(batch_X, batch_y, w_old, epsilon)

    correction = grad_current - grad_snapshot
    return correction + full_grad

# Training parameters
learning_rate = 1e-3  # multiplied by 0.85 every 100 inner iterations below
num_iterations = 100  # inner iterations per epoch
num_epochs = 20  # Number of epochs for ZO-SVRG
batch_size = 1  # Batch size for ZO-SVRG
method = 'ZO-SVRG'

# This section only implements ZO-SVRG. Fail fast on any other setting rather
# than silently stepping along a stale gradient left by an earlier section.
if method != 'ZO-SVRG':
    raise ValueError(f"Unknown method: {method!r}")

# Restart from the shared initial weights
w = w_initial.copy()

# Per-iteration loss history
training_losses_svrg = []
test_losses_svrg = []

# Create the output directory before training: open(..., 'w') does not create
# missing parent directories, and failing only after the loop would discard
# the whole run.
os.makedirs('logistic_regression', exist_ok=True)

# Optimization loop
for epoch in range(num_epochs):
    # Snapshot: full-data gradient estimate at the current weights
    full_grad = zeroth_order_gradient_estimate(X_train, y_train, w)
    w_old = w.copy()

    for i in range(num_iterations):
        g = zeroth_order_svrg_gradient_estimate(X_train, y_train, w, w_old, full_grad, batch_size)

        # Gradient step, then evaluate the updated weights on both splits
        w = w - learning_rate * g
        train_loss = non_convex_loss(X_train, y_train, w)
        test_loss = non_convex_loss(X_test, y_test, w)
        if (epoch * num_iterations + i + 1) % 100 == 0:
            # Step-size decay: shrink the learning rate every 100 iterations
            learning_rate *= 0.85
            print(f"Epoch {epoch * num_iterations + i + 1}/{num_epochs * num_iterations}, Train Loss: {train_loss}, Test Loss: {test_loss}")

        # Record losses
        training_losses_svrg.append(train_loss)
        test_losses_svrg.append(test_loss)


# ZO-SVRG: save the loss histories, one value per CSV row
with open('logistic_regression/train_loss_svrg.csv', 'w', newline='') as f1:
    writer = csv.writer(f1)
    writer.writerows([[val] for val in training_losses_svrg])

with open('logistic_regression/test_loss_svrg.csv', 'w', newline='') as f2:
    writer = csv.writer(f2)
    writer.writerows([[val] for val in test_losses_svrg])


