import numpy as np 
import matplotlib.pyplot as plt 

from storm.datasets import *

from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

import sys
import random
import pickle
import os 
import time 

# --- Experiment configuration -------------------------------------------
# Paths handed to load(); the train and test paths currently name the same file.
datafile = 'data/australian.svm'
testfile = 'data/australian.svm'
# Alternative dataset:
# datafile = 'data/phishing.svm'
# testfile = 'data/phishing.svm'

# Total number of points to subsample at each step of the sweep:
# a few small sizes, then 100..650 in steps of 50.
# Alternative grids kept for reference:
# samples = [2,4,10,20,40,60,80,100,200,400,600,800,1000,2000]
#,750,800,850,900,950,1000,1500,2000]
samples = [2, 10, 20, 50] + list(range(100, 651, 50))

# Independent repetitions per sample size.
n_trials = 50

# Not read anywhere in this section of the script.
use_intercept = True

# --- Data ---------------------------------------------------------------
# x_test / y_test are loaded but not used in this section.
x, y, x_test, y_test = load(datafile, testfile)
# C0 / C1 are later sampled from as the two classes; x and y are replaced by
# whatever format_class returns for them.
C0, C1, x, y = format_class(x, y)

# N: number of rows of x, d: number of columns of x.
N, d = x.shape

# Result tables indexed by (sample-size index, trial index).
#   svm_accuracy: accuracy of the fitted classifier on the full set (x, y)
#   svm_sizes:    number of stored values in the subsample (rows * d)
# Rows stay 0 for sample sizes that are skipped because they exceed N.
svm_accuracy = np.zeros((len(samples),n_trials))
svm_sizes = np.zeros((len(samples),n_trials))

# Row counts of the two class matrices; invariant across the whole sweep.
# (The second one previously read shape[1], i.e. the feature count.)
N0 = C0.shape[0]
N1 = C1.shape[0]

# Ground-truth class membership by sign, so the comparison below works for
# both {-1,+1} and {0,1} label encodings.
# NOTE(review): assumes positive labels in y correspond to C1 - confirm
# against format_class.
y_is_positive = np.ravel(y) > 0

for j,sample_size in enumerate(samples):
	print(f"Sketching with {sample_size}")
	if sample_size > N:
		# Cannot draw more points than the dataset holds; leave this row at 0.
		continue

	# Balanced subsample: half of the budget from each class. Sampling is
	# without replacement, so the per-class count is capped at the class
	# size (np.random.choice would raise ValueError otherwise).
	half = sample_size//2
	n0 = min(half, N0)
	n1 = min(half, N1)
	if n0 < half or n1 < half:
		print(f"\tClass too small for {half} per class; using {n0} and {n1}")

	for i in range(n_trials):
		# Draw indices independently per class. Reusing one index array
		# (drawn from range(N0)) for both classes correlates the samples and
		# can index past the end of C1 when the classes differ in size.
		ind0 = np.random.choice(N0, size=n0, replace=False)
		ind1 = np.random.choice(N1, size=n1, replace=False)
		x_c0 = C0[ind0]
		x_c1 = C1[ind1]

		# Stack class 0 first (label -1), then class 1 (label +1).
		xt = np.vstack((x_c0,x_c1))
		yt = np.ones(x_c0.shape[0]+x_c1.shape[0])
		yt[0:x_c0.shape[0]] = -1

		# SGDClassifier with default settings (hinge loss) is a linear SVM
		# trained by stochastic gradient descent.
		start = time.time()
		clf = SGDClassifier(max_iter=1000, tol=1e-3)
		clf.fit(xt,yt)
		end = time.time()
		print(f"Training took {end-start}")
		sys.stdout.flush()

		# Fraction of correctly classified points of the full set. The former
		# 1 - sum(|y - pred|)/len(y) counted every +/-1 mismatch as 2 and so
		# reported 1 - 2*error_rate instead of the accuracy.
		pred = clf.predict(x)
		train_accuracy = np.mean((pred > 0) == y_is_positive)

		svm_accuracy[j,i] = train_accuracy
		svm_sizes[j,i] = xt.shape[0]*d
		print(f"\tAccuracy: {train_accuracy:.2f}")
		sys.stdout.flush()

# Output files are named after the dataset file: directory and extension are
# stripped, then a per-table suffix is appended. They are written to the
# current working directory as comma-separated text.
dataset_basename = os.path.basename(datafile)
name = os.path.splitext(dataset_basename)[0]
for suffix, table in (("-svm.results", svm_accuracy), ("-svm.sizes", svm_sizes)):
	np.savetxt(name + suffix, table, delimiter=',')



