import pdb
import numpy as np
import random
import pandas
from operator import itemgetter

def getApplicationData(dataSetName,nSamplePerClass=0):
	"""Load one of the supported datasets from disk.

	Parameters
	----------
	dataSetName : str
		Case-insensitive dataset name. Supported values are 'MNIST',
		'INDIAN PINE' / 'INDIANPINE' and 'FOREST COVER' / 'FORESTCOVER'.
	nSamplePerClass : int
		Only used for MNIST, where it is forwarded to makeStratifiedSubset
		for both the training and the test data.

	Returns
	-------
	trData, trLabels, tstData, tstLabels
		For Indian Pine and Forest Cover, trLabels is a column vector with
		0-based labels and tstData / tstLabels are empty lists.

	Raises
	------
	ValueError
		If dataSetName is not one of the supported names.

	NOTE(review): the MNIST branch relies on `sourcePath` and
	`makeStratifiedSubset`, neither of which is defined in this function;
	they are presumably provided elsewhere in the module - confirm.
	"""
	import pickle

	def _loadPickle(path):
		# Close the file handle deterministically instead of leaking it.
		# NOTE: pickle must only be used on trusted files; unpickling can
		# execute arbitrary code.
		with open(path,'rb') as fh:
			return pickle.load(fh)

	name = dataSetName.upper()

	if name == 'MNIST':
		mnistDir = sourcePath+'PhD/Dataset/MNIST/'
		trData = _loadPickle(mnistDir+'MNISTTrainData_Org.p')
		trLabels = _loadPickle(mnistDir+'MNISTTrainLabel_Org.p')
		# Labels are appended as the last column before subsetting.
		lTrData = np.hstack((trData,trLabels))
		trData,trLabels = makeStratifiedSubset(lTrData,nSamplePerClass)

		tstData = _loadPickle(mnistDir+'MNISTTestData_Org.p')
		tstLabels = _loadPickle(mnistDir+'MNISTTestLabel_Org.p')
		lTstData = np.hstack((tstData,tstLabels))
		tstData,tstLabels = makeStratifiedSubset(lTstData,nSamplePerClass)

	elif name in ('INDIAN PINE','INDIANPINE'):
		from scipy.io import loadmat
		D = loadmat('DataFile/IndianPine/Indian_pines.mat')['indian_pines']
		L = loadmat('DataFile/IndianPine/Indian_pines_gt.mat')['indian_pines_gt']
		dataParts,labelParts = [],[]
		for i in range(1,17):#loop through each class (ground-truth values 1..16)
			rows,cols = np.where(L == i)
			# One row per pixel of this class, one column per band.
			dataParts.append(D[rows,cols,:])
			labelParts.append((i-1)*np.ones(len(rows))) #making labels start from 0
		trData = np.vstack(dataParts)
		trLabels = np.hstack(labelParts).reshape(-1,1)
		tstData,tstLabels = [],[]

	elif name in ('FOREST COVER','FORESTCOVER'):
		dataFile = 'DataFile/ForestCover/covtype.data'
		D = np.array(pandas.read_csv(dataFile,delimiter=',',header=None))
		trData = D[:,:-1]
		# Last column holds the label; shift it down by one.
		trLabels = (D[:,-1] - 1).reshape(-1,1)
		tstData,tstLabels = [],[]

	else:
		# Previously this fell through to the return statement and failed
		# with an unhelpful UnboundLocalError.
		raise ValueError('Unknown dataset name: %r' % (dataSetName,))

	return trData,trLabels,tstData,tstLabels

def splitData_n(data,labels,nTrData):
	"""Randomly split a dataset into a training and a test part.

	The row order is shuffled with np.random; the first nTrData shuffled
	rows form the training set and the remaining rows the test set. Each
	part is then ordered by ascending label.

	Parameters
	----------
	data : 2-D array, one row per sample.
	labels : array with one label per row of data (flat or column vector).
	nTrData : int, number of rows to put in the training set.

	Returns
	-------
	trData, trLabels, tstData, tstLabels
		Labels are returned as column vectors. Data and labels pass through
		a common np.hstack, so they share the resulting common dtype.
		An empty part is returned as arrays with zero rows.
	"""
	dataDim = np.shape(data)[1]
	indices = np.arange(len(data))
	np.random.shuffle(indices)

	def _labelSortedPart(partIdx):
		# Glue the labels on as the last column so rows and labels are
		# reordered together.
		part = np.hstack((data[partIdx,:],labels[partIdx].reshape(-1,1)))
		# A stable argsort keeps the array 2-D even when the part is empty,
		# and keeps the shuffled order among rows with equal labels.
		part = part[np.argsort(part[:,dataDim],kind='stable')]
		return part[:,:-1],part[:,-1].reshape(-1,1)

	trData,trLabels = _labelSortedPart(indices[:nTrData])
	tstData,tstLabels = _labelSortedPart(indices[nTrData:])
	return trData,trLabels,tstData,tstLabels

def standardizeData(data,mu=[],std=[]):
	"""Standardize the columns of data to zero mean and unit deviation.

	data is an m x n matrix (m observations, n features).

	When both mu and std are supplied (non-empty), they are applied as-is
	and only the standardized data is returned. Otherwise the column-wise
	mean and standard deviation are computed from data and the tuple
	(mu, std, standardizedData) is returned.
	"""
	if len(mu) and len(std):
		# Caller supplied the statistics: just apply them.
		return (data - mu)/std

	mu = np.mean(data,axis=0)
	std = np.std(data,axis=0)
	# Constant features have zero deviation; use 1.0 to avoid dividing by zero.
	constantFeatures = np.where(std==0)[0]
	std[constantFeatures] = 1.0
	scaled = (data - mu)/std
	return mu,std,scaled
		
def unStandardizeData(data,mu,std):
	"""Undo standardization: scale data by std, then shift it by mu."""
	rescaled = std * data
	return rescaled + mu

def splitData(labeled_data,split_ratio):
	"""Split labeled data into training and test sets, class by class.

	Parameters
	----------
	labeled_data : 2-D array whose last column is the numeric class label.
	split_ratio : float, expected in [0, 1]. For every class,
		floor(classSize * split_ratio) rows go to the training set and the
		remaining rows of that class go to the test set.

	Returns
	-------
	train_set, test_set : arrays in the same column layout as labeled_data,
		with classes stacked in ascending label order. Rows inside each
		class are shuffled with np.random so every run gives a different
		split. If labeled_data has no rows, two empty lists are returned.
	"""
	sorted_data = labeled_data[np.argsort(labeled_data[:,-1])]#sorting based on the numeric label.
	class_column = sorted_data[:,-1]
	train_parts,test_parts = [],[]
	for cls in np.unique(class_column):
		# Boolean indexing returns a copy, so shuffling does not touch sorted_data.
		temp_class = sorted_data[class_column == cls]
		np.random.shuffle(temp_class)#Shuffle the data so that we'll get variation in each run
		n_train = int(np.floor(len(temp_class)*split_ratio))
		train_parts.append(temp_class[:n_train])
		# Slice from n_train onwards rather than with a negative count:
		# temp_class[-0:] would select the whole class when no test rows
		# are left, leaking every training row into the test set.
		test_parts.append(temp_class[n_train:])

	if not train_parts:
		return [],[]
	return np.vstack(train_parts),np.vstack(test_parts)
	
def topNFeaturePruning(W,T=100):
	"""Rank feature indices by how often they occur across several runs.

	Parameters
	----------
	W : list of lists. Each inner list holds the feature indices selected by
		one run of a feature extractor; the inner lists may differ in length.
	T : number of top features to return. T == -1 returns all features.

	Returns
	-------
	features : 1-D array of feature indices, most frequent first.
	orderedFCnt : 2-D array with one (feature index, count) row per returned
		feature, in the same order. Features with equal counts keep the
		order produced by np.unique.
		When W contains no feature index at all, empty arrays of shape
		(0,) and (0, 2) are returned.
	"""
	emptyResult = (np.empty(0,dtype=int),np.empty((0,2),dtype=int))
	if len(W) == 0:
		return emptyResult
	W_flat = np.hstack(W)
	if W_flat.size == 0:
		return emptyResult

	# One pass over the data gives every distinct feature and its count.
	uniqueW,counts = np.unique(W_flat,return_counts=True)

	#dictionary of feature index -> number of occurrences (count)
	fCnt = {}
	for f,cnt in zip(uniqueW,counts):
		fCnt[int(float(f))] = int(cnt)

	#arrange the features in descending order of count; sorted() is stable,
	#so ties keep their insertion order.
	orderedFCnt = np.array(sorted(fCnt.items(), key=lambda k:k[1],reverse=True))

	if T == -1: #return the entire ordered list of features
		return orderedFCnt[:,0], orderedFCnt
	#otherwise take the top 'T' features
	return orderedFCnt[:T,0], np.array(orderedFCnt[:T])

def returnImpFeaturesElbow(W):
	"""Select the most important features with an elbow search.

	The absolute weights are sorted in descending order and treated as points
	(rank, weight) in the plane. A straight line is drawn through the first
	and the last point; the elbow is the point farthest from that line.

	Parameters
	----------
	W : 1-D array-like of feature weights.

	Returns
	-------
	indices : indices into W of the features up to and including the elbow,
		ordered by descending absolute weight.
	weights : the corresponding absolute weights.
		With fewer than two weights there is no line to draw, so all of the
		(zero or one) features are returned.
	"""
	#sort the weights in descending order on the absolute values
	negAbsW = (-1)*np.abs(W)
	sortedW = (-1)*np.sort(negAbsW)
	sortedIndices = np.argsort(negAbsW)

	nWeights = len(sortedW)
	if nWeights < 2:
		# The slope below would be 0/0 for a single point.
		return sortedIndices,sortedW

	#x values are the ranks 0..n-1, y values are the sorted weights
	X = np.arange(nWeights)

	#straight line through the first and last point: m*x - y + c = 0
	m = (sortedW[0] - sortedW[-1])/(X[0] - X[-1])
	c = sortedW[0] - m * X[0]

	#distance of a point (x_0,y_0) from the line A*x + B*y + C = 0 is
	# |A*x_0 + B*y_0 + C|/sqrt(A^2 + B^2), here with A = m, B = -1, C = c
	dists = np.abs(m*X - sortedW + c)/np.sqrt(1+m**2)

	#the elbow is the first point with the maximum distance
	maxDistIndex = int(np.argmax(dists))
	return sortedIndices[:maxDistIndex+1],sortedW[:maxDistIndex+1]

def MSE(orgData,regenData):
	"""Return the average squared reconstruction error per sample.

	orgData and regenData are n x m matrices (n points, m features). For
	every row the squared Euclidean norm of the residual is computed; the
	result is the sum of those values divided by the number of rows.
	"""
	nRows = np.shape(orgData)[0]

	def _rowSquaredError(rowIdx):
		diff = orgData[rowIdx,:] - regenData[rowIdx,:]
		return np.dot(diff,diff.T)

	total = sum(_rowSquaredError(rowIdx) for rowIdx in range(nRows))
	return(total/nRows)
	
def calcCentroid(data,label):
	"""Return the per-class mean of the rows of data.

	One centroid is computed for every distinct value in label, in the order
	given by np.unique; the centroids are stacked row-wise into one array.
	"""
	classMeans = [
		np.mean(data[np.where(cls==label)[0],:],axis=0)
		for cls in np.unique(label)
	]
	return np.vstack(classMeans)
