
import os
# NOTE(review): this upgrades scikit-learn through the shell every time the
# module is executed or imported. It invokes whichever `pip` is first on PATH,
# which is not guaranteed to belong to the interpreter running this script, and
# the exit status is ignored. Consider moving this to environment setup.
os.system("pip install --upgrade scikit-learn")

import sys

# Adds the relative path '..' to the import search path. The entry is resolved
# against the current working directory, not this file's location.
# Presumably needed so that `dnamite` can be imported from a sibling checkout --
# TODO confirm.
PACKAGE_PARENT = '..'
sys.path.append(PACKAGE_PARENT)

import torch
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

from dnamite.models import DNAMiteRegressor

import time
import sklearn
import argparse
from sklearn.datasets import load_diabetes
import pandas as pd

def normalize(x):
    """Min-max scale every column of ``x`` along dim 0.

    Each column is shifted by its minimum and divided by its range
    (max - min), so columns with a non-zero range end up spanning [0, 1].
    Columns whose values are all identical have a zero range; they are
    mapped to 0 instead of producing NaN from a 0/0 division.

    Args:
        x: A torch tensor with at least one row; statistics are taken
            over dim 0.

    Returns:
        A tensor of the same shape as ``x`` holding the scaled values.
    """
    x_min = x.min(dim=0, keepdim=True).values
    x_max = x.max(dim=0, keepdim=True).values

    span = x_max - x_min
    # A constant column has span == 0; divide by 1 there so the result is 0
    # (x - x_min is already 0) rather than NaN.
    span = torch.where(span == 0, torch.ones_like(span), span)

    return (x - x_min) / span

import h5py
def _rmse(pred, target):
    """Return the root-mean-squared error between two array-likes.

    Both inputs are flattened first so that an (n,) prediction compared with
    an (n, 1) target does not broadcast to an (n, n) matrix.
    """
    pred = np.asarray(pred).reshape(-1)
    target = np.asarray(target).reshape(-1)
    return float(np.sqrt(np.mean(np.square(pred - target))))


def _fit_and_report(case, X_train, y_train, X_test, y_test):
    """Fit a fresh DNAMiteRegressor, print the test RMSE for ``case`` and return it."""
    # A new estimator per dataset: whether DNAMiteRegressor.fit fully resets
    # previously learned state is not visible from this file, so no instance
    # is reused across datasets.
    model = DNAMiteRegressor()
    model.fit(pd.DataFrame(X_train), np.asarray(y_train).reshape(-1))
    pred_test = model.predict(pd.DataFrame(X_test))

    rmse = _rmse(pred_test, y_test)
    print(f"Case: {case} | RMSE: {rmse:.4f}")
    return rmse


def _load_image_data(root):
    """Load the 'Image' experiment: response of one ROI column vs. image features.

    Returns:
        ``(X_train, y_train, X_test, y_test)`` as float32 NumPy arrays.
    """
    sub_sample = 300   # number of leading rows kept from each split
    sub_feature = 1800  # number of leading feature columns kept
    v1_idx = 810        # position among the columns where roiS1 == 1

    with h5py.File(root + 'EstimatedResponses.mat', 'r') as f:
        resp_train = f['dataTrnS1'][:]
        resp_valid = f['dataValS1'][:]
        roi = f['roiS1'][:]

    # Column of the response matrices that is used as the regression target.
    target_col = np.where(roi == 1)[1][v1_idx]

    X_train = np.load(root + 'complex_cell_train.npy')[:sub_sample, :sub_feature]
    X_test = np.load(root + 'complex_cell_valid.npy')[:sub_sample, :sub_feature]

    y_train = resp_train[:sub_sample, target_col]
    y_test = resp_valid[:sub_sample, target_col]

    return (
        X_train.astype(np.float32),
        y_train.astype(np.float32),
        X_test.astype(np.float32),
        y_test.astype(np.float32),
    )


def _load_diabetes_data():
    """Load sklearn's diabetes data padded with 40 uniform-noise features.

    The data is split 80/20, then the 20% hold-out is halved; only the second
    half is used as the test set (the other half is discarded).

    Returns:
        ``(X_train, y_train, X_test, y_test)`` as float32 NumPy arrays.
    """
    diabetes = load_diabetes()

    noise = np.random.uniform(low=-1, high=1, size=(diabetes.data.shape[0], 40))
    X = np.concatenate([diabetes.data, noise], axis=1).astype(np.float32)
    y = diabetes.target.astype(np.float32)

    X_train, X_hold, y_train, y_hold = train_test_split(
        X, y, test_size=0.2, random_state=42
    )
    _, X_test, _, y_test = train_test_split(
        X_hold, y_hold, test_size=0.5, random_state=42
    )
    return X_train, y_train, X_test, y_test


def _load_chip_data(root):
    """Load the 'Chip' experiment padded with 21 uniform-noise features.

    Only the first 150 training rows are kept, and a third of those is then
    split off and discarded, so the model trains on the remaining two thirds.

    Returns:
        ``(X_train, y_train, X_test, y_test)`` as NumPy arrays.
    """
    chip = torch.load(root + 'Chip_dict.pt', weights_only=True)
    X_train = chip['Xtrain'].numpy()
    y_train = chip['ytrain'].numpy().reshape(-1)
    X_test = chip['Xtest'].numpy()
    y_test = chip['ytest'].numpy().reshape(-1)

    print("Down sampling on training data ... ")
    subsize = 150
    X_train, y_train = X_train[:subsize, :], y_train[:subsize]

    # Noise for the training rows is drawn before the test rows, matching the
    # order of draws from NumPy's global random state.
    noise_train = np.random.uniform(low=-1, high=1, size=(X_train.shape[0], 21))
    X_train = np.concatenate([X_train, noise_train], axis=1).astype(np.float32)
    y_train = y_train.astype(np.float32)

    X_train, _, y_train, _ = train_test_split(
        X_train, y_train, test_size=0.333, random_state=42
    )

    noise_test = np.random.uniform(low=-1, high=1, size=(X_test.shape[0], 21))
    # Cast to float32 so test features share the training features' dtype.
    X_test = np.concatenate([X_test, noise_test], axis=1).astype(np.float32)

    return X_train, y_train, X_test, y_test


def _load_csv_case(root, case):
    """Read ``<case>_Train.csv`` / ``<case>_Test.csv`` and split off the 'y' column.

    Returns:
        ``(X_train, y_train, X_test, y_test)`` as pandas objects.
    """
    train = pd.read_csv(root + case + '_Train.csv')
    test = pd.read_csv(root + case + '_Test.csv')
    y_train = train.pop('y')
    y_test = test.pop('y')
    return train, y_train, test, y_test


def main():
    """Train DNAMiteRegressor on the dataset chosen by ``--data_name`` and print test RMSE.

    ``--data_name`` may be 'Image', 'Diabete' or 'Chip'; any other value
    (including the default "None") runs the wine / bike / ca / fico CSV
    benchmarks one after another.
    """
    parser = argparse.ArgumentParser(description="Train a model")
    parser.add_argument('--data_name', default="None", type=str, help='type of data')
    args = parser.parse_args()

    path = '/home/users/yhung7/SDAM/Dataset/Real-Data-Application/'

    if args.data_name == 'Image':
        _fit_and_report(args.data_name, *_load_image_data(path))

    elif args.data_name == 'Diabete':
        _fit_and_report(args.data_name, *_load_diabetes_data())

    elif args.data_name == 'Chip':
        _fit_and_report(args.data_name, *_load_chip_data(path))

    else:
        ##################################################
        #              Wine/ Bike/ CA/ FICO              #
        ##################################################
        cases = ['wine', 'bike', 'ca', 'fico']
        path = '/home/users/yhung7/SDAM/Additional_exp/'
        for case in cases:
            _fit_and_report(case, *_load_csv_case(path, case))
			
# Run the experiment only when this file is executed as a script.
if __name__ == "__main__":
    main()


