import numpy as np
import tensorflow as tf
from tensorflow.keras import Model
from tensorflow.keras.layers import Dense
import tensorflow_probability as tfp

class Generator(Model):
	'''
	Fully connected network: a stack of hidden Dense layers followed by a
	linear Dense output layer. The trainers in this file apply a softmax to
	its output, i.e. they treat the outputs as action logits.
	'''
	def __init__(self, num_layers=4, hidden_neurons=100, output_neurons=10, activation='relu'):
		'''
		Initializer of the Generator class. Input parameters:
		 - num_layers: Number of hidden layers of the generator
		 - hidden_neurons: Number of hidden neurons in each layer of the generator
		 - output_neurons: Number of neurons in the output layer
		 - activation: Activation function of the hidden layers.
		'''
		super(Generator, self).__init__()

		# Keep the hyper-parameters; output_neurons is read by the trainers
		# in this file to build one-hot action vectors
		self.num_layers = num_layers
		self.hidden_neurons = hidden_neurons
		self.output_neurons = output_neurons
		self.activation = activation

		# Create the hidden layers
		self.hidden_layers = [Dense(hidden_neurons, activation=activation)
							  for _ in range(num_layers)]

		# Create the output layer (no activation: raw scores)
		self.final_layer = Dense(output_neurons)

	def call(self, inputs):
		'''
		Forward pass executed when the model is called.

		The inputs go through every hidden layer in order and then through
		the output layer. With num_layers=0 the inputs are fed straight to
		the output layer.

		Inputs:
		 - inputs: batch of inputs for the first layer
		Returns the output of the final Dense layer.
		'''
		x = inputs

		# Every hidden layer must take part in the forward pass; the previous
		# range(1, num_layers - 1) loop left the last one unused
		for layer in self.hidden_layers:
			x = layer(x)

		return self.final_layer(x)

class Regressor(Model):
	'''
	Fully connected network with a single sigmoid output unit: a stack of
	hidden Dense layers followed by Dense(1, activation='sigmoid').
	'''
	def __init__(self, num_layers=4, hidden_neurons=100, activation='relu'):
		'''
		Initializer of the Regressor class. Input parameters:
		 - num_layers: Number of hidden layers of the regressor
		 - hidden_neurons: Number of hidden neurons in each hidden layer
		 - activation: Activation function of the hidden layers.

		The output layer always has one neuron (stored as output_neurons).
		'''
		super(Regressor, self).__init__()

		# Define the parameters
		self.num_layers = num_layers
		self.hidden_neurons = hidden_neurons
		self.output_neurons = 1
		self.activation = activation

		# Create the hidden layers
		self.hidden_layers = [Dense(hidden_neurons, activation=activation)
							  for _ in range(num_layers)]

		# Create the output layer; the sigmoid bounds predictions to (0, 1)
		self.final_layer = Dense(self.output_neurons, activation = 'sigmoid')

	def call(self, inputs):
		'''
		Forward pass executed when the model is called.

		The inputs go through every hidden layer in order and then through
		the sigmoid output layer. With num_layers=0 the inputs are fed
		straight to the output layer.

		Inputs:
		 - inputs: batch of inputs for the first layer
		Returns the output of the final Dense layer.
		'''
		x = inputs

		# Every hidden layer must take part in the forward pass; the previous
		# range(1, num_layers - 1) loop left the last one unused
		for layer in self.hidden_layers:
			x = layer(x)

		return self.final_layer(x)

class WCE_CRM():
	'''
	Trainer that updates a policy network with a mix of two gradients: the
	clipped importance-weighted risk on the labeled log S and a weighted
	cross entropy (WCE) term on the unlabeled log S_u.
	'''
	def __init__(self, generator):
		'''
		This class receives as an input:

		- generator: Initial generator distribution pi_{theta_0} (a | x).
					 It is called on a batch of contexts and its output is
					 passed through a softmax; it must expose the attributes
					 `output_neurons` and `trainable_weights`.
		'''

		self.generator = generator

	def sample_S(self, S, m_instances=100):
		'''
		Draw at most m_instances rows of S at random, without replacement.

		The last three columns of S are read as (action, p, r) and all the
		preceding columns as the context features.

		Returns the float32 tensors x, a (one-hot over
		generator.output_neurons), p and r; p and r keep a trailing axis of
		size one.
		'''
		chosen = np.random.permutation(S.shape[0])[0:m_instances]
		batch = S[chosen]

		x = tf.cast(batch[:,0:-3], tf.float32)
		# The action column is sliced as (rows, 1), so one_hot yields
		# (rows, 1, depth); the squeeze removes the middle axis
		a = tf.cast(
			tf.squeeze(tf.one_hot(batch[:,-3:-2], depth=self.generator.output_neurons), axis=1),
			tf.float32)
		p = tf.cast(batch[:, -2:-1], tf.float32)
		r = tf.cast(batch[:, -1:], tf.float32)

		return x, a, p, r

	def sample_Su(self, S_u, m_instances):
		'''
		Draw at most m_instances rows of S_u at random, without replacement.

		The last two columns of S_u are read as (action, p) and all the
		preceding columns as the context features.

		Returns the float32 tensors x, a (one-hot) and p.
		'''
		chosen = np.random.permutation(S_u.shape[0])[0:m_instances]
		batch = S_u[chosen]

		x = tf.cast(batch[:,0:-2], tf.float32)
		a = tf.cast(
			tf.squeeze(tf.one_hot(batch[:,-2:-1], depth=self.generator.output_neurons),axis=1),
			tf.float32)
		p = tf.cast(batch[:, -1:], tf.float32)

		return x, a, p

	def _action_probability(self, x, a):
		'''
		Probability that the current policy assigns to the one-hot actions a
		given the contexts x, with a trailing axis of size one.
		'''
		return tf.reduce_sum(tf.multiply(self.get_policy(x), a),
							 axis=-1, keepdims=True)

	def train(self, S, S_u, alpha=0, beta_g=.001, M=100, m_instances=100,
			  n_instances=100, tau=0.001, zeta=.001):
		'''
		Train the generator with Adam. Every update applies the gradient
			alpha * grad(risk) + (1 - alpha) * grad(wce)
		where
			risk = mean(r * pi_theta(a|x) / max(zeta, p))   on a batch of S
			wce  = -mean(max(tau, p) * log(pi_theta(a|x)))  on a batch of S_u
		Inputs:

		- S = ((x_i, a_i, p_i, r_i)) with i=1...n sampled from pi_0
		- S_u = ((x_j, a_j, p_j)) with j=1...n sampled from pi_0
		- alpha: relative weight of the gradient of the risk; the gradient of
				 the cross entropy gets 1 - alpha. With the default 0 only
				 the cross entropy drives the update.
		- beta_g: Learning rate of the generator
		- M: number of updates (epochs); each one draws fresh batches
		- m_instances: Number of instances sampled from S_u per update
		- n_instances: Number of instances sampled from S per update
		- tau: lower bound applied to p in the cross entropy weight
		- zeta: lower bound applied to p in the importance weight denominator
		'''

		optimizer_generator = tf.keras.optimizers.Adam(beta_g)

		print('Total instances in the dataset')
		print(S.shape[0])
		print(S_u.shape[0])

		epoch = 0
		# Strict comparison: exactly M updates (the former `<=` ran M + 1)
		while epoch < M:
			epoch += 1

			# Using the labeled dataset
			# ------------------------------------------------
			x, a, p, r = self.sample_S(S, n_instances)
			with tf.GradientTape() as tape:
				pi_theta_action = self._action_probability(x, a)
				factor = tf.divide(pi_theta_action, tf.math.maximum(zeta, p))
				risk = tf.reduce_mean(tf.multiply(r, factor))

			g_1 = tape.gradient(risk, self.generator.trainable_weights)

			# Using the unlabeled dataset
			# ------------------------------------------------
			x, a, p = self.sample_Su(S_u, m_instances)
			with tf.GradientTape() as tape2:
				pi_theta_action = self._action_probability(x, a)
				# The small constant keeps the log finite at probability 0
				log_pi_theta_action = tf.math.log(pi_theta_action+1E-7)
				max_term = tf.math.maximum(tau, p)
				wce = -tf.reduce_mean(tf.multiply(max_term, log_pi_theta_action))

			g_2 = tape2.gradient(wce, self.generator.trainable_weights)

			# Compute the compound gradient, weight by weight
			g_theta = [alpha*element_1 + (1.0-alpha)*element_2
					   for element_1, element_2 in zip(g_1, g_2)]

			#Update the generator
			optimizer_generator.apply_gradients(zip(g_theta, self.generator.trainable_weights))
			print('Epoch: ', epoch, 'Loss: ', np.round((1.0-alpha)*wce + alpha*risk,3), end='\r')

	def get_policy(self, x):
		'''
		This function returns a policy for each context in x.
		Inputs:
		 - x: Array of num_instances x num_features
		Returns the softmax of the generator output for every row of x.
		'''
		x_tensor = tf.convert_to_tensor(x, dtype=tf.float32)
		logits = self.generator(x_tensor)

		return tf.nn.softmax(logits)

class KL_CRM():
	'''
	Trainer that updates a policy network with a mix of two gradients: the
	clipped importance-weighted risk on the labeled log S and a KL-style
	term between the policy and the logged propensities on the unlabeled
	log S_u.
	'''
	def __init__(self, generator):
		'''
		This class receives as an input:

		- generator: Initial generator distribution pi_{theta_0} (a | x).
					 It is called on a batch of contexts and its output is
					 passed through a softmax; it must expose the attributes
					 `output_neurons` and `trainable_weights`.
		'''

		self.generator = generator

	def sample_S(self, S, m_instances=100):
		'''
		Draw at most m_instances rows of S at random, without replacement.

		The last three columns of S are read as (action, p, r) and all the
		preceding columns as the context features.

		Returns the float32 tensors x, a (one-hot over
		generator.output_neurons), p and r; p and r keep a trailing axis of
		size one.
		'''
		chosen = np.random.permutation(S.shape[0])[0:m_instances]
		batch = S[chosen]

		x = tf.cast(batch[:,0:-3], tf.float32)
		# The action column is sliced as (rows, 1), so one_hot yields
		# (rows, 1, depth); the squeeze removes the middle axis
		a = tf.cast(
			tf.squeeze(tf.one_hot(batch[:,-3:-2], depth=self.generator.output_neurons), axis=1),
			tf.float32)
		p = tf.cast(batch[:, -2:-1], tf.float32)
		r = tf.cast(batch[:, -1:], tf.float32)

		return x, a, p, r

	def sample_Su(self, S_u, m_instances):
		'''
		Draw at most m_instances rows of S_u at random, without replacement.

		The last two columns of S_u are read as (action, p) and all the
		preceding columns as the context features.

		Returns the float32 tensors x, a (one-hot) and p.
		'''
		chosen = np.random.permutation(S_u.shape[0])[0:m_instances]
		batch = S_u[chosen]

		x = tf.cast(batch[:,0:-2], tf.float32)
		a = tf.cast(
			tf.squeeze(tf.one_hot(batch[:,-2:-1], depth=self.generator.output_neurons),axis=1),
			tf.float32)
		p = tf.cast(batch[:, -1:], tf.float32)

		return x, a, p

	def _action_probability(self, x, a):
		'''
		Probability that the current policy assigns to the one-hot actions a
		given the contexts x, with a trailing axis of size one.
		'''
		return tf.reduce_sum(tf.multiply(self.get_policy(x), a),
							 axis=-1, keepdims=True)

	def train(self, S, S_u, alpha=0, beta_g=.001, M=100, m_instances=100,
			  n_instances=100, tau=0.001, zeta=.001):
		'''
		Train the generator with Adam. Every update applies the gradient
			alpha * grad(risk) + (1 - alpha) * grad(kl)
		where, writing pi for pi_theta(a|x),
			risk = mean(r * pi / max(zeta, p))        on a batch of S
			kl   = mean(pi * log(pi / max(tau, p)))   on a batch of S_u
		Inputs:

		- S = ((x_i, a_i, p_i, r_i)) with i=1...n sampled from pi_0
		- S_u = ((x_j, a_j, p_j)) with j=1...n sampled from pi_0
		- alpha: relative weight of the gradient of the risk; the gradient of
				 the KL term gets 1 - alpha. With the default 0 only the KL
				 term drives the update.
		- beta_g: Learning rate of the generator
		- M: number of updates (epochs); each one draws fresh batches
		- m_instances: Number of instances sampled from S_u per update
		- n_instances: Number of instances sampled from S per update
		- tau: lower bound applied to p inside the KL term
		- zeta: lower bound applied to p in the importance weight denominator
		'''

		optimizer_generator = tf.keras.optimizers.Adam(beta_g)

		print('Total instances in the dataset')
		print(S.shape[0])
		print(S_u.shape[0])

		epoch = 0
		# Strict comparison: exactly M updates (the former `<=` ran M + 1)
		while epoch < M:
			epoch += 1

			# Using the labeled dataset
			# ------------------------------------------------
			x, a, p, r = self.sample_S(S, n_instances)
			with tf.GradientTape() as tape:
				pi_theta_action = self._action_probability(x, a)
				factor = tf.divide(pi_theta_action, tf.math.maximum(zeta, p))
				risk = tf.reduce_mean(tf.multiply(r, factor))

			g_1 = tape.gradient(risk, self.generator.trainable_weights)

			# Using the unlabeled dataset
			# ------------------------------------------------
			x, a, p = self.sample_Su(S_u, m_instances)
			with tf.GradientTape() as tape2:
				pi_theta_action = self._action_probability(x, a)
				term = tf.divide(pi_theta_action, tf.math.maximum(tau, p))
				# The small constant keeps the log finite at probability 0
				log_term = tf.math.log(term + 1E-14)
				kl = tf.reduce_mean(tf.multiply(pi_theta_action, log_term))

			g_2 = tape2.gradient(kl, self.generator.trainable_weights)

			# Compute the compound gradient, weight by weight
			g_theta = [alpha*element_1 + (1.0-alpha)*element_2
					   for element_1, element_2 in zip(g_1, g_2)]

			#Update the generator
			optimizer_generator.apply_gradients(zip(g_theta, self.generator.trainable_weights))
			print('Epoch: ', epoch, 'Loss: ', np.round((1.0-alpha)*kl + alpha*risk,3), end='\r')

	def get_policy(self, x):
		'''
		This function returns a policy for each context in x.
		Inputs:
		 - x: Array of num_instances x num_features
		Returns the softmax of the generator output for every row of x.
		'''
		x_tensor = tf.convert_to_tensor(x, dtype=tf.float32)
		logits = self.generator(x_tensor)

		return tf.nn.softmax(logits)

class TV_CRM():
	'''
	Trainer that updates a policy network with a mix of two gradients: the
	clipped importance-weighted risk on the labeled log S and a
	total-variation-style term (mean absolute difference between the logged
	propensity and the policy probability) on the unlabeled log S_u.
	'''
	def __init__(self, generator):
		'''
		This class receives as an input:

		- generator: Initial generator distribution pi_{theta_0} (a | x).
					 It is called on a batch of contexts and its output is
					 passed through a softmax; it must expose the attributes
					 `output_neurons` and `trainable_weights`.
		'''

		self.generator = generator

	def sample_S(self, S, m_instances=100):
		'''
		Draw at most m_instances rows of S at random, without replacement.

		The last three columns of S are read as (action, p, r) and all the
		preceding columns as the context features.

		Returns the float32 tensors x, a (one-hot over
		generator.output_neurons), p and r; p and r keep a trailing axis of
		size one.
		'''
		chosen = np.random.permutation(S.shape[0])[0:m_instances]
		batch = S[chosen]

		x = tf.cast(batch[:,0:-3], tf.float32)
		# The action column is sliced as (rows, 1), so one_hot yields
		# (rows, 1, depth); the squeeze removes the middle axis
		a = tf.cast(
			tf.squeeze(tf.one_hot(batch[:,-3:-2], depth=self.generator.output_neurons), axis=1),
			tf.float32)
		p = tf.cast(batch[:, -2:-1], tf.float32)
		r = tf.cast(batch[:, -1:], tf.float32)

		return x, a, p, r

	def sample_Su(self, S_u, m_instances):
		'''
		Draw at most m_instances rows of S_u at random, without replacement.

		The last two columns of S_u are read as (action, p) and all the
		preceding columns as the context features.

		Returns the float32 tensors x, a (one-hot) and p.
		'''
		chosen = np.random.permutation(S_u.shape[0])[0:m_instances]
		batch = S_u[chosen]

		x = tf.cast(batch[:,0:-2], tf.float32)
		a = tf.cast(
			tf.squeeze(tf.one_hot(batch[:,-2:-1], depth=self.generator.output_neurons),axis=1),
			tf.float32)
		p = tf.cast(batch[:, -1:], tf.float32)

		return x, a, p

	def _action_probability(self, x, a):
		'''
		Probability that the current policy assigns to the one-hot actions a
		given the contexts x, with a trailing axis of size one.
		'''
		return tf.reduce_sum(tf.multiply(self.get_policy(x), a),
							 axis=-1, keepdims=True)

	def train(self, S, S_u, alpha=0, beta_g=.001, M=100, m_instances=100,
			  n_instances=100, tau=0.001, zeta=.001):
		'''
		Train the generator with Adam. Every update applies the gradient
			alpha * grad(risk) + (1 - alpha) * grad(tv)
		where, writing pi for pi_theta(a|x),
			risk = mean(r * pi / max(zeta, p))   on a batch of S
			tv   = mean(|max(tau, p) - pi|)      on a batch of S_u
		Inputs:

		- S = ((x_i, a_i, p_i, r_i)) with i=1...n sampled from pi_0
		- S_u = ((x_j, a_j, p_j)) with j=1...n sampled from pi_0
		- alpha: relative weight of the gradient of the risk; the gradient of
				 the TV term gets 1 - alpha. With the default 0 only the TV
				 term drives the update.
		- beta_g: Learning rate of the generator
		- M: number of updates (epochs); each one draws fresh batches
		- m_instances: Number of instances sampled from S_u per update
		- n_instances: Number of instances sampled from S per update
		- tau: lower bound applied to p inside the TV term
		- zeta: lower bound applied to p in the importance weight denominator
		'''

		optimizer_generator = tf.keras.optimizers.Adam(beta_g)

		print('Total instances in the dataset')
		print(S.shape[0])
		print(S_u.shape[0])

		epoch = 0
		# Strict comparison: exactly M updates (the former `<=` ran M + 1)
		while epoch < M:
			epoch += 1

			# Using the labeled dataset
			# ------------------------------------------------
			x, a, p, r = self.sample_S(S, n_instances)
			with tf.GradientTape() as tape:
				pi_theta_action = self._action_probability(x, a)
				factor = tf.divide(pi_theta_action, tf.math.maximum(zeta, p))
				risk = tf.reduce_mean(tf.multiply(r, factor))

			g_1 = tape.gradient(risk, self.generator.trainable_weights)

			# Using the unlabeled dataset
			# ------------------------------------------------
			x, a, p = self.sample_Su(S_u, m_instances)
			with tf.GradientTape() as tape2:
				pi_theta_action = self._action_probability(x, a)
				max_term = tf.math.maximum(tau, p)
				tv = tf.reduce_mean(tf.math.abs(tf.subtract(max_term, pi_theta_action)))

			g_2 = tape2.gradient(tv, self.generator.trainable_weights)

			# Compute the compound gradient, weight by weight
			g_theta = [alpha*element_1 + (1.0-alpha)*element_2
					   for element_1, element_2 in zip(g_1, g_2)]

			#Update the generator
			optimizer_generator.apply_gradients(zip(g_theta, self.generator.trainable_weights))
			print('Epoch: ', epoch, 'Loss: ', np.round((1.0-alpha)*tv + alpha*risk,3), end='\r')

	def get_policy(self, x):
		'''
		This function returns a policy for each context in x.
		Inputs:
		 - x: Array of num_instances x num_features
		Returns the softmax of the generator output for every row of x.
		'''
		x_tensor = tf.convert_to_tensor(x, dtype=tf.float32)
		logits = self.generator(x_tensor)

		return tf.nn.softmax(logits)



class PR_CRM():
	'''
	Trainer that first fits a Regressor on the labeled log S to predict the
	feedback, then trains the policy on the sum of the clipped
	importance-weighted risk on S and the same risk on the unlabeled log
	S_u, where the missing feedback is replaced by the regressor output.
	'''
	def __init__(self, generator):
		'''
		This class receives as an input:

		- generator: Initial generator distribution pi_{theta_0} (a | x).
					 It is called on a batch of contexts and its output is
					 passed through a softmax; it must expose the attributes
					 `output_neurons` and `trainable_weights`.
		'''

		self.generator = generator

	def sample_S(self, S, m_instances=100):
		'''
		Draw at most m_instances rows of S at random, without replacement.

		The last three columns of S are read as (action, p, r) and all the
		preceding columns as the context features.

		Returns the float32 tensors x, a (one-hot over
		generator.output_neurons), p and r; p and r keep a trailing axis of
		size one.
		'''
		chosen = np.random.permutation(S.shape[0])[0:m_instances]
		batch = S[chosen]

		x = tf.cast(batch[:,0:-3], tf.float32)
		# The action column is sliced as (rows, 1), so one_hot yields
		# (rows, 1, depth); the squeeze removes the middle axis
		a = tf.cast(
			tf.squeeze(tf.one_hot(batch[:,-3:-2], depth=self.generator.output_neurons), axis=1),
			tf.float32)
		p = tf.cast(batch[:, -2:-1], tf.float32)
		r = tf.cast(batch[:, -1:], tf.float32)

		return x, a, p, r

	def sample_Su(self, S_u, m_instances):
		'''
		Draw at most m_instances rows of S_u at random, without replacement.

		The last two columns of S_u are read as (action, p) and all the
		preceding columns as the context features.

		Returns the float32 tensors x, a (one-hot) and p.
		'''
		chosen = np.random.permutation(S_u.shape[0])[0:m_instances]
		batch = S_u[chosen]

		x = tf.cast(batch[:,0:-2], tf.float32)
		a = tf.cast(
			tf.squeeze(tf.one_hot(batch[:,-2:-1], depth=self.generator.output_neurons),axis=1),
			tf.float32)
		p = tf.cast(batch[:, -1:], tf.float32)

		return x, a, p

	def _fit_reward_regressor(self, S, beta_g, M, n_instances):
		'''
		Fit a fresh Regressor with M Adam updates on batches of S. It learns
		to predict -r from the concatenation [x, a] under a squared error
		weighted by the logged propensity p. Returns the fitted regressor.
		'''
		regressor = Regressor(num_layers=2, hidden_neurons=20)
		optimizer_regressor = tf.keras.optimizers.Adam(beta_g)

		epoch = 0
		while epoch < M:
			epoch += 1

			x, a, p, r = self.sample_S(S, n_instances)
			instances_x = tf.concat([x, a], axis=-1)
			# The target is -r; the regressor output is a sigmoid, so this
			# presumably expects r in [-1, 0] -- TODO confirm with the data
			target = -r
			with tf.GradientTape() as tape:
				r_hat = regressor(instances_x)
				diff_sq = tf.square(tf.subtract(target, r_hat))
				loss = tf.reduce_mean(tf.multiply(p, diff_sq))

			g_regressor = tape.gradient(loss, regressor.trainable_weights)
			optimizer_regressor.apply_gradients(zip(g_regressor, regressor.trainable_weights))

		return regressor

	def _clipped_ips_risk(self, x, a, p, r, zeta):
		'''
		Mean of r * pi_theta(a|x) / max(zeta, p) over the batch.
		'''
		pi_theta_action = tf.reduce_sum(tf.multiply(self.get_policy(x), a),
										axis=-1, keepdims=True)
		factor = tf.divide(pi_theta_action, tf.math.maximum(zeta, p))

		return tf.reduce_mean(tf.multiply(r, factor))

	def train(self, S, S_u, alpha=0, beta_g=.001, M=100, m_instances=100,
			  n_instances=100, tau=0.001, zeta=.001):
		'''
		Train the generator in two stages:

		1. Fit a Regressor on S (M updates, learning rate beta_g).
		2. Run M Adam updates of the generator on
			   risk(S batch, r) + risk(S_u batch, r_hat)
		   where risk = mean(feedback * pi_theta(a|x) / max(zeta, p)) and
		   r_hat is minus the regressor output on [x_u, a_u].
		Inputs:

		- S = ((x_i, a_i, p_i, r_i)) with i=1...n sampled from pi_0
		- S_u = ((x_j, a_j, p_j)) with j=1...n sampled from pi_0
		- alpha: not used by this trainer; kept in the signature
		- beta_g: Learning rate of the generator and of the regressor
		- M: number of updates (epochs) of each of the two stages
		- m_instances: Number of instances sampled from S_u per update
		- n_instances: Number of instances sampled from S per update
		- tau: not used by this trainer; kept in the signature
		- zeta: lower bound applied to p in the importance weight denominator
		'''

		optimizer_generator = tf.keras.optimizers.Adam(beta_g)

		print('Total instances in the dataset')
		print(S.shape[0])
		print(S_u.shape[0])

		# Estimate the reward using the regression
		# ------------------------------------------------
		regressor = self._fit_reward_regressor(S, beta_g, M, n_instances)

		epoch = 0
		# Strict comparison: exactly M updates (the former `<=` ran M + 1)
		while epoch < M:
			epoch += 1

			# Sample n instances (x_i, a_i, p_i, r_i) from S
			x, a, p, r = self.sample_S(S, n_instances)

			# Sample m instances from Su and impute their feedback. This is
			# done outside the tape: the regressor is fixed at this stage.
			x_u, a_u, p_u = self.sample_Su(S_u, m_instances)
			instances_x_u = tf.concat([x_u, a_u], axis=-1)
			r_hat = -regressor(instances_x_u)

			# Estimate the re-weighted loss on both batches
			with tf.GradientTape() as tape:
				risk_1 = self._clipped_ips_risk(x, a, p, r, zeta)
				risk_2 = self._clipped_ips_risk(x_u, a_u, p_u, r_hat, zeta)
				loss = risk_1 + risk_2

			g = tape.gradient(loss, self.generator.trainable_weights)

			#Update the generator
			optimizer_generator.apply_gradients(zip(g, self.generator.trainable_weights))
			print('Epoch: ', epoch, 'Loss: ', loss.numpy(), end='\r')

	def get_policy(self, x):
		'''
		This function returns a policy for each context in x.
		Inputs:
		 - x: Array of num_instances x num_features
		Returns the softmax of the generator output for every row of x.
		'''
		x_tensor = tf.convert_to_tensor(x, dtype=tf.float32)
		logits = self.generator(x_tensor)

		return tf.nn.softmax(logits)

class PR_CRM_Reg():
	'''
	Trainer that first fits a Regressor on the labeled log S to predict the
	feedback, then updates the policy with a mix of two gradients: the
	clipped importance-weighted risk on S plus on the unlabeled log S_u
	(with regressor-imputed feedback), and a weighted cross entropy term on
	the same S_u batch.
	'''
	def __init__(self, generator):
		'''
		This class receives as an input:

		- generator: Initial generator distribution pi_{theta_0} (a | x).
					 It is called on a batch of contexts and its output is
					 passed through a softmax; it must expose the attributes
					 `output_neurons` and `trainable_weights`.
		'''

		self.generator = generator

	def sample_S(self, S, m_instances=100):
		'''
		Draw at most m_instances rows of S at random, without replacement.

		The last three columns of S are read as (action, p, r) and all the
		preceding columns as the context features.

		Returns the float32 tensors x, a (one-hot over
		generator.output_neurons), p and r; p and r keep a trailing axis of
		size one.
		'''
		chosen = np.random.permutation(S.shape[0])[0:m_instances]
		batch = S[chosen]

		x = tf.cast(batch[:,0:-3], tf.float32)
		# The action column is sliced as (rows, 1), so one_hot yields
		# (rows, 1, depth); the squeeze removes the middle axis
		a = tf.cast(
			tf.squeeze(tf.one_hot(batch[:,-3:-2], depth=self.generator.output_neurons), axis=1),
			tf.float32)
		p = tf.cast(batch[:, -2:-1], tf.float32)
		r = tf.cast(batch[:, -1:], tf.float32)

		return x, a, p, r

	def sample_Su(self, S_u, m_instances):
		'''
		Draw at most m_instances rows of S_u at random, without replacement.

		The last two columns of S_u are read as (action, p) and all the
		preceding columns as the context features.

		Returns the float32 tensors x, a (one-hot) and p.
		'''
		chosen = np.random.permutation(S_u.shape[0])[0:m_instances]
		batch = S_u[chosen]

		x = tf.cast(batch[:,0:-2], tf.float32)
		a = tf.cast(
			tf.squeeze(tf.one_hot(batch[:,-2:-1], depth=self.generator.output_neurons),axis=1),
			tf.float32)
		p = tf.cast(batch[:, -1:], tf.float32)

		return x, a, p

	def _fit_reward_regressor(self, S, beta_g, M, n_instances):
		'''
		Fit a fresh Regressor with M Adam updates on batches of S. It learns
		to predict -r from the concatenation [x, a] under a squared error
		weighted by the logged propensity p. Returns the fitted regressor.
		'''
		regressor = Regressor(num_layers=2, hidden_neurons=20)
		optimizer_regressor = tf.keras.optimizers.Adam(beta_g)

		epoch = 0
		while epoch < M:
			epoch += 1

			x, a, p, r = self.sample_S(S, n_instances)
			instances_x = tf.concat([x, a], axis=-1)
			# The target is -r; the regressor output is a sigmoid, so this
			# presumably expects r in [-1, 0] -- TODO confirm with the data
			target = -r
			with tf.GradientTape() as tape:
				r_hat = regressor(instances_x)
				diff_sq = tf.square(tf.subtract(target, r_hat))
				loss = tf.reduce_mean(tf.multiply(p, diff_sq))

			g_regressor = tape.gradient(loss, regressor.trainable_weights)
			optimizer_regressor.apply_gradients(zip(g_regressor, regressor.trainable_weights))

		return regressor

	def _action_probability(self, x, a):
		'''
		Probability that the current policy assigns to the one-hot actions a
		given the contexts x, with a trailing axis of size one.
		'''
		return tf.reduce_sum(tf.multiply(self.get_policy(x), a),
							 axis=-1, keepdims=True)

	def _clipped_ips_risk(self, x, a, p, r, zeta):
		'''
		Mean of r * pi_theta(a|x) / max(zeta, p) over the batch.
		'''
		factor = tf.divide(self._action_probability(x, a), tf.math.maximum(zeta, p))

		return tf.reduce_mean(tf.multiply(r, factor))

	def train(self, S, S_u, alpha=0, beta_g=.001, M=100, m_instances=100,
			  n_instances=100, tau=0.001, zeta=.001):
		'''
		Train the generator in two stages:

		1. Fit a Regressor on S (M updates, learning rate beta_g).
		2. Run M Adam updates of the generator with the gradient
			   alpha * grad(loss) + (1 - alpha) * grad(wce)
		   where
			   loss = risk(S batch, r) + risk(S_u batch, r_hat)
			   risk = mean(feedback * pi_theta(a|x) / max(zeta, p))
			   wce  = -mean(max(tau, p_u) * log(pi_theta(a_u|x_u)))
		   and r_hat is minus the regressor output on [x_u, a_u].
		Inputs:

		- S = ((x_i, a_i, p_i, r_i)) with i=1...n sampled from pi_0
		- S_u = ((x_j, a_j, p_j)) with j=1...n sampled from pi_0
		- alpha: relative weight of the gradient of the risk; the gradient of
				 the cross entropy gets 1 - alpha. With the default 0 only
				 the cross entropy drives the update.
		- beta_g: Learning rate of the generator and of the regressor
		- M: number of updates (epochs) of each of the two stages
		- m_instances: Number of instances sampled from S_u per update
		- n_instances: Number of instances sampled from S per update
		- tau: lower bound applied to p in the cross entropy weight
		- zeta: lower bound applied to p in the importance weight denominator

		The value printed as 'Loss' is the risk term only (without wce).
		'''

		optimizer_generator = tf.keras.optimizers.Adam(beta_g)

		print('Total instances in the dataset')
		print(S.shape[0])
		print(S_u.shape[0])

		# Estimate the reward using the regression
		# ------------------------------------------------
		regressor = self._fit_reward_regressor(S, beta_g, M, n_instances)

		epoch = 0
		# Strict comparison: exactly M updates (the former `<=` ran M + 1)
		while epoch < M:
			epoch += 1

			# Sample n instances (x_i, a_i, p_i, r_i) from S
			x, a, p, r = self.sample_S(S, n_instances)

			# Sample m instances from Su and impute their feedback. This is
			# done outside the tape: the regressor is fixed at this stage.
			x_u, a_u, p_u = self.sample_Su(S_u, m_instances)
			instances_x_u = tf.concat([x_u, a_u], axis=-1)
			r_hat = -regressor(instances_x_u)

			# Estimate the re-weighted loss on both batches
			with tf.GradientTape() as tape:
				risk_1 = self._clipped_ips_risk(x, a, p, r, zeta)
				risk_2 = self._clipped_ips_risk(x_u, a_u, p_u, r_hat, zeta)
				loss = risk_1 + risk_2

			g_1 = tape.gradient(loss, self.generator.trainable_weights)

			# Estimate the weighted cross entropy on the unlabeled batch
			with tf.GradientTape() as tape2:
				pi_theta_action = self._action_probability(x_u, a_u)
				# The small constant keeps the log finite at probability 0
				log_pi_theta_action = tf.math.log(pi_theta_action+1E-7)
				max_term = tf.math.maximum(tau, p_u)
				wce = -tf.reduce_mean(tf.multiply(max_term, log_pi_theta_action))

			g_2 = tape2.gradient(wce, self.generator.trainable_weights)

			# Compute the compound gradient, weight by weight
			g_theta = [alpha*element_1 + (1.0-alpha)*element_2
					   for element_1, element_2 in zip(g_1, g_2)]

			#Update the generator
			optimizer_generator.apply_gradients(zip(g_theta, self.generator.trainable_weights))

			print('Epoch: ', epoch, 'Loss: ', loss.numpy(), end='\r')

	def get_policy(self, x):
		'''
		This function returns a policy for each context in x.
		Inputs:
		 - x: Array of num_instances x num_features
		Returns the softmax of the generator output for every row of x.
		'''
		x_tensor = tf.convert_to_tensor(x, dtype=tf.float32)
		logits = self.generator(x_tensor)

		return tf.nn.softmax(logits)

class BanditNet_CRM():
	'''
	Trainer that updates a policy network on the clipped importance-weighted
	risk of the labeled log S, with a constant baseline lambda_val
	subtracted from the feedback. The unlabeled log S_u is not used for the
	updates.
	'''
	def __init__(self, generator):
		'''
		This class receives as an input:

		- generator: Initial generator distribution pi_{theta_0} (a | x).
					 It is called on a batch of contexts and its output is
					 passed through a softmax; it must expose the attributes
					 `output_neurons` and `trainable_weights`.
		'''

		self.generator = generator

	def sample_S(self, S, m_instances=100):
		'''
		Draw at most m_instances rows of S at random, without replacement.

		The last three columns of S are read as (action, p, r) and all the
		preceding columns as the context features.

		Returns the float32 tensors x, a (one-hot over
		generator.output_neurons), p and r; p and r keep a trailing axis of
		size one.
		'''
		chosen = np.random.permutation(S.shape[0])[0:m_instances]
		batch = S[chosen]

		x = tf.cast(batch[:,0:-3], tf.float32)
		# The action column is sliced as (rows, 1), so one_hot yields
		# (rows, 1, depth); the squeeze removes the middle axis
		a = tf.cast(
			tf.squeeze(tf.one_hot(batch[:,-3:-2], depth=self.generator.output_neurons), axis=1),
			tf.float32)
		p = tf.cast(batch[:, -2:-1], tf.float32)
		r = tf.cast(batch[:, -1:], tf.float32)

		return x, a, p, r

	def sample_Su(self, S_u, m_instances):
		'''
		Draw at most m_instances rows of S_u at random, without replacement.

		The last two columns of S_u are read as (action, p) and all the
		preceding columns as the context features.

		Returns the float32 tensors x, a (one-hot) and p.
		'''
		chosen = np.random.permutation(S_u.shape[0])[0:m_instances]
		batch = S_u[chosen]

		x = tf.cast(batch[:,0:-2], tf.float32)
		a = tf.cast(
			tf.squeeze(tf.one_hot(batch[:,-2:-1], depth=self.generator.output_neurons),axis=1),
			tf.float32)
		p = tf.cast(batch[:, -1:], tf.float32)

		return x, a, p

	def train(self, S, S_u, alpha=0, beta_g=.001, M=100, m_instances=100,
			  n_instances=100, tau=0.001, zeta=.001, lambda_val=0.0):
		'''
		Train the generator with M Adam updates on
			risk = mean((r - lambda_val) * pi_theta(a|x) / max(zeta, p))
		computed on a fresh batch of S at every update.
		Inputs:

		- S = ((x_i, a_i, p_i, r_i)) with i=1...n sampled from pi_0
		- S_u = ((x_j, a_j, p_j)) with j=1...n sampled from pi_0; only its
				size is printed
		- alpha: not used by this trainer; kept in the signature
		- beta_g: Learning rate of the generator
		- M: number of updates (epochs)
		- m_instances: not used by this trainer; kept in the signature
		- n_instances: Number of instances sampled from S per update
		- tau: not used by this trainer; kept in the signature
		- zeta: lower bound applied to p in the importance weight denominator
		- lambda_val: constant subtracted from r before weighting
		'''

		optimizer_generator = tf.keras.optimizers.Adam(beta_g)

		print('Total instances in the dataset')
		print(S.shape[0])
		print(S_u.shape[0])

		epoch = 0
		# Strict comparison: exactly M updates (the former `<=` ran M + 1)
		while epoch < M:
			epoch += 1

			# Sample n instances (x_i, a_i, p_i, r_i) from S
			x, a, p, r = self.sample_S(S, n_instances)
			# Estimate the re-weighted loss
			with tf.GradientTape() as tape:
				pi_theta_action = tf.reduce_sum(tf.multiply(self.get_policy(x), a),
												axis=-1, keepdims=True)
				factor = tf.divide(pi_theta_action, tf.math.maximum(zeta, p))
				risk = tf.reduce_mean(tf.multiply(r-lambda_val, factor))

			g_1 = tape.gradient(risk, self.generator.trainable_weights)

			#Update the generator
			optimizer_generator.apply_gradients(zip(g_1, self.generator.trainable_weights))
			# Report the optimized quantity itself: it used to be scaled by
			# alpha, which defaults to 0 and plays no part in the update
			print('Epoch: ', epoch, 'Loss: ', np.round(risk,3), end='\r')

	def get_policy(self, x):
		'''
		This function returns a policy for each context in x.
		Inputs:
		 - x: Array of num_instances x num_features
		Returns the softmax of the generator output for every row of x.
		'''
		x_tensor = tf.convert_to_tensor(x, dtype=tf.float32)
		logits = self.generator(x_tensor)

		return tf.nn.softmax(logits)
def main():
	'''
	Placeholder entry point: it performs no work and always returns -1.
	'''
	status = -1
	return status

# Run main() only when this file is executed as a script, not when it is
# imported as a module; the value returned by main() is discarded.
if __name__ == '__main__':
	main()