# Lint as: python3
"""Experimental code for CARCB algorithm."""

from absl import app
from absl import flags
import numpy as np
from sklearn.linear_model import LinearRegression

FLAGS = flags.FLAGS

# Command-line configuration of the simulation. Each flag is declared with
# explicit keyword arguments so the name, default and help text are easy to
# tell apart at a glance.
flags.DEFINE_bool(
    name='compute_opt',
    default=False,
    help='Compute the optimal reward instead of the algorithm reward')
flags.DEFINE_integer(
    name='window', default=10, help='Congestion window size')
flags.DEFINE_integer(name='k', default=2, help='Number of actions')
flags.DEFINE_integer(name='horizon', default=10000, help='Horizon')
flags.DEFINE_bool(
    name='print_theta_error',
    default=False,
    help='Print error between learned and true theta parameter.')


def number_to_vector(num, k, window):
  """Expand a node number into its base-`k` digits, one digit per action.

  Args:
    num: Node number to decode.
    k: Number of actions (the numeric base).
    window: Number of digits to produce.

  Returns:
    A list of `window` digits, most significant first. Digits of `num` beyond
    the `window` least significant ones are dropped.
  """
  # Walk the powers of k from the highest place down to the units place so the
  # result is already ordered most-significant-first.
  return [(num // k**power) % k for power in range(window - 1, -1, -1)]


def vector_to_number(vector, k, window):
  """Fold a sequence of base-`k` digits back into a single node number.

  Args:
    vector: Digits (actions), most significant first.
    k: Number of actions (the numeric base).
    window: Number of digits; the first digit is weighted by k**(window - 1).

  Returns:
    The sum of each digit multiplied by its place value.
  """
  total, weight = 0, k**(window - 1)
  for action in vector:
    # Add this digit's contribution, then step down to the next lower place.
    total, weight = total + action * weight, weight // k
  return total


def get_node_neighbors(node, k, window):
  """Get all neighbors (successor nodes) of a given node.

  A node encodes the last `window` actions as a base-`k` number, oldest action
  in the most significant digit. A successor is obtained by dropping the oldest
  action, shifting the remaining ones up by one place and appending a new
  action in the least significant digit.

  Args:
    node: Node number.
    k: Number of actions (the numeric base).
    window: Number of actions encoded in a node; must be at least 1.

  Returns:
    A list of `k` node numbers, one per possible next action, in increasing
    order of the appended action.

  Raises:
    ValueError: If `window` is smaller than 1.
  """
  if window < 1:
    raise ValueError('window must be at least 1, got %d' % window)
  # Taking the node modulo k**(window - 1) discards the oldest action, and
  # multiplying by k shifts the remaining actions up one place. This is the
  # arithmetic equivalent of decoding to a digit list, shifting it and
  # re-encoding, without allocating lists on every call.
  shifted = (node % k**(window - 1)) * k
  return [shifted + next_action for next_action in range(k)]


def get_action_congestion(vector, action):
  """Count how many entries of `vector` are equal to `action`.

  Args:
    vector: Iterable of actions.
    action: The action whose occurrences are counted.

  Returns:
    The number of matching entries (0 if there are none).
  """
  return sum(1 for played in vector if played == action)


_CONTEXT_DIM = 10  # Length of every context vector and of theta.
_NOISE_STDDEV = 0.001  # Standard deviation of the additive reward noise.


def _unit_vector(values):
  """Return `values` scaled to unit Euclidean norm as a numpy array."""
  # The squares are accumulated one by one (rather than via a vectorised norm)
  # so the floating-point result does not depend on numpy's summation order.
  norm = np.sqrt(sum(x * x for x in values))
  return np.asarray(values) / norm


def _build_node_tables(k, window):
  """Precompute last action, its congestion and successors for every node."""
  last_actions = []
  congestions = []
  successors = []
  for node in range(k**window):
    action_vector = number_to_vector(node, k, window)
    last_action = action_vector[window - 1]
    last_actions.append(last_action)
    congestions.append(get_action_congestion(action_vector, last_action))
    successors.append(get_node_neighbors(node, k, window))
  return last_actions, congestions, successors


def _plan_epoch(theta, contexts, noise, start, epoch_size, node_tables,
                include_noise):
  """Run the backward dynamic programme for one epoch.

  Args:
    theta: Parameter vector used to score contexts.
    contexts: contexts[action][step] is the context of `action` at `step`.
    noise: noise[action][step] is the reward noise of `action` at `step`.
    start: Global time step at which the epoch begins.
    epoch_size: Number of time steps in the epoch.
    node_tables: Result of `_build_node_tables`.
    include_noise: Whether the noise is added to the scored reward.

  Returns:
    A table where entry [tt][node] is the successor of `node` chosen at epoch
    step `tt`. Entries of the final step are left at -1 because no successor
    is needed there.
  """
  last_actions, congestions, successors = node_tables
  num_nodes = len(last_actions)
  next_node = [[-1] * num_nodes for _ in range(epoch_size)]
  future_values = None  # Values of the row at tt + 1; None on the last row.
  for tt in range(epoch_size - 1, -1, -1):
    step = start + tt
    # The un-congested reward depends only on the action, so score each action
    # once per step instead of once per node.
    base_rewards = []
    for action in range(len(contexts)):
      base = np.dot(theta, contexts[action][step])
      if include_noise:
        base = base + noise[action][step]
      base_rewards.append(base)

    current_values = [0.0] * num_nodes
    for node in range(num_nodes):
      action_reward = base_rewards[last_actions[node]] / congestions[node]
      if future_values is None:
        current_values[node] = action_reward
        continue
      # Start from -inf (not -1): estimated rewards can be negative, and a
      # finite floor could leave the successor unset.
      best_value = -np.inf
      for candidate in successors[node]:
        value = action_reward + future_values[candidate]
        # Strict comparison keeps the first (lowest) action on ties.
        if best_value < value:
          best_value = value
          next_node[tt][node] = candidate
      current_values[node] = best_value
    future_values = current_values
  return next_node


def main(argv):
  """Simulate the algorithm and print the running average reward.

  Contexts, noise, the true parameter theta_star and an initial estimate theta
  are drawn from a fixed-seed random state. Time is split into epochs: in each
  epoch a path through the action-history nodes is planned by dynamic
  programming with the current theta (or with theta_star plus the realised
  noise when --compute_opt is set), the path is played against the true
  rewards, and theta is refit by linear regression on the contexts and rewards
  seen in that epoch. The running average reward is printed whenever the
  global time step is a multiple of 100.

  Args:
    argv: Positional command-line arguments; unused.

  Raises:
    app.UsageError: If --window, --k or --horizon is not positive.
  """
  del argv  # Unused.
  window = FLAGS.window
  k = FLAGS.k
  horizon = FLAGS.horizon
  compute_opt = FLAGS.compute_opt
  if window < 1 or k < 1 or horizon < 1:
    raise app.UsageError(
        '--window, --k and --horizon must all be positive integers.')

  seed = 0  # Controls randomness for costs
  rs = np.random.RandomState(seed)

  # Draw contexts, theta_star and first theta. The order of the draws is part
  # of the experiment's reproducibility: one context then one noise value per
  # (action, step), followed by theta_star and the initial theta.
  contexts = [[None] * horizon for _ in range(k)]
  noise = [[None] * horizon for _ in range(k)]
  for action in range(k):
    for step in range(horizon):
      contexts[action][step] = _unit_vector(
          rs.uniform(low=0.0, high=1.0, size=_CONTEXT_DIM))
      noise[action][step] = rs.normal(0, _NOISE_STDDEV)
  theta_star = _unit_vector(rs.uniform(low=0.0, high=1.0, size=_CONTEXT_DIM))
  theta = _unit_vector(rs.uniform(low=0.0, high=1.0, size=_CONTEXT_DIM))

  # Per-node quantities do not change between epochs, so compute them once.
  node_tables = _build_node_tables(k, window)
  last_actions, congestions, _ = node_tables

  # Start the simulation
  aggregate_rewards = 0
  t = 0
  # The first epoch lasts `window` steps, capped so it never exceeds horizon.
  epoch_size = min(window, horizon)
  while True:
    if compute_opt:
      theta = theta_star
    next_node = _plan_epoch(theta, contexts, noise, t, epoch_size, node_tables,
                            include_noise=compute_opt)

    # Get the true value of the solution by following the planned path from
    # node 0 and collecting the rewards generated by theta_star.
    contexts_seen = []
    rewards_seen = []
    node = 0
    for tt in range(epoch_size):
      last_action = last_actions[node]
      congestion = congestions[node]
      context = contexts[last_action][t + tt]
      action_reward = (np.dot(theta_star, context) +
                       noise[last_action][t + tt]) / congestion
      aggregate_rewards = aggregate_rewards + action_reward
      if (t + tt) % 100 == 0:
        print(aggregate_rewards / (t + tt + 1))
      node = next_node[tt][node]
      contexts_seen.append(context)
      # The regression target is the reward with the congestion factor undone.
      rewards_seen.append(action_reward * congestion)

    # Update theta, t, epoch_size
    # NOTE(review): LinearRegression fits an intercept by default, but only
    # coef_ is kept and the planner scores contexts with dot(theta, context).
    # Consider fit_intercept=False if no intercept is intended -- confirm.
    reg = LinearRegression()
    reg.fit(np.array(contexts_seen), np.array(rewards_seen))
    theta = reg.coef_

    t = t + epoch_size
    if t >= horizon:
      break
    # Double the epoch length while two more such epochs still fit strictly
    # inside the horizon; otherwise the next epoch takes all remaining steps.
    if t + epoch_size * 2 < horizon:
      epoch_size = epoch_size * 2
    else:
      epoch_size = horizon - t
  if FLAGS.print_theta_error:
    print('Error between learned theta and true theta* is', theta-theta_star)


# Script entry point: hand `main` to absl's app runner when this file is
# executed directly rather than imported.
if __name__ == '__main__':
  app.run(main)

