import numpy as np
import copy
from env import *
from agent import *
from utils import *


def Local_TD_Learning(env, agent, Index_POLICY, num_sims):
    """Run a simulated trajectory and feed agent-0 transitions to the agent.

    The function works on deep copies of ``env`` and ``agent``; both copies
    are reset before the rollout, and the objects passed in by the caller
    are never stepped or updated here.

    Args:
        env: Environment object. Must provide ``reset()``,
            ``step_by_agent(policy)`` returning a
            ``(state, action, reward)`` triple of indexable values, and the
            attributes ``global_state_agent`` / ``global_action_agent``.
        agent: Learner object. Must provide ``reset()``,
            ``update(s, a, r, s_next, a_next)`` and ``get_Q_values()``.
        Index_POLICY: Policy argument forwarded unchanged to
            ``env.step_by_agent`` on every step.
        num_sims: Number of environment steps to simulate.

    Returns:
        Whatever ``get_Q_values()`` returns on the copied agent after the
        rollout.
    """
    sim_env = copy.deepcopy(env)
    learner = copy.deepcopy(agent)
    sim_env.reset()
    learner.reset()

    prev_state = sim_env.global_state_agent
    prev_action = sim_env.global_action_agent
    # Placeholder only: it is never indexed, because step 0 can never reach
    # the burn-in threshold below when num_sims >= 1, and after step 0 it is
    # replaced by the reward returned from the environment.
    prev_reward = 0

    # The first 10% of the steps are simulated but not learned from.
    first_update_step = 0.1 * num_sims

    # NOTE(review): this assumes step_by_agent hands back objects that are
    # not mutated in place by later steps; otherwise prev_* and the new
    # values would alias -- confirm against the env implementation.
    for step in range(num_sims):
        state, action, reward = sim_env.step_by_agent(Index_POLICY)
        if step >= first_update_step:
            # Only entry 0 (agent 0) of each returned value is used.
            learner.update(
                prev_state[0],
                prev_action[0],
                prev_reward[0],
                state[0],
                action[0],
            )
        prev_state, prev_action, prev_reward = state, action, reward

    # NOTE: a disabled post-processing step used to live here; it split the
    # Q-values into even/odd entries and passed them to get_global_reward
    # after env.reset_to_m_star(). The raw Q-values are returned instead.
    return learner.get_Q_values()


