import numpy as np

def planning_function(processed_state):
    """
    Greedily pair agents with landmarks, closest pair first.

    Every round picks the (agent, landmark) pair with the smallest distance
    among agents that still have no task and landmarks that are still free.
    Ties go to the pair met first when walking agents in dict order and
    landmarks in index order. Only finite distances are eligible, so an
    infinite or NaN distance never produces an assignment.

    Args:
        processed_state: dict mapping an agent name to a sequence whose first
            three entries are vectors; the norm of each vector is used as the
            agent's distance to landmark 0, 1 and 2. Any further entries are
            ignored here. (Presumably these are landmark positions relative
            to the agent, e.g. for 'agent_0', 'agent_1', 'agent_2' -- confirm
            against the code that builds the state.)

    Returns:
        dict: agent name -> 'Landmark_<i>' for agents that received a
        landmark, 'No op' for every other agent.
    """
    # Distance from each agent to each of its (up to three) landmark vectors.
    landmark_ranges = {}
    for name, observation in processed_state.items():
        landmark_ranges[name] = [np.linalg.norm(offset) for offset in observation[:3]]

    plan = {}
    taken = set()
    upper_bound = float('inf')

    # At most one assignment per agent, hence one round per agent.
    for _ in range(len(processed_state)):
        candidates = [
            (gap, name, index)
            for name, gaps in landmark_ranges.items()
            if name not in plan
            for index, gap in enumerate(gaps)
            if index not in taken and gap < upper_bound
        ]
        if not candidates:
            # No free landmark is reachable by any remaining agent.
            break

        # min() keeps the first of equal keys, i.e. the earliest candidate.
        _, winner, target = min(candidates, key=lambda entry: entry[0])
        plan[winner] = f"Landmark_{target}"
        taken.add(target)

    # Everyone left without a landmark idles.
    for name in landmark_ranges:
        if name not in plan:
            plan[name] = "No op"

    return plan

def compute_reward(processed_state, llm_actions, actions, *,
                   collision_penalty=-1, collision_threshold=0.3,
                   occupation_threshold=0.1, action_bonus=0.5,
                   occupation_bonus=1):
    """
    Compute a shaped reward for every agent in ``processed_state``.

    The reward of an agent is the sum of four terms:

    * the negative distance to its *nearest* landmark (the smallest norm
      among the first three state entries -- the task chosen by the planner
      is not consulted here);
    * ``action_bonus`` if the action the agent actually took is one of the
      actions suggested for it in ``llm_actions``;
    * ``collision_penalty`` for every remaining state entry (index 3 onward)
      whose norm is below ``collision_threshold``;
    * ``occupation_bonus`` if the nearest-landmark distance is at most
      ``occupation_threshold``.

    Args:
        processed_state: dict mapping an agent name to a sequence of vectors.
            The first three are treated as landmark offsets and the rest as
            offsets to other agents (presumably relative to the agent itself
            -- confirm against the code that builds the state).
        llm_actions (dict): suggested actions per agent, e.g.
            {"agent_0": [2, 3], "agent_1": [4]}. An agent without an entry
            (or with a ``None`` entry) simply gets no action bonus.
        actions (dict): the action each agent actually performed, e.g.
            {"agent_0": 2, "agent_1": 4}. An agent without an entry gets no
            action bonus.
        collision_penalty: value added per colliding neighbour.
        collision_threshold: neighbour distance below which a collision is
            counted.
        occupation_threshold: landmark distance at or below which the
            landmark counts as occupied.
        action_bonus: reward for following a suggested action.
        occupation_bonus: reward for occupying a landmark.

        **Note: the index of this action space is
        [no_action, move_left, move_right, move_down, move_up]**

    Returns:
        dict: reward per agent, e.g. {'agent_0': reward0, 'agent_1': reward1}.

    Raises:
        ValueError: if an agent's state has no landmark entries (raised by
            ``np.argmin`` on an empty sequence).
    """
    reward = {}

    for agent, state in processed_state.items():
        # Base reward: negative distance to the nearest landmark. The norms
        # are computed once and the winning one is reused.
        landmark_distances = [np.linalg.norm(landmark) for landmark in state[:3]]
        distance_to_landmark = landmark_distances[int(np.argmin(landmark_distances))]
        base_reward = -distance_to_landmark

        # Reward for following an LLM suggestion. A missing suggestion or a
        # missing action means "not followed" rather than a KeyError that
        # would discard the rewards of all agents.
        suggested = llm_actions.get(agent)
        followed = (
            suggested is not None
            and agent in actions
            and actions[agent] in suggested
        )
        action_reward = action_bonus if followed else 0

        # Collision penalty: one penalty per neighbour that is too close.
        collision_reward = sum(
            collision_penalty
            for other_agent_pos in state[3:]
            if np.linalg.norm(other_agent_pos) < collision_threshold
        )

        # Occupation reward
        occupied = distance_to_landmark <= occupation_threshold
        occupation_reward = occupation_bonus if occupied else 0

        # Combine rewards
        reward[agent] = base_reward + action_reward + collision_reward + occupation_reward

    return reward