import torch
import random
from typing import List
import numpy as np

# Integer code for each move name; text_projection looks actions up here
# when the environment name contains "catrap".
action_dict = {
    move_name: code
    for code, move_name in enumerate(
        ('MOVE_UP', 'MOVE_DOWN', 'MOVE_LEFT', 'MOVE_RIGHT')
    )
}

# Convert the model's raw text reply (first element of the list) into an environment action
def text_projection(text_actions: List[str], env_name):
    """Extract the chosen action from the first model reply.

    The reply is expected to contain one JSON-style object with an
    ``"action"`` key. The value is upper-cased; when ``env_name`` contains
    ``"catrap"`` (case-insensitive) it is then translated to its integer
    code through ``action_dict``.

    Args:
        text_actions: Model replies; only the first element is read.
        env_name: Environment name, used to decide whether to map the
            action string to an integer code.

    Returns:
        The upper-cased action string, or its integer code for Catrap
        environments. If the reply is missing, cannot be parsed, or names
        an unknown action, a uniformly random integer in ``[0, 4)``.
    """
    # Imported here so the function is self-contained with respect to the
    # file's import block.
    import ast
    import json

    print(text_actions)
    raw = None  # kept outside the try so the error message can never raise
    try:
        raw = text_actions[0]
        # Keep only the outermost {...} span so stray text or code fences
        # around the object do not defeat parsing.
        start, end = raw.find("{"), raw.rfind("}")
        candidate = raw[start:end + 1] if 0 <= start < end else raw
        # SECURITY: the reply is untrusted model output, so it is decoded
        # as data (JSON, then Python literal syntax) and never eval()'d.
        try:
            payload = json.loads(candidate)
        except ValueError:
            # Single-quoted / True / None style dict literals are not JSON.
            payload = ast.literal_eval(candidate)
        action = payload.get("action").upper()
        if "catrap" in env_name.lower():
            action = action_dict[action]
    except Exception as e:
        # Deliberately broad: any malformed reply must degrade to a random
        # move rather than abort the rollout.
        print("error on parsing")
        # int(...) keeps the return type consistent with the action_dict codes.
        action = int(np.random.choice(4))
        print(f"Cannot find correct action, {raw}", e)
    return action
    

def make_observation(past_images, prev_actions):
    """Build a single-turn chat prompt asking for the next Catrap move.

    Args:
        past_images: Recent frames, oldest first; each becomes one image
            entry ahead of the text prompt.
        prev_actions: Actions listed between the frames; entry ``i`` is
            printed after the frame at position ``i`` for every frame
            except the last one.

    Returns:
        A one-element list holding a ``"user"`` message whose content is
        the image entries followed by a single text entry with the prompt.

    Raises:
        ValueError: If ``past_images`` is empty.
    """
    frame_count = len(past_images)
    if not frame_count:
        raise ValueError("No video frames found.")

    preamble = f'''
    You are an agent controlling the protagonist in the Game Boy puzzle game Catrap.

    GAME CONTEXT:
    - You control a character who can move UP, DOWN, LEFT, or RIGHT, and can push blocks or attack enemies.
    - The level objective: Defeat all enemies to clear the stage.
    - Clearing enemies may require climbing ladders, moving strategically around obstacles, and avoiding dead-ends.


    OBSERVATION:
    Below are the last {frame_count} game states from the current trajectory and your corresponding actions.
    - Frame -1 is the current state (use this for decision-making).
    - Earlier frames (Frame -2, -3, …) and corresponding actions are for context only and show how the game evolved after your moves.

    --- PAST FRAME SEQUENCE ---
    '''

    # Frames are numbered backwards from the current one (-1); the current
    # frame has no action yet, so only earlier frames get an action line.
    history = []
    for position in range(frame_count):
        steps_back = frame_count - position
        history.append(f'''Frame -{steps_back}: <IMAGE_FRAME_{-steps_back}>\n''')
        if steps_back != 1:
            history.append(f'''Action -{steps_back}: {prev_actions[position]}\n''')

    instructions = f'''
    --- END PAST FRAME SEQUENCE ---

    ACTION SPACE:
    - MOVE_UP
    - MOVE_DOWN
    - MOVE_LEFT
    - MOVE_RIGHT

    CONSTRAINTS:
    - Output must be exactly one of the four valid moves: MOVE_UP, MOVE_DOWN, MOVE_LEFT, MOVE_RIGHT.
    - Never invent new actions.
    - Do not repeat the same move if it results in no progress (e.g., walking into a wall).
    - Avoid looping (e.g., moving back and forth between the same two spaces without progress).
    - Prefer moves that bring you closer to enemies or ladders, even if multiple steps are required.

    REASONING:
    Always think in this order:
    1. Identify the protagonist’s position in the most recent frame.
    2. Check which directions are blocked (walls, edges, obstacles).
    3. Locate enemies and ladders and decide which are reachable.
    4. Choose the move that progresses toward defeating an enemy or climbing a ladder, even if immediate attack is not possible.

    FINAL OUTPUT:
    Respond with exactly one JSON object, with three keys:
    - "description": Description of the current state
    - "thoughts": Your reasoning about the next move based on previous moves and your final goal.
    - "action": One of [MOVE_UP, MOVE_DOWN, MOVE_LEFT, MOVE_RIGHT].

    Output only a JSON object, starting with {{ and ending with }}. No additional text, no explanations, no fences.
    Do not output anything before or after the JSON.
    '''

    prompt_text = preamble + "".join(history) + instructions

    # Images come first, in the order given, then the text prompt.
    content = [{"type": "image", "image": frame} for frame in past_images]
    content.append({"type": "text", "text": prompt_text})

    return [{"role": "user", "content": content}]

