from textwrap import dedent
from rich import print as rprint

# Natural-language summary of the environment that is interpolated into the
# prompts. It is a single line: the sentences below are joined by one space.
env_description = " ".join(
    (
        "The Tetris environment from Gymnasium's Atari Learning Environment presents the agent with a grid-based game where tetrominoes fall from the top of the screen.",
        "The agent's objective is to manipulate and place these tetrominoes to form complete horizontal lines, which are then cleared from the grid.",
        "The environment provides visual observations of the game state and discrete actions corresponding to tetromino movements and rotations.",
        "The primary task is to encourage the agent to maximize the number of lines cleared over an episode.",
        "The current reward signal is sparse, giving positive feedback only when lines are cleared.",
    )
)

# Identifier of the evaluation criterion (presumably a metric key consumed
# elsewhere in the project -- confirm against the caller).
eval_criteria = "mean_ep_lines_cleared"

# Signature description of the reward function: one
# {"name", "type_annotation"} record per parameter, in positional order.
reward_func_args = [
    {"name": arg_name, "type_annotation": arg_type}
    for arg_name, arg_type in (
        ("action", "int"),
        ("curr_board", "np.ndarray"),
        ("curr_active_tetromino", "np.ndarray"),
        ("prev_board", "np.ndarray"),
        ("prev_active_tetromino", "np.ndarray"),
        ("lines_cleared", "int"),
    )
]
# Return annotation of the reward function, kept as source text.
reward_func_return_type = "float"
reward_func_definition = dedent("""\
    Args:
        - `action` (int): 
            The action taken by the agent.
            For each column on the board, the agent can rotate the tetromino counter-clockwise for 0, 1, 2, or 3 times. This results in a total of board_width*4 possible actions. 
            Therefore, the action space is a Discrete space with board_width*4 possible actions. The value is interpreted as column index + number of rotations. 
            So the actions [0, 1, 2, 3] correspond to the first column and the tetromino rotated 0, 1, 2, 3 times respectively. 
            The actions [4, 5, 6, 7] correspond to the second column and the tetromino rotated 0, 1, 2, 3 times respectively, and so on.
            Action not within the action space is invalid and will result in a reward of -1.
        - `curr_board` (2D numpy array): A binary array representation of the game board after the `action` is taken, where `1` indicates a filled cell and `0` indicates an empty cell. This representation also includes the newly apperaed tetromino to be placed.
        - `curr_active_tetromino` (2D numpy array): A binary array of the same shape as `curr_board`, containing ONLY the tetromino to be placed for the NEXT step. Therefore, current board without active tetromino is curr_board - curr_active_tetromino.
        - `prev_board` (2D numpy array): A binary array of the same shape as `curr_board`, representing the game board of previous step.
        - `prev_active_tetromino` (2D numpy array): A binary array of the same shape as `prev_board`. Previous board without active tetromino is prev_board - prev_active_tetromino.
        - `lines_cleared` (int): The number of lines cleared resulted from the `action` taken in the current step.
    
    Example:
        Consider a 7x5 board at a given point during gameplay, where the following inputs are given:
        - action: 9 (column index 2, rotates counter-clockwise for 1 time)
        - curr_board:
            [
                [0, 0, 1, 1, 0],
                [0, 0, 1, 1, 0],
                [0, 0, 0, 0, 0],
                [0, 0, 0, 0, 0],
                [0, 0, 0, 0, 0],
                [0, 0, 1, 1, 0],
                [0, 0, 0, 1, 1]
            ]
        - curr_active_tetromino:
            [
                [0, 0, 1, 1, 0],
                [0, 0, 1, 1, 0],
                [0, 0, 0, 0, 0],
                [0, 0, 0, 0, 0],
                [0, 0, 0, 0, 0],
                [0, 0, 0, 0, 0],
                [0, 0, 0, 0, 0]
            ]
        - prev_board:
            [
                [0, 0, 0, 1, 0],
                [0, 0, 1, 1, 0],
                [0, 0, 1, 0, 0],
                [0, 0, 0, 0, 0],
                [0, 0, 0, 0, 0],
                [0, 0, 0, 0, 0],
                [0, 0, 0, 0, 0]
            ]
        - prev_active_tetromino: 
            [
                [0, 0, 0, 1, 0],
                [0, 0, 1, 1, 0],
                [0, 0, 1, 0, 0],
                [0, 0, 0, 0, 0],
                [0, 0, 0, 0, 0],
                [0, 0, 0, 0, 0],
                [0, 0, 0, 0, 0]
            ]
        - lines_cleared: 0

        This example shows that by taking action=9, the game board transitions from prev_board to curr_board by placing a z-shape tetromino to the lower right corner of the board.
        Additionally, a new tetromino appears at the top of the curr_board.
        Since no lines are cleared, the number of lines cleared is 0.
        The example is simplified for clarity. The actual dimension of the board is 20x10.

    Returns:
        You need to return the reward signal (float) based on the given inputs.
""")

reward_func_return_type_baseline = "Tuple[float, Dict[str, float]]"
reward_func_definition_baseline = dedent("""\
    Args:
        - `action` (int): 
            The action taken by the agent.
            For each column on the board, the agent can rotate the tetromino counter-clockwise for 0, 1, 2, or 3 times. This results in a total of board_width*4 possible actions. 
            Therefore, the action space is a Discrete space with board_width*4 possible actions. The value is interpreted as column index + number of rotations. 
            So the actions [0, 1, 2, 3] correspond to the first column and the tetromino rotated 0, 1, 2, 3 times respectively. 
            The actions [4, 5, 6, 7] correspond to the second column and the tetromino rotated 0, 1, 2, 3 times respectively, and so on.
            Action not within the action space is invalid and will result in a reward of -1.
        - `curr_board` (2D numpy array): A binary array representation of the game board after the `action` is taken, where `1` indicates a filled cell and `0` indicates an empty cell. This representation also includes the newly apperaed tetromino to be placed.
        - `curr_active_tetromino` (2D numpy array): A binary array of the same shape as `curr_board`, containing ONLY the tetromino to be placed for the NEXT step. Therefore, current board without active tetromino is curr_board - curr_active_tetromino.
        - `prev_board` (2D numpy array): A binary array of the same shape as `curr_board`, representing the game board of previous step.
        - `prev_active_tetromino` (2D numpy array): A binary array of the same shape as `prev_board`. Previous board without active tetromino is prev_board - prev_active_tetromino.
        - `lines_cleared` (int): The number of lines cleared resulted from the `action` taken in the current step.
    
    Example:
        Consider a 7x5 board at a given point during gameplay, where the following inputs are given:
        - action: 9 (column index 2, rotates counter-clockwise for 1 time)
        - curr_board:
            [
                [0, 0, 1, 1, 0],
                [0, 0, 1, 1, 0],
                [0, 0, 0, 0, 0],
                [0, 0, 0, 0, 0],
                [0, 0, 0, 0, 0],
                [0, 0, 1, 1, 0],
                [0, 0, 0, 1, 1]
            ]
        - curr_active_tetromino:
            [
                [0, 0, 1, 1, 0],
                [0, 0, 1, 1, 0],
                [0, 0, 0, 0, 0],
                [0, 0, 0, 0, 0],
                [0, 0, 0, 0, 0],
                [0, 0, 0, 0, 0],
                [0, 0, 0, 0, 0]
            ]
        - prev_board:
            [
                [0, 0, 0, 1, 0],
                [0, 0, 1, 1, 0],
                [0, 0, 1, 0, 0],
                [0, 0, 0, 0, 0],
                [0, 0, 0, 0, 0],
                [0, 0, 0, 0, 0],
                [0, 0, 0, 0, 0]
            ]
        - prev_active_tetromino: 
            [
                [0, 0, 0, 1, 0],
                [0, 0, 1, 1, 0],
                [0, 0, 1, 0, 0],
                [0, 0, 0, 0, 0],
                [0, 0, 0, 0, 0],
                [0, 0, 0, 0, 0],
                [0, 0, 0, 0, 0]
            ]
        - lines_cleared: 0

        This example shows that by taking action=9, the game board transitions from prev_board to curr_board by placing a z-shape tetromino to the lower right corner of the board.
        Additionally, a new tetromino appears at the top of the curr_board.
        Since no lines are cleared, the number of lines cleared is 0.
        The example is simplified for clarity. The actual dimension of the board is 20x10.

    Returns (Tuple[float, Dict\[str, float]]):
        1. return the reward signal for the current step.
        2. return a dictionary of each individual reward component for the current step.
""")

if __name__ == "__main__":
    # Manual preview entry point: running this module as a script prints the
    # environment description so the prompt text can be inspected.

    # Prompt asking for candidate dense-reward components. The template is
    # dedented BEFORE the description is substituted, so a multi-line
    # env_description can never change the common margin that dedent() strips.
    instructions = dedent("""\
        Design a reward structure for training a reinforcement learning agent based on the environment specification:
        
        <Environment>
        {env_description}
        </Environment>

        Task:
        Return a list of individual reward function components that could be included in a dense reward structure. Each component should:
        1. Be specific and concrete, describing exactly what behavior it encourages.
        2. Be suitable for generating a real-time reward signal at every time step.
        3. Address different aspects of agent performance or game strategy, beyond just line clearing.
        4. Include only one idea per item in the list.
        """).format(env_description=env_description)

    rprint(env_description)
    # Uncomment to preview the other prompts instead:
    # rprint(instructions)
    # rprint(reward_func_definition)