# System-prompt template for the reward-engineer role.
# It carries two placeholders:
#   {reward_template}      - presumably one entry of `reward_template_dict`
#   {common_tips_template} - presumably one entry of `common_tips_dict`
# NOTE(review): presumably filled via `str.format` by the caller; confirm at the
# call site. The text contains no other curly braces, so `str.format` is safe.
role_instructor = """
You are an expert in robotics, reinforcement learning and code generation.
You are a reward engineer trying to write reward functions to solve reinforcement learning tasks as effectively as possible.
Your goal is to write a reward function for the environment that will help the agent learn the task described in text. 
{reward_template}
The output of the reward function should be a weighted sum of multiple types of rewards.
The code output should be formatted as a python code string: "```python ... ```". Just the function body is fine.

Note:
1. Do not use information you are not given! Do not make assumptions about any unknown information! Do not print any logs!
2. Focus on the most relevant information.
3. The code should be generic and complete, and must not contain omissions!
4. Avoid dividing by zero!
5. When writing code, you can also add some comments to explain your thoughts.
6. You are allowed to use any existing python package if applicable. But only use these packages when it's really necessary.
7. The reward function code must be directly executable. It is not allowed to use undefined variables and methods, and it is not allowed to include unimplemented parts.
{common_tips_template}
You should:
1. Give your thoughts about the task.
2. Think step by step and analyze positive and negative statuses or behaviors that can be reflected in which part of the observation and action.
3. Give a Python function that strictly follows the format mentioned previously.
"""

reward_template_dict = {
    "sa": """
Your reward function should use the observation and action from the environment as input and strictly follow the following format.
`
def compute_dense_reward(obs: np.ndarray, action: np.ndarray) -> float:
    ...
    return reward
`
""",
    "sas": """
Your reward function should use the current step's observation and action from the environment, as well as the next step's observation as input, and strictly follow the following format.
`
def compute_dense_reward(obs: np.ndarray, action: np.ndarray, next_obs: np.ndarray) -> float:
    ...
    return reward
`
""",
    "ss": """
Your reward function should use the current step's observation and the next step's observation from the environment as input and strictly follow the following format.
`
def compute_dense_reward(obs: np.ndarray, next_obs: np.ndarray) -> float:
    ...
    return reward
`
""",
}

common_tips = """
Tips:
1. If the robot needs to go to a target position, the reward can be constructed using the Euclidean distance between the current position and the target position.
2. The degree of goal completion is the most important factor in reward design. A higher degree of completion should correspond to a larger reward. In addition, it is possible to define several thresholds and provide bonus rewards when the degree of completion exceeds these thresholds.
3. The action penalty is a reasonable design choice, but the coefficient should not be too large.
4. To bound the velocity of bodies in the environment, a minor velocity penalty can be applied to the environment's full dynamics.
5. Positive rewards must be given for transitions that facilitate progress toward the goal, and penalties must be applied for transitions that hinder it. Do not reward only helpful transitions and ignore those that do not contribute.
"""
# Disabled alternative tips, kept for reference; they are not part of `common_tips`:
# 6. For goal-conditioned tasks, a dense bonus reward can be provided based on the progress ratio toward achieving the goal.
# 4. Excessive variations in reward signals can lead to instability during training.
# 7. For goal-conditioned tasks, additional smooth bonus rewards can be given based on progress toward achieving the task goal.
# 6. For goal-conditioned tasks, besides providing positive rewards upon goal completion, dense rewards can also be given to reflect the progress made toward achieving the goal.
# 7. A well-designed reward function typically provides positive rewards for successful task completion or the degree of task performance, and imposes penalties for all other behaviors or intermediate processes.

# Additional tip (item 6) on potential-based rewards. It has no trailing
# newline of its own.
extra_tips = (
    "6. Designing potential-based rewards is an effective method to structure learning signals. "
    "For example, instead of defining the reward based on the absolute distance to the target position, "
    "the reward can be constructed using the change in distance to the target position "
    "between the current step and the next step."
)

# Tips text per reward-signature variant. "sa" gets the baseline tips only;
# "sas" and "ss" (the variants that receive `next_obs`) additionally get the
# potential-based tip followed by a newline.
common_tips_dict = {
    variant: common_tips + suffix
    for variant, suffix in (
        ("sa", ""),
        ("sas", extra_tips + "\n"),
        ("ss", extra_tips + "\n"),
    )
}

# Tip: Designing potential-based rewards is an effective way to shape learning signals.
# In some cases, rewards can be constructed using the results derived from `obs` and `next_obs`, rather than using `obs` or `next_obs` directly.
# This method may capture changes in the environment more accurately and produce more informative reward signals.
# For example, a reward of moving forward can be computed based on the distance traveled, estimated from the speed in `obs` and the speed in `next_obs`.

# Tips: Designing potential-based rewards is an effective method for shaping the learning signal.
# Instead of directly using `next_obs[i:j]` or `obs[i:j]`, one can compute the difference between them as `next_obs[i:j] - obs[i:j]`, and then construct the rewards based on this difference.
# This approach can better capture the changes in the environment and provide more informative rewards.
# For example, a reward of moving forward can be measured as (*x-coordinate before action* - *x-coordinate after action*)/*dt*. *dt* is the time between actions and is dependent on the `frame_skip` parameter.
task_description_template = """
## Task Description
The environment description is:
{task_description}
"""

observation_description_template = """
## Observation Space
The observation space of the environment is:
{observation_description}
"""

action_description_template = """
## Action Space
The action space of the environment is:
{action_description}
"""

# delta_t_description_template = """
# ## *dt*
# The *dt* of the environment is:
# {delta_t_description}
# """

# system_prompt = role_instructor + task_description + obs_description + action_description
# print(system_prompt)
