[
  {
    "unique_id": "1_instruction_comprehension/explicit_object_goal/images/rtx_frames__new_85_85_droid#episode_1036_Q1",
    "task_type": [
      "1_instruction_comprehension",
      "explicit_object_goal"
    ],
    "input_type": "video",
    "robotic_type": "single-arm",
    "question": "With <draw a shower curtain> as the goal, plan what steps to do to complete the goal?\n\nPlease limit your answer to at most 3 steps.",
    "gt_answer": "1-move_to(none, left_edge_of_the_shower_curtain),2-grasp(left_edge_of_the_shower_curtain),3-pull(left_edge_of_the_shower_curtain, left_side_of_the_shower_curtain)",
    "image_urls": [
      "frame_0.png"
    ]
  },
  {
    "unique_id": "1_instruction_comprehension/implicit_demand_goal/images/rtx_frames__new_85_85_droid#episode_1036_Q1",
    "task_type": [
        "1_instruction_comprehension",
        "implicit_demand_goal"
    ],
    "input_type": "video",
    "robotic_type": "single-arm",
    "question": "With <I'm about to shower, respect my privacy> as the goal, plan what steps to do to complete the goal?\n\nPlease limit your answer to at most 3 steps.",
    "gt_answer": "1-move_to(none, left_edge_of_the_shower_curtain),2-grasp(left_edge_of_the_shower_curtain),3-pull(left_edge_of_the_shower_curtain, left_side_of_the_shower_curtain)",
    "image_urls": [
        "frame_0.png"
    ]
},
{
  "unique_id": "2_perception_reasoning/object_centric/static_attribute/images/is_sealed/is_sealed_0043",
  "task_type": [
    "2_perception_reasoning",
    "object_centric",
    "static_attribute"
  ],
  "input_type": "image",
  "question": "A container is sealed if it can be rotated by any amount in any direction without spilling its contents if it has anything inside. Is this container sealed? The response should use only one word: A, B, C, or D.\nA. No\nB. Partially yes\nC. Yes\nD. Unknown",
  "gt_answer": "A",
  "image_urls": [
    "06DA00CB8DC35BB06C136212D5D6E62D_06_32.jpg"
  ],
  "bounding_box_1": {
    "x1": 0.4118,
    "y1": 0.3183,
    "x2": 0.4507,
    "y2": 0.3606
  },
  "bounding_box_2": null,
  "question_type": "multiple_choice"
},
{
  "unique_id": "2_perception_reasoning/object_centric/tool_usage/images/77CD0EBE180C006CAE7E858CD89877C9_09_86",
  "task_type": [
      "2_perception_reasoning",
      "object_centric",
      "tool_usage"
  ],
  "input_type": "image",
  "question": "What functionality does the object inside the Green Bounding Box have?\n(A) Used for containing brushes\n(B) Used for brushing teeth\n(C) Used for cutting fabric\n(D) Used for recording sound \n Please answer only one word, e.g., A, B, C, or D.",
  "gt_answer": "A",
  "image_urls": [
      "77CD0EBE180C006CAE7E858CD89877C9_09_86.jpg"
  ],
  "bounding_box_1": {
      "x1": 0.1687,
      "y1": 0.1076,
      "x2": 0.4197,
      "y2": 0.6395
  }
},
{
  "unique_id": "2_perception_reasoning/robotic_centric/robot_type/images/DSL_shrinked_fold_towel_episode_1150",
  "task_type": [
      "2_perception_reasoning",
      "robotic_centric",
      "robot_type"
  ],
  "input_type": "video",
  "question": "Watch this video, what type of robotic arm is this?\n(A) Single arm with gripper\n(B) Bi-manual with gripper\n(C) Single arm with dexterous hand\n(D) Mobile Manipulation\n Please answer only one word, e.g., A, B, C, or D.",
  "gt_answer": "A",
  "image_urls": [
      "frame_0.png",
      "frame_1.png",
      "frame_2.png",
      "frame_3.png",
      "frame_4.png",
      "frame_5.png",
      "frame_6.png",
      "frame_7.png",
      "frame_8.png",
      "frame_9.png"
  ]
},
{
  "unique_id": "2_perception_reasoning/robotic_centric/robot_view/images/rdt-ft_connect_charging_cable_episode_7_cam_high",
  "task_type": [
      "2_perception_reasoning",
      "robotic_centric",
      "robot_view"
  ],
  "input_type": "video",
  "question": "Watch this video, what type of view is this?\n(A) Exo\n(B) Front Ego\n(C) Hand Ego\n(D) Head Ego\n Please answer only one word, e.g., A, B, C, or D.",
  "gt_answer": "B",
  "image_urls": [
      "frame_0.png",
      "frame_1.png",
      "frame_2.png",
      "frame_3.png",
      "frame_4.png",
      "frame_5.png",
      "frame_6.png",
      "frame_7.png",
      "frame_8.png",
      "frame_9.png"
  ]
},
{
  "unique_id": "2_perception_reasoning/scene_centric/spatial_centric/spatial_relation/images/13_packbowl",
  "task_type": [
    "2_perception_reasoning",
    "scene_centric",
    "spatial_centric",
    "spatial_relation"
  ],
  "input_type": "image",
  "question": "From the world-centric view, what is the spatial relation of the middle green bowl to the blue bowl?\nA. Inside\nB. Under\nC. On top\nD. Outside(on the same plane)\n Please answer only one word, e.g., A, B, C, or D.",
  "gt_answer": "C",
  "image_urls": [
    "13_packbowl.png"
  ]
},
{
  "unique_id": "2_perception_reasoning/scene_centric/spatial_temporal_centric/spatial_temporal_causality/images/benchmark1_h5_franka_3rgb_piled_on_yellow_block_on_purple_block_train_1011_172054",
  "task_type": [
    "2_perception_reasoning",
    "scene_centric",
    "spatial_temporal_centric",
    "spatial_temporal_causality"
  ],
  "input_type": "image",
  "question": "Where will the initial point (0.6650, 0.6800) appear after performing the task 'Place the bananas in the basket'?\nA: P1 (0.3757, 0.8009)\nB: P2 (0.1689, 0.2175)\nC: P3 (0.7233, 0.2934)\nD: P4 (0.3835, 0.5696)\n Please answer only one word, e.g., A, B, C, or D.",
  "gt_answer": "C",
  "image_urls": [
    "benchmark1_h5_franka_3rgb_piled_on_yellow_block_on_purple_block_train_1011_172054.png"
  ]
},
{
  "unique_id": "2_perception_reasoning/scene_centric/temporal_centric/timestamp_analysis/images/RDT-FT_imgs_1_choose_toy_by_size_episode_2",
  "task_type": [
    "2_perception_reasoning",
    "scene_centric",
    "temporal_centric",
    "timestamp_analysis"
  ],
  "input_type": "video",
  "question": "Please watch the 9 sequential frames of this video and find the frame interval that is most relevant to the move_away_from(none, box) event. The frame interval format (start_frame, end_frame) that precisely captures the timespan of this event. All frame indices must be between 0 and 8 (inclusive).\nA. (5, 8)\nB. (3, 3)\nC. (4, 4)\nD. (7, 8)\n\nPlease answer only one word, e.g., A, B, C, or D.",
  "gt_answer": "A",
  "image_urls": [
    "frame_0.png",
    "frame_2.png",
    "frame_4.png",
    "frame_6.png",
    "frame_8.png",
    "frame_10.png",
    "frame_12.png",
    "frame_14.png",
    "frame_16.png"
  ],
  "action_list": "1-move_to(none, doll_holding_the_basketball),2-grasp(doll_holding_the_basketball),3-pick_up(doll_holding_the_basketball),4-move_to(doll_holding_the_basketball, box),5-place(doll_holding_the_basketball, box),6-move_away_from(none, box)"
},
{
  "unique_id": "2_perception_reasoning/task_centric/task_instruction_comprehension/images/rtx_frames__new_85_85_droid#episode_66030",
  "task_type": [
      "2_perception_reasoning",
      "task_centric",
      "task_instruction_comprehension"
  ],
  "input_type": "image",
  "question": "Which object will the robot gripper hold in hand while executing the task of separating and rearranging a pile of laundry items?\nA: A\nB: B\nC: C\nD: D\n Please answer only one word, e.g., A, B, C, or D.",
  "gt_answer": "C",
  "image_urls": [
      "rtx_frames__new_85_85_droid#episode_66030.png"
  ],
  "bounding_box_A": {
      "x1": 0.8031,
      "y1": 0.6111,
      "x2": 0.8781,
      "y2": 0.7389
  },
  "bounding_box_B": {
      "x1": 0.0781,
      "y1": 0.4611,
      "x2": 0.3,
      "y2": 0.7667
  },
  "bounding_box_C": {
      "x1": 0.4531,
      "y1": 0.5,
      "x2": 0.6188,
      "y2": 0.8222
  },
  "bounding_box_D": {
      "x1": 0.525,
      "y1": 0.1722,
      "x2": 0.6875,
      "y2": 0.45
  }
},
{
  "unique_id": "3_generalized_planning/cross_embodiment/dual_arm/images/ARIO_imgs_extract-frames_Songling_series-1_task-pick_U_driver_20_4_7th_PCL_episode-2_rgbd-cam_high_Q1",
  "task_type": [
    "3_generalized_planning",
    "cross_embodiment",
    "dual_arm"
  ],
  "input_type": "video",
  "robotic_type": "dual-arm",
  "question": "With <remove the usb from the device on the right and plug it into the device on the left> as the goal, plan what steps to do to complete the goal?\n\nPlease limit your answer to at most 5 steps.",
  "gt_answer": "1-left:move_to(none, the_right_device), right:move_to(none, the_right_device), 2-left:grasp(the_right_device), right:grasp(usb_drive), 3-left:press(none, the_right_device), right:pick_up(usb_drive), 4-left:no_ops, right:move_to(usb_drive, the_left_device), 5-left:no_ops, right:insert(usb_drive, the_left_device)",
  "image_urls": [
    "frame_0.png"
  ]
},
{
  "unique_id": "3_generalized_planning/cross_object/physical_attribute/color/images/benchmark1_h5_franka_3rgb_piled_on_yellow_block_on_purple_block_val_1011_175539_Q1",
  "task_type": [
    "3_generalized_planning",
    "cross_object",
    "physical_attribute",
    "color"
  ],
  "input_type": "image",
  "robotic_type": "single-arm",
  "question": "When planning the steps to achieve the goal, replace color-described objects with the number in your responses according to the visual prompt in the image. For example, when given a goal like 'put a white object into a yellow object' (and combined with the visual prompt in the image), your response should be specific and practical, such as '..., place(2, 6), ...'.\n\n With <stack a yellow cube on a purple cube> as the goal, plan what steps to do to complete the goal?\n\nPlease limit your answer to at most 5 steps.",
  "gt_answer": "1-move_to(none,6),2-grasp(6),3-pick_up(6),4-move_to(6,1),5-place(6,1)",
  "image_urls": [
    "benchmark1_h5_franka_3rgb_piled_on_yellow_block_on_purple_block_val_1011_175539.png"
  ]
},
{
  "unique_id": "3_generalized_planning/cross_view/multi_view/images/RDT-FT_arrange_word_ABCD_episode_2_Q1",
  "task_type": [
      "3_generalized_planning",
      "cross_view",
      "multi_view"
  ],
  "input_type": "video",
  "robotic_type": "dual-arm",
  "question": "When planning the steps to achieve the goal, please combine images from different perspectives, in order to recognize occluded objects or select arms for operation based on the distance between each arm and the target object.\n\nWith <Arranging letter blocks to spell \"ABCD\"> as the goal, plan what steps to do to complete the goal?\n\nPlease limit your answer to at most 12 steps.",
  "gt_answer": "0-left: move_to(none, 'a'_block),right:no_ops, 1-left:grasp('a'_block),right:no_ops, 2-left:pick_up('a'_block),right:no_ops, 3-left:place('a'_block, left_of_table),right:no_ops, 4-left:move_to(none,'b'_block),right:no_ops, 5-left:grasp('b'_block),right:no_ops, 6-left:pick_up('b'_block),right:no_ops, 7-left:place('b'_block,right_of_'a'_block), right:no_ops, 8-left:no_ops,right:move_to(none, 'd'_block), 9-left:no_ops,right:grasp('d'_block), 10-left:no_ops, right:pick_up('d'_block), 11-left:no_ops,right:place('d'_block,right_of_'c'_block)",
  "image_urls": [
      "cam_right_wrist_0.png",
      "cam_left_wrist_0.png",
      "cam_high_0.png"
  ]
},
{
  "unique_id": "4_affordance_reasoning/dynamic_affordance/images/49_bridge#episode_10049#frame_15",
  "task_type": [
      "4_affordance_reasoning",
      "dynamic_affordance"
  ],
  "input_type": "image",
  "question": "Which set of 4 points is the correct trajectory for placing a cucumber into a pot?",
  "gt_answer": [
      [
          0.46839080459770116,
          0.07854406130268199
      ],
      [
          0.514367816091954,
          0.24137931034482757
      ],
      [
          0.5560344827586207,
          0.3773946360153257
      ],
      [
          0.5933908045977011,
          0.5727969348659003
      ]
  ],
  "image_urls": [
      "49_bridge#episode_10049#frame_15.png"
  ]
},
{
  "unique_id": "4_affordance_reasoning/static_affordance/images/rt_frames_success_rtx_frames_success_16_49_bridge#episode_779_sf0_ef14",
  "task_type": [
      "4_affordance_reasoning",
      "static_affordance"
  ],
  "input_type": "image",
  "question": "Which point in the image corresponds to the object affordance that should be manipulated to fulfill the instruction, reach for the banana?\n",
  "gt_answer": "(0.564186, 0.768602)",
  "image_urls": [
      "rt_frames_success_rtx_frames_success_16_49_bridge#episode_779_sf0_ef14.png"
  ]
}
]