{
    "vln_identify_robot": {
        "max_num_query_image": 0,
        "output_format": "exact_text",
        "max_num_query_video": 0,
        "num_input": "9-image or more",
        "taxonomy_tree_path": "Planning;Agents_and_Planning;Navigation",
        "skills": [
            "Object Recognition and Classification",
            "Scene and Event Understanding",
            "Spatial and Temporal Reasoning"
        ],
        "input_format": "Photographs",
        "app": "Planning"
    },
    "dish_ingredient_match": {
        "max_num_query_image": 1,
        "output_format": "multiple_choice",
        "max_num_query_video": 0,
        "num_input": "9-image or more",
        "taxonomy_tree_path": "Knowledge;World_Knowledge",
        "skills": [
            "Object Recognition and Classification",
            "Scene and Event Understanding"
        ],
        "input_format": "Photographs",
        "app": "Knowledge"
    },
    "video_eval_visual_pref": {
        "max_num_query_image": 16,
        "output_format": "multiple_choice",
        "max_num_query_video": 0,
        "num_input": "video",
        "taxonomy_tree_path": "Metrics;Generated_Video_Eval",
        "skills": [
            "Object Recognition and Classification",
            "Scene and Event Understanding"
        ],
        "input_format": "Videos",
        "app": "Metrics"
    },
    "logical_reasoning_find_odd_one_out": {
        "max_num_query_image": 1,
        "output_format": "structured_output",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Planning;Puzzles_and_Games",
        "skills": [
            "Object Recognition and Classification",
            "Mathematical and Logical Reasoning",
            "Spatial and Temporal Reasoning"
        ],
        "input_format": "Artistic and Creative Content",
        "app": "Planning"
    },
    "cultural_vqa": {
        "max_num_query_image": 1,
        "output_format": "contextual_formatted_text",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Knowledge;Human_and_Culture",
        "skills": [
            "Object Recognition and Classification",
            "Commonsense and Social Reasoning"
        ],
        "input_format": "Photographs",
        "app": "Knowledge"
    },
    "TRANCE_physics_reasoning_basic": {
        "max_num_query_image": 2,
        "output_format": "exact_text",
        "max_num_query_video": 0,
        "num_input": "2-3 images",
        "taxonomy_tree_path": "Perception;Physical_Understanding;Physical_Reasoning",
        "skills": [
            "Object Recognition and Classification",
            "Spatial and Temporal Reasoning",
            "Mathematical and Logical Reasoning"
        ],
        "input_format": "3D Models and Aerial Imagery",
        "app": "Perception"
    },
    "logical_reasoning_fit_pattern": {
        "max_num_query_image": 1,
        "output_format": "multiple_choice",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Planning;Puzzles_and_Games",
        "skills": [
            "Mathematical and Logical Reasoning",
            "Spatial and Temporal Reasoning"
        ],
        "input_format": "Diagrams and Data Visualizations",
        "app": "Planning"
    },
    "knowledge_sign_recognition": {
        "max_num_query_image": 6,
        "output_format": "multiple_choice",
        "max_num_query_video": 0,
        "num_input": "6-8 images",
        "taxonomy_tree_path": "Knowledge;World_Knowledge;Logo_and_Sign",
        "skills": [
            "Object Recognition and Classification",
            "Text Recognition (OCR)"
        ],
        "input_format": "Text-Based Images and Documents",
        "app": "Knowledge"
    },
    "relative_depth_of_different_points": {
        "max_num_query_image": 1,
        "output_format": "multiple_choice",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Perception;3D_understanding",
        "skills": [
            "Spatial and Temporal Reasoning",
            "Scene and Event Understanding"
        ],
        "input_format": "Photographs",
        "app": "Perception"
    },
    "autonomous_driving_scene_analysis": {
        "max_num_query_image": 1,
        "output_format": "exact_text",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Perception;Object_and_Scene_Understanding",
        "skills": [
            "Object Recognition and Classification",
            "Text Recognition (OCR)",
            "Scene and Event Understanding"
        ],
        "input_format": "Photographs",
        "app": "Perception"
    },
    "geometry_reasoning_count_line_intersections": {
        "max_num_query_image": 1,
        "output_format": "structured_output",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Mathematics;Geometry",
        "skills": [
            "Object Recognition and Classification",
            "Mathematical and Logical Reasoning",
            "Spatial and Temporal Reasoning"
        ],
        "input_format": "Diagrams and Data Visualizations",
        "app": "Mathematics"
    },
    "llavaguard": {
        "max_num_query_image": 1,
        "output_format": "structured_output",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Knowledge;Human_and_Culture;safety_and_norm",
        "skills": [
            "Ethical and Safety Reasoning",
            "Scene and Event Understanding",
            "Commonsense and Social Reasoning"
        ],
        "input_format": "Photographs",
        "app": "Knowledge"
    },
    "knowledge_graph_understanding": {
        "max_num_query_image": 0,
        "output_format": "contextual_formatted_text",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Perception;Diagram_and_Document_Understanding;Diagram;Diagram_QA",
        "skills": [
            "Object Recognition and Classification",
            "Commonsense and Social Reasoning"
        ],
        "input_format": "Diagrams and Data Visualizations",
        "app": "Perception"
    },
    "medical_content_based_retrieval_radiology": {
        "max_num_query_image": 4,
        "output_format": "multiple_choice",
        "max_num_query_video": 0,
        "num_input": "4-5 images",
        "taxonomy_tree_path": "Science;Life_Sciences;Medical",
        "skills": [
            "Object Recognition and Classification",
            "Domain-Specific Knowledge and Skills"
        ],
        "input_format": "Text-Based Images and Documents",
        "app": "Science"
    },
    "recover_masked_word_in_figure": {
        "max_num_query_image": 1,
        "output_format": "contextual_formatted_text",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Perception;Diagram_and_Document_Understanding;Diagram",
        "skills": [
            "Text Recognition (OCR)",
            "Language Understanding and Generation",
            "Domain-Specific Knowledge and Skills"
        ],
        "input_format": "Diagrams and Data Visualizations",
        "app": "Perception"
    },
    "photoshop_operation": {
        "max_num_query_image": 2,
        "output_format": "structured_output",
        "max_num_query_video": 0,
        "num_input": "2-3 images",
        "taxonomy_tree_path": "Perception;Physical_Understanding;Lighting_and_Shading",
        "skills": [
            "Object Recognition and Classification",
            "Scene and Event Understanding"
        ],
        "input_format": "Photographs",
        "app": "Perception"
    },
    "multilingual_news_qa": {
        "max_num_query_image": 0,
        "output_format": "contextual_formatted_text",
        "max_num_query_video": 0,
        "num_input": "9-image or more",
        "taxonomy_tree_path": "Information_Extraction;Multimodal_QA",
        "skills": [
            "Language Understanding and Generation",
            "Text Recognition (OCR)"
        ],
        "input_format": "Photographs",
        "app": "Information_Extraction"
    },
    "rocks_samples_identify": {
        "max_num_query_image": 5,
        "output_format": "contextual_formatted_text",
        "max_num_query_video": 0,
        "num_input": "4-5 images",
        "taxonomy_tree_path": "Perception;Visual_Recognition",
        "skills": [
            "Object Recognition and Classification",
            "Domain-Specific Knowledge and Skills"
        ],
        "input_format": "Photographs",
        "app": "Perception"
    },
    "paper_vqa": {
        "max_num_query_image": 1,
        "output_format": "contextual_formatted_text",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Perception;Diagram_and_Document_Understanding;Document;Document_QA",
        "skills": [
            "Text Recognition (OCR)",
            "Mathematical and Logical Reasoning"
        ],
        "input_format": "Text-Based Images and Documents",
        "app": "Perception"
    },
    "product_ocr_qa": {
        "max_num_query_image": 1,
        "output_format": "exact_text",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Information_Extraction;Multimodal_QA",
        "skills": [
            "Text Recognition (OCR)",
            "Language Understanding and Generation"
        ],
        "input_format": "Photographs",
        "app": "Information_Extraction"
    },
    "planning_visual_barman": {
        "max_num_query_image": 2,
        "output_format": "structured_output",
        "max_num_query_video": 0,
        "num_input": "2-3 images",
        "taxonomy_tree_path": "Planning;Agents_and_Planning;Symbolic_Planning",
        "skills": [
            "Planning and Decision Making",
            "Spatial and Temporal Reasoning",
            "Object Recognition and Classification"
        ],
        "input_format": "Diagrams and Data Visualizations",
        "app": "Planning"
    },
    "visual_correspondance_in_two_images": {
        "max_num_query_image": 2,
        "output_format": "structured_output",
        "max_num_query_video": 0,
        "num_input": "2-3 images",
        "taxonomy_tree_path": "Perception;Spatial_Understanding",
        "skills": [
            "Object Recognition and Classification",
            "Scene and Event Understanding"
        ],
        "input_format": "Photographs",
        "app": "Perception"
    },
    "video_segments_reordering": {
        "max_num_query_image": 10,
        "output_format": "structured_output",
        "max_num_query_video": 0,
        "num_input": "video",
        "taxonomy_tree_path": "Perception;Temporal_Understanding",
        "skills": [
            "Spatial and Temporal Reasoning",
            "Scene and Event Understanding"
        ],
        "input_format": "Videos",
        "app": "Perception"
    },
    "media_recommend_solutions_stackoverflow": {
        "max_num_query_image": 4,
        "output_format": "multiple_choice",
        "max_num_query_video": 0,
        "num_input": "4-5 images",
        "taxonomy_tree_path": "Coding;Code_Understanding;Code_Match",
        "skills": [
            "Language Understanding and Generation",
            "Domain-Specific Knowledge and Skills"
        ],
        "input_format": "User Interface Screenshots",
        "app": "Coding"
    },
    "planning_visual_floortile": {
        "max_num_query_image": 2,
        "output_format": "structured_output",
        "max_num_query_video": 0,
        "num_input": "2-3 images",
        "taxonomy_tree_path": "Planning;Agents_and_Planning;Symbolic_Planning",
        "skills": [
            "Planning and Decision Making",
            "Spatial and Temporal Reasoning"
        ],
        "input_format": "Diagrams and Data Visualizations",
        "app": "Planning"
    },
    "medical_abdomen_MRI_organ_recognition": {
        "max_num_query_image": 4,
        "output_format": "contextual_formatted_text",
        "max_num_query_video": 0,
        "num_input": "4-5 images",
        "taxonomy_tree_path": "Science;Life_Sciences;Medical",
        "skills": [
            "Object Recognition and Classification",
            "Domain-Specific Knowledge and Skills"
        ],
        "input_format": "Text-Based Images and Documents",
        "app": "Science"
    },
    "signage_navigation": {
        "max_num_query_image": 1,
        "output_format": "exact_text",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Knowledge;World_Knowledge",
        "skills": [
            "Object Recognition and Classification",
            "Text Recognition (OCR)",
            "Spatial and Temporal Reasoning"
        ],
        "input_format": "Photographs",
        "app": "Knowledge"
    },
    "vln_english_next_step": {
        "max_num_query_image": 11,
        "output_format": "contextual_formatted_text",
        "max_num_query_video": 0,
        "num_input": "9-image or more",
        "taxonomy_tree_path": "Planning;Agents_and_Planning;Navigation",
        "skills": [
            "Scene and Event Understanding",
            "Spatial and Temporal Reasoning",
            "Language Understanding and Generation"
        ],
        "input_format": "Photographs",
        "app": "Planning"
    },
    "sign_language": {
        "max_num_query_image": 0,
        "output_format": "contextual_formatted_text",
        "max_num_query_video": 1,
        "num_input": "video",
        "taxonomy_tree_path": "Knowledge;Human_and_Culture",
        "skills": [
            "Object Recognition and Classification",
            "Scene and Event Understanding",
            "Language Understanding and Generation"
        ],
        "input_format": "Videos",
        "app": "Knowledge"
    },
    "Ad_count_detection": {
        "max_num_query_image": 1,
        "output_format": "numerical_data",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Perception;Counting",
        "skills": [
            "Object Recognition and Classification",
            "Scene and Event Understanding"
        ],
        "input_format": "User Interface Screenshots",
        "app": "Perception"
    },
    "2d_image_jigsaw_puzzle_easy": {
        "max_num_query_image": 2,
        "output_format": "structured_output",
        "max_num_query_video": 0,
        "num_input": "2-3 images",
        "taxonomy_tree_path": "Perception;Spatial_Understanding",
        "skills": [
            "Object Recognition and Classification",
            "Spatial and Temporal Reasoning"
        ],
        "input_format": "Photographs",
        "app": "Perception"
    },
    "flowchart_code_generation": {
        "max_num_query_image": 1,
        "output_format": "multiple_choice",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Coding;Code_Understanding;Code_Match",
        "skills": [
            "Mathematical and Logical Reasoning",
            "Language Understanding and Generation"
        ],
        "input_format": "Diagrams and Data Visualizations",
        "app": "Coding"
    },
    "painting_QA": {
        "max_num_query_image": 4,
        "output_format": "exact_text",
        "max_num_query_video": 0,
        "num_input": "4-5 images",
        "taxonomy_tree_path": "Knowledge;World_Knowledge;Painting",
        "skills": [
            "Object Recognition and Classification",
            "Domain-Specific Knowledge and Skills"
        ],
        "input_format": "Artistic and Creative Content",
        "app": "Knowledge"
    },
    "song_title_identification_from_lyrics": {
        "max_num_query_image": 1,
        "output_format": "structured_output",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Knowledge;World_Knowledge",
        "skills": [
            "Text Recognition (OCR)",
            "Language Understanding and Generation"
        ],
        "input_format": "User Interface Screenshots",
        "app": "Knowledge"
    },
    "MMMU_pro_exam_screenshot": {
        "max_num_query_image": 1,
        "output_format": "multiple_choice",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Science;STEM",
        "skills": [
            "Text Recognition (OCR)",
            "Mathematical and Logical Reasoning",
            "Domain-Specific Knowledge and Skills"
        ],
        "input_format": "User Interface Screenshots",
        "app": "Science"
    },
    "TRANCE_physics_reasoning_event": {
        "max_num_query_image": 2,
        "output_format": "multiple_choice",
        "max_num_query_video": 0,
        "num_input": "2-3 images",
        "taxonomy_tree_path": "Perception;Physical_Understanding;Physical_Reasoning",
        "skills": [
            "Object Recognition and Classification",
            "Spatial and Temporal Reasoning",
            "Mathematical and Logical Reasoning"
        ],
        "input_format": "3D Models and Aerial Imagery",
        "app": "Perception"
    },
    "monthly_weather_days_count": {
        "max_num_query_image": 1,
        "output_format": "structured_output",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Perception;Diagram_and_Document_Understanding;Table_QA",
        "skills": [
            "Object Recognition and Classification",
            "Text Recognition (OCR)"
        ],
        "input_format": "User Interface Screenshots",
        "app": "Perception"
    },
    "code_solution_compare": {
        "max_num_query_image": 2,
        "output_format": "exact_text",
        "max_num_query_video": 0,
        "num_input": "2-3 images",
        "taxonomy_tree_path": "Coding;Code_Understanding;Code_Match",
        "skills": [
            "Mathematical and Logical Reasoning",
            "Language Understanding and Generation",
            "Text Recognition (OCR)"
        ],
        "input_format": "User Interface Screenshots",
        "app": "Coding"
    },
    "functionality_matching_in_different_objects": {
        "max_num_query_image": 2,
        "output_format": "structured_output",
        "max_num_query_video": 0,
        "num_input": "2-3 images",
        "taxonomy_tree_path": "Perception;Object_and_Scene_Understanding",
        "skills": [
            "Object Recognition and Classification",
            "Spatial and Temporal Reasoning"
        ],
        "input_format": "Photographs",
        "app": "Perception"
    },
    "planning_visual_storage": {
        "max_num_query_image": 1,
        "output_format": "structured_output",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Planning;Agents_and_Planning;Symbolic_Planning",
        "skills": [
            "Planning and Decision Making",
            "Spatial and Temporal Reasoning"
        ],
        "input_format": "Diagrams and Data Visualizations",
        "app": "Planning"
    },
    "geometry_reasoning_nested_squares": {
        "max_num_query_image": 1,
        "output_format": "structured_output",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Mathematics;Geometry",
        "skills": [
            "Object Recognition and Classification",
            "Spatial and Temporal Reasoning",
            "Mathematical and Logical Reasoning"
        ],
        "input_format": "Diagrams and Data Visualizations",
        "app": "Mathematics"
    },
    "GUI_Act_Web_Multi": {
        "max_num_query_image": 1,
        "output_format": "structured_output",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Planning;Agents_and_Planning;GUI_Operation",
        "skills": [
            "Object Recognition and Classification",
            "Planning and Decision Making"
        ],
        "input_format": "User Interface Screenshots",
        "app": "Planning"
    },
    "mensa_iq_test": {
        "max_num_query_image": 1,
        "output_format": "structured_output",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Planning;Puzzles_and_Games",
        "skills": [
            "Object Recognition and Classification",
            "Mathematical and Logical Reasoning"
        ],
        "input_format": "Diagrams and Data Visualizations",
        "app": "Planning"
    },
    "music_sheet_sentiment": {
        "max_num_query_image": 4,
        "output_format": "exact_text",
        "max_num_query_video": 0,
        "num_input": "4-5 images",
        "taxonomy_tree_path": "Knowledge;World_Knowledge;Music",
        "skills": [
            "Object Recognition and Classification",
            "Commonsense and Social Reasoning"
        ],
        "input_format": "Text-Based Images and Documents",
        "app": "Knowledge"
    },
    "paper_review_rating": {
        "max_num_query_image": 4,
        "output_format": "numerical_data",
        "max_num_query_video": 0,
        "num_input": "4-5 images",
        "taxonomy_tree_path": "Metrics;Paper_Review",
        "skills": [
            "Object Recognition and Classification",
            "Text Recognition (OCR)",
            "Language Understanding and Generation",
            "Domain-Specific Knowledge and Skills"
        ],
        "input_format": "Text-Based Images and Documents",
        "app": "Metrics"
    },
    "go_capture_stone": {
        "max_num_query_image": 1,
        "output_format": "exact_text",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Planning;Puzzles_and_Games;Board_Games",
        "skills": [
            "Spatial and Temporal Reasoning",
            "Planning and Decision Making"
        ],
        "input_format": "Artistic and Creative Content",
        "app": "Planning"
    },
    "medical_multi_organ_segmentation_rater": {
        "max_num_query_image": 1,
        "output_format": "multiple_choice",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Science;Life_Sciences;Medical",
        "skills": [
            "Object Recognition and Classification",
            "Domain-Specific Knowledge and Skills"
        ],
        "input_format": "Text-Based Images and Documents",
        "app": "Science"
    },
    "MMMU_physics_chemistry_selected": {
        "max_num_query_image": 1,
        "output_format": "exact_text",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Science;Physics",
        "skills": [
            "Mathematical and Logical Reasoning",
            "Domain-Specific Knowledge and Skills"
        ],
        "input_format": "Diagrams and Data Visualizations",
        "app": "Science"
    },
    "arxiv_vqa": {
        "max_num_query_image": 1,
        "output_format": "multiple_choice",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Science;STEM",
        "skills": [
            "Object Recognition and Classification",
            "Text Recognition (OCR)",
            "Mathematical and Logical Reasoning",
            "Language Understanding and Generation"
        ],
        "input_format": "Diagrams and Data Visualizations",
        "app": "Science"
    },
    "planning_screenshot_grippers": {
        "max_num_query_image": 1,
        "output_format": "structured_output",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Planning;Agents_and_Planning;Symbolic_Planning",
        "skills": [
            "Planning and Decision Making",
            "Spatial and Temporal Reasoning"
        ],
        "input_format": "Text-Based Images and Documents",
        "app": "Planning"
    },
    "counting_multi_image": {
        "max_num_query_image": 3,
        "output_format": "numerical_data",
        "max_num_query_video": 0,
        "num_input": "2-3 images",
        "taxonomy_tree_path": "Perception;Counting",
        "skills": [
            "Object Recognition and Classification",
            "Scene and Event Understanding"
        ],
        "input_format": "Photographs",
        "app": "Perception"
    },
    "chinese_idiom_recognition": {
        "max_num_query_image": 1,
        "output_format": "exact_text",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Knowledge;World_Knowledge",
        "skills": [
            "Object Recognition and Classification",
            "Scene and Event Understanding",
            "Language Understanding and Generation",
            "Commonsense and Social Reasoning"
        ],
        "input_format": "Artistic and Creative Content",
        "app": "Knowledge"
    },
    "video_action_recognition": {
        "max_num_query_image": 0,
        "output_format": "structured_output",
        "max_num_query_video": 1,
        "num_input": "video",
        "taxonomy_tree_path": "Perception;Temporal_Understanding",
        "skills": [
            "Scene and Event Understanding",
            "Spatial and Temporal Reasoning"
        ],
        "input_format": "Videos",
        "app": "Perception"
    },
    "GUI_Act_Mobile_swipe": {
        "max_num_query_image": 1,
        "output_format": "structured_output",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Planning;Agents_and_Planning;GUI_Operation",
        "skills": [
            "Spatial and Temporal Reasoning",
            "Planning and Decision Making"
        ],
        "input_format": "User Interface Screenshots",
        "app": "Planning"
    },
    "road_map_find_highway_between_two_place": {
        "max_num_query_image": 1,
        "output_format": "exact_text",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Perception;Diagram_and_Document_Understanding;Diagram",
        "skills": [
            "Object Recognition and Classification",
            "Spatial and Temporal Reasoning"
        ],
        "input_format": "Diagrams and Data Visualizations",
        "app": "Perception"
    },
    "code_translation_easy": {
        "max_num_query_image": 2,
        "output_format": "structured_output",
        "max_num_query_video": 0,
        "num_input": "2-3 images",
        "taxonomy_tree_path": "Coding;Code_Translation",
        "skills": [
            "Text Recognition (OCR)",
            "Domain-Specific Knowledge and Skills"
        ],
        "input_format": "Text-Based Images and Documents",
        "app": "Coding"
    },
    "medical_cell_recognition": {
        "max_num_query_image": 1,
        "output_format": "exact_text",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Science;Life_Sciences;Medical",
        "skills": [
            "Object Recognition and Classification",
            "Domain-Specific Knowledge and Skills"
        ],
        "input_format": "Photographs",
        "app": "Science"
    },
    "geometry_plot_position_relationship": {
        "max_num_query_image": 1,
        "output_format": "exact_text",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Perception;Spatial_Understanding",
        "skills": [
            "Spatial and Temporal Reasoning",
            "Mathematical and Logical Reasoning"
        ],
        "input_format": "Diagrams and Data Visualizations",
        "app": "Perception"
    },
    "booking_web_recommendation": {
        "max_num_query_image": 0,
        "output_format": "contextual_formatted_text",
        "max_num_query_video": 0,
        "num_input": "9-image or more",
        "taxonomy_tree_path": "Information_Extraction;Search_by_Attribute_wo_Calculate",
        "skills": [
            "Object Recognition and Classification",
            "Scene and Event Understanding",
            "Text Recognition (OCR)"
        ],
        "input_format": "User Interface Screenshots",
        "app": "Information_Extraction"
    },
    "music_sheet_author": {
        "max_num_query_image": 3,
        "output_format": "exact_text",
        "max_num_query_video": 0,
        "num_input": "2-3 images",
        "taxonomy_tree_path": "Knowledge;World_Knowledge;Music",
        "skills": [
            "Object Recognition and Classification",
            "Domain-Specific Knowledge and Skills"
        ],
        "input_format": "Text-Based Images and Documents",
        "app": "Knowledge"
    },
    "font_recognition": {
        "max_num_query_image": 1,
        "output_format": "exact_text",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Knowledge;World_Knowledge",
        "skills": [
            "Object Recognition and Classification",
            "Text Recognition (OCR)"
        ],
        "input_format": "Text-Based Images and Documents",
        "app": "Knowledge"
    },
    "bongard_problem": {
        "max_num_query_image": 1,
        "output_format": "contextual_formatted_text",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Planning;Puzzles_and_Games",
        "skills": [
            "Object Recognition and Classification",
            "Mathematical and Logical Reasoning"
        ],
        "input_format": "Diagrams and Data Visualizations",
        "app": "Planning"
    },
    "code_translation_Python": {
        "max_num_query_image": 3,
        "output_format": "structured_output",
        "max_num_query_video": 0,
        "num_input": "2-3 images",
        "taxonomy_tree_path": "Coding;Code_Translation",
        "skills": [
            "Text Recognition (OCR)",
            "Domain-Specific Knowledge and Skills",
            "Language Understanding and Generation"
        ],
        "input_format": "Text-Based Images and Documents",
        "app": "Coding"
    },
    "google_streetview_line_sorting": {
        "max_num_query_image": 9,
        "output_format": "structured_output",
        "max_num_query_video": 0,
        "num_input": "9-image or more",
        "taxonomy_tree_path": "Perception;Temporal_Understanding",
        "skills": [
            "Scene and Event Understanding",
            "Spatial and Temporal Reasoning"
        ],
        "input_format": "Photographs",
        "app": "Perception"
    },
    "vln_identify_location": {
        "max_num_query_image": 4,
        "output_format": "structured_output",
        "max_num_query_video": 0,
        "num_input": "4-5 images",
        "taxonomy_tree_path": "Planning;Agents_and_Planning;Navigation",
        "skills": [
            "Scene and Event Understanding",
            "Spatial and Temporal Reasoning",
            "Language Understanding and Generation"
        ],
        "input_format": "Photographs",
        "app": "Planning"
    },
    "image_translation_en2cn": {
        "max_num_query_image": 1,
        "output_format": "contextual_formatted_text",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Information_Extraction;Structured_Parsing",
        "skills": [
            "Text Recognition (OCR)",
            "Language Understanding and Generation"
        ],
        "input_format": "Text-Based Images and Documents",
        "app": "Information_Extraction"
    },
    "clevrer_physics": {
        "max_num_query_image": 1,
        "output_format": "numerical_data",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Perception;Physical_Understanding;Physical_Reasoning",
        "skills": [
            "Object Recognition and Classification",
            "Spatial and Temporal Reasoning"
        ],
        "input_format": "Photographs",
        "app": "Perception"
    },
    "media_homepage_profile": {
        "max_num_query_image": 0,
        "output_format": "structured_output",
        "max_num_query_video": 0,
        "num_input": "9-image or more",
        "taxonomy_tree_path": "Information_Extraction;Search_by_Attribute_wo_Calculate",
        "skills": [
            "Text Recognition (OCR)",
            "Language Understanding and Generation"
        ],
        "input_format": "User Interface Screenshots",
        "app": "Information_Extraction"
    },
    "geometry_reasoning_overlapped_circle": {
        "max_num_query_image": 1,
        "output_format": "structured_output",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Mathematics;Geometry",
        "skills": [
            "Object Recognition and Classification",
            "Spatial and Temporal Reasoning",
            "Mathematical and Logical Reasoning"
        ],
        "input_format": "Diagrams and Data Visualizations",
        "app": "Mathematics"
    },
    "medical_image_artifacts_indentification": {
        "max_num_query_image": 1,
        "output_format": "exact_text",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Science;Life_Sciences;Medical",
        "skills": [
            "Object Recognition and Classification",
            "Domain-Specific Knowledge and Skills"
        ],
        "input_format": "Photographs",
        "app": "Science"
    },
    "GUI_Act_Mobile_tap": {
        "max_num_query_image": 1,
        "output_format": "numerical_data",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Planning;Agents_and_Planning;GUI_Operation",
        "skills": [
            "Object Recognition and Classification",
            "Spatial and Temporal Reasoning",
            "Language Understanding and Generation"
        ],
        "input_format": "User Interface Screenshots",
        "app": "Planning"
    },
    "booking_web_rating": {
        "max_num_query_image": 0,
        "output_format": "structured_output",
        "max_num_query_video": 0,
        "num_input": "9-image or more",
        "taxonomy_tree_path": "Information_Extraction;Structured_Parsing",
        "skills": [
            "Object Recognition and Classification",
            "Text Recognition (OCR)",
            "Scene and Event Understanding",
            "Language Understanding and Generation"
        ],
        "input_format": "User Interface Screenshots",
        "app": "Information_Extraction"
    },
    "medical_blood_vessels_recognition": {
        "max_num_query_image": 1,
        "output_format": "structured_output",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Science;Life_Sciences;Medical",
        "skills": [
            "Object Recognition and Classification",
            "Domain-Specific Knowledge and Skills"
        ],
        "input_format": "Text-Based Images and Documents",
        "app": "Science"
    },
    "ishihara_test": {
        "max_num_query_image": 1,
        "output_format": "structured_output",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Knowledge;Human_and_Culture",
        "skills": [
            "Object Recognition and Classification",
            "Domain-Specific Knowledge and Skills"
        ],
        "input_format": "Diagrams and Data Visualizations",
        "app": "Knowledge"
    },
    "code_translation_hard": {
        "max_num_query_image": 7,
        "output_format": "structured_output",
        "max_num_query_video": 0,
        "num_input": "6-8 images",
        "taxonomy_tree_path": "Coding;Code_Translation",
        "skills": [
            "Text Recognition (OCR)",
            "Domain-Specific Knowledge and Skills",
            "Language Understanding and Generation"
        ],
        "input_format": "Text-Based Images and Documents",
        "app": "Coding"
    },
    "character_recognition_in_TV_shows": {
        "max_num_query_image": 1,
        "output_format": "contextual_formatted_text",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Knowledge;World_Knowledge;Actor_Character_and_Famous_People",
        "skills": [
            "Object Recognition and Classification",
            "Language Understanding and Generation"
        ],
        "input_format": "User Interface Screenshots",
        "app": "Knowledge"
    },
    "game_platform_support_identification": {
        "max_num_query_image": 1,
        "output_format": "structured_output",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Information_Extraction;Compound_Search_and_Calculate",
        "skills": [
            "Text Recognition (OCR)",
            "Object Recognition and Classification"
        ],
        "input_format": "User Interface Screenshots",
        "app": "Information_Extraction"
    },
    "planning_visual_blocksworld": {
        "max_num_query_image": 2,
        "output_format": "structured_output",
        "max_num_query_video": 0,
        "num_input": "2-3 images",
        "taxonomy_tree_path": "Planning;Agents_and_Planning;Symbolic_Planning",
        "skills": [
            "Planning and Decision Making",
            "Spatial and Temporal Reasoning",
            "Object Recognition and Classification"
        ],
        "input_format": "Diagrams and Data Visualizations",
        "app": "Planning"
    },
    "healthcare_info_judgement": {
        "max_num_query_image": 1,
        "output_format": "multiple_choice",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Science;Life_Sciences;Medical",
        "skills": [
            "Text Recognition (OCR)",
            "Domain-Specific Knowledge and Skills",
            "Ethical and Safety Reasoning"
        ],
        "input_format": "User Interface Screenshots",
        "app": "Science"
    },
    "pokemon_3D_recognition": {
        "max_num_query_image": 2,
        "output_format": "structured_output",
        "max_num_query_video": 0,
        "num_input": "6-8 images",
        "taxonomy_tree_path": "Perception;3D_understanding",
        "skills": [
            "Object Recognition and Classification",
            "Text Recognition (OCR)"
        ],
        "input_format": "3D Models and Aerial Imagery",
        "app": "Perception"
    },
    "worldle": {
        "max_num_query_image": 3,
        "output_format": "structured_output",
        "max_num_query_video": 0,
        "num_input": "2-3 images",
        "taxonomy_tree_path": "Knowledge;World_Knowledge",
        "skills": [
            "Object Recognition and Classification",
            "Scene and Event Understanding",
            "Commonsense and Social Reasoning"
        ],
        "input_format": "Photographs",
        "app": "Knowledge"
    },
    "circuit_diagram_understanding": {
        "max_num_query_image": 1,
        "output_format": "numerical_data",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Science;Physics",
        "skills": [
            "Object Recognition and Classification",
            "Mathematical and Logical Reasoning",
            "Domain-Specific Knowledge and Skills"
        ],
        "input_format": "Diagrams and Data Visualizations",
        "app": "Science"
    },
    "planning_screenshot_barman": {
        "max_num_query_image": 1,
        "output_format": "structured_output",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Planning;Agents_and_Planning;Symbolic_Planning",
        "skills": [
            "Planning and Decision Making",
            "Spatial and Temporal Reasoning"
        ],
        "input_format": "User Interface Screenshots",
        "app": "Planning"
    },
    "counting_single_image": {
        "max_num_query_image": 1,
        "output_format": "numerical_data",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Perception;Counting",
        "skills": [
            "Object Recognition and Classification",
            "Spatial and Temporal Reasoning"
        ],
        "input_format": "Diagrams and Data Visualizations",
        "app": "Perception"
    },
    "tv_show_retrieval_by_character": {
        "max_num_query_image": 0,
        "output_format": "contextual_formatted_text",
        "max_num_query_video": 0,
        "num_input": "9-image or more",
        "taxonomy_tree_path": "Information_Extraction;Search_by_Attribute_wo_Calculate",
        "skills": [
            "Object Recognition and Classification",
            "Text Recognition (OCR)",
            "Language Understanding and Generation"
        ],
        "input_format": "User Interface Screenshots",
        "app": "Information_Extraction"
    },
    "multiview_reasoning_camera_moving": {
        "max_num_query_image": 2,
        "output_format": "exact_text",
        "max_num_query_video": 0,
        "num_input": "2-3 images",
        "taxonomy_tree_path": "Perception;3D_understanding",
        "skills": [
            "Spatial and Temporal Reasoning",
            "Scene and Event Understanding"
        ],
        "input_format": "Photographs",
        "app": "Perception"
    },
    "music_sheet_note_count": {
        "max_num_query_image": 1,
        "output_format": "numerical_data",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Knowledge;World_Knowledge;Music",
        "skills": [
            "Object Recognition and Classification",
            "Text Recognition (OCR)",
            "Mathematical and Logical Reasoning"
        ],
        "input_format": "Text-Based Images and Documents",
        "app": "Knowledge"
    },
    "code_match_problem": {
        "max_num_query_image": 4,
        "output_format": "exact_text",
        "max_num_query_video": 0,
        "num_input": "4-5 images",
        "taxonomy_tree_path": "Coding;Code_Understanding;Code_Match",
        "skills": [
            "Text Recognition (OCR)",
            "Domain-Specific Knowledge and Skills"
        ],
        "input_format": "Text-Based Images and Documents",
        "app": "Coding"
    },
    "code_translation_advanced": {
        "max_num_query_image": 4,
        "output_format": "structured_output",
        "max_num_query_video": 0,
        "num_input": "4-5 images",
        "taxonomy_tree_path": "Coding;Code_Translation",
        "skills": [
            "Text Recognition (OCR)",
            "Domain-Specific Knowledge and Skills",
            "Language Understanding and Generation"
        ],
        "input_format": "Text-Based Images and Documents",
        "app": "Coding"
    },
    "google_streetview_circle_reasoning": {
        "max_num_query_image": 11,
        "output_format": "multiple_choice",
        "max_num_query_video": 0,
        "num_input": "9-image or more",
        "taxonomy_tree_path": "Perception;3D_understanding",
        "skills": [
            "Spatial and Temporal Reasoning",
            "Scene and Event Understanding"
        ],
        "input_format": "Photographs",
        "app": "Perception"
    },
    "stackoverflow_debug_QA": {
        "max_num_query_image": 1,
        "output_format": "structured_output",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Coding;Code_Debugging",
        "skills": [
            "Text Recognition (OCR)",
            "Domain-Specific Knowledge and Skills",
            "Mathematical and Logical Reasoning"
        ],
        "input_format": "User Interface Screenshots",
        "app": "Coding"
    },
    "realworld_qa_en2cn": {
        "max_num_query_image": 1,
        "output_format": "contextual_formatted_text",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Information_Extraction;Multimodal_QA",
        "skills": [
            "Object Recognition and Classification",
            "Scene and Event Understanding",
            "Language Understanding and Generation"
        ],
        "input_format": "Photographs",
        "app": "Information_Extraction"
    },
    "video_grounding_spatial": {
        "max_num_query_image": 0,
        "output_format": "contextual_formatted_text",
        "max_num_query_video": 1,
        "num_input": "video",
        "taxonomy_tree_path": "Perception;Spatial_Understanding",
        "skills": [
            "Object Recognition and Classification",
            "Scene and Event Understanding",
            "Spatial and Temporal Reasoning"
        ],
        "input_format": "Videos",
        "app": "Perception"
    },
    "move_pos_to_pos_hanoi_4_pole": {
        "max_num_query_image": 2,
        "output_format": "structured_output",
        "max_num_query_video": 0,
        "num_input": "2-3 images",
        "taxonomy_tree_path": "Planning;Puzzles_and_Games",
        "skills": [
            "Spatial and Temporal Reasoning",
            "Planning and Decision Making",
            "Mathematical and Logical Reasoning"
        ],
        "input_format": "Diagrams and Data Visualizations",
        "app": "Planning"
    },
    "geometry_reasoning_grid": {
        "max_num_query_image": 1,
        "output_format": "structured_output",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Mathematics;Geometry",
        "skills": [
            "Text Recognition (OCR)",
            "Spatial and Temporal Reasoning",
            "Mathematical and Logical Reasoning"
        ],
        "input_format": "Diagrams and Data Visualizations",
        "app": "Mathematics"
    },
    "location_vqa": {
        "max_num_query_image": 1,
        "output_format": "exact_text",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Knowledge;World_Knowledge",
        "skills": [
            "Scene and Event Understanding",
            "Domain-Specific Knowledge and Skills"
        ],
        "input_format": "Photographs",
        "app": "Knowledge"
    },
    "medical_polyp_segmentation_single_object_rater": {
        "max_num_query_image": 4,
        "output_format": "structured_output",
        "max_num_query_video": 0,
        "num_input": "4-5 images",
        "taxonomy_tree_path": "Science;Life_Sciences;Medical",
        "skills": [
            "Object Recognition and Classification",
            "Domain-Specific Knowledge and Skills"
        ],
        "input_format": "Text-Based Images and Documents",
        "app": "Science"
    },
    "geometry_reasoning_circled_letter": {
        "max_num_query_image": 1,
        "output_format": "structured_output",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Perception;Visual_Recognition",
        "skills": [
            "Object Recognition and Classification",
            "Text Recognition (OCR)",
            "Spatial and Temporal Reasoning"
        ],
        "input_format": "Text-Based Images and Documents",
        "app": "Perception"
    },
    "transit_map_intersection_points": {
        "max_num_query_image": 1,
        "output_format": "structured_output",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Perception;Diagram_and_Document_Understanding;Diagram",
        "skills": [
            "Object Recognition and Classification",
            "Spatial and Temporal Reasoning",
            "Text Recognition (OCR)"
        ],
        "input_format": "Diagrams and Data Visualizations",
        "app": "Perception"
    },
    "webpage_code_understanding": {
        "max_num_query_image": 1,
        "output_format": "multiple_choice",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Coding;Code_Understanding",
        "skills": [
            "Object Recognition and Classification",
            "Domain-Specific Knowledge and Skills",
            "Language Understanding and Generation"
        ],
        "input_format": "User Interface Screenshots",
        "app": "Coding"
    },
    "google_streetview_direction_understanding": {
        "max_num_query_image": 2,
        "output_format": "exact_text",
        "max_num_query_video": 0,
        "num_input": "2-3 images",
        "taxonomy_tree_path": "Perception;3D_understanding",
        "skills": [
            "Scene and Event Understanding",
            "Spatial and Temporal Reasoning"
        ],
        "input_format": "Photographs",
        "app": "Perception"
    },
    "chess_find_legal_moves": {
        "max_num_query_image": 1,
        "output_format": "exact_text",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Planning;Puzzles_and_Games;Board_Games",
        "skills": [
            "Object Recognition and Classification",
            "Spatial and Temporal Reasoning",
            "Planning and Decision Making"
        ],
        "input_format": "User Interface Screenshots",
        "app": "Planning"
    },
    "topological_sort": {
        "max_num_query_image": 1,
        "output_format": "structured_output",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Mathematics;Graph_Theory",
        "skills": [
            "Mathematical and Logical Reasoning",
            "Spatial and Temporal Reasoning"
        ],
        "input_format": "Diagrams and Data Visualizations",
        "app": "Mathematics"
    },
    "forensic_detection_of_different_images": {
        "max_num_query_image": 4,
        "output_format": "multiple_choice",
        "max_num_query_video": 0,
        "num_input": "4-5 images",
        "taxonomy_tree_path": "Knowledge;Fact_Checking",
        "skills": [
            "Object Recognition and Classification",
            "Scene and Event Understanding"
        ],
        "input_format": "Photographs",
        "app": "Knowledge"
    },
    "code_add_tag": {
        "max_num_query_image": 3,
        "output_format": "contextual_formatted_text",
        "max_num_query_video": 0,
        "num_input": "2-3 images",
        "taxonomy_tree_path": "Coding;Code_Understanding",
        "skills": [
            "Text Recognition (OCR)",
            "Domain-Specific Knowledge and Skills"
        ],
        "input_format": "Text-Based Images and Documents",
        "app": "Coding"
    },
    "mindmap_elements_parsing": {
        "max_num_query_image": 1,
        "output_format": "structured_output",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Perception;Diagram_and_Document_Understanding;Diagram;Diagram_QA",
        "skills": [
            "Object Recognition and Classification",
            "Scene and Event Understanding",
            "Language Understanding and Generation"
        ],
        "input_format": "Artistic and Creative Content",
        "app": "Perception"
    },
    "interpret_force_perspective_illusion": {
        "max_num_query_image": 1,
        "output_format": "exact_text",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Knowledge;Fact_Checking",
        "skills": [
            "Object Recognition and Classification",
            "Spatial and Temporal Reasoning",
            "Scene and Event Understanding"
        ],
        "input_format": "Photographs",
        "app": "Knowledge"
    },
    "annoying_word_search": {
        "max_num_query_image": 1,
        "output_format": "contextual_formatted_text",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Planning;Puzzles_and_Games",
        "skills": [
            "Text Recognition (OCR)",
            "Spatial and Temporal Reasoning"
        ],
        "input_format": "Text-Based Images and Documents",
        "app": "Planning"
    },
    "google_streetview_line_reasoning": {
        "max_num_query_image": 9,
        "output_format": "multiple_choice",
        "max_num_query_video": 0,
        "num_input": "9-image or more",
        "taxonomy_tree_path": "Perception;Temporal_Understanding",
        "skills": [
            "Scene and Event Understanding",
            "Spatial and Temporal Reasoning"
        ],
        "input_format": "Photographs",
        "app": "Perception"
    },
    "polygon_interior_angles": {
        "max_num_query_image": 1,
        "output_format": "numerical_data",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Mathematics;Geometry",
        "skills": [
            "Object Recognition and Classification",
            "Mathematical and Logical Reasoning",
            "Spatial and Temporal Reasoning"
        ],
        "input_format": "Diagrams and Data Visualizations",
        "app": "Mathematics"
    },
    "medical_abdomen_endscopy_organ_recognition": {
        "max_num_query_image": 4,
        "output_format": "contextual_formatted_text",
        "max_num_query_video": 0,
        "num_input": "4-5 images",
        "taxonomy_tree_path": "Science;Life_Sciences;Medical",
        "skills": [
            "Object Recognition and Classification",
            "Domain-Specific Knowledge and Skills"
        ],
        "input_format": "Photographs",
        "app": "Science"
    },
    "ancient_map_understanding": {
        "max_num_query_image": 1,
        "output_format": "exact_text",
        "max_num_query_video": 0,
        "num_input": "2-3 images",
        "taxonomy_tree_path": "Knowledge;World_Knowledge",
        "skills": [
            "Object Recognition and Classification",
            "Text Recognition (OCR)",
            "Domain-Specific Knowledge and Skills"
        ],
        "input_format": "Diagrams and Data Visualizations",
        "app": "Knowledge"
    },
    "rocks_samples_compare": {
        "max_num_query_image": 3,
        "output_format": "contextual_formatted_text",
        "max_num_query_video": 0,
        "num_input": "2-3 images",
        "taxonomy_tree_path": "Knowledge;World_Knowledge",
        "skills": [
            "Object Recognition and Classification",
            "Domain-Specific Knowledge and Skills"
        ],
        "input_format": "Photographs",
        "app": "Knowledge"
    },
    "top_video_creator_identification": {
        "max_num_query_image": 1,
        "output_format": "exact_text",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Information_Extraction;Compound_Search_and_Calculate",
        "skills": [
            "Object Recognition and Classification",
            "Text Recognition (OCR)",
            "Scene and Event Understanding"
        ],
        "input_format": "User Interface Screenshots",
        "app": "Information_Extraction"
    },
    "logical_reasoning_2D_views_of_3D_shapes": {
        "max_num_query_image": 5,
        "output_format": "structured_output",
        "max_num_query_video": 0,
        "num_input": "4-5 images",
        "taxonomy_tree_path": "Planning;Puzzles_and_Games",
        "skills": [
            "Spatial and Temporal Reasoning",
            "Mathematical and Logical Reasoning"
        ],
        "input_format": "Diagrams and Data Visualizations",
        "app": "Planning"
    },
    "vln_tegulu_next_step": {
        "max_num_query_image": 11,
        "output_format": "structured_output",
        "max_num_query_video": 0,
        "num_input": "9-image or more",
        "taxonomy_tree_path": "Planning;Agents_and_Planning;Navigation",
        "skills": [
            "Object Recognition and Classification",
            "Scene and Event Understanding",
            "Spatial and Temporal Reasoning",
            "Language Understanding and Generation"
        ],
        "input_format": "Photographs",
        "app": "Planning"
    },
    "video_camera_motion_description": {
        "max_num_query_image": 0,
        "output_format": "exact_text",
        "max_num_query_video": 1,
        "num_input": "video",
        "taxonomy_tree_path": "Perception;Temporal_Understanding",
        "skills": [
            "Scene and Event Understanding",
            "Spatial and Temporal Reasoning"
        ],
        "input_format": "Videos",
        "app": "Perception"
    },
    "ascii_art_understanding": {
        "max_num_query_image": 1,
        "output_format": "multiple_choice",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Perception;Visual_Recognition",
        "skills": [
            "Object Recognition and Classification",
            "Language Understanding and Generation"
        ],
        "input_format": "Text-Based Images and Documents",
        "app": "Perception"
    },
    "GUI_Act_Web_Single": {
        "max_num_query_image": 1,
        "output_format": "numerical_data",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Planning;Agents_and_Planning;GUI_Operation",
        "skills": [
            "Object Recognition and Classification",
            "Planning and Decision Making"
        ],
        "input_format": "User Interface Screenshots",
        "app": "Planning"
    },
    "video_grounding_temporal": {
        "max_num_query_image": 0,
        "output_format": "multiple_choice",
        "max_num_query_video": 1,
        "num_input": "video",
        "taxonomy_tree_path": "Perception;Temporal_Understanding",
        "skills": [
            "Scene and Event Understanding",
            "Spatial and Temporal Reasoning"
        ],
        "input_format": "Videos",
        "app": "Perception"
    },
    "video_intent_recognition": {
        "max_num_query_image": 0,
        "output_format": "contextual_formatted_text",
        "max_num_query_video": 1,
        "num_input": "video",
        "taxonomy_tree_path": "Knowledge;Human_and_Culture;emotion_and_intent_understanding",
        "skills": [
            "Scene and Event Understanding",
            "Language Understanding and Generation",
            "Commonsense and Social Reasoning"
        ],
        "input_format": "Videos",
        "app": "Knowledge"
    },
    "calendar_schedule_suggestion": {
        "max_num_query_image": 0,
        "output_format": "contextual_formatted_text",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Planning;Agents_and_Planning",
        "skills": [
            "Spatial and Temporal Reasoning",
            "Planning and Decision Making"
        ],
        "input_format": "User Interface Screenshots",
        "app": "Planning"
    },
    "web_action_prediction": {
        "max_num_query_image": 1,
        "output_format": "multiple_choice",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Perception;Temporal_Understanding",
        "skills": [
            "Object Recognition and Classification",
            "Spatial and Temporal Reasoning",
            "Language Understanding and Generation"
        ],
        "input_format": "User Interface Screenshots",
        "app": "Perception"
    },
    "highest_discount_game_price_identification": {
        "max_num_query_image": 1,
        "output_format": "numerical_data",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Information_Extraction;Compound_Search_and_Calculate",
        "skills": [
            "Object Recognition and Classification",
            "Text Recognition (OCR)",
            "Mathematical and Logical Reasoning"
        ],
        "input_format": "User Interface Screenshots",
        "app": "Information_Extraction"
    },
    "video_eval_factual_pref": {
        "max_num_query_image": 16,
        "output_format": "multiple_choice",
        "max_num_query_video": 0,
        "num_input": "video",
        "taxonomy_tree_path": "Metrics;Generated_Video_Eval",
        "skills": [
            "Scene and Event Understanding",
            "Commonsense and Social Reasoning",
            "Ethical and Safety Reasoning"
        ],
        "input_format": "Videos",
        "app": "Metrics"
    },
    "code_execution": {
        "max_num_query_image": 1,
        "output_format": "contextual_formatted_text",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Coding;Code_Understanding;Code_Output",
        "skills": [
            "Mathematical and Logical Reasoning",
            "Domain-Specific Knowledge and Skills"
        ],
        "input_format": "Text-Based Images and Documents",
        "app": "Coding"
    },
    "music_sheet_format_QA": {
        "max_num_query_image": 1,
        "output_format": "numerical_data",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Knowledge;World_Knowledge;Music",
        "skills": [
            "Object Recognition and Classification",
            "Text Recognition (OCR)"
        ],
        "input_format": "Text-Based Images and Documents",
        "app": "Knowledge"
    },
    "planning_screenshot_termes": {
        "max_num_query_image": 1,
        "output_format": "structured_output",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Planning;Agents_and_Planning;Symbolic_Planning",
        "skills": [
            "Planning and Decision Making",
            "Spatial and Temporal Reasoning"
        ],
        "input_format": "User Interface Screenshots",
        "app": "Planning"
    },
    "actor_recognition_in_Movie": {
        "max_num_query_image": 1,
        "output_format": "exact_text",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Knowledge;World_Knowledge;Actor_Character_and_Famous_People",
        "skills": [
            "Object Recognition and Classification",
            "Language Understanding and Generation"
        ],
        "input_format": "User Interface Screenshots",
        "app": "Knowledge"
    },
    "extract_webpage_headline": {
        "max_num_query_image": 1,
        "output_format": "contextual_formatted_text",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Perception;Visual_Recognition",
        "skills": [
            "Text Recognition (OCR)"
        ],
        "input_format": "User Interface Screenshots",
        "app": "Perception"
    },
    "hashtag_recommendation": {
        "max_num_query_image": 1,
        "output_format": "structured_output",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Knowledge;Human_and_Culture",
        "skills": [
            "Object Recognition and Classification",
            "Language Understanding and Generation",
            "Scene and Event Understanding"
        ],
        "input_format": "Photographs",
        "app": "Knowledge"
    },
    "relative_reflectance_of_different_regions": {
        "max_num_query_image": 1,
        "output_format": "structured_output",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Perception;Physical_Understanding;Lighting_and_Shading",
        "skills": [
            "Object Recognition and Classification",
            "Scene and Event Understanding"
        ],
        "input_format": "Photographs",
        "app": "Perception"
    },
    "planning_screenshot_floortile": {
        "max_num_query_image": 1,
        "output_format": "structured_output",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Planning;Agents_and_Planning;Symbolic_Planning",
        "skills": [
            "Planning and Decision Making",
            "Spatial and Temporal Reasoning"
        ],
        "input_format": "User Interface Screenshots",
        "app": "Planning"
    },
    "planning_screenshot_blocksworld": {
        "max_num_query_image": 1,
        "output_format": "structured_output",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Planning;Agents_and_Planning;Symbolic_Planning",
        "skills": [
            "Planning and Decision Making",
            "Spatial and Temporal Reasoning"
        ],
        "input_format": "Text-Based Images and Documents",
        "app": "Planning"
    },
    "soccer_offside": {
        "max_num_query_image": 1,
        "output_format": "multiple_choice",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Knowledge;World_Knowledge",
        "skills": [
            "Object Recognition and Classification",
            "Spatial and Temporal Reasoning",
            "Scene and Event Understanding"
        ],
        "input_format": "Photographs",
        "app": "Knowledge"
    },
    "waldo": {
        "max_num_query_image": 1,
        "output_format": "structured_output",
        "max_num_query_video": 0,
        "num_input": "2-3 images",
        "taxonomy_tree_path": "Perception;Visual_Recognition",
        "skills": [
            "Object Recognition and Classification",
            "Spatial and Temporal Reasoning"
        ],
        "input_format": "Artistic and Creative Content",
        "app": "Perception"
    },
    "number_comparison": {
        "max_num_query_image": 0,
        "output_format": "numerical_data",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Mathematics;Numeric_Reasoning",
        "skills": [
            "Mathematical and Logical Reasoning"
        ],
        "input_format": "Text-Based Images and Documents",
        "app": "Mathematics"
    },
    "medical_keywords_based_retrieval_non_radiology": {
        "max_num_query_image": 5,
        "output_format": "exact_text",
        "max_num_query_video": 0,
        "num_input": "4-5 images",
        "taxonomy_tree_path": "Science;Life_Sciences;Medical",
        "skills": [
            "Object Recognition and Classification",
            "Domain-Specific Knowledge and Skills"
        ],
        "input_format": "Photographs",
        "app": "Science"
    },
    "ascii_art_30": {
        "max_num_query_image": 1,
        "output_format": "contextual_formatted_text",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Knowledge;Arts",
        "skills": [
            "Object Recognition and Classification",
            "Language Understanding and Generation"
        ],
        "input_format": "Photographs",
        "app": "Knowledge"
    },
    "paper_review_acceptance": {
        "max_num_query_image": 4,
        "output_format": "exact_text",
        "max_num_query_video": 0,
        "num_input": "4-5 images",
        "taxonomy_tree_path": "Metrics;Paper_Review",
        "skills": [
            "Object Recognition and Classification",
            "Text Recognition (OCR)",
            "Language Understanding and Generation",
            "Domain-Specific Knowledge and Skills"
        ],
        "input_format": "Text-Based Images and Documents",
        "app": "Metrics"
    },
    "planning_screenshot_storage": {
        "max_num_query_image": 1,
        "output_format": "structured_output",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Planning;Agents_and_Planning;Symbolic_Planning",
        "skills": [
            "Planning and Decision Making",
            "Spatial and Temporal Reasoning"
        ],
        "input_format": "Text-Based Images and Documents",
        "app": "Planning"
    },
    "scibench_calculus_wo_solution": {
        "max_num_query_image": 1,
        "output_format": "numerical_data",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Mathematics;Calculus",
        "skills": [
            "Mathematical and Logical Reasoning",
            "Spatial and Temporal Reasoning"
        ],
        "input_format": "Diagrams and Data Visualizations",
        "app": "Mathematics"
    },
    "medical_parasite_detection": {
        "max_num_query_image": 7,
        "output_format": "structured_output",
        "max_num_query_video": 0,
        "num_input": "6-8 images",
        "taxonomy_tree_path": "Science;Life_Sciences;Medical",
        "skills": [
            "Object Recognition and Classification",
            "Domain-Specific Knowledge and Skills"
        ],
        "input_format": "Photographs",
        "app": "Science"
    },
    "mahjong": {
        "max_num_query_image": 1,
        "output_format": "exact_text",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Planning;Puzzles_and_Games;Board_Games",
        "skills": [
            "Object Recognition and Classification",
            "Mathematical and Logical Reasoning"
        ],
        "input_format": "Photographs",
        "app": "Planning"
    },
    "code_visualization_output_understanding": {
        "max_num_query_image": 5,
        "output_format": "multiple_choice",
        "max_num_query_video": 0,
        "num_input": "4-5 images",
        "taxonomy_tree_path": "Coding;Code_Understanding;Code_Match",
        "skills": [
            "Object Recognition and Classification",
            "Mathematical and Logical Reasoning"
        ],
        "input_format": "Diagrams and Data Visualizations",
        "app": "Coding"
    },
    "chess_sygyzy_endgames": {
        "max_num_query_image": 1,
        "output_format": "exact_text",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Planning;Puzzles_and_Games;Board_Games",
        "skills": [
            "Planning and Decision Making",
            "Mathematical and Logical Reasoning"
        ],
        "input_format": "User Interface Screenshots",
        "app": "Planning"
    },
    "movie_retrieval_by_actor": {
        "max_num_query_image": 0,
        "output_format": "contextual_formatted_text",
        "max_num_query_video": 0,
        "num_input": "9-image or more",
        "taxonomy_tree_path": "Information_Extraction;Search_by_Attribute_wo_Calculate",
        "skills": [
            "Object Recognition and Classification",
            "Language Understanding and Generation"
        ],
        "input_format": "User Interface Screenshots",
        "app": "Information_Extraction"
    },
    "rebus": {
        "max_num_query_image": 1,
        "output_format": "contextual_formatted_text",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Planning;Puzzles_and_Games",
        "skills": [
            "Text Recognition (OCR)",
            "Language Understanding and Generation",
            "Commonsense and Social Reasoning"
        ],
        "input_format": "Text-Based Images and Documents",
        "app": "Planning"
    },
    "comic_page_ordering": {
        "max_num_query_image": 8,
        "output_format": "contextual_formatted_text",
        "max_num_query_video": 0,
        "num_input": "6-8 images",
        "taxonomy_tree_path": "Planning;Reordering",
        "skills": [
            "Scene and Event Understanding",
            "Spatial and Temporal Reasoning",
            "Language Understanding and Generation"
        ],
        "input_format": "Artistic and Creative Content",
        "app": "Planning"
    },
    "latex_complex_formula_convertion": {
        "max_num_query_image": 1,
        "output_format": "structured_output",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Coding;Code_Generation;Document_Conversion;Image_to_Latex",
        "skills": [
            "Text Recognition (OCR)",
            "Mathematical and Logical Reasoning"
        ],
        "input_format": "Text-Based Images and Documents",
        "app": "Coding"
    },
    "icon_arithmetic_puzzle": {
        "max_num_query_image": 1,
        "output_format": "structured_output",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Planning;Puzzles_and_Games",
        "skills": [
            "Object Recognition and Classification",
            "Mathematical and Logical Reasoning"
        ],
        "input_format": "Diagrams and Data Visualizations",
        "app": "Planning"
    },
    "medical_retrieval_given_surgeon_activity": {
        "max_num_query_image": 4,
        "output_format": "multiple_choice",
        "max_num_query_video": 0,
        "num_input": "video",
        "taxonomy_tree_path": "Science;Life_Sciences;Medical",
        "skills": [
            "Object Recognition and Classification",
            "Scene and Event Understanding",
            "Domain-Specific Knowledge and Skills"
        ],
        "input_format": "Videos",
        "app": "Science"
    },
    "video_eval_dynamic_pref": {
        "max_num_query_image": 16,
        "output_format": "multiple_choice",
        "max_num_query_video": 0,
        "num_input": "video",
        "taxonomy_tree_path": "Metrics;Generated_Video_Eval",
        "skills": [
            "Object Recognition and Classification",
            "Scene and Event Understanding",
            "Spatial and Temporal Reasoning"
        ],
        "input_format": "Videos",
        "app": "Metrics"
    },
    "orchestra_score_recognition": {
        "max_num_query_image": 1,
        "output_format": "structured_output",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Knowledge;World_Knowledge;Music",
        "skills": [
            "Text Recognition (OCR)",
            "Domain-Specific Knowledge and Skills"
        ],
        "input_format": "Text-Based Images and Documents",
        "app": "Knowledge"
    },
    "web_action_grounding": {
        "max_num_query_image": 1,
        "output_format": "multiple_choice",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Planning;Agents_and_Planning;GUI_Operation",
        "skills": [
            "Object Recognition and Classification",
            "Language Understanding and Generation"
        ],
        "input_format": "User Interface Screenshots",
        "app": "Planning"
    },
    "logical_reasoning_2d_folding": {
        "max_num_query_image": 5,
        "output_format": "multiple_choice",
        "max_num_query_video": 0,
        "num_input": "4-5 images",
        "taxonomy_tree_path": "Planning;Puzzles_and_Games",
        "skills": [
            "Spatial and Temporal Reasoning",
            "Mathematical and Logical Reasoning"
        ],
        "input_format": "Diagrams and Data Visualizations",
        "app": "Planning"
    },
    "vln_hindi_next_step": {
        "max_num_query_image": 11,
        "output_format": "contextual_formatted_text",
        "max_num_query_video": 0,
        "num_input": "9-image or more",
        "taxonomy_tree_path": "Planning;Agents_and_Planning;Navigation",
        "skills": [
            "Object Recognition and Classification",
            "Scene and Event Understanding",
            "Spatial and Temporal Reasoning",
            "Language Understanding and Generation"
        ],
        "input_format": "Photographs",
        "app": "Planning"
    },
    "remaining_playback_time_calculation": {
        "max_num_query_image": 1,
        "output_format": "exact_text",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Information_Extraction;Compound_Search_and_Calculate",
        "skills": [
            "Object Recognition and Classification",
            "Text Recognition (OCR)",
            "Mathematical and Logical Reasoning"
        ],
        "input_format": "User Interface Screenshots",
        "app": "Information_Extraction"
    },
    "music_sheet_name": {
        "max_num_query_image": 4,
        "output_format": "exact_text",
        "max_num_query_video": 0,
        "num_input": "4-5 images",
        "taxonomy_tree_path": "Knowledge;World_Knowledge;Music",
        "skills": [
            "Object Recognition and Classification",
            "Domain-Specific Knowledge and Skills"
        ],
        "input_format": "Text-Based Images and Documents",
        "app": "Knowledge"
    },
    "code_retrieval": {
        "max_num_query_image": 3,
        "output_format": "exact_text",
        "max_num_query_video": 0,
        "num_input": "2-3 images",
        "taxonomy_tree_path": "Coding;Code_Understanding;Code_Output",
        "skills": [
            "Text Recognition (OCR)",
            "Domain-Specific Knowledge and Skills"
        ],
        "input_format": "Text-Based Images and Documents",
        "app": "Coding"
    },
    "weather_map_climate_type_temperature_parsing": {
        "max_num_query_image": 1,
        "output_format": "structured_output",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Information_Extraction;Structured_Parsing",
        "skills": [
            "Object Recognition and Classification",
            "Scene and Event Understanding",
            "Text Recognition (OCR)"
        ],
        "input_format": "Diagrams and Data Visualizations",
        "app": "Information_Extraction"
    },
    "planning_visual_termes": {
        "max_num_query_image": 2,
        "output_format": "structured_output",
        "max_num_query_video": 0,
        "num_input": "2-3 images",
        "taxonomy_tree_path": "Planning;Agents_and_Planning;Symbolic_Planning",
        "skills": [
            "Planning and Decision Making",
            "Spatial and Temporal Reasoning"
        ],
        "input_format": "Artistic and Creative Content",
        "app": "Planning"
    },
    "recipe_image_ordering": {
        "max_num_query_image": 6,
        "output_format": "multiple_choice",
        "max_num_query_video": 0,
        "num_input": "6-8 images",
        "taxonomy_tree_path": "Planning;Reordering",
        "skills": [
            "Object Recognition and Classification",
            "Scene and Event Understanding",
            "Spatial and Temporal Reasoning",
            "Language Understanding and Generation"
        ],
        "input_format": "Photographs",
        "app": "Planning"
    },
    "distinguish_ai_generated_image": {
        "max_num_query_image": 1,
        "output_format": "exact_text",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Knowledge;Fact_Checking",
        "skills": [
            "Object Recognition and Classification",
            "Scene and Event Understanding"
        ],
        "input_format": "Photographs",
        "app": "Knowledge"
    },
    "medical_counting_lymphocytes": {
        "max_num_query_image": 1,
        "output_format": "numerical_data",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Science;Life_Sciences;Medical",
        "skills": [
            "Object Recognition and Classification",
            "Domain-Specific Knowledge and Skills"
        ],
        "input_format": "Text-Based Images and Documents",
        "app": "Science"
    },
    "planning_screenshot_tyreworld": {
        "max_num_query_image": 1,
        "output_format": "structured_output",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Planning;Agents_and_Planning;Symbolic_Planning",
        "skills": [
            "Planning and Decision Making",
            "Spatial and Temporal Reasoning"
        ],
        "input_format": "User Interface Screenshots",
        "app": "Planning"
    },
    "google_streetview_circle_sorting": {
        "max_num_query_image": 11,
        "output_format": "structured_output",
        "max_num_query_video": 0,
        "num_input": "9-image or more",
        "taxonomy_tree_path": "Perception;3D_understanding",
        "skills": [
            "Scene and Event Understanding",
            "Spatial and Temporal Reasoning"
        ],
        "input_format": "Photographs",
        "app": "Perception"
    },
    "semantic_matching_of_two_images": {
        "max_num_query_image": 2,
        "output_format": "structured_output",
        "max_num_query_video": 0,
        "num_input": "2-3 images",
        "taxonomy_tree_path": "Perception;Object_and_Scene_Understanding",
        "skills": [
            "Object Recognition and Classification",
            "Scene and Event Understanding"
        ],
        "input_format": "Photographs",
        "app": "Perception"
    },
    "TRANCE_physics_reasoning_view": {
        "max_num_query_image": 2,
        "output_format": "multiple_choice",
        "max_num_query_video": 0,
        "num_input": "2-3 images",
        "taxonomy_tree_path": "Perception;Physical_Understanding;Physical_Reasoning",
        "skills": [
            "Object Recognition and Classification",
            "Spatial and Temporal Reasoning",
            "Mathematical and Logical Reasoning"
        ],
        "input_format": "3D Models and Aerial Imagery",
        "app": "Perception"
    },
    "entertainment_web_game_style": {
        "max_num_query_image": 1,
        "output_format": "structured_output",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Information_Extraction;Structured_Parsing",
        "skills": [
            "Object Recognition and Classification",
            "Text Recognition (OCR)",
            "Scene and Event Understanding"
        ],
        "input_format": "User Interface Screenshots",
        "app": "Information_Extraction"
    },
    "scibench_fundamental_wo_solution": {
        "max_num_query_image": 1,
        "output_format": "numerical_data",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Science;STEM",
        "skills": [
            "Mathematical and Logical Reasoning",
            "Domain-Specific Knowledge and Skills"
        ],
        "input_format": "Diagrams and Data Visualizations",
        "app": "Science"
    },
    "planning_visual_grippers": {
        "max_num_query_image": 2,
        "output_format": "structured_output",
        "max_num_query_video": 0,
        "num_input": "2-3 images",
        "taxonomy_tree_path": "Planning;Agents_and_Planning;Symbolic_Planning",
        "skills": [
            "Planning and Decision Making",
            "Spatial and Temporal Reasoning"
        ],
        "input_format": "Artistic and Creative Content",
        "app": "Planning"
    },
    "table2latex_complex": {
        "max_num_query_image": 1,
        "output_format": "structured_output",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Coding;Code_Generation;Document_Conversion",
        "skills": [
            "Text Recognition (OCR)",
            "Language Understanding and Generation"
        ],
        "input_format": "Text-Based Images and Documents",
        "app": "Coding"
    },
    "meme_explain": {
        "max_num_query_image": 1,
        "output_format": "open_ended_output",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Knowledge;Human_and_Culture;emotion_and_intent_understanding",
        "skills": [
            "Commonsense and Social Reasoning",
            "Scene and Event Understanding",
            "Language Understanding and Generation"
        ],
        "input_format": "Photographs",
        "app": "Knowledge"
    },
    "sceneqa_scene_transition_video": {
        "max_num_query_image": 0,
        "output_format": "open_ended_output",
        "max_num_query_video": 1,
        "num_input": "video",
        "taxonomy_tree_path": "Perception;Temporal_Understanding",
        "skills": [
            "Scene and Event Understanding",
            "Spatial and Temporal Reasoning",
            "Language Understanding and Generation"
        ],
        "input_format": "Videos",
        "app": "Perception"
    },
    "video_summary": {
        "max_num_query_image": 0,
        "output_format": "open_ended_output",
        "max_num_query_video": 1,
        "num_input": "video",
        "taxonomy_tree_path": "Information_Extraction;Summarization",
        "skills": [
            "Scene and Event Understanding",
            "Language Understanding and Generation"
        ],
        "input_format": "Videos",
        "app": "Information_Extraction"
    },
    "funqa_unexpected_action_magic_video": {
        "max_num_query_image": 0,
        "output_format": "open_ended_output",
        "max_num_query_video": 1,
        "num_input": "video",
        "taxonomy_tree_path": "Information_Extraction;Multimodal_QA;Video_QA",
        "skills": [
            "Scene and Event Understanding",
            "Spatial and Temporal Reasoning",
            "Language Understanding and Generation"
        ],
        "input_format": "Videos",
        "app": "Information_Extraction"
    },
    "paper_review_writing": {
        "max_num_query_image": 4,
        "output_format": "open_ended_output",
        "max_num_query_video": 0,
        "num_input": "4-5 images",
        "taxonomy_tree_path": "Metrics;Paper_Review",
        "skills": [
            "Object Recognition and Classification",
            "Text Recognition (OCR)",
            "Language Understanding and Generation",
            "Domain-Specific Knowledge and Skills"
        ],
        "input_format": "Text-Based Images and Documents",
        "app": "Metrics"
    },
    "activitynetqa": {
        "max_num_query_image": 0,
        "output_format": "open_ended_output",
        "max_num_query_video": 1,
        "num_input": "video",
        "taxonomy_tree_path": "Information_Extraction;Multimodal_QA;Video_QA",
        "skills": [
            "Scene and Event Understanding",
            "Object Recognition and Classification"
        ],
        "input_format": "Videos",
        "app": "Information_Extraction"
    },
    "scibench_w_solution_open_ended": {
        "max_num_query_image": 1,
        "output_format": "open_ended_output",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Science;STEM",
        "skills": [
            "Mathematical and Logical Reasoning",
            "Domain-Specific Knowledge and Skills",
            "Language Understanding and Generation"
        ],
        "input_format": "Diagrams and Data Visualizations",
        "app": "Science"
    },
    "generated_video_artifacts": {
        "max_num_query_image": 0,
        "output_format": "open_ended_output",
        "max_num_query_video": 1,
        "num_input": "video",
        "taxonomy_tree_path": "Metrics;Generated_Video_Eval",
        "skills": [
            "Object Recognition and Classification",
            "Scene and Event Understanding",
            "Commonsense and Social Reasoning"
        ],
        "input_format": "Videos",
        "app": "Metrics"
    },
    "funny_image_title": {
        "max_num_query_image": 1,
        "output_format": "open_ended_output",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Knowledge;Human_and_Culture;emotion_and_intent_understanding",
        "skills": [
            "Object Recognition and Classification",
            "Language Understanding and Generation",
            "Commonsense and Social Reasoning"
        ],
        "input_format": "Photographs",
        "app": "Knowledge"
    },
    "wikihow_complex_task_completion": {
        "max_num_query_image": 19,
        "output_format": "open_ended_output",
        "max_num_query_video": 0,
        "num_input": "9-image or more",
        "taxonomy_tree_path": "Planning;Agents_and_Planning",
        "skills": [
            "Object Recognition and Classification",
            "Scene and Event Understanding",
            "Language Understanding and Generation"
        ],
        "input_format": "Artistic and Creative Content",
        "app": "Planning"
    },
    "video_detail_description": {
        "max_num_query_image": 0,
        "output_format": "open_ended_output",
        "max_num_query_video": 1,
        "num_input": "video",
        "taxonomy_tree_path": "Perception;Multimodal_Captioning",
        "skills": [
            "Object Recognition and Classification",
            "Scene and Event Understanding",
            "Spatial and Temporal Reasoning"
        ],
        "input_format": "Videos",
        "app": "Perception"
    },
    "funqa_unexpected_action_creative_video": {
        "max_num_query_image": 0,
        "output_format": "open_ended_output",
        "max_num_query_video": 1,
        "num_input": "video",
        "taxonomy_tree_path": "Information_Extraction;Multimodal_QA;Video_QA",
        "skills": [
            "Object Recognition and Classification",
            "Scene and Event Understanding",
            "Commonsense and Social Reasoning"
        ],
        "input_format": "Videos",
        "app": "Information_Extraction"
    },
    "guess_image_generation_prompt": {
        "max_num_query_image": 1,
        "output_format": "open_ended_output",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Perception;Multimodal_Captioning",
        "skills": [
            "Scene and Event Understanding",
            "Language Understanding and Generation"
        ],
        "input_format": "Artistic and Creative Content",
        "app": "Perception"
    },
    "traffic_accident_analysis": {
        "max_num_query_image": 1,
        "output_format": "open_ended_output",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Knowledge;World_Knowledge",
        "skills": [
            "Object Recognition and Classification",
            "Scene and Event Understanding",
            "Commonsense and Social Reasoning"
        ],
        "input_format": "Diagrams and Data Visualizations",
        "app": "Knowledge"
    },
    "video_qa": {
        "max_num_query_image": 0,
        "output_format": "open_ended_output",
        "max_num_query_video": 1,
        "num_input": "video",
        "taxonomy_tree_path": "Information_Extraction;Multimodal_QA;Video_QA",
        "skills": [
            "Object Recognition and Classification",
            "Scene and Event Understanding",
            "Spatial and Temporal Reasoning",
            "Language Understanding and Generation"
        ],
        "input_format": "Videos",
        "app": "Information_Extraction"
    },
    "image_humor_understanding": {
        "max_num_query_image": 1,
        "output_format": "open_ended_output",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Knowledge;Human_and_Culture;emotion_and_intent_understanding",
        "skills": [
            "Commonsense and Social Reasoning",
            "Scene and Event Understanding",
            "Language Understanding and Generation"
        ],
        "input_format": "Photographs",
        "app": "Knowledge"
    },
    "vibe_eval_phrase": {
        "max_num_query_image": 1,
        "output_format": "open_ended_output",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Knowledge;World_Knowledge;Daily",
        "skills": [
            "Object Recognition and Classification",
            "Ethical and Safety Reasoning",
            "Commonsense and Social Reasoning"
        ],
        "input_format": "Photographs",
        "app": "Knowledge"
    },
    "nextqa_oe": {
        "max_num_query_image": 0,
        "output_format": "open_ended_output",
        "max_num_query_video": 1,
        "num_input": "video",
        "taxonomy_tree_path": "Information_Extraction;Multimodal_QA;Video_QA",
        "skills": [
            "Object Recognition and Classification",
            "Scene and Event Understanding",
            "Language Understanding and Generation"
        ],
        "input_format": "Videos",
        "app": "Information_Extraction"
    },
    "graph_interpretation": {
        "max_num_query_image": 1,
        "output_format": "open_ended_output",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Perception;Diagram_and_Document_Understanding;Diagram;Interpretation_and_Explanation",
        "skills": [
            "Mathematical and Logical Reasoning",
            "Language Understanding and Generation",
            "Domain-Specific Knowledge and Skills"
        ],
        "input_format": "Diagrams and Data Visualizations",
        "app": "Perception"
    },
    "science_figure_explanation": {
        "max_num_query_image": 1,
        "output_format": "open_ended_output",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Perception;Diagram_and_Document_Understanding;Diagram;Interpretation_and_Explanation",
        "skills": [
            "Object Recognition and Classification",
            "Scene and Event Understanding",
            "Domain-Specific Knowledge and Skills"
        ],
        "input_format": "Diagrams and Data Visualizations",
        "app": "Perception"
    },
    "ocr_open_ended_qa": {
        "max_num_query_image": 1,
        "output_format": "open_ended_output",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Perception;Object_and_Scene_Understanding",
        "skills": [
            "Text Recognition (OCR)",
            "Language Understanding and Generation"
        ],
        "input_format": "Text-Based Images and Documents",
        "app": "Perception"
    },
    "video_short_title": {
        "max_num_query_image": 0,
        "output_format": "open_ended_output",
        "max_num_query_video": 1,
        "num_input": "video",
        "taxonomy_tree_path": "Information_Extraction;Summarization",
        "skills": [
            "Language Understanding and Generation",
            "Scene and Event Understanding"
        ],
        "input_format": "Videos",
        "app": "Information_Extraction"
    },
    "video2notes": {
        "max_num_query_image": 0,
        "output_format": "open_ended_output",
        "max_num_query_video": 1,
        "num_input": "video",
        "taxonomy_tree_path": "Information_Extraction;Summarization",
        "skills": [
            "Scene and Event Understanding",
            "Language Understanding and Generation",
            "Object Recognition and Classification"
        ],
        "input_format": "Videos",
        "app": "Information_Extraction"
    },
    "electrocardiogram": {
        "max_num_query_image": 1,
        "output_format": "open_ended_output",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Science;Life_Sciences;Medical",
        "skills": [
            "Object Recognition and Classification",
            "Domain-Specific Knowledge and Skills"
        ],
        "input_format": "Text-Based Images and Documents",
        "app": "Science"
    },
    "GUI_Chat_Easy": {
        "max_num_query_image": 1,
        "output_format": "open_ended_output",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Information_Extraction;Multimodal_QA;GUI_Chat",
        "skills": [
            "Text Recognition (OCR)",
            "Language Understanding and Generation"
        ],
        "input_format": "User Interface Screenshots",
        "app": "Information_Extraction"
    },
    "doc_vqa": {
        "max_num_query_image": 4,
        "output_format": "open_ended_output",
        "max_num_query_video": 0,
        "num_input": "4-5 images",
        "taxonomy_tree_path": "Perception;Diagram_and_Document_Understanding;Document;Document_QA",
        "skills": [
            "Text Recognition (OCR)",
            "Language Understanding and Generation"
        ],
        "input_format": "Text-Based Images and Documents",
        "app": "Perception"
    },
    "docci_image_description_long": {
        "max_num_query_image": 1,
        "output_format": "open_ended_output",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Perception;Multimodal_Captioning",
        "skills": [
            "Object Recognition and Classification",
            "Scene and Event Understanding",
            "Language Understanding and Generation"
        ],
        "input_format": "Photographs",
        "app": "Perception"
    },
    "humor_explanation": {
        "max_num_query_image": 1,
        "output_format": "open_ended_output",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Knowledge;Human_and_Culture;emotion_and_intent_understanding",
        "skills": [
            "Commonsense and Social Reasoning",
            "Language Understanding and Generation"
        ],
        "input_format": "Artistic and Creative Content",
        "app": "Knowledge"
    },
    "video_content_follow_up": {
        "max_num_query_image": 0,
        "output_format": "open_ended_output",
        "max_num_query_video": 1,
        "num_input": "video",
        "taxonomy_tree_path": "Knowledge;Human_and_Culture",
        "skills": [
            "Scene and Event Understanding",
            "Language Understanding and Generation",
            "Planning and Decision Making"
        ],
        "input_format": "Videos",
        "app": "Knowledge"
    },
    "GUI_Chat_Hard": {
        "max_num_query_image": 1,
        "output_format": "open_ended_output",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Information_Extraction;Multimodal_QA;GUI_Chat",
        "skills": [
            "Object Recognition and Classification",
            "Text Recognition (OCR)",
            "Language Understanding and Generation"
        ],
        "input_format": "User Interface Screenshots",
        "app": "Information_Extraction"
    },
    "iq_test_open_ended": {
        "max_num_query_image": 1,
        "output_format": "open_ended_output",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Planning;Puzzles_and_Games",
        "skills": [
            "Object Recognition and Classification",
            "Mathematical and Logical Reasoning",
            "Spatial and Temporal Reasoning"
        ],
        "input_format": "Diagrams and Data Visualizations",
        "app": "Planning"
    },
    "unusual_images": {
        "max_num_query_image": 1,
        "output_format": "open_ended_output",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Knowledge;Fact_Checking",
        "skills": [
            "Scene and Event Understanding",
            "Spatial and Temporal Reasoning",
            "Language Understanding and Generation"
        ],
        "input_format": "Photographs",
        "app": "Knowledge"
    },
    "tweets_captioning": {
        "max_num_query_image": 1,
        "output_format": "open_ended_output",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Perception;Multimodal_Captioning",
        "skills": [
            "Object Recognition and Classification",
            "Text Recognition (OCR)",
            "Language Understanding and Generation",
            "Commonsense and Social Reasoning"
        ],
        "input_format": "Photographs",
        "app": "Perception"
    },
    "art_explanation": {
        "max_num_query_image": 1,
        "output_format": "open_ended_output",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Knowledge;World_Knowledge;Painting",
        "skills": [
            "Language Understanding and Generation",
            "Scene and Event Understanding",
            "Commonsense and Social Reasoning"
        ],
        "input_format": "Artistic and Creative Content",
        "app": "Knowledge"
    },
    "bar_chart_interpretation": {
        "max_num_query_image": 1,
        "output_format": "open_ended_output",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Perception;Diagram_and_Document_Understanding;Diagram;Interpretation_and_Explanation",
        "skills": [
            "Object Recognition and Classification",
            "Mathematical and Logical Reasoning",
            "Language Understanding and Generation"
        ],
        "input_format": "Diagrams and Data Visualizations",
        "app": "Perception"
    },
    "funqa_unexpected_action_humor_video": {
        "max_num_query_image": 0,
        "output_format": "open_ended_output",
        "max_num_query_video": 1,
        "num_input": "video",
        "taxonomy_tree_path": "Information_Extraction;Multimodal_QA;Video_QA",
        "skills": [
            "Object Recognition and Classification",
            "Scene and Event Understanding",
            "Commonsense and Social Reasoning"
        ],
        "input_format": "Videos",
        "app": "Information_Extraction"
    },
    "figurative_speech_explanation": {
        "max_num_query_image": 1,
        "output_format": "open_ended_output",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Knowledge;Human_and_Culture;emotion_and_intent_understanding",
        "skills": [
            "Scene and Event Understanding",
            "Language Understanding and Generation",
            "Commonsense and Social Reasoning"
        ],
        "input_format": "Artistic and Creative Content",
        "app": "Knowledge"
    },
    "defeasible_reasoning": {
        "max_num_query_image": 1,
        "output_format": "open_ended_output",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Knowledge;World_Knowledge",
        "skills": [
            "Scene and Event Understanding",
            "Language Understanding and Generation",
            "Domain-Specific Knowledge and Skills"
        ],
        "input_format": "Diagrams and Data Visualizations",
        "app": "Knowledge"
    },
    "image_captioning_with_additional_requirements": {
        "max_num_query_image": 1,
        "output_format": "open_ended_output",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Perception;Multimodal_Captioning",
        "skills": [
            "Object Recognition and Classification",
            "Scene and Event Understanding",
            "Text Recognition (OCR)",
            "Language Understanding and Generation"
        ],
        "input_format": "Diagrams and Data Visualizations",
        "app": "Perception"
    },
    "visualization_with_code": {
        "max_num_query_image": 1,
        "output_format": "structured_output",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Coding;Code_Generation",
        "skills": [
            "Mathematical and Logical Reasoning",
            "Domain-Specific Knowledge and Skills"
        ],
        "input_format": "Diagrams and Data Visualizations",
        "app": "Coding"
    },
    "physics_exams_v": {
        "max_num_query_image": 1,
        "output_format": "contextual_formatted_text",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Science;Physics",
        "skills": [
            "Mathematical and Logical Reasoning",
            "Domain-Specific Knowledge and Skills"
        ],
        "input_format": "Diagrams and Data Visualizations",
        "app": "Science"
    },
    "coco_ood_global_image_retrieval_by_query_property": {
        "max_num_query_image": 0,
        "output_format": "structured_output",
        "max_num_query_video": 0,
        "num_input": "9-image or more",
        "taxonomy_tree_path": "Information_Extraction;Search_by_Attribute_wo_Calculate",
        "skills": [
            "Object Recognition and Classification",
            "Scene and Event Understanding"
        ],
        "input_format": "Artistic and Creative Content",
        "app": "Information_Extraction"
    },
    "multilingual_movie_info_parsing": {
        "max_num_query_image": 1,
        "output_format": "structured_output",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Information_Extraction;Structured_Parsing",
        "skills": [
            "Text Recognition (OCR)",
            "Language Understanding and Generation"
        ],
        "input_format": "User Interface Screenshots",
        "app": "Information_Extraction"
    },
    "cheapest_flight_identification": {
        "max_num_query_image": 7,
        "output_format": "contextual_formatted_text",
        "max_num_query_video": 0,
        "num_input": "6-8 images",
        "taxonomy_tree_path": "Information_Extraction;Compound_Search_and_Calculate",
        "skills": [
            "Object Recognition and Classification",
            "Text Recognition (OCR)"
        ],
        "input_format": "User Interface Screenshots",
        "app": "Information_Extraction"
    },
    "electricity_load_estimate_plot": {
        "max_num_query_image": 1,
        "output_format": "numerical_data",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Perception;Diagram_and_Document_Understanding;Diagram;Estimation",
        "skills": [
            "Mathematical and Logical Reasoning",
            "Spatial and Temporal Reasoning"
        ],
        "input_format": "Diagrams and Data Visualizations",
        "app": "Perception"
    },
    "pmc_vqa_medical_image_qa": {
        "max_num_query_image": 1,
        "output_format": "contextual_formatted_text",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Science;Life_Sciences;Medical",
        "skills": [
            "Object Recognition and Classification",
            "Domain-Specific Knowledge and Skills"
        ],
        "input_format": "Diagrams and Data Visualizations",
        "app": "Science"
    },
    "video_to_camera_trajectory_retrieval": {
        "max_num_query_image": 17,
        "output_format": "multiple_choice",
        "max_num_query_video": 0,
        "num_input": "video",
        "taxonomy_tree_path": "Perception;Temporal_Understanding",
        "skills": [
            "Spatial and Temporal Reasoning",
            "Scene and Event Understanding"
        ],
        "input_format": "Videos",
        "app": "Perception"
    },
    "movie_info_parsing": {
        "max_num_query_image": 1,
        "output_format": "structured_output",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Information_Extraction;Structured_Parsing",
        "skills": [
            "Text Recognition (OCR)",
            "Object Recognition and Classification"
        ],
        "input_format": "User Interface Screenshots",
        "app": "Information_Extraction"
    },
    "code_programming_test_easy": {
        "max_num_query_image": 1,
        "output_format": "structured_output",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Coding;Code_Generation;Programming_Problems",
        "skills": [
            "Text Recognition (OCR)",
            "Domain-Specific Knowledge and Skills"
        ],
        "input_format": "Text-Based Images and Documents",
        "app": "Coding"
    },
    "perception_test_object_shuffle_video": {
        "max_num_query_image": 0,
        "output_format": "multiple_choice",
        "max_num_query_video": 1,
        "num_input": "video",
        "taxonomy_tree_path": "Planning;Puzzles_and_Games",
        "skills": [
            "Object Recognition and Classification",
            "Spatial and Temporal Reasoning",
            "Scene and Event Understanding"
        ],
        "input_format": "Videos",
        "app": "Planning"
    },
    "insect_order_classification": {
        "max_num_query_image": 1,
        "output_format": "contextual_formatted_text",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Knowledge;World_Knowledge",
        "skills": [
            "Object Recognition and Classification",
            "Domain-Specific Knowledge and Skills"
        ],
        "input_format": "Photographs",
        "app": "Knowledge"
    },
    "face_identity_matching": {
        "max_num_query_image": 5,
        "output_format": "numerical_data",
        "max_num_query_video": 0,
        "num_input": "4-5 images",
        "taxonomy_tree_path": "Perception;Visual_Recognition",
        "skills": [
            "Object Recognition and Classification",
            "Scene and Event Understanding"
        ],
        "input_format": "Photographs",
        "app": "Perception"
    },
    "graph_shortest_path_kamada_kawai": {
        "max_num_query_image": 1,
        "output_format": "numerical_data",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Mathematics;Graph_Theory",
        "skills": [
            "Mathematical and Logical Reasoning",
            "Spatial and Temporal Reasoning"
        ],
        "input_format": "Diagrams and Data Visualizations",
        "app": "Mathematics"
    },
    "places365_similar_scene_retrieval": {
        "max_num_query_image": 1,
        "output_format": "multiple_choice",
        "max_num_query_video": 0,
        "num_input": "9-image or more",
        "taxonomy_tree_path": "Information_Extraction;Search_by_Attribute_wo_Calculate",
        "skills": [
            "Scene and Event Understanding",
            "Object Recognition and Classification"
        ],
        "input_format": "Photographs",
        "app": "Information_Extraction"
    },
    "multi_load_type_prediction_from_plot": {
        "max_num_query_image": 6,
        "output_format": "multiple_choice",
        "max_num_query_video": 0,
        "num_input": "6-8 images",
        "taxonomy_tree_path": "Perception;Diagram_and_Document_Understanding;Diagram;Future_Prediction",
        "skills": [
            "Object Recognition and Classification",
            "Mathematical and Logical Reasoning"
        ],
        "input_format": "Diagrams and Data Visualizations",
        "app": "Perception"
    },
    "finance_table_understanding": {
        "max_num_query_image": 1,
        "output_format": "numerical_data",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Perception;Diagram_and_Document_Understanding;Table_QA",
        "skills": [
            "Mathematical and Logical Reasoning",
            "Text Recognition (OCR)"
        ],
        "input_format": "Diagrams and Data Visualizations",
        "app": "Perception"
    },
    "human_relationship_reasoning": {
        "max_num_query_image": 1,
        "output_format": "contextual_formatted_text",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Knowledge;Human_and_Culture",
        "skills": [
            "Commonsense and Social Reasoning",
            "Object Recognition and Classification"
        ],
        "input_format": "Photographs",
        "app": "Knowledge"
    },
    "muma_theory_of_mind_social_goal": {
        "max_num_query_image": 20,
        "output_format": "contextual_formatted_text",
        "max_num_query_video": 0,
        "num_input": "video",
        "taxonomy_tree_path": "Knowledge;Human_and_Culture;theory_of_minds",
        "skills": [
            "Scene and Event Understanding",
            "Commonsense and Social Reasoning",
            "Spatial and Temporal Reasoning"
        ],
        "input_format": "Videos",
        "app": "Knowledge"
    },
    "graph_shortest_path_planar": {
        "max_num_query_image": 1,
        "output_format": "numerical_data",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Mathematics;Graph_Theory",
        "skills": [
            "Spatial and Temporal Reasoning",
            "Mathematical and Logical Reasoning"
        ],
        "input_format": "Diagrams and Data Visualizations",
        "app": "Mathematics"
    },
    "iconqa_count_and_reasoning": {
        "max_num_query_image": 1,
        "output_format": "numerical_data",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Mathematics;Numeric_Reasoning",
        "skills": [
            "Object Recognition and Classification",
            "Mathematical and Logical Reasoning"
        ],
        "input_format": "Artistic and Creative Content",
        "app": "Mathematics"
    },
    "stock_info_parsing": {
        "max_num_query_image": 1,
        "output_format": "structured_output",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Information_Extraction;Structured_Parsing",
        "skills": [
            "Text Recognition (OCR)",
            "Domain-Specific Knowledge and Skills"
        ],
        "input_format": "User Interface Screenshots",
        "app": "Information_Extraction"
    },
    "stock_price_future_prediction": {
        "max_num_query_image": 1,
        "output_format": "contextual_formatted_text",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Perception;Diagram_and_Document_Understanding;Diagram;Future_Prediction",
        "skills": [
            "Mathematical and Logical Reasoning",
            "Spatial and Temporal Reasoning"
        ],
        "input_format": "Diagrams and Data Visualizations",
        "app": "Perception"
    },
    "clevrer_moving_direction_video": {
        "max_num_query_image": 0,
        "output_format": "contextual_formatted_text",
        "max_num_query_video": 1,
        "num_input": "video",
        "taxonomy_tree_path": "Perception;Physical_Understanding;Physical_Reasoning",
        "skills": [
            "Object Recognition and Classification",
            "Spatial and Temporal Reasoning",
            "Scene and Event Understanding"
        ],
        "input_format": "Videos",
        "app": "Perception"
    },
    "geometry_length": {
        "max_num_query_image": 1,
        "output_format": "contextual_formatted_text",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Mathematics;Geometry",
        "skills": [
            "Object Recognition and Classification",
            "Mathematical and Logical Reasoning"
        ],
        "input_format": "Diagrams and Data Visualizations",
        "app": "Mathematics"
    },
    "3d_fragments_understanding": {
        "max_num_query_image": 1,
        "output_format": "numerical_data",
        "max_num_query_video": 0,
        "num_input": "2-3 images",
        "taxonomy_tree_path": "Perception;Physical_Understanding;Physical_Reasoning",
        "skills": [
            "Object Recognition and Classification",
            "Spatial and Temporal Reasoning"
        ],
        "input_format": "3D Models and Aerial Imagery",
        "app": "Perception"
    },
    "vizwiz_quality_accessment_for_blind": {
        "max_num_query_image": 1,
        "output_format": "contextual_formatted_text",
        "max_num_query_video": 0,
        "num_input": "6-8 images",
        "taxonomy_tree_path": "Metrics;Quality_Assessment",
        "skills": [
            "Object Recognition and Classification",
            "Scene and Event Understanding"
        ],
        "input_format": "Photographs",
        "app": "Metrics"
    },
    "traffic_future_prediction_from_line_plot": {
        "max_num_query_image": 1,
        "output_format": "numerical_data",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Perception;Diagram_and_Document_Understanding;Diagram;Future_Prediction",
        "skills": [
            "Spatial and Temporal Reasoning",
            "Mathematical and Logical Reasoning"
        ],
        "input_format": "Diagrams and Data Visualizations",
        "app": "Perception"
    },
    "biology_exams_v": {
        "max_num_query_image": 1,
        "output_format": "contextual_formatted_text",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Science;Life_Sciences",
        "skills": [
            "Object Recognition and Classification",
            "Domain-Specific Knowledge and Skills"
        ],
        "input_format": "Diagrams and Data Visualizations",
        "app": "Science"
    },
    "muma_theory_of_mind_belief_of_goal": {
        "max_num_query_image": 20,
        "output_format": "contextual_formatted_text",
        "max_num_query_video": 0,
        "num_input": "video",
        "taxonomy_tree_path": "Knowledge;Human_and_Culture;theory_of_minds",
        "skills": [
            "Scene and Event Understanding",
            "Commonsense and Social Reasoning",
            "Language Understanding and Generation"
        ],
        "input_format": "Videos",
        "app": "Knowledge"
    },
    "animal_pose_estimation": {
        "max_num_query_image": 1,
        "output_format": "numerical_data",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Perception;Visual_Recognition",
        "skills": [
            "Object Recognition and Classification",
            "Scene and Event Understanding"
        ],
        "input_format": "Photographs",
        "app": "Perception"
    },
    "weather_info_retrieval": {
        "max_num_query_image": 0,
        "output_format": "contextual_formatted_text",
        "max_num_query_video": 0,
        "num_input": "9-image or more",
        "taxonomy_tree_path": "Information_Extraction;Compound_Search_and_Calculate",
        "skills": [
            "Object Recognition and Classification",
            "Text Recognition (OCR)"
        ],
        "input_format": "User Interface Screenshots",
        "app": "Information_Extraction"
    },
    "average_humidity_estimate_plot": {
        "max_num_query_image": 1,
        "output_format": "numerical_data",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Perception;Diagram_and_Document_Understanding;Diagram;Estimation",
        "skills": [
            "Mathematical and Logical Reasoning",
            "Spatial and Temporal Reasoning"
        ],
        "input_format": "Diagrams and Data Visualizations",
        "app": "Perception"
    },
    "vlnqa_egocentric_navigation_video": {
        "max_num_query_image": 0,
        "output_format": "contextual_formatted_text",
        "max_num_query_video": 1,
        "num_input": "video",
        "taxonomy_tree_path": "Planning;Agents_and_Planning;Navigation",
        "skills": [
            "Spatial and Temporal Reasoning",
            "Scene and Event Understanding",
            "Language Understanding and Generation"
        ],
        "input_format": "Videos",
        "app": "Planning"
    },
    "music_info_parsing": {
        "max_num_query_image": 1,
        "output_format": "structured_output",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Information_Extraction;Structured_Parsing",
        "skills": [
            "Text Recognition (OCR)",
            "Object Recognition and Classification"
        ],
        "input_format": "User Interface Screenshots",
        "app": "Information_Extraction"
    },
    "code_programming_test_hard": {
        "max_num_query_image": 4,
        "output_format": "structured_output",
        "max_num_query_video": 0,
        "num_input": "4-5 images",
        "taxonomy_tree_path": "Coding;Code_Generation;Programming_Problems",
        "skills": [
            "Text Recognition (OCR)",
            "Mathematical and Logical Reasoning",
            "Domain-Specific Knowledge and Skills"
        ],
        "input_format": "Text-Based Images and Documents",
        "app": "Coding"
    },
    "nlvr2_two_image_compare_qa": {
        "max_num_query_image": 1,
        "output_format": "multiple_choice",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Perception;Object_and_Scene_Understanding",
        "skills": [
            "Object Recognition and Classification",
            "Scene and Event Understanding",
            "Language Understanding and Generation"
        ],
        "input_format": "Photographs",
        "app": "Perception"
    },
    "geometry_transformation": {
        "max_num_query_image": 1,
        "output_format": "contextual_formatted_text",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Mathematics;Geometry",
        "skills": [
            "Spatial and Temporal Reasoning",
            "Mathematical and Logical Reasoning"
        ],
        "input_format": "Diagrams and Data Visualizations",
        "app": "Mathematics"
    },
    "multilingual_game_info_parsing": {
        "max_num_query_image": 1,
        "output_format": "structured_output",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Information_Extraction;Structured_Parsing",
        "skills": [
            "Text Recognition (OCR)",
            "Language Understanding and Generation"
        ],
        "input_format": "User Interface Screenshots",
        "app": "Information_Extraction"
    },
    "egocentric_analysis_single_image": {
        "max_num_query_image": 1,
        "output_format": "exact_text",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Perception;Object_and_Scene_Understanding",
        "skills": [
            "Object Recognition and Classification",
            "Scene and Event Understanding"
        ],
        "input_format": "Photographs",
        "app": "Perception"
    },
    "landmark_recognition_and_qa": {
        "max_num_query_image": 1,
        "output_format": "structured_output",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Knowledge;World_Knowledge;Landmark_and_Buliding",
        "skills": [
            "Object Recognition and Classification",
            "Scene and Event Understanding",
            "Language Understanding and Generation"
        ],
        "input_format": "Photographs",
        "app": "Knowledge"
    },
    "action_sequence_understanding": {
        "max_num_query_image": 15,
        "output_format": "exact_text",
        "max_num_query_video": 0,
        "num_input": "9-image or more",
        "taxonomy_tree_path": "Perception;Temporal_Understanding",
        "skills": [
            "Scene and Event Understanding",
            "Spatial and Temporal Reasoning"
        ],
        "input_format": "Photographs",
        "app": "Perception"
    },
    "graph_connectivity": {
        "max_num_query_image": 1,
        "output_format": "structured_output",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Mathematics;Graph_Theory",
        "skills": [
            "Spatial and Temporal Reasoning",
            "Mathematical and Logical Reasoning"
        ],
        "input_format": "Diagrams and Data Visualizations",
        "app": "Mathematics"
    },
    "youtube_video_info_parsing": {
        "max_num_query_image": 1,
        "output_format": "structured_output",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Information_Extraction;Structured_Parsing",
        "skills": [
            "Text Recognition (OCR)",
            "Object Recognition and Classification"
        ],
        "input_format": "User Interface Screenshots",
        "app": "Information_Extraction"
    },
    "physical_property_reasoning": {
        "max_num_query_image": 1,
        "output_format": "contextual_formatted_text",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Perception;Physical_Understanding;Physical_Reasoning",
        "skills": [
            "Object Recognition and Classification",
            "Mathematical and Logical Reasoning"
        ],
        "input_format": "Photographs",
        "app": "Perception"
    },
    "av_vehicle_multiview_counting": {
        "max_num_query_image": 6,
        "output_format": "numerical_data",
        "max_num_query_video": 0,
        "num_input": "6-8 images",
        "taxonomy_tree_path": "Perception;Counting",
        "skills": [
            "Object Recognition and Classification",
            "Spatial and Temporal Reasoning",
            "Scene and Event Understanding"
        ],
        "input_format": "Photographs",
        "app": "Perception"
    },
    "arc_agi": {
        "max_num_query_image": 3,
        "output_format": "exact_text",
        "max_num_query_video": 0,
        "num_input": "4-5 images",
        "taxonomy_tree_path": "Planning;Puzzles_and_Games",
        "skills": [
            "Object Recognition and Classification",
            "Mathematical and Logical Reasoning",
            "Spatial and Temporal Reasoning"
        ],
        "input_format": "Diagrams and Data Visualizations",
        "app": "Planning"
    },
    "IAM_line_ocr_and_locate": {
        "max_num_query_image": 2,
        "output_format": "structured_output",
        "max_num_query_video": 0,
        "num_input": "2-3 images",
        "taxonomy_tree_path": "Perception;Diagram_and_Document_Understanding;Document;Document_Info_Parsing",
        "skills": [
            "Text Recognition (OCR)",
            "Spatial and Temporal Reasoning"
        ],
        "input_format": "Text-Based Images and Documents",
        "app": "Perception"
    },
    "license_plate_recognition": {
        "max_num_query_image": 1,
        "output_format": "exact_text",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Perception;Visual_Recognition",
        "skills": [
            "Text Recognition (OCR)",
            "Object Recognition and Classification"
        ],
        "input_format": "Photographs",
        "app": "Perception"
    },
    "av_human_multiview_counting": {
        "max_num_query_image": 6,
        "output_format": "numerical_data",
        "max_num_query_video": 0,
        "num_input": "6-8 images",
        "taxonomy_tree_path": "Perception;Counting",
        "skills": [
            "Object Recognition and Classification",
            "Spatial and Temporal Reasoning",
            "Scene and Event Understanding"
        ],
        "input_format": "Photographs",
        "app": "Perception"
    },
    "clevrer_object_existence_video": {
        "max_num_query_image": 0,
        "output_format": "multiple_choice",
        "max_num_query_video": 1,
        "num_input": "video",
        "taxonomy_tree_path": "Perception;Object_and_Scene_Understanding",
        "skills": [
            "Object Recognition and Classification",
            "Spatial and Temporal Reasoning"
        ],
        "input_format": "Videos",
        "app": "Perception"
    },
    "famous_building_recognition": {
        "max_num_query_image": 1,
        "output_format": "structured_output",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Knowledge;World_Knowledge;Landmark_and_Buliding",
        "skills": [
            "Object Recognition and Classification",
            "Domain-Specific Knowledge and Skills"
        ],
        "input_format": "Photographs",
        "app": "Knowledge"
    },
    "emotion_recognition": {
        "max_num_query_image": 6,
        "output_format": "exact_text",
        "max_num_query_video": 0,
        "num_input": "6-8 images",
        "taxonomy_tree_path": "Knowledge;Human_and_Culture;emotion_and_intent_understanding",
        "skills": [
            "Object Recognition and Classification",
            "Commonsense and Social Reasoning"
        ],
        "input_format": "Photographs",
        "app": "Knowledge"
    },
    "TV_show_info_parsing": {
        "max_num_query_image": 1,
        "output_format": "structured_output",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Information_Extraction;Structured_Parsing",
        "skills": [
            "Text Recognition (OCR)",
            "Object Recognition and Classification",
            "Language Understanding and Generation"
        ],
        "input_format": "User Interface Screenshots",
        "app": "Information_Extraction"
    },
    "image_style_recognition": {
        "max_num_query_image": 1,
        "output_format": "exact_text",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Perception;Visual_Recognition",
        "skills": [
            "Object Recognition and Classification",
            "Domain-Specific Knowledge and Skills"
        ],
        "input_format": "Artistic and Creative Content",
        "app": "Perception"
    },
    "graph_theory": {
        "max_num_query_image": 1,
        "output_format": "exact_text",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Mathematics;Graph_Theory",
        "skills": [
            "Spatial and Temporal Reasoning",
            "Mathematical and Logical Reasoning"
        ],
        "input_format": "Diagrams and Data Visualizations",
        "app": "Mathematics"
    },
    "electricity_plot_future_prediction": {
        "max_num_query_image": 1,
        "output_format": "numerical_data",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Perception;Diagram_and_Document_Understanding;Diagram;Future_Prediction",
        "skills": [
            "Mathematical and Logical Reasoning",
            "Spatial and Temporal Reasoning"
        ],
        "input_format": "Diagrams and Data Visualizations",
        "app": "Perception"
    },
    "game_info_retrieval": {
        "max_num_query_image": 0,
        "output_format": "contextual_formatted_text",
        "max_num_query_video": 0,
        "num_input": "9-image or more",
        "taxonomy_tree_path": "Information_Extraction;Search_by_Attribute_wo_Calculate",
        "skills": [
            "Object Recognition and Classification",
            "Text Recognition (OCR)",
            "Language Understanding and Generation"
        ],
        "input_format": "User Interface Screenshots",
        "app": "Information_Extraction"
    },
    "mnist_pattern": {
        "max_num_query_image": 1,
        "output_format": "numerical_data",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Planning;Puzzles_and_Games",
        "skills": [
            "Object Recognition and Classification",
            "Mathematical and Logical Reasoning"
        ],
        "input_format": "Text-Based Images and Documents",
        "app": "Planning"
    },
    "graph_isomorphism": {
        "max_num_query_image": 1,
        "output_format": "multiple_choice",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Mathematics;Graph_Theory",
        "skills": [
            "Mathematical and Logical Reasoning",
            "Spatial and Temporal Reasoning"
        ],
        "input_format": "Diagrams and Data Visualizations",
        "app": "Mathematics"
    },
    "landmark_check_two_images": {
        "max_num_query_image": 2,
        "output_format": "structured_output",
        "max_num_query_video": 0,
        "num_input": "2-3 images",
        "taxonomy_tree_path": "Knowledge;World_Knowledge;Landmark_and_Buliding",
        "skills": [
            "Object Recognition and Classification",
            "Scene and Event Understanding"
        ],
        "input_format": "Artistic and Creative Content",
        "app": "Knowledge"
    },
    "code_output_result": {
        "max_num_query_image": 5,
        "output_format": "exact_text",
        "max_num_query_video": 0,
        "num_input": "4-5 images",
        "taxonomy_tree_path": "Coding;Code_Understanding;Code_Output",
        "skills": [
            "Text Recognition (OCR)",
            "Mathematical and Logical Reasoning",
            "Domain-Specific Knowledge and Skills"
        ],
        "input_format": "Text-Based Images and Documents",
        "app": "Coding"
    },
    "nextqa_mc": {
        "max_num_query_image": 0,
        "output_format": "multiple_choice",
        "max_num_query_video": 1,
        "num_input": "video",
        "taxonomy_tree_path": "Information_Extraction;Multimodal_QA;Video_QA",
        "skills": [
            "Object Recognition and Classification",
            "Scene and Event Understanding"
        ],
        "input_format": "Videos",
        "app": "Information_Extraction"
    },
    "graph_hamiltonian_cycle": {
        "max_num_query_image": 6,
        "output_format": "structured_output",
        "max_num_query_video": 0,
        "num_input": "6-8 images",
        "taxonomy_tree_path": "Mathematics;Graph_Theory",
        "skills": [
            "Mathematical and Logical Reasoning",
            "Spatial and Temporal Reasoning"
        ],
        "input_format": "Diagrams and Data Visualizations",
        "app": "Mathematics"
    },
    "stock_info_retrieval": {
        "max_num_query_image": 0,
        "output_format": "contextual_formatted_text",
        "max_num_query_video": 0,
        "num_input": "9-image or more",
        "taxonomy_tree_path": "Information_Extraction;Compound_Search_and_Calculate",
        "skills": [
            "Object Recognition and Classification",
            "Text Recognition (OCR)"
        ],
        "input_format": "User Interface Screenshots",
        "app": "Information_Extraction"
    },
    "long_string_letter_recognition": {
        "max_num_query_image": 1,
        "output_format": "exact_text",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Perception;Visual_Recognition",
        "skills": [
            "Text Recognition (OCR)"
        ],
        "input_format": "Text-Based Images and Documents",
        "app": "Perception"
    },
    "kvqa_knowledge_aware_qa": {
        "max_num_query_image": 1,
        "output_format": "contextual_formatted_text",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Knowledge;World_Knowledge;Actor_Character_and_Famous_People",
        "skills": [
            "Object Recognition and Classification",
            "Language Understanding and Generation",
            "Commonsense and Social Reasoning"
        ],
        "input_format": "Photographs",
        "app": "Knowledge"
    },
    "next_action_prediction": {
        "max_num_query_image": 15,
        "output_format": "multiple_choice",
        "max_num_query_video": 0,
        "num_input": "video",
        "taxonomy_tree_path": "Perception;Temporal_Understanding",
        "skills": [
            "Spatial and Temporal Reasoning",
            "Scene and Event Understanding"
        ],
        "input_format": "Videos",
        "app": "Perception"
    },
    "geometry_area": {
        "max_num_query_image": 1,
        "output_format": "numerical_data",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Mathematics;Geometry",
        "skills": [
            "Mathematical and Logical Reasoning",
            "Spatial and Temporal Reasoning"
        ],
        "input_format": "Diagrams and Data Visualizations",
        "app": "Mathematics"
    },
    "av_view_identification": {
        "max_num_query_image": 6,
        "output_format": "contextual_formatted_text",
        "max_num_query_video": 0,
        "num_input": "6-8 images",
        "taxonomy_tree_path": "Perception;3D_understanding",
        "skills": [
            "Object Recognition and Classification",
            "Scene and Event Understanding",
            "Spatial and Temporal Reasoning"
        ],
        "input_format": "Photographs",
        "app": "Perception"
    },
    "newspaper_ocr_in_query_box": {
        "max_num_query_image": 1,
        "output_format": "contextual_formatted_text",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Perception;Diagram_and_Document_Understanding;Document;Document_Info_Parsing",
        "skills": [
            "Text Recognition (OCR)",
            "Spatial and Temporal Reasoning"
        ],
        "input_format": "Text-Based Images and Documents",
        "app": "Perception"
    },
    "code_programming_test_advanced": {
        "max_num_query_image": 2,
        "output_format": "structured_output",
        "max_num_query_video": 0,
        "num_input": "2-3 images",
        "taxonomy_tree_path": "Coding;Code_Generation;Programming_Problems",
        "skills": [
            "Text Recognition (OCR)",
            "Mathematical and Logical Reasoning",
            "Domain-Specific Knowledge and Skills"
        ],
        "input_format": "Text-Based Images and Documents",
        "app": "Coding"
    },
    "code_programming_extremely_hard": {
        "max_num_query_image": 5,
        "output_format": "structured_output",
        "max_num_query_video": 0,
        "num_input": "4-5 images",
        "taxonomy_tree_path": "Coding;Code_Generation;Programming_Problems",
        "skills": [
            "Text Recognition (OCR)",
            "Mathematical and Logical Reasoning",
            "Planning and Decision Making"
        ],
        "input_format": "Text-Based Images and Documents",
        "app": "Coding"
    },
    "math_exams_v": {
        "max_num_query_image": 1,
        "output_format": "multiple_choice",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Mathematics;General",
        "skills": [
            "Mathematical and Logical Reasoning",
            "Text Recognition (OCR)",
            "Language Understanding and Generation"
        ],
        "input_format": "Diagrams and Data Visualizations",
        "app": "Mathematics"
    },
    "perception_test_video_character_order": {
        "max_num_query_image": 0,
        "output_format": "contextual_formatted_text",
        "max_num_query_video": 1,
        "num_input": "video",
        "taxonomy_tree_path": "Planning;Reordering",
        "skills": [
            "Object Recognition and Classification",
            "Text Recognition (OCR)",
            "Spatial and Temporal Reasoning"
        ],
        "input_format": "Videos",
        "app": "Planning"
    },
    "coco_object_detection_by_query_property": {
        "max_num_query_image": 1,
        "output_format": "numerical_data",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Perception;Visual_Recognition",
        "skills": [
            "Object Recognition and Classification",
            "Scene and Event Understanding"
        ],
        "input_format": "Photographs",
        "app": "Perception"
    },
    "3d_indoor_scene_text_bbox_prediction": {
        "max_num_query_image": 0,
        "output_format": "numerical_data",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Perception;3D_understanding",
        "skills": [
            "Object Recognition and Classification",
            "Scene and Event Understanding",
            "Spatial and Temporal Reasoning"
        ],
        "input_format": "3D Models and Aerial Imagery",
        "app": "Perception"
    },
    "top_rated_hotel_identification": {
        "max_num_query_image": 1,
        "output_format": "contextual_formatted_text",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Information_Extraction;Compound_Search_and_Calculate",
        "skills": [
            "Object Recognition and Classification",
            "Text Recognition (OCR)"
        ],
        "input_format": "User Interface Screenshots",
        "app": "Information_Extraction"
    },
    "widerface_face_count_and_event_classification": {
        "max_num_query_image": 1,
        "output_format": "structured_output",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Perception;Visual_Recognition",
        "skills": [
            "Object Recognition and Classification",
            "Scene and Event Understanding"
        ],
        "input_format": "Photographs",
        "app": "Perception"
    },
    "math_parity": {
        "max_num_query_image": 1,
        "output_format": "multiple_choice",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Mathematics;Functions",
        "skills": [
            "Mathematical and Logical Reasoning",
            "Spatial and Temporal Reasoning"
        ],
        "input_format": "Diagrams and Data Visualizations",
        "app": "Mathematics"
    },
    "snli_ve_visual_entailment": {
        "max_num_query_image": 1,
        "output_format": "exact_text",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Perception;Object_and_Scene_Understanding",
        "skills": [
            "Scene and Event Understanding",
            "Language Understanding and Generation",
            "Commonsense and Social Reasoning"
        ],
        "input_format": "Photographs",
        "app": "Perception"
    },
    "brand_logo_recognition_and_elaboration": {
        "max_num_query_image": 1,
        "output_format": "structured_output",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Knowledge;World_Knowledge;Logo_and_Sign",
        "skills": [
            "Object Recognition and Classification",
            "Domain-Specific Knowledge and Skills"
        ],
        "input_format": "Photographs",
        "app": "Knowledge"
    },
    "logo2k_same_type_logo_retrieval": {
        "max_num_query_image": 6,
        "output_format": "structured_output",
        "max_num_query_video": 0,
        "num_input": "6-8 images",
        "taxonomy_tree_path": "Knowledge;World_Knowledge;Logo_and_Sign",
        "skills": [
            "Object Recognition and Classification"
        ],
        "input_format": "Text-Based Images and Documents",
        "app": "Knowledge"
    },
    "clevr_arithmetic": {
        "max_num_query_image": 2,
        "output_format": "numerical_data",
        "max_num_query_video": 0,
        "num_input": "2-3 images",
        "taxonomy_tree_path": "Mathematics;Numeric_Reasoning",
        "skills": [
            "Object Recognition and Classification",
            "Mathematical and Logical Reasoning"
        ],
        "input_format": "Photographs",
        "app": "Mathematics"
    },
    "super_clevr_scene_understanding": {
        "max_num_query_image": 1,
        "output_format": "contextual_formatted_text",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Perception;Object_and_Scene_Understanding",
        "skills": [
            "Object Recognition and Classification",
            "Spatial and Temporal Reasoning",
            "Mathematical and Logical Reasoning"
        ],
        "input_format": "Photographs",
        "app": "Perception"
    },
    "visual_dialog_image_guessing": {
        "max_num_query_image": 0,
        "output_format": "multiple_choice",
        "max_num_query_video": 0,
        "num_input": "9-image or more",
        "taxonomy_tree_path": "Perception;Object_and_Scene_Understanding",
        "skills": [
            "Object Recognition and Classification",
            "Scene and Event Understanding",
            "Language Understanding and Generation"
        ],
        "input_format": "Photographs",
        "app": "Perception"
    },
    "science_molecule_chemistry": {
        "max_num_query_image": 1,
        "output_format": "contextual_formatted_text",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Science;Chemistry",
        "skills": [
            "Mathematical and Logical Reasoning",
            "Domain-Specific Knowledge and Skills"
        ],
        "input_format": "Diagrams and Data Visualizations",
        "app": "Science"
    },
    "handwritten_math_expression_extraction": {
        "max_num_query_image": 1,
        "output_format": "contextual_formatted_text",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Perception;Visual_Recognition",
        "skills": [
            "Text Recognition (OCR)",
            "Mathematical and Logical Reasoning"
        ],
        "input_format": "Text-Based Images and Documents",
        "app": "Perception"
    },
    "chemistry_exams_v": {
        "max_num_query_image": 1,
        "output_format": "multiple_choice",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Science;Chemistry",
        "skills": [
            "Text Recognition (OCR)",
            "Domain-Specific Knowledge and Skills"
        ],
        "input_format": "Diagrams and Data Visualizations",
        "app": "Science"
    },
    "graph_hamiltonian_path": {
        "max_num_query_image": 6,
        "output_format": "structured_output",
        "max_num_query_video": 0,
        "num_input": "6-8 images",
        "taxonomy_tree_path": "Mathematics;Graph_Theory",
        "skills": [
            "Mathematical and Logical Reasoning",
            "Spatial and Temporal Reasoning"
        ],
        "input_format": "Diagrams and Data Visualizations",
        "app": "Mathematics"
    },
    "perception_test_video_action_count": {
        "max_num_query_image": 0,
        "output_format": "numerical_data",
        "max_num_query_video": 1,
        "num_input": "video",
        "taxonomy_tree_path": "Perception;Temporal_Understanding",
        "skills": [
            "Scene and Event Understanding",
            "Spatial and Temporal Reasoning"
        ],
        "input_format": "Videos",
        "app": "Perception"
    },
    "star_object_interaction_video": {
        "max_num_query_image": 0,
        "output_format": "contextual_formatted_text",
        "max_num_query_video": 1,
        "num_input": "video",
        "taxonomy_tree_path": "Information_Extraction;Multimodal_QA",
        "skills": [
            "Object Recognition and Classification",
            "Scene and Event Understanding"
        ],
        "input_format": "Videos",
        "app": "Information_Extraction"
    },
    "chess_puzzle_single_step": {
        "max_num_query_image": 1,
        "output_format": "exact_text",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Planning;Puzzles_and_Games;Board_Games",
        "skills": [
            "Spatial and Temporal Reasoning",
            "Planning and Decision Making",
            "Mathematical and Logical Reasoning"
        ],
        "input_format": "Photographs",
        "app": "Planning"
    },
    "movie_info_retrieval": {
        "max_num_query_image": 0,
        "output_format": "contextual_formatted_text",
        "max_num_query_video": 0,
        "num_input": "9-image or more",
        "taxonomy_tree_path": "Information_Extraction;Compound_Search_and_Calculate",
        "skills": [
            "Text Recognition (OCR)",
            "Object Recognition and Classification",
            "Language Understanding and Generation"
        ],
        "input_format": "User Interface Screenshots",
        "app": "Information_Extraction"
    },
    "exchange_rate_estimate_plot": {
        "max_num_query_image": 1,
        "output_format": "numerical_data",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Perception;Diagram_and_Document_Understanding;Diagram;Estimation",
        "skills": [
            "Mathematical and Logical Reasoning",
            "Spatial and Temporal Reasoning"
        ],
        "input_format": "Diagrams and Data Visualizations",
        "app": "Perception"
    },
    "graph_chordless_cycle": {
        "max_num_query_image": 1,
        "output_format": "numerical_data",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Mathematics;Graph_Theory",
        "skills": [
            "Mathematical and Logical Reasoning",
            "Spatial and Temporal Reasoning"
        ],
        "input_format": "Diagrams and Data Visualizations",
        "app": "Mathematics"
    },
    "av_multicamera_tracking_predict_bbox": {
        "max_num_query_image": 9,
        "output_format": "numerical_data",
        "max_num_query_video": 0,
        "num_input": "9-image or more",
        "taxonomy_tree_path": "Perception;Visual_Recognition",
        "skills": [
            "Object Recognition and Classification",
            "Spatial and Temporal Reasoning"
        ],
        "input_format": "Photographs",
        "app": "Perception"
    },
    "mvsa_sentiment_classification": {
        "max_num_query_image": 1,
        "output_format": "multiple_choice",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Knowledge;Human_and_Culture;emotion_and_intent_understanding",
        "skills": [
            "Object Recognition and Classification",
            "Language Understanding and Generation",
            "Commonsense and Social Reasoning"
        ],
        "input_format": "Photographs",
        "app": "Knowledge"
    },
    "math_breakpoint": {
        "max_num_query_image": 1,
        "output_format": "numerical_data",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Mathematics;Functions",
        "skills": [
            "Mathematical and Logical Reasoning",
            "Spatial and Temporal Reasoning"
        ],
        "input_format": "Diagrams and Data Visualizations",
        "app": "Mathematics"
    },
    "ili_ratio_future_prediction": {
        "max_num_query_image": 1,
        "output_format": "numerical_data",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Perception;Diagram_and_Document_Understanding;Diagram;Future_Prediction",
        "skills": [
            "Mathematical and Logical Reasoning",
            "Spatial and Temporal Reasoning"
        ],
        "input_format": "Diagrams and Data Visualizations",
        "app": "Perception"
    },
    "dvqa": {
        "max_num_query_image": 1,
        "output_format": "numerical_data",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Perception;Diagram_and_Document_Understanding;Diagram;Diagram_QA",
        "skills": [
            "Object Recognition and Classification",
            "Mathematical and Logical Reasoning",
            "Text Recognition (OCR)"
        ],
        "input_format": "Diagrams and Data Visualizations",
        "app": "Perception"
    },
    "geometry_solid": {
        "max_num_query_image": 1,
        "output_format": "contextual_formatted_text",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Mathematics;Geometry",
        "skills": [
            "Object Recognition and Classification",
            "Mathematical and Logical Reasoning",
            "Spatial and Temporal Reasoning"
        ],
        "input_format": "Diagrams and Data Visualizations",
        "app": "Mathematics"
    },
    "tqa_textbook_qa": {
        "max_num_query_image": 1,
        "output_format": "contextual_formatted_text",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Science;STEM",
        "skills": [
            "Object Recognition and Classification",
            "Scene and Event Understanding",
            "Text Recognition (OCR)"
        ],
        "input_format": "Diagrams and Data Visualizations",
        "app": "Science"
    },
    "question_solution_solving": {
        "max_num_query_image": 1,
        "output_format": "contextual_formatted_text",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Science;STEM",
        "skills": [
            "Text Recognition (OCR)",
            "Mathematical and Logical Reasoning",
            "Language Understanding and Generation"
        ],
        "input_format": "User Interface Screenshots",
        "app": "Science"
    },
    "science_basic_physics": {
        "max_num_query_image": 1,
        "output_format": "contextual_formatted_text",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Science;Physics",
        "skills": [
            "Mathematical and Logical Reasoning",
            "Domain-Specific Knowledge and Skills"
        ],
        "input_format": "Diagrams and Data Visualizations",
        "app": "Science"
    },
    "face_keypoint_detection": {
        "max_num_query_image": 1,
        "output_format": "structured_output",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Perception;Visual_Recognition",
        "skills": [
            "Object Recognition and Classification",
            "Scene and Event Understanding"
        ],
        "input_format": "Photographs",
        "app": "Perception"
    },
    "music_info_retrieval": {
        "max_num_query_image": 0,
        "output_format": "contextual_formatted_text",
        "max_num_query_video": 0,
        "num_input": "9-image or more",
        "taxonomy_tree_path": "Information_Extraction;Search_by_Attribute_wo_Calculate",
        "skills": [
            "Object Recognition and Classification",
            "Text Recognition (OCR)"
        ],
        "input_format": "User Interface Screenshots",
        "app": "Information_Extraction"
    },
    "figureqa": {
        "max_num_query_image": 1,
        "output_format": "multiple_choice",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Perception;Diagram_and_Document_Understanding;Diagram;Diagram_QA",
        "skills": [
            "Object Recognition and Classification",
            "Mathematical and Logical Reasoning"
        ],
        "input_format": "Diagrams and Data Visualizations",
        "app": "Perception"
    },
    "chess_winner_identification": {
        "max_num_query_image": 1,
        "output_format": "exact_text",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Planning;Puzzles_and_Games;Board_Games",
        "skills": [
            "Object Recognition and Classification",
            "Spatial and Temporal Reasoning",
            "Mathematical and Logical Reasoning"
        ],
        "input_format": "Diagrams and Data Visualizations",
        "app": "Planning"
    },
    "algebra": {
        "max_num_query_image": 1,
        "output_format": "contextual_formatted_text",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Mathematics;Algebra",
        "skills": [
            "Mathematical and Logical Reasoning",
            "Spatial and Temporal Reasoning"
        ],
        "input_format": "Diagrams and Data Visualizations",
        "app": "Mathematics"
    },
    "clevrer_video_moving_object_count": {
        "max_num_query_image": 0,
        "output_format": "numerical_data",
        "max_num_query_video": 1,
        "num_input": "video",
        "taxonomy_tree_path": "Perception;Counting",
        "skills": [
            "Object Recognition and Classification",
            "Spatial and Temporal Reasoning"
        ],
        "input_format": "Videos",
        "app": "Perception"
    },
    "math_convexity_value_estimation": {
        "max_num_query_image": 1,
        "output_format": "structured_output",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Mathematics;Functions",
        "skills": [
            "Mathematical and Logical Reasoning",
            "Object Recognition and Classification"
        ],
        "input_format": "Diagrams and Data Visualizations",
        "app": "Mathematics"
    },
    "map_diagram_qa": {
        "max_num_query_image": 1,
        "output_format": "contextual_formatted_text",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Perception;Diagram_and_Document_Understanding;Diagram;Diagram_QA",
        "skills": [
            "Object Recognition and Classification",
            "Text Recognition (OCR)",
            "Spatial and Temporal Reasoning"
        ],
        "input_format": "Diagrams and Data Visualizations",
        "app": "Perception"
    },
    "deciphering_oracle_bone": {
        "max_num_query_image": 1,
        "output_format": "exact_text",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Knowledge;World_Knowledge",
        "skills": [
            "Text Recognition (OCR)",
            "Domain-Specific Knowledge and Skills"
        ],
        "input_format": "Text-Based Images and Documents",
        "app": "Knowledge"
    },
    "funsd_document_qa": {
        "max_num_query_image": 1,
        "output_format": "contextual_formatted_text",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Perception;Diagram_and_Document_Understanding;Document;Document_QA",
        "skills": [
            "Text Recognition (OCR)",
            "Language Understanding and Generation"
        ],
        "input_format": "Text-Based Images and Documents",
        "app": "Perception"
    },
    "newspaper_page_parse_and_count": {
        "max_num_query_image": 1,
        "output_format": "structured_output",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Information_Extraction;Compound_Search_and_Calculate",
        "skills": [
            "Object Recognition and Classification",
            "Text Recognition (OCR)",
            "Scene and Event Understanding"
        ],
        "input_format": "Text-Based Images and Documents",
        "app": "Information_Extraction"
    },
    "weather_info_parsing": {
        "max_num_query_image": 1,
        "output_format": "structured_output",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Information_Extraction;Structured_Parsing",
        "skills": [
            "Text Recognition (OCR)",
            "Object Recognition and Classification"
        ],
        "input_format": "User Interface Screenshots",
        "app": "Information_Extraction"
    },
    "vibe_eval_open": {
        "max_num_query_image": 1,
        "output_format": "contextual_formatted_text",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Knowledge;World_Knowledge;Daily",
        "skills": [
            "Object Recognition and Classification",
            "Commonsense and Social Reasoning"
        ],
        "input_format": "Photographs",
        "app": "Knowledge"
    },
    "3d_indoor_scene_text_bbox_selection": {
        "max_num_query_image": 0,
        "output_format": "multiple_choice",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Perception;3D_understanding",
        "skills": [
            "Object Recognition and Classification",
            "Scene and Event Understanding",
            "Spatial and Temporal Reasoning"
        ],
        "input_format": "3D Models and Aerial Imagery",
        "app": "Perception"
    },
    "signboard_identification": {
        "max_num_query_image": 1,
        "output_format": "contextual_formatted_text",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Perception;Visual_Recognition",
        "skills": [
            "Text Recognition (OCR)",
            "Language Understanding and Generation"
        ],
        "input_format": "Photographs",
        "app": "Perception"
    },
    "game_info_parsing": {
        "max_num_query_image": 1,
        "output_format": "structured_output",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Information_Extraction;Structured_Parsing",
        "skills": [
            "Object Recognition and Classification",
            "Text Recognition (OCR)"
        ],
        "input_format": "User Interface Screenshots",
        "app": "Information_Extraction"
    },
    "cam_traj_to_video_selection": {
        "max_num_query_image": 1,
        "output_format": "contextual_formatted_text",
        "max_num_query_video": 2,
        "num_input": "video",
        "taxonomy_tree_path": "Perception;Temporal_Understanding",
        "skills": [
            "Spatial and Temporal Reasoning",
            "Scene and Event Understanding"
        ],
        "input_format": "Videos",
        "app": "Perception"
    },
    "clevrer_video_moving_object_property_recognition": {
        "max_num_query_image": 0,
        "output_format": "contextual_formatted_text",
        "max_num_query_video": 1,
        "num_input": "video",
        "taxonomy_tree_path": "Perception;Physical_Understanding;Physical_Reasoning",
        "skills": [
            "Object Recognition and Classification",
            "Spatial and Temporal Reasoning"
        ],
        "input_format": "Videos",
        "app": "Perception"
    },
    "geometry_analytic": {
        "max_num_query_image": 1,
        "output_format": "contextual_formatted_text",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Mathematics;Geometry",
        "skills": [
            "Object Recognition and Classification",
            "Spatial and Temporal Reasoning",
            "Mathematical and Logical Reasoning"
        ],
        "input_format": "Diagrams and Data Visualizations",
        "app": "Mathematics"
    },
    "long_string_number_recognition": {
        "max_num_query_image": 1,
        "output_format": "exact_text",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Perception;Visual_Recognition",
        "skills": [
            "Text Recognition (OCR)"
        ],
        "input_format": "Text-Based Images and Documents",
        "app": "Perception"
    },
    "waybill_number_sequence_extraction": {
        "max_num_query_image": 1,
        "output_format": "contextual_formatted_text",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Perception;Visual_Recognition",
        "skills": [
            "Text Recognition (OCR)"
        ],
        "input_format": "Text-Based Images and Documents",
        "app": "Perception"
    },
    "egocentric_spatial_reasoning": {
        "max_num_query_image": 1,
        "output_format": "contextual_formatted_text",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Perception;Spatial_Understanding",
        "skills": [
            "Spatial and Temporal Reasoning",
            "Object Recognition and Classification"
        ],
        "input_format": "Photographs",
        "app": "Perception"
    },
    "code_error_line_identification": {
        "max_num_query_image": 2,
        "output_format": "multiple_choice",
        "max_num_query_video": 0,
        "num_input": "2-3 images",
        "taxonomy_tree_path": "Coding;Code_Debugging",
        "skills": [
            "Domain-Specific Knowledge and Skills",
            "Mathematical and Logical Reasoning"
        ],
        "input_format": "Text-Based Images and Documents",
        "app": "Coding"
    },
    "single_person_pose_estimation": {
        "max_num_query_image": 1,
        "output_format": "structured_output",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Perception;Visual_Recognition",
        "skills": [
            "Object Recognition and Classification",
            "Scene and Event Understanding"
        ],
        "input_format": "Photographs",
        "app": "Perception"
    },
    "photo_sharing_image_retrieval": {
        "max_num_query_image": 0,
        "output_format": "multiple_choice",
        "max_num_query_video": 0,
        "num_input": "9-image or more",
        "taxonomy_tree_path": "Perception;Diagram_and_Document_Understanding;Diagram",
        "skills": [
            "Object Recognition and Classification",
            "Scene and Event Understanding",
            "Language Understanding and Generation"
        ],
        "input_format": "Photographs",
        "app": "Perception"
    },
    "quizlet_question_solving": {
        "max_num_query_image": 1,
        "output_format": "contextual_formatted_text",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Science;STEM",
        "skills": [
            "Text Recognition (OCR)",
            "Mathematical and Logical Reasoning"
        ],
        "input_format": "User Interface Screenshots",
        "app": "Science"
    },
    "chart_vqa": {
        "max_num_query_image": 1,
        "output_format": "numerical_data",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Perception;Diagram_and_Document_Understanding;Diagram;Diagram_QA",
        "skills": [
            "Object Recognition and Classification",
            "Mathematical and Logical Reasoning"
        ],
        "input_format": "Diagrams and Data Visualizations",
        "app": "Perception"
    },
    "electricity_future_prediction_from_table": {
        "max_num_query_image": 1,
        "output_format": "numerical_data",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Perception;Diagram_and_Document_Understanding;Table_QA",
        "skills": [
            "Mathematical and Logical Reasoning",
            "Spatial and Temporal Reasoning"
        ],
        "input_format": "Diagrams and Data Visualizations",
        "app": "Perception"
    },
    "humor_understand_caption_match": {
        "max_num_query_image": 1,
        "output_format": "exact_text",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Knowledge;Human_and_Culture;emotion_and_intent_understanding",
        "skills": [
            "Commonsense and Social Reasoning",
            "Language Understanding and Generation"
        ],
        "input_format": "Artistic and Creative Content",
        "app": "Knowledge"
    },
    "hotel_booking_confirmation_parsing": {
        "max_num_query_image": 1,
        "output_format": "structured_output",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Information_Extraction;Structured_Parsing",
        "skills": [
            "Text Recognition (OCR)",
            "Object Recognition and Classification"
        ],
        "input_format": "User Interface Screenshots",
        "app": "Information_Extraction"
    },
    "sta_action_localization_video": {
        "max_num_query_image": 0,
        "output_format": "contextual_formatted_text",
        "max_num_query_video": 1,
        "num_input": "video",
        "taxonomy_tree_path": "Perception;Temporal_Understanding",
        "skills": [
            "Scene and Event Understanding",
            "Spatial and Temporal Reasoning"
        ],
        "input_format": "Videos",
        "app": "Perception"
    },
    "geometry_descriptive": {
        "max_num_query_image": 1,
        "output_format": "contextual_formatted_text",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Mathematics;Geometry",
        "skills": [
            "Spatial and Temporal Reasoning",
            "Mathematical and Logical Reasoning"
        ],
        "input_format": "Diagrams and Data Visualizations",
        "app": "Mathematics"
    },
    "coco_person_detection": {
        "max_num_query_image": 1,
        "output_format": "numerical_data",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Perception;Visual_Recognition",
        "skills": [
            "Object Recognition and Classification"
        ],
        "input_format": "Photographs",
        "app": "Perception"
    },
    "video_content_reasoning": {
        "max_num_query_image": 21,
        "output_format": "contextual_formatted_text",
        "max_num_query_video": 0,
        "num_input": "video",
        "taxonomy_tree_path": "Information_Extraction;Summarization",
        "skills": [
            "Object Recognition and Classification",
            "Scene and Event Understanding",
            "Spatial and Temporal Reasoning"
        ],
        "input_format": "Videos",
        "app": "Information_Extraction"
    },
    "graph_maxflow": {
        "max_num_query_image": 1,
        "output_format": "numerical_data",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Mathematics;Graph_Theory",
        "skills": [
            "Mathematical and Logical Reasoning",
            "Spatial and Temporal Reasoning"
        ],
        "input_format": "Diagrams and Data Visualizations",
        "app": "Mathematics"
    },
    "places365_scene_type_classification": {
        "max_num_query_image": 1,
        "output_format": "exact_text",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Perception;Visual_Recognition",
        "skills": [
            "Object Recognition and Classification",
            "Scene and Event Understanding"
        ],
        "input_format": "Photographs",
        "app": "Perception"
    },
    "research_website_parsing_blogpost": {
        "max_num_query_image": 0,
        "output_format": "contextual_formatted_text",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Information_Extraction;Multimodal_QA;Large_Image",
        "skills": [
            "Text Recognition (OCR)",
            "Language Understanding and Generation"
        ],
        "input_format": "Text-Based Images and Documents",
        "app": "Information_Extraction"
    },
    "research_website_parsing_publication": {
        "max_num_query_image": 0,
        "output_format": "contextual_formatted_text",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Information_Extraction;Multimodal_QA;Large_Image",
        "skills": [
            "Text Recognition (OCR)",
            "Language Understanding and Generation"
        ],
        "input_format": "Text-Based Images and Documents",
        "app": "Information_Extraction"
    },
    "research_website_parsing_homepage": {
        "max_num_query_image": 0,
        "output_format": "contextual_formatted_text",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Information_Extraction;Multimodal_QA;Large_Image",
        "skills": [
            "Text Recognition (OCR)",
            "Language Understanding and Generation",
            "Domain-Specific Knowledge and Skills"
        ],
        "input_format": "Text-Based Images and Documents",
        "app": "Information_Extraction"
    },
    "reward_models_i2t_reward": {
        "max_num_query_image": 1,
        "output_format": "exact_text",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Metrics;Reward_Models",
        "skills": [
            "Object Recognition and Classification",
            "Language Understanding and Generation",
            "Commonsense and Social Reasoning"
        ],
        "input_format": "Photographs",
        "app": "Metrics"
    },
    "reward_models_t2i_reward": {
        "max_num_query_image": 2,
        "output_format": "exact_text",
        "max_num_query_video": 0,
        "num_input": "2-3 images",
        "taxonomy_tree_path": "Metrics;Reward_Models",
        "skills": [
            "Object Recognition and Classification",
            "Scene and Event Understanding"
        ],
        "input_format": "Photographs",
        "app": "Metrics"
    },
    "contain_contain_images": {
        "max_num_query_image": 2,
        "output_format": "open_ended_output",
        "max_num_query_video": 0,
        "num_input": "2-3 images",
        "taxonomy_tree_path": "Perception;Multimodal_Constrained_Captioning",
        "skills": [
            "Object Recognition and Classification",
            "Language Understanding and Generation"
        ],
        "input_format": "Photographs",
        "app": "Perception"
    },
    "contain_repeat_length": {
        "max_num_query_image": 1,
        "output_format": "open_ended_output",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Perception;Multimodal_Constrained_Captioning",
        "skills": [
            "Object Recognition and Classification",
            "Language Understanding and Generation"
        ],
        "input_format": "Photographs",
        "app": "Perception"
    },
    "multi_contain_repeat_position_only_length": {
        "max_num_query_image": 2,
        "output_format": "open_ended_output",
        "max_num_query_video": 0,
        "num_input": "2-3 images",
        "taxonomy_tree_path": "Perception;Multimodal_Constrained_Captioning",
        "skills": [
            "Object Recognition and Classification",
            "Language Understanding and Generation",
            "Spatial and Temporal Reasoning"
        ],
        "input_format": "Photographs",
        "app": "Perception"
    },
    "contain_length": {
        "max_num_query_image": 1,
        "output_format": "open_ended_output",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Perception;Multimodal_Constrained_Captioning",
        "skills": [
            "Object Recognition and Classification",
            "Language Understanding and Generation"
        ],
        "input_format": "Photographs",
        "app": "Perception"
    },
    "contain_position_images": {
        "max_num_query_image": 2,
        "output_format": "open_ended_output",
        "max_num_query_video": 0,
        "num_input": "2-3 images",
        "taxonomy_tree_path": "Perception;Multimodal_Constrained_Captioning",
        "skills": [
            "Object Recognition and Classification",
            "Language Understanding and Generation",
            "Scene and Event Understanding"
        ],
        "input_format": "Photographs",
        "app": "Perception"
    },
    "contain_position_length": {
        "max_num_query_image": 1,
        "output_format": "open_ended_output",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Perception;Multimodal_Constrained_Captioning",
        "skills": [
            "Object Recognition and Classification",
            "Language Understanding and Generation"
        ],
        "input_format": "Photographs",
        "app": "Perception"
    },
    "xor_images": {
        "max_num_query_image": 2,
        "output_format": "open_ended_output",
        "max_num_query_video": 0,
        "num_input": "2-3 images",
        "taxonomy_tree_path": "Perception;Multimodal_Constrained_Captioning",
        "skills": [
            "Object Recognition and Classification",
            "Language Understanding and Generation"
        ],
        "input_format": "Photographs",
        "app": "Perception"
    },
    "multi_contain_repeat": {
        "max_num_query_image": 3,
        "output_format": "open_ended_output",
        "max_num_query_video": 0,
        "num_input": "2-3 images",
        "taxonomy_tree_path": "Perception;Multimodal_Constrained_Captioning",
        "skills": [
            "Language Understanding and Generation",
            "Object Recognition and Classification"
        ],
        "input_format": "Photographs",
        "app": "Perception"
    },
    "contain_contain_length": {
        "max_num_query_image": 1,
        "output_format": "open_ended_output",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Perception;Multimodal_Constrained_Captioning",
        "skills": [
            "Object Recognition and Classification",
            "Language Understanding and Generation"
        ],
        "input_format": "Photographs",
        "app": "Perception"
    },
    "multi_contain_position_only": {
        "max_num_query_image": 3,
        "output_format": "open_ended_output",
        "max_num_query_video": 0,
        "num_input": "2-3 images",
        "taxonomy_tree_path": "Perception;Multimodal_Constrained_Captioning",
        "skills": [
            "Language Understanding and Generation",
            "Scene and Event Understanding"
        ],
        "input_format": "Photographs",
        "app": "Perception"
    },
    "panel_images_single_question": {
        "max_num_query_image": 1,
        "output_format": "multiple_choice",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Perception;Diagram_and_Document_Understanding;Diagram;Diagram_QA",
        "skills": [
            "Object Recognition and Classification",
            "Scene and Event Understanding"
        ],
        "input_format": "Artistic and Creative Content",
        "app": "Perception"
    },
    "panel_images_multi_question": {
        "max_num_query_image": 1,
        "output_format": "structured_output",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Perception;Diagram_and_Document_Understanding;Diagram;Diagram_QA",
        "skills": [
            "Object Recognition and Classification",
            "Scene and Event Understanding"
        ],
        "input_format": "Artistic and Creative Content",
        "app": "Perception"
    },
    "chess_puzzles_checkmate": {
        "max_num_query_image": 1,
        "output_format": "structured_output",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Planning;Puzzles_and_Games;Board_Games",
        "skills": [
            "Spatial and Temporal Reasoning",
            "Planning and Decision Making",
            "Mathematical and Logical Reasoning"
        ],
        "input_format": "User Interface Screenshots",
        "app": "Planning"
    },
    "chess_puzzles_equality": {
        "max_num_query_image": 1,
        "output_format": "structured_output",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Planning;Puzzles_and_Games;Board_Games",
        "skills": [
            "Spatial and Temporal Reasoning",
            "Planning and Decision Making",
            "Mathematical and Logical Reasoning"
        ],
        "input_format": "User Interface Screenshots",
        "app": "Planning"
    },
    "chess_puzzles_crushing": {
        "max_num_query_image": 1,
        "output_format": "exact_text",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Planning;Puzzles_and_Games;Board_Games",
        "skills": [
            "Planning and Decision Making",
            "Mathematical and Logical Reasoning"
        ],
        "input_format": "User Interface Screenshots",
        "app": "Planning"
    },
    "table_understanding_fact_verification": {
        "max_num_query_image": 1,
        "output_format": "contextual_formatted_text",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Perception;Diagram_and_Document_Understanding;Table_QA",
        "skills": [
            "Text Recognition (OCR)",
            "Mathematical and Logical Reasoning",
            "Language Understanding and Generation"
        ],
        "input_format": "Diagrams and Data Visualizations",
        "app": "Perception"
    },
    "table_understanding_complex_question_answering": {
        "max_num_query_image": 1,
        "output_format": "contextual_formatted_text",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Perception;Diagram_and_Document_Understanding;Table_QA",
        "skills": [
            "Text Recognition (OCR)",
            "Mathematical and Logical Reasoning"
        ],
        "input_format": "Diagrams and Data Visualizations",
        "app": "Perception"
    },
    "number_puzzle_sudoku": {
        "max_num_query_image": 1,
        "output_format": "contextual_formatted_text",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Planning;Puzzles_and_Games",
        "skills": [
            "Mathematical and Logical Reasoning",
            "Spatial and Temporal Reasoning"
        ],
        "input_format": "Diagrams and Data Visualizations",
        "app": "Planning"
    },
    "number_puzzle_kakuro_5x5": {
        "max_num_query_image": 1,
        "output_format": "exact_text",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Planning;Puzzles_and_Games",
        "skills": [
            "Mathematical and Logical Reasoning",
            "Spatial and Temporal Reasoning"
        ],
        "input_format": "Diagrams and Data Visualizations",
        "app": "Planning"
    },
    "pictionary_chinese_food_img2en": {
        "max_num_query_image": 1,
        "output_format": "exact_text",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Planning;Puzzles_and_Games;Pictionary",
        "skills": [
            "Object Recognition and Classification",
            "Language Understanding and Generation"
        ],
        "input_format": "Photographs",
        "app": "Planning"
    },
    "pictionary_skribbl_io": {
        "max_num_query_image": 1,
        "output_format": "exact_text",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Planning;Puzzles_and_Games;Pictionary",
        "skills": [
            "Object Recognition and Classification",
            "Language Understanding and Generation"
        ],
        "input_format": "Artistic and Creative Content",
        "app": "Planning"
    },
    "pictionary_genai_output_chinese": {
        "max_num_query_image": 1,
        "output_format": "exact_text",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Planning;Puzzles_and_Games;Pictionary",
        "skills": [
            "Object Recognition and Classification",
            "Language Understanding and Generation"
        ],
        "input_format": "Artistic and Creative Content",
        "app": "Planning"
    },
    "pictionary_doodle_guess": {
        "max_num_query_image": 1,
        "output_format": "exact_text",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Planning;Puzzles_and_Games;Pictionary",
        "skills": [
            "Object Recognition and Classification",
            "Language Understanding and Generation"
        ],
        "input_format": "Artistic and Creative Content",
        "app": "Planning"
    },
    "pictionary_cartoon_drawing_guess": {
        "max_num_query_image": 1,
        "output_format": "exact_text",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Planning;Puzzles_and_Games;Pictionary",
        "skills": [
            "Object Recognition and Classification",
            "Language Understanding and Generation"
        ],
        "input_format": "Artistic and Creative Content",
        "app": "Planning"
    },
    "face_swap": {
        "max_num_query_image": 1,
        "output_format": "multiple_choice",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Knowledge;Fact_Checking",
        "skills": [
            "Object Recognition and Classification",
            "Scene and Event Understanding",
            "Language Understanding and Generation",
            "Commonsense and Social Reasoning"
        ],
        "input_format": "Photographs",
        "app": "Knowledge"
    },
    "veracity": {
        "max_num_query_image": 1,
        "output_format": "multiple_choice",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Knowledge;Fact_Checking",
        "skills": [
            "Object Recognition and Classification",
            "Scene and Event Understanding",
            "Text Recognition (OCR)"
        ],
        "input_format": "Diagrams and Data Visualizations",
        "app": "Knowledge"
    },
    "out_of_context": {
        "max_num_query_image": 1,
        "output_format": "multiple_choice",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Knowledge;Fact_Checking",
        "skills": [
            "Scene and Event Understanding",
            "Language Understanding and Generation",
            "Commonsense and Social Reasoning"
        ],
        "input_format": "Photographs",
        "app": "Knowledge"
    },
    "background_change": {
        "max_num_query_image": 1,
        "output_format": "multiple_choice",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Knowledge;Fact_Checking",
        "skills": [
            "Scene and Event Understanding",
            "Object Recognition and Classification"
        ],
        "input_format": "Photographs",
        "app": "Knowledge"
    },
    "clip_stable_diffusion_generate": {
        "max_num_query_image": 1,
        "output_format": "multiple_choice",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Knowledge;Fact_Checking",
        "skills": [
            "Object Recognition and Classification",
            "Scene and Event Understanding"
        ],
        "input_format": "Photographs",
        "app": "Knowledge"
    },
    "text_style": {
        "max_num_query_image": 1,
        "output_format": "multiple_choice",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Knowledge;Fact_Checking",
        "skills": [
            "Text Recognition (OCR)",
            "Language Understanding and Generation"
        ],
        "input_format": "Photographs",
        "app": "Knowledge"
    },
    "text_entity_replace": {
        "max_num_query_image": 1,
        "output_format": "multiple_choice",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Knowledge;Fact_Checking",
        "skills": [
            "Object Recognition and Classification",
            "Language Understanding and Generation",
            "Commonsense and Social Reasoning"
        ],
        "input_format": "Photographs",
        "app": "Knowledge"
    },
    "face_attribute_edit": {
        "max_num_query_image": 1,
        "output_format": "multiple_choice",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Knowledge;Fact_Checking",
        "skills": [
            "Object Recognition and Classification",
            "Scene and Event Understanding",
            "Commonsense and Social Reasoning"
        ],
        "input_format": "Photographs",
        "app": "Knowledge"
    },
    "app_interactive_operations_leetcode": {
        "max_num_query_image": 0,
        "output_format": "multiple_choice",
        "max_num_query_video": 0,
        "num_input": "9-image or more",
        "taxonomy_tree_path": "Planning;Agents_and_Planning;GUI_Operation",
        "skills": [
            "Object Recognition and Classification",
            "Planning and Decision Making"
        ],
        "input_format": "User Interface Screenshots",
        "app": "Planning"
    },
    "app_interactive_operations_instagram": {
        "max_num_query_image": 0,
        "output_format": "multiple_choice",
        "max_num_query_video": 0,
        "num_input": "9-image or more",
        "taxonomy_tree_path": "Planning;Agents_and_Planning;GUI_Operation",
        "skills": [
            "Object Recognition and Classification",
            "Scene and Event Understanding"
        ],
        "input_format": "User Interface Screenshots",
        "app": "Planning"
    },
    "app_interactive_operations_iphone_settings": {
        "max_num_query_image": 0,
        "output_format": "multiple_choice",
        "max_num_query_video": 0,
        "num_input": "9-image or more",
        "taxonomy_tree_path": "Planning;Agents_and_Planning;GUI_Operation",
        "skills": [
            "Object Recognition and Classification",
            "Planning and Decision Making"
        ],
        "input_format": "User Interface Screenshots",
        "app": "Planning"
    },
    "app_interactive_operations_ppt": {
        "max_num_query_image": 0,
        "output_format": "multiple_choice",
        "max_num_query_video": 0,
        "num_input": "9-image or more",
        "taxonomy_tree_path": "Planning;Agents_and_Planning;GUI_Operation",
        "skills": [
            "Object Recognition and Classification",
            "Language Understanding and Generation"
        ],
        "input_format": "User Interface Screenshots",
        "app": "Planning"
    },
    "app_interactive_operations_notes": {
        "max_num_query_image": 0,
        "output_format": "multiple_choice",
        "max_num_query_video": 0,
        "num_input": "9-image or more",
        "taxonomy_tree_path": "Planning;Agents_and_Planning;GUI_Operation",
        "skills": [
            "Object Recognition and Classification",
            "Planning and Decision Making"
        ],
        "input_format": "User Interface Screenshots",
        "app": "Planning"
    },
    "app_interactive_operations_amazon": {
        "max_num_query_image": 0,
        "output_format": "multiple_choice",
        "max_num_query_video": 0,
        "num_input": "9-image or more",
        "taxonomy_tree_path": "Planning;Agents_and_Planning;GUI_Operation",
        "skills": [
            "Object Recognition and Classification",
            "Text Recognition (OCR)"
        ],
        "input_format": "User Interface Screenshots",
        "app": "Planning"
    },
    "app_interactive_operations_excel": {
        "max_num_query_image": 0,
        "output_format": "multiple_choice",
        "max_num_query_video": 0,
        "num_input": "9-image or more",
        "taxonomy_tree_path": "Planning;Agents_and_Planning;GUI_Operation",
        "skills": [
            "Object Recognition and Classification",
            "Language Understanding and Generation"
        ],
        "input_format": "User Interface Screenshots",
        "app": "Planning"
    },
    "app_interactive_operations_youtube": {
        "max_num_query_image": 0,
        "output_format": "multiple_choice",
        "max_num_query_video": 0,
        "num_input": "9-image or more",
        "taxonomy_tree_path": "Planning;Agents_and_Planning;GUI_Operation",
        "skills": [
            "Object Recognition and Classification",
            "Language Understanding and Generation"
        ],
        "input_format": "User Interface Screenshots",
        "app": "Planning"
    },
    "app_interactive_operations_twitter": {
        "max_num_query_image": 0,
        "output_format": "multiple_choice",
        "max_num_query_video": 0,
        "num_input": "9-image or more",
        "taxonomy_tree_path": "Planning;Agents_and_Planning;GUI_Operation",
        "skills": [
            "Object Recognition and Classification",
            "Scene and Event Understanding"
        ],
        "input_format": "User Interface Screenshots",
        "app": "Planning"
    },
    "app_interactive_operations_alipay": {
        "max_num_query_image": 0,
        "output_format": "multiple_choice",
        "max_num_query_video": 0,
        "num_input": "9-image or more",
        "taxonomy_tree_path": "Planning;Agents_and_Planning;GUI_Operation",
        "skills": [
            "Object Recognition and Classification",
            "Planning and Decision Making"
        ],
        "input_format": "User Interface Screenshots",
        "app": "Planning"
    },
    "app_interactive_operations_zoom": {
        "max_num_query_image": 0,
        "output_format": "multiple_choice",
        "max_num_query_video": 0,
        "num_input": "9-image or more",
        "taxonomy_tree_path": "Planning;Agents_and_Planning;GUI_Operation",
        "skills": [
            "Object Recognition and Classification",
            "Planning and Decision Making"
        ],
        "input_format": "User Interface Screenshots",
        "app": "Planning"
    },
    "app_interactive_operations_word": {
        "max_num_query_image": 0,
        "output_format": "multiple_choice",
        "max_num_query_video": 0,
        "num_input": "9-image or more",
        "taxonomy_tree_path": "Planning;Agents_and_Planning;GUI_Operation",
        "skills": [
            "Object Recognition and Classification",
            "Planning and Decision Making"
        ],
        "input_format": "User Interface Screenshots",
        "app": "Planning"
    },
    "app_interactive_operations_tiktok": {
        "max_num_query_image": 0,
        "output_format": "multiple_choice",
        "max_num_query_video": 0,
        "num_input": "9-image or more",
        "taxonomy_tree_path": "Planning;Agents_and_Planning;GUI_Operation",
        "skills": [
            "Object Recognition and Classification",
            "Text Recognition (OCR)"
        ],
        "input_format": "User Interface Screenshots",
        "app": "Planning"
    },
    "crossword_mini_5x5": {
        "max_num_query_image": 1,
        "output_format": "structured_output",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Planning;Puzzles_and_Games",
        "skills": [
            "Language Understanding and Generation",
            "Mathematical and Logical Reasoning"
        ],
        "input_format": "Text-Based Images and Documents",
        "app": "Planning"
    },
    "ball_cup_swap_3": {
        "max_num_query_image": 11,
        "output_format": "multiple_choice",
        "max_num_query_video": 0,
        "num_input": "9-image or more",
        "taxonomy_tree_path": "Planning;Puzzles_and_Games",
        "skills": [
            "Object Recognition and Classification",
            "Spatial and Temporal Reasoning"
        ],
        "input_format": "Artistic and Creative Content",
        "app": "Planning"
    },
    "autorater_3d_model_texturing": {
        "max_num_query_image": 2,
        "output_format": "contextual_formatted_text",
        "max_num_query_video": 0,
        "num_input": "2-3 images",
        "taxonomy_tree_path": "Metrics;Generated_Image_Eval",
        "skills": [
            "Object Recognition and Classification",
            "Scene and Event Understanding"
        ],
        "input_format": "3D Models and Aerial Imagery",
        "app": "Metrics"
    },
    "autorater_aesthetics": {
        "max_num_query_image": 2,
        "output_format": "exact_text",
        "max_num_query_video": 0,
        "num_input": "2-3 images",
        "taxonomy_tree_path": "Metrics;Generated_Image_Eval",
        "skills": [
            "Object Recognition and Classification",
            "Scene and Event Understanding"
        ],
        "input_format": "Photographs",
        "app": "Metrics"
    },
    "autorater_artifact_reason": {
        "max_num_query_image": 1,
        "output_format": "open_ended_output",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Metrics;Generated_Image_Eval",
        "skills": [
            "Object Recognition and Classification",
            "Scene and Event Understanding"
        ],
        "input_format": "Photographs",
        "app": "Metrics"
    },
    "autorater_subject": {
        "max_num_query_image": 2,
        "output_format": "exact_text",
        "max_num_query_video": 0,
        "num_input": "2-3 images",
        "taxonomy_tree_path": "Metrics;Generated_Image_Eval",
        "skills": [
            "Object Recognition and Classification",
            "Scene and Event Understanding",
            "Commonsense and Social Reasoning"
        ],
        "input_format": "Photographs",
        "app": "Metrics"
    },
    "autorater_motion_guided_editing": {
        "max_num_query_image": 4,
        "output_format": "multiple_choice",
        "max_num_query_video": 0,
        "num_input": "4-5 images",
        "taxonomy_tree_path": "Metrics;Generated_Image_Eval",
        "skills": [
            "Object Recognition and Classification",
            "Scene and Event Understanding",
            "Spatial and Temporal Reasoning"
        ],
        "input_format": "Photographs",
        "app": "Metrics"
    },
    "autorater_unmask": {
        "max_num_query_image": 2,
        "output_format": "exact_text",
        "max_num_query_video": 0,
        "num_input": "2-3 images",
        "taxonomy_tree_path": "Metrics;Generated_Image_Eval",
        "skills": [
            "Object Recognition and Classification",
            "Scene and Event Understanding"
        ],
        "input_format": "Artistic and Creative Content",
        "app": "Metrics"
    },
    "autorater_semantics": {
        "max_num_query_image": 2,
        "output_format": "exact_text",
        "max_num_query_video": 0,
        "num_input": "2-3 images",
        "taxonomy_tree_path": "Metrics;Generated_Image_Eval",
        "skills": [
            "Object Recognition and Classification",
            "Language Understanding and Generation"
        ],
        "input_format": "Photographs",
        "app": "Metrics"
    },
    "autorater_mask": {
        "max_num_query_image": 3,
        "output_format": "exact_text",
        "max_num_query_video": 0,
        "num_input": "2-3 images",
        "taxonomy_tree_path": "Metrics;Generated_Image_Eval",
        "skills": [
            "Object Recognition and Classification",
            "Scene and Event Understanding"
        ],
        "input_format": "Photographs",
        "app": "Metrics"
    },
    "autorater_control": {
        "max_num_query_image": 2,
        "output_format": "exact_text",
        "max_num_query_video": 0,
        "num_input": "2-3 images",
        "taxonomy_tree_path": "Metrics;Generated_Image_Eval",
        "skills": [
            "Object Recognition and Classification",
            "Scene and Event Understanding",
            "Spatial and Temporal Reasoning"
        ],
        "input_format": "Photographs",
        "app": "Metrics"
    },
    "autorater_artifact": {
        "max_num_query_image": 1,
        "output_format": "multiple_choice",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Metrics;Generated_Image_Eval",
        "skills": [
            "Object Recognition and Classification",
            "Scene and Event Understanding"
        ],
        "input_format": "Photographs",
        "app": "Metrics"
    },
    "counterfactual_arithmetic": {
        "max_num_query_image": 1,
        "output_format": "numerical_data",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Mathematics;Number_Theory",
        "skills": [
            "Mathematical and Logical Reasoning"
        ],
        "input_format": "Text-Based Images and Documents",
        "app": "Mathematics"
    },
    "poetry_acrostic_alliteration": {
        "max_num_query_image": 1,
        "output_format": "open_ended_output",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Knowledge;Arts;poetry_generation",
        "skills": [
            "Language Understanding and Generation",
            "Object Recognition and Classification"
        ],
        "input_format": "Photographs",
        "app": "Knowledge"
    },
    "poetry_shakespearean_sonnet": {
        "max_num_query_image": 1,
        "output_format": "open_ended_output",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Knowledge;Arts;poetry_generation",
        "skills": [
            "Language Understanding and Generation",
            "Object Recognition and Classification"
        ],
        "input_format": "Photographs",
        "app": "Knowledge"
    },
    "poetry_haiku": {
        "max_num_query_image": 1,
        "output_format": "open_ended_output",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Knowledge;Arts;poetry_generation",
        "skills": [
            "Object Recognition and Classification",
            "Language Understanding and Generation"
        ],
        "input_format": "Photographs",
        "app": "Knowledge"
    },
    "poetry_petrarchian_sonnet_optional_meter": {
        "max_num_query_image": 1,
        "output_format": "open_ended_output",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Knowledge;Arts;poetry_generation",
        "skills": [
            "Language Understanding and Generation",
            "Object Recognition and Classification"
        ],
        "input_format": "Photographs",
        "app": "Knowledge"
    },
    "poetry_acrostic": {
        "max_num_query_image": 1,
        "output_format": "open_ended_output",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Knowledge;Arts;poetry_generation",
        "skills": [
            "Object Recognition and Classification",
            "Language Understanding and Generation"
        ],
        "input_format": "Photographs",
        "app": "Knowledge"
    },
    "poetry_limerick": {
        "max_num_query_image": 1,
        "output_format": "open_ended_output",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Knowledge;Arts;poetry_generation",
        "skills": [
            "Object Recognition and Classification",
            "Language Understanding and Generation"
        ],
        "input_format": "Photographs",
        "app": "Knowledge"
    },
    "poetry_custom_rhyming_scheme": {
        "max_num_query_image": 1,
        "output_format": "open_ended_output",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Knowledge;Arts;poetry_generation",
        "skills": [
            "Object Recognition and Classification",
            "Language Understanding and Generation"
        ],
        "input_format": "Photographs",
        "app": "Knowledge"
    },
    "shape_composition_shapes": {
        "max_num_query_image": 1,
        "output_format": "structured_output",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Perception;Counting",
        "skills": [
            "Object Recognition and Classification",
            "Spatial and Temporal Reasoning"
        ],
        "input_format": "Artistic and Creative Content",
        "app": "Perception"
    },
    "shape_composition_colours": {
        "max_num_query_image": 1,
        "output_format": "structured_output",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Perception;Counting",
        "skills": [
            "Object Recognition and Classification"
        ],
        "input_format": "Artistic and Creative Content",
        "app": "Perception"
    },
    "ocr_article_authors": {
        "max_num_query_image": 1,
        "output_format": "structured_output",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Information_Extraction;Structured_Parsing",
        "skills": [
            "Text Recognition (OCR)"
        ],
        "input_format": "Text-Based Images and Documents",
        "app": "Information_Extraction"
    },
    "ocr_table_to_html": {
        "max_num_query_image": 1,
        "output_format": "structured_output",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Coding;Code_Generation;Document_Conversion",
        "skills": [
            "Text Recognition (OCR)",
            "Language Understanding and Generation"
        ],
        "input_format": "Text-Based Images and Documents",
        "app": "Coding"
    },
    "ocr_article_journal": {
        "max_num_query_image": 1,
        "output_format": "contextual_formatted_text",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Perception;Diagram_and_Document_Understanding;Document;Document_Info_Parsing",
        "skills": [
            "Text Recognition (OCR)"
        ],
        "input_format": "Text-Based Images and Documents",
        "app": "Perception"
    },
    "ocr_resume_skill_plain": {
        "max_num_query_image": 1,
        "output_format": "contextual_formatted_text",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Perception;Diagram_and_Document_Understanding;Document;Document_Info_Parsing",
        "skills": [
            "Text Recognition (OCR)",
            "Language Understanding and Generation"
        ],
        "input_format": "Text-Based Images and Documents",
        "app": "Perception"
    },
    "ocr_math_equation": {
        "max_num_query_image": 1,
        "output_format": "contextual_formatted_text",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Coding;Code_Generation;Document_Conversion;Image_to_Latex",
        "skills": [
            "Text Recognition (OCR)",
            "Mathematical and Logical Reasoning"
        ],
        "input_format": "Text-Based Images and Documents",
        "app": "Coding"
    },
    "ocr_table_to_latex": {
        "max_num_query_image": 1,
        "output_format": "structured_output",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Coding;Code_Generation;Document_Conversion",
        "skills": [
            "Text Recognition (OCR)",
            "Language Understanding and Generation"
        ],
        "input_format": "Diagrams and Data Visualizations",
        "app": "Coding"
    },
    "ocr_resume_experience_plain": {
        "max_num_query_image": 1,
        "output_format": "contextual_formatted_text",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Perception;Diagram_and_Document_Understanding;Document;Document_Info_Parsing",
        "skills": [
            "Text Recognition (OCR)",
            "Language Understanding and Generation"
        ],
        "input_format": "Text-Based Images and Documents",
        "app": "Perception"
    },
    "ocr_resume_employer_plain": {
        "max_num_query_image": 1,
        "output_format": "contextual_formatted_text",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Perception;Diagram_and_Document_Understanding;Document;Document_Info_Parsing",
        "skills": [
            "Text Recognition (OCR)"
        ],
        "input_format": "Text-Based Images and Documents",
        "app": "Perception"
    },
    "ocr_math_text_latex": {
        "max_num_query_image": 1,
        "output_format": "contextual_formatted_text",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Coding;Code_Generation;Document_Conversion;Image_to_Latex",
        "skills": [
            "Text Recognition (OCR)",
            "Mathematical and Logical Reasoning",
            "Language Understanding and Generation"
        ],
        "input_format": "Text-Based Images and Documents",
        "app": "Coding"
    },
    "ocr_table_to_markdown": {
        "max_num_query_image": 1,
        "output_format": "structured_output",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Coding;Code_Generation;Document_Conversion",
        "skills": [
            "Text Recognition (OCR)",
            "Language Understanding and Generation"
        ],
        "input_format": "Diagrams and Data Visualizations",
        "app": "Coding"
    },
    "ocr_resume_school_plain": {
        "max_num_query_image": 1,
        "output_format": "contextual_formatted_text",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Information_Extraction;Structured_Parsing",
        "skills": [
            "Text Recognition (OCR)"
        ],
        "input_format": "Text-Based Images and Documents",
        "app": "Information_Extraction"
    },
    "ocr_table_to_csv": {
        "max_num_query_image": 1,
        "output_format": "structured_output",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Coding;Code_Generation;Document_Conversion",
        "skills": [
            "Text Recognition (OCR)",
            "Language Understanding and Generation"
        ],
        "input_format": "Text-Based Images and Documents",
        "app": "Coding"
    },
    "memorization_indian_celebrity": {
        "max_num_query_image": 1,
        "output_format": "structured_output",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Knowledge;World_Knowledge;Memorization",
        "skills": [
            "Object Recognition and Classification",
            "Domain-Specific Knowledge and Skills"
        ],
        "input_format": "Photographs",
        "app": "Knowledge"
    },
    "memorization_chinese_celebrity": {
        "max_num_query_image": 1,
        "output_format": "structured_output",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Knowledge;World_Knowledge;Memorization",
        "skills": [
            "Object Recognition and Classification",
            "Language Understanding and Generation"
        ],
        "input_format": "Photographs",
        "app": "Knowledge"
    },
    "memorization_famous_treaty": {
        "max_num_query_image": 1,
        "output_format": "structured_output",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Knowledge;World_Knowledge;Memorization",
        "skills": [
            "Object Recognition and Classification",
            "Domain-Specific Knowledge and Skills"
        ],
        "input_format": "Artistic and Creative Content",
        "app": "Knowledge"
    },
    "memorization_papers": {
        "max_num_query_image": 1,
        "output_format": "structured_output",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Knowledge;World_Knowledge;Memorization",
        "skills": [
            "Text Recognition (OCR)",
            "Domain-Specific Knowledge and Skills"
        ],
        "input_format": "Diagrams and Data Visualizations",
        "app": "Knowledge"
    },
    "screenshot_lighteval_math": {
        "max_num_query_image": 1,
        "output_format": "numerical_data",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Mathematics;General",
        "skills": [
            "Text Recognition (OCR)",
            "Mathematical and Logical Reasoning"
        ],
        "input_format": "Text-Based Images and Documents",
        "app": "Mathematics"
    },
    "screenshot_theoremqa": {
        "max_num_query_image": 1,
        "output_format": "contextual_formatted_text",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Mathematics;General",
        "skills": [
            "Text Recognition (OCR)",
            "Mathematical and Logical Reasoning"
        ],
        "input_format": "Text-Based Images and Documents",
        "app": "Mathematics"
    },
    "MMSoc_HatefulMemes": {
        "max_num_query_image": 1,
        "output_format": "multiple_choice",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Knowledge;Human_and_Culture;safety_and_norm",
        "skills": [
            "Ethical and Safety Reasoning",
            "Commonsense and Social Reasoning",
            "Text Recognition (OCR)"
        ],
        "input_format": "Photographs",
        "app": "Knowledge"
    },
    "MMSoc_Misinformation_PolitiFact": {
        "max_num_query_image": 1,
        "output_format": "multiple_choice",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Knowledge;Human_and_Culture;safety_and_norm",
        "skills": [
            "Commonsense and Social Reasoning",
            "Language Understanding and Generation",
            "Ethical and Safety Reasoning"
        ],
        "input_format": "Photographs",
        "app": "Knowledge"
    },
    "MMSoc_Misinformation_GossipCop": {
        "max_num_query_image": 1,
        "output_format": "multiple_choice",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Knowledge;Human_and_Culture;safety_and_norm",
        "skills": [
            "Commonsense and Social Reasoning",
            "Language Understanding and Generation",
            "Object Recognition and Classification",
            "Scene and Event Understanding",
            "Ethical and Safety Reasoning"
        ],
        "input_format": "Photographs",
        "app": "Knowledge"
    },
    "MMSoc_Memotion": {
        "max_num_query_image": 1,
        "output_format": "structured_output",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Knowledge;Human_and_Culture;safety_and_norm",
        "skills": [
            "Commonsense and Social Reasoning",
            "Language Understanding and Generation"
        ],
        "input_format": "Artistic and Creative Content",
        "app": "Knowledge"
    },
    "app_layout_understanding_leetcode": {
        "max_num_query_image": 1,
        "output_format": "exact_text",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Information_Extraction;App_Function_Understanding",
        "skills": [
            "Object Recognition and Classification",
            "Text Recognition (OCR)"
        ],
        "input_format": "User Interface Screenshots",
        "app": "Information_Extraction"
    },
    "app_layout_understanding_instagram": {
        "max_num_query_image": 1,
        "output_format": "exact_text",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Information_Extraction;App_Function_Understanding",
        "skills": [
            "Object Recognition and Classification",
            "Scene and Event Understanding"
        ],
        "input_format": "User Interface Screenshots",
        "app": "Information_Extraction"
    },
    "app_layout_understanding_iphone_settings": {
        "max_num_query_image": 1,
        "output_format": "exact_text",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Information_Extraction;App_Function_Understanding",
        "skills": [
            "Object Recognition and Classification",
            "Scene and Event Understanding"
        ],
        "input_format": "User Interface Screenshots",
        "app": "Information_Extraction"
    },
    "app_layout_understanding_ppt": {
        "max_num_query_image": 1,
        "output_format": "exact_text",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Information_Extraction;App_Function_Understanding",
        "skills": [
            "Object Recognition and Classification",
            "Text Recognition (OCR)",
            "Language Understanding and Generation"
        ],
        "input_format": "User Interface Screenshots",
        "app": "Information_Extraction"
    },
    "app_layout_understanding_notes": {
        "max_num_query_image": 1,
        "output_format": "exact_text",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Information_Extraction;App_Function_Understanding",
        "skills": [
            "Object Recognition and Classification",
            "Language Understanding and Generation",
            "Text Recognition (OCR)"
        ],
        "input_format": "User Interface Screenshots",
        "app": "Information_Extraction"
    },
    "app_layout_understanding_amazon": {
        "max_num_query_image": 1,
        "output_format": "exact_text",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Information_Extraction;App_Function_Understanding",
        "skills": [
            "Object Recognition and Classification",
            "Text Recognition (OCR)",
            "Scene and Event Understanding"
        ],
        "input_format": "User Interface Screenshots",
        "app": "Information_Extraction"
    },
    "app_layout_understanding_excel": {
        "max_num_query_image": 1,
        "output_format": "exact_text",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Information_Extraction;App_Function_Understanding",
        "skills": [
            "Object Recognition and Classification",
            "Text Recognition (OCR)"
        ],
        "input_format": "User Interface Screenshots",
        "app": "Information_Extraction"
    },
    "app_layout_understanding_youtube": {
        "max_num_query_image": 1,
        "output_format": "exact_text",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Information_Extraction;App_Function_Understanding",
        "skills": [
            "Object Recognition and Classification",
            "Scene and Event Understanding",
            "Text Recognition (OCR)"
        ],
        "input_format": "User Interface Screenshots",
        "app": "Information_Extraction"
    },
    "app_layout_understanding_twitter": {
        "max_num_query_image": 1,
        "output_format": "exact_text",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Information_Extraction;App_Function_Understanding",
        "skills": [
            "Object Recognition and Classification",
            "Scene and Event Understanding",
            "Language Understanding and Generation"
        ],
        "input_format": "User Interface Screenshots",
        "app": "Information_Extraction"
    },
    "app_layout_understanding_alipay": {
        "max_num_query_image": 1,
        "output_format": "exact_text",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Information_Extraction;App_Function_Understanding",
        "skills": [
            "Object Recognition and Classification",
            "Text Recognition (OCR)",
            "Language Understanding and Generation"
        ],
        "input_format": "User Interface Screenshots",
        "app": "Information_Extraction"
    },
    "app_layout_understanding_zoom": {
        "max_num_query_image": 1,
        "output_format": "exact_text",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Information_Extraction;App_Function_Understanding",
        "skills": [
            "Object Recognition and Classification",
            "Language Understanding and Generation"
        ],
        "input_format": "User Interface Screenshots",
        "app": "Information_Extraction"
    },
    "app_layout_understanding_word": {
        "max_num_query_image": 1,
        "output_format": "exact_text",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Information_Extraction;App_Function_Understanding",
        "skills": [
            "Object Recognition and Classification",
            "Text Recognition (OCR)",
            "Language Understanding and Generation"
        ],
        "input_format": "User Interface Screenshots",
        "app": "Information_Extraction"
    },
    "app_layout_understanding_tiktok": {
        "max_num_query_image": 1,
        "output_format": "exact_text",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Information_Extraction;App_Function_Understanding",
        "skills": [
            "Object Recognition and Classification",
            "Text Recognition (OCR)"
        ],
        "input_format": "User Interface Screenshots",
        "app": "Information_Extraction"
    },
    "maze_2d_8x8": {
        "max_num_query_image": 1,
        "output_format": "exact_text",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Planning;Puzzles_and_Games",
        "skills": [
            "Spatial and Temporal Reasoning",
            "Planning and Decision Making"
        ],
        "input_format": "Diagrams and Data Visualizations",
        "app": "Planning"
    },
    "video_motion_matching_3D_real": {
        "max_num_query_image": 0,
        "output_format": "multiple_choice",
        "max_num_query_video": 5,
        "num_input": "video",
        "taxonomy_tree_path": "Perception;3D_understanding",
        "skills": [
            "Scene and Event Understanding",
            "Spatial and Temporal Reasoning"
        ],
        "input_format": "Videos",
        "app": "Perception"
    },
    "video_motion_matching_real_3D": {
        "max_num_query_image": 0,
        "output_format": "multiple_choice",
        "max_num_query_video": 5,
        "num_input": "video",
        "taxonomy_tree_path": "Perception;3D_understanding",
        "skills": [
            "Scene and Event Understanding",
            "Spatial and Temporal Reasoning"
        ],
        "input_format": "Videos",
        "app": "Perception"
    },
    "table_understanding_fetaqa": {
        "max_num_query_image": 1,
        "output_format": "open_ended_output",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Perception;Diagram_and_Document_Understanding;Table_QA",
        "skills": [
            "Text Recognition (OCR)",
            "Language Understanding and Generation"
        ],
        "input_format": "Diagrams and Data Visualizations",
        "app": "Perception"
    },
    "red_teaming_jailbreak": {
        "max_num_query_image": 1,
        "output_format": "open_ended_output",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Knowledge;Human_and_Culture;safety_and_norm",
        "skills": [
            "Text Recognition (OCR)",
            "Ethical and Safety Reasoning"
        ],
        "input_format": "Text-Based Images and Documents",
        "app": "Knowledge"
    },
    "red_teaming_celebrity": {
        "max_num_query_image": 1,
        "output_format": "open_ended_output",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Knowledge;Human_and_Culture;safety_and_norm",
        "skills": [
            "Commonsense and Social Reasoning",
            "Language Understanding and Generation"
        ],
        "input_format": "Photographs",
        "app": "Knowledge"
    },
    "red_teaming_captcha": {
        "max_num_query_image": 1,
        "output_format": "open_ended_output",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Knowledge;Human_and_Culture;safety_and_norm",
        "skills": [
            "Text Recognition (OCR)"
        ],
        "input_format": "Text-Based Images and Documents",
        "app": "Knowledge"
    },
    "red_teaming_visual_order_B": {
        "max_num_query_image": 2,
        "output_format": "open_ended_output",
        "max_num_query_video": 0,
        "num_input": "2-3 images",
        "taxonomy_tree_path": "Knowledge;Human_and_Culture;safety_and_norm",
        "skills": [
            "Object Recognition and Classification",
            "Domain-Specific Knowledge and Skills"
        ],
        "input_format": "Photographs",
        "app": "Knowledge"
    },
    "red_teaming_politics": {
        "max_num_query_image": 1,
        "output_format": "open_ended_output",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Knowledge;Human_and_Culture;safety_and_norm",
        "skills": [
            "Scene and Event Understanding",
            "Commonsense and Social Reasoning",
            "Ethical and Safety Reasoning"
        ],
        "input_format": "Photographs",
        "app": "Knowledge"
    },
    "red_teaming_visual_order_A": {
        "max_num_query_image": 2,
        "output_format": "open_ended_output",
        "max_num_query_video": 0,
        "num_input": "2-3 images",
        "taxonomy_tree_path": "Knowledge;Human_and_Culture;safety_and_norm",
        "skills": [
            "Object Recognition and Classification",
            "Domain-Specific Knowledge and Skills"
        ],
        "input_format": "Photographs",
        "app": "Knowledge"
    },
    "red_teaming_racial": {
        "max_num_query_image": 1,
        "output_format": "open_ended_output",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Knowledge;Human_and_Culture;safety_and_norm",
        "skills": [
            "Ethical and Safety Reasoning",
            "Scene and Event Understanding",
            "Object Recognition and Classification"
        ],
        "input_format": "Photographs",
        "app": "Knowledge"
    },
    "red_teaming_visualmisleading": {
        "max_num_query_image": 1,
        "output_format": "open_ended_output",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Knowledge;Human_and_Culture;safety_and_norm",
        "skills": [
            "Ethical and Safety Reasoning",
            "Commonsense and Social Reasoning"
        ],
        "input_format": "Artistic and Creative Content",
        "app": "Knowledge"
    },
    "bridge_strategies_worldclass": {
        "max_num_query_image": 1,
        "output_format": "open_ended_output",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Planning;Puzzles_and_Games;Board_Games",
        "skills": [
            "Planning and Decision Making",
            "Mathematical and Logical Reasoning",
            "Domain-Specific Knowledge and Skills"
        ],
        "input_format": "Diagrams and Data Visualizations",
        "app": "Planning"
    },
    "bridge_strategies_advanced": {
        "max_num_query_image": 1,
        "output_format": "open_ended_output",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Planning;Puzzles_and_Games;Board_Games",
        "skills": [
            "Object Recognition and Classification",
            "Spatial and Temporal Reasoning",
            "Planning and Decision Making"
        ],
        "input_format": "User Interface Screenshots",
        "app": "Planning"
    },
    "bridge_strategies_expert": {
        "max_num_query_image": 1,
        "output_format": "open_ended_output",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Planning;Puzzles_and_Games;Board_Games",
        "skills": [
            "Object Recognition and Classification",
            "Spatial and Temporal Reasoning",
            "Planning and Decision Making"
        ],
        "input_format": "User Interface Screenshots",
        "app": "Planning"
    },
    "multi_lingual_manual_explanation_scooter_Spanish": {
        "max_num_query_image": 0,
        "output_format": "open_ended_output",
        "max_num_query_video": 0,
        "num_input": "6-8 images",
        "taxonomy_tree_path": "Information_Extraction;Detailed_Manual_Understanding",
        "skills": [
            "Object Recognition and Classification",
            "Text Recognition (OCR)",
            "Language Understanding and Generation",
            "Scene and Event Understanding",
            "Commonsense and Social Reasoning"
        ],
        "input_format": "Text-Based Images and Documents",
        "app": "Information_Extraction"
    },
    "multi_lingual_manual_explanation_scooter_Russian": {
        "max_num_query_image": 0,
        "output_format": "open_ended_output",
        "max_num_query_video": 0,
        "num_input": "6-8 images",
        "taxonomy_tree_path": "Information_Extraction;Detailed_Manual_Understanding",
        "skills": [
            "Object Recognition and Classification",
            "Language Understanding and Generation",
            "Ethical and Safety Reasoning"
        ],
        "input_format": "Diagrams and Data Visualizations",
        "app": "Information_Extraction"
    },
    "multi_lingual_manual_explanation_scooter_Arabic": {
        "max_num_query_image": 0,
        "output_format": "open_ended_output",
        "max_num_query_video": 0,
        "num_input": "6-8 images",
        "taxonomy_tree_path": "Information_Extraction;Detailed_Manual_Understanding",
        "skills": [
            "Object Recognition and Classification",
            "Language Understanding and Generation",
            "Ethical and Safety Reasoning"
        ],
        "input_format": "Text-Based Images and Documents",
        "app": "Information_Extraction"
    },
    "multi_lingual_manual_explanation_scooter_Chinese": {
        "max_num_query_image": 0,
        "output_format": "open_ended_output",
        "max_num_query_video": 0,
        "num_input": "6-8 images",
        "taxonomy_tree_path": "Information_Extraction;Detailed_Manual_Understanding",
        "skills": [
            "Language Understanding and Generation",
            "Commonsense and Social Reasoning",
            "Ethical and Safety Reasoning",
            "Domain-Specific Knowledge and Skills"
        ],
        "input_format": "Text-Based Images and Documents",
        "app": "Information_Extraction"
    },
    "multi_lingual_manual_explanation_scooter_French": {
        "max_num_query_image": 0,
        "output_format": "open_ended_output",
        "max_num_query_video": 0,
        "num_input": "6-8 images",
        "taxonomy_tree_path": "Information_Extraction;Detailed_Manual_Understanding",
        "skills": [
            "Object Recognition and Classification",
            "Text Recognition (OCR)",
            "Language Understanding and Generation",
            "Ethical and Safety Reasoning"
        ],
        "input_format": "Text-Based Images and Documents",
        "app": "Information_Extraction"
    },
    "multi_lingual_Ruozhiba_expalnation_Spanish": {
        "max_num_query_image": 1,
        "output_format": "open_ended_output",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Knowledge;World_Knowledge;Ruozhiba",
        "skills": [
            "Language Understanding and Generation",
            "Commonsense and Social Reasoning"
        ],
        "input_format": "User Interface Screenshots",
        "app": "Knowledge"
    },
    "multi_lingual_Ruozhiba_expalnation_English": {
        "max_num_query_image": 1,
        "output_format": "open_ended_output",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Knowledge;World_Knowledge;Ruozhiba",
        "skills": [
            "Language Understanding and Generation",
            "Commonsense and Social Reasoning"
        ],
        "input_format": "User Interface Screenshots",
        "app": "Knowledge"
    },
    "multi_lingual_Ruozhiba_expalnation_Russian": {
        "max_num_query_image": 1,
        "output_format": "open_ended_output",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Knowledge;World_Knowledge;Ruozhiba",
        "skills": [
            "Commonsense and Social Reasoning",
            "Language Understanding and Generation"
        ],
        "input_format": "User Interface Screenshots",
        "app": "Knowledge"
    },
    "multi_lingual_Ruozhiba_expalnation_Arabic": {
        "max_num_query_image": 1,
        "output_format": "open_ended_output",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Knowledge;World_Knowledge;Ruozhiba",
        "skills": [
            "Language Understanding and Generation",
            "Commonsense and Social Reasoning"
        ],
        "input_format": "User Interface Screenshots",
        "app": "Knowledge"
    },
    "multi_lingual_Ruozhiba_expalnation_Japanese": {
        "max_num_query_image": 1,
        "output_format": "open_ended_output",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Knowledge;World_Knowledge;Ruozhiba",
        "skills": [
            "Language Understanding and Generation",
            "Commonsense and Social Reasoning"
        ],
        "input_format": "User Interface Screenshots",
        "app": "Knowledge"
    },
    "multi_lingual_Ruozhiba_expalnation_French": {
        "max_num_query_image": 1,
        "output_format": "open_ended_output",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Knowledge;World_Knowledge;Ruozhiba",
        "skills": [
            "Language Understanding and Generation",
            "Commonsense and Social Reasoning"
        ],
        "input_format": "User Interface Screenshots",
        "app": "Knowledge"
    },
    "visual_prediction_rater_depth_estimation": {
        "max_num_query_image": 4,
        "output_format": "multiple_choice",
        "max_num_query_video": 0,
        "num_input": "4-5 images",
        "taxonomy_tree_path": "Perception;3D_understanding",
        "skills": [
            "Object Recognition and Classification",
            "Scene and Event Understanding",
            "Spatial and Temporal Reasoning"
        ],
        "input_format": "Photographs",
        "app": "Perception"
    },
    "visual_prediction_rater_plane_segmentation": {
        "max_num_query_image": 4,
        "output_format": "multiple_choice",
        "max_num_query_video": 0,
        "num_input": "4-5 images",
        "taxonomy_tree_path": "Perception;3D_understanding",
        "skills": [
            "Object Recognition and Classification",
            "Scene and Event Understanding",
            "Spatial and Temporal Reasoning"
        ],
        "input_format": "Photographs",
        "app": "Perception"
    },
    "visual_prediction_rater_openable_part_segmentation": {
        "max_num_query_image": 4,
        "output_format": "multiple_choice",
        "max_num_query_video": 0,
        "num_input": "4-5 images",
        "taxonomy_tree_path": "Perception;Image_Segmentation",
        "skills": [
            "Object Recognition and Classification",
            "Scene and Event Understanding",
            "Spatial and Temporal Reasoning"
        ],
        "input_format": "Photographs",
        "app": "Perception"
    },
    "visual_prediction_rater_panoptic_segmentation": {
        "max_num_query_image": 4,
        "output_format": "multiple_choice",
        "max_num_query_video": 0,
        "num_input": "4-5 images",
        "taxonomy_tree_path": "Perception;Image_Segmentation",
        "skills": [
            "Object Recognition and Classification",
            "Scene and Event Understanding",
            "Spatial and Temporal Reasoning"
        ],
        "input_format": "Photographs",
        "app": "Perception"
    },
    "visual_prediction_rater_surface_normal_estimation": {
        "max_num_query_image": 4,
        "output_format": "multiple_choice",
        "max_num_query_video": 0,
        "num_input": "4-5 images",
        "taxonomy_tree_path": "Perception;3D_understanding",
        "skills": [
            "Object Recognition and Classification",
            "Spatial and Temporal Reasoning",
            "Scene and Event Understanding"
        ],
        "input_format": "Photographs",
        "app": "Perception"
    },
    "visual_prediction_rater_3d_assembled_quality_understanding": {
        "max_num_query_image": 3,
        "output_format": "multiple_choice",
        "max_num_query_video": 0,
        "num_input": "2-3 images",
        "taxonomy_tree_path": "Perception;3D_understanding",
        "skills": [
            "Object Recognition and Classification",
            "Spatial and Temporal Reasoning"
        ],
        "input_format": "3D Models and Aerial Imagery",
        "app": "Perception"
    },
    "visual_prediction_rater_novel_view_synthesis": {
        "max_num_query_image": 4,
        "output_format": "multiple_choice",
        "max_num_query_video": 0,
        "num_input": "4-5 images",
        "taxonomy_tree_path": "Perception;3D_understanding",
        "skills": [
            "Object Recognition and Classification",
            "Scene and Event Understanding",
            "Spatial and Temporal Reasoning"
        ],
        "input_format": "3D Models and Aerial Imagery",
        "app": "Perception"
    },
    "visual_prediction_rater_semantic_segmentation": {
        "max_num_query_image": 4,
        "output_format": "multiple_choice",
        "max_num_query_video": 0,
        "num_input": "4-5 images",
        "taxonomy_tree_path": "Perception;Image_Segmentation",
        "skills": [
            "Object Recognition and Classification",
            "Scene and Event Understanding"
        ],
        "input_format": "Photographs",
        "app": "Perception"
    },
    "adapted_cvbench_relation": {
        "max_num_query_image": 1,
        "output_format": "exact_text",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Perception;Spatial_Understanding",
        "skills": [
            "Object Recognition and Classification",
            "Spatial and Temporal Reasoning"
        ],
        "input_format": "Photographs",
        "app": "Perception"
    },
    "adapted_cvbench_distance": {
        "max_num_query_image": 1,
        "output_format": "exact_text",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Perception;Spatial_Understanding",
        "skills": [
            "Object Recognition and Classification",
            "Spatial and Temporal Reasoning"
        ],
        "input_format": "Photographs",
        "app": "Perception"
    },
    "adapted_cvbench_depth": {
        "max_num_query_image": 1,
        "output_format": "exact_text",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Perception;3D_understanding",
        "skills": [
            "Object Recognition and Classification",
            "Spatial and Temporal Reasoning"
        ],
        "input_format": "Photographs",
        "app": "Perception"
    },
    "adapted_cvbench_count": {
        "max_num_query_image": 1,
        "output_format": "numerical_data",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Perception;Counting",
        "skills": [
            "Object Recognition and Classification",
            "Scene and Event Understanding"
        ],
        "input_format": "Photographs",
        "app": "Perception"
    },
    "symbolic_graphics_programs_scalable_vector_graphics": {
        "max_num_query_image": 1,
        "output_format": "contextual_formatted_text",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Coding;Code_Understanding;Symbolic_Graphics_Programming",
        "skills": [
            "Object Recognition and Classification",
            "Text Recognition (OCR)"
        ],
        "input_format": "Text-Based Images and Documents",
        "app": "Coding"
    },
    "symbolic_graphics_programs_computer_aided_design": {
        "max_num_query_image": 1,
        "output_format": "contextual_formatted_text",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Coding;Code_Understanding;Symbolic_Graphics_Programming",
        "skills": [
            "Mathematical and Logical Reasoning",
            "Domain-Specific Knowledge and Skills"
        ],
        "input_format": "Text-Based Images and Documents",
        "app": "Coding"
    },
    "multiple_states_identify_africa": {
        "max_num_query_image": 1,
        "output_format": "contextual_formatted_text",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Knowledge;World_Knowledge;Multiple_State_Identification",
        "skills": [
            "Object Recognition and Classification",
            "Domain-Specific Knowledge and Skills"
        ],
        "input_format": "Diagrams and Data Visualizations",
        "app": "Knowledge"
    },
    "multiple_states_identify_europe": {
        "max_num_query_image": 1,
        "output_format": "contextual_formatted_text",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Knowledge;World_Knowledge;Multiple_State_Identification",
        "skills": [
            "Object Recognition and Classification",
            "Domain-Specific Knowledge and Skills"
        ],
        "input_format": "Diagrams and Data Visualizations",
        "app": "Knowledge"
    },
    "multiple_states_identify_asia": {
        "max_num_query_image": 1,
        "output_format": "contextual_formatted_text",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Knowledge;World_Knowledge;Multiple_State_Identification",
        "skills": [
            "Object Recognition and Classification",
            "Domain-Specific Knowledge and Skills"
        ],
        "input_format": "Diagrams and Data Visualizations",
        "app": "Knowledge"
    },
    "multiple_states_identify_americas": {
        "max_num_query_image": 1,
        "output_format": "contextual_formatted_text",
        "max_num_query_video": 0,
        "num_input": "1-image",
        "taxonomy_tree_path": "Knowledge;World_Knowledge;Multiple_State_Identification",
        "skills": [
            "Object Recognition and Classification",
            "Domain-Specific Knowledge and Skills"
        ],
        "input_format": "Diagrams and Data Visualizations",
        "app": "Knowledge"
    },
    "geographic_remote_sensing_land_cover": {
        "max_num_query_image": 2,
        "output_format": "contextual_formatted_text",
        "max_num_query_video": 0,
        "num_input": "2-3 images",
        "taxonomy_tree_path": "Perception;Visual_Recognition",
        "skills": [
            "Object Recognition and Classification",
            "Scene and Event Understanding"
        ],
        "input_format": "3D Models and Aerial Imagery",
        "app": "Perception"
    }
}