{
    "model_id": {
        "value": "xlrs-bench-v1",
        "confidence": 0.7672
    },
    "model_name": {
        "value": "XLRS-Bench",
        "confidence": 1.0
    },
    "version": {
        "value": "1.0",
        "confidence": 0.8931
    },
    "release_date": {
        "value": null,
        "confidence": 0.0
    },
    "last_updated": {
        "value": null,
        "confidence": 0.0
    },
    "short_description": {
        "value": "XLRS-Bench is a comprehensive benchmark for evaluating the perception and reasoning capabilities of multimodal large language models (MLLMs) in ultra-high-resolution remote sensing (RS) scenarios. It features the largest average image size to date, high-quality human-verified annotations, and 16 sub-tasks across three ability levels, supporting both English and Chinese.",
        "confidence": 0.9235
    },
    "paper_link": {
        "value": "https://arxiv.org/abs/2503.23771",
        "confidence": 0.999
    },
    "citations": {
        "value": null,
        "confidence": 0.0
    },
    "repository": {
        "value": "https://xlrs-bench.github.io/",
        "confidence": 0.9994
    },
    "weights": {
        "value": null,
        "confidence": 0.0
    },
    "backbone": {
        "value": null,
        "confidence": 0.0
    },
    "num_layers": {
        "value": null,
        "confidence": 0.0
    },
    "num_parameters": {
        "value": null,
        "confidence": 0.0
    },
    "pretext_training_type": {
        "value": null,
        "confidence": 0.0
    },
    "masking_strategy": {
        "value": null,
        "confidence": 0.0
    },
    "pretraining": {
        "value": null,
        "confidence": 0.0
    },
    "domain_knowledge": {
        "value": [],
        "confidence": 0.0
    },
    "backbone_modifications": {
        "value": [],
        "confidence": 0.0
    },
    "supported_sensors": {
        "value": [],
        "confidence": 0.0
    },
    "modality_integration_type": {
        "value": "Homogeneous Multimodal",
        "confidence": 0.9176
    },
    "modalities": {
        "value": [
            {
                "value": "Multispectral",
                "confidence": 0.9961
            },
            {
                "value": "Text",
                "confidence": 0.7088
            }
        ]
    },
    "spectral_alignment": {
        "value": "none",
        "confidence": 0.9628
    },
    "temporal_alignment": {
        "value": "partial",
        "confidence": 0.8019
    },
    "spatial_resolution": {
        "value": "8500x8500 pixels (average), up to 10000x10000 pixels",
        "confidence": 0.8288
    },
    "temporal_resolution": {
        "value": null,
        "confidence": 0.0
    },
    "bands": {
        "value": [],
        "confidence": 0.0
    },
    "pretraining_phases": {
        "value": [],
        "confidence": 0.0
    },
    "benchmarks": [
        {
            "task": {
                "value": "Multimodal Perception and Reasoning (VQA, Captioning, Visual Grounding)",
                "confidence": 0.3217
            },
            "application": {
                "value": "Ultra-high-resolution remote sensing imagery understanding",
                "confidence": 0.335
            },
            "dataset": {
                "value": "XLRS-Bench",
                "confidence": 1.0
            },
            "metrics": {
                "value": [
                    {
                        "value": "Accuracy (VQA)",
                        "confidence": 0.5859
                    },
                    {
                        "value": "BLEU-1",
                        "confidence": 0.8399
                    },
                    {
                        "value": "BLEU-2",
                        "confidence": 1.0
                    },
                    {
                        "value": "BLEU-3",
                        "confidence": 1.0
                    },
                    {
                        "value": "BLEU-4",
                        "confidence": 1.0
                    },
                    {
                        "value": "METEOR",
                        "confidence": 0.9794
                    },
                    {
                        "value": "ROUGE-L",
                        "confidence": 0.9811
                    },
                    {
                        "value": "Acc@0.5 (Grounding)",
                        "confidence": 0.3947
                    },
                    {
                        "value": "Acc@0.7 (Grounding)",
                        "confidence": 0.9997
                    }
                ]
            },
            "metrics_value": {
                "value": [
                    {
                        "value": 41.1,
                        "confidence": 0.6837
                    },
                    {
                        "value": 41.12,
                        "confidence": 0.8912
                    },
                    {
                        "value": 20.42,
                        "confidence": 1.0
                    },
                    {
                        "value": 9.94,
                        "confidence": 1.0
                    },
                    {
                        "value": 4.56,
                        "confidence": 1.0
                    },
                    {
                        "value": 19.99,
                        "confidence": 0.998
                    },
                    {
                        "value": 21.03,
                        "confidence": 1.0
                    },
                    {
                        "value": 0.16,
                        "confidence": 0.9292
                    },
                    {
                        "value": 0.0,
                        "confidence": 0.8955
                    }
                ]
            },
            "sensor": {
                "value": [
                    {
                        "value": "Aerial",
                        "confidence": 0.629
                    },
                    {
                        "value": "Satellite",
                        "confidence": 0.9975
                    }
                ]
            },
            "regions": {
                "value": [
                    {
                        "value": "Global (multiple cities, diverse real-world RS scenarios)",
                        "confidence": 0.3001
                    }
                ]
            },
            "original_samples": {
                "value": 1400,
                "confidence": 1.0
            },
            "num_samples": {
                "value": 1400,
                "confidence": 0.8707
            },
            "sampling_percentage": {
                "value": 100,
                "confidence": 1.0
            },
            "num_classes": {
                "value": null,
                "confidence": 0.0
            },
            "classes": {
                "value": []
            },
            "image_resolution": {
                "value": "8500x8500 pixels (average), up to 10000x10000 pixels",
                "confidence": 0.4027
            },
            "spatial_resolution": {
                "value": "8500x8500 pixels (average), up to 10000x10000 pixels",
                "confidence": 0.4928
            },
            "bands_used": {
                "value": []
            },
            "augmentations": {
                "value": []
            },
            "optimizer": {
                "value": null,
                "confidence": 0.0
            },
            "batch_size": {
                "value": null,
                "confidence": 0.0
            },
            "learning_rate": {
                "value": null,
                "confidence": 0.0
            },
            "epochs": {
                "value": null,
                "confidence": 0.0
            },
            "loss_function": {
                "value": null,
                "confidence": 0.0
            },
            "split_ratio": {
                "value": null,
                "confidence": 0.0
            }
        }
    ]
}