{
    "model_id": {
        "value": "LRS-VQA",
        "confidence": 0.9816
    },
    "model_name": {
        "value": "LRS-VQA",
        "confidence": 0.9975
    },
    "version": {
        "value": null,
        "confidence": 0.0
    },
    "release_date": {
        "value": null,
        "confidence": 0.0
    },
    "last_updated": {
        "value": null,
        "confidence": 0.0
    },
    "short_description": {
        "value": "A text-guided token pruning method for efficient vision-language understanding of large remote sensing images, introducing a Region Focus Module (RFM) and Dynamic Image Pyramid (DIP), and a new benchmark LRS-VQA with 7,333 QA pairs across 8 categories and image lengths up to 27,328 pixels.",
        "confidence": 0.8191
    },
    "paper_link": {
        "value": "https://arxiv.org/abs/2503.07588",
        "confidence": 0.9999
    },
    "citations": {
        "value": null,
        "confidence": 0.0
    },
    "repository": {
        "value": "https://github.com/VisionXLab/LRS-VQA",
        "confidence": 1.0
    },
    "weights": {
        "value": null,
        "confidence": 0.0
    },
    "backbone": {
        "value": "CLIP-L14",
        "confidence": 0.9765
    },
    "num_layers": {
        "value": null,
        "confidence": 0.0
    },
    "num_parameters": {
        "value": null,
        "confidence": 0.0
    },
    "pretext_training_type": {
        "value": "Text-guided token pruning with attention distillation",
        "confidence": 0.8088
    },
    "masking_strategy": {
        "value": "Text-guided token pruning via Region Focus Module (RFM)",
        "confidence": 0.8023
    },
    "pretraining": {
        "value": "Pre-trained on 558K data from LLaVA-1.5; SFT with 484K samples including LLaVA-1.5-665K, RSVQA-HR, and template-based samples from FAIR1M-1.0, GLH-Bridge, and STAR datasets.",
        "confidence": 0.7787
    },
    "domain_knowledge": {
        "value": [
            {
                "value": "Text-guided region localization",
                "confidence": 0.5354
            },
            {
                "value": "Dynamic image pyramid for multi-scale inference",
                "confidence": 0.4846
            }
        ]
    },
    "backbone_modifications": {
        "value": [
            {
                "value": "Integration of Region Focus Module (RFM)",
                "confidence": 0.7669
            },
            {
                "value": "Dynamic Image Pyramid (DIP) for multi-scale inference",
                "confidence": 0.5815
            }
        ]
    },
    "supported_sensors": {
        "value": [
            {
                "value": null,
                "confidence": 0.1
            }
        ]
    },
    "modality_integration_type": {
        "value": "Unimodal",
        "confidence": 0.9947
    },
    "modalities": {
        "value": [
            {
                "value": "Multispectral",
                "confidence": 0.8932
            }
        ]
    },
    "spectral_alignment": {
        "value": "none",
        "confidence": 0.9461
    },
    "temporal_alignment": {
        "value": "none",
        "confidence": 1.0
    },
    "spatial_resolution": {
        "value": "variable",
        "confidence": 0.8108
    },
    "temporal_resolution": {
        "value": "variable",
        "confidence": 0.6934
    },
    "bands": {
        "value": [
            {
                "value": null,
                "confidence": 0.1
            }
        ]
    },
    "pretraining_phases": [
        {
            "dataset": {
                "value": "558K data used in LLaVA-1.5",
                "confidence": 0.4418
            },
            "regions_coverage": {
                "value": []
            },
            "time_range": {
                "value": null,
                "confidence": 0.0
            },
            "num_images": {
                "value": 558000,
                "confidence": 1.0
            },
            "token_size": {
                "value": null,
                "confidence": 0.0
            },
            "image_resolution": {
                "value": null,
                "confidence": 0.0
            },
            "epochs": {
                "value": null,
                "confidence": 0.0
            },
            "batch_size": {
                "value": null,
                "confidence": 0.0
            },
            "learning_rate": {
                "value": null,
                "confidence": 0.0
            },
            "augmentations": {
                "value": []
            },
            "processing": {
                "value": []
            },
            "sampling": {
                "value": null,
                "confidence": 0.0
            },
            "processing_level": {
                "value": null,
                "confidence": 0.0
            },
            "cloud_cover": {
                "value": null,
                "confidence": 0.0
            },
            "missing_data": {
                "value": null,
                "confidence": 0.0
            },
            "masking_ratio": {
                "value": null,
                "confidence": 0.0
            }
        },
        {
            "dataset": {
                "value": "484K SFT samples (300K from LLaVA-1.5-665K, 146K from RSVQA-HR, 38K template-based from FAIR1M-1.0, GLH-Bridge, STAR)",
                "confidence": 0.338
            },
            "regions_coverage": {
                "value": []
            },
            "time_range": {
                "value": null,
                "confidence": 0.0
            },
            "num_images": {
                "value": 484000,
                "confidence": 1.0
            },
            "token_size": {
                "value": null,
                "confidence": 0.0
            },
            "image_resolution": {
                "value": null,
                "confidence": 0.0
            },
            "epochs": {
                "value": null,
                "confidence": 0.0
            },
            "batch_size": {
                "value": null,
                "confidence": 0.0
            },
            "learning_rate": {
                "value": null,
                "confidence": 0.0
            },
            "augmentations": {
                "value": []
            },
            "processing": {
                "value": []
            },
            "sampling": {
                "value": null,
                "confidence": 0.0
            },
            "processing_level": {
                "value": null,
                "confidence": 0.0
            },
            "cloud_cover": {
                "value": null,
                "confidence": 0.0
            },
            "missing_data": {
                "value": null,
                "confidence": 0.0
            },
            "masking_ratio": {
                "value": null,
                "confidence": 0.0
            }
        }
    ],
    "benchmarks": [
        {
            "task": {
                "value": "Visual Question Answering",
                "confidence": 0.9228
            },
            "application": {
                "value": "Remote Sensing Image Understanding",
                "confidence": 0.3632
            },
            "dataset": {
                "value": "LRS-VQA",
                "confidence": 0.9579
            },
            "metrics": {
                "value": [
                    {
                        "value": "Accuracy",
                        "confidence": 0.9135
                    }
                ]
            },
            "metrics_value": {
                "value": [
                    {
                        "value": 41.89,
                        "confidence": 0.8103
                    }
                ]
            },
            "sensor": {
                "value": []
            },
            "regions": {
                "value": []
            },
            "original_samples": {
                "value": 7333,
                "confidence": 0.8722
            },
            "num_samples": {
                "value": 7333,
                "confidence": 1.0
            },
            "sampling_percentage": {
                "value": 100,
                "confidence": 1.0
            },
            "num_classes": {
                "value": 8,
                "confidence": 1.0
            },
            "classes": {
                "value": [
                    {
                        "value": "count",
                        "confidence": 0.9997
                    },
                    {
                        "value": "color",
                        "confidence": 1.0
                    },
                    {
                        "value": "category",
                        "confidence": 1.0
                    },
                    {
                        "value": "shape",
                        "confidence": 1.0
                    },
                    {
                        "value": "status",
                        "confidence": 1.0
                    },
                    {
                        "value": "reasoning",
                        "confidence": 1.0
                    },
                    {
                        "value": "rural/urban classification",
                        "confidence": 0.9577
                    },
                    {
                        "value": "target background",
                        "confidence": 0.9998
                    }
                ]
            },
            "image_resolution": {
                "value": "1024-27328 pixels",
                "confidence": 0.6883
            },
            "spatial_resolution": {
                "value": "variable",
                "confidence": 0.9971
            },
            "bands_used": {
                "value": []
            },
            "augmentations": {
                "value": []
            },
            "optimizer": {
                "value": null,
                "confidence": 0.0
            },
            "batch_size": {
                "value": null,
                "confidence": 0.0
            },
            "learning_rate": {
                "value": null,
                "confidence": 0.0
            },
            "epochs": {
                "value": null,
                "confidence": 0.0
            },
            "loss_function": {
                "value": null,
                "confidence": 0.0
            },
            "split_ratio": {
                "value": null,
                "confidence": 0.0
            }
        },
        {
            "task": {
                "value": "Visual Question Answering",
                "confidence": 0.9999
            },
            "application": {
                "value": "Remote Sensing Image Understanding",
                "confidence": 0.9916
            },
            "dataset": {
                "value": "MME-RealWorld-RS",
                "confidence": 0.9963
            },
            "metrics": {
                "value": [
                    {
                        "value": "Accuracy",
                        "confidence": 1.0
                    }
                ]
            },
            "metrics_value": {
                "value": [
                    {
                        "value": 39.04,
                        "confidence": 0.6875
                    }
                ]
            },
            "sensor": {
                "value": []
            },
            "regions": {
                "value": []
            },
            "original_samples": {
                "value": 3738,
                "confidence": 0.9795
            },
            "num_samples": {
                "value": 3738,
                "confidence": 0.9857
            },
            "sampling_percentage": {
                "value": 100,
                "confidence": 1.0
            },
            "num_classes": {
                "value": 3,
                "confidence": 1.0
            },
            "classes": {
                "value": [
                    {
                        "value": "color",
                        "confidence": 0.9999
                    },
                    {
                        "value": "count",
                        "confidence": 1.0
                    },
                    {
                        "value": "position",
                        "confidence": 1.0
                    }
                ]
            },
            "image_resolution": {
                "value": "689-11500 pixels",
                "confidence": 0.9945
            },
            "spatial_resolution": {
                "value": "variable",
                "confidence": 0.9994
            },
            "bands_used": {
                "value": []
            },
            "augmentations": {
                "value": []
            },
            "optimizer": {
                "value": null,
                "confidence": 0.0
            },
            "batch_size": {
                "value": null,
                "confidence": 0.0
            },
            "learning_rate": {
                "value": null,
                "confidence": 0.0
            },
            "epochs": {
                "value": null,
                "confidence": 0.0
            },
            "loss_function": {
                "value": null,
                "confidence": 0.0
            },
            "split_ratio": {
                "value": null,
                "confidence": 0.0
            }
        }
    ]
}