{
    "model_id": {
        "value": "ScoreRS-v1",
        "confidence": 0.7222
    },
    "model_name": {
        "value": "ScoreRS",
        "confidence": 1.0
    },
    "version": {
        "value": "v1",
        "confidence": 0.8723
    },
    "release_date": {
        "value": "2025-09-19",
        "confidence": 0.9818
    },
    "last_updated": {
        "value": "2025-09-19",
        "confidence": 0.9999
    },
    "short_description": {
        "value": "ScoreRS is a learned quality scoring model for automated assessment and curation of remote sensing vision-language data, trained on large-scale RS-specific preference datasets. It enables quality-driven data selection, RL reward modeling, and best-of-N test-time scaling for VLMs in remote sensing.",
        "confidence": 0.782
    },
    "paper_link": {
        "value": "https://arxiv.org/abs/2503.00743",
        "confidence": 0.9
    },
    "citations": {
        "value": null,
        "confidence": 0.0
    },
    "repository": {
        "value": null,
        "confidence": 0.0
    },
    "weights": {
        "value": null,
        "confidence": 0.0
    },
    "backbone": {
        "value": "Qwen2VL-7B",
        "confidence": 1.0
    },
    "num_layers": {
        "value": null,
        "confidence": 0.0
    },
    "num_parameters": {
        "value": null,
        "confidence": 0.0
    },
    "pretext_training_type": {
        "value": "Pairwise preference learning (reward modeling)",
        "confidence": 0.7263
    },
    "masking_strategy": {
        "value": null,
        "confidence": 0.0
    },
    "pretraining": {
        "value": "Three-stage progressive training: (1) value head initialized on UltraFeedback text preference data, (2) ViT and value head trained on RS image-caption preference data, (3) full-parameter training on RS vision instruction preference data and RLHF-V.",
        "confidence": 0.82
    },
    "domain_knowledge": {
        "value": [
            {
                "value": "Remote sensing-specific quality dimensions",
                "confidence": 0.6402
            },
            {
                "value": "Domain-specific question categories",
                "confidence": 0.4238
            }
        ]
    },
    "backbone_modifications": {
        "value": [
            {
                "value": "Language head replaced by linear value head for scalar scoring",
                "confidence": 0.8274
            }
        ]
    },
    "supported_sensors": {
        "value": [
            {
                "value": "Multispectral (Sentinel-2, etc.)",
                "confidence": 0.3246
            },
            {
                "value": "Aerial imagery",
                "confidence": 0.5343
            }
        ]
    },
    "modality_integration_type": {
        "value": "Unimodal",
        "confidence": 0.9982
    },
    "modalities": {
        "value": [
            {
                "value": "Multispectral",
                "confidence": 0.983
            },
            {
                "value": "RGB",
                "confidence": 0.4976
            }
        ]
    },
    "spectral_alignment": {
        "value": "none",
        "confidence": 0.6933
    },
    "temporal_alignment": {
        "value": "none",
        "confidence": 1.0
    },
    "spatial_resolution": {
        "value": "variable",
        "confidence": 0.7857
    },
    "temporal_resolution": {
        "value": "variable",
        "confidence": 0.9976
    },
    "bands": {
        "value": null,
        "confidence": 0.0
    },
    "pretraining_phases": [
        {
            "dataset": {
                "value": "UltraFeedback (text preference data)",
                "confidence": 0.3463
            },
            "regions_coverage": {
                "value": []
            },
            "time_range": {
                "value": null,
                "confidence": 0.0
            },
            "num_images": {
                "value": null,
                "confidence": 0.0
            },
            "token_size": {
                "value": null,
                "confidence": 0.0
            },
            "image_resolution": {
                "value": null,
                "confidence": 0.0
            },
            "epochs": {
                "value": null,
                "confidence": 0.0
            },
            "batch_size": {
                "value": null,
                "confidence": 0.0
            },
            "learning_rate": {
                "value": null,
                "confidence": 0.0
            },
            "augmentations": {
                "value": []
            },
            "processing": {
                "value": []
            },
            "sampling": {
                "value": null,
                "confidence": 0.0
            },
            "processing_level": {
                "value": null,
                "confidence": 0.0
            },
            "cloud_cover": {
                "value": null,
                "confidence": 0.0
            },
            "missing_data": {
                "value": null,
                "confidence": 0.0
            },
            "masking_ratio": {
                "value": null,
                "confidence": 0.0
            }
        },
        {
            "dataset": {
                "value": "LHRS-Align (image-caption preference data)",
                "confidence": 0.3752
            },
            "regions_coverage": {
                "value": [
                    {
                        "value": "Major global urban areas",
                        "confidence": 0.5962
                    }
                ]
            },
            "time_range": {
                "value": null,
                "confidence": 0.0
            },
            "num_images": {
                "value": 76000,
                "confidence": 0.6932
            },
            "token_size": {
                "value": null,
                "confidence": 0.0
            },
            "image_resolution": {
                "value": null,
                "confidence": 0.0
            },
            "epochs": {
                "value": null,
                "confidence": 0.0
            },
            "batch_size": {
                "value": null,
                "confidence": 0.0
            },
            "learning_rate": {
                "value": null,
                "confidence": 0.0
            },
            "augmentations": {
                "value": []
            },
            "processing": {
                "value": [
                    {
                        "value": "Image deduplication (SSCD, cosine similarity, threshold 0.65)",
                        "confidence": 0.3035
                    }
                ]
            },
            "sampling": {
                "value": null,
                "confidence": 0.0
            },
            "processing_level": {
                "value": null,
                "confidence": 0.0
            },
            "cloud_cover": {
                "value": null,
                "confidence": 0.0
            },
            "missing_data": {
                "value": null,
                "confidence": 0.0
            },
            "masking_ratio": {
                "value": null,
                "confidence": 0.0
            }
        },
        {
            "dataset": {
                "value": "GeoChat, LHRS-Instruct, SkysenseGPT (vision instruction preference data)",
                "confidence": 0.5607
            },
            "regions_coverage": {
                "value": []
            },
            "time_range": {
                "value": null,
                "confidence": 0.0
            },
            "num_images": {
                "value": 112000,
                "confidence": 0.9974
            },
            "token_size": {
                "value": null,
                "confidence": 0.0
            },
            "image_resolution": {
                "value": null,
                "confidence": 0.0
            },
            "epochs": {
                "value": null,
                "confidence": 0.0
            },
            "batch_size": {
                "value": null,
                "confidence": 0.0
            },
            "learning_rate": {
                "value": null,
                "confidence": 0.0
            },
            "augmentations": {
                "value": []
            },
            "processing": {
                "value": [
                    {
                        "value": "Two-stage similarity-based filtering (BGE text embedding, SSCD image embedding, threshold 0.65)",
                        "confidence": 0.3013
                    }
                ]
            },
            "sampling": {
                "value": null,
                "confidence": 0.0
            },
            "processing_level": {
                "value": null,
                "confidence": 0.0
            },
            "cloud_cover": {
                "value": null,
                "confidence": 0.0
            },
            "missing_data": {
                "value": null,
                "confidence": 0.0
            },
            "masking_ratio": {
                "value": null,
                "confidence": 0.0
            }
        }
    ],
    "benchmarks": [
        {
            "task": {
                "value": "Image classification",
                "confidence": 0.6624
            },
            "application": {
                "value": "Remote sensing scene classification",
                "confidence": 0.6721
            },
            "dataset": {
                "value": "NWPU-RESISC45",
                "confidence": 0.6175
            },
            "metrics": {
                "value": [
                    {
                        "value": "Top-1 Accuracy",
                        "confidence": 0.6059
                    },
                    {
                        "value": "Top-5 Accuracy",
                        "confidence": 1.0
                    }
                ]
            },
            "metrics_value": {
                "value": [
                    {
                        "value": 78.58,
                        "confidence": 0.9998
                    },
                    {
                        "value": 97.54,
                        "confidence": 1.0
                    }
                ]
            },
            "sensor": {
                "value": [
                    {
                        "value": "Aerial imagery",
                        "confidence": 0.4837
                    }
                ]
            },
            "regions": {
                "value": []
            },
            "original_samples": {
                "value": null,
                "confidence": 0.0
            },
            "num_samples": {
                "value": null,
                "confidence": 0.0
            },
            "sampling_percentage": {
                "value": 30.0,
                "confidence": 0.8723
            },
            "num_classes": {
                "value": 45,
                "confidence": 1.0
            },
            "classes": {
                "value": null,
                "confidence": 0.0
            },
            "image_resolution": {
                "value": null,
                "confidence": 0.0
            },
            "spatial_resolution": {
                "value": null,
                "confidence": 0.0
            },
            "bands_used": {
                "value": null,
                "confidence": 0.0
            },
            "augmentations": {
                "value": []
            },
            "optimizer": {
                "value": "AdamW",
                "confidence": 0.8964
            },
            "batch_size": {
                "value": 1024,
                "confidence": 1.0
            },
            "learning_rate": {
                "value": 1e-05,
                "confidence": 0.9858
            },
            "epochs": {
                "value": 5,
                "confidence": 1.0
            },
            "loss_function": {
                "value": null,
                "confidence": 0.0
            },
            "split_ratio": {
                "value": null,
                "confidence": 0.0
            }
        },
        {
            "task": {
                "value": "Image classification",
                "confidence": 0.9911
            },
            "application": {
                "value": "Remote sensing scene classification",
                "confidence": 0.6631
            },
            "dataset": {
                "value": "EuroSAT",
                "confidence": 0.9165
            },
            "metrics": {
                "value": [
                    {
                        "value": "Top-1 Accuracy",
                        "confidence": 1.0
                    },
                    {
                        "value": "Top-5 Accuracy",
                        "confidence": 1.0
                    }
                ]
            },
            "metrics_value": {
                "value": [
                    {
                        "value": 63.67,
                        "confidence": 1.0
                    },
                    {
                        "value": 99.01,
                        "confidence": 1.0
                    }
                ]
            },
            "sensor": {
                "value": [
                    {
                        "value": "Sentinel-2",
                        "confidence": 0.9926
                    }
                ]
            },
            "regions": {
                "value": []
            },
            "original_samples": {
                "value": null,
                "confidence": 0.0
            },
            "num_samples": {
                "value": null,
                "confidence": 0.0
            },
            "sampling_percentage": {
                "value": 30.0,
                "confidence": 1.0
            },
            "num_classes": {
                "value": 10,
                "confidence": 1.0
            },
            "classes": {
                "value": null,
                "confidence": 0.0
            },
            "image_resolution": {
                "value": null,
                "confidence": 0.0
            },
            "spatial_resolution": {
                "value": null,
                "confidence": 0.0
            },
            "bands_used": {
                "value": null,
                "confidence": 0.0
            },
            "augmentations": {
                "value": []
            },
            "optimizer": {
                "value": "AdamW",
                "confidence": 0.9959
            },
            "batch_size": {
                "value": 1024,
                "confidence": 1.0
            },
            "learning_rate": {
                "value": 1e-05,
                "confidence": 1.0
            },
            "epochs": {
                "value": 5,
                "confidence": 1.0
            },
            "loss_function": {
                "value": null,
                "confidence": 0.0
            },
            "split_ratio": {
                "value": null,
                "confidence": 0.0
            }
        },
        {
            "task": {
                "value": "Image classification",
                "confidence": 0.9471
            },
            "application": {
                "value": "Remote sensing scene classification",
                "confidence": 0.8048
            },
            "dataset": {
                "value": "fMoW",
                "confidence": 0.7352
            },
            "metrics": {
                "value": [
                    {
                        "value": "Top-1 Accuracy",
                        "confidence": 1.0
                    },
                    {
                        "value": "Top-5 Accuracy",
                        "confidence": 1.0
                    }
                ]
            },
            "metrics_value": {
                "value": [
                    {
                        "value": 29.29,
                        "confidence": 1.0
                    },
                    {
                        "value": 60.7,
                        "confidence": 0.8723
                    }
                ]
            },
            "sensor": {
                "value": [
                    {
                        "value": "Multispectral",
                        "confidence": 0.6447
                    }
                ]
            },
            "regions": {
                "value": []
            },
            "original_samples": {
                "value": null,
                "confidence": 0.0
            },
            "num_samples": {
                "value": null,
                "confidence": 0.0
            },
            "sampling_percentage": {
                "value": 30.0,
                "confidence": 1.0
            },
            "num_classes": {
                "value": 63,
                "confidence": 0.998
            },
            "classes": {
                "value": null,
                "confidence": 0.0
            },
            "image_resolution": {
                "value": null,
                "confidence": 0.0
            },
            "spatial_resolution": {
                "value": null,
                "confidence": 0.0
            },
            "bands_used": {
                "value": null,
                "confidence": 0.0
            },
            "augmentations": {
                "value": []
            },
            "optimizer": {
                "value": "AdamW",
                "confidence": 0.9997
            },
            "batch_size": {
                "value": 1024,
                "confidence": 1.0
            },
            "learning_rate": {
                "value": 1e-05,
                "confidence": 1.0
            },
            "epochs": {
                "value": 5,
                "confidence": 1.0
            },
            "loss_function": {
                "value": null,
                "confidence": 0.0
            },
            "split_ratio": {
                "value": null,
                "confidence": 0.0
            }
        },
        {
            "task": {
                "value": "Image classification",
                "confidence": 0.9024
            },
            "application": {
                "value": "Remote sensing scene classification",
                "confidence": 0.9963
            },
            "dataset": {
                "value": "AID",
                "confidence": 0.9999
            },
            "metrics": {
                "value": [
                    {
                        "value": "Top-1 Accuracy",
                        "confidence": 1.0
                    },
                    {
                        "value": "Top-5 Accuracy",
                        "confidence": 1.0
                    }
                ]
            },
            "metrics_value": {
                "value": [
                    {
                        "value": 85.9,
                        "confidence": 0.8549
                    },
                    {
                        "value": null,
                        "confidence": 0.3
                    }
                ]
            },
            "sensor": {
                "value": [
                    {
                        "value": "Aerial imagery",
                        "confidence": 0.9871
                    }
                ]
            },
            "regions": {
                "value": []
            },
            "original_samples": {
                "value": null,
                "confidence": 0.0
            },
            "num_samples": {
                "value": null,
                "confidence": 0.0
            },
            "sampling_percentage": {
                "value": null,
                "confidence": 0.0
            },
            "num_classes": {
                "value": 30,
                "confidence": 1.0
            },
            "classes": {
                "value": null,
                "confidence": 0.0
            },
            "image_resolution": {
                "value": null,
                "confidence": 0.0
            },
            "spatial_resolution": {
                "value": null,
                "confidence": 0.0
            },
            "bands_used": {
                "value": null,
                "confidence": 0.0
            },
            "augmentations": {
                "value": []
            },
            "optimizer": {
                "value": null,
                "confidence": 0.0
            },
            "batch_size": {
                "value": null,
                "confidence": 0.0
            },
            "learning_rate": {
                "value": null,
                "confidence": 0.0
            },
            "epochs": {
                "value": null,
                "confidence": 0.0
            },
            "loss_function": {
                "value": null,
                "confidence": 0.0
            },
            "split_ratio": {
                "value": null,
                "confidence": 0.0
            }
        },
        {
            "task": {
                "value": "Cross-modal retrieval",
                "confidence": 0.5823
            },
            "application": {
                "value": "Text-to-image and image-to-text retrieval",
                "confidence": 0.4822
            },
            "dataset": {
                "value": "UCM",
                "confidence": 0.9939
            },
            "metrics": {
                "value": [
                    {
                        "value": "R@1",
                        "confidence": 0.5639
                    },
                    {
                        "value": "R@5",
                        "confidence": 1.0
                    }
                ]
            },
            "metrics_value": {
                "value": [
                    {
                        "value": 44.56,
                        "confidence": 1.0
                    },
                    {
                        "value": 82.09,
                        "confidence": 1.0
                    }
                ]
            },
            "sensor": {
                "value": [
                    {
                        "value": "Aerial imagery",
                        "confidence": 0.9919
                    }
                ]
            },
            "regions": {
                "value": []
            },
            "original_samples": {
                "value": null,
                "confidence": 0.0
            },
            "num_samples": {
                "value": null,
                "confidence": 0.0
            },
            "sampling_percentage": {
                "value": 30.0,
                "confidence": 0.9999
            },
            "num_classes": {
                "value": null,
                "confidence": 0.0
            },
            "classes": {
                "value": null,
                "confidence": 0.0
            },
            "image_resolution": {
                "value": null,
                "confidence": 0.0
            },
            "spatial_resolution": {
                "value": null,
                "confidence": 0.0
            },
            "bands_used": {
                "value": null,
                "confidence": 0.0
            },
            "augmentations": {
                "value": []
            },
            "optimizer": {
                "value": "AdamW",
                "confidence": 0.7754
            },
            "batch_size": {
                "value": 1024,
                "confidence": 1.0
            },
            "learning_rate": {
                "value": 1e-05,
                "confidence": 1.0
            },
            "epochs": {
                "value": 5,
                "confidence": 1.0
            },
            "loss_function": {
                "value": null,
                "confidence": 0.0
            },
            "split_ratio": {
                "value": null,
                "confidence": 0.0
            }
        },
        {
            "task": {
                "value": "Cross-modal retrieval",
                "confidence": 0.9903
            },
            "application": {
                "value": "Text-to-image and image-to-text retrieval",
                "confidence": 0.9997
            },
            "dataset": {
                "value": "RSICD",
                "confidence": 1.0
            },
            "metrics": {
                "value": [
                    {
                        "value": "R@1",
                        "confidence": 1.0
                    },
                    {
                        "value": "R@5",
                        "confidence": 1.0
                    }
                ]
            },
            "metrics_value": {
                "value": [
                    {
                        "value": 13.9,
                        "confidence": 0.9795
                    },
                    {
                        "value": 38.02,
                        "confidence": 1.0
                    }
                ]
            },
            "sensor": {
                "value": [
                    {
                        "value": "Aerial imagery",
                        "confidence": 0.8786
                    }
                ]
            },
            "regions": {
                "value": []
            },
            "original_samples": {
                "value": null,
                "confidence": 0.0
            },
            "num_samples": {
                "value": null,
                "confidence": 0.0
            },
            "sampling_percentage": {
                "value": 30.0,
                "confidence": 1.0
            },
            "num_classes": {
                "value": null,
                "confidence": 0.0
            },
            "classes": {
                "value": null,
                "confidence": 0.0
            },
            "image_resolution": {
                "value": null,
                "confidence": 0.0
            },
            "spatial_resolution": {
                "value": null,
                "confidence": 0.0
            },
            "bands_used": {
                "value": null,
                "confidence": 0.0
            },
            "augmentations": {
                "value": []
            },
            "optimizer": {
                "value": "AdamW",
                "confidence": 1.0
            },
            "batch_size": {
                "value": 1024,
                "confidence": 1.0
            },
            "learning_rate": {
                "value": 1e-05,
                "confidence": 1.0
            },
            "epochs": {
                "value": 5,
                "confidence": 1.0
            },
            "loss_function": {
                "value": null,
                "confidence": 0.0
            },
            "split_ratio": {
                "value": null,
                "confidence": 0.0
            }
        },
        {
            "task": {
                "value": "Visual question answering",
                "confidence": 0.4392
            },
            "application": {
                "value": "Remote sensing VQA",
                "confidence": 0.5694
            },
            "dataset": {
                "value": "LHRS-Bench",
                "confidence": 0.8923
            },
            "metrics": {
                "value": [
                    {
                        "value": "Accuracy",
                        "confidence": 0.9819
                    }
                ]
            },
            "metrics_value": {
                "value": [
                    {
                        "value": 67.08,
                        "confidence": 0.9152
                    }
                ]
            },
            "sensor": {
                "value": [
                    {
                        "value": "Aerial imagery",
                        "confidence": 0.6629
                    },
                    {
                        "value": "Multispectral",
                        "confidence": 0.8843
                    }
                ]
            },
            "regions": {
                "value": []
            },
            "original_samples": {
                "value": null,
                "confidence": 0.0
            },
            "num_samples": {
                "value": null,
                "confidence": 0.0
            },
            "sampling_percentage": {
                "value": null,
                "confidence": 0.0
            },
            "num_classes": {
                "value": null,
                "confidence": 0.0
            },
            "classes": {
                "value": null,
                "confidence": 0.0
            },
            "image_resolution": {
                "value": null,
                "confidence": 0.0
            },
            "spatial_resolution": {
                "value": null,
                "confidence": 0.0
            },
            "bands_used": {
                "value": null,
                "confidence": 0.0
            },
            "augmentations": {
                "value": []
            },
            "optimizer": {
                "value": null,
                "confidence": 0.0
            },
            "batch_size": {
                "value": null,
                "confidence": 0.0
            },
            "learning_rate": {
                "value": null,
                "confidence": 0.0
            },
            "epochs": {
                "value": null,
                "confidence": 0.0
            },
            "loss_function": {
                "value": null,
                "confidence": 0.0
            },
            "split_ratio": {
                "value": null,
                "confidence": 0.0
            }
        },
        {
            "task": {
                "value": "Visual grounding",
                "confidence": 0.8162
            },
            "application": {
                "value": "Remote sensing visual grounding",
                "confidence": 0.511
            },
            "dataset": {
                "value": "VG-DIOR",
                "confidence": 1.0
            },
            "metrics": {
                "value": [
                    {
                        "value": "Accuracy",
                        "confidence": 1.0
                    }
                ]
            },
            "metrics_value": {
                "value": [
                    {
                        "value": 58.34,
                        "confidence": 0.6935
                    }
                ]
            },
            "sensor": {
                "value": [
                    {
                        "value": "Aerial imagery",
                        "confidence": 0.9645
                    },
                    {
                        "value": "Multispectral",
                        "confidence": 0.9996
                    }
                ]
            },
            "regions": {
                "value": []
            },
            "original_samples": {
                "value": null,
                "confidence": 0.0
            },
            "num_samples": {
                "value": null,
                "confidence": 0.0
            },
            "sampling_percentage": {
                "value": null,
                "confidence": 0.0
            },
            "num_classes": {
                "value": null,
                "confidence": 0.0
            },
            "classes": {
                "value": null,
                "confidence": 0.0
            },
            "image_resolution": {
                "value": null,
                "confidence": 0.0
            },
            "spatial_resolution": {
                "value": null,
                "confidence": 0.0
            },
            "bands_used": {
                "value": null,
                "confidence": 0.0
            },
            "augmentations": {
                "value": []
            },
            "optimizer": {
                "value": null,
                "confidence": 0.0
            },
            "batch_size": {
                "value": null,
                "confidence": 0.0
            },
            "learning_rate": {
                "value": null,
                "confidence": 0.0
            },
            "epochs": {
                "value": null,
                "confidence": 0.0
            },
            "loss_function": {
                "value": null,
                "confidence": 0.0
            },
            "split_ratio": {
                "value": null,
                "confidence": 0.0
            }
        }
    ]
}