{
    "model_id": {
        "value": "GeoRSCLIP_RS5M_CLIP",
        "confidence": 0.4225
    },
    "model_name": {
        "value": "GeoRSCLIP",
        "confidence": 1.0
    },
    "version": {
        "value": null,
        "confidence": 0.0
    },
    "release_date": {
        "value": null,
        "confidence": 0.0
    },
    "last_updated": {
        "value": null,
        "confidence": 0.0
    },
    "short_description": {
        "value": "GeoRSCLIP is a domain-adapted vision-language model for remote sensing, based on CLIP and fine-tuned on the large-scale RS5M remote sensing image-text paired dataset. It leverages parameter-efficient fine-tuning (PEFT) methods and full fine-tuning to bridge the gap between general vision-language models and remote sensing downstream tasks.",
        "confidence": 0.818
    },
    "paper_link": {
        "value": "https://arxiv.org/abs/2306.11300",
        "confidence": 0.9923
    },
    "citations": {
        "value": null,
        "confidence": 0.0
    },
    "repository": {
        "value": "https://github.com/om-ai-lab/RS5M",
        "confidence": 1.0
    },
    "weights": {
        "value": null,
        "confidence": 0.0
    },
    "backbone": {
        "value": "CLIP (ViT-B-32, ViT-B-16, ViT-L-14, ViT-H-14)",
        "confidence": 0.9701
    },
    "num_layers": {
        "value": null,
        "confidence": 0.0
    },
    "num_parameters": {
        "value": null,
        "confidence": 0.0
    },
    "pretext_training_type": {
        "value": "Contrastive vision-language alignment (InfoNCE loss)",
        "confidence": 0.8053
    },
    "masking_strategy": {
        "value": null,
        "confidence": 0.0
    },
    "pretraining": {
        "value": "Fine-tuning of CLIP vision-language model on RS5M, a large-scale remote sensing image-text paired dataset, using parameter-efficient fine-tuning (PEFT) methods (Pfeiffer, LoRA, Prefix-tuning, UniPELT) and full fine-tuning.",
        "confidence": 0.7691
    },
    "domain_knowledge": {
        "value": [
            {
                "value": "Remote sensing domain-specific image-text pairs",
                "confidence": 0.7314
            },
            {
                "value": "Rotation-invariant caption selection",
                "confidence": 0.7531
            },
            {
                "value": "Geo-metadata integration",
                "confidence": 0.6297
            }
        ]
    },
    "backbone_modifications": {
        "value": [
            {
                "value": "Adapters (Pfeiffer, LoRA, Prefix-tuning, UniPELT) for PEFT",
                "confidence": 0.5631
            }
        ]
    },
    "supported_sensors": {
        "value": [
            {
                "value": "Sentinel-1",
                "confidence": 0.8434
            },
            {
                "value": "Sentinel-2",
                "confidence": 1.0
            },
            {
                "value": "Gaofen",
                "confidence": 0.8979
            },
            {
                "value": "USGS",
                "confidence": 0.9482
            },
            {
                "value": "NAIP",
                "confidence": 0.9928
            },
            {
                "value": "MODIS",
                "confidence": 0.9849
            },
            {
                "value": "WorldView",
                "confidence": 0.6948
            },
            {
                "value": "Planet Dove",
                "confidence": 0.9988
            },
            {
                "value": "Maxar",
                "confidence": 0.9046
            },
            {
                "value": "Landsat",
                "confidence": 0.9964
            }
        ]
    },
    "modality_integration_type": {
        "value": "Homogeneous Multimodal",
        "confidence": 0.9164
    },
    "modalities": {
        "value": [
            {
                "value": "Multispectral",
                "confidence": 0.9886
            },
            {
                "value": "RGB",
                "confidence": 0.8566
            },
            {
                "value": "Text",
                "confidence": 0.9908
            }
        ]
    },
    "spectral_alignment": {
        "value": "none",
        "confidence": 0.6934
    },
    "temporal_alignment": {
        "value": "none",
        "confidence": 0.9992
    },
    "spatial_resolution": {
        "value": "variable (0.5m to 30m)",
        "confidence": 0.8109
    },
    "temporal_resolution": {
        "value": null,
        "confidence": 0.0
    },
    "bands": {
        "value": [
            {
                "value": "RGB",
                "confidence": 0.9545
            },
            {
                "value": "Multispectral (Sentinel-2: 12 bands, Sentinel-1: 2 bands)",
                "confidence": 0.4823
            }
        ]
    },
    "pretraining_phases": [
        {
            "dataset": {
                "value": "RS5M",
                "confidence": 1.0
            },
            "regions_coverage": {
                "value": [
                    {
                        "value": "Global (with overrepresentation in North America, Europe, Asia, South America; underrepresentation in Middle/Southern Africa, Northern Canada/Russia, Southern Indonesia, Australia, Central Asia)",
                        "confidence": 0.3
                    }
                ]
            },
            "time_range": {
                "value": null,
                "confidence": 0.0
            },
            "num_images": {
                "value": 5070186,
                "confidence": 0.6792
            },
            "token_size": {
                "value": null,
                "confidence": 0.0
            },
            "image_resolution": {
                "value": "varies (120x120 to 800x800)",
                "confidence": 0.3256
            },
            "epochs": {
                "value": 20,
                "confidence": 1.0
            },
            "batch_size": {
                "value": 700,
                "confidence": 1.0
            },
            "learning_rate": {
                "value": 1e-06,
                "confidence": 0.9988
            },
            "augmentations": {
                "value": [
                    {
                        "value": "Image rotation (for caption selection)",
                        "confidence": 0.3326
                    },
                    {
                        "value": "Standard image augmentations (not detailed)",
                        "confidence": 0.3081
                    }
                ]
            },
            "processing": {
                "value": [
                    {
                        "value": "Invalid image checking",
                        "confidence": 0.3497
                    },
                    {
                        "value": "Deduplication",
                        "confidence": 0.8862
                    },
                    {
                        "value": "VLM filtering",
                        "confidence": 0.4423
                    },
                    {
                        "value": "RS image detector",
                        "confidence": 0.6575
                    },
                    {
                        "value": "Caption generation with BLIP2",
                        "confidence": 0.3377
                    },
                    {
                        "value": "Caption ranking and reranking",
                        "confidence": 0.3104
                    },
                    {
                        "value": "Rotation-invariant caption selection",
                        "confidence": 0.4563
                    },
                    {
                        "value": "Geo-metadata integration",
                        "confidence": 0.518
                    }
                ]
            },
            "sampling": {
                "value": "5% of RS5M for validation, rest for training",
                "confidence": 0.3215
            },
            "processing_level": {
                "value": null,
                "confidence": 0.0
            },
            "cloud_cover": {
                "value": null,
                "confidence": 0.0
            },
            "missing_data": {
                "value": null,
                "confidence": 0.0
            },
            "masking_ratio": {
                "value": null,
                "confidence": 0.0
            }
        }
    ],
    "benchmarks": [
        {
            "task": {
                "value": "Zero-shot Classification",
                "confidence": 0.7832
            },
            "application": {
                "value": "Remote sensing scene classification",
                "confidence": 0.6711
            },
            "dataset": {
                "value": "AID",
                "confidence": 1.0
            },
            "metrics": {
                "value": [
                    {
                        "value": "Top-1 Accuracy",
                        "confidence": 0.9797
                    }
                ]
            },
            "metrics_value": {
                "value": [
                    {
                        "value": 73.72,
                        "confidence": 0.9984
                    }
                ]
            },
            "sensor": {
                "value": [
                    {
                        "value": "Various (see supported_sensors)",
                        "confidence": 0.3198
                    }
                ]
            },
            "regions": {
                "value": [
                    {
                        "value": "Global",
                        "confidence": 0.8703
                    }
                ]
            },
            "original_samples": {
                "value": null,
                "confidence": 0.0
            },
            "num_samples": {
                "value": null,
                "confidence": 0.0
            },
            "sampling_percentage": {
                "value": null,
                "confidence": 0.0
            },
            "num_classes": {
                "value": null,
                "confidence": 0.0
            },
            "classes": {
                "value": null,
                "confidence": 0.0
            },
            "image_resolution": {
                "value": null,
                "confidence": 0.0
            },
            "spatial_resolution": {
                "value": null,
                "confidence": 0.0
            },
            "bands_used": {
                "value": null,
                "confidence": 0.0
            },
            "augmentations": {
                "value": null,
                "confidence": 0.0
            },
            "optimizer": {
                "value": "AdamW",
                "confidence": 1.0
            },
            "batch_size": {
                "value": 700,
                "confidence": 1.0
            },
            "learning_rate": {
                "value": 1e-06,
                "confidence": 0.9959
            },
            "epochs": {
                "value": 20,
                "confidence": 1.0
            },
            "loss_function": {
                "value": "InfoNCE",
                "confidence": 0.9982
            },
            "split_ratio": {
                "value": "95% train, 5% val",
                "confidence": 0.3845
            }
        },
        {
            "task": {
                "value": "Zero-shot Classification",
                "confidence": 1.0
            },
            "application": {
                "value": "Remote sensing scene classification",
                "confidence": 0.9997
            },
            "dataset": {
                "value": "RESISC45",
                "confidence": 1.0
            },
            "metrics": {
                "value": [
                    {
                        "value": "Top-1 Accuracy",
                        "confidence": 1.0
                    }
                ]
            },
            "metrics_value": {
                "value": [
                    {
                        "value": 71.89,
                        "confidence": 1.0
                    }
                ]
            },
            "sensor": {
                "value": [
                    {
                        "value": "Various (see supported_sensors)",
                        "confidence": 0.7356
                    }
                ]
            },
            "regions": {
                "value": [
                    {
                        "value": "Global",
                        "confidence": 0.9996
                    }
                ]
            },
            "original_samples": {
                "value": null,
                "confidence": 0.0
            },
            "num_samples": {
                "value": null,
                "confidence": 0.0
            },
            "sampling_percentage": {
                "value": null,
                "confidence": 0.0
            },
            "num_classes": {
                "value": null,
                "confidence": 0.0
            },
            "classes": {
                "value": null,
                "confidence": 0.0
            },
            "image_resolution": {
                "value": null,
                "confidence": 0.0
            },
            "spatial_resolution": {
                "value": null,
                "confidence": 0.0
            },
            "bands_used": {
                "value": null,
                "confidence": 0.0
            },
            "augmentations": {
                "value": null,
                "confidence": 0.0
            },
            "optimizer": {
                "value": "AdamW",
                "confidence": 1.0
            },
            "batch_size": {
                "value": 700,
                "confidence": 1.0
            },
            "learning_rate": {
                "value": 1e-06,
                "confidence": 1.0
            },
            "epochs": {
                "value": 20,
                "confidence": 1.0
            },
            "loss_function": {
                "value": "InfoNCE",
                "confidence": 1.0
            },
            "split_ratio": {
                "value": "95% train, 5% val",
                "confidence": 0.9992
            }
        },
        {
            "task": {
                "value": "Zero-shot Classification",
                "confidence": 1.0
            },
            "application": {
                "value": "Remote sensing scene classification",
                "confidence": 0.884
            },
            "dataset": {
                "value": "EuroSAT",
                "confidence": 0.9999
            },
            "metrics": {
                "value": [
                    {
                        "value": "Top-1 Accuracy",
                        "confidence": 1.0
                    }
                ]
            },
            "metrics_value": {
                "value": [
                    {
                        "value": 61.49,
                        "confidence": 1.0
                    }
                ]
            },
            "sensor": {
                "value": [
                    {
                        "value": "Various (see supported_sensors)",
                        "confidence": 0.6904
                    }
                ]
            },
            "regions": {
                "value": [
                    {
                        "value": "Europe",
                        "confidence": 0.8094
                    }
                ]
            },
            "original_samples": {
                "value": null,
                "confidence": 0.0
            },
            "num_samples": {
                "value": null,
                "confidence": 0.0
            },
            "sampling_percentage": {
                "value": null,
                "confidence": 0.0
            },
            "num_classes": {
                "value": null,
                "confidence": 0.0
            },
            "classes": {
                "value": null,
                "confidence": 0.0
            },
            "image_resolution": {
                "value": null,
                "confidence": 0.0
            },
            "spatial_resolution": {
                "value": null,
                "confidence": 0.0
            },
            "bands_used": {
                "value": null,
                "confidence": 0.0
            },
            "augmentations": {
                "value": null,
                "confidence": 0.0
            },
            "optimizer": {
                "value": "AdamW",
                "confidence": 1.0
            },
            "batch_size": {
                "value": 700,
                "confidence": 1.0
            },
            "learning_rate": {
                "value": 1e-06,
                "confidence": 1.0
            },
            "epochs": {
                "value": 20,
                "confidence": 1.0
            },
            "loss_function": {
                "value": "InfoNCE",
                "confidence": 1.0
            },
            "split_ratio": {
                "value": "95% train, 5% val",
                "confidence": 0.9995
            }
        },
        {
            "task": {
                "value": "Remote Sensing Cross-Modal Text-Image Retrieval",
                "confidence": 0.6392
            },
            "application": {
                "value": "Image-to-text and text-to-image retrieval",
                "confidence": 0.3713
            },
            "dataset": {
                "value": "RSICD",
                "confidence": 0.65
            },
            "metrics": {
                "value": [
                    {
                        "value": "Recall@1",
                        "confidence": 0.6323
                    },
                    {
                        "value": "Recall@5",
                        "confidence": 1.0
                    },
                    {
                        "value": "Recall@10",
                        "confidence": 1.0
                    },
                    {
                        "value": "Mean Recall",
                        "confidence": 0.7949
                    }
                ]
            },
            "metrics_value": {
                "value": [
                    {
                        "value": 11.53,
                        "confidence": 1.0
                    },
                    {
                        "value": 28.55,
                        "confidence": 1.0
                    },
                    {
                        "value": 39.16,
                        "confidence": 1.0
                    },
                    {
                        "value": 26.18,
                        "confidence": 0.9998
                    }
                ]
            },
            "sensor": {
                "value": [
                    {
                        "value": "Various (see supported_sensors)",
                        "confidence": 0.8833
                    }
                ]
            },
            "regions": {
                "value": [
                    {
                        "value": "Global",
                        "confidence": 0.9546
                    }
                ]
            },
            "original_samples": {
                "value": null,
                "confidence": 0.0
            },
            "num_samples": {
                "value": null,
                "confidence": 0.0
            },
            "sampling_percentage": {
                "value": null,
                "confidence": 0.0
            },
            "num_classes": {
                "value": null,
                "confidence": 0.0
            },
            "classes": {
                "value": null,
                "confidence": 0.0
            },
            "image_resolution": {
                "value": null,
                "confidence": 0.0
            },
            "spatial_resolution": {
                "value": null,
                "confidence": 0.0
            },
            "bands_used": {
                "value": null,
                "confidence": 0.0
            },
            "augmentations": {
                "value": null,
                "confidence": 0.0
            },
            "optimizer": {
                "value": "AdamW",
                "confidence": 1.0
            },
            "batch_size": {
                "value": 700,
                "confidence": 1.0
            },
            "learning_rate": {
                "value": 1e-06,
                "confidence": 1.0
            },
            "epochs": {
                "value": 20,
                "confidence": 1.0
            },
            "loss_function": {
                "value": "InfoNCE",
                "confidence": 1.0
            },
            "split_ratio": {
                "value": "95% train, 5% val",
                "confidence": 0.5124
            }
        },
        {
            "task": {
                "value": "Remote Sensing Cross-Modal Text-Image Retrieval",
                "confidence": 0.9999
            },
            "application": {
                "value": "Image-to-text and text-to-image retrieval",
                "confidence": 1.0
            },
            "dataset": {
                "value": "RSITMD",
                "confidence": 1.0
            },
            "metrics": {
                "value": [
                    {
                        "value": "Recall@1",
                        "confidence": 1.0
                    },
                    {
                        "value": "Recall@5",
                        "confidence": 1.0
                    },
                    {
                        "value": "Recall@10",
                        "confidence": 1.0
                    },
                    {
                        "value": "Mean Recall",
                        "confidence": 1.0
                    }
                ]
            },
            "metrics_value": {
                "value": [
                    {
                        "value": 19.03,
                        "confidence": 1.0
                    },
                    {
                        "value": 34.51,
                        "confidence": 1.0
                    },
                    {
                        "value": 46.46,
                        "confidence": 1.0
                    },
                    {
                        "value": 35.68,
                        "confidence": 1.0
                    }
                ]
            },
            "sensor": {
                "value": [
                    {
                        "value": "Various (see supported_sensors)",
                        "confidence": 0.9999
                    }
                ]
            },
            "regions": {
                "value": [
                    {
                        "value": "Global",
                        "confidence": 1.0
                    }
                ]
            },
            "original_samples": {
                "value": null,
                "confidence": 0.0
            },
            "num_samples": {
                "value": null,
                "confidence": 0.0
            },
            "sampling_percentage": {
                "value": null,
                "confidence": 0.0
            },
            "num_classes": {
                "value": null,
                "confidence": 0.0
            },
            "classes": {
                "value": null,
                "confidence": 0.0
            },
            "image_resolution": {
                "value": null,
                "confidence": 0.0
            },
            "spatial_resolution": {
                "value": null,
                "confidence": 0.0
            },
            "bands_used": {
                "value": null,
                "confidence": 0.0
            },
            "augmentations": {
                "value": null,
                "confidence": 0.0
            },
            "optimizer": {
                "value": "AdamW",
                "confidence": 1.0
            },
            "batch_size": {
                "value": 700,
                "confidence": 1.0
            },
            "learning_rate": {
                "value": 1e-06,
                "confidence": 1.0
            },
            "epochs": {
                "value": 20,
                "confidence": 1.0
            },
            "loss_function": {
                "value": "InfoNCE",
                "confidence": 1.0
            },
            "split_ratio": {
                "value": "95% train, 5% val",
                "confidence": 1.0
            }
        },
        {
            "task": {
                "value": "Semantic Localization",
                "confidence": 1.0
            },
            "application": {
                "value": "Semantic-level retrieval/localization",
                "confidence": 0.3232
            },
            "dataset": {
                "value": "AIR-SLT",
                "confidence": 0.9996
            },
            "metrics": {
                "value": [
                    {
                        "value": "Rsu",
                        "confidence": 0.9923
                    },
                    {
                        "value": "Ras",
                        "confidence": 1.0
                    },
                    {
                        "value": "Rda",
                        "confidence": 1.0
                    },
                    {
                        "value": "Rmi",
                        "confidence": 1.0
                    }
                ]
            },
            "metrics_value": {
                "value": [
                    {
                        "value": 0.7546,
                        "confidence": 1.0
                    },
                    {
                        "value": 0.261,
                        "confidence": 0.9069
                    },
                    {
                        "value": 0.718,
                        "confidence": 0.9988
                    },
                    {
                        "value": 0.74,
                        "confidence": 0.9975
                    }
                ]
            },
            "sensor": {
                "value": [
                    {
                        "value": "Various (see supported_sensors)",
                        "confidence": 0.9989
                    }
                ]
            },
            "regions": {
                "value": [
                    {
                        "value": "Global",
                        "confidence": 0.9959
                    }
                ]
            },
            "original_samples": {
                "value": null,
                "confidence": 0.0
            },
            "num_samples": {
                "value": null,
                "confidence": 0.0
            },
            "sampling_percentage": {
                "value": null,
                "confidence": 0.0
            },
            "num_classes": {
                "value": null,
                "confidence": 0.0
            },
            "classes": {
                "value": null,
                "confidence": 0.0
            },
            "image_resolution": {
                "value": null,
                "confidence": 0.0
            },
            "spatial_resolution": {
                "value": null,
                "confidence": 0.0
            },
            "bands_used": {
                "value": null,
                "confidence": 0.0
            },
            "augmentations": {
                "value": null,
                "confidence": 0.0
            },
            "optimizer": {
                "value": "AdamW",
                "confidence": 1.0
            },
            "batch_size": {
                "value": 700,
                "confidence": 1.0
            },
            "learning_rate": {
                "value": 1e-06,
                "confidence": 1.0
            },
            "epochs": {
                "value": 20,
                "confidence": 1.0
            },
            "loss_function": {
                "value": "InfoNCE",
                "confidence": 1.0
            },
            "split_ratio": {
                "value": "95% train, 5% val",
                "confidence": 0.9997
            }
        }
    ]
}