{
    "model_id": {
        "value": "GeoChat",
        "confidence": 0.8929
    },
    "model_name": {
        "value": "GeoChat",
        "confidence": 1.0
    },
    "version": {
        "value": null,
        "confidence": 0.0
    },
    "release_date": {
        "value": "2023-11-24",
        "confidence": 0.9958
    },
    "last_updated": {
        "value": null,
        "confidence": 0.0
    },
    "short_description": {
        "value": "GeoChat is the first grounded large vision-language model (VLM) for remote sensing, capable of multitask conversational capabilities with high-resolution RS images, including image/region captioning, VQA, scene classification, visually grounded conversations, and referring detection. It is based on LLaVA-1.5 and Vicuna-v1.5, with a CLIP-ViT(L-14) visual backbone and LoRA fine-tuning.",
        "confidence": 0.7867
    },
    "paper_link": {
        "value": "https://arxiv.org/abs/2311.15826",
        "confidence": 0.9999
    },
    "citations": {
        "value": null,
        "confidence": 0.0
    },
    "repository": {
        "value": null,
        "confidence": 0.0
    },
    "weights": {
        "value": null,
        "confidence": 0.0
    },
    "backbone": {
        "value": "CLIP-ViT(L-14)",
        "confidence": 0.9996
    },
    "num_layers": {
        "value": null,
        "confidence": 0.0
    },
    "num_parameters": {
        "value": null,
        "confidence": 0.0
    },
    "pretext_training_type": {
        "value": "Instruction tuning with multimodal remote sensing data",
        "confidence": 0.6563
    },
    "masking_strategy": {
        "value": null,
        "confidence": 0.0
    },
    "pretraining": {
        "value": "Initialized with pretrained CLIP-ViT(L-14) encoder, pretrained MLP adaptor, and Vicuna-v1.5 LLM. LoRA fine-tuning is applied to the LLM for remote sensing tasks, while the visual backbone and adaptor are frozen.",
        "confidence": 0.737
    },
    "domain_knowledge": {
        "value": [
            {
                "value": "Remote sensing imagery",
                "confidence": 0.5528
            },
            {
                "value": "Spatial location representation",
                "confidence": 0.6282
            },
            {
                "value": "Region-level reasoning",
                "confidence": 0.6948
            },
            {
                "value": "Object attribute extraction",
                "confidence": 0.4266
            },
            {
                "value": "Object relationship modeling",
                "confidence": 0.4628
            }
        ]
    },
    "backbone_modifications": {
        "value": [
            {
                "value": "Interpolated positional encoding to support 504x504 input resolution (1296 patches)",
                "confidence": 0.7312
            },
            {
                "value": "Task-specific prompt tokens for task switching",
                "confidence": 0.5269
            },
            {
                "value": "Spatial location representation in textual format",
                "confidence": 0.6066
            }
        ]
    },
    "supported_sensors": {
        "value": [
            {
                "value": null,
                "confidence": 0.3
            }
        ]
    },
    "modality_integration_type": {
        "value": "Homogeneous Multimodal",
        "confidence": 0.8983
    },
    "modalities": {
        "value": [
            {
                "value": "Multispectral",
                "confidence": 0.8976
            },
            {
                "value": "RGB",
                "confidence": 0.5332
            },
            {
                "value": "Text",
                "confidence": 0.8101
            }
        ]
    },
    "spectral_alignment": {
        "value": "none",
        "confidence": 0.9874
    },
    "temporal_alignment": {
        "value": "none",
        "confidence": 1.0
    },
    "spatial_resolution": {
        "value": "variable (supports up to 504x504 input images)",
        "confidence": 0.7254
    },
    "temporal_resolution": {
        "value": null,
        "confidence": 0.0
    },
    "bands": {
        "value": [
            {
                "value": null,
                "confidence": 0.3
            }
        ]
    },
    "pretraining_phases": [
        {
            "dataset": {
                "value": "SAMRS (DOTA, DIOR, FAIR1M), NWPU-RESISC-45, LRBEN (RSVQA), Floodnet",
                "confidence": 0.3584
            },
            "regions_coverage": {
                "value": [
                    {
                        "value": null,
                        "confidence": 0.3
                    }
                ]
            },
            "time_range": {
                "value": null,
                "confidence": 0.0
            },
            "num_images": {
                "value": 318000,
                "confidence": 0.952
            },
            "token_size": {
                "value": null,
                "confidence": 0.0
            },
            "image_resolution": {
                "value": "504x504",
                "confidence": 0.614
            },
            "epochs": {
                "value": 1,
                "confidence": 0.9527
            },
            "batch_size": {
                "value": 144,
                "confidence": 1.0
            },
            "learning_rate": {
                "value": "cosine scheduler",
                "confidence": 0.6154
            },
            "augmentations": {
                "value": []
            },
            "processing": {
                "value": []
            },
            "sampling": {
                "value": null,
                "confidence": 0.0
            },
            "processing_level": {
                "value": null,
                "confidence": 0.0
            },
            "cloud_cover": {
                "value": null,
                "confidence": 0.0
            },
            "missing_data": {
                "value": null,
                "confidence": 0.0
            },
            "masking_ratio": {
                "value": null,
                "confidence": 0.0
            }
        }
    ],
    "benchmarks": [
        {
            "task": {
                "value": "Scene classification",
                "confidence": 0.6323
            },
            "application": {
                "value": "Remote sensing scene classification",
                "confidence": 0.4133
            },
            "dataset": {
                "value": "AID",
                "confidence": 0.9157
            },
            "metrics": {
                "value": [
                    {
                        "value": "Accuracy",
                        "confidence": 0.7256
                    }
                ]
            },
            "metrics_value": {
                "value": [
                    {
                        "value": 72.03,
                        "confidence": 1.0
                    }
                ]
            },
            "sensor": {
                "value": [
                    {
                        "value": null,
                        "confidence": 0.3
                    }
                ]
            },
            "regions": {
                "value": [
                    {
                        "value": null,
                        "confidence": 0.3
                    }
                ]
            },
            "original_samples": {
                "value": 10000,
                "confidence": 1.0
            },
            "num_samples": {
                "value": 2000,
                "confidence": 1.0
            },
            "sampling_percentage": {
                "value": 20,
                "confidence": 1.0
            },
            "num_classes": {
                "value": 30,
                "confidence": 1.0
            },
            "classes": {
                "value": [
                    {
                        "value": null,
                        "confidence": 0.3
                    }
                ]
            },
            "image_resolution": {
                "value": "variable",
                "confidence": 0.952
            },
            "spatial_resolution": {
                "value": null,
                "confidence": 0.0
            },
            "bands_used": {
                "value": [
                    {
                        "value": null,
                        "confidence": 0.3
                    }
                ]
            },
            "augmentations": {
                "value": []
            },
            "optimizer": {
                "value": "AdamW",
                "confidence": 0.9166
            },
            "batch_size": {
                "value": 144,
                "confidence": 1.0
            },
            "learning_rate": {
                "value": null,
                "confidence": 0.0
            },
            "epochs": {
                "value": 1,
                "confidence": 1.0
            },
            "loss_function": {
                "value": null,
                "confidence": 0.0
            },
            "split_ratio": {
                "value": "80/20",
                "confidence": 0.6514
            }
        },
        {
            "task": {
                "value": "Scene classification",
                "confidence": 1.0
            },
            "application": {
                "value": "Remote sensing scene classification",
                "confidence": 0.916
            },
            "dataset": {
                "value": "UCMerced",
                "confidence": 0.9999
            },
            "metrics": {
                "value": [
                    {
                        "value": "Accuracy",
                        "confidence": 1.0
                    }
                ]
            },
            "metrics_value": {
                "value": [
                    {
                        "value": 84.43,
                        "confidence": 1.0
                    }
                ]
            },
            "sensor": {
                "value": [
                    {
                        "value": null,
                        "confidence": 0.3
                    }
                ]
            },
            "regions": {
                "value": [
                    {
                        "value": null,
                        "confidence": 0.3
                    }
                ]
            },
            "original_samples": {
                "value": 2100,
                "confidence": 1.0
            },
            "num_samples": {
                "value": 2100,
                "confidence": 1.0
            },
            "sampling_percentage": {
                "value": 100,
                "confidence": 1.0
            },
            "num_classes": {
                "value": 21,
                "confidence": 1.0
            },
            "classes": {
                "value": [
                    {
                        "value": null,
                        "confidence": 0.3
                    }
                ]
            },
            "image_resolution": {
                "value": "256x256",
                "confidence": 0.9998
            },
            "spatial_resolution": {
                "value": null,
                "confidence": 0.0
            },
            "bands_used": {
                "value": [
                    {
                        "value": null,
                        "confidence": 0.3
                    }
                ]
            },
            "augmentations": {
                "value": []
            },
            "optimizer": {
                "value": "AdamW",
                "confidence": 0.9947
            },
            "batch_size": {
                "value": 144,
                "confidence": 1.0
            },
            "learning_rate": {
                "value": null,
                "confidence": 0.0
            },
            "epochs": {
                "value": 1,
                "confidence": 1.0
            },
            "loss_function": {
                "value": null,
                "confidence": 0.0
            },
            "split_ratio": {
                "value": "100/0",
                "confidence": 0.511
            }
        },
        {
            "task": {
                "value": "Visual Question Answering",
                "confidence": 0.7315
            },
            "application": {
                "value": "Remote sensing VQA",
                "confidence": 0.7267
            },
            "dataset": {
                "value": "RSVQA-LRBEN",
                "confidence": 0.9894
            },
            "metrics": {
                "value": [
                    {
                        "value": "Accuracy",
                        "confidence": 0.6966
                    }
                ]
            },
            "metrics_value": {
                "value": [
                    {
                        "value": 90.7,
                        "confidence": 0.7669
                    }
                ]
            },
            "sensor": {
                "value": [
                    {
                        "value": null,
                        "confidence": 0.3
                    }
                ]
            },
            "regions": {
                "value": [
                    {
                        "value": null,
                        "confidence": 0.3
                    }
                ]
            },
            "original_samples": {
                "value": 772,
                "confidence": 0.9121
            },
            "num_samples": {
                "value": 86,
                "confidence": 0.6533
            },
            "sampling_percentage": {
                "value": 11.1,
                "confidence": 0.9827
            },
            "num_classes": {
                "value": null,
                "confidence": 0.0
            },
            "classes": {
                "value": [
                    {
                        "value": null,
                        "confidence": 0.3
                    }
                ]
            },
            "image_resolution": {
                "value": "256x256",
                "confidence": 0.9971
            },
            "spatial_resolution": {
                "value": null,
                "confidence": 0.0
            },
            "bands_used": {
                "value": [
                    {
                        "value": null,
                        "confidence": 0.3
                    }
                ]
            },
            "augmentations": {
                "value": []
            },
            "optimizer": {
                "value": "AdamW",
                "confidence": 0.9469
            },
            "batch_size": {
                "value": 144,
                "confidence": 1.0
            },
            "learning_rate": {
                "value": null,
                "confidence": 0.0
            },
            "epochs": {
                "value": 1,
                "confidence": 1.0
            },
            "loss_function": {
                "value": null,
                "confidence": 0.0
            },
            "split_ratio": {
                "value": "77.8/11.1/11.1",
                "confidence": 0.7768
            }
        },
        {
            "task": {
                "value": "Visual Question Answering",
                "confidence": 0.9997
            },
            "application": {
                "value": "Remote sensing VQA",
                "confidence": 0.9974
            },
            "dataset": {
                "value": "RSVQA-HRBEN",
                "confidence": 1.0
            },
            "metrics": {
                "value": [
                    {
                        "value": "Accuracy",
                        "confidence": 0.8637
                    }
                ]
            },
            "metrics_value": {
                "value": [
                    {
                        "value": 72.3,
                        "confidence": 0.8722
                    }
                ]
            },
            "sensor": {
                "value": [
                    {
                        "value": null,
                        "confidence": 0.3
                    }
                ]
            },
            "regions": {
                "value": [
                    {
                        "value": null,
                        "confidence": 0.3
                    }
                ]
            },
            "original_samples": {
                "value": 10569,
                "confidence": 0.9955
            },
            "num_samples": {
                "value": 47000,
                "confidence": 0.8976
            },
            "sampling_percentage": {
                "value": null,
                "confidence": 0.0
            },
            "num_classes": {
                "value": null,
                "confidence": 0.0
            },
            "classes": {
                "value": [
                    {
                        "value": null,
                        "confidence": 0.3
                    }
                ]
            },
            "image_resolution": {
                "value": null,
                "confidence": 0.0
            },
            "spatial_resolution": {
                "value": null,
                "confidence": 0.0
            },
            "bands_used": {
                "value": [
                    {
                        "value": null,
                        "confidence": 0.3
                    }
                ]
            },
            "augmentations": {
                "value": []
            },
            "optimizer": {
                "value": "AdamW",
                "confidence": 0.9983
            },
            "batch_size": {
                "value": 144,
                "confidence": 1.0
            },
            "learning_rate": {
                "value": null,
                "confidence": 0.0
            },
            "epochs": {
                "value": 1,
                "confidence": 1.0
            },
            "loss_function": {
                "value": null,
                "confidence": 0.0
            },
            "split_ratio": {
                "value": null,
                "confidence": 0.0
            }
        },
        {
            "task": {
                "value": "Visual Grounding",
                "confidence": 0.8958
            },
            "application": {
                "value": "Grounded object detection and referring expression",
                "confidence": 0.3216
            },
            "dataset": {
                "value": "SAMRS (validation set)",
                "confidence": 0.3255
            },
            "metrics": {
                "value": [
                    {
                        "value": "Accuracy@0.5",
                        "confidence": 0.6391
                    }
                ]
            },
            "metrics_value": {
                "value": [
                    {
                        "value": 10.6,
                        "confidence": 0.9412
                    }
                ]
            },
            "sensor": {
                "value": [
                    {
                        "value": null,
                        "confidence": 0.3
                    }
                ]
            },
            "regions": {
                "value": [
                    {
                        "value": null,
                        "confidence": 0.3
                    }
                ]
            },
            "original_samples": {
                "value": null,
                "confidence": 0.0
            },
            "num_samples": {
                "value": 7653,
                "confidence": 0.5713
            },
            "sampling_percentage": {
                "value": null,
                "confidence": 0.0
            },
            "num_classes": {
                "value": null,
                "confidence": 0.0
            },
            "classes": {
                "value": [
                    {
                        "value": null,
                        "confidence": 0.3
                    }
                ]
            },
            "image_resolution": {
                "value": null,
                "confidence": 0.0
            },
            "spatial_resolution": {
                "value": null,
                "confidence": 0.0
            },
            "bands_used": {
                "value": [
                    {
                        "value": null,
                        "confidence": 0.3
                    }
                ]
            },
            "augmentations": {
                "value": []
            },
            "optimizer": {
                "value": null,
                "confidence": 0.0
            },
            "batch_size": {
                "value": null,
                "confidence": 0.0
            },
            "learning_rate": {
                "value": null,
                "confidence": 0.0
            },
            "epochs": {
                "value": null,
                "confidence": 0.0
            },
            "loss_function": {
                "value": null,
                "confidence": 0.0
            },
            "split_ratio": {
                "value": null,
                "confidence": 0.0
            }
        },
        {
            "task": {
                "value": "Grounding Description",
                "confidence": 0.6686
            },
            "application": {
                "value": "Grounded image description",
                "confidence": 0.399
            },
            "dataset": {
                "value": "SAMRS (validation set)",
                "confidence": 0.8965
            },
            "metrics": {
                "value": [
                    {
                        "value": "Accuracy@0.5",
                        "confidence": 0.8943
                    },
                    {
                        "value": "Accuracy@0.25",
                        "confidence": 0.9584
                    },
                    {
                        "value": "METEOR",
                        "confidence": 1.0
                    }
                ]
            },
            "metrics_value": {
                "value": [
                    {
                        "value": 11.7,
                        "confidence": 1.0
                    },
                    {
                        "value": 33.9,
                        "confidence": 1.0
                    },
                    {
                        "value": 48.9,
                        "confidence": 1.0
                    }
                ]
            },
            "sensor": {
                "value": [
                    {
                        "value": null,
                        "confidence": 0.3
                    }
                ]
            },
            "regions": {
                "value": [
                    {
                        "value": null,
                        "confidence": 0.3
                    }
                ]
            },
            "original_samples": {
                "value": null,
                "confidence": 0.0
            },
            "num_samples": {
                "value": 555,
                "confidence": 0.9992
            },
            "sampling_percentage": {
                "value": null,
                "confidence": 0.0
            },
            "num_classes": {
                "value": null,
                "confidence": 0.0
            },
            "classes": {
                "value": [
                    {
                        "value": null,
                        "confidence": 0.3
                    }
                ]
            },
            "image_resolution": {
                "value": null,
                "confidence": 0.0
            },
            "spatial_resolution": {
                "value": null,
                "confidence": 0.0
            },
            "bands_used": {
                "value": [
                    {
                        "value": null,
                        "confidence": 0.3
                    }
                ]
            },
            "augmentations": {
                "value": []
            },
            "optimizer": {
                "value": null,
                "confidence": 0.0
            },
            "batch_size": {
                "value": null,
                "confidence": 0.0
            },
            "learning_rate": {
                "value": null,
                "confidence": 0.0
            },
            "epochs": {
                "value": null,
                "confidence": 0.0
            },
            "loss_function": {
                "value": null,
                "confidence": 0.0
            },
            "split_ratio": {
                "value": null,
                "confidence": 0.0
            }
        },
        {
            "task": {
                "value": "Region-level Captioning",
                "confidence": 0.6306
            },
            "application": {
                "value": "Region captioning",
                "confidence": 0.5411
            },
            "dataset": {
                "value": "SAMRS (validation set)",
                "confidence": 0.9942
            },
            "metrics": {
                "value": [
                    {
                        "value": "ROUGE-1",
                        "confidence": 1.0
                    },
                    {
                        "value": "ROUGE-L",
                        "confidence": 1.0
                    },
                    {
                        "value": "METEOR",
                        "confidence": 1.0
                    }
                ]
            },
            "metrics_value": {
                "value": [
                    {
                        "value": 87.3,
                        "confidence": 1.0
                    },
                    {
                        "value": 87.2,
                        "confidence": 1.0
                    },
                    {
                        "value": 83.9,
                        "confidence": 1.0
                    }
                ]
            },
            "sensor": {
                "value": [
                    {
                        "value": null,
                        "confidence": 0.3
                    }
                ]
            },
            "regions": {
                "value": [
                    {
                        "value": null,
                        "confidence": 0.3
                    }
                ]
            },
            "original_samples": {
                "value": null,
                "confidence": 0.0
            },
            "num_samples": {
                "value": null,
                "confidence": 0.0
            },
            "sampling_percentage": {
                "value": null,
                "confidence": 0.0
            },
            "num_classes": {
                "value": null,
                "confidence": 0.0
            },
            "classes": {
                "value": [
                    {
                        "value": null,
                        "confidence": 0.3
                    }
                ]
            },
            "image_resolution": {
                "value": null,
                "confidence": 0.0
            },
            "spatial_resolution": {
                "value": null,
                "confidence": 0.0
            },
            "bands_used": {
                "value": [
                    {
                        "value": null,
                        "confidence": 0.3
                    }
                ]
            },
            "augmentations": {
                "value": []
            },
            "optimizer": {
                "value": null,
                "confidence": 0.0
            },
            "batch_size": {
                "value": null,
                "confidence": 0.0
            },
            "learning_rate": {
                "value": null,
                "confidence": 0.0
            },
            "epochs": {
                "value": null,
                "confidence": 0.0
            },
            "loss_function": {
                "value": null,
                "confidence": 0.0
            },
            "split_ratio": {
                "value": null,
                "confidence": 0.0
            }
        }
    ]
}