{
    "model_id": {
        "value": "UniRS",
        "confidence": 0.895
    },
    "model_name": {
        "value": "UniRS",
        "confidence": 1.0
    },
    "version": {
        "value": null,
        "confidence": 0.0
    },
    "release_date": {
        "value": null,
        "confidence": 0.0
    },
    "last_updated": {
        "value": null,
        "confidence": 0.0
    },
    "short_description": {
        "value": "UniRS is the first vision-language model unifying multi-temporal remote sensing tasks across various types of visual input, supporting single images, dual-time image pairs, and videos as input, enabling comprehensive remote sensing temporal analysis within a unified framework.",
        "confidence": 0.9033
    },
    "paper_link": {
        "value": null,
        "confidence": 0.2
    },
    "citations": {
        "value": null,
        "confidence": 0.0
    },
    "repository": {
        "value": null,
        "confidence": 0.0
    },
    "weights": {
        "value": null,
        "confidence": 0.0
    },
    "backbone": {
        "value": "VILA-1.5 (3B) with SigLIP visual encoder and Sheared-LLaMA (3B) language decoder",
        "confidence": 0.8572
    },
    "num_layers": {
        "value": null,
        "confidence": 0.0
    },
    "num_parameters": {
        "value": 3000000000,
        "confidence": 0.8989
    },
    "pretext_training_type": {
        "value": "Instruction tuning with prompt augmentation and multi-task joint fine-tuning",
        "confidence": 0.7453
    },
    "masking_strategy": {
        "value": null,
        "confidence": 0.0
    },
    "pretraining": {
        "value": "Joint instruction fine-tuning on a mixture of remote sensing datasets (GeoChat-Instruct, LEVIR-CC, ERA) with prompt augmentation and a dedicated change extraction module for dual-time image pairs.",
        "confidence": 0.7759
    },
    "domain_knowledge": {
        "value": [
            {
                "value": "Prompt augmentation mechanism",
                "confidence": 0.5059
            },
            {
                "value": "Change Extraction module for dual-time image pairs",
                "confidence": 0.6778
            }
        ]
    },
    "backbone_modifications": {
        "value": [
            {
                "value": "Addition of Change Extraction module for dual-time image pairs",
                "confidence": 0.7641
            },
            {
                "value": "Prompt augmentation mechanism",
                "confidence": 0.8841
            },
            {
                "value": "Unified visual embedding representation for multi-temporal inputs",
                "confidence": 0.4914
            }
        ]
    },
    "supported_sensors": {
        "value": null,
        "confidence": 0.0
    },
    "modality_integration_type": {
        "value": "Homogeneous Multimodal",
        "confidence": 0.9393
    },
    "modalities": {
        "value": [
            {
                "value": "Multispectral",
                "confidence": 0.9772
            },
            {
                "value": "Optical",
                "confidence": 0.4942
            },
            {
                "value": "SAR",
                "confidence": 0.701
            },
            {
                "value": "Infrared",
                "confidence": 0.8935
            },
            {
                "value": "Video",
                "confidence": 0.9558
            }
        ]
    },
    "spectral_alignment": {
        "value": "none",
        "confidence": 0.7873
    },
    "temporal_alignment": {
        "value": "partial",
        "confidence": 0.8095
    },
    "spatial_resolution": {
        "value": null,
        "confidence": 0.0
    },
    "temporal_resolution": {
        "value": null,
        "confidence": 0.0
    },
    "bands": {
        "value": null,
        "confidence": 0.0
    },
    "pretraining_phases": [
        {
            "dataset": {
                "value": "VILA-1.5 (3B) pretraining data (large-scale interleaved image-text data)",
                "confidence": 0.311
            },
            "regions_coverage": {
                "value": null,
                "confidence": 0.0
            },
            "time_range": {
                "value": null,
                "confidence": 0.0
            },
            "num_images": {
                "value": null,
                "confidence": 0.0
            },
            "token_size": {
                "value": null,
                "confidence": 0.0
            },
            "image_resolution": {
                "value": null,
                "confidence": 0.0
            },
            "epochs": {
                "value": null,
                "confidence": 0.0
            },
            "batch_size": {
                "value": null,
                "confidence": 0.0
            },
            "learning_rate": {
                "value": null,
                "confidence": 0.0
            },
            "augmentations": {
                "value": null,
                "confidence": 0.0
            },
            "processing": {
                "value": null,
                "confidence": 0.0
            },
            "sampling": {
                "value": null,
                "confidence": 0.0
            },
            "processing_level": {
                "value": null,
                "confidence": 0.0
            },
            "cloud_cover": {
                "value": null,
                "confidence": 0.0
            },
            "missing_data": {
                "value": null,
                "confidence": 0.0
            },
            "masking_ratio": {
                "value": null,
                "confidence": 0.0
            }
        }
    ],
    "benchmarks": [
        {
            "task": {
                "value": "Visual Question Answering",
                "confidence": 0.9306
            },
            "application": {
                "value": "Remote sensing image understanding",
                "confidence": 0.3561
            },
            "dataset": {
                "value": "RSVQA-LR",
                "confidence": 0.9999
            },
            "metrics": {
                "value": [
                    {
                        "value": "Presence Accuracy",
                        "confidence": 0.5496
                    },
                    {
                        "value": "Comparison Accuracy",
                        "confidence": 1.0
                    },
                    {
                        "value": "Rural/Urban Accuracy",
                        "confidence": 0.9977
                    },
                    {
                        "value": "Average Accuracy",
                        "confidence": 0.7345
                    }
                ]
            },
            "metrics_value": {
                "value": [
                    {
                        "value": 91.64,
                        "confidence": 0.9994
                    },
                    {
                        "value": 92.68,
                        "confidence": 1.0
                    },
                    {
                        "value": 90.0,
                        "confidence": 0.6224
                    },
                    {
                        "value": 92.21,
                        "confidence": 1.0
                    }
                ]
            },
            "sensor": {
                "value": null,
                "confidence": 0.0
            },
            "regions": {
                "value": null,
                "confidence": 0.0
            },
            "original_samples": {
                "value": 772,
                "confidence": 0.9998
            },
            "num_samples": {
                "value": 772,
                "confidence": 0.8738
            },
            "sampling_percentage": {
                "value": 100,
                "confidence": 0.9971
            },
            "num_classes": {
                "value": null,
                "confidence": 0.0
            },
            "classes": {
                "value": null,
                "confidence": 0.0
            },
            "image_resolution": {
                "value": "256x256",
                "confidence": 0.9941
            },
            "spatial_resolution": {
                "value": null,
                "confidence": 0.0
            },
            "bands_used": {
                "value": null,
                "confidence": 0.0
            },
            "augmentations": {
                "value": null,
                "confidence": 0.0
            },
            "optimizer": {
                "value": "AdamW",
                "confidence": 0.9166
            },
            "batch_size": {
                "value": 128,
                "confidence": 1.0
            },
            "learning_rate": {
                "value": 0.0001,
                "confidence": 0.9998
            },
            "epochs": {
                "value": 1,
                "confidence": 0.9995
            },
            "loss_function": {
                "value": "Cross-entropy",
                "confidence": 0.7468
            },
            "split_ratio": {
                "value": null,
                "confidence": 0.0
            }
        },
        {
            "task": {
                "value": "Visual Question Answering",
                "confidence": 1.0
            },
            "application": {
                "value": "Remote sensing image understanding",
                "confidence": 0.9832
            },
            "dataset": {
                "value": "RSVQA-HR",
                "confidence": 0.9989
            },
            "metrics": {
                "value": [
                    {
                        "value": "Presence Accuracy",
                        "confidence": 0.9996
                    },
                    {
                        "value": "Comparison Accuracy",
                        "confidence": 1.0
                    },
                    {
                        "value": "Average Accuracy",
                        "confidence": 0.9971
                    }
                ]
            },
            "metrics_value": {
                "value": [
                    {
                        "value": 59.29,
                        "confidence": 1.0
                    },
                    {
                        "value": 84.05,
                        "confidence": 1.0
                    },
                    {
                        "value": 73.15,
                        "confidence": 1.0
                    }
                ]
            },
            "sensor": {
                "value": null,
                "confidence": 0.0
            },
            "regions": {
                "value": null,
                "confidence": 0.0
            },
            "original_samples": {
                "value": 10659,
                "confidence": 0.9995
            },
            "num_samples": {
                "value": null,
                "confidence": 0.0
            },
            "sampling_percentage": {
                "value": null,
                "confidence": 0.0
            },
            "num_classes": {
                "value": null,
                "confidence": 0.0
            },
            "classes": {
                "value": null,
                "confidence": 0.0
            },
            "image_resolution": {
                "value": "512x512",
                "confidence": 1.0
            },
            "spatial_resolution": {
                "value": null,
                "confidence": 0.0
            },
            "bands_used": {
                "value": null,
                "confidence": 0.0
            },
            "augmentations": {
                "value": null,
                "confidence": 0.0
            },
            "optimizer": {
                "value": "AdamW",
                "confidence": 0.9923
            },
            "batch_size": {
                "value": 128,
                "confidence": 1.0
            },
            "learning_rate": {
                "value": 0.0001,
                "confidence": 1.0
            },
            "epochs": {
                "value": 1,
                "confidence": 1.0
            },
            "loss_function": {
                "value": "Cross-entropy",
                "confidence": 1.0
            },
            "split_ratio": {
                "value": null,
                "confidence": 0.0
            }
        },
        {
            "task": {
                "value": "Visual Question Answering",
                "confidence": 1.0
            },
            "application": {
                "value": "Remote sensing image understanding",
                "confidence": 0.9511
            },
            "dataset": {
                "value": "CRSVQA",
                "confidence": 1.0
            },
            "metrics": {
                "value": [
                    {
                        "value": "Overall Accuracy",
                        "confidence": 0.9792
                    }
                ]
            },
            "metrics_value": {
                "value": [
                    {
                        "value": 86.67,
                        "confidence": 1.0
                    }
                ]
            },
            "sensor": {
                "value": null,
                "confidence": 0.0
            },
            "regions": {
                "value": null,
                "confidence": 0.0
            },
            "original_samples": {
                "value": 4639,
                "confidence": 0.9978
            },
            "num_samples": {
                "value": 1000,
                "confidence": 0.9977
            },
            "sampling_percentage": {
                "value": null,
                "confidence": 0.0
            },
            "num_classes": {
                "value": null,
                "confidence": 0.0
            },
            "classes": {
                "value": null,
                "confidence": 0.0
            },
            "image_resolution": {
                "value": "600x600",
                "confidence": 1.0
            },
            "spatial_resolution": {
                "value": null,
                "confidence": 0.0
            },
            "bands_used": {
                "value": null,
                "confidence": 0.0
            },
            "augmentations": {
                "value": null,
                "confidence": 0.0
            },
            "optimizer": {
                "value": "AdamW",
                "confidence": 0.9991
            },
            "batch_size": {
                "value": 128,
                "confidence": 1.0
            },
            "learning_rate": {
                "value": 0.0001,
                "confidence": 1.0
            },
            "epochs": {
                "value": 1,
                "confidence": 1.0
            },
            "loss_function": {
                "value": "Cross-entropy",
                "confidence": 1.0
            },
            "split_ratio": {
                "value": null,
                "confidence": 0.0
            }
        },
        {
            "task": {
                "value": "Change Captioning",
                "confidence": 0.9986
            },
            "application": {
                "value": "Remote sensing change detection and description",
                "confidence": 0.4149
            },
            "dataset": {
                "value": "LEVIR-CC",
                "confidence": 1.0
            },
            "metrics": {
                "value": [
                    {
                        "value": "CIDEr-D",
                        "confidence": 0.9998
                    }
                ]
            },
            "metrics_value": {
                "value": [
                    {
                        "value": 139.12,
                        "confidence": 1.0
                    }
                ]
            },
            "sensor": {
                "value": null,
                "confidence": 0.0
            },
            "regions": {
                "value": null,
                "confidence": 0.0
            },
            "original_samples": {
                "value": 10077,
                "confidence": 0.9734
            },
            "num_samples": {
                "value": 1929,
                "confidence": 0.9855
            },
            "sampling_percentage": {
                "value": null,
                "confidence": 0.0
            },
            "num_classes": {
                "value": null,
                "confidence": 0.0
            },
            "classes": {
                "value": null,
                "confidence": 0.0
            },
            "image_resolution": {
                "value": null,
                "confidence": 0.0
            },
            "spatial_resolution": {
                "value": null,
                "confidence": 0.0
            },
            "bands_used": {
                "value": null,
                "confidence": 0.0
            },
            "augmentations": {
                "value": null,
                "confidence": 0.0
            },
            "optimizer": {
                "value": "AdamW",
                "confidence": 0.9998
            },
            "batch_size": {
                "value": 128,
                "confidence": 1.0
            },
            "learning_rate": {
                "value": 0.0001,
                "confidence": 1.0
            },
            "epochs": {
                "value": 1,
                "confidence": 1.0
            },
            "loss_function": {
                "value": "Cross-entropy",
                "confidence": 1.0
            },
            "split_ratio": {
                "value": null,
                "confidence": 0.0
            }
        },
        {
            "task": {
                "value": "Video Scene Classification",
                "confidence": 0.9898
            },
            "application": {
                "value": "Remote sensing video event recognition",
                "confidence": 0.4462
            },
            "dataset": {
                "value": "ERA",
                "confidence": 1.0
            },
            "metrics": {
                "value": [
                    {
                        "value": "Overall Accuracy",
                        "confidence": 0.9899
                    }
                ]
            },
            "metrics_value": {
                "value": [
                    {
                        "value": 87.8,
                        "confidence": 1.0
                    }
                ]
            },
            "sensor": {
                "value": null,
                "confidence": 0.0
            },
            "regions": {
                "value": null,
                "confidence": 0.0
            },
            "original_samples": {
                "value": 2864,
                "confidence": 1.0
            },
            "num_samples": {
                "value": 1391,
                "confidence": 1.0
            },
            "sampling_percentage": {
                "value": null,
                "confidence": 0.0
            },
            "num_classes": {
                "value": 25,
                "confidence": 1.0
            },
            "classes": {
                "value": [
                    {
                        "value": "post-earthquake",
                        "confidence": 0.9183
                    },
                    {
                        "value": "flood",
                        "confidence": 1.0
                    },
                    {
                        "value": "fire",
                        "confidence": 1.0
                    },
                    {
                        "value": "landslide",
                        "confidence": 1.0
                    },
                    {
                        "value": "mudslide",
                        "confidence": 1.0
                    },
                    {
                        "value": "traffic collision",
                        "confidence": 0.9999
                    },
                    {
                        "value": "traffic congestion",
                        "confidence": 1.0
                    },
                    {
                        "value": "harvesting",
                        "confidence": 1.0
                    },
                    {
                        "value": "ploughing",
                        "confidence": 1.0
                    },
                    {
                        "value": "constructing",
                        "confidence": 1.0
                    },
                    {
                        "value": "police chase",
                        "confidence": 1.0
                    },
                    {
                        "value": "conflict",
                        "confidence": 1.0
                    },
                    {
                        "value": "baseball",
                        "confidence": 1.0
                    },
                    {
                        "value": "basketball",
                        "confidence": 1.0
                    },
                    {
                        "value": "boating",
                        "confidence": 1.0
                    },
                    {
                        "value": "cycling",
                        "confidence": 1.0
                    },
                    {
                        "value": "running",
                        "confidence": 1.0
                    },
                    {
                        "value": "soccer",
                        "confidence": 1.0
                    },
                    {
                        "value": "swimming",
                        "confidence": 1.0
                    },
                    {
                        "value": "car racing",
                        "confidence": 1.0
                    },
                    {
                        "value": "party",
                        "confidence": 1.0
                    },
                    {
                        "value": "concert",
                        "confidence": 1.0
                    },
                    {
                        "value": "parade/protest",
                        "confidence": 1.0
                    },
                    {
                        "value": "religious activity",
                        "confidence": 1.0
                    },
                    {
                        "value": "non-event",
                        "confidence": 1.0
                    }
                ]
            },
            "image_resolution": {
                "value": null,
                "confidence": 0.0
            },
            "spatial_resolution": {
                "value": null,
                "confidence": 0.0
            },
            "bands_used": {
                "value": null,
                "confidence": 0.0
            },
            "augmentations": {
                "value": null,
                "confidence": 0.0
            },
            "optimizer": {
                "value": "AdamW",
                "confidence": 1.0
            },
            "batch_size": {
                "value": 128,
                "confidence": 1.0
            },
            "learning_rate": {
                "value": 0.0001,
                "confidence": 1.0
            },
            "epochs": {
                "value": 1,
                "confidence": 1.0
            },
            "loss_function": {
                "value": "Cross-entropy",
                "confidence": 1.0
            },
            "split_ratio": {
                "value": null,
                "confidence": 0.0
            }
        }
    ]
}