{
    "model_id": {
        "value": "aquila-rs-vlm",
        "confidence": 0.7435
    },
    "model_name": {
        "value": "Aquila",
        "confidence": 1.0
    },
    "version": {
        "value": null,
        "confidence": 0.0
    },
    "release_date": {
        "value": null,
        "confidence": 0.0
    },
    "last_updated": {
        "value": null,
        "confidence": 0.0
    },
    "short_description": {
        "value": "Aquila is a hierarchically aligned visual-language model for remote sensing, featuring a high-resolution, multi-scale vision encoder (CLIP-ConvNeXt-XXL), a Hierarchical Spatial Feature Integration (SFI) module, and a Multi-layer Deep Alignment (MDA) strategy for robust visual-language feature fusion. It achieves state-of-the-art results on remote sensing image captioning and VQA benchmarks.",
        "confidence": 0.7849
    },
    "paper_link": {
        "value": null,
        "confidence": 0.0
    },
    "citations": {
        "value": null,
        "confidence": 0.0
    },
    "repository": {
        "value": null,
        "confidence": 0.0
    },
    "weights": {
        "value": null,
        "confidence": 0.0
    },
    "backbone": {
        "value": "CLIP-ConvNeXt-XXL",
        "confidence": 0.9974
    },
    "num_layers": {
        "value": null,
        "confidence": 0.0
    },
    "num_parameters": {
        "value": null,
        "confidence": 0.0
    },
    "pretext_training_type": {
        "value": "Image-Text Alignment Pre-training and Instruction Fine-tuning",
        "confidence": 0.6945
    },
    "masking_strategy": {
        "value": null,
        "confidence": 0.0
    },
    "pretraining": {
        "value": "Two-stage: (1) Alignment pretraining with frozen vision encoder and LLM, only SFI trainable, on 1M image-text pairs; (2) Instruction fine-tuning with LoRA on LLM, SFI trainable, vision encoder frozen, on 1.8M instruction image-text pairs.",
        "confidence": 0.8041
    },
    "domain_knowledge": {
        "value": [
            {
                "value": "Remote sensing-specific image-text pairs",
                "confidence": 0.7883
            },
            {
                "value": "Domain-specific instruction datasets",
                "confidence": 0.5148
            }
        ]
    },
    "backbone_modifications": {
        "value": [
            {
                "value": "Removed final classification layer from vision encoder",
                "confidence": 0.7512
            },
            {
                "value": "Added four projectors (2-layer MLPs) for multi-scale feature mapping",
                "confidence": 0.6386
            }
        ]
    },
    "supported_sensors": {
        "value": [
            {
                "value": null,
                "confidence": 0.3
            }
        ]
    },
    "modality_integration_type": {
        "value": "Homogeneous Multimodal",
        "confidence": 0.9322
    },
    "modalities": {
        "value": [
            {
                "value": "Multispectral",
                "confidence": 0.9978
            },
            {
                "value": "Text",
                "confidence": 0.9002
            }
        ]
    },
    "spectral_alignment": {
        "value": "none",
        "confidence": 0.7733
    },
    "temporal_alignment": {
        "value": "none",
        "confidence": 1.0
    },
    "spatial_resolution": {
        "value": "up to 1024x1024 pixels",
        "confidence": 0.8406
    },
    "temporal_resolution": {
        "value": null,
        "confidence": 0.0
    },
    "bands": {
        "value": [
            {
                "value": null,
                "confidence": 0.3
            }
        ]
    },
    "pretraining_phases": [
        {
            "dataset": {
                "value": "CapERA, UCM, Sydney, NWPU, RSICD, RSITMD, RSVQA-HR, RSVQA-LR, WHU_RS19 (integrated, ~1M image-text pairs)",
                "confidence": 0.311
            },
            "regions_coverage": {
                "value": [
                    {
                        "value": null,
                        "confidence": 0.3
                    }
                ]
            },
            "time_range": {
                "value": null,
                "confidence": 0.0
            },
            "num_images": {
                "value": 1000000,
                "confidence": 1.0
            },
            "token_size": {
                "value": "1024",
                "confidence": 0.9987
            },
            "image_resolution": {
                "value": "1024\u00d71024",
                "confidence": 0.9469
            },
            "epochs": {
                "value": 1,
                "confidence": 1.0
            },
            "batch_size": {
                "value": 32,
                "confidence": 1.0
            },
            "learning_rate": {
                "value": "1e-3",
                "confidence": 0.994
            },
            "augmentations": {
                "value": []
            },
            "processing": {
                "value": []
            },
            "sampling": {
                "value": null,
                "confidence": 0.0
            },
            "processing_level": {
                "value": null,
                "confidence": 0.0
            },
            "cloud_cover": {
                "value": null,
                "confidence": 0.0
            },
            "missing_data": {
                "value": null,
                "confidence": 0.0
            },
            "masking_ratio": {
                "value": null,
                "confidence": 0.0
            }
        },
        {
            "dataset": {
                "value": "FIT-RS (1.8M instruction image-text pairs)",
                "confidence": 0.6224
            },
            "regions_coverage": {
                "value": [
                    {
                        "value": null,
                        "confidence": 0.3
                    }
                ]
            },
            "time_range": {
                "value": null,
                "confidence": 0.0
            },
            "num_images": {
                "value": 1800000,
                "confidence": 1.0
            },
            "token_size": {
                "value": "1024",
                "confidence": 1.0
            },
            "image_resolution": {
                "value": "1024\u00d71024",
                "confidence": 1.0
            },
            "epochs": {
                "value": 1,
                "confidence": 1.0
            },
            "batch_size": {
                "value": 32,
                "confidence": 1.0
            },
            "learning_rate": {
                "value": "4e-5",
                "confidence": 1.0
            },
            "augmentations": {
                "value": []
            },
            "processing": {
                "value": []
            },
            "sampling": {
                "value": null,
                "confidence": 0.0
            },
            "processing_level": {
                "value": null,
                "confidence": 0.0
            },
            "cloud_cover": {
                "value": null,
                "confidence": 0.0
            },
            "missing_data": {
                "value": null,
                "confidence": 0.0
            },
            "masking_ratio": {
                "value": null,
                "confidence": 0.0
            }
        }
    ],
    "benchmarks": [
        {
            "task": {
                "value": "Image caption",
                "confidence": 0.721
            },
            "application": {
                "value": "Remote sensing image captioning",
                "confidence": 0.7867
            },
            "dataset": {
                "value": "RSICD",
                "confidence": 1.0
            },
            "metrics": {
                "value": [
                    {
                        "value": "BLEU-1",
                        "confidence": 0.5959
                    }
                ]
            },
            "metrics_value": {
                "value": [
                    {
                        "value": 74.6,
                        "confidence": 0.9999
                    }
                ]
            },
            "sensor": {
                "value": [
                    {
                        "value": null,
                        "confidence": 0.3
                    }
                ]
            },
            "regions": {
                "value": [
                    {
                        "value": null,
                        "confidence": 0.3
                    }
                ]
            },
            "original_samples": {
                "value": null,
                "confidence": 0.0
            },
            "num_samples": {
                "value": null,
                "confidence": 0.0
            },
            "sampling_percentage": {
                "value": null,
                "confidence": 0.0
            },
            "num_classes": {
                "value": null,
                "confidence": 0.0
            },
            "classes": {
                "value": []
            },
            "image_resolution": {
                "value": "224\u00d7224",
                "confidence": 0.9859
            },
            "spatial_resolution": {
                "value": "various",
                "confidence": 0.5992
            },
            "bands_used": {
                "value": [
                    {
                        "value": null,
                        "confidence": 0.3
                    }
                ]
            },
            "augmentations": {
                "value": []
            },
            "optimizer": {
                "value": "AdamW",
                "confidence": 1.0
            },
            "batch_size": {
                "value": 32,
                "confidence": 1.0
            },
            "learning_rate": {
                "value": 0.001,
                "confidence": 0.9023
            },
            "epochs": {
                "value": 1,
                "confidence": 1.0
            },
            "loss_function": {
                "value": "Next-token prediction loss",
                "confidence": 0.6821
            },
            "split_ratio": {
                "value": null,
                "confidence": 0.0
            }
        },
        {
            "task": {
                "value": "Image caption",
                "confidence": 1.0
            },
            "application": {
                "value": "Remote sensing image captioning",
                "confidence": 0.9856
            },
            "dataset": {
                "value": "Sydney",
                "confidence": 1.0
            },
            "metrics": {
                "value": [
                    {
                        "value": "BLEU-1",
                        "confidence": 1.0
                    }
                ]
            },
            "metrics_value": {
                "value": [
                    {
                        "value": 83.42,
                        "confidence": 1.0
                    }
                ]
            },
            "sensor": {
                "value": [
                    {
                        "value": null,
                        "confidence": 0.3
                    }
                ]
            },
            "regions": {
                "value": [
                    {
                        "value": null,
                        "confidence": 0.3
                    }
                ]
            },
            "original_samples": {
                "value": null,
                "confidence": 0.0
            },
            "num_samples": {
                "value": null,
                "confidence": 0.0
            },
            "sampling_percentage": {
                "value": null,
                "confidence": 0.0
            },
            "num_classes": {
                "value": null,
                "confidence": 0.0
            },
            "classes": {
                "value": []
            },
            "image_resolution": {
                "value": null,
                "confidence": 0.0
            },
            "spatial_resolution": {
                "value": null,
                "confidence": 0.0
            },
            "bands_used": {
                "value": [
                    {
                        "value": null,
                        "confidence": 0.3
                    }
                ]
            },
            "augmentations": {
                "value": []
            },
            "optimizer": {
                "value": "AdamW",
                "confidence": 1.0
            },
            "batch_size": {
                "value": 32,
                "confidence": 1.0
            },
            "learning_rate": {
                "value": 0.001,
                "confidence": 1.0
            },
            "epochs": {
                "value": 1,
                "confidence": 1.0
            },
            "loss_function": {
                "value": "Next-token prediction loss",
                "confidence": 1.0
            },
            "split_ratio": {
                "value": null,
                "confidence": 0.0
            }
        },
        {
            "task": {
                "value": "Image caption",
                "confidence": 1.0
            },
            "application": {
                "value": "Remote sensing image captioning",
                "confidence": 0.9999
            },
            "dataset": {
                "value": "UCM",
                "confidence": 1.0
            },
            "metrics": {
                "value": [
                    {
                        "value": "BLEU-1",
                        "confidence": 1.0
                    }
                ]
            },
            "metrics_value": {
                "value": [
                    {
                        "value": 88.25,
                        "confidence": 1.0
                    }
                ]
            },
            "sensor": {
                "value": [
                    {
                        "value": null,
                        "confidence": 0.3
                    }
                ]
            },
            "regions": {
                "value": [
                    {
                        "value": null,
                        "confidence": 0.3
                    }
                ]
            },
            "original_samples": {
                "value": null,
                "confidence": 0.0
            },
            "num_samples": {
                "value": null,
                "confidence": 0.0
            },
            "sampling_percentage": {
                "value": null,
                "confidence": 0.0
            },
            "num_classes": {
                "value": null,
                "confidence": 0.0
            },
            "classes": {
                "value": []
            },
            "image_resolution": {
                "value": null,
                "confidence": 0.0
            },
            "spatial_resolution": {
                "value": null,
                "confidence": 0.0
            },
            "bands_used": {
                "value": [
                    {
                        "value": null,
                        "confidence": 0.3
                    }
                ]
            },
            "augmentations": {
                "value": []
            },
            "optimizer": {
                "value": "AdamW",
                "confidence": 1.0
            },
            "batch_size": {
                "value": 32,
                "confidence": 1.0
            },
            "learning_rate": {
                "value": 0.001,
                "confidence": 1.0
            },
            "epochs": {
                "value": 1,
                "confidence": 1.0
            },
            "loss_function": {
                "value": "Next-token prediction loss",
                "confidence": 1.0
            },
            "split_ratio": {
                "value": null,
                "confidence": 0.0
            }
        },
        {
            "task": {
                "value": "Image caption",
                "confidence": 1.0
            },
            "application": {
                "value": "Fine-grained remote sensing image captioning",
                "confidence": 0.5455
            },
            "dataset": {
                "value": "FIT-RSFG-Captions",
                "confidence": 0.9969
            },
            "metrics": {
                "value": [
                    {
                        "value": "BLEU-1",
                        "confidence": 1.0
                    }
                ]
            },
            "metrics_value": {
                "value": [
                    {
                        "value": 35.08,
                        "confidence": 1.0
                    }
                ]
            },
            "sensor": {
                "value": [
                    {
                        "value": null,
                        "confidence": 0.3
                    }
                ]
            },
            "regions": {
                "value": [
                    {
                        "value": null,
                        "confidence": 0.3
                    }
                ]
            },
            "original_samples": {
                "value": null,
                "confidence": 0.0
            },
            "num_samples": {
                "value": null,
                "confidence": 0.0
            },
            "sampling_percentage": {
                "value": null,
                "confidence": 0.0
            },
            "num_classes": {
                "value": null,
                "confidence": 0.0
            },
            "classes": {
                "value": []
            },
            "image_resolution": {
                "value": null,
                "confidence": 0.0
            },
            "spatial_resolution": {
                "value": null,
                "confidence": 0.0
            },
            "bands_used": {
                "value": [
                    {
                        "value": null,
                        "confidence": 0.3
                    }
                ]
            },
            "augmentations": {
                "value": []
            },
            "optimizer": {
                "value": "AdamW",
                "confidence": 1.0
            },
            "batch_size": {
                "value": 32,
                "confidence": 1.0
            },
            "learning_rate": {
                "value": 0.001,
                "confidence": 0.9994
            },
            "epochs": {
                "value": 1,
                "confidence": 1.0
            },
            "loss_function": {
                "value": "Next-token prediction loss",
                "confidence": 1.0
            },
            "split_ratio": {
                "value": null,
                "confidence": 0.0
            }
        },
        {
            "task": {
                "value": "Visual question answering",
                "confidence": 0.9732
            },
            "application": {
                "value": "Remote sensing VQA",
                "confidence": 0.7384
            },
            "dataset": {
                "value": "RSVQA-LR",
                "confidence": 1.0
            },
            "metrics": {
                "value": [
                    {
                        "value": "Accuracy",
                        "confidence": 0.9978
                    }
                ]
            },
            "metrics_value": {
                "value": [
                    {
                        "value": 92.72,
                        "confidence": 1.0
                    }
                ]
            },
            "sensor": {
                "value": [
                    {
                        "value": null,
                        "confidence": 0.3
                    }
                ]
            },
            "regions": {
                "value": [
                    {
                        "value": null,
                        "confidence": 0.3
                    }
                ]
            },
            "original_samples": {
                "value": null,
                "confidence": 0.0
            },
            "num_samples": {
                "value": null,
                "confidence": 0.0
            },
            "sampling_percentage": {
                "value": null,
                "confidence": 0.0
            },
            "num_classes": {
                "value": null,
                "confidence": 0.0
            },
            "classes": {
                "value": []
            },
            "image_resolution": {
                "value": null,
                "confidence": 0.0
            },
            "spatial_resolution": {
                "value": null,
                "confidence": 0.0
            },
            "bands_used": {
                "value": [
                    {
                        "value": null,
                        "confidence": 0.3
                    }
                ]
            },
            "augmentations": {
                "value": []
            },
            "optimizer": {
                "value": "AdamW",
                "confidence": 1.0
            },
            "batch_size": {
                "value": 32,
                "confidence": 1.0
            },
            "learning_rate": {
                "value": 0.001,
                "confidence": 0.9976
            },
            "epochs": {
                "value": 1,
                "confidence": 1.0
            },
            "loss_function": {
                "value": "Next-token prediction loss",
                "confidence": 1.0
            },
            "split_ratio": {
                "value": null,
                "confidence": 0.0
            }
        },
        {
            "task": {
                "value": "Visual question answering",
                "confidence": 1.0
            },
            "application": {
                "value": "Remote sensing VQA",
                "confidence": 0.9795
            },
            "dataset": {
                "value": "RSVQA-HR",
                "confidence": 1.0
            },
            "metrics": {
                "value": [
                    {
                        "value": "Accuracy",
                        "confidence": 1.0
                    }
                ]
            },
            "metrics_value": {
                "value": [
                    {
                        "value": 92.64,
                        "confidence": 1.0
                    }
                ]
            },
            "sensor": {
                "value": [
                    {
                        "value": null,
                        "confidence": 0.3
                    }
                ]
            },
            "regions": {
                "value": [
                    {
                        "value": null,
                        "confidence": 0.3
                    }
                ]
            },
            "original_samples": {
                "value": null,
                "confidence": 0.0
            },
            "num_samples": {
                "value": null,
                "confidence": 0.0
            },
            "sampling_percentage": {
                "value": null,
                "confidence": 0.0
            },
            "num_classes": {
                "value": null,
                "confidence": 0.0
            },
            "classes": {
                "value": []
            },
            "image_resolution": {
                "value": null,
                "confidence": 0.0
            },
            "spatial_resolution": {
                "value": null,
                "confidence": 0.0
            },
            "bands_used": {
                "value": [
                    {
                        "value": null,
                        "confidence": 0.3
                    }
                ]
            },
            "augmentations": {
                "value": []
            },
            "optimizer": {
                "value": "AdamW",
                "confidence": 1.0
            },
            "batch_size": {
                "value": 32,
                "confidence": 1.0
            },
            "learning_rate": {
                "value": 0.001,
                "confidence": 1.0
            },
            "epochs": {
                "value": 1,
                "confidence": 1.0
            },
            "loss_function": {
                "value": "Next-token prediction loss",
                "confidence": 1.0
            },
            "split_ratio": {
                "value": null,
                "confidence": 0.0
            }
        },
        {
            "task": {
                "value": "Visual question answering",
                "confidence": 1.0
            },
            "application": {
                "value": "Fine-grained remote sensing VQA",
                "confidence": 0.9752
            },
            "dataset": {
                "value": "FIT-RSFG-VQA",
                "confidence": 1.0
            },
            "metrics": {
                "value": [
                    {
                        "value": "Accuracy",
                        "confidence": 1.0
                    }
                ]
            },
            "metrics_value": {
                "value": [
                    {
                        "value": 83.87,
                        "confidence": 1.0
                    }
                ]
            },
            "sensor": {
                "value": [
                    {
                        "value": null,
                        "confidence": 0.3
                    }
                ]
            },
            "regions": {
                "value": [
                    {
                        "value": null,
                        "confidence": 0.3
                    }
                ]
            },
            "original_samples": {
                "value": null,
                "confidence": 0.0
            },
            "num_samples": {
                "value": null,
                "confidence": 0.0
            },
            "sampling_percentage": {
                "value": null,
                "confidence": 0.0
            },
            "num_classes": {
                "value": null,
                "confidence": 0.0
            },
            "classes": {
                "value": []
            },
            "image_resolution": {
                "value": null,
                "confidence": 0.0
            },
            "spatial_resolution": {
                "value": null,
                "confidence": 0.0
            },
            "bands_used": {
                "value": [
                    {
                        "value": null,
                        "confidence": 0.3
                    }
                ]
            },
            "augmentations": {
                "value": []
            },
            "optimizer": {
                "value": "AdamW",
                "confidence": 1.0
            },
            "batch_size": {
                "value": 32,
                "confidence": 1.0
            },
            "learning_rate": {
                "value": 0.001,
                "confidence": 1.0
            },
            "epochs": {
                "value": 1,
                "confidence": 1.0
            },
            "loss_function": {
                "value": "Next-token prediction loss",
                "confidence": 1.0
            },
            "split_ratio": {
                "value": null,
                "confidence": 0.0
            }
        }
    ]
}