{
    "model_id": {
        "value": "RoMA-Mamba",
        "confidence": 0.7924
    },
    "model_name": {
        "value": "RoMA",
        "confidence": 1.0
    },
    "version": {
        "value": "v1",
        "confidence": 0.593
    },
    "release_date": {
        "value": "2025-03-13",
        "confidence": 0.8153
    },
    "last_updated": {
        "value": null,
        "confidence": 0.2
    },
    "short_description": {
        "value": "RoMA is the first self-supervised autoregressive pretraining framework for Mamba architectures in remote sensing, enabling efficient scaling to high-resolution RS imagery. It introduces adaptive rotation-aware tokenization and multi-scale token prediction to address rotational diversity, sparse targets, and extreme object scale variations in RS images. RoMA-pretrained Mamba models outperform ViT-based counterparts in accuracy and computational efficiency.",
        "confidence": 0.801
    },
    "paper_link": {
        "value": "https://arxiv.org/abs/2503.10392",
        "confidence": 1.0
    },
    "citations": {
        "value": null,
        "confidence": 0.0
    },
    "repository": {
        "value": null,
        "confidence": 0.0
    },
    "weights": {
        "value": null,
        "confidence": 0.0
    },
    "backbone": {
        "value": "Mamba",
        "confidence": 0.9997
    },
    "num_layers": {
        "value": 12,
        "confidence": 1.0
    },
    "num_parameters": {
        "value": 85,
        "confidence": 0.894
    },
    "pretext_training_type": {
        "value": "Self-supervised autoregressive pretraining",
        "confidence": 0.9266
    },
    "masking_strategy": {
        "value": "No masking; autoregressive next-token prediction with adaptive rotation encoding and multi-scale prediction",
        "confidence": 0.7521
    },
    "pretraining": {
        "value": "Autoregressive next-token prediction with adaptive rotation encoding and multi-scale prediction strategies on large-scale unlabeled remote sensing data",
        "confidence": 0.7938
    },
    "domain_knowledge": {
        "value": [
            {
                "value": "Rotation-aware pretraining",
                "confidence": 0.6285
            },
            {
                "value": "Multi-scale token prediction",
                "confidence": 0.8669
            },
            {
                "value": "Adaptive cropping",
                "confidence": 0.8726
            },
            {
                "value": "Angular embeddings",
                "confidence": 0.938
            }
        ]
    },
    "backbone_modifications": {
        "value": [],
        "confidence": 0.0
    },
    "supported_sensors": {
        "value": [
            {
                "value": null,
                "confidence": 0.3
            }
        ]
    },
    "modality_integration_type": {
        "value": "Unimodal",
        "confidence": 1.0
    },
    "modalities": {
        "value": [
            {
                "value": "Multispectral",
                "confidence": 0.9937
            },
            {
                "value": "RGB",
                "confidence": 0.9505
            }
        ]
    },
    "spectral_alignment": {
        "value": "none",
        "confidence": 0.7751
    },
    "temporal_alignment": {
        "value": "none",
        "confidence": 1.0
    },
    "spatial_resolution": {
        "value": "variable",
        "confidence": 0.9844
    },
    "temporal_resolution": {
        "value": null,
        "confidence": 0.0
    },
    "bands": {
        "value": [
            {
                "value": null,
                "confidence": 0.3
            }
        ]
    },
    "pretraining_phases": [
        {
            "dataset": {
                "value": "OpticalRS-4M",
                "confidence": 1.0
            },
            "regions_coverage": {
                "value": [
                    {
                        "value": null,
                        "confidence": 0.3
                    }
                ]
            },
            "time_range": {
                "value": null,
                "confidence": 0.0
            },
            "num_images": {
                "value": 4000000,
                "confidence": 1.0
            },
            "token_size": {
                "value": "16x16",
                "confidence": 0.968
            },
            "image_resolution": {
                "value": "196x196",
                "confidence": 1.0
            },
            "epochs": {
                "value": 400,
                "confidence": 1.0
            },
            "batch_size": {
                "value": 256,
                "confidence": 1.0
            },
            "learning_rate": {
                "value": "1.5e-4 (cosine scheduler)",
                "confidence": 0.4413
            },
            "augmentations": {
                "value": [
                    {
                        "value": "Adaptive rotation encoding",
                        "confidence": 0.3547
                    },
                    {
                        "value": "Random rotation",
                        "confidence": 0.7315
                    },
                    {
                        "value": "Center cropping",
                        "confidence": 0.8814
                    }
                ]
            },
            "processing": {
                "value": [
                    {
                        "value": "Patch extraction",
                        "confidence": 0.427
                    },
                    {
                        "value": "Feature descriptor (LBP) for patch selection",
                        "confidence": 0.3141
                    }
                ]
            },
            "sampling": {
                "value": null,
                "confidence": 0.0
            },
            "processing_level": {
                "value": null,
                "confidence": 0.0
            },
            "cloud_cover": {
                "value": null,
                "confidence": 0.0
            },
            "missing_data": {
                "value": null,
                "confidence": 0.0
            },
            "masking_ratio": {
                "value": 0,
                "confidence": 1.0
            }
        }
    ],
    "benchmarks": [
        {
            "task": {
                "value": "Scene Classification",
                "confidence": 0.6198
            },
            "application": {
                "value": "Aerial scene classification",
                "confidence": 0.5262
            },
            "dataset": {
                "value": "AID",
                "confidence": 1.0
            },
            "metrics": {
                "value": [
                    {
                        "value": "OA",
                        "confidence": 0.8436
                    }
                ]
            },
            "metrics_value": {
                "value": [
                    {
                        "value": 87.36,
                        "confidence": 1.0
                    }
                ]
            },
            "sensor": {
                "value": [
                    {
                        "value": null,
                        "confidence": 0.3
                    }
                ]
            },
            "regions": {
                "value": [
                    {
                        "value": null,
                        "confidence": 0.3
                    }
                ]
            },
            "original_samples": {
                "value": null,
                "confidence": 0.0
            },
            "num_samples": {
                "value": null,
                "confidence": 0.0
            },
            "sampling_percentage": {
                "value": 50,
                "confidence": 1.0
            },
            "num_classes": {
                "value": null,
                "confidence": 0.0
            },
            "classes": {
                "value": []
            },
            "image_resolution": {
                "value": null,
                "confidence": 0.0
            },
            "spatial_resolution": {
                "value": null,
                "confidence": 0.0
            },
            "bands_used": {
                "value": []
            },
            "augmentations": {
                "value": []
            },
            "optimizer": {
                "value": null,
                "confidence": 0.0
            },
            "batch_size": {
                "value": null,
                "confidence": 0.0
            },
            "learning_rate": {
                "value": null,
                "confidence": 0.0
            },
            "epochs": {
                "value": null,
                "confidence": 0.0
            },
            "loss_function": {
                "value": null,
                "confidence": 0.0
            },
            "split_ratio": {
                "value": "train ratio=50%",
                "confidence": 0.3248
            }
        },
        {
            "task": {
                "value": "Scene Classification",
                "confidence": 1.0
            },
            "application": {
                "value": "Aerial scene classification",
                "confidence": 0.9967
            },
            "dataset": {
                "value": "UCM",
                "confidence": 1.0
            },
            "metrics": {
                "value": [
                    {
                        "value": "OA",
                        "confidence": 1.0
                    }
                ]
            },
            "metrics_value": {
                "value": [
                    {
                        "value": 59.45,
                        "confidence": 1.0
                    }
                ]
            },
            "sensor": {
                "value": [
                    {
                        "value": null,
                        "confidence": 0.3
                    }
                ]
            },
            "regions": {
                "value": [
                    {
                        "value": null,
                        "confidence": 0.3
                    }
                ]
            },
            "original_samples": {
                "value": null,
                "confidence": 0.0
            },
            "num_samples": {
                "value": null,
                "confidence": 0.0
            },
            "sampling_percentage": {
                "value": 50,
                "confidence": 1.0
            },
            "num_classes": {
                "value": null,
                "confidence": 0.0
            },
            "classes": {
                "value": []
            },
            "image_resolution": {
                "value": null,
                "confidence": 0.0
            },
            "spatial_resolution": {
                "value": null,
                "confidence": 0.0
            },
            "bands_used": {
                "value": []
            },
            "augmentations": {
                "value": []
            },
            "optimizer": {
                "value": null,
                "confidence": 0.0
            },
            "batch_size": {
                "value": null,
                "confidence": 0.0
            },
            "learning_rate": {
                "value": null,
                "confidence": 0.0
            },
            "epochs": {
                "value": null,
                "confidence": 0.0
            },
            "loss_function": {
                "value": null,
                "confidence": 0.0
            },
            "split_ratio": {
                "value": "train ratio=50%",
                "confidence": 1.0
            }
        },
        {
            "task": {
                "value": "Change Detection",
                "confidence": 1.0
            },
            "application": {
                "value": "Change detection",
                "confidence": 0.7764
            },
            "dataset": {
                "value": "OSCD",
                "confidence": 1.0
            },
            "metrics": {
                "value": [
                    {
                        "value": "F1",
                        "confidence": 1.0
                    }
                ]
            },
            "metrics_value": {
                "value": [
                    {
                        "value": 55.63,
                        "confidence": 1.0
                    }
                ]
            },
            "sensor": {
                "value": [
                    {
                        "value": null,
                        "confidence": 0.3
                    }
                ]
            },
            "regions": {
                "value": [
                    {
                        "value": null,
                        "confidence": 0.3
                    }
                ]
            },
            "original_samples": {
                "value": null,
                "confidence": 0.0
            },
            "num_samples": {
                "value": null,
                "confidence": 0.0
            },
            "sampling_percentage": {
                "value": null,
                "confidence": 0.0
            },
            "num_classes": {
                "value": null,
                "confidence": 0.0
            },
            "classes": {
                "value": []
            },
            "image_resolution": {
                "value": null,
                "confidence": 0.0
            },
            "spatial_resolution": {
                "value": null,
                "confidence": 0.0
            },
            "bands_used": {
                "value": []
            },
            "augmentations": {
                "value": []
            },
            "optimizer": {
                "value": null,
                "confidence": 0.0
            },
            "batch_size": {
                "value": null,
                "confidence": 0.0
            },
            "learning_rate": {
                "value": null,
                "confidence": 0.0
            },
            "epochs": {
                "value": null,
                "confidence": 0.0
            },
            "loss_function": {
                "value": null,
                "confidence": 0.0
            },
            "split_ratio": {
                "value": null,
                "confidence": 0.0
            }
        },
        {
            "task": {
                "value": "Semantic Segmentation",
                "confidence": 0.9999
            },
            "application": {
                "value": "Semantic segmentation",
                "confidence": 0.6073
            },
            "dataset": {
                "value": "SpaceNetv1",
                "confidence": 0.9996
            },
            "metrics": {
                "value": [
                    {
                        "value": "mF1",
                        "confidence": 1.0
                    }
                ]
            },
            "metrics_value": {
                "value": [
                    {
                        "value": 79.5,
                        "confidence": 0.8441
                    }
                ]
            },
            "sensor": {
                "value": [
                    {
                        "value": null,
                        "confidence": 0.3
                    }
                ]
            },
            "regions": {
                "value": [
                    {
                        "value": null,
                        "confidence": 0.3
                    }
                ]
            },
            "original_samples": {
                "value": null,
                "confidence": 0.0
            },
            "num_samples": {
                "value": null,
                "confidence": 0.0
            },
            "sampling_percentage": {
                "value": null,
                "confidence": 0.0
            },
            "num_classes": {
                "value": null,
                "confidence": 0.0
            },
            "classes": {
                "value": []
            },
            "image_resolution": {
                "value": null,
                "confidence": 0.0
            },
            "spatial_resolution": {
                "value": null,
                "confidence": 0.0
            },
            "bands_used": {
                "value": []
            },
            "augmentations": {
                "value": []
            },
            "optimizer": {
                "value": null,
                "confidence": 0.0
            },
            "batch_size": {
                "value": null,
                "confidence": 0.0
            },
            "learning_rate": {
                "value": null,
                "confidence": 0.0
            },
            "epochs": {
                "value": null,
                "confidence": 0.0
            },
            "loss_function": {
                "value": null,
                "confidence": 0.0
            },
            "split_ratio": {
                "value": null,
                "confidence": 0.0
            }
        }
    ]
}