{
    "model_id": {
        "value": "Text2Earth",
        "confidence": 0.9164
    },
    "model_name": {
        "value": "Text2Earth",
        "confidence": 1.0
    },
    "version": {
        "value": "v1",
        "confidence": 0.6398
    },
    "release_date": {
        "value": null,
        "confidence": 0.0
    },
    "last_updated": {
        "value": null,
        "confidence": 0.0
    },
    "short_description": {
        "value": "Text2Earth is a 1.3-billion-parameter generative foundation model for global-scale, multi-resolution controllable, and unbounded remote sensing text-to-image generation. It is trained on the Git-10M dataset (10.5 million image-text pairs) and supports zero-shot text-to-image generation, image editing, unbounded scene construction, and cross-modal image generation.",
        "confidence": 0.8444
    },
    "paper_link": {
        "value": "https://arxiv.org/abs/2501.00895",
        "confidence": 0.9783
    },
    "citations": {
        "value": null,
        "confidence": 0.0
    },
    "repository": {
        "value": "https://github.com/Chen-Yang-Liu/Text2Earth",
        "confidence": 1.0
    },
    "weights": {
        "value": null,
        "confidence": 0.0
    },
    "backbone": {
        "value": "Diffusion model with VAE and OpenCLIP ViT-H text encoder, U-Net with cross-attention",
        "confidence": 0.7483
    },
    "num_layers": {
        "value": null,
        "confidence": 0.0
    },
    "num_parameters": {
        "value": 1300000000,
        "confidence": 0.8953
    },
    "pretext_training_type": {
        "value": "Diffusion-based generative pretraining with dynamic condition adaptation",
        "confidence": 0.8376
    },
    "masking_strategy": {
        "value": "Dynamic condition adaptation (randomly dropping text and resolution conditions during training)",
        "confidence": 0.768
    },
    "pretraining": {
        "value": "Two-stage: initial training on full Git-10M dataset, then fine-tuning on high-quality subset (score > 4.8)",
        "confidence": 0.7714
    },
    "domain_knowledge": {
        "value": [
            {
                "value": "Resolution guidance mechanism",
                "confidence": 0.8439
            },
            {
                "value": "Dynamic condition adaptation",
                "confidence": 0.8823
            },
            {
                "value": "Geospatial metadata integration",
                "confidence": 0.4479
            }
        ]
    },
    "backbone_modifications": {
        "value": [
            {
                "value": "Integration of resolution embedding",
                "confidence": 0.5322
            },
            {
                "value": "Cross-attention mechanism for text and resolution",
                "confidence": 0.5915
            },
            {
                "value": "Conditional masked image encoding for editing",
                "confidence": 0.6714
            }
        ]
    },
    "supported_sensors": {
        "value": [
            {
                "value": "Google Earth imagery",
                "confidence": 0.6642
            },
            {
                "value": "Public datasets: Million-AID, GeoPile, SSL4EO-S12, SkyScript, DIOR, RSICB",
                "confidence": 0.8682
            }
        ]
    },
    "modality_integration_type": {
        "value": "Homogeneous Multimodal",
        "confidence": 0.9904
    },
    "modalities": {
        "value": [
            {
                "value": "Multispectral",
                "confidence": 0.9851
            },
            {
                "value": "SAR",
                "confidence": 0.7506
            },
            {
                "value": "NIR",
                "confidence": 0.6448
            },
            {
                "value": "Panchromatic",
                "confidence": 0.7387
            },
            {
                "value": "RGB",
                "confidence": 0.7309
            },
            {
                "value": "Text",
                "confidence": 0.9949
            }
        ]
    },
    "spectral_alignment": {
        "value": "partial",
        "confidence": 0.9642
    },
    "temporal_alignment": {
        "value": "none",
        "confidence": 1.0
    },
    "spatial_resolution": {
        "value": "0.5m to 128m (multi-resolution)",
        "confidence": 0.8456
    },
    "temporal_resolution": {
        "value": null,
        "confidence": 0.0
    },
    "bands": {
        "value": [
            {
                "value": "RGB",
                "confidence": 0.9758
            },
            {
                "value": "NIR",
                "confidence": 0.8472
            },
            {
                "value": "PAN",
                "confidence": 0.712
            },
            {
                "value": "SAR",
                "confidence": 0.9979
            }
        ]
    },
    "pretraining_phases": [
        {
            "dataset": {
                "value": "Git-10M",
                "confidence": 1.0
            },
            "regions_coverage": {
                "value": [
                    {
                        "value": "Global (multiple continents and geographical regions, including urban areas, forests, mountains, deserts, etc.)",
                        "confidence": 0.3065
                    }
                ]
            },
            "time_range": {
                "value": null,
                "confidence": 0.0
            },
            "num_images": {
                "value": 10500000,
                "confidence": 0.9959
            },
            "token_size": {
                "value": null,
                "confidence": 0.0
            },
            "image_resolution": {
                "value": "0.5m/pixel to 128m/pixel",
                "confidence": 0.7166
            },
            "epochs": {
                "value": null,
                "confidence": 0.0
            },
            "batch_size": {
                "value": 1024,
                "confidence": 1.0
            },
            "learning_rate": {
                "value": 0.0001,
                "confidence": 0.9983
            },
            "augmentations": {
                "value": null,
                "confidence": 0.0
            },
            "processing": {
                "value": [
                    {
                        "value": "Image enhancement (trained on private high-quality dataset)",
                        "confidence": 0.3035
                    },
                    {
                        "value": "Deduplication/removal of redundant ocean scenes",
                        "confidence": 0.3116
                    },
                    {
                        "value": "Manual and automated filtering",
                        "confidence": 0.3226
                    }
                ]
            },
            "sampling": {
                "value": "Random and manual selection for global coverage",
                "confidence": 0.306
            },
            "processing_level": {
                "value": null,
                "confidence": 0.0
            },
            "cloud_cover": {
                "value": null,
                "confidence": 0.0
            },
            "missing_data": {
                "value": null,
                "confidence": 0.0
            },
            "masking_ratio": {
                "value": null,
                "confidence": 0.0
            }
        }
    ],
    "benchmarks": [
        {
            "task": {
                "value": "Text-to-image generation",
                "confidence": 0.6932
            },
            "application": {
                "value": "Remote sensing image generation",
                "confidence": 0.3834
            },
            "dataset": {
                "value": "RSICD",
                "confidence": 1.0
            },
            "metrics": {
                "value": [
                    {
                        "value": "FID",
                        "confidence": 1.0
                    },
                    {
                        "value": "Zero-Shot Cls-OA",
                        "confidence": 0.9798
                    },
                    {
                        "value": "CLIP Score",
                        "confidence": 0.9986
                    }
                ]
            },
            "metrics_value": {
                "value": [
                    {
                        "value": 24.49,
                        "confidence": 1.0
                    },
                    {
                        "value": 90.26,
                        "confidence": 1.0
                    },
                    {
                        "value": 25.62,
                        "confidence": 1.0
                    }
                ]
            },
            "sensor": {
                "value": [
                    {
                        "value": "RSICD (remote sensing images, RGB)",
                        "confidence": 0.3003
                    }
                ]
            },
            "regions": {
                "value": [
                    {
                        "value": null,
                        "confidence": 0.3
                    }
                ]
            },
            "original_samples": {
                "value": 10921,
                "confidence": 1.0
            },
            "num_samples": {
                "value": 10921,
                "confidence": 1.0
            },
            "sampling_percentage": {
                "value": 100,
                "confidence": 1.0
            },
            "num_classes": {
                "value": 30,
                "confidence": 1.0
            },
            "classes": {
                "value": null,
                "confidence": 0.0
            },
            "image_resolution": {
                "value": "variable",
                "confidence": 0.6555
            },
            "spatial_resolution": {
                "value": "variable",
                "confidence": 0.9401
            },
            "bands_used": {
                "value": [
                    {
                        "value": "RGB",
                        "confidence": 1.0
                    }
                ]
            },
            "augmentations": {
                "value": null,
                "confidence": 0.0
            },
            "optimizer": {
                "value": "AdamW",
                "confidence": 0.9983
            },
            "batch_size": {
                "value": 1024,
                "confidence": 1.0
            },
            "learning_rate": {
                "value": 0.0001,
                "confidence": 1.0
            },
            "epochs": {
                "value": null,
                "confidence": 0.0
            },
            "loss_function": {
                "value": "Diffusion loss (L2 loss between predicted and true noise)",
                "confidence": 0.3115
            },
            "split_ratio": {
                "value": null,
                "confidence": 0.0
            }
        },
        {
            "task": {
                "value": "Text-driven multi-modal image generation",
                "confidence": 0.6788
            },
            "application": {
                "value": "Remote sensing multi-modal image generation",
                "confidence": 0.3889
            },
            "dataset": {
                "value": "RSICD (extended to multi-modal)",
                "confidence": 0.3791
            },
            "metrics": {
                "value": [
                    {
                        "value": "FID",
                        "confidence": 1.0
                    },
                    {
                        "value": "Zero-Shot Cls-OA",
                        "confidence": 0.9974
                    }
                ]
            },
            "metrics_value": {
                "value": [
                    {
                        "value": 24.49,
                        "confidence": 0.9997
                    },
                    {
                        "value": 90.26,
                        "confidence": 1.0
                    }
                ]
            },
            "sensor": {
                "value": [
                    {
                        "value": "RSICD",
                        "confidence": 0.8894
                    }
                ]
            },
            "regions": {
                "value": [
                    {
                        "value": null,
                        "confidence": 0.3
                    }
                ]
            },
            "original_samples": {
                "value": 10921,
                "confidence": 1.0
            },
            "num_samples": {
                "value": 10921,
                "confidence": 1.0
            },
            "sampling_percentage": {
                "value": 100,
                "confidence": 1.0
            },
            "num_classes": {
                "value": 30,
                "confidence": 1.0
            },
            "classes": {
                "value": null,
                "confidence": 0.0
            },
            "image_resolution": {
                "value": "variable",
                "confidence": 0.9803
            },
            "spatial_resolution": {
                "value": "variable",
                "confidence": 0.9999
            },
            "bands_used": {
                "value": [
                    {
                        "value": "RGB",
                        "confidence": 0.9997
                    }
                ]
            },
            "augmentations": {
                "value": null,
                "confidence": 0.0
            },
            "optimizer": {
                "value": "AdamW",
                "confidence": 0.645
            },
            "batch_size": {
                "value": 1024,
                "confidence": 1.0
            },
            "learning_rate": {
                "value": 0.0001,
                "confidence": 1.0
            },
            "epochs": {
                "value": null,
                "confidence": 0.0
            },
            "loss_function": {
                "value": "Diffusion loss",
                "confidence": 0.9621
            },
            "split_ratio": {
                "value": null,
                "confidence": 0.0
            }
        },
        {
            "task": {
                "value": "Text-driven multi-modal image generation",
                "confidence": 0.8698
            },
            "application": {
                "value": "Remote sensing multi-modal image generation (PAN)",
                "confidence": 0.3991
            },
            "dataset": {
                "value": "RSICD (PAN)",
                "confidence": 0.6705
            },
            "metrics": {
                "value": [
                    {
                        "value": "FID",
                        "confidence": 1.0
                    },
                    {
                        "value": "Zero-Shot Cls-OA",
                        "confidence": 1.0
                    }
                ]
            },
            "metrics_value": {
                "value": [
                    {
                        "value": 4.39,
                        "confidence": 1.0
                    },
                    {
                        "value": 88.46,
                        "confidence": 1.0
                    }
                ]
            },
            "sensor": {
                "value": [
                    {
                        "value": "RSICD",
                        "confidence": 0.9975
                    }
                ]
            },
            "regions": {
                "value": [
                    {
                        "value": null,
                        "confidence": 0.3
                    }
                ]
            },
            "original_samples": {
                "value": 10921,
                "confidence": 1.0
            },
            "num_samples": {
                "value": 10921,
                "confidence": 1.0
            },
            "sampling_percentage": {
                "value": 100,
                "confidence": 1.0
            },
            "num_classes": {
                "value": 30,
                "confidence": 1.0
            },
            "classes": {
                "value": null,
                "confidence": 0.0
            },
            "image_resolution": {
                "value": "variable",
                "confidence": 1.0
            },
            "spatial_resolution": {
                "value": "variable",
                "confidence": 1.0
            },
            "bands_used": {
                "value": [
                    {
                        "value": "PAN",
                        "confidence": 0.981
                    }
                ]
            },
            "augmentations": {
                "value": null,
                "confidence": 0.0
            },
            "optimizer": {
                "value": "AdamW",
                "confidence": 0.9947
            },
            "batch_size": {
                "value": 1024,
                "confidence": 1.0
            },
            "learning_rate": {
                "value": 0.0001,
                "confidence": 1.0
            },
            "epochs": {
                "value": null,
                "confidence": 0.0
            },
            "loss_function": {
                "value": "Diffusion loss",
                "confidence": 0.9998
            },
            "split_ratio": {
                "value": null,
                "confidence": 0.0
            }
        },
        {
            "task": {
                "value": "Text-driven multi-modal image generation",
                "confidence": 1.0
            },
            "application": {
                "value": "Remote sensing multi-modal image generation (SAR)",
                "confidence": 0.924
            },
            "dataset": {
                "value": "RSICD (SAR)",
                "confidence": 0.9999
            },
            "metrics": {
                "value": [
                    {
                        "value": "FID",
                        "confidence": 1.0
                    },
                    {
                        "value": "Zero-Shot Cls-OA",
                        "confidence": 1.0
                    }
                ]
            },
            "metrics_value": {
                "value": [
                    {
                        "value": 68.83,
                        "confidence": 1.0
                    },
                    {
                        "value": 34.42,
                        "confidence": 1.0
                    }
                ]
            },
            "sensor": {
                "value": [
                    {
                        "value": "RSICD",
                        "confidence": 1.0
                    }
                ]
            },
            "regions": {
                "value": [
                    {
                        "value": null,
                        "confidence": 0.3
                    }
                ]
            },
            "original_samples": {
                "value": 10921,
                "confidence": 1.0
            },
            "num_samples": {
                "value": 10921,
                "confidence": 1.0
            },
            "sampling_percentage": {
                "value": 100,
                "confidence": 1.0
            },
            "num_classes": {
                "value": 30,
                "confidence": 1.0
            },
            "classes": {
                "value": null,
                "confidence": 0.0
            },
            "image_resolution": {
                "value": "variable",
                "confidence": 1.0
            },
            "spatial_resolution": {
                "value": "variable",
                "confidence": 1.0
            },
            "bands_used": {
                "value": [
                    {
                        "value": "SAR",
                        "confidence": 1.0
                    }
                ]
            },
            "augmentations": {
                "value": null,
                "confidence": 0.0
            },
            "optimizer": {
                "value": "AdamW",
                "confidence": 1.0
            },
            "batch_size": {
                "value": 1024,
                "confidence": 1.0
            },
            "learning_rate": {
                "value": 0.0001,
                "confidence": 1.0
            },
            "epochs": {
                "value": null,
                "confidence": 0.0
            },
            "loss_function": {
                "value": "Diffusion loss",
                "confidence": 1.0
            },
            "split_ratio": {
                "value": null,
                "confidence": 0.0
            }
        },
        {
            "task": {
                "value": "Text-driven multi-modal image generation",
                "confidence": 1.0
            },
            "application": {
                "value": "Remote sensing multi-modal image generation (NIR)",
                "confidence": 1.0
            },
            "dataset": {
                "value": "RSICD (NIR)",
                "confidence": 1.0
            },
            "metrics": {
                "value": [
                    {
                        "value": "FID",
                        "confidence": 1.0
                    },
                    {
                        "value": "Zero-Shot Cls-OA",
                        "confidence": 1.0
                    }
                ]
            },
            "metrics_value": {
                "value": [
                    {
                        "value": 2.05,
                        "confidence": 1.0
                    },
                    {
                        "value": 82.08,
                        "confidence": 1.0
                    }
                ]
            },
            "sensor": {
                "value": [
                    {
                        "value": "RSICD",
                        "confidence": 1.0
                    }
                ]
            },
            "regions": {
                "value": [
                    {
                        "value": null,
                        "confidence": 0.3
                    }
                ]
            },
            "original_samples": {
                "value": 10921,
                "confidence": 1.0
            },
            "num_samples": {
                "value": 10921,
                "confidence": 1.0
            },
            "sampling_percentage": {
                "value": 100,
                "confidence": 1.0
            },
            "num_classes": {
                "value": 30,
                "confidence": 1.0
            },
            "classes": {
                "value": null,
                "confidence": 0.0
            },
            "image_resolution": {
                "value": "variable",
                "confidence": 1.0
            },
            "spatial_resolution": {
                "value": "variable",
                "confidence": 1.0
            },
            "bands_used": {
                "value": [
                    {
                        "value": "NIR",
                        "confidence": 1.0
                    }
                ]
            },
            "augmentations": {
                "value": null,
                "confidence": 0.0
            },
            "optimizer": {
                "value": "AdamW",
                "confidence": 1.0
            },
            "batch_size": {
                "value": 1024,
                "confidence": 1.0
            },
            "learning_rate": {
                "value": 0.0001,
                "confidence": 1.0
            },
            "epochs": {
                "value": null,
                "confidence": 0.0
            },
            "loss_function": {
                "value": "Diffusion loss",
                "confidence": 1.0
            },
            "split_ratio": {
                "value": null,
                "confidence": 0.0
            }
        }
    ]
}