# LMSeg-specific switches.
LMSEG:
    # Name of the sampling strategy (what is being sampled is decided by the
    # consuming code -- TODO confirm).
    SAMPLE_STRATEGY: 'uniform'

    # Feature toggles; both are switched on in this configuration.
    ENABLE_LEARNABLE_CONTEXT: true
    ENABLE_CATEGORY_GUIDED_DECODER: true

# Model definition: image backbone, text encoder, segmentation head and
# mask-transformer hyper-parameters.
MODEL:
    META_ARCHITECTURE: 'LMSEG'

    # Per-channel input normalization, kept consistent with CLIP
    # (CLIP's mean/std expressed on the 0-255 pixel scale).
    PIXEL_MEAN:
        - 122.7709383
        - 116.7460125
        - 104.09373615000001
    PIXEL_STD:
        - 68.5005327
        - 66.6321579
        - 70.32316304999999
    # Left empty here; the backbone and the text encoder each name their own
    # PRETRAINED checkpoint below.
    WEIGHTS: ''

    BACKBONE:
        NAME: 'D2CLIPResNetWithAttention'
    CLIP_RESNET:
        PRETRAINED: 'clip_pretrained/RN101.pt'
        LAYERS: [3, 4, 23, 3]
        WIDTH: 64
        OUTPUT_DIM: 512
        INPUT_RESOLUTION: 512
        WITH_ATTNPOOL: true
        OUT_FEATURES:
            - 'res2'
            - 'res3'
            - 'res4'
            - 'res5'
        # Frozen batch norm, following MaskFormer.
        NORM: 'FrozenBN'

    TEXTENCODER:
        NAME: 'CLIPTextContextEncoder'
        TOKEN_EMBED_DIM: 512
        TEXT_DIM: 512
        HARD_PROMPT_LENGTH: 10

        # Options specific to the encoder selected by NAME above.
        CLIPTextContextEncoder:
            PRETRAINED: 'clip_pretrained/RN101.pt'
            # Learnable length is 8 (presumably the 18 total minus the
            # HARD_PROMPT_LENGTH of 10 -- TODO confirm).
            CONTEXT_LENGTH: 18
            EMBED_DIM: 512
            TRANSFORMER_WIDTH: 512
            TRANSFORMER_HEADS: 8
            TRANSFORMER_LAYERS: 12

    SEM_SEG_HEAD:
        NAME: 'LMSegHead'
        IN_FEATURES:
            - 'res2'
            - 'res3'
            - 'res4'
            - 'res5'
        IGNORE_VALUE: 255
        # Not used: the stride is hard-coded.
        COMMON_STRIDE: 4
        LOSS_WEIGHT: 1.0
        CONVS_DIM: 256
        MASK_DIM: 256
        NORM: 'GN'

    MASK_FORMER:
        TRANSFORMER_IN_FEATURE: 'res5'
        DEEP_SUPERVISION: true
        # Loss weights.
        NO_OBJECT_WEIGHT: 0.1
        DICE_WEIGHT: 1.0
        MASK_WEIGHT: 20.0
        # Transformer shape: no encoder layers, six decoder layers.
        HIDDEN_DIM: 256
        NUM_OBJECT_QUERIES: 100
        NHEADS: 8
        DROPOUT: 0.1
        DIM_FEEDFORWARD: 2048
        ENC_LAYERS: 0
        DEC_LAYERS: 6
        PRE_NORM: false

# Optimization settings: optimizer, learning-rate schedule and gradient
# clipping.
SOLVER:
    OPTIMIZER: 'ADAMW'
    BASE_LR: 0.0001
    WEIGHT_DECAY: 0.0001
    # Per-module multipliers (presumably scale BASE_LR for the backbone and
    # the text encoder; 0.0 would then keep the text encoder fixed --
    # TODO confirm in the optimizer builder).
    BACKBONE_MULTIPLIER: 0.1
    TEXTENCODER_MULTIPLIER: 0.0

    IMS_PER_BATCH: 16
    MAX_ITER: 160000

    LR_SCHEDULER_NAME: 'WarmupPolyLR'
    # Zero warmup iterations, with a warmup factor of 1.0.
    WARMUP_ITERS: 0
    WARMUP_FACTOR: 1.0

    CLIP_GRADIENTS:
        ENABLED: true
        CLIP_TYPE: 'full_model'
        CLIP_VALUE: 0.01
        NORM_TYPE: 2.0

# Data loading options, followed by the config schema version.
DATALOADER:
    NUM_WORKERS: 2
    FILTER_EMPTY_ANNOTATIONS: true

VERSION: 2


# Dataset names used for joint training and for evaluation.
DATASETS:
    # Both values are tuple literals written as text, so they are strings at
    # the YAML level (presumably decoded into tuples by the config loader --
    # TODO confirm). The folded scalar joins its lines with single spaces,
    # yielding a one-line string.
    TRAIN: >-
        ("ade20k_sem_seg_train",
        "cityscapes_fine_sem_seg_train",
        "coco_2017_train_stuff_10k_sem_seg",
        "mapillary_vistas_sem_seg_train")
    TEST: '("ade20k_sem_seg_val",)'


## Test-time input settings (evaluation on ADE20K)
# Shared input settings: test-time size bounds, padding divisor and
# channel order.
INPUT:
    FORMAT: 'RGB'
    MIN_SIZE_TEST: 512
    MAX_SIZE_TEST: 2048
    # Consumed by the dataset mapper.
    SIZE_DIVISIBILITY: 512
# Evaluation schedule and test-time augmentation (TTA) options.
TEST:
    EVAL_PERIOD: 2000
    AUG:
        # TTA is switched off; the remaining keys presumably only take
        # effect once ENABLED is turned on -- TODO confirm.
        ENABLED: false
        FLIP: true
        MAX_SIZE: 3584
        MIN_SIZES:
            - 256
            - 384
            - 512
            - 640
            - 768
            - 896


# Input pipeline settings keyed for ADE20K (resize, crop, color
# augmentation, padding).
INPUT_ADE20K:
    # Candidate training sizes: int(x * 0.1 * 512) for x in 5..20, i.e. 0.5x
    # to 2.0x of 512. Written out literally so this entry needs no
    # `!!python/object/apply:eval` tag, which executes code at load time and
    # is rejected by safe YAML loaders.
    MIN_SIZE_TRAIN: [256, 307, 358, 409, 460, 512, 563, 614,
                     665, 716, 768, 819, 870, 921, 972, 1024]
    MIN_SIZE_TRAIN_SAMPLING: "choice"
    MIN_SIZE_TEST: 512
    MAX_SIZE_TRAIN: 2048
    MAX_SIZE_TEST: 2048
    CROP:
        ENABLED: true
        TYPE: "absolute"
        # Plain scalar, i.e. a string at the YAML level (presumably decoded
        # into a tuple by the config loader -- TODO confirm).
        SIZE: (512, 512)
        SINGLE_CATEGORY_MAX_AREA: 1.0
    COLOR_AUG_SSD: true
    SIZE_DIVISIBILITY: 512  # used in dataset mapper
    FORMAT: "RGB"

# Input pipeline settings keyed for Cityscapes (resize, crop, color
# augmentation, padding).
INPUT_CITYSCAPES:
    # Candidate training sizes: int(x * 0.1 * 512) for x in 5..20, i.e. 0.5x
    # to 2.0x of 512. Written out literally so this entry needs no
    # `!!python/object/apply:eval` tag, which executes code at load time and
    # is rejected by safe YAML loaders.
    MIN_SIZE_TRAIN: [256, 307, 358, 409, 460, 512, 563, 614,
                     665, 716, 768, 819, 870, 921, 972, 1024]
    MIN_SIZE_TRAIN_SAMPLING: "choice"
    MIN_SIZE_TEST: 512
    MAX_SIZE_TRAIN: 2048
    MAX_SIZE_TEST: 2048
    CROP:
        ENABLED: true
        TYPE: "absolute"
        # Plain scalar, i.e. a string at the YAML level (presumably decoded
        # into a tuple by the config loader -- TODO confirm).
        SIZE: (512, 512)
        SINGLE_CATEGORY_MAX_AREA: 1.0
    COLOR_AUG_SSD: true
    SIZE_DIVISIBILITY: 512  # used in dataset mapper
    FORMAT: "RGB"

# Input pipeline settings keyed for COCO-Stuff-10K (resize, crop, color
# augmentation, padding).
INPUT_COCOSTUFF10K:
    # Candidate training sizes: int(x * 0.1 * 512) for x in 5..20, i.e. 0.5x
    # to 2.0x of 512. Written out literally so this entry needs no
    # `!!python/object/apply:eval` tag, which executes code at load time and
    # is rejected by safe YAML loaders.
    MIN_SIZE_TRAIN: [256, 307, 358, 409, 460, 512, 563, 614,
                     665, 716, 768, 819, 870, 921, 972, 1024]
    MIN_SIZE_TRAIN_SAMPLING: "choice"
    MIN_SIZE_TEST: 512
    MAX_SIZE_TRAIN: 2048
    MAX_SIZE_TEST: 2048
    CROP:
        ENABLED: true
        TYPE: "absolute"
        # Plain scalar, i.e. a string at the YAML level (presumably decoded
        # into a tuple by the config loader -- TODO confirm).
        SIZE: (512, 512)
        SINGLE_CATEGORY_MAX_AREA: 1.0
    COLOR_AUG_SSD: true
    SIZE_DIVISIBILITY: 512  # used in dataset mapper
    FORMAT: "RGB"


# Input pipeline settings keyed for Mapillary Vistas (resize, crop, color
# augmentation, padding).
INPUT_MAPILLARY_VISTAS:
    # Candidate training sizes: int(x * 0.1 * 512) for x in 5..20, i.e. 0.5x
    # to 2.0x of 512. Written out literally so this entry needs no
    # `!!python/object/apply:eval` tag, which executes code at load time and
    # is rejected by safe YAML loaders.
    MIN_SIZE_TRAIN: [256, 307, 358, 409, 460, 512, 563, 614,
                     665, 716, 768, 819, 870, 921, 972, 1024]
    MIN_SIZE_TRAIN_SAMPLING: "choice"
    MIN_SIZE_TEST: 512
    MAX_SIZE_TRAIN: 2048
    MAX_SIZE_TEST: 2048
    CROP:
        ENABLED: true
        TYPE: "absolute"
        # Plain scalar, i.e. a string at the YAML level (presumably decoded
        # into a tuple by the config loader -- TODO confirm).
        SIZE: (512, 512)
        SINGLE_CATEGORY_MAX_AREA: 1.0
    COLOR_AUG_SSD: true
    SIZE_DIVISIBILITY: 512  # used in dataset mapper
    FORMAT: "RGB"
    