{
    "name": "lra-experiment",
    "model_config": {
        "vocab_size_or_config_json_file": 256,
        "hidden_size": 128,
        "num_hidden_layers": 6,
        "num_attention_heads": 4,
        "intermediate_size": 4,
        "hidden_act": "gelu",
        "hidden_dropout_prob": 0.05,
        "attention_probs_dropout_prob": 0.05,
        "embedding_ln_type": "max_norm",
        "embedding_dropout": 0.05,
        "max_position_embeddings": 4000,
        "type_vocab_size": 2,
        "initializer_range": 0.2,
        "pooler_function": "first",
        "pooler_no_dense": true,
        "pos_emb_type": "sinusoidal",
        "local_attention": false
    },
    "data": {
        "training": {
            "inputs1": {
                "inputs": "input/train.src",
                "masks": "input/train.mask",
                "labels": "label/train.label"
            },
            "inputs2": {
                "inputs": "input/train.src",
                "masks": "input/train.mask",
                "labels": "label/train.label"
            }
        },
        "validation": {
            "inputs1": {
                "inputs": "input/valid.src",
                "masks": "input/valid.mask",
                "labels": "label/valid.label"
            },
            "inputs2": {
                "inputs": "input/valid.src",
                "masks": "input/valid.mask",
                "labels": "label/valid.label"
            }
        },
        "test": {
            "inputs1": {
                "inputs": "input/test.src",
                "masks": "input/test.mask",
                "labels": "label/test.label"
            },
            "inputs2": {
                "inputs": "input/test.src",
                "masks": "input/test.mask",
                "labels": "label/test.label"
            }
        }
    },
    "training": {
        "num_epochs": 40,
        "lr_scheduler_params": {
            "warmup_ratio": 0.05,
            "warmup_degree": 1,
            "degree": 0,
            "one_cycle_steps": 150000
        },
        "lr_schedule": "cosine",
        "lr_offset": 0.0,
        "learning_rate": 1e-3,
        "weight_decay": 0.0,
        "num_workers": 4,
        "async_worker": true
    }
}

// Note: the embeddings should additionally be initialized from a standard normal distribution.