{
    "listops": {
        "dataset": {
            "train": 96000,
            "dev": 2000,
            "test": 2000
        },
        "dataset_file": "listops",
        "model": {
            "mixed_precision": true,
            "shared_weight": false,
            "embedding_dim": 64,
            "dim": 64,
            "hidden_dim": 128,
            "head_dim": 32,
            "num_head": 2,
            "num_layers": 2,
            "vocab_size": 32,
            "max_seq_len": 2000,
            "dropout_prob": 0.1,
            "pooling_mode": "MEAN",
            "num_classes": 10,
            "model_type": "fmm_transformer",
            "attn_cpt": false,
            "diag_size": 5,
            "kernels": ["elu", "elu_flip"],
            "sparse_ratio": 6.5
        },
        "training": {
            "batch_size": 32,
            "learning_rate": 0.0001,
            "warmup": 1000,
            "lr_decay": "linear",
            "weight_decay": 0,
            "eval_frequency": 50,
            "num_train_steps": 15000,
            "num_eval_steps": 62
        },
        "gpu_memory": 32
    },
    "retrieval": {
        "dataset": {
            "train": 147086,
            "dev": 18090,
            "test": 17437
        },
        "dataset_file": "retrieval",
        "model": {
            "mixed_precision": true,
            "shared_weight": false,
            "embedding_dim": 64,
            "dim": 64,
            "hidden_dim": 128,
            "head_dim": 32,
            "num_head": 2,
            "num_layers": 2,
            "vocab_size": 512,
            "max_seq_len": 4000,
            "dropout_prob": 0.1,
            "pooling_mode": "MEAN",
            "num_classes": 2,
            "model_type": "fmm_transformer",
            "attn_cpt": false,
            "diag_size": 5,
            "kernels": ["elu", "elu_flip"],
            "sparse_ratio": 19.5
        },
        "training": {
            "batch_size": 32,
            "learning_rate": 0.0001,
            "warmup": 800,
            "lr_decay": "linear",
            "weight_decay": 0,
            "eval_frequency": 300,
            "num_train_steps": 30000,
            "num_eval_steps": 565
        },
        "gpu_memory": 4
    },
    "text": {
        "dataset": {
            "train": 25000,
            "dev": 25000,
            "test": 25000
        },
        "dataset_file": "text",
        "model": {
            "mixed_precision": true,
            "shared_weight": false,
            "embedding_dim": 64,
            "dim": 64,
            "hidden_dim": 128,
            "head_dim": 32,
            "num_head": 2,
            "num_layers": 2,
            "vocab_size": 512,
            "max_seq_len": 4000,
            "dropout_prob": 0.1,
            "pooling_mode": "MEAN",
            "num_classes": 2,
            "model_type": "fmm_transformer",
            "attn_cpt": false,
            "diag_size": 5,
            "kernels": ["elu", "elu_flip"],
            "sparse_ratio": 4.5
        },
        "training": {
            "batch_size": 32,
            "learning_rate": 0.0001,
            "warmup": 8000,
            "lr_decay": "linear",
            "weight_decay": 0,
            "eval_frequency": 500,
            "num_train_steps": 20000,
            "num_eval_steps": 781
        },
        "gpu_memory": 8
    },
    "image": {
        "dataset": {
            "train": 45000,
            "dev": 5000,
            "test": 10000
        },
        "dataset_file": "image",
        "model": {
            "mixed_precision": true,
            "shared_weight": false,
            "embedding_dim": 64,
            "dim": 64,
            "hidden_dim": 128,
            "head_dim": 32,
            "num_head": 2,
            "num_layers": 2,
            "vocab_size": 512,
            "max_seq_len": 1024,
            "dropout_prob": 0.1,
            "pooling_mode": "MEAN",
            "num_classes": 10,
            "model_type": "fmm_transformer",
            "attn_cpt": false,
            "diag_size": 5,
            "kernels": "elu"
        },
        "training": {
            "batch_size": 256,
            "learning_rate": 0.0001,
            "warmup": 175,
            "lr_decay": "linear",
            "weight_decay": 0,
            "eval_frequency": 175,
            "num_train_steps": 35000,
            "num_eval_steps": 20
        },
        "gpu_memory": 128
    },
    "pathfinder32": {
        "dataset": {
            "train": 45000,
            "dev": 5000,
            "test": 10000
        },
        "dataset_file": "pathfinder32-curv_contour_length_14",
        "model": {
            "mixed_precision": true,
            "shared_weight": false,
            "embedding_dim": 64,
            "dim": 64,
            "hidden_dim": 128,
            "head_dim": 32,
            "num_head": 2,
            "num_layers": 2,
            "vocab_size": 512,
            "max_seq_len": 1024,
            "dropout_prob": 0.1,
            "pooling_mode": "MEAN",
            "num_classes": 2,
            "model_type": "fmm_transformer",
            "attn_cpt": false,
            "diag_size": 5,
            "kernels": "elu"
        },
        "training": {
            "batch_size": 256,
            "learning_rate": 0.0001,
            "warmup": 312,
            "lr_decay": "linear",
            "weight_decay": 0,
            "eval_frequency": 312,
            "num_train_steps": 62400,
            "num_eval_steps": 312
        },
        "gpu_memory": 128
    }
}