    {
    "version": "0.2.0",
    "configurations": [
        {
            "name": "MoSA_tiny1",
            "type": "debugpy",
            "request": "launch",
            "program": "${workspaceFolder}/main.py",
            "console": "integratedTerminal",
            "env": {
                "CUDA_VISIBLE_DEVICES": "0"
            },
            "args": [
                "-task", "c4_flopmatched_mosa_transformer", "-test_interval", "100000", "-state_size", "512",
                "-transformer.ff_multiplier", "4", "-transformer.n_layers", "6",
                "-sa_moe.baseline_dense_heads", "9", "-sa_moe.shared_dense_heads", "4", "-sa_moe.sparsity", "1",
                "-sa_moe.include_first", "0", "-transformer.head_projection_size", "64", "-dropout", "0.0",
                "-lr", "0.00025", "-optimizer", "adamw", "-lm.unroll", "1024", "-batch_size", "64",
                "-grad_clip", "0.25", "-amp", "1", "-save_interval", "200000", "-stop_after", "100000",
                "-lr_sched.type", "cos", "-lr_warmup", "4000", "-wd", "0.01", "-min_lr_multiplier", "0.1",
                "-compile", "1", "-name", "tiny_isoflop",
                "-lm.eval.enabled", "1",
                "-lm.eval.lambada.enabled", "1",
                "-lm.eval.blimp.enabled", "1",
                "-lm.eval.ai2arc.enabled", "1",
                "-lm.eval.piqa.enabled", "1",
                "-lm.eval.winogrande.enabled", "1",
                "-lm.eval.hellaswag.enabled", "1"
            ]
        },
        {
            "name": "MoSA_tiny2",
            "type": "debugpy",
            "request": "launch",
            "program": "${workspaceFolder}/main.py",
            "console": "integratedTerminal",
            "env": {
                "CUDA_VISIBLE_DEVICES": "0"
            },
            "args": [
                "-task", "c4_flopmatched_mosa_transformer", "-test_interval", "100000", "-state_size", "512",
                "-transformer.ff_multiplier", "4", "-transformer.n_layers", "6",
                "-sa_moe.baseline_dense_heads", "9", "-sa_moe.shared_dense_heads", "4", "-sa_moe.sparsity", "2",
                "-sa_moe.include_first", "0", "-transformer.head_projection_size", "64", "-dropout", "0.0",
                "-lr", "0.00025", "-optimizer", "adamw", "-lm.unroll", "1024", "-batch_size", "64",
                "-grad_clip", "0.25", "-amp", "1", "-save_interval", "200000", "-stop_after", "100000",
                "-lr_sched.type", "cos", "-lr_warmup", "4000", "-wd", "0.01", "-min_lr_multiplier", "0.1",
                "-compile", "1", "-name", "tiny_isoflop",

                "-lm.eval.enabled", "1",
                "-lm.eval.lambada.enabled", "1",
                "-lm.eval.blimp.enabled", "1",
                "-lm.eval.ai2arc.enabled", "1",
                "-lm.eval.piqa.enabled", "1",
                "-lm.eval.winogrande.enabled", "1",
                "-lm.eval.hellaswag.enabled", "1"
            ]
        },
        {
            "name": "MoSA_tiny4",
            "type": "debugpy",
            "request": "launch",
            "program": "${workspaceFolder}/main.py",
            "console": "integratedTerminal",
            "env": {
                "CUDA_VISIBLE_DEVICES": "0"
            },
            "args": [
                "-task", "c4_flopmatched_mosa_transformer", "-test_interval", "100000", "-state_size", "512",
                "-transformer.ff_multiplier", "4", "-transformer.n_layers", "6",
                "-sa_moe.baseline_dense_heads", "9", "-sa_moe.shared_dense_heads", "4", "-sa_moe.sparsity", "4",
                "-sa_moe.include_first", "0", "-transformer.head_projection_size", "64", "-dropout", "0.0",
                "-lr", "0.00025", "-optimizer", "adamw", "-lm.unroll", "1024", "-batch_size", "64",
                "-grad_clip", "0.25", "-amp", "1", "-save_interval", "200000", "-stop_after", "100000",
                "-lr_sched.type", "cos", "-lr_warmup", "4000", "-wd", "0.01", "-min_lr_multiplier", "0.1",
                "-compile", "1", "-name", "tiny_isoflop",
                "-lm.eval.enabled", "1",
                "-lm.eval.lambada.enabled", "1",
                "-lm.eval.blimp.enabled", "1",
                "-lm.eval.ai2arc.enabled", "1",
                "-lm.eval.piqa.enabled", "1",
                "-lm.eval.winogrande.enabled", "1",
                "-lm.eval.hellaswag.enabled", "1"
            ]
        },
        {
            "name": "MoSA_tiny8",
            "type": "debugpy",
            "request": "launch",
            "program": "${workspaceFolder}/main.py",
            "console": "integratedTerminal",
            "env": {
                "CUDA_VISIBLE_DEVICES": "0"
            },
            "args": [
                "-task", "c4_flopmatched_mosa_transformer", "-test_interval", "100000", "-state_size", "512",
                "-transformer.ff_multiplier", "4", "-transformer.n_layers", "6",
                "-sa_moe.baseline_dense_heads", "9", "-sa_moe.shared_dense_heads", "4", "-sa_moe.sparsity", "8",
                "-sa_moe.include_first", "0", "-transformer.head_projection_size", "64", "-dropout", "0.0",
                "-lr", "0.00025", "-optimizer", "adamw", "-lm.unroll", "1024", "-batch_size", "64",
                "-grad_clip", "0.25", "-amp", "1", "-save_interval", "200000", "-stop_after", "100000",
                "-lr_sched.type", "cos", "-lr_warmup", "4000", "-wd", "0.01", "-min_lr_multiplier", "0.1",
                "-compile", "1", "-name", "tiny_isoflop",
                "-lm.eval.enabled", "1",
                "-lm.eval.lambada.enabled", "1",
                "-lm.eval.blimp.enabled", "1",
                "-lm.eval.ai2arc.enabled", "1",
                "-lm.eval.piqa.enabled", "1",
                "-lm.eval.winogrande.enabled", "1",
                "-lm.eval.hellaswag.enabled", "1"
            ]
        },
        {
            "name": "MoSA_tiny16",
            "type": "debugpy",
            "request": "launch",
            "program": "${workspaceFolder}/main.py",
            "console": "integratedTerminal",
            "env": {
                "CUDA_VISIBLE_DEVICES": "0"
            },
            "args": [
                "-task", "c4_flopmatched_mosa_transformer", "-test_interval", "100000", "-state_size", "512",
                "-transformer.ff_multiplier", "4", "-transformer.n_layers", "6",
                "-sa_moe.baseline_dense_heads", "9", "-sa_moe.shared_dense_heads", "4", "-sa_moe.sparsity", "16",
                "-sa_moe.include_first", "0", "-transformer.head_projection_size", "64", "-dropout", "0.0",
                "-lr", "0.00025", "-optimizer", "adamw", "-lm.unroll", "1024", "-batch_size", "64",
                "-grad_clip", "0.25", "-amp", "1", "-save_interval", "200000", "-stop_after", "100000",
                "-lr_sched.type", "cos", "-lr_warmup", "4000", "-wd", "0.01", "-min_lr_multiplier", "0.1",
                "-compile", "1", "-name", "tiny_isoflop",
                "-lm.eval.enabled", "1",
                "-lm.eval.lambada.enabled", "1",
                "-lm.eval.blimp.enabled", "1",
                "-lm.eval.ai2arc.enabled", "1",
                "-lm.eval.piqa.enabled", "1",
                "-lm.eval.winogrande.enabled", "1",
                "-lm.eval.hellaswag.enabled", "1"
            ]
        },
        {
            "name": "MoSA_tiny32",
            "type": "debugpy",
            "request": "launch",
            "program": "${workspaceFolder}/main.py",
            "console": "integratedTerminal",
            "env": {
                "CUDA_VISIBLE_DEVICES": "0"
            },
            "args": [
                "-task", "c4_flopmatched_mosa_transformer", "-test_interval", "100000", "-state_size", "512",
                "-transformer.ff_multiplier", "4", "-transformer.n_layers", "6",
                "-sa_moe.baseline_dense_heads", "9", "-sa_moe.shared_dense_heads", "4", "-sa_moe.sparsity", "32",
                "-sa_moe.include_first", "0", "-transformer.head_projection_size", "64", "-dropout", "0.0",
                "-lr", "0.00025", "-optimizer", "adamw", "-lm.unroll", "1024", "-batch_size", "64",
                "-grad_clip", "0.25", "-amp", "1", "-save_interval", "200000", "-stop_after", "100000",
                "-lr_sched.type", "cos", "-lr_warmup", "4000", "-wd", "0.01", "-min_lr_multiplier", "0.1",
                "-compile", "1", "-name", "tiny_isoflop",
                "-lm.eval.enabled", "1",
                "-lm.eval.lambada.enabled", "1",
                "-lm.eval.blimp.enabled", "1",
                "-lm.eval.ai2arc.enabled", "1",
                "-lm.eval.piqa.enabled", "1",
                "-lm.eval.winogrande.enabled", "1",
                "-lm.eval.hellaswag.enabled", "1"
            ]
        },
        {
            "name": "MoSA_tiny64",
            "type": "debugpy",
            "request": "launch",
            "program": "${workspaceFolder}/main.py",
            "console": "integratedTerminal",
            "env": {
                "CUDA_VISIBLE_DEVICES": "0"
            },
            "args": [
                "-task", "c4_flopmatched_mosa_transformer", "-test_interval", "100000", "-state_size", "512",
                "-transformer.ff_multiplier", "4", "-transformer.n_layers", "6",
                "-sa_moe.baseline_dense_heads", "9", "-sa_moe.shared_dense_heads", "4", "-sa_moe.sparsity", "64",
                "-sa_moe.include_first", "0", "-transformer.head_projection_size", "64", "-dropout", "0.0",
                "-lr", "0.00025", "-optimizer", "adamw", "-lm.unroll", "1024", "-batch_size", "64",
                "-grad_clip", "0.25", "-amp", "1", "-save_interval", "200000", "-stop_after", "100000",
                "-lr_sched.type", "cos", "-lr_warmup", "4000", "-wd", "0.01", "-min_lr_multiplier", "0.1",
                "-compile", "1", "-name", "tiny_isoflop",
                "-lm.eval.enabled", "1",
                "-lm.eval.lambada.enabled", "1",
                "-lm.eval.blimp.enabled", "1",
                "-lm.eval.ai2arc.enabled", "1",
                "-lm.eval.piqa.enabled", "1",
                "-lm.eval.winogrande.enabled", "1",
                "-lm.eval.hellaswag.enabled", "1"
            ]
        },
        {
            "name": "MoSA_tiny128",
            "type": "debugpy",
            "request": "launch",
            "program": "${workspaceFolder}/main.py",
            "console": "integratedTerminal",
            "env": {
                "CUDA_VISIBLE_DEVICES": "0,1"
            },
            "args": [
                "-task", "c4_flopmatched_mosa_transformer", "-test_interval", "100000", "-state_size", "512",
                "-transformer.ff_multiplier", "4", "-transformer.n_layers", "6",
                "-sa_moe.baseline_dense_heads", "9", "-sa_moe.shared_dense_heads", "4", "-sa_moe.sparsity", "128",
                "-sa_moe.include_first", "0", "-transformer.head_projection_size", "64", "-dropout", "0.0",
                "-lr", "0.00025", "-optimizer", "adamw", "-lm.unroll", "1024", "-batch_size", "64",
                "-grad_clip", "0.25", "-amp", "1", "-save_interval", "200000", "-stop_after", "100000",
                "-lr_sched.type", "cos", "-lr_warmup", "4000", "-wd", "0.01", "-min_lr_multiplier", "0.1",
                "-compile", "1", "-name", "tiny_isoflop",
                "-lm.eval.enabled", "1",
                "-lm.eval.lambada.enabled", "1",
                "-lm.eval.blimp.enabled", "1",
                "-lm.eval.ai2arc.enabled", "1",
                "-lm.eval.piqa.enabled", "1",
                "-lm.eval.winogrande.enabled", "1",
                "-lm.eval.hellaswag.enabled", "1"
            ]
        },







        {
            "name": "MoSA_small1",
            "type": "debugpy",
            "request": "launch",
            "program": "${workspaceFolder}/main.py",
            "console": "integratedTerminal",
            "env": {
                "CUDA_VISIBLE_DEVICES": "0"
            },
            "args": [
                "-task", "c4_flopmatched_mosa_transformer", "-test_interval", "100000", "-state_size", "1024",
                "-transformer.ff_multiplier", "4", "-transformer.n_layers", "9",
                "-sa_moe.baseline_dense_heads", "9", "-sa_moe.shared_dense_heads", "4", "-sa_moe.sparsity", "1",
                "-sa_moe.include_first", "0", "-transformer.head_projection_size", "64", "-dropout", "0.0",
                "-lr", "0.00025", "-optimizer", "adamw", "-lm.unroll", "1024", "-batch_size", "64",
                "-grad_clip", "0.25", "-amp", "1", "-save_interval", "600000", "-stop_after", "100000",
                "-lr_sched.type", "cos", "-lr_warmup", "4000", "-wd", "0.01", "-min_lr_multiplier", "0.1",
                "-compile", "1", "-name", "small_isoflop",
                "-lm.eval.enabled", "1",
                "-lm.eval.lambada.enabled", "1",
                "-lm.eval.blimp.enabled", "1",
                "-lm.eval.ai2arc.enabled", "1",
                "-lm.eval.piqa.enabled", "1",
                "-lm.eval.winogrande.enabled", "1",
                "-lm.eval.hellaswag.enabled", "1"
            ]
        },
        {
            "name": "MoSA_small2",
            "type": "debugpy",
            "request": "launch",
            "program": "${workspaceFolder}/main.py",
            "console": "integratedTerminal",
            "env": {
                "CUDA_VISIBLE_DEVICES": "0"
            },
            "args": [
                "-task", "c4_flopmatched_mosa_transformer", "-test_interval", "100000", "-state_size", "1024",
                "-transformer.ff_multiplier", "4", "-transformer.n_layers", "9",
                "-sa_moe.baseline_dense_heads", "9", "-sa_moe.shared_dense_heads", "4", "-sa_moe.sparsity", "2",
                "-sa_moe.include_first", "0", "-transformer.head_projection_size", "64", "-dropout", "0.0",
                "-lr", "0.00025", "-optimizer", "adamw", "-lm.unroll", "1024", "-batch_size", "64",
                "-grad_clip", "0.25", "-amp", "1", "-save_interval", "600000", "-stop_after", "100000",
                "-lr_sched.type", "cos", "-lr_warmup", "4000", "-wd", "0.01", "-min_lr_multiplier", "0.1",
                "-compile", "1", "-name", "small_isoflop",
                "-lm.eval.enabled", "1",
                "-lm.eval.lambada.enabled", "1",
                "-lm.eval.blimp.enabled", "1",
                "-lm.eval.ai2arc.enabled", "1",
                "-lm.eval.piqa.enabled", "1",
                "-lm.eval.winogrande.enabled", "1",
                "-lm.eval.hellaswag.enabled", "1"
            ]
        },
        {
            "name": "MoSA_small4",
            "type": "debugpy",
            "request": "launch",
            "program": "${workspaceFolder}/main.py",
            "console": "integratedTerminal",
            "env": {
                "CUDA_VISIBLE_DEVICES": "0"
            },
            "args": [
                "-task", "c4_flopmatched_mosa_transformer", "-test_interval", "100000", "-state_size", "1024",
                "-transformer.ff_multiplier", "4", "-transformer.n_layers", "9",
                "-sa_moe.baseline_dense_heads", "9", "-sa_moe.shared_dense_heads", "4", "-sa_moe.sparsity", "4",
                "-sa_moe.include_first", "0", "-transformer.head_projection_size", "64", "-dropout", "0.0",
                "-lr", "0.00025", "-optimizer", "adamw", "-lm.unroll", "1024", "-batch_size", "64",
                "-grad_clip", "0.25", "-amp", "1", "-save_interval", "600000", "-stop_after", "100000",
                "-lr_sched.type", "cos", "-lr_warmup", "4000", "-wd", "0.01", "-min_lr_multiplier", "0.1",
                "-compile", "1", "-name", "small_isoflop",
                "-lm.eval.enabled", "1",
                "-lm.eval.lambada.enabled", "1",
                "-lm.eval.blimp.enabled", "1",
                "-lm.eval.ai2arc.enabled", "1",
                "-lm.eval.piqa.enabled", "1",
                "-lm.eval.winogrande.enabled", "1",
                "-lm.eval.hellaswag.enabled", "1"
            ]
        },
        {
            "name": "MoSA_small8",
            "type": "debugpy",
            "request": "launch",
            "program": "${workspaceFolder}/main.py",
            "console": "integratedTerminal",
            "env": {
                "CUDA_VISIBLE_DEVICES": "0"
            },
            "args": [
                "-task", "c4_flopmatched_mosa_transformer", "-test_interval", "100000", "-state_size", "1024",
                "-transformer.ff_multiplier", "4", "-transformer.n_layers", "9",
                "-sa_moe.baseline_dense_heads", "9", "-sa_moe.shared_dense_heads", "4", "-sa_moe.sparsity", "8",
                "-sa_moe.include_first", "0", "-transformer.head_projection_size", "64", "-dropout", "0.0",
                "-lr", "0.00025", "-optimizer", "adamw", "-lm.unroll", "1024", "-batch_size", "64",
                "-grad_clip", "0.25", "-amp", "1", "-save_interval", "600000", "-stop_after", "100000",
                "-lr_sched.type", "cos", "-lr_warmup", "4000", "-wd", "0.01", "-min_lr_multiplier", "0.1",
                "-compile", "1", "-name", "small_isoflop",
                "-lm.eval.enabled", "1",
                "-lm.eval.lambada.enabled", "1",
                "-lm.eval.blimp.enabled", "1",
                "-lm.eval.ai2arc.enabled", "1",
                "-lm.eval.piqa.enabled", "1",
                "-lm.eval.winogrande.enabled", "1",
                "-lm.eval.hellaswag.enabled", "1"
            ]
        },
        {
            "name": "MoSA_small16",
            "type": "debugpy",
            "request": "launch",
            "program": "${workspaceFolder}/main.py",
            "console": "integratedTerminal",
            "env": {
                "CUDA_VISIBLE_DEVICES": "0"
            },
            "args": [
                "-task", "c4_flopmatched_mosa_transformer", "-test_interval", "100000", "-state_size", "1024",
                "-transformer.ff_multiplier", "4", "-transformer.n_layers", "9",
                "-sa_moe.baseline_dense_heads", "9", "-sa_moe.shared_dense_heads", "4", "-sa_moe.sparsity", "16",
                "-sa_moe.include_first", "0", "-transformer.head_projection_size", "64", "-dropout", "0.0",
                "-lr", "0.00025", "-optimizer", "adamw", "-lm.unroll", "1024", "-batch_size", "64",
                "-grad_clip", "0.25", "-amp", "1", "-save_interval", "600000", "-stop_after", "100000",
                "-lr_sched.type", "cos", "-lr_warmup", "4000", "-wd", "0.01", "-min_lr_multiplier", "0.1",
                "-compile", "1", "-name", "small_isoflop",
                "-lm.eval.enabled", "1",
                "-lm.eval.lambada.enabled", "1",
                "-lm.eval.blimp.enabled", "1",
                "-lm.eval.ai2arc.enabled", "1",
                "-lm.eval.piqa.enabled", "1",
                "-lm.eval.winogrande.enabled", "1",
                "-lm.eval.hellaswag.enabled", "1"
            ]
        },
        {
            "name": "MoSA_small32",
            "type": "debugpy",
            "request": "launch",
            "program": "${workspaceFolder}/main.py",
            "console": "integratedTerminal",
            "env": {
                "CUDA_VISIBLE_DEVICES": "0,1"
            },
            "args": [
                "-task", "c4_flopmatched_mosa_transformer", "-test_interval", "100000", "-state_size", "1024",
                "-transformer.ff_multiplier", "4", "-transformer.n_layers", "9",
                "-sa_moe.baseline_dense_heads", "9", "-sa_moe.shared_dense_heads", "4", "-sa_moe.sparsity", "32",
                "-sa_moe.include_first", "0", "-transformer.head_projection_size", "64", "-dropout", "0.0",
                "-lr", "0.00025", "-optimizer", "adamw", "-lm.unroll", "1024", "-batch_size", "64",
                "-grad_clip", "0.25", "-amp", "1", "-save_interval", "600000", "-stop_after", "100000",
                "-lr_sched.type", "cos", "-lr_warmup", "4000", "-wd", "0.01", "-min_lr_multiplier", "0.1",
                "-compile", "1", "-name", "small_isoflop",
                "-lm.eval.enabled", "1",
                "-lm.eval.lambada.enabled", "1",
                "-lm.eval.blimp.enabled", "1",
                "-lm.eval.ai2arc.enabled", "1",
                "-lm.eval.piqa.enabled", "1",
                "-lm.eval.winogrande.enabled", "1",
                "-lm.eval.hellaswag.enabled", "1"
            ]
        },
        {
            "name": "MoSA_small64",
            "type": "debugpy",
            "request": "launch",
            "program": "${workspaceFolder}/main.py",
            "console": "integratedTerminal",
            "env": {
                "CUDA_VISIBLE_DEVICES": "0,1"
            },
            "args": [
                "-task", "c4_flopmatched_mosa_transformer", "-test_interval", "100000", "-state_size", "1024",
                "-transformer.ff_multiplier", "4", "-transformer.n_layers", "9",
                "-sa_moe.baseline_dense_heads", "9", "-sa_moe.shared_dense_heads", "4", "-sa_moe.sparsity", "64",
                "-sa_moe.include_first", "0", "-transformer.head_projection_size", "64", "-dropout", "0.0",
                "-lr", "0.00025", "-optimizer", "adamw", "-lm.unroll", "1024", "-batch_size", "64",
                "-grad_clip", "0.25", "-amp", "1", "-save_interval", "600000", "-stop_after", "100000",
                "-lr_sched.type", "cos", "-lr_warmup", "4000", "-wd", "0.01", "-min_lr_multiplier", "0.1",
                "-compile", "1", "-name", "small_isoflop",
                "-lm.eval.enabled", "1",
                "-lm.eval.lambada.enabled", "1",
                "-lm.eval.blimp.enabled", "1",
                "-lm.eval.ai2arc.enabled", "1",
                "-lm.eval.piqa.enabled", "1",
                "-lm.eval.winogrande.enabled", "1",
                "-lm.eval.hellaswag.enabled", "1"
            ]
        },










        {
            "name": "MoSA_mid1",
            "type": "debugpy",
            "request": "launch",
            "program": "${workspaceFolder}/main.py",
            "console": "integratedTerminal",
            "env": {
                "CUDA_VISIBLE_DEVICES": "0"
            },
            "args": [
                "-task", "c4_flopmatched_mosa_transformer", "-test_interval", "100000", "-state_size", "1024",
                "-transformer.ff_multiplier", "4", "-transformer.n_layers", "18",
                "-sa_moe.baseline_dense_heads", "9", "-sa_moe.shared_dense_heads", "4", "-sa_moe.sparsity", "1",
                "-sa_moe.include_first", "0", "-transformer.head_projection_size", "64", "-dropout", "0.0",
                "-lr", "0.00025", "-optimizer", "adamw", "-lm.unroll", "1024", "-batch_size", "64",
                "-grad_clip", "0.25", "-amp", "1", "-save_interval", "600000", "-stop_after", "100000",
                "-lr_sched.type", "cos", "-lr_warmup", "4000", "-wd", "0.01", "-min_lr_multiplier", "0.1",
                "-compile", "1", "-name", "mid_isoflop",
                "-lm.eval.enabled", "1",
                "-lm.eval.lambada.enabled", "1",
                "-lm.eval.blimp.enabled", "1",
                "-lm.eval.ai2arc.enabled", "1",
                "-lm.eval.piqa.enabled", "1",
                "-lm.eval.winogrande.enabled", "1",
                "-lm.eval.hellaswag.enabled", "1"
            ]
        },
        {
            "name": "MoSA_mid2",
            "type": "debugpy",
            "request": "launch",
            "program": "${workspaceFolder}/main.py",
            "console": "integratedTerminal",
            "env": {
                "CUDA_VISIBLE_DEVICES": "0,1"
            },
            "args": [
                "-task", "c4_flopmatched_mosa_transformer", "-test_interval", "100000", "-state_size", "1024",
                "-transformer.ff_multiplier", "4", "-transformer.n_layers", "18",
                "-sa_moe.baseline_dense_heads", "9", "-sa_moe.shared_dense_heads", "4", "-sa_moe.sparsity", "2",
                "-sa_moe.include_first", "0", "-transformer.head_projection_size", "64", "-dropout", "0.0",
                "-lr", "0.00025", "-optimizer", "adamw", "-lm.unroll", "1024", "-batch_size", "64",
                "-grad_clip", "0.25", "-amp", "1", "-save_interval", "600000", "-stop_after", "100000",
                "-lr_sched.type", "cos", "-lr_warmup", "4000", "-wd", "0.01", "-min_lr_multiplier", "0.1",
                "-compile", "1", "-name", "mid_isoflop",
                "-lm.eval.enabled", "1",
                "-lm.eval.lambada.enabled", "1",
                "-lm.eval.blimp.enabled", "1",
                "-lm.eval.ai2arc.enabled", "1",
                "-lm.eval.piqa.enabled", "1",
                "-lm.eval.winogrande.enabled", "1",
                "-lm.eval.hellaswag.enabled", "1"
            ]
        },
        {
            "name": "MoSA_mid4",
            "type": "debugpy",
            "request": "launch",
            "program": "${workspaceFolder}/main.py",
            "console": "integratedTerminal",
            "env": {
                "CUDA_VISIBLE_DEVICES": "0,1"
            },
            "args": [
                "-task", "c4_flopmatched_mosa_transformer", "-test_interval", "100000", "-state_size", "1024",
                "-transformer.ff_multiplier", "4", "-transformer.n_layers", "18",
                "-sa_moe.baseline_dense_heads", "9", "-sa_moe.shared_dense_heads", "4", "-sa_moe.sparsity", "4",
                "-sa_moe.include_first", "0", "-transformer.head_projection_size", "64", "-dropout", "0.0",
                "-lr", "0.00025", "-optimizer", "adamw", "-lm.unroll", "1024", "-batch_size", "64",
                "-grad_clip", "0.25", "-amp", "1", "-save_interval", "600000", "-stop_after", "100000",
                "-lr_sched.type", "cos", "-lr_warmup", "4000", "-wd", "0.01", "-min_lr_multiplier", "0.1",
                "-compile", "1", "-name", "mid_isoflop",
                "-lm.eval.enabled", "1",
                "-lm.eval.lambada.enabled", "1",
                "-lm.eval.blimp.enabled", "1",
                "-lm.eval.ai2arc.enabled", "1",
                "-lm.eval.piqa.enabled", "1",
                "-lm.eval.winogrande.enabled", "1",
                "-lm.eval.hellaswag.enabled", "1"
            ]
        },
        {
            "name": "MoSA_mid8",
            "type": "debugpy",
            "request": "launch",
            "program": "${workspaceFolder}/main.py",
            "console": "integratedTerminal",
            "env": {
                "CUDA_VISIBLE_DEVICES": "0,1"
            },
            "args": [
                "-task", "c4_flopmatched_mosa_transformer", "-test_interval", "100000", "-state_size", "1024",
                "-transformer.ff_multiplier", "4", "-transformer.n_layers", "18",
                "-sa_moe.baseline_dense_heads", "9", "-sa_moe.shared_dense_heads", "4", "-sa_moe.sparsity", "8",
                "-sa_moe.include_first", "0", "-transformer.head_projection_size", "64", "-dropout", "0.0",
                "-lr", "0.00025", "-optimizer", "adamw", "-lm.unroll", "1024", "-batch_size", "64",
                "-grad_clip", "0.25", "-amp", "1", "-save_interval", "600000", "-stop_after", "100000",
                "-lr_sched.type", "cos", "-lr_warmup", "4000", "-wd", "0.01", "-min_lr_multiplier", "0.1",
                "-compile", "1", "-name", "mid_isoflop",
                "-lm.eval.enabled", "1",
                "-lm.eval.lambada.enabled", "1",
                "-lm.eval.blimp.enabled", "1",
                "-lm.eval.ai2arc.enabled", "1",
                "-lm.eval.piqa.enabled", "1",
                "-lm.eval.winogrande.enabled", "1",
                "-lm.eval.hellaswag.enabled", "1"
            ]
        },
        {
            "name": "MoSA_mid16",
            "type": "debugpy",
            "request": "launch",
            "program": "${workspaceFolder}/main.py",
            "console": "integratedTerminal",
            "env": {
                "CUDA_VISIBLE_DEVICES": "0,1"
            },
            "args": [
                "-task", "c4_flopmatched_mosa_transformer", "-test_interval", "100000", "-state_size", "1024",
                "-transformer.ff_multiplier", "4", "-transformer.n_layers", "18",
                "-sa_moe.baseline_dense_heads", "9", "-sa_moe.shared_dense_heads", "4", "-sa_moe.sparsity", "16",
                "-sa_moe.include_first", "0", "-transformer.head_projection_size", "64", "-dropout", "0.0",
                "-lr", "0.00025", "-optimizer", "adamw", "-lm.unroll", "1024", "-batch_size", "64",
                "-grad_clip", "0.25", "-amp", "1", "-save_interval", "600000", "-stop_after", "100000",
                "-lr_sched.type", "cos", "-lr_warmup", "4000", "-wd", "0.01", "-min_lr_multiplier", "0.1",
                "-compile", "1", "-name", "mid_isoflop",
                "-lm.eval.enabled", "1",
                "-lm.eval.lambada.enabled", "1",
                "-lm.eval.blimp.enabled", "1",
                "-lm.eval.ai2arc.enabled", "1",
                "-lm.eval.piqa.enabled", "1",
                "-lm.eval.winogrande.enabled", "1",
                "-lm.eval.hellaswag.enabled", "1"
            ]
        },
        {
            "name": "MoSA_mid32",
            "type": "debugpy",
            "request": "launch",
            "program": "${workspaceFolder}/main.py",
            "console": "integratedTerminal",
            "env": {
                "CUDA_VISIBLE_DEVICES": "0,1"
            },
            "args": [
                "-task", "c4_flopmatched_mosa_transformer", "-test_interval", "100000", "-state_size", "1024",
                "-transformer.ff_multiplier", "4", "-transformer.n_layers", "18",
                "-sa_moe.baseline_dense_heads", "9", "-sa_moe.shared_dense_heads", "4", "-sa_moe.sparsity", "32",
                "-sa_moe.include_first", "0", "-transformer.head_projection_size", "64", "-dropout", "0.0",
                "-lr", "0.00025", "-optimizer", "adamw", "-lm.unroll", "1024", "-batch_size", "64",
                "-grad_clip", "0.25", "-amp", "1", "-save_interval", "600000", "-stop_after", "100000",
                "-lr_sched.type", "cos", "-lr_warmup", "4000", "-wd", "0.01", "-min_lr_multiplier", "0.1",
                "-compile", "1", "-name", "mid_isoflop",
                "-lm.eval.enabled", "1",
                "-lm.eval.lambada.enabled", "1",
                "-lm.eval.blimp.enabled", "1",
                "-lm.eval.ai2arc.enabled", "1",
                "-lm.eval.piqa.enabled", "1",
                "-lm.eval.winogrande.enabled", "1",
                "-lm.eval.hellaswag.enabled", "1"
            ]
        },









        {
            "name": "MoSA_large1",
            "type": "debugpy",
            "request": "launch",
            "program": "${workspaceFolder}/main.py",
            "console": "integratedTerminal",
            "env": {
                "CUDA_VISIBLE_DEVICES": "0,1"
            },
            "args": [
                "-task", "c4_flopmatched_parthead_symmetric_rope_sa_transformer", "-test_interval", "100000",
                "-state_size", "1280", "-transformer.ff_multiplier", "4", "-transformer.n_layers", "27",
                "-sa_moe.baseline_dense_heads", "16", "-sa_moe.shared_dense_heads", "12", "-sa_moe.sparsity", "1",
                "-sa_moe.include_first", "0", "-sa_moe.noise_std", "0.0",
                "-transformer.head_projection_size", "64", "-dropout", "0.0", "-lr", "0.00025",
                "-optimizer", "adamw", "-lm.unroll", "1024", "-batch_size", "64", "-grad_clip", "0.25",
                "-amp", "1", "-save_interval", "50000", "-stop_after", "100000", "-lr_sched.type", "cos",
                "-lr_warmup", "4000", "-wd", "0.01", "-min_lr_multiplier", "0.1", "-compile", "1",
                "-name", "large_isoflop",
                "-lm.eval.enabled", "1",
                "-lm.eval.lambada.enabled", "1",
                "-lm.eval.blimp.enabled", "1",
                "-lm.eval.ai2arc.enabled", "1",
                "-lm.eval.piqa.enabled", "1",
                "-lm.eval.winogrande.enabled", "1",
                "-lm.eval.hellaswag.enabled", "1"
            ]
        },
        {
            "name": "MoSA_large2",
            "type": "debugpy",
            "request": "launch",
            "program": "${workspaceFolder}/main.py",
            "console": "integratedTerminal",
            "env": {
                "CUDA_VISIBLE_DEVICES": "0,1,2,3"
            },
            "args": [
                "-task", "c4_flopmatched_parthead_symmetric_rope_sa_transformer", "-test_interval", "100000",
                "-state_size", "1280", "-transformer.ff_multiplier", "4", "-transformer.n_layers", "27",
                "-sa_moe.baseline_dense_heads", "16", "-sa_moe.shared_dense_heads", "12", "-sa_moe.sparsity", "2",
                "-sa_moe.include_first", "0", "-sa_moe.noise_std", "0.0",
                "-transformer.head_projection_size", "64", "-dropout", "0.0", "-lr", "0.00025",
                "-optimizer", "adamw", "-lm.unroll", "1024", "-batch_size", "64", "-grad_clip", "0.25",
                "-amp", "1", "-save_interval", "50000", "-stop_after", "100000", "-lr_sched.type", "cos",
                "-lr_warmup", "4000", "-wd", "0.01", "-min_lr_multiplier", "0.1", "-compile", "1",
                "-name", "large_isoflop",
                "-lm.eval.enabled", "1",
                "-lm.eval.lambada.enabled", "1",
                "-lm.eval.blimp.enabled", "1",
                "-lm.eval.ai2arc.enabled", "1",
                "-lm.eval.piqa.enabled", "1",
                "-lm.eval.winogrande.enabled", "1",
                "-lm.eval.hellaswag.enabled", "1"
            ]
        },
        {
            "name": "MoSA_large4",
            "type": "debugpy",
            "request": "launch",
            "program": "${workspaceFolder}/main.py",
            "console": "integratedTerminal",
            "env": {
                "CUDA_VISIBLE_DEVICES": "0,1,2,3"
            },
            "args": [
                "-task", "c4_flopmatched_parthead_symmetric_rope_sa_transformer", "-test_interval", "100000",
                "-state_size", "1280", "-transformer.ff_multiplier", "4", "-transformer.n_layers", "27",
                "-sa_moe.baseline_dense_heads", "16", "-sa_moe.shared_dense_heads", "12", "-sa_moe.sparsity", "4",
                "-sa_moe.include_first", "0", "-sa_moe.noise_std", "0.0",
                "-transformer.head_projection_size", "64", "-dropout", "0.0", "-lr", "0.00025",
                "-optimizer", "adamw", "-lm.unroll", "1024", "-batch_size", "64", "-grad_clip", "0.25",
                "-amp", "1", "-save_interval", "50000", "-stop_after", "100000", "-lr_sched.type", "cos",
                "-lr_warmup", "4000", "-wd", "0.01", "-min_lr_multiplier", "0.1", "-compile", "1",
                "-name", "large_isoflop",
                "-lm.eval.enabled", "1",
                "-lm.eval.lambada.enabled", "1",
                "-lm.eval.blimp.enabled", "1",
                "-lm.eval.ai2arc.enabled", "1",
                "-lm.eval.piqa.enabled", "1",
                "-lm.eval.winogrande.enabled", "1",
                "-lm.eval.hellaswag.enabled", "1"
            ]
        },






        {
            "name": "routing_tiny_2",
            "type": "debugpy",
            "request": "launch",
            "program": "${workspaceFolder}/main.py",
            "console": "integratedTerminal",
            "env": {
                "CUDA_VISIBLE_DEVICES": "0"
            },
            "args": [
                "-task", "c4_flopmatched_parthead_routing_transformer", "-test_interval", "100000",
                "-state_size", "512", "-transformer.ff_multiplier", "4", "-transformer.n_layers", "6",
                "-sa_moe.baseline_dense_heads", "9", "-sa_moe.shared_dense_heads", "4", "-sa_moe.sparsity", "2",
                "-sa_moe.include_first", "0", "-transformer.head_projection_size", "64", "-dropout", "0.0",
                "-lr", "0.00025", "-optimizer", "adamw", "-lm.unroll", "1024", "-batch_size", "64",
                "-grad_clip", "0.25", "-amp", "1", "-save_interval", "200000", "-stop_after", "100000",
                "-lr_sched.type", "cos", "-lr_warmup", "4000", "-wd", "0.01", "-min_lr_multiplier", "0.1",
                "-compile", "1", "-name", "tiny_routing_isoflop",
                "-lm.eval.enabled", "1",
                "-lm.eval.lambada.enabled", "1",
                "-lm.eval.blimp.enabled", "1",
                "-lm.eval.ai2arc.enabled", "1",
                "-lm.eval.piqa.enabled", "1",
                "-lm.eval.winogrande.enabled", "1",
                "-lm.eval.hellaswag.enabled", "1"
            ]
        },

        {
            "name": "routing_tiny_4",
            "type": "debugpy",
            "request": "launch",
            "program": "${workspaceFolder}/main.py",
            "console": "integratedTerminal",
            "env": {
                "CUDA_VISIBLE_DEVICES": "0"
            },
            "args": [
                "-task", "c4_flopmatched_parthead_routing_transformer", "-test_interval", "100000",
                "-state_size", "512", "-transformer.ff_multiplier", "4", "-transformer.n_layers", "6",
                "-sa_moe.baseline_dense_heads", "9", "-sa_moe.shared_dense_heads", "4", "-sa_moe.sparsity", "4",
                "-sa_moe.include_first", "0", "-transformer.head_projection_size", "64", "-dropout", "0.0",
                "-lr", "0.00025", "-optimizer", "adamw", "-lm.unroll", "1024", "-batch_size", "64",
                "-grad_clip", "0.25", "-amp", "1", "-save_interval", "200000", "-stop_after", "100000",
                "-lr_sched.type", "cos", "-lr_warmup", "4000", "-wd", "0.01", "-min_lr_multiplier", "0.1",
                "-compile", "1", "-name", "tiny_routing_isoflop",
                "-lm.eval.enabled", "1",
                "-lm.eval.lambada.enabled", "1",
                "-lm.eval.blimp.enabled", "1",
                "-lm.eval.ai2arc.enabled", "1",
                "-lm.eval.piqa.enabled", "1",
                "-lm.eval.winogrande.enabled", "1",
                "-lm.eval.hellaswag.enabled", "1"
            ]
        },

        {
            "name": "routing_tiny_8",
            "type": "debugpy",
            "request": "launch",
            "program": "${workspaceFolder}/main.py",
            "console": "integratedTerminal",
            "env": {
                "CUDA_VISIBLE_DEVICES": "0"
            },
            "args": [
                "-task", "c4_flopmatched_parthead_routing_transformer", "-test_interval", "100000",
                "-state_size", "512", "-transformer.ff_multiplier", "4", "-transformer.n_layers", "6",
                "-sa_moe.baseline_dense_heads", "9", "-sa_moe.shared_dense_heads", "4", "-sa_moe.sparsity", "8",
                "-sa_moe.include_first", "0", "-transformer.head_projection_size", "64", "-dropout", "0.0",
                "-lr", "0.00025", "-optimizer", "adamw", "-lm.unroll", "1024", "-batch_size", "64",
                "-grad_clip", "0.25", "-amp", "1", "-save_interval", "200000", "-stop_after", "100000",
                "-lr_sched.type", "cos", "-lr_warmup", "4000", "-wd", "0.01", "-min_lr_multiplier", "0.1",
                "-compile", "1", "-name", "tiny_routing_isoflop",
                "-lm.eval.enabled", "1",
                "-lm.eval.lambada.enabled", "1",
                "-lm.eval.blimp.enabled", "1",
                "-lm.eval.ai2arc.enabled", "1",
                "-lm.eval.piqa.enabled", "1",
                "-lm.eval.winogrande.enabled", "1",
                "-lm.eval.hellaswag.enabled", "1"
            ]
        },

        {
            "name": "routing_tiny_16",
            "type": "debugpy",
            "request": "launch",
            "program": "${workspaceFolder}/main.py",
            "console": "integratedTerminal",
            "env": {
                "CUDA_VISIBLE_DEVICES": "0"
            },
            "args": [
                "-task", "c4_flopmatched_parthead_routing_transformer", "-test_interval", "100000",
                "-state_size", "512", "-transformer.ff_multiplier", "4", "-transformer.n_layers", "6",
                "-sa_moe.baseline_dense_heads", "9", "-sa_moe.shared_dense_heads", "4", "-sa_moe.sparsity", "16",
                "-sa_moe.include_first", "0", "-transformer.head_projection_size", "64", "-dropout", "0.0",
                "-lr", "0.00025", "-optimizer", "adamw", "-lm.unroll", "1024", "-batch_size", "64",
                "-grad_clip", "0.25", "-amp", "1", "-save_interval", "200000", "-stop_after", "100000",
                "-lr_sched.type", "cos", "-lr_warmup", "4000", "-wd", "0.01", "-min_lr_multiplier", "0.1",
                "-compile", "1", "-name", "tiny_routing_isoflop",
                "-lm.eval.enabled", "1",
                "-lm.eval.lambada.enabled", "1",
                "-lm.eval.blimp.enabled", "1",
                "-lm.eval.ai2arc.enabled", "1",
                "-lm.eval.piqa.enabled", "1",
                "-lm.eval.winogrande.enabled", "1",
                "-lm.eval.hellaswag.enabled", "1"
            ]
        },

        {
            "name": "routing_tiny_32",
            "type": "debugpy",
            "request": "launch",
            "program": "${workspaceFolder}/main.py",
            "console": "integratedTerminal",
            "env": {
                "CUDA_VISIBLE_DEVICES": "0"
            },
            "args": [
                "-task", "c4_flopmatched_parthead_routing_transformer", "-test_interval", "100000",
                "-state_size", "512", "-transformer.ff_multiplier", "4", "-transformer.n_layers", "6",
                "-sa_moe.baseline_dense_heads", "9", "-sa_moe.shared_dense_heads", "4", "-sa_moe.sparsity", "32",
                "-sa_moe.include_first", "0", "-transformer.head_projection_size", "64", "-dropout", "0.0",
                "-lr", "0.00025", "-optimizer", "adamw", "-lm.unroll", "1024", "-batch_size", "64",
                "-grad_clip", "0.25", "-amp", "1", "-save_interval", "200000", "-stop_after", "100000",
                "-lr_sched.type", "cos", "-lr_warmup", "4000", "-wd", "0.01", "-min_lr_multiplier", "0.1",
                "-compile", "1", "-name", "tiny_routing_isoflop",
                "-lm.eval.enabled", "1",
                "-lm.eval.lambada.enabled", "1",
                "-lm.eval.blimp.enabled", "1",
                "-lm.eval.ai2arc.enabled", "1",
                "-lm.eval.piqa.enabled", "1",
                "-lm.eval.winogrande.enabled", "1",
                "-lm.eval.hellaswag.enabled", "1"
            ]
        },

        {
            "name": "routing_tiny_64",
            "type": "debugpy",
            "request": "launch",
            "program": "${workspaceFolder}/main.py",
            "console": "integratedTerminal",
            "env": {
                "CUDA_VISIBLE_DEVICES": "0"
            },
            "args": [
                "-task", "c4_flopmatched_parthead_routing_transformer", "-test_interval", "100000",
                "-state_size", "512", "-transformer.ff_multiplier", "4", "-transformer.n_layers", "6",
                "-sa_moe.baseline_dense_heads", "9", "-sa_moe.shared_dense_heads", "4", "-sa_moe.sparsity", "64",
                "-sa_moe.include_first", "0", "-transformer.head_projection_size", "64", "-dropout", "0.0",
                "-lr", "0.00025", "-optimizer", "adamw", "-lm.unroll", "1024", "-batch_size", "64",
                "-grad_clip", "0.25", "-amp", "1", "-save_interval", "200000", "-stop_after", "100000",
                "-lr_sched.type", "cos", "-lr_warmup", "4000", "-wd", "0.01", "-min_lr_multiplier", "0.1",
                "-compile", "1", "-name", "tiny_routing_isoflop",
                "-lm.eval.enabled", "1",
                "-lm.eval.lambada.enabled", "1",
                "-lm.eval.blimp.enabled", "1",
                "-lm.eval.ai2arc.enabled", "1",
                "-lm.eval.piqa.enabled", "1",
                "-lm.eval.winogrande.enabled", "1",
                "-lm.eval.hellaswag.enabled", "1"
            ]
        },

        {
            "name": "routing_tiny_128",
            "type": "debugpy",
            "request": "launch",
            "program": "${workspaceFolder}/main.py",
            "console": "integratedTerminal",
            "env": {
                "CUDA_VISIBLE_DEVICES": "0,1"
            },
            "args": [
                "-task", "c4_flopmatched_parthead_routing_transformer", "-test_interval", "100000",
                "-state_size", "512", "-transformer.ff_multiplier", "4", "-transformer.n_layers", "6",
                "-sa_moe.baseline_dense_heads", "9", "-sa_moe.shared_dense_heads", "4", "-sa_moe.sparsity", "128",
                "-sa_moe.include_first", "0", "-transformer.head_projection_size", "64", "-dropout", "0.0",
                "-lr", "0.00025", "-optimizer", "adamw", "-lm.unroll", "1024", "-batch_size", "64",
                "-grad_clip", "0.25", "-amp", "1", "-save_interval", "200000", "-stop_after", "100000",
                "-lr_sched.type", "cos", "-lr_warmup", "4000", "-wd", "0.01", "-min_lr_multiplier", "0.1",
                "-compile", "1", "-name", "tiny_routing_isoflop",
                "-lm.eval.enabled", "1",
                "-lm.eval.lambada.enabled", "1",
                "-lm.eval.blimp.enabled", "1",
                "-lm.eval.ai2arc.enabled", "1",
                "-lm.eval.piqa.enabled", "1",
                "-lm.eval.winogrande.enabled", "1",
                "-lm.eval.hellaswag.enabled", "1"
            ]
        },
        {
            "name": "routing_small_2",
            "type": "debugpy",
            "request": "launch",
            "program": "${workspaceFolder}/main.py",
            "console": "integratedTerminal",
            "env": {
                "CUDA_VISIBLE_DEVICES": "0,1"
            },
            "args": [
                "-task", "c4_flopmatched_parthead_routing_transformer", "-test_interval", "100000",
                "-state_size", "1024", "-transformer.ff_multiplier", "4", "-transformer.n_layers", "9",
                "-sa_moe.baseline_dense_heads", "9", "-sa_moe.shared_dense_heads", "4", "-sa_moe.sparsity", "2",
                "-sa_moe.include_first", "0", "-transformer.head_projection_size", "64", "-dropout", "0.0",
                "-lr", "0.00025", "-optimizer", "adamw", "-lm.unroll", "1024", "-batch_size", "64",
                "-grad_clip", "0.25", "-amp", "1", "-save_interval", "600000", "-stop_after", "100000",
                "-lr_sched.type", "cos", "-lr_warmup", "4000", "-wd", "0.01", "-min_lr_multiplier", "0.1",
                "-compile", "1", "-name", "small_routing_isoflop",
                "-lm.eval.enabled", "1",
                "-lm.eval.lambada.enabled", "1",
                "-lm.eval.blimp.enabled", "1",
                "-lm.eval.ai2arc.enabled", "1",
                "-lm.eval.piqa.enabled", "1",
                "-lm.eval.winogrande.enabled", "1",
                "-lm.eval.hellaswag.enabled", "1"
            ]
        },
        {
            "name": "routing_small_4",
            "type": "debugpy",
            "request": "launch",
            "program": "${workspaceFolder}/main.py",
            "console": "integratedTerminal",
            "env": {
                "CUDA_VISIBLE_DEVICES": "0,1"
            },
            "args": [
                "-task", "c4_flopmatched_parthead_routing_transformer", "-test_interval", "100000",
                "-state_size", "1024", "-transformer.ff_multiplier", "4", "-transformer.n_layers", "9",
                "-sa_moe.baseline_dense_heads", "9", "-sa_moe.shared_dense_heads", "4", "-sa_moe.sparsity", "4",
                "-sa_moe.include_first", "0", "-transformer.head_projection_size", "64", "-dropout", "0.0",
                "-lr", "0.00025", "-optimizer", "adamw", "-lm.unroll", "1024", "-batch_size", "64",
                "-grad_clip", "0.25", "-amp", "1", "-save_interval", "600000", "-stop_after", "100000",
                "-lr_sched.type", "cos", "-lr_warmup", "4000", "-wd", "0.01", "-min_lr_multiplier", "0.1",
                "-compile", "1", "-name", "small_routing_isoflop",
                "-lm.eval.enabled", "1",
                "-lm.eval.lambada.enabled", "1",
                "-lm.eval.blimp.enabled", "1",
                "-lm.eval.ai2arc.enabled", "1",
                "-lm.eval.piqa.enabled", "1",
                "-lm.eval.winogrande.enabled", "1",
                "-lm.eval.hellaswag.enabled", "1"
            ]
        },
        {
            "name": "routing_small_8",
            "type": "debugpy",
            "request": "launch",
            "program": "${workspaceFolder}/main.py",
            "console": "integratedTerminal",
            "env": {
                "CUDA_VISIBLE_DEVICES": "0,1"
            },
            "args": [
                "-task", "c4_flopmatched_parthead_routing_transformer", "-test_interval", "100000",
                "-state_size", "1024", "-transformer.ff_multiplier", "4", "-transformer.n_layers", "9",
                "-sa_moe.baseline_dense_heads", "9", "-sa_moe.shared_dense_heads", "4", "-sa_moe.sparsity", "8",
                "-sa_moe.include_first", "0", "-transformer.head_projection_size", "64", "-dropout", "0.0",
                "-lr", "0.00025", "-optimizer", "adamw", "-lm.unroll", "1024", "-batch_size", "64",
                "-grad_clip", "0.25", "-amp", "1", "-save_interval", "600000", "-stop_after", "100000",
                "-lr_sched.type", "cos", "-lr_warmup", "4000", "-wd", "0.01", "-min_lr_multiplier", "0.1",
                "-compile", "1", "-name", "small_routing_isoflop",
                "-lm.eval.enabled", "1",
                "-lm.eval.lambada.enabled", "1",
                "-lm.eval.blimp.enabled", "1",
                "-lm.eval.ai2arc.enabled", "1",
                "-lm.eval.piqa.enabled", "1",
                "-lm.eval.winogrande.enabled", "1",
                "-lm.eval.hellaswag.enabled", "1"
            ]
        },
        {
            "name": "routing_small_16",
            "type": "debugpy",
            "request": "launch",
            "program": "${workspaceFolder}/main.py",
            "console": "integratedTerminal",
            "env": {
                "CUDA_VISIBLE_DEVICES": "0,1"
            },
            "args": [
                "-task", "c4_flopmatched_parthead_routing_transformer", "-test_interval", "100000",
                "-state_size", "1024", "-transformer.ff_multiplier", "4", "-transformer.n_layers", "9",
                "-sa_moe.baseline_dense_heads", "9", "-sa_moe.shared_dense_heads", "4", "-sa_moe.sparsity", "16",
                "-sa_moe.include_first", "0", "-transformer.head_projection_size", "64", "-dropout", "0.0",
                "-lr", "0.00025", "-optimizer", "adamw", "-lm.unroll", "1024", "-batch_size", "64",
                "-grad_clip", "0.25", "-amp", "1", "-save_interval", "600000", "-stop_after", "100000",
                "-lr_sched.type", "cos", "-lr_warmup", "4000", "-wd", "0.01", "-min_lr_multiplier", "0.1",
                "-compile", "1", "-name", "small_routing_isoflop",
                "-lm.eval.enabled", "1",
                "-lm.eval.lambada.enabled", "1",
                "-lm.eval.blimp.enabled", "1",
                "-lm.eval.ai2arc.enabled", "1",
                "-lm.eval.piqa.enabled", "1",
                "-lm.eval.winogrande.enabled", "1",
                "-lm.eval.hellaswag.enabled", "1"
            ]
        },
        {
            "name": "routing_small_32",
            "type": "debugpy",
            "request": "launch",
            "program": "${workspaceFolder}/main.py",
            "console": "integratedTerminal",
            "env": {
                "CUDA_VISIBLE_DEVICES": "0,1"
            },
            "args": [
                "-task", "c4_flopmatched_parthead_routing_transformer", "-test_interval", "100000",
                "-state_size", "1024", "-transformer.ff_multiplier", "4", "-transformer.n_layers", "9",
                "-sa_moe.baseline_dense_heads", "9", "-sa_moe.shared_dense_heads", "4", "-sa_moe.sparsity", "32",
                "-sa_moe.include_first", "0", "-transformer.head_projection_size", "64", "-dropout", "0.0",
                "-lr", "0.00025", "-optimizer", "adamw", "-lm.unroll", "1024", "-batch_size", "64",
                "-grad_clip", "0.25", "-amp", "1", "-save_interval", "600000", "-stop_after", "100000",
                "-lr_sched.type", "cos", "-lr_warmup", "4000", "-wd", "0.01", "-min_lr_multiplier", "0.1",
                "-compile", "1", "-name", "small_routing_isoflop",
                "-lm.eval.enabled", "1",
                "-lm.eval.lambada.enabled", "1",
                "-lm.eval.blimp.enabled", "1",
                "-lm.eval.ai2arc.enabled", "1",
                "-lm.eval.piqa.enabled", "1",
                "-lm.eval.winogrande.enabled", "1",
                "-lm.eval.hellaswag.enabled", "1"
            ]
        },
        {
            "name": "routing_small_64",
            "type": "debugpy",
            "request": "launch",
            "program": "${workspaceFolder}/main.py",
            "console": "integratedTerminal",
            "env": {
                "CUDA_VISIBLE_DEVICES": "0,1"
            },
            "args": [
                "-task", "c4_flopmatched_parthead_routing_transformer", "-test_interval", "100000",
                "-state_size", "1024", "-transformer.ff_multiplier", "4", "-transformer.n_layers", "9",
                "-sa_moe.baseline_dense_heads", "9", "-sa_moe.shared_dense_heads", "4", "-sa_moe.sparsity", "64",
                "-sa_moe.include_first", "0", "-transformer.head_projection_size", "64", "-dropout", "0.0",
                "-lr", "0.00025", "-optimizer", "adamw", "-lm.unroll", "1024", "-batch_size", "64",
                "-grad_clip", "0.25", "-amp", "1", "-save_interval", "600000", "-stop_after", "100000",
                "-lr_sched.type", "cos", "-lr_warmup", "4000", "-wd", "0.01", "-min_lr_multiplier", "0.1",
                "-compile", "1", "-name", "small_routing_isoflop",
                "-lm.eval.enabled", "1",
                "-lm.eval.lambada.enabled", "1",
                "-lm.eval.blimp.enabled", "1",
                "-lm.eval.ai2arc.enabled", "1",
                "-lm.eval.piqa.enabled", "1",
                "-lm.eval.winogrande.enabled", "1",
                "-lm.eval.hellaswag.enabled", "1"
            ]
        },

        {
            "name": "routing_mid_2",
            "type": "debugpy",
            "request": "launch",
            "program": "${workspaceFolder}/main.py",
            "console": "integratedTerminal",
            "env": {
                "CUDA_VISIBLE_DEVICES": "0,1"
            },
            "args": [
                "-task", "c4_flopmatched_parthead_routing_transformer", "-test_interval", "100000",
                "-state_size", "1024", "-transformer.ff_multiplier", "4", "-transformer.n_layers", "18",
                "-sa_moe.baseline_dense_heads", "9", "-sa_moe.shared_dense_heads", "4", "-sa_moe.sparsity", "2",
                "-sa_moe.include_first", "0", "-transformer.head_projection_size", "64", "-dropout", "0.0",
                "-lr", "0.00025", "-optimizer", "adamw", "-lm.unroll", "1024", "-batch_size", "64",
                "-grad_clip", "0.25", "-amp", "1", "-save_interval", "600000", "-stop_after", "100000",
                "-lr_sched.type", "cos", "-lr_warmup", "4000", "-wd", "0.01", "-min_lr_multiplier", "0.1",
                "-compile", "1", "-name", "mid_routing_isoflop",
                "-lm.eval.enabled", "1",
                "-lm.eval.lambada.enabled", "1",
                "-lm.eval.blimp.enabled", "1",
                "-lm.eval.ai2arc.enabled", "1",
                "-lm.eval.piqa.enabled", "1",
                "-lm.eval.winogrande.enabled", "1",
                "-lm.eval.hellaswag.enabled", "1"
            ]
        },

        {
            "name": "routing_mid_4",
            "type": "debugpy",
            "request": "launch",
            "program": "${workspaceFolder}/main.py",
            "console": "integratedTerminal",
            "env": {
                "CUDA_VISIBLE_DEVICES": "0,1"
            },
            "args": [
                "-task", "c4_flopmatched_parthead_routing_transformer", "-test_interval", "100000",
                "-state_size", "1024", "-transformer.ff_multiplier", "4", "-transformer.n_layers", "18",
                "-sa_moe.baseline_dense_heads", "9", "-sa_moe.shared_dense_heads", "4", "-sa_moe.sparsity", "4",
                "-sa_moe.include_first", "0", "-transformer.head_projection_size", "64", "-dropout", "0.0",
                "-lr", "0.00025", "-optimizer", "adamw", "-lm.unroll", "1024", "-batch_size", "64",
                "-grad_clip", "0.25", "-amp", "1", "-save_interval", "600000", "-stop_after", "100000",
                "-lr_sched.type", "cos", "-lr_warmup", "4000", "-wd", "0.01", "-min_lr_multiplier", "0.1",
                "-compile", "1", "-name", "mid_routing_isoflop",
                "-lm.eval.enabled", "1",
                "-lm.eval.lambada.enabled", "1",
                "-lm.eval.blimp.enabled", "1",
                "-lm.eval.ai2arc.enabled", "1",
                "-lm.eval.piqa.enabled", "1",
                "-lm.eval.winogrande.enabled", "1",
                "-lm.eval.hellaswag.enabled", "1"
            ]
        },

        {
            "name": "routing_mid_8",
            "type": "debugpy",
            "request": "launch",
            "program": "${workspaceFolder}/main.py",
            "console": "integratedTerminal",
            "env": {
                "CUDA_VISIBLE_DEVICES": "0,1"
            },
            "args": [
                "-task", "c4_flopmatched_parthead_routing_transformer", "-test_interval", "100000",
                "-state_size", "1024", "-transformer.ff_multiplier", "4", "-transformer.n_layers", "18",
                "-sa_moe.baseline_dense_heads", "9", "-sa_moe.shared_dense_heads", "4", "-sa_moe.sparsity", "8",
                "-sa_moe.include_first", "0", "-transformer.head_projection_size", "64", "-dropout", "0.0",
                "-lr", "0.00025", "-optimizer", "adamw", "-lm.unroll", "1024", "-batch_size", "64",
                "-grad_clip", "0.25", "-amp", "1", "-save_interval", "600000", "-stop_after", "100000",
                "-lr_sched.type", "cos", "-lr_warmup", "4000", "-wd", "0.01", "-min_lr_multiplier", "0.1",
                "-compile", "1", "-name", "mid_routing_isoflop",
                "-lm.eval.enabled", "1",
                "-lm.eval.lambada.enabled", "1",
                "-lm.eval.blimp.enabled", "1",
                "-lm.eval.ai2arc.enabled", "1",
                "-lm.eval.piqa.enabled", "1",
                "-lm.eval.winogrande.enabled", "1",
                "-lm.eval.hellaswag.enabled", "1"
            ]
        },

        {
            "name": "routing_mid_16",
            "type": "debugpy",
            "request": "launch",
            "program": "${workspaceFolder}/main.py",
            "console": "integratedTerminal",
            "env": {
                "CUDA_VISIBLE_DEVICES": "0,1"
            },
            "args": [
                "-task", "c4_flopmatched_parthead_routing_transformer", "-test_interval", "100000",
                "-state_size", "1024", "-transformer.ff_multiplier", "4", "-transformer.n_layers", "18",
                "-sa_moe.baseline_dense_heads", "9", "-sa_moe.shared_dense_heads", "4", "-sa_moe.sparsity", "16",
                "-sa_moe.include_first", "0", "-transformer.head_projection_size", "64", "-dropout", "0.0",
                "-lr", "0.00025", "-optimizer", "adamw", "-lm.unroll", "1024", "-batch_size", "64",
                "-grad_clip", "0.25", "-amp", "1", "-save_interval", "600000", "-stop_after", "100000",
                "-lr_sched.type", "cos", "-lr_warmup", "4000", "-wd", "0.01", "-min_lr_multiplier", "0.1",
                "-compile", "1", "-name", "mid_routing_isoflop",
                "-lm.eval.enabled", "1",
                "-lm.eval.lambada.enabled", "1",
                "-lm.eval.blimp.enabled", "1",
                "-lm.eval.ai2arc.enabled", "1",
                "-lm.eval.piqa.enabled", "1",
                "-lm.eval.winogrande.enabled", "1",
                "-lm.eval.hellaswag.enabled", "1"
            ]
        },

        {
            "name": "routing_mid_32",
            "type": "debugpy",
            "request": "launch",
            "program": "${workspaceFolder}/main.py",
            "console": "integratedTerminal",
            "env": {
                "CUDA_VISIBLE_DEVICES": "0,1"
            },
            "args": [
                "-task", "c4_flopmatched_parthead_routing_transformer", "-test_interval", "100000",
                "-state_size", "1024", "-transformer.ff_multiplier", "4", "-transformer.n_layers", "18",
                "-sa_moe.baseline_dense_heads", "9", "-sa_moe.shared_dense_heads", "4", "-sa_moe.sparsity", "32",
                "-sa_moe.include_first", "0", "-transformer.head_projection_size", "64", "-dropout", "0.0",
                "-lr", "0.00025", "-optimizer", "adamw", "-lm.unroll", "1024", "-batch_size", "64",
                "-grad_clip", "0.25", "-amp", "1", "-save_interval", "600000", "-stop_after", "100000",
                "-lr_sched.type", "cos", "-lr_warmup", "4000", "-wd", "0.01", "-min_lr_multiplier", "0.1",
                "-compile", "1", "-name", "mid_routing_isoflop",
                "-lm.eval.enabled", "1",
                "-lm.eval.lambada.enabled", "1",
                "-lm.eval.blimp.enabled", "1",
                "-lm.eval.ai2arc.enabled", "1",
                "-lm.eval.piqa.enabled", "1",
                "-lm.eval.winogrande.enabled", "1",
                "-lm.eval.hellaswag.enabled", "1"
            ]
        },
        {
            "name": "routing_large_2",
            "type": "debugpy",
            "request": "launch",
            "program": "${workspaceFolder}/main.py",
            "console": "integratedTerminal",
            "env": {
                "CUDA_VISIBLE_DEVICES": "0,1,2,3"
            },
            "args": [
                "-task", "c4_flopmatched_parthead_routing_transformer", "-test_interval", "100000",
                "-state_size", "1280", "-transformer.ff_multiplier", "4", "-transformer.n_layers", "27",
                "-sa_moe.baseline_dense_heads", "16", "-sa_moe.shared_dense_heads", "12", "-sa_moe.sparsity", "2",
                "-sa_moe.include_first", "0", "-transformer.head_projection_size", "64", "-dropout", "0.0",
                "-lr", "0.00025", "-optimizer", "adamw", "-lm.unroll", "1024", "-batch_size", "64",
                "-grad_clip", "0.25", "-amp", "1", "-save_interval", "50000", "-stop_after", "100000",
                "-lr_sched.type", "cos", "-lr_warmup", "4000", "-wd", "0.01", "-min_lr_multiplier", "0.1",
                "-compile", "1", "-name", "large_routing_isoflop",
                "-lm.eval.enabled", "1",
                "-lm.eval.lambada.enabled", "1",
                "-lm.eval.blimp.enabled", "1",
                "-lm.eval.ai2arc.enabled", "1",
                "-lm.eval.piqa.enabled", "1",
                "-lm.eval.winogrande.enabled", "1",
                "-lm.eval.hellaswag.enabled", "1"
            ]
        },
        {
            "name": "routing_large_4",
            "type": "debugpy",
            "request": "launch",
            "program": "${workspaceFolder}/main.py",
            "console": "integratedTerminal",
            "env": {
                "CUDA_VISIBLE_DEVICES": "0,1,2,3"
            },
            "args": [
                "-task", "c4_flopmatched_parthead_routing_transformer", "-test_interval", "100000",
                "-state_size", "1280", "-transformer.ff_multiplier", "4", "-transformer.n_layers", "27",
                "-sa_moe.baseline_dense_heads", "16", "-sa_moe.shared_dense_heads", "12", "-sa_moe.sparsity", "4",
                "-sa_moe.include_first", "0", "-transformer.head_projection_size", "64", "-dropout", "0.0",
                "-lr", "0.00025", "-optimizer", "adamw", "-lm.unroll", "1024", "-batch_size", "64",
                "-grad_clip", "0.25", "-amp", "1", "-save_interval", "50000", "-stop_after", "100000",
                "-lr_sched.type", "cos", "-lr_warmup", "4000", "-wd", "0.01", "-min_lr_multiplier", "0.1",
                "-compile", "1", "-name", "large_routing_isoflop",
                "-lm.eval.enabled", "1",
                "-lm.eval.lambada.enabled", "1",
                "-lm.eval.blimp.enabled", "1",
                "-lm.eval.ai2arc.enabled", "1",
                "-lm.eval.piqa.enabled", "1",
                "-lm.eval.winogrande.enabled", "1",
                "-lm.eval.hellaswag.enabled", "1"
            ]
        },






        {
            "name": "FSA_tiny2",
            "type": "debugpy",
            "request": "launch",
            "program": "${workspaceFolder}/main.py",
            "console": "integratedTerminal",
            "env": {
                "CUDA_VISIBLE_DEVICES": "0"
            },
            "args": [
                "-task", "c4_flopmatched_fsa_transformer", "-test_interval", "100000", "-state_size", "512",
                "-transformer.ff_multiplier", "4", "-transformer.n_layers", "6",
                "-sa_moe.baseline_dense_heads", "9", "-sa_moe.shared_dense_heads", "4", "-sa_moe.sparsity", "2",
                "-sa_moe.include_first", "0", "-transformer.head_projection_size", "64", "-dropout", "0.0",
                "-lr", "0.00025", "-optimizer", "adamw", "-lm.unroll", "1024", "-batch_size", "64",
                "-grad_clip", "0.25", "-amp", "1", "-save_interval", "200000", "-stop_after", "100000",
                "-lr_sched.type", "cos", "-lr_warmup", "4000", "-wd", "0.01", "-min_lr_multiplier", "0.1",
                "-compile", "1", "-name", "tiny_isoflop",
                "-lm.eval.enabled", "1",
                "-lm.eval.lambada.enabled", "1",
                "-lm.eval.blimp.enabled", "1",
                "-lm.eval.ai2arc.enabled", "1",
                "-lm.eval.piqa.enabled", "1",
                "-lm.eval.winogrande.enabled", "1",
                "-lm.eval.hellaswag.enabled", "1"
            ]
        },
        {
            "name": "FSA_tiny4",
            "type": "debugpy",
            "request": "launch",
            "program": "${workspaceFolder}/main.py",
            "console": "integratedTerminal",
            "env": {
                "CUDA_VISIBLE_DEVICES": "0"
            },
            "args": [
                "-task", "c4_flopmatched_fsa_transformer", "-test_interval", "100000", "-state_size", "512",
                "-transformer.ff_multiplier", "4", "-transformer.n_layers", "6",
                "-sa_moe.baseline_dense_heads", "9", "-sa_moe.shared_dense_heads", "4", "-sa_moe.sparsity", "4",
                "-sa_moe.include_first", "0", "-transformer.head_projection_size", "64", "-dropout", "0.0",
                "-lr", "0.00025", "-optimizer", "adamw", "-lm.unroll", "1024", "-batch_size", "64",
                "-grad_clip", "0.25", "-amp", "1", "-save_interval", "200000", "-stop_after", "100000",
                "-lr_sched.type", "cos", "-lr_warmup", "4000", "-wd", "0.01", "-min_lr_multiplier", "0.1",
                "-compile", "1", "-name", "tiny_isoflop",
                "-lm.eval.enabled", "1",
                "-lm.eval.lambada.enabled", "1",
                "-lm.eval.blimp.enabled", "1",
                "-lm.eval.ai2arc.enabled", "1",
                "-lm.eval.piqa.enabled", "1",
                "-lm.eval.winogrande.enabled", "1",
                "-lm.eval.hellaswag.enabled", "1"
            ]
        },
        {
            "name": "FSA_tiny8",
            "type": "debugpy",
            "request": "launch",
            "program": "${workspaceFolder}/main.py",
            "console": "integratedTerminal",
            "env": {
                "CUDA_VISIBLE_DEVICES": "0"
            },
            "args": [
                "-task", "c4_flopmatched_fsa_transformer", "-test_interval", "100000", "-state_size", "512",
                "-transformer.ff_multiplier", "4", "-transformer.n_layers", "6",
                "-sa_moe.baseline_dense_heads", "9", "-sa_moe.shared_dense_heads", "4", "-sa_moe.sparsity", "8",
                "-sa_moe.include_first", "0", "-transformer.head_projection_size", "64", "-dropout", "0.0",
                "-lr", "0.00025", "-optimizer", "adamw", "-lm.unroll", "1024", "-batch_size", "64",
                "-grad_clip", "0.25", "-amp", "1", "-save_interval", "200000", "-stop_after", "100000",
                "-lr_sched.type", "cos", "-lr_warmup", "4000", "-wd", "0.01", "-min_lr_multiplier", "0.1",
                "-compile", "1", "-name", "tiny_isoflop",
                "-lm.eval.enabled", "1",
                "-lm.eval.lambada.enabled", "1",
                "-lm.eval.blimp.enabled", "1",
                "-lm.eval.ai2arc.enabled", "1",
                "-lm.eval.piqa.enabled", "1",
                "-lm.eval.winogrande.enabled", "1",
                "-lm.eval.hellaswag.enabled", "1"
            ]
        },
        {
            "name": "FSA_tiny16",
            "type": "debugpy",
            "request": "launch",
            "program": "${workspaceFolder}/main.py",
            "console": "integratedTerminal",
            "env": {
                "CUDA_VISIBLE_DEVICES": "0"
            },
            "args": [
                "-task", "c4_flopmatched_fsa_transformer", "-test_interval", "100000", "-state_size", "512",
                "-transformer.ff_multiplier", "4", "-transformer.n_layers", "6",
                "-sa_moe.baseline_dense_heads", "9", "-sa_moe.shared_dense_heads", "4", "-sa_moe.sparsity", "16",
                "-sa_moe.include_first", "0", "-transformer.head_projection_size", "64", "-dropout", "0.0",
                "-lr", "0.00025", "-optimizer", "adamw", "-lm.unroll", "1024", "-batch_size", "64",
                "-grad_clip", "0.25", "-amp", "1", "-save_interval", "200000", "-stop_after", "100000",
                "-lr_sched.type", "cos", "-lr_warmup", "4000", "-wd", "0.01", "-min_lr_multiplier", "0.1",
                "-compile", "1", "-name", "tiny_isoflop",
                "-lm.eval.enabled", "1",
                "-lm.eval.lambada.enabled", "1",
                "-lm.eval.blimp.enabled", "1",
                "-lm.eval.ai2arc.enabled", "1",
                "-lm.eval.piqa.enabled", "1",
                "-lm.eval.winogrande.enabled", "1",
                "-lm.eval.hellaswag.enabled", "1"
            ]
        },
        {
            "name": "FSA_tiny32",
            "type": "debugpy",
            "request": "launch",
            "program": "${workspaceFolder}/main.py",
            "console": "integratedTerminal",
            "env": {
                "CUDA_VISIBLE_DEVICES": "0"
            },
            "args": [
                "-task", "c4_flopmatched_fsa_transformer", "-test_interval", "100000", "-state_size", "512",
                "-transformer.ff_multiplier", "4", "-transformer.n_layers", "6",
                "-sa_moe.baseline_dense_heads", "9", "-sa_moe.shared_dense_heads", "4", "-sa_moe.sparsity", "32",
                "-sa_moe.include_first", "0", "-transformer.head_projection_size", "64", "-dropout", "0.0",
                "-lr", "0.00025", "-optimizer", "adamw", "-lm.unroll", "1024", "-batch_size", "64",
                "-grad_clip", "0.25", "-amp", "1", "-save_interval", "200000", "-stop_after", "100000",
                "-lr_sched.type", "cos", "-lr_warmup", "4000", "-wd", "0.01", "-min_lr_multiplier", "0.1",
                "-compile", "1", "-name", "tiny_isoflop",
                "-lm.eval.enabled", "1",
                "-lm.eval.lambada.enabled", "1",
                "-lm.eval.blimp.enabled", "1",
                "-lm.eval.ai2arc.enabled", "1",
                "-lm.eval.piqa.enabled", "1",
                "-lm.eval.winogrande.enabled", "1",
                "-lm.eval.hellaswag.enabled", "1"
            ]
        },
        {
            "name": "FSA_tiny64",
            "type": "debugpy",
            "request": "launch",
            "program": "${workspaceFolder}/main.py",
            "console": "integratedTerminal",
            "env": {
                "CUDA_VISIBLE_DEVICES": "0"
            },
            "args": [
                "-task", "c4_flopmatched_fsa_transformer", "-test_interval", "100000", "-state_size", "512",
                "-transformer.ff_multiplier", "4", "-transformer.n_layers", "6",
                "-sa_moe.baseline_dense_heads", "9", "-sa_moe.shared_dense_heads", "4", "-sa_moe.sparsity", "64",
                "-sa_moe.include_first", "0", "-transformer.head_projection_size", "64", "-dropout", "0.0",
                "-lr", "0.00025", "-optimizer", "adamw", "-lm.unroll", "1024", "-batch_size", "64",
                "-grad_clip", "0.25", "-amp", "1", "-save_interval", "200000", "-stop_after", "100000",
                "-lr_sched.type", "cos", "-lr_warmup", "4000", "-wd", "0.01", "-min_lr_multiplier", "0.1",
                "-compile", "1", "-name", "tiny_isoflop",
                "-lm.eval.enabled", "1",
                "-lm.eval.lambada.enabled", "1",
                "-lm.eval.blimp.enabled", "1",
                "-lm.eval.ai2arc.enabled", "1",
                "-lm.eval.piqa.enabled", "1",
                "-lm.eval.winogrande.enabled", "1",
                "-lm.eval.hellaswag.enabled", "1"
            ]
        },
        {
            "name": "FSA_tiny128",
            "type": "debugpy",
            "request": "launch",
            "program": "${workspaceFolder}/main.py",
            "console": "integratedTerminal",
            "env": {
                "CUDA_VISIBLE_DEVICES": "0,1"
            },
            "args": [
                "-task", "c4_flopmatched_fsa_transformer", "-test_interval", "100000", "-state_size", "512",
                "-transformer.ff_multiplier", "4", "-transformer.n_layers", "6",
                "-sa_moe.baseline_dense_heads", "9", "-sa_moe.shared_dense_heads", "4", "-sa_moe.sparsity", "128",
                "-sa_moe.include_first", "0", "-transformer.head_projection_size", "64", "-dropout", "0.0",
                "-lr", "0.00025", "-optimizer", "adamw", "-lm.unroll", "1024", "-batch_size", "64",
                "-grad_clip", "0.25", "-amp", "1", "-save_interval", "200000", "-stop_after", "100000",
                "-lr_sched.type", "cos", "-lr_warmup", "4000", "-wd", "0.01", "-min_lr_multiplier", "0.1",
                "-compile", "1", "-name", "tiny_isoflop",
                "-lm.eval.enabled", "1",
                "-lm.eval.lambada.enabled", "1",
                "-lm.eval.blimp.enabled", "1",
                "-lm.eval.ai2arc.enabled", "1",
                "-lm.eval.piqa.enabled", "1",
                "-lm.eval.winogrande.enabled", "1",
                "-lm.eval.hellaswag.enabled", "1"
            ]
        },




        {
            "name": "FSA_small2",
            "type": "debugpy",
            "request": "launch",
            "program": "${workspaceFolder}/main.py",
            "console": "integratedTerminal",
            "env": {
                "CUDA_VISIBLE_DEVICES": "0"
            },
            "args": [
                "-task", "c4_flopmatched_fsa_transformer", "-test_interval", "100000", "-state_size", "1024",
                "-transformer.ff_multiplier", "4", "-transformer.n_layers", "9",
                "-sa_moe.baseline_dense_heads", "9", "-sa_moe.shared_dense_heads", "4", "-sa_moe.sparsity", "2",
                "-sa_moe.include_first", "0", "-transformer.head_projection_size", "64", "-dropout", "0.0",
                "-lr", "0.00025", "-optimizer", "adamw", "-lm.unroll", "1024", "-batch_size", "64",
                "-grad_clip", "0.25", "-amp", "1", "-save_interval", "600000", "-stop_after", "100000",
                "-lr_sched.type", "cos", "-lr_warmup", "4000", "-wd", "0.01", "-min_lr_multiplier", "0.1",
                "-compile", "1", "-name", "small_isoflop",
                "-lm.eval.enabled", "1",
                "-lm.eval.lambada.enabled", "1",
                "-lm.eval.blimp.enabled", "1",
                "-lm.eval.ai2arc.enabled", "1",
                "-lm.eval.piqa.enabled", "1",
                "-lm.eval.winogrande.enabled", "1",
                "-lm.eval.hellaswag.enabled", "1"
            ]
        },
        {
            "name": "FSA_small4",
            "type": "debugpy",
            "request": "launch",
            "program": "${workspaceFolder}/main.py",
            "console": "integratedTerminal",
            "env": {
                "CUDA_VISIBLE_DEVICES": "0"
            },
            "args": [
                "-task", "c4_flopmatched_fsa_transformer", "-test_interval", "100000", "-state_size", "1024",
                "-transformer.ff_multiplier", "4", "-transformer.n_layers", "9",
                "-sa_moe.baseline_dense_heads", "9", "-sa_moe.shared_dense_heads", "4", "-sa_moe.sparsity", "4",
                "-sa_moe.include_first", "0", "-transformer.head_projection_size", "64", "-dropout", "0.0",
                "-lr", "0.00025", "-optimizer", "adamw", "-lm.unroll", "1024", "-batch_size", "64",
                "-grad_clip", "0.25", "-amp", "1", "-save_interval", "600000", "-stop_after", "100000",
                "-lr_sched.type", "cos", "-lr_warmup", "4000", "-wd", "0.01", "-min_lr_multiplier", "0.1",
                "-compile", "1", "-name", "small_isoflop",
                "-lm.eval.enabled", "1",
                "-lm.eval.lambada.enabled", "1",
                "-lm.eval.blimp.enabled", "1",
                "-lm.eval.ai2arc.enabled", "1",
                "-lm.eval.piqa.enabled", "1",
                "-lm.eval.winogrande.enabled", "1",
                "-lm.eval.hellaswag.enabled", "1"
            ]
        },
        {
            "name": "FSA_small8",
            "type": "debugpy",
            "request": "launch",
            "program": "${workspaceFolder}/main.py",
            "console": "integratedTerminal",
            "env": {
                "CUDA_VISIBLE_DEVICES": "0"
            },
            "args": [
                "-task", "c4_flopmatched_fsa_transformer", "-test_interval", "100000", "-state_size", "1024",
                "-transformer.ff_multiplier", "4", "-transformer.n_layers", "9",
                "-sa_moe.baseline_dense_heads", "9", "-sa_moe.shared_dense_heads", "4", "-sa_moe.sparsity", "8",
                "-sa_moe.include_first", "0", "-transformer.head_projection_size", "64", "-dropout", "0.0",
                "-lr", "0.00025", "-optimizer", "adamw", "-lm.unroll", "1024", "-batch_size", "64",
                "-grad_clip", "0.25", "-amp", "1", "-save_interval", "600000", "-stop_after", "100000",
                "-lr_sched.type", "cos", "-lr_warmup", "4000", "-wd", "0.01", "-min_lr_multiplier", "0.1",
                "-compile", "1", "-name", "small_isoflop",
                "-lm.eval.enabled", "1",
                "-lm.eval.lambada.enabled", "1",
                "-lm.eval.blimp.enabled", "1",
                "-lm.eval.ai2arc.enabled", "1",
                "-lm.eval.piqa.enabled", "1",
                "-lm.eval.winogrande.enabled", "1",
                "-lm.eval.hellaswag.enabled", "1"
            ]
        },
        {
            "name": "FSA_small16",
            "type": "debugpy",
            "request": "launch",
            "program": "${workspaceFolder}/main.py",
            "console": "integratedTerminal",
            "env": {
                "CUDA_VISIBLE_DEVICES": "0"
            },
            "args": [
                "-task", "c4_flopmatched_fsa_transformer", "-test_interval", "100000", "-state_size", "1024",
                "-transformer.ff_multiplier", "4", "-transformer.n_layers", "9",
                "-sa_moe.baseline_dense_heads", "9", "-sa_moe.shared_dense_heads", "4", "-sa_moe.sparsity", "16",
                "-sa_moe.include_first", "0", "-transformer.head_projection_size", "64", "-dropout", "0.0",
                "-lr", "0.00025", "-optimizer", "adamw", "-lm.unroll", "1024", "-batch_size", "64",
                "-grad_clip", "0.25", "-amp", "1", "-save_interval", "600000", "-stop_after", "100000",
                "-lr_sched.type", "cos", "-lr_warmup", "4000", "-wd", "0.01", "-min_lr_multiplier", "0.1",
                "-compile", "1", "-name", "small_isoflop",
                "-lm.eval.enabled", "1",
                "-lm.eval.lambada.enabled", "1",
                "-lm.eval.blimp.enabled", "1",
                "-lm.eval.ai2arc.enabled", "1",
                "-lm.eval.piqa.enabled", "1",
                "-lm.eval.winogrande.enabled", "1",
                "-lm.eval.hellaswag.enabled", "1"
            ]
        },
        {
            "name": "FSA_small32",
            "type": "debugpy",
            "request": "launch",
            "program": "${workspaceFolder}/main.py",
            "console": "integratedTerminal",
            "env": {
                "CUDA_VISIBLE_DEVICES": "0,1"
            },
            "args": [
                "-task", "c4_flopmatched_fsa_transformer", "-test_interval", "100000", "-state_size", "1024",
                "-transformer.ff_multiplier", "4", "-transformer.n_layers", "9",
                "-sa_moe.baseline_dense_heads", "9", "-sa_moe.shared_dense_heads", "4", "-sa_moe.sparsity", "32",
                "-sa_moe.include_first", "0", "-transformer.head_projection_size", "64", "-dropout", "0.0",
                "-lr", "0.00025", "-optimizer", "adamw", "-lm.unroll", "1024", "-batch_size", "64",
                "-grad_clip", "0.25", "-amp", "1", "-save_interval", "600000", "-stop_after", "100000",
                "-lr_sched.type", "cos", "-lr_warmup", "4000", "-wd", "0.01", "-min_lr_multiplier", "0.1",
                "-compile", "1", "-name", "small_isoflop",
                "-lm.eval.enabled", "1",
                "-lm.eval.lambada.enabled", "1",
                "-lm.eval.blimp.enabled", "1",
                "-lm.eval.ai2arc.enabled", "1",
                "-lm.eval.piqa.enabled", "1",
                "-lm.eval.winogrande.enabled", "1",
                "-lm.eval.hellaswag.enabled", "1"
            ]
        },
        {
            "name": "FSA_small64",
            "type": "debugpy",
            "request": "launch",
            "program": "${workspaceFolder}/main.py",
            "console": "integratedTerminal",
            "env": {
                "CUDA_VISIBLE_DEVICES": "0,1"
            },
            "args": [
                "-task", "c4_flopmatched_fsa_transformer", "-test_interval", "100000", "-state_size", "1024",
                "-transformer.ff_multiplier", "4", "-transformer.n_layers", "9",
                "-sa_moe.baseline_dense_heads", "9", "-sa_moe.shared_dense_heads", "4", "-sa_moe.sparsity", "64",
                "-sa_moe.include_first", "0", "-transformer.head_projection_size", "64", "-dropout", "0.0",
                "-lr", "0.00025", "-optimizer", "adamw", "-lm.unroll", "1024", "-batch_size", "64",
                "-grad_clip", "0.25", "-amp", "1", "-save_interval", "600000", "-stop_after", "100000",
                "-lr_sched.type", "cos", "-lr_warmup", "4000", "-wd", "0.01", "-min_lr_multiplier", "0.1",
                "-compile", "1", "-name", "small_isoflop",
                "-lm.eval.enabled", "1",
                "-lm.eval.lambada.enabled", "1",
                "-lm.eval.blimp.enabled", "1",
                "-lm.eval.ai2arc.enabled", "1",
                "-lm.eval.piqa.enabled", "1",
                "-lm.eval.winogrande.enabled", "1",
                "-lm.eval.hellaswag.enabled", "1"
            ]
        },






        {
            "name": "FSA_mid2",
            "type": "debugpy",
            "request": "launch",
            "program": "${workspaceFolder}/main.py",
            "console": "integratedTerminal",
            "env": {
                "CUDA_VISIBLE_DEVICES": "0,1"
            },
            "args": [
                "-task", "c4_flopmatched_fsa_transformer", "-test_interval", "100000", "-state_size", "1024",
                "-transformer.ff_multiplier", "4", "-transformer.n_layers", "18",
                "-sa_moe.baseline_dense_heads", "9", "-sa_moe.shared_dense_heads", "4", "-sa_moe.sparsity", "2",
                "-sa_moe.include_first", "0", "-transformer.head_projection_size", "64", "-dropout", "0.0",
                "-lr", "0.00025", "-optimizer", "adamw", "-lm.unroll", "1024", "-batch_size", "64",
                "-grad_clip", "0.25", "-amp", "1", "-save_interval", "600000", "-stop_after", "100000",
                "-lr_sched.type", "cos", "-lr_warmup", "4000", "-wd", "0.01", "-min_lr_multiplier", "0.1",
                "-compile", "1", "-name", "mid_isoflop",
                "-lm.eval.enabled", "1",
                "-lm.eval.lambada.enabled", "1",
                "-lm.eval.blimp.enabled", "1",
                "-lm.eval.ai2arc.enabled", "1",
                "-lm.eval.piqa.enabled", "1",
                "-lm.eval.winogrande.enabled", "1",
                "-lm.eval.hellaswag.enabled", "1"
            ]
        },
        {
            "name": "FSA_mid4",
            "type": "debugpy",
            "request": "launch",
            "program": "${workspaceFolder}/main.py",
            "console": "integratedTerminal",
            "env": {
                "CUDA_VISIBLE_DEVICES": "0,1"
            },
            "args": [
                "-task", "c4_flopmatched_fsa_transformer", "-test_interval", "100000", "-state_size", "1024",
                "-transformer.ff_multiplier", "4", "-transformer.n_layers", "18",
                "-sa_moe.baseline_dense_heads", "9", "-sa_moe.shared_dense_heads", "4", "-sa_moe.sparsity", "4",
                "-sa_moe.include_first", "0", "-transformer.head_projection_size", "64", "-dropout", "0.0",
                "-lr", "0.00025", "-optimizer", "adamw", "-lm.unroll", "1024", "-batch_size", "64",
                "-grad_clip", "0.25", "-amp", "1", "-save_interval", "600000", "-stop_after", "100000",
                "-lr_sched.type", "cos", "-lr_warmup", "4000", "-wd", "0.01", "-min_lr_multiplier", "0.1",
                "-compile", "1", "-name", "mid_isoflop",
                "-lm.eval.enabled", "1",
                "-lm.eval.lambada.enabled", "1",
                "-lm.eval.blimp.enabled", "1",
                "-lm.eval.ai2arc.enabled", "1",
                "-lm.eval.piqa.enabled", "1",
                "-lm.eval.winogrande.enabled", "1",
                "-lm.eval.hellaswag.enabled", "1"
            ]
        },
        {
            "name": "FSA_mid8",
            "type": "debugpy",
            "request": "launch",
            "program": "${workspaceFolder}/main.py",
            "console": "integratedTerminal",
            "env": {
                "CUDA_VISIBLE_DEVICES": "0,1"
            },
            "args": [
                "-task", "c4_flopmatched_fsa_transformer", "-test_interval", "100000", "-state_size", "1024",
                "-transformer.ff_multiplier", "4", "-transformer.n_layers", "18",
                "-sa_moe.baseline_dense_heads", "9", "-sa_moe.shared_dense_heads", "4", "-sa_moe.sparsity", "8",
                "-sa_moe.include_first", "0", "-transformer.head_projection_size", "64", "-dropout", "0.0",
                "-lr", "0.00025", "-optimizer", "adamw", "-lm.unroll", "1024", "-batch_size", "64",
                "-grad_clip", "0.25", "-amp", "1", "-save_interval", "600000", "-stop_after", "100000",
                "-lr_sched.type", "cos", "-lr_warmup", "4000", "-wd", "0.01", "-min_lr_multiplier", "0.1",
                "-compile", "1", "-name", "mid_isoflop",
                "-lm.eval.enabled", "1",
                "-lm.eval.lambada.enabled", "1",
                "-lm.eval.blimp.enabled", "1",
                "-lm.eval.ai2arc.enabled", "1",
                "-lm.eval.piqa.enabled", "1",
                "-lm.eval.winogrande.enabled", "1",
                "-lm.eval.hellaswag.enabled", "1"
            ]
        },
        {
            "name": "FSA_mid16",
            "type": "debugpy",
            "request": "launch",
            "program": "${workspaceFolder}/main.py",
            "console": "integratedTerminal",
            "env": {
                "CUDA_VISIBLE_DEVICES": "0,1"
            },
            "args": [
                "-task", "c4_flopmatched_fsa_transformer", "-test_interval", "100000", "-state_size", "1024",
                "-transformer.ff_multiplier", "4", "-transformer.n_layers", "18",
                "-sa_moe.baseline_dense_heads", "9", "-sa_moe.shared_dense_heads", "4", "-sa_moe.sparsity", "16",
                "-sa_moe.include_first", "0", "-transformer.head_projection_size", "64", "-dropout", "0.0",
                "-lr", "0.00025", "-optimizer", "adamw", "-lm.unroll", "1024", "-batch_size", "64",
                "-grad_clip", "0.25", "-amp", "1", "-save_interval", "600000", "-stop_after", "100000",
                "-lr_sched.type", "cos", "-lr_warmup", "4000", "-wd", "0.01", "-min_lr_multiplier", "0.1",
                "-compile", "1", "-name", "mid_isoflop",
                "-lm.eval.enabled", "1",
                "-lm.eval.lambada.enabled", "1",
                "-lm.eval.blimp.enabled", "1",
                "-lm.eval.ai2arc.enabled", "1",
                "-lm.eval.piqa.enabled", "1",
                "-lm.eval.winogrande.enabled", "1",
                "-lm.eval.hellaswag.enabled", "1"
            ]
        },
        {
            "name": "FSA_mid32",
            "type": "debugpy",
            "request": "launch",
            "program": "${workspaceFolder}/main.py",
            "console": "integratedTerminal",
            "env": {
                "CUDA_VISIBLE_DEVICES": "0,1"
            },
            "args": [
                "-task", "c4_flopmatched_fsa_transformer", "-test_interval", "100000", "-state_size", "1024",
                "-transformer.ff_multiplier", "4", "-transformer.n_layers", "18",
                "-sa_moe.baseline_dense_heads", "9", "-sa_moe.shared_dense_heads", "4", "-sa_moe.sparsity", "32",
                "-sa_moe.include_first", "0", "-transformer.head_projection_size", "64", "-dropout", "0.0",
                "-lr", "0.00025", "-optimizer", "adamw", "-lm.unroll", "1024", "-batch_size", "64",
                "-grad_clip", "0.25", "-amp", "1", "-save_interval", "600000", "-stop_after", "100000",
                "-lr_sched.type", "cos", "-lr_warmup", "4000", "-wd", "0.01", "-min_lr_multiplier", "0.1",
                "-compile", "1", "-name", "mid_isoflop",
                "-lm.eval.enabled", "1",
                "-lm.eval.lambada.enabled", "1",
                "-lm.eval.blimp.enabled", "1",
                "-lm.eval.ai2arc.enabled", "1",
                "-lm.eval.piqa.enabled", "1",
                "-lm.eval.winogrande.enabled", "1",
                "-lm.eval.hellaswag.enabled", "1"
            ]
        },









        {
            "name": "FSA_large1",
            "type": "debugpy",
            "request": "launch",
            "program": "${workspaceFolder}/main.py",
            "console": "integratedTerminal",
            "env": {
                "CUDA_VISIBLE_DEVICES": "0,1"
            },
            "args": [
                "-task", "c4_flopmatched_parthead_symmetric_rope_sa_transformer", "-test_interval", "100000",
                "-state_size", "1280", "-transformer.ff_multiplier", "4", "-transformer.n_layers", "27",
                "-sa_moe.baseline_dense_heads", "16", "-sa_moe.shared_dense_heads", "12", "-sa_moe.sparsity", "1",
                "-sa_moe.include_first", "0", "-sa_moe.noise_std", "0.0",
                "-transformer.head_projection_size", "64", "-dropout", "0.0", "-lr", "0.00025",
                "-optimizer", "adamw", "-lm.unroll", "1024", "-batch_size", "64", "-grad_clip", "0.25",
                "-amp", "1", "-save_interval", "50000", "-stop_after", "100000", "-lr_sched.type", "cos",
                "-lr_warmup", "4000", "-wd", "0.01", "-min_lr_multiplier", "0.1", "-compile", "1",
                "-name", "large_isoflop",
                "-lm.eval.enabled", "1",
                "-lm.eval.lambada.enabled", "1",
                "-lm.eval.blimp.enabled", "1",
                "-lm.eval.ai2arc.enabled", "1",
                "-lm.eval.piqa.enabled", "1",
                "-lm.eval.winogrande.enabled", "1",
                "-lm.eval.hellaswag.enabled", "1"
            ]
        },
        {
            "name": "FSA_large2",
            "type": "debugpy",
            "request": "launch",
            "program": "${workspaceFolder}/main.py",
            "console": "integratedTerminal",
            "env": {
                "CUDA_VISIBLE_DEVICES": "0,1,2,3"
            },
            "args": [
                "-task", "c4_flopmatched_parthead_symmetric_rope_sa_transformer", "-test_interval", "100000",
                "-state_size", "1280", "-transformer.ff_multiplier", "4", "-transformer.n_layers", "27",
                "-sa_moe.baseline_dense_heads", "16", "-sa_moe.shared_dense_heads", "12", "-sa_moe.sparsity", "2",
                "-sa_moe.include_first", "0", "-sa_moe.noise_std", "0.0",
                "-transformer.head_projection_size", "64", "-dropout", "0.0", "-lr", "0.00025",
                "-optimizer", "adamw", "-lm.unroll", "1024", "-batch_size", "64", "-grad_clip", "0.25",
                "-amp", "1", "-save_interval", "50000", "-stop_after", "100000", "-lr_sched.type", "cos",
                "-lr_warmup", "4000", "-wd", "0.01", "-min_lr_multiplier", "0.1", "-compile", "1",
                "-name", "large_isoflop",
                "-lm.eval.enabled", "1",
                "-lm.eval.lambada.enabled", "1",
                "-lm.eval.blimp.enabled", "1",
                "-lm.eval.ai2arc.enabled", "1",
                "-lm.eval.piqa.enabled", "1",
                "-lm.eval.winogrande.enabled", "1",
                "-lm.eval.hellaswag.enabled", "1"
            ]
        },
        {
            "name": "FSA_large4",
            "type": "debugpy",
            "request": "launch",
            "program": "${workspaceFolder}/main.py",
            "console": "integratedTerminal",
            "env": {
                "CUDA_VISIBLE_DEVICES": "0,1,2,3"
            },
            "args": [
                "-task", "c4_flopmatched_parthead_symmetric_rope_sa_transformer", "-test_interval", "100000",
                "-state_size", "1280", "-transformer.ff_multiplier", "4", "-transformer.n_layers", "27",
                "-sa_moe.baseline_dense_heads", "16", "-sa_moe.shared_dense_heads", "12", "-sa_moe.sparsity", "4",
                "-sa_moe.include_first", "0", "-sa_moe.noise_std", "0.0",
                "-transformer.head_projection_size", "64", "-dropout", "0.0", "-lr", "0.00025",
                "-optimizer", "adamw", "-lm.unroll", "1024", "-batch_size", "64", "-grad_clip", "0.25",
                "-amp", "1", "-save_interval", "50000", "-stop_after", "100000", "-lr_sched.type", "cos",
                "-lr_warmup", "4000", "-wd", "0.01", "-min_lr_multiplier", "0.1", "-compile", "1",
                "-name", "large_isoflop"
            ]
        },





        {
            "name": "MoSA_sw1024",
            "type": "debugpy",
            "request": "launch",
            "program": "${workspaceFolder}/main.py",
            "console": "integratedTerminal",
            "env": {
                "CUDA_VISIBLE_DEVICES": "0,1,2,3"
            },
            "args": [
                "-task", "c4_sliding_window_and_sparse_transformer", "-test_interval", "100000",
                "-state_size", "1024", "-transformer.ff_multiplier", "4", "-transformer.n_layers", "6",
                "-sa_moe.sw_heads", "4", "-sa_moe.expert_heads", "70", "-sa_moe.sparsity", "16",
                "-sa_moe.include_first", "0", "-sa_moe.sparsity_type", "EC",
                "-transformer.head_projection_size", "64", "-dropout", "0.0", "-lr", "0.00025",
                "-optimizer", "adamw", "-lm.unroll", "1024", "-batch_size", "64", "-grad_clip", "0.25",
                "-amp", "1", "-save_interval", "600000", "-stop_after", "100000", "-lr_sched.type", "cos",
                "-lr_warmup", "4000", "-wd", "0.01", "-min_lr_multiplier", "0.1", "-compile", "1",
                "-name", "small_mosa_sw_1024"
            ]
        },
        {
            "name": "MoSA_sw2048",
            "type": "debugpy",
            "request": "launch",
            "program": "${workspaceFolder}/main.py",
            "console": "integratedTerminal",
            "env": {
                "CUDA_VISIBLE_DEVICES": "0,1,2,3"
            },
            "args": [
                "-task", "c4_sliding_window_and_sparse_transformer", "-test_interval", "100000",
                "-state_size", "1024", "-transformer.ff_multiplier", "4", "-transformer.n_layers", "6",
                "-sa_moe.sw_heads", "4", "-sa_moe.expert_heads", "70", "-sa_moe.sparsity", "32",
                "-sa_moe.include_first", "0", "-sa_moe.sparsity_type", "EC",
                "-transformer.head_projection_size", "64", "-dropout", "0.0", "-lr", "0.00025",
                "-optimizer", "adamw", "-lm.unroll", "2048", "-batch_size", "64", "-grad_clip", "0.25",
                "-amp", "1", "-save_interval", "600000", "-stop_after", "100000", "-lr_sched.type", "cos",
                "-lr_warmup", "4000", "-wd", "0.01", "-min_lr_multiplier", "0.1", "-compile", "1",
                "-name", "small_mosa_sw_2048"
            ]
        },
        {
            "name": "MoSA_sw4096",
            "type": "debugpy",
            "request": "launch",
            "program": "${workspaceFolder}/main.py",
            "console": "integratedTerminal",
            "env": {
                "CUDA_VISIBLE_DEVICES": "0,1,2,3"
            },
            "args": [
                "-task", "c4_sliding_window_and_sparse_transformer", "-test_interval", "100000",
                "-state_size", "1024", "-transformer.ff_multiplier", "4", "-transformer.n_layers", "6",
                "-sa_moe.sw_heads", "4", "-sa_moe.expert_heads", "70", "-sa_moe.sparsity", "64",
                "-sa_moe.include_first", "0", "-sa_moe.sparsity_type", "EC",
                "-transformer.head_projection_size", "64", "-dropout", "0.0", "-lr", "0.00025",
                "-optimizer", "adamw", "-lm.unroll", "4096", "-batch_size", "64", "-grad_clip", "0.25",
                "-amp", "1", "-save_interval", "600000", "-stop_after", "100000", "-lr_sched.type", "cos",
                "-lr_warmup", "4000", "-wd", "0.01", "-min_lr_multiplier", "0.1", "-compile", "1",
                "-name", "small_mosa_sw_4096",
            ]
        },
        {
            "name": "MoSA_sw8192",
            "type": "debugpy",
            "request": "launch",
            "program": "${workspaceFolder}/main.py",
            "console": "integratedTerminal",
            "env": {
                "CUDA_VISIBLE_DEVICES": "0,1,2,3"
            },
            "args": [
                "-task", "c4_sliding_window_and_sparse_transformer", "-test_interval", "100000",
                "-state_size", "1024", "-transformer.ff_multiplier", "4", "-transformer.n_layers", "6",
                "-sa_moe.sw_heads", "4", "-sa_moe.expert_heads", "70", "-sa_moe.sparsity", "128",
                "-sa_moe.include_first", "0", "-sa_moe.sparsity_type", "EC",
                "-transformer.head_projection_size", "64", "-dropout", "0.0", "-lr", "0.00025",
                "-optimizer", "adamw", "-lm.unroll", "8192", "-batch_size", "64", "-grad_clip", "0.25",
                "-amp", "1", "-save_interval", "600000", "-stop_after", "100000", "-lr_sched.type", "cos",
                "-lr_warmup", "4000", "-wd", "0.01", "-min_lr_multiplier", "0.1", "-compile", "1",
                "-name", "small_mosa_sw_8192",
            ]
        },











        {
            "name": "fixed_sw1024",
            "type": "debugpy",
            "request": "launch",
            "program": "${workspaceFolder}/main.py",
            "console": "integratedTerminal",
            "env": {
                "CUDA_VISIBLE_DEVICES": "0,1,2,3"
            },
            "args": [
                "-task", "c4_sliding_window_and_sparse_transformer", "-test_interval", "100000",
                "-state_size", "1024", "-transformer.ff_multiplier", "4", "-transformer.n_layers", "6",
                "-sa_moe.sw_heads", "4", "-sa_moe.expert_heads", "70", "-sa_moe.sparsity", "16",
                "-sa_moe.include_first", "0", "-sa_moe.sparsity_type", "fixed",
                "-transformer.head_projection_size", "64", "-dropout", "0.0", "-lr", "0.00025",
                "-optimizer", "adamw", "-lm.unroll", "1024", "-batch_size", "64", "-grad_clip", "0.25",
                "-amp", "1", "-save_interval", "600000", "-stop_after", "100000", "-lr_sched.type", "cos",
                "-lr_warmup", "4000", "-wd", "0.01", "-min_lr_multiplier", "0.1", "-compile", "1",
                "-name", "small_mosa_sw_1024",
            ]
        },
        {
            "name": "fixed_sw2048",
            "type": "debugpy",
            "request": "launch",
            "program": "${workspaceFolder}/main.py",
            "console": "integratedTerminal",
            "env": {
                "CUDA_VISIBLE_DEVICES": "0,1,2,3"
            },
            "args": [
                "-task", "c4_sliding_window_and_sparse_transformer", "-test_interval", "100000",
                "-state_size", "1024", "-transformer.ff_multiplier", "4", "-transformer.n_layers", "6",
                "-sa_moe.sw_heads", "4", "-sa_moe.expert_heads", "70", "-sa_moe.sparsity", "32",
                "-sa_moe.include_first", "0", "-sa_moe.sparsity_type", "fixed",
                "-transformer.head_projection_size", "64", "-dropout", "0.0", "-lr", "0.00025",
                "-optimizer", "adamw", "-lm.unroll", "2048", "-batch_size", "64", "-grad_clip", "0.25",
                "-amp", "1", "-save_interval", "600000", "-stop_after", "100000", "-lr_sched.type", "cos",
                "-lr_warmup", "4000", "-wd", "0.01", "-min_lr_multiplier", "0.1", "-compile", "1",
                "-name", "small_mosa_sw_2048",
            ]
        },
        {
            "name": "fixed_sw4096",
            "type": "debugpy",
            "request": "launch",
            "program": "${workspaceFolder}/main.py",
            "console": "integratedTerminal",
            "env": {
                "CUDA_VISIBLE_DEVICES": "0,1,2,3"
            },
            "args": [
                "-task", "c4_sliding_window_and_sparse_transformer", "-test_interval", "100000",
                "-state_size", "1024", "-transformer.ff_multiplier", "4", "-transformer.n_layers", "6",
                "-sa_moe.sw_heads", "4", "-sa_moe.expert_heads", "70", "-sa_moe.sparsity", "64",
                "-sa_moe.include_first", "0", "-sa_moe.sparsity_type", "fixed",
                "-transformer.head_projection_size", "64", "-dropout", "0.0", "-lr", "0.00025",
                "-optimizer", "adamw", "-lm.unroll", "4096", "-batch_size", "64", "-grad_clip", "0.25",
                "-amp", "1", "-save_interval", "600000", "-stop_after", "100000", "-lr_sched.type", "cos",
                "-lr_warmup", "4000", "-wd", "0.01", "-min_lr_multiplier", "0.1", "-compile", "1",
                "-name", "small_mosa_sw_4096",
            ]
        },
        {
            "name": "fixed_sw8192",
            "type": "debugpy",
            "request": "launch",
            "program": "${workspaceFolder}/main.py",
            "console": "integratedTerminal",
            "env": {
                "CUDA_VISIBLE_DEVICES": "0,1,2,3"
            },
            "args": [
                "-task", "c4_sliding_window_and_sparse_transformer", "-test_interval", "100000",
                "-state_size", "1024", "-transformer.ff_multiplier", "4", "-transformer.n_layers", "6",
                "-sa_moe.sw_heads", "4", "-sa_moe.expert_heads", "70", "-sa_moe.sparsity", "128",
                "-sa_moe.include_first", "0", "-sa_moe.sparsity_type", "fixed",
                "-transformer.head_projection_size", "64", "-dropout", "0.0", "-lr", "0.00025",
                "-optimizer", "adamw", "-lm.unroll", "8192", "-batch_size", "64", "-grad_clip", "0.25",
                "-amp", "1", "-save_interval", "600000", "-stop_after", "100000", "-lr_sched.type", "cos",
                "-lr_warmup", "4000", "-wd", "0.01", "-min_lr_multiplier", "0.1", "-compile", "1",
                "-name", "small_mosa_sw_8192",
            ]
        },











        {
            "name": "routing_sw1024",
            "type": "debugpy",
            "request": "launch",
            "program": "${workspaceFolder}/main.py",
            "console": "integratedTerminal",
            "env": {
                "CUDA_VISIBLE_DEVICES": "0,1,2,3"
            },
            "args": [
                "-task", "c4_sliding_window_and_sparse_transformer", "-test_interval", "100000",
                "-state_size", "1024", "-transformer.ff_multiplier", "4", "-transformer.n_layers", "6",
                "-sa_moe.sw_heads", "4", "-sa_moe.expert_heads", "4", "-sa_moe.sparsity", "16",
                "-sa_moe.include_first", "0", "-sa_moe.sparsity_type", "routing",
                "-transformer.head_projection_size", "64", "-dropout", "0.0", "-lr", "0.00025",
                "-optimizer", "adamw", "-lm.unroll", "2048", "-batch_size", "64", "-grad_clip", "0.25",
                "-amp", "1", "-save_interval", "600000", "-stop_after", "100000", "-lr_sched.type", "cos",
                "-lr_warmup", "4000", "-wd", "0.01", "-min_lr_multiplier", "0.1", "-compile", "1",
                "-name", "small_routing_sw_1024",
            ]
        },
        {
            "name": "routing_sw2048",
            "type": "debugpy",
            "request": "launch",
            "program": "${workspaceFolder}/main.py",
            "console": "integratedTerminal",
            "env": {
                "CUDA_VISIBLE_DEVICES": "0,1,2,3"
            },
            "args": [
                "-task", "c4_sliding_window_and_sparse_transformer", "-test_interval", "100000",
                "-state_size", "1024", "-transformer.ff_multiplier", "4", "-transformer.n_layers", "6",
                "-sa_moe.sw_heads", "4", "-sa_moe.expert_heads", "4", "-sa_moe.sparsity", "32",
                "-sa_moe.include_first", "0", "-sa_moe.sparsity_type", "routing",
                "-transformer.head_projection_size", "64", "-dropout", "0.0", "-lr", "0.00025",
                "-optimizer", "adamw", "-lm.unroll", "2048", "-batch_size", "64", "-grad_clip", "0.25",
                "-amp", "1", "-save_interval", "600000", "-stop_after", "100000", "-lr_sched.type", "cos",
                "-lr_warmup", "4000", "-wd", "0.01", "-min_lr_multiplier", "0.1", "-compile", "1",
                "-name", "small_routing_sw_2048",
            ]
        },
        {
            "name": "routing_sw4096",
            "type": "debugpy",
            "request": "launch",
            "program": "${workspaceFolder}/main.py",
            "console": "integratedTerminal",
            "env": {
                "CUDA_VISIBLE_DEVICES": "0,1,2,3"
            },
            "args": [
                "-task", "c4_sliding_window_and_sparse_transformer", "-test_interval", "100000",
                "-state_size", "1024", "-transformer.ff_multiplier", "4", "-transformer.n_layers", "6",
                "-sa_moe.sw_heads", "4", "-sa_moe.expert_heads", "4", "-sa_moe.sparsity", "64",
                "-sa_moe.include_first", "0", "-sa_moe.sparsity_type", "routing",
                "-transformer.head_projection_size", "64", "-dropout", "0.0", "-lr", "0.00025",
                "-optimizer", "adamw", "-lm.unroll", "4096", "-batch_size", "64", "-grad_clip", "0.25",
                "-amp", "1", "-save_interval", "600000", "-stop_after", "100000", "-lr_sched.type", "cos",
                "-lr_warmup", "4000", "-wd", "0.01", "-min_lr_multiplier", "0.1", "-compile", "1",
                "-name", "small_routing_sw_4096",
            ]
        },
        {
            "name": "routing_sw8192",
            "type": "debugpy",
            "request": "launch",
            "program": "${workspaceFolder}/main.py",
            "console": "integratedTerminal",
            "env": {
                "CUDA_VISIBLE_DEVICES": "0,1,2,3"
            },
            "args": [
                "-task", "c4_sliding_window_and_sparse_transformer", "-test_interval", "100000",
                "-state_size", "1024", "-transformer.ff_multiplier", "4", "-transformer.n_layers", "6",
                "-sa_moe.sw_heads", "4", "-sa_moe.expert_heads", "4", "-sa_moe.sparsity", "128",
                "-sa_moe.include_first", "0", "-sa_moe.sparsity_type", "routing",
                "-transformer.head_projection_size", "64", "-dropout", "0.0", "-lr", "0.00025",
                "-optimizer", "adamw", "-lm.unroll", "8192", "-batch_size", "64", "-grad_clip", "0.25",
                "-amp", "1", "-save_interval", "600000", "-stop_after", "100000", "-lr_sched.type", "cos",
                "-lr_warmup", "4000", "-wd", "0.01", "-min_lr_multiplier", "0.1", "-compile", "1",
                "-name", "small_routing_sw_8192",
            ]
        }

    ]
}