
---
# Global run settings.
# presumably the RNG seed used for reproducibility — confirm in the loader
seed: 555
# NOTE(review): presumably toggles use of the `wandb` section of this file.
use_wandb: true
output_dir: './outputs'


# Model and tokenizer selection (both point at the same checkpoint name).
model_config:
  model_name: 'roberta-large'
  tokenizer_name: 'roberta-large'
  # presumably the maximum tokenized sequence length — TODO confirm
  max_length: 512
  gradient_checkpointing: true


# Optimisation, schedule and sampling settings for the training loop.
training:
  # Written with an explicit decimal point on purpose: YAML 1.1 loaders
  # (e.g. PyYAML) resolve a bare `1e-5` as the string "1e-5", not a float.
  learning_rate: 1.0e-5
  batch_size: 16
  epochs: 3
  warmup_ratio: 0.1
  weight_decay: 0.1
  max_grad_norm: 1.0
  evaluation_strategy: "epoch"
  save_strategy: "epoch"
  fp16: true

  # NOTE(review): this flag is false while compression_config.layer12 sets
  # forward-EF-method to "AQSGD" — confirm the two are meant to be independent.
  aq_sgd: false

  lazy_sampling: true
  lazy_sampling_params:
    schedule: "constant"
    # presumably a sampling probability in [0, 1] — TODO confirm
    p_t: 0.5


# Compression settings, one entry per compressed layer.
compression_config:
  layer12:
    layer_idx: 12

    # Forward-direction compressor and its error-feedback (EF) options.
    forward: 'topk'
    forward-EF: true
    forward-EF-method: 'AQSGD'
    # Alternative: forward-EF-method: 'EF'
    forward-params:
      # presumably the fraction of entries kept by top-k — TODO confirm
      topk: 0.3

    # Backward-direction compressor and its error-feedback (EF) options.
    backward: 'topk'
    backward-EF: true
    backward-EF-method: 'EF21'
    # Alternative: backward-EF-method: 'AQSGD'
    backward-params:
      # presumably the fraction of entries kept by top-k — TODO confirm
      topk: 0.3


# Per-task dataset settings: on-disk location, number of labels, and the
# metric names to report. All paths live under /data/datasets/glue.
dataset_config:
  cola:
    path: '/data/datasets/glue/cola'
    num_labels: 2
    metrics: ['accuracy', 'matthews_correlation']
  qnli:
    path: '/data/datasets/glue/qnli'
    num_labels: 2
    metrics: ['accuracy']
  qqp:
    path: '/data/datasets/glue/qqp'
    num_labels: 2
    metrics: ['accuracy', 'f1']
  mnli:
    path: '/data/datasets/glue/mnli'
    num_labels: 3
    metrics: ['accuracy']
  stsb:
    path: '/data/datasets/glue/stsb'
    # Single output; presumably treated as regression — TODO confirm.
    num_labels: 1
    metrics: ['pearson', 'spearman']
  mrpc:
    path: '/data/datasets/glue/mrpc'
    num_labels: 2
    metrics: ['accuracy', 'f1']
  rte:
    path: '/data/datasets/glue/rte'
    num_labels: 2
    metrics: ['accuracy']
  sst2:
    path: '/data/datasets/glue/sst2'
    num_labels: 2
    metrics: ['accuracy']


# Weights & Biases run identification.
# NOTE(review): presumably only consulted when the top-level `use_wandb` is true.
wandb:
  project: 'roberta-glue'
  name: 'aqsgd-compression-test'