# Sweep definition using the Weights & Biases sweep-config layout
# (program / command / method / metric / parameters).
# Script that the launch command below runs.
program: main.py
# Launch command, one argv token per list item. The ${...} entries are
# placeholders substituted by the sweep launcher: ${program} stands for the
# `program` key above and ${args} for the parameter values chosen for a run.
# NOTE(review): ${env} presumably expands to the platform's env launcher -
# confirm against the sweep tool's documentation.
command:
  - ${env}
  - python3
  - ${program}
  - ${args}
# Grid search: every combination of the listed parameter values is run.
method: grid
# Metric associated with the sweep and the direction that counts as better.
metric:
  name: validation/valid/accuracy/total
  goal: maximize
# Parameters handed to the program for each run. Every entry that uses
# `value:` is pinned to a single constant; only `sweep_id_for_grid_search`
# at the bottom lists several values.
# Dotted names such as `transformer.variant` are flat YAML keys, not nested
# mappings.
# NOTE(review): the 0/1 settings (amp, wandb_bug_workaround,
# length_bucketed_sampling, ndr.scalar_gate, trafo_classifier.norm_att) look
# like boolean switches encoded as integers - confirm against the program's
# argument parser.
parameters:
  log:
    value: wandb
  profile:
    value: listops_trafo
  listops.variant:
    value: big
  transformer.variant:
    value: ndr_residual
  test_batch_size:
    value: 16
  transformer.encoder_n_layers:
    value: 20
  # NOTE(review): 24 here vs. 20 for transformer.encoder_n_layers above -
  # presumably deliberate (different depth at test time); confirm.
  n_test_layers:
    value: 24
  lr:
    value: 0.0002
  state_size:
    value: 512
  dropout:
    value: 0.1
  transformer.attention_dropout:
    value: 0.1
  wd:
    value: 0.09
  transformer.n_heads:
    value: 16
  batch_size:
    value: 512
  optimizer:
    value: adamw
  stop_after:
    value: 100000
  # Plain `none` is the YAML string "none", not a YAML null (null is written
  # `null` or `~`).
  # NOTE(review): presumably the program interprets the literal text "none"
  # as "no limit" - confirm.
  max_length_per_batch:
    value: none
  amp:
    value: 1
  wandb_bug_workaround:
    value: 1
  geometric.pos_encoding:
    value: direction
  ndr.scalar_gate:
    value: 0
  ndr.drop_gate:
    value: 0.05
  grad_clip:
    value: 1
  length_bucketed_sampling:
    value: 1
  transformer.ff_multiplier:
    value: 2
  trafo_classifier.out_mode:
    value: linear
  trafo_classifier.norm_att:
    value: 0
  # The only multi-valued parameter: five categorical values, so the sweep
  # consists of five runs that share all settings above.
  # NOTE(review): presumably a repeat/seed index rather than a model
  # hyperparameter - confirm how the program uses it.
  sweep_id_for_grid_search:
    distribution: categorical
    values:
      - 1
      - 2
      - 3
      - 4
      - 5
