_name: null
common:
  _name: null
  no_progress_bar: false
  log_interval: 200
  log_format: json
  log_file: null
  aim_repo: null
  aim_run_hash: null
  tensorboard_logdir: null
  wandb_project: null
  azureml_logging: false
  seed: 1
  cpu: false
  tpu: false
  bf16: false
  memory_efficient_bf16: false
  fp16: true
  memory_efficient_fp16: false
  fp16_no_flatten_grads: false
  fp16_init_scale: 4
  fp16_scale_window: 128
  fp16_scale_tolerance: 0.0
  on_cpu_convert_precision: false
  min_loss_scale: 0.0001
  threshold_loss_scale: 1.0
  amp: false
  amp_batch_retries: 2
  amp_init_scale: 128
  amp_scale_window: null
  user_dir: null
  empty_cache_freq: 0
  all_gather_list_size: 16384
  model_parallel_size: 1
  quantization_config_path: null
  profile: false
  reset_logging: false
  suppress_crashes: false
  use_plasma_view: false
  plasma_path: /tmp/plasma
common_eval:
  _name: null
  path: null
  post_process: null
  quiet: false
  model_overrides: '{}'
  results_path: null
distributed_training:
  _name: null
  distributed_world_size: 1
  distributed_num_procs: 1
  distributed_rank: 0
  distributed_backend: nccl
  distributed_init_method: null
  distributed_port: -1
  device_id: 0
  distributed_no_spawn: false
  ddp_backend: pytorch_ddp
  ddp_comm_hook: none
  bucket_cap_mb: 25
  fix_batches_to_gpus: false
  find_unused_parameters: true
  gradient_as_bucket_view: false
  fast_stat_sync: false
  heartbeat_timeout: -1
  broadcast_buffers: false
  slowmo_momentum: null
  slowmo_base_algorithm: localsgd
  localsgd_frequency: 3
  nprocs_per_node: 1
  pipeline_model_parallel: false
  pipeline_balance: null
  pipeline_devices: null
  pipeline_chunks: 0
  pipeline_encoder_balance: null
  pipeline_encoder_devices: null
  pipeline_decoder_balance: null
  pipeline_decoder_devices: null
  pipeline_checkpoint: never
  zero_sharding: none
  fp16: ${common.fp16}
  memory_efficient_fp16: ${common.memory_efficient_fp16}
  tpu: ${common.tpu}
  no_reshard_after_forward: false
  fp32_reduce_scatter: false
  cpu_offload: false
  use_sharded_state: false
  not_fsdp_flatten_parameters: false
dataset:
  _name: null
  num_workers: 1
  skip_invalid_size_inputs_valid_test: false
  max_tokens: 4400
  batch_size: 16
  required_batch_size_multiple: 1
  required_seq_len_multiple: 1
  dataset_impl: null
  data_buffer_size: 10
  train_subset: train
  valid_subset: valid
  combine_valid_subsets: null
  ignore_unused_valid_subsets: false
  validate_interval: 1
  validate_interval_updates: 0
  validate_after_updates: 0
  fixed_validation_seed: null
  disable_validation: false
  max_tokens_valid: ${dataset.max_tokens}
  batch_size_valid: ${dataset.batch_size}
  max_valid_steps: null
  curriculum: 0
  gen_subset: test
  num_shards: 1
  shard_id: 0
  grouped_shuffling: false
  update_epoch_batch_itr: ${dataset.grouped_shuffling}
  update_ordered_indices_seed: false
optimization:
  _name: null
  max_epoch: 10
  max_update: 5336
  stop_time_hours: 0.0
  clip_norm: 0.0
  sentence_avg: false
  update_freq:
  - 1
  lr:
  - 1.0e-05
  stop_min_lr: -1.0
  use_bmuf: false
  skip_remainder_batch: false
checkpoint:
  _name: null
  save_dir: checkpoints
  restore_file: /Users/aaron/Projects/feature-aug/roberta_pretrained/roberta.base.tar.gz
  continue_once: null
  finetune_from_model: null
  reset_dataloader: true
  reset_lr_scheduler: false
  reset_meters: true
  reset_optimizer: true
  optimizer_overrides: '{}'
  save_interval: 1
  save_interval_updates: 0
  keep_interval_updates: -1
  keep_interval_updates_pattern: -1
  keep_last_epochs: -1
  keep_best_checkpoints: -1
  no_save: false
  no_epoch_checkpoints: true
  no_last_checkpoints: false
  no_save_optimizer_state: false
  best_checkpoint_metric: accuracy
  maximize_best_checkpoint_metric: true
  patience: -1
  checkpoint_suffix: ''
  checkpoint_shard_count: 1
  load_checkpoint_on_all_dp_ranks: false
  write_checkpoints_asynchronously: false
  model_parallel_size: ${common.model_parallel_size}
bmuf:
  _name: null
  block_lr: 1.0
  block_momentum: 0.875
  global_sync_iter: 50
  warmup_iterations: 500
  use_nbm: false
  average_sync: false
  distributed_world_size: ${distributed_training.distributed_world_size}
generation:
  _name: null
  beam: 5
  nbest: 1
  max_len_a: 0.0
  max_len_b: 200
  min_len: 1
  match_source_len: false
  unnormalized: false
  no_early_stop: false
  no_beamable_mm: false
  lenpen: 1.0
  unkpen: 0.0
  replace_unk: null
  sacrebleu: false
  score_reference: false
  prefix_size: 0
  no_repeat_ngram_size: 0
  sampling: false
  sampling_topk: -1
  sampling_topp: -1.0
  constraints: null
  temperature: 1.0
  diverse_beam_groups: -1
  diverse_beam_strength: 0.5
  diversity_rate: -1.0
  print_alignment: null
  print_step: false
  lm_path: null
  lm_weight: 0.0
  iter_decode_eos_penalty: 0.0
  iter_decode_max_iter: 10
  iter_decode_force_max_iter: false
  iter_decode_with_beam: 1
  iter_decode_with_external_reranker: false
  retain_iter_history: false
  retain_dropout: false
  retain_dropout_modules: null
  decoding_format: null
  no_seed_provided: false
  eos_token: null
eval_lm:
  _name: null
  output_word_probs: false
  output_word_stats: false
  context_window: 0
  softmax_batch: 9223372036854775807
interactive:
  _name: null
  buffer_size: 0
  input: '-'
model:
  _name: roberta
  dropout: 0.1
  attention_dropout: 0.1
task:
  _name: sentence_prediction
  data: ./CoLA-bin
  init_token: 0
  separator_token: 2
  num_classes: 2
  max_positions: 512
criterion:
  _name: sentence_prediction
optimizer:
  _name: adam
  weight_decay: 0.1
  adam_betas: (0.9,0.98)
  adam_eps: 1.0e-06
lr_scheduler:
  _name: polynomial_decay
  warmup_updates: 320
scoring: null
bpe: null
tokenizer: null
ema:
  _name: null
  store_ema: false
  ema_decay: 0.9999
  ema_start_update: 0
  ema_seed_model: null
  ema_update_freq: 1
  ema_fp32: false
