# Composition list (appears to be a Hydra defaults list): each `group: option`
# entry selects one option file from a config group, in the order listed.
# The commented-out entries are alternative evaluator selections kept for
# quick switching; only one `evaluator` entry should be active at a time.
defaults:
  - model: per_token_iql
  - dataset: list_test
  # - score_evaluators@evaluator: iql_eval
  - evaluator: iql_evaluator
  # - evaluator: top_advantage_n_grams
  # `_self_` is listed last, so (under Hydra semantics) the keys defined in
  # this file are merged after, and take precedence over, the entries above.
  - _self_

# Overrides for the dataset option selected in `defaults` (`list_test`).
dataset:
  # Same id that `model.dataset.cache_id` uses below; presumably lets both
  # sections refer to one shared dataset object -- confirm against the loader.
  cache_id: d
  data:
    # Same id that `evaluator.env.data.cache_id` uses below.
    cache_id: raw_data
    cache_path: null
    # Reward transform settings; `evaluator.env` below repeats the same
    # shift/scale values, so change both together.
    # NOTE(review): the scale is negative -- presumably so that a higher
    # toxicity score yields a lower reward; confirm against the reward code.
    reward_shift: 0.0
    reward_scale: -5.0
    reward_f:
      name: toxicity_reward
      # Alternative reward function, kept for reference:
      # name: score_human_reward
      # reddit_path: data/reddit_comments/
      # index_path: data/reddit_comments/test_idxs.json
  max_len: 512
  # NOTE(review): key is spelled `cuttoff`; it presumably has to match the
  # spelling the consuming code expects, so do not rename it here alone.
  cuttoff: null
  resample_timeout: 0.05
  # Also set (to the same value) under `evaluator.env.include_parent`.
  include_parent: true

# Overrides for the model option selected in `defaults` (`per_token_iql`).
# NOTE(review): the meanings of the individual hyperparameters are defined by
# the model code, which is not visible from this file; the grouping comments
# below are descriptive only.
model:
  # Scalar hyperparameters.
  alpha: 0.005
  gamma: 0.99
  beta: 0.0
  transition_weight: 0.0
  # Optional bounds; `null` leaves them unset.
  clip_weight: null
  value_max: null
  value_min: null
  # Gradient-detach switches, all disabled here.
  detach_v: false
  detach_q: false
  detach_pi: false
  double_q: true
  # NOTE(review): `seperate_*` is spelled this way in the key names; they
  # presumably must match the consuming code, so do not "fix" the spelling
  # in this file alone.
  seperate_policy: true
  seperate_target: true
  tau: 0.8
  exp_weights: true
  dm_margin: 0.0
  advanced_mlp: false
  cql_temp: 1.0
  gpt2:
    lm_head: true
    from_pretrained: true
  dataset:
    name: toxicity_list_dataset
    # Matches the top-level `dataset.cache_id`.
    cache_id: d
  load:
    # Previously used checkpoints, kept for quick switching:
    # checkpoint_path: outputs/toxicity/conditional_toxicity_bc_test1/model.pkl
    # checkpoint_path: outputs/toxicity/conditional_toxicity_pbc_test1/model_163839.pkl
    # checkpoint_path: outputs/toxicity/conditional_toxicity_official_iql_test1/model_131071.pkl
    # Active checkpoint. The run name in this path also appears in
    # `eval.log_save_path`; update both when switching checkpoints.
    checkpoint_path: outputs/toxicity/conditional_toxicity_official_iql_test2/model_196607.pkl
    strict_load: true

# evaluator:
#   data:
#     name: toxicity_list_dataset
#     cache_id: d
#   print_every: 100
#   print_k: 100
#   n_gram: 2


# evaluator:
#   env:
#     reward_shift: 0.0
#     reward_scale: -5.0
#     reward_f:
#       name: toxicity_reward
#       # name: model_reward
#       # model:
#       #   name: roberta_binary_reward_model
#       #   dataset:
#       #     name: toxicity_list_dataset
#       #     cache_id: d
#       #   roberta_kind: roberta-base
#       #   freeze_roberta: false
#       #   reward_cuttoff: 0.0
#       #   load:
#       #     name: roberta_binary_reward_model
#       #     checkpoint_path: outputs/toxicity/toxicity_upvote_roberta_binary_reward_f1/model.pkl
#       #     strict_load: true
#   kwargs_main:
#     beta: 10.0
#     exp_weights: true
#     clip_weight: null
#     logit_temp: 1.0
#     logit_top_k: null
#     logit_top_p: null
#     include_logits: true
#     include_advantage: true
#   verbose: true
#   num_generations: 1
#   max_generation_len: 256
#   kind: sample

# Active evaluator overrides for the option selected in `defaults`
# (`iql_evaluator`). The commented-out `evaluator:` blocks earlier in the file
# are older/alternative configurations and are not parsed.
evaluator:
  env:
    # Same shift/scale values as `dataset.data`; keep the two in sync.
    reward_shift: 0.0
    reward_scale: -5.0
    data:
      name: reddit_comments
      # Matches `dataset.data.cache_id`.
      cache_id: raw_data
    reward_f:
      name: toxicity_reward
      # Alternative: reward from a learned model, kept for reference.
      # name: model_reward
      # model:
      #   name: roberta_binary_reward_model
      #   dataset:
      #     name: toxicity_list_dataset
      #     cache_id: d
      #   roberta_kind: roberta-base
      #   freeze_roberta: false
      #   reward_cuttoff: 0.0
      #   load:
      #     name: roberta_binary_reward_model
      #     checkpoint_path: outputs/toxicity/toxicity_upvote_roberta_binary_reward_f2/model.pkl
      #     strict_load: true
    # Same value as `dataset.include_parent`.
    include_parent: true
  verbose: true
  kind: sample
  generation_kwargs:
    max_generation_len: 256
    # beam_width: 1
    # Sampling controls; `null` for top_k/top_p leaves them unset.
    temp: 1.0
    top_k: null
    top_p: null
    exp_adv: true
    # 8.0 here corresponds to the `beta8` tag in `eval.log_save_path`
    # (presumably by convention -- the filename is not derived automatically).
    adv_weight: 8.0
    adv_clip: null
    include_logits: true
    include_adv: true
    num_generations: 1
    # Rerank weights: log-prob term is zeroed, advantage term has weight 1.0.
    # NOTE(review): with num_generations set to 1 reranking presumably has no
    # effect; confirm against the evaluator code.
    rerank_log_prob_weight: 0.0
    rerank_advantage_weight: 1.0

# Settings for the evaluation run itself (data loading, logging, loss weights).
eval:
  dataloader_workers: 1
  bsize: 1
  batches: 1024
  print_every: 8
  seed: 0
  # The filename encodes the checkpoint run name (`..._official_iql_test2`,
  # as in `model.load.checkpoint_path`) plus `beta8`/`beam1` tags; these are
  # hand-written, so update them when the corresponding settings change.
  log_save_path: outputs/toxicity/evals/conditional_toxicity_official_iql_test2_beta8_beam1_eval1.pkl
  # Per-term loss weights, all set to 1.0.
  loss:
    v_loss_weight: 1.0
    q_loss_weight: 1.0
    awac_weight: 1.0
    cql_loss_weight: 1.0
    mc_returns: false
