
# Datamodule entry point; `_target_` follows the Hydra instantiate convention.
_target_: src.datamodules.pcfg_set.PCFGSetDatamodule
# Class path that every entry under `datasets` reuses as its own `_target_`.
dataset_target_: src.datamodules.pcfg_set.PCFGSetDataset
seed: 42
key: pcfg_set
tokenize_in_dataset: true
# Two values read from the supervision-scheduler callback config (`hp_init`
# of `scheduler_xz` and of `scheduler_z`), in the order [xz, p(z|not xz)].
data_type_sampling_probability:
  - '${callbacks.supervision_scheduler.scheduler_xz.hp_init}'
  - '${callbacks.supervision_scheduler.scheduler_z.hp_init}'

# Name of the `experiment_paths` entry whose train/test files are used (the
# `dataset_parameters` paths are interpolated through this key).
# Entries defined in this file: systematicity, productivity, pcfgset,
# substitutivity.
# NOTE(review): "localism" and "overgeneralization" have no `experiment_paths`
# entry in this file; add one before selecting either, otherwise the path
# interpolations cannot resolve.
experiment: 'systematicity'

# Prefix prepended to every path under `experiment_paths`.
path_header: '${work_dir}'

# Per-experiment file locations. Every experiment defines a `train` and a
# `test` split, each with a source file (`src_path`) and a target file
# (`tgt_path`) rooted at `datamodule.path_header`.
experiment_paths:
  systematicity:
    train:
      src_path: '${datamodule.path_header}/data/pcfgset/systematicity/train.src'
      tgt_path: '${datamodule.path_header}/data/pcfgset/systematicity/train.tgt'
    test:
      src_path: '${datamodule.path_header}/data/pcfgset/systematicity/test.src'
      tgt_path: '${datamodule.path_header}/data/pcfgset/systematicity/test.tgt'
  productivity:
    train:
      src_path: '${datamodule.path_header}/data/pcfgset/productivity/train.src'
      tgt_path: '${datamodule.path_header}/data/pcfgset/productivity/train.tgt'
    test:
      src_path: '${datamodule.path_header}/data/pcfgset/productivity/test.src'
      tgt_path: '${datamodule.path_header}/data/pcfgset/productivity/test.tgt'
  pcfgset:
    train:
      src_path: '${datamodule.path_header}/data/pcfgset/pcfgset/train.src'
      tgt_path: '${datamodule.path_header}/data/pcfgset/pcfgset/train.tgt'
    test:
      src_path: '${datamodule.path_header}/data/pcfgset/pcfgset/test.src'
      tgt_path: '${datamodule.path_header}/data/pcfgset/pcfgset/test.tgt'
  # Unlike the other experiments, these files sit one level deeper, in the
  # `equally_distributed` subdirectory.
  substitutivity:
    train:
      src_path: '${datamodule.path_header}/data/pcfgset/substitutivity/equally_distributed/train.src'
      tgt_path: '${datamodule.path_header}/data/pcfgset/substitutivity/equally_distributed/train.tgt'
    test:
      src_path: '${datamodule.path_header}/data/pcfgset/substitutivity/equally_distributed/test.src'
      tgt_path: '${datamodule.path_header}/data/pcfgset/substitutivity/equally_distributed/test.tgt'

# Dataset/loader settings. Most values are interpolated from other parts of
# the composed config (`seed`, `overfit_batch`, `model.model_params.*`).
dataset_parameters:
  # Absolute interpolation: resolves against the top-level `seed` of the
  # composed config. NOTE(review): since this file refers to itself as
  # `datamodule.*`, that is presumably not the `seed: 42` declared in this
  # file — confirm which one is intended.
  seed: '${seed}'
  batch_size: 128
  train_ratio: 0.8
  # Order: [r(xz), r(z|not xz)]
  supervision_ratio: [0.05, 0.5]
  num_workers: 8
  overfit_batch: '${overfit_batch}'
  max_x_length: '${model.model_params.max_x_length}'
  max_z_length: '${model.model_params.max_z_length}'
  remove_long_data_points: true
  # File paths are looked up in `experiment_paths` under the experiment named
  # by `datamodule.experiment` (nested interpolation).
  # NOTE(review): only `train` and `test` paths exist; the `val` dataset is
  # presumably split off the training data via `train_ratio` — confirm.
  train:
    src_path: '${datamodule.experiment_paths[${datamodule.experiment}].train.src_path}'
    tgt_path: '${datamodule.experiment_paths[${datamodule.experiment}].train.tgt_path}'
  test:
    src_path: '${datamodule.experiment_paths[${datamodule.experiment}].test.src_path}'
    tgt_path: '${datamodule.experiment_paths[${datamodule.experiment}].test.tgt_path}'
  
# One dataset spec per split. Each spec takes its `_target_` from
# `datamodule.dataset_target_` and differs only in the `split` value.
datasets:
  seed: '${seed}'
  test:
    _target_: '${datamodule.dataset_target_}'
    split: 'test'

  train:
    _target_: '${datamodule.dataset_target_}'
    split: 'train'

  val:
    _target_: '${datamodule.dataset_target_}'
    split: 'val'