# Hydra defaults list: `_preprocess` is composed first and `_self_` last, so
# the values in this file are merged on top of it.
defaults:
  - _preprocess
  - _self_

# Both directories are taken from environment variables through OmegaConf's
# `oc.env` resolver (no fallback value is given for either variable).
input_dir: '${oc.env:MEDS_DIR}'
cohort_dir: '${oc.env:MODEL_DIR}'
# Names of the pipeline stages. Stages that need non-default settings have a
# matching key under `stage_configs`.
# NOTE(review): presumably the stages run in the order listed — confirm
# against the pipeline runner.
stages:
  - fit_filter_and_occlude
  - filter_measurements
  - filter_subjects
  - occlude_outliers
  # NOTE(review): "measurments" looks misspelled, but the `stage_configs` key
  # uses the identical spelling; rename both together or not at all.
  - reorder_measurments
  - fit_normalization
  - fit_vocabulary_indices
  - normalization
  - tokenization
  - tensorization

# Per-stage settings, keyed by the stage names used in the `stages` list.
# A `_script` entry names the command associated with that stage.
stage_configs:
  # Per-subject minimum counts for the `filter_subjects` stage.
  filter_subjects:
    min_events_per_subject: 10
    min_measurements_per_subject: 10

  # Code-metadata aggregation: per-code counts plus value sums.
  # NOTE(review): presumably these statistics drive the later filter and
  # outlier-occlusion stages — confirm.
  fit_filter_and_occlude:
    _script: 'MEDS_transform-aggregate_code_metadata'
    aggregations:
      - code/n_occurrences
      - code/n_subjects
      - values/sum
      - values/sum_sqd
      - values/n_occurrences

  # Per-code subject-count threshold for the `filter_measurements` stage.
  filter_measurements:
    min_subjects_per_code: 10000

  # The key spelling ("measurments") matches the entry in `stages`.
  reorder_measurments:
    _script: 'MEDS_transform-reorder_measurements'
    # Folded scalar: the two lines below join with a single space into one
    # string with no trailing newline.
    # NOTE(review): `$(...)` is shell command-substitution syntax; YAML and
    # OmegaConf only resolve the inner `${oc.env:MODEL_DIR}` — presumably the
    # consumer evaluates the result in a shell. Confirm.
    order: >-
      $(python -m meds_torch.utils.get_all_measurements
      metadata_fp=${oc.env:MODEL_DIR}/metadata/codes.parquet)

  # Second code-metadata aggregation: per-code counts plus value deciles.
  fit_normalization:
    _script: 'MEDS_transform-aggregate_code_metadata'
    aggregations:
      - code/n_occurrences
      - code/n_subjects
      - name: values/quantiles
        quantiles:
          - 0.1
          - 0.2
          - 0.3
          - 0.4
          - 0.5
          - 0.6
          - 0.7
          - 0.8
          - 0.9

  # Tokenization is delegated to a meds_torch module invoked via `python -m`.
  tokenization:
    _script: 'python -m meds_torch.utils.custom_tokenization'
