# Configuration for the PDB training data module. Every mapping carrying a
# `_target_` key names the class that the remaining keys are passed to
# (Hydra-style instantiation).
# NOTE(review): all three `data_dir` entries below read the DATA_PATH
# environment variable; it must be set before this config is resolved.
datamodule:
  _target_: "proteinfoundation.datasets.pdb_data.PDBLightningDataModule"
  # Same root as dataselector.data_dir and datasplitter.data_dir so that all
  # three components look at one dataset directory.
  data_dir: ${oc.env:DATA_PATH}/pdb_train/
  in_memory: false
  format: "cif"
  overwrite: false
  batch_padding: true
  sampling_mode: "cluster-random"
  # Listed in the order given here; presumably applied in this order, so the
  # sequence matters — confirm against the data module before reordering.
  transforms:
    - _target_: "proteinfoundation.datasets.transforms.CoordsToNanometers"
    - _target_: "proteinfoundation.datasets.transforms.CenterStructureTransform"
    - _target_: "proteinfoundation.datasets.transforms.GlobalRotationTransform"
    - _target_: "proteinfoundation.datasets.transforms.ChainBreakPerResidueTransform"
    - _target_: "proteinfoundation.datasets.transforms.MotifMaskTransform"
      atom_selection_mode: "all"
      motif_max_pct_res: 0.3
      motif_prob: 1.0
    - _target_: "proteinfoundation.datasets.transforms.CenteringTransform"
      center_mode: "motif"
      data_mode: "all-atom"
    - _target_: "proteinfoundation.datasets.transforms.ExtractMotifCoordinatesTransform"
  # NOTE(review): a batch size of 2 together with dataselector.fraction 0.001
  # looks like a small/debug setup — confirm these are the intended values.
  batch_size: 2
  num_workers: 16
  pin_memory: true

  # Selection criteria handed to PDBDataSelector.
  dataselector:
    _target_: "proteinfoundation.datasets.pdb_data.PDBDataSelector"
    data_dir: ${oc.env:DATA_PATH}/pdb_train/
    fraction: 0.001
    molecule_type: "protein"
    experiment_types: ["diffraction", "EM"]
    min_length: 50
    max_length: 256
    # null is presumably "no bound" — TODO confirm in PDBDataSelector.
    oligomeric_min: null
    oligomeric_max: null
    best_resolution: 0.0
    worst_resolution: 5.0
    has_ligands: []
    remove_ligands: []
    remove_non_standard_residues: true
    remove_pdb_unavailable: true
    # Explicit null (what the previous bare `exclude_ids:` already loaded as).
    # NOTE(review): switch to [] if the selector expects a list — confirm.
    exclude_ids: null

  # Train/validation/test partitioning handed to PDBDataSplitter.
  datasplitter:
    _target_: "proteinfoundation.datasets.pdb_data.PDBDataSplitter"
    data_dir: ${oc.env:DATA_PATH}/pdb_train/
    # The three fractions sum to 1.0.
    train_val_test: [0.98, 0.019, 0.001]
    split_type: "sequence_similarity"
    split_sequence_similarity: 0.5
    overwrite_sequence_clusters: false
