# @package _global_
# Dataset Configuration for Epiformer Dataset Creation
# This config defines parameters for creating the epiformer dataset from AsEP + PDB files

dataset:
  # ============================================
  # Dataset File and Graph Type Configuration
  # ============================================
  # File name of the dataset pickle (presumably the one loaded by default;
  # verify against the loader).
  # NOTE(review): this is the "_test" file, whereas graph_datasets.raad below
  # names "epiformer_dataset.pkl" - confirm which is the intended default.
  tensor: "epiformer_dataset_test.pkl"  # Default for RAAD

  # Graph type configuration - affects encoder architecture
  #   base:    no hierarchy, residue-only graph with 4.5 Å proximity
  #   simple:  epiformer with uni-relational residue edges
  #   gearnet: epiformer with 7-relation residue edges (GearNet-Edge)
  #   raad:    epiformer with multi-relational residue edges (current)
  graph_type: "raad"  # Options: "base", "simple", "gearnet", "raad"
  # Presumably the number of residue-edge relation types for the selected
  # graph_type - TODO confirm. It likely has to be changed together with
  # graph_type (e.g. gearnet is described above as using 7 relations).
  graph_num_relations: 4

  
  # Protein language model (PLM) selection.
  # The value of plm_type matches one of the sibling keys below, each of which
  # names a residue-graph tensor pickle for that PLM.
  # NOTE(review): presumably the loader uses plm_type to look up the matching
  # key - verify against the consuming code.
  plm_type: "esm2_35m"  # Options (keys defined below): "esm2_35m", "esm2_650m", "esm2_3b", "esm3_small"
  esm2_35m: "res_graph_tensor_esm2_35m.pkl"
  esm2_650m: "res_graph_tensor_esm2_650m.pkl"
  esm2_3b: "res_graph_tensor_esm2_3b.pkl"
  esm3_small: "res_graph_tensor_esm3_small.pkl"
  
  # Graph type-specific datasets: one dataset pickle file name per graph_type
  # option ("base", "simple", "gearnet", "raad").
  # NOTE(review): presumably indexed by the graph_type value - verify against
  # the consuming code.
  graph_datasets:
    base: "base_dataset.pkl"
    simple: "simple_epiformer_dataset.pkl"
    gearnet: "gearnet_epiformer_dataset.pkl"
    raad: "epiformer_dataset.pkl"

  
  # Dataset split configuration, added for AsEP paper compatibility.
  # NOTE(review): this was originally tagged "TODO" - confirm whether any
  # follow-up work remains.
  split:
    method: "random"  # Options: "random", "epitope_ratio", "epitope_group"
    seed: 42         # Random seed for reproducible splits
    # AsEP paper split file paths (corrected for excluded complexes).
    # ${hydra:runtime.cwd} is Hydra's record of the directory the application
    # was launched from, so these paths do not depend on Hydra's per-run
    # output directory.
    # NOTE(review): these climb four levels ("../../../../") while the entries
    # under `paths` climb three ("../../../") and are not anchored on
    # hydra:runtime.cwd - confirm both resolve to the same data/asep directory.
    split_dict_path: "${hydra:runtime.cwd}/../../../../data/asep/split/split_dict_corrected.pt"
    pdb_ids_path: "${hydra:runtime.cwd}/../../../../data/asep/split/asepv1-AbDb-IDs-corrected.txt"

  # Input/output locations used when creating the dataset.
  # All entries are plain relative paths; the base directory they are resolved
  # against is not defined in this file.
  # NOTE(review): presumably relative to the process working directory -
  # verify, since Hydra can change the working directory per run.
  paths:
    # AsEP preprocessed data (contains PLM embeddings and labels)
    asep_data: "../../../data/asep/dict_pre_cal_esm2_esm2.pkl"
    
    # PDB structure directories
    antigen_pdb_dir: "../../../data/asep/ag_atmseq2surf"    # Antigen surface PDBs
    antibody_pdb_dir: "../../../data/asep/ab_atmseq2cdr"    # Antibody CDR PDBs
    
    # Output paths: directory and file name of the generated dataset.
    # NOTE(review): output_file repeats the name given in graph_datasets.raad;
    # keep the two in sync if either is renamed.
    output_dir: "../../../data/asep/epiformer"
    output_file: "epiformer_dataset.pkl"
    
    # Alternative data sources (for different experiments)
    alternative:
      m3epi_transformed: "../../../data/asep/m3epi/asep_m3epi_transformed.pkl"
      # NOTE(review): the file name says "mipe" although it lives under m3epi/ -
      # confirm this is the intended file.
      test_subset: "../../../data/asep/m3epi/asep_mipe_transformed_100_examples.pkl"

    