# Hardware selection.
# The quotes make the parsed value the string "4" rather than the integer 4.
# NOTE(review): presumably exported as CUDA_VISIBLE_DEVICES by the training script — confirm.
cuda_visible_devices: "4"
# NOTE(review): accepted values are not listed in this file — confirm against the consumer.
device: gpu

# Experiment identification and graph-construction settings for the arXiv dataset.
# logger_name, model_name and setting_file hold placeholder values; replace them before running.
dataset_name: arXiv
logger_name: arXiv-logger_filename.csv
model_name: logger_name_used_in_MHA_config      # Options: MHA_ReLu, MHA_Anneal, MHA_Sigmoid
setting_file: /path/to/config/file/of/the/corresponding/MHA-Model/
type_model: GAT                                 # NOTE(review): other accepted model types are not listed in this file — confirm
binarized: False                                # NOTE(review): semantics not documented in this file — confirm
multi_layer: False                              # True for multi-layer attention models, False for single-layer
baseline: False                                 # Always False for attention-based learned graphs
# Type of statistical filtering to use: "max" or "mean".
# Only "max" is used for arXiv because its documents are extremely long.
type_graph: max
# Tolerance degree for the statistical filtering of the attention weights. Default: 0.5.
tol_degree: 0.5
unified_nodes: True                             # NOTE(review): semantics not documented in this file — confirm


# Filesystem locations used by the experiment.
# All three values are placeholders; replace each with a real path before running.
# NOTE(review): whether each path is read from, written to, or both is not shown here — confirm.
data_paths:
  path_logger: /path/to/arXiv-logger/file/
  results_folder: /path/to/GNN/results/folder/
  root_graph_dataset: /path/to/graph-based/arXiv-dataset/folder/

# Input data locations. in_path is a placeholder; replace it with the real data folder.
# The four data/label entries are explicit empty strings in this template.
# NOTE(review): presumably file names resolved relative to in_path — confirm how empty values are handled.
load_data_paths:
  in_path: "/path/to/data/folder/"
  data_train: ""
  labels_train: ""
  data_test: ""
  labels_test: ""
  with_val: True                                # True if a validation set is available, False otherwise

# Model architecture and optimisation hyperparameters.
model_arch_args:
  num_classes: 11                               # Number of target classes
  lr: 0.001                                     # Learning rate
  dropout: 0.2
  # dim_features and n_layers are lists.
  # NOTE(review): presumably candidate values swept over by the training script — confirm.
  dim_features: [64, 128, 256]
  n_layers: [1, 2, 3]
  num_runs: 5                                   # NOTE(review): presumably repetitions per configuration — confirm

batch_size: 8                                   # Batch size is set to 8 due to the long documents in the arXiv dataset
with_cw: True                                   # NOTE(review): presumably "with class weights" — confirm
max_len: 1800                                   # Maximum sequence length (number of sentences) for arXiv documents

# Training-loop settings.
# NOTE(review): key names match PyTorch Lightning Trainer arguments; presumably forwarded to it as keyword arguments — confirm.
trainer_args:
  max_epochs: 50
  enable_progress_bar: False
  accumulate_grad_batches: 4                    # Accumulate gradients over 4 batches, as the batch size is small

# NOTE(review): presumably early-stopping settings (e.g. for an EarlyStopping callback) — confirm against the training script.
early_args:
  patience: 5  # NOTE(review): presumably epochs/checks without improvement before stopping — confirm
  min_delta: 0.001  # NOTE(review): presumably the minimum change in the monitored metric that counts as an improvement — confirm
