# Settings for implementation details
# These settings "should" not influence the outcome of the computation in major ways, only its speed.
# These settings are PyTorch implementation details, tuned for single(ish)-GPU setups with sane PyTorch defaults.
# -----------------------------------------------------------------------------------------------------

# Name of this implementation profile.
name: torch-default
# Composition order. NOTE(review): the `defaults`/`_self_` layout looks like a Hydra defaults list — confirm.
# Under Hydra semantics, listing `_default` before `_self_` means the keys in this file
# take precedence over the ones inherited from `_default`.
defaults:
  - _default
  - _self_


# Basic pytorch settings
benchmark: True # CUDNN benchmarking
deterministic: False # Enabling this option will disable non-deterministic ops
non_blocking: True # non-blocking .to(device) transfers
# NOTE(review): presumably permits TF32 math on hardware that supports it — confirm against the consumer.
tf32_allowed: True
matmul_precision: medium # one of: highest/high/medium

# Mixed-precision training:
mixed_precision: True # turns on automatic mixed precision (AMP) on GPUs/Intel devices. The default precision needs to be float
grad_scaling: True # Only takes effect when mixed_precision=True
mixed_precision_target_dtype: float16 # bfloat16 may work as well, you might try your luck with it

# Distributed training:
# NOTE(review): broadcast_buffers, bucket_cap_mb, gradient_as_bucket_view and static_graph share their
# names with keyword arguments of torch.nn.parallel.DistributedDataParallel — presumably they are
# forwarded there; confirm against the consumer.
zero_redundancy_optimizer: False # enabling this requires limited_decay_keys=[] for pytorch<=1.10.2
broadcast_buffers: False
bucket_cap_mb: 25
gradient_as_bucket_view: True
static_graph: True

# Scaled dot products: which kernel implementations are enabled.
# NOTE(review): key names mirror the torch.backends.cuda.enable_*_sdp toggles — presumably
# forwarded there; confirm against the consumer.
enable_mem_efficient_sdp: False
enable_math_sdp: True
enable_flash_sdp: True

# Misc:
# NOTE(review): presumably selects the multi-tensor ("foreach") optimizer implementation — confirm against the consumer.
foreach_optimizer: False

# Compilation
# NOTE(review): mode/dynamic/fullgraph/backend share their names with keyword arguments of
# torch.compile — presumably forwarded there when compile_torch is enabled; confirm against the consumer.
compile_torch: True
mode: default # overridden when inductor variables are selected manually below
dynamic: False # enabling this was a world of pain (when last tested, around the torch 2.0 release)
fullgraph: True # why even compile if not compiling everywhere :>
backend: inductor
# Optional overrides for the internal inductor config. With every entry below commented out, this key
# has an empty value, which YAML loads as null; uncommenting entries turns it into a mapping.
_inductor_vars:
  # max_autotune_gemm: True
  # max_autotune_pointwise: False # was better in some tests not to enable this?
  # triton:
  #   cudagraphs: False # cannot fit with overhead
  #   # cudagraph_trees: False # fixes memory problems but has scary warning messages
  # # epilogue_fusion: True # true by default in latest nightly
  # # aggressive_fusion: False # oom on latest nightly
  # permute_fusion: True # nice
  # shape_padding: True # flaky on the new nightly?
  # Messing with the internal inductor config is optional, and maybe not advisable. Options include:
  # - `epilogue_fusion` which fuses pointwise ops into templates. Requires `max_autotune` to also be set
  # - `max_autotune` which will profile to pick the best matmul configuration
  # - `fallback_random` which is useful when debugging accuracy issues
  # - `shape_padding` which pads matrix shapes to better align loads on GPUs especially for tensor cores
  # - `triton.cudagraphs` which will reduce the overhead of python with CUDA graphs
  # - `trace.enabled` which is the most useful debugging flag to turn on
  # - `trace.graph_diagram` which will show you a picture of your graph after fusion
  # - For inductor, the full list of supported configs is available by calling `torch._inductor.list_options()`
  # or directly at https://github.com/pytorch/pytorch/blob/master/torch/_inductor/config.py
