# @package model
t2tattn1_cfg:
  _target_: src.models.attention.sbblocksparse_attention.SBBlockSparseAttention
  # First attention stage. Sequence length is 3136; the stated goal is to
  # use about 1/32 of the memory (presumably relative to dense attention;
  # confirm against the attention implementation).
  max_seq_length: 3136
  dim_heads: 64
  nb_features: 32
  attention_dropout: 0.0
  # Block-sparse layout, built from DeepSpeed's BigBird sparsity config.
  sparsity_config:
    _target_: deepspeed.ops.sparse_attention.BigBirdSparsityConfig
    num_heads: 1
    block: 32
    num_sliding_window_blocks: 3
t2tattn2_cfg:
  # Same attention class as t2tattn1_cfg, pulled in by relative interpolation.
  _target_: "${..t2tattn1_cfg._target_}"
  # Second attention stage. Sequence length is 784; the stated goal is again
  # to use about 1/32 of the memory (presumably relative to dense attention;
  # confirm against the attention implementation).
  max_seq_length: 784
  # Head dimension is shared with t2tattn1_cfg.
  dim_heads: "${..t2tattn1_cfg.dim_heads}"
  nb_features: 16
  attention_dropout: 0.0
  # Sparsity config class and head count follow t2tattn1_cfg; only the block
  # size differs (16 here instead of 32).
  sparsity_config:
    _target_: "${...t2tattn1_cfg.sparsity_config._target_}"
    num_heads: "${...t2tattn1_cfg.sparsity_config.num_heads}"
    block: 16
    num_sliding_window_blocks: 3
