"anc_model_type": set to "transformer"
"anc_enc_config": dictionary containing sub-config for the ancestor sequence embedder; contains the following:
    "initial_embed_module": (STRING) How to first embed the sequences; "EmbeddingWithPadding" is good
    "first_block_module": (STRING) What block to apply after first embedding the sequences; we use "RoPETransfBlock"
    "subsequent_block_module": (STRING) What subsequent blocks to use; we use "RoPETransfBlock"
    "hidden_dim": (INT) embedding size
    "num_blocks": (INT) how many transformer blocks to stack (self-attention block + MLP block)
    "num_heads": (INT) number of heads in self-attention
    "dropout": (FLOAT) dropout rate
    "max_position_embeddings": (INT) maximum sequence length (for RoPE)

"desc_model_type": set to "transformer"
"desc_dec_config": dictionary containing sub-config for the descendant sequence embedder; contains the same fields as anc_enc_config
    