# Settings for the video tokenizer: three nested sub-configs ("encoder",
# "decoder", "codebook") followed by flat input, optimisation and training
# entries. How each key is consumed is not visible in this file.
video_tokenizer_config = {
    # Encoder sub-config.
    "encoder": {
        "n_layer": 2,
        "n_head": 8,
        "n_embd": 512,
        "dropout": 0,
        "bias": True,
    },
    # Decoder sub-config (same width/heads as the encoder, more layers).
    "decoder": {
        "n_layer": 4,
        "n_head": 8,
        "n_embd": 512,
        "dropout": 0,
        "bias": True,
    },
    # Codebook sub-config.
    # NOTE(review): "n_embd" here is presumably the number of codebook
    # entries rather than a model width -- confirm against the consumer.
    "codebook": {
        "embed_dim": 32,
        "n_embd": 1024,
        "beta": 0.25,
        "norm_vq": True,
    },
    # Input geometry.
    "img_size": 64,
    "patch_size": 4,
    "seq_len": 8,
    # Optimisation settings.
    "weight_decay": 1e-4,
    "warmup_steps": 10_000,
    "lr": 3e-4,
    "betas": (0.9, 0.9),
    # Training-loop settings.
    "n_train_steps": 100_000,
    "batch_size": 32,
    "gradient_accumulate_every": 1,
}

# video_tokenizer_config_256 = {
#     'encoder': {
#         'n_layer': 6,
#         'n_head': 8,
#         'n_embd': 512,
#         'dropout': 0,
#         'bias': True
#     },

#     'decoder': {
#         'n_layer': 8,
#         'n_head': 8,
#         'n_embd': 512,
#         'dropout': 0,
#         'bias': True
#     },

#     'codebook': {
#         'embed_dim': 32,
#         'n_embd': 1024,
#         'beta': 0.25,
#         'norm_vq': True
#     },

#     'img_size': 256,
#     'patch_size': 16,
#     'seq_len': 8,

#     'weight_decay': 1e-4,
#     'warmup_steps': 10000,
#     'lr': 3e-4,
#     'betas': (0.9, 0.9),

#     'n_train_steps': 100000,
#     'batch_size': 32,
#     'gradient_accumulate_every': 1,
# }


# Settings for the "oxe" video tokenizer variant: nested "encoder",
# "decoder" and "codebook" sub-configs followed by flat input and training
# entries. How each key is consumed is not visible in this file.
#
# NOTE(review): unlike video_tokenizer_config, this dict has no "norm_vq"
# entry under "codebook" and no "weight_decay" / "warmup_steps" / "lr" /
# "betas" / "gradient_accumulate_every" entries -- confirm that whatever
# reads it supplies defaults or does not need them.
oxe_video_tokenizer_config = {
    # Encoder sub-config.
    "encoder": {
        "n_layer": 6,
        "n_head": 8,
        "n_embd": 512,
        "dropout": 0,
        "bias": True,
    },
    # Decoder sub-config (wider and deeper than the encoder).
    "decoder": {
        "n_layer": 10,
        "n_head": 16,
        "n_embd": 1024,
        "dropout": 0,
        "bias": True,
    },
    # Codebook sub-config.
    "codebook": {
        "embed_dim": 32,
        "n_embd": 2048,
        "beta": 0.25,
    },
    # Input geometry.
    "img_size": 256,
    "patch_size": 16,
    "seq_len": 8,
    # Training-loop settings.
    "n_train_steps": 300_000,
    "batch_size": 8,
}


