# Parent config that this file extends (presumably resolved by an
# MMEngine-style config loader -- confirm). The assignments below set the
# values used for the "action" label task.
_base_ = "slipmae_emotion_faceonly.py"


# Video preprocessing steps for the "action" label task. Each entry is a
# transform spec identified by its "type" key.
pipeline = [
    # Load a labelled video segment. video_only is set, so the audio-related
    # settings are presumably unused by this step -- confirm against the loader.
    {
        "type": "LoadVideoWithLabelSegment",
        "video_path_key": "video_path",
        "audio_path_key": "audio_path",
        "label_key": "action",
        "max_num_frames": 4,
        "sampling_rate": 16000,
        "segment_rule": "random",
        "video_only": True,
        "strict_length": True,
    },
    # Resize with keep_ratio enabled; the only size candidate is 512x512.
    {
        "type": "ResizeVideo",
        "video_keys": ["video"],
        "size_candidates": [(512, 512)],
        "keep_ratio": True,
    },
    # Center-crop to 512x512.
    {
        "type": "CenterCropVideo",
        "video_keys": ["video"],
        "crop_size": (512, 512),
    },
    # Normalize with a three-element mean and std, all 0.5.
    {
        "type": "NormalizeVideo",
        "video_keys": ["video"],
        "mean": [0.5, 0.5, 0.5],
        "std": [0.5, 0.5, 0.5],
    },
]

# Model overrides for the "action" label.
#   task:        multiclass for appearance, multilabel for action and emotion
#   num_classes: 8 for emotion, 40 for appearance, 35 for action
model = {
    "task": "multilabel",
    "num_classes": 35,
    "label_key": "action",
}

# Training dataloader settings: batches of 48, 16 workers, shuffled sampling.
train_dataloader = {
    "batch_size": 48,
    "num_workers": 16,
    "sampler": {"type": "DefaultSampler", "shuffle": True},
    "collate_fn": {"type": "flexible_collate"},
    # Dataset read from the CelebV-HQ training annotation file; samples go
    # through the `pipeline` list defined earlier in this file.
    "dataset": {
        "type": "TextVideoAudioLabelDataset",
        "data_dir": "data/",
        "anno_file": "data/celebv-hq/annotations/train_anno_meta.json",
        "pipeline": pipeline,
        "refetch": True,
    },
}


# Epoch-based training schedule: 100 epochs, with val_interval also set to 100.
train_cfg = {
    "by_epoch": True,
    "max_epochs": 100,
    "val_interval": 100,
}

# Optimizer wrapper: AmpOptimWrapper with dtype "bfloat16" around AdamW
# (lr 5e-5, no weight decay).
optim_wrapper = {
    "type": "AmpOptimWrapper",
    "dtype": "bfloat16",
    "optimizer": {
        "type": "AdamW",
        "lr": 5e-5,
        "betas": [0.9, 0.99],
        "weight_decay": 0.0,
    },
}
