# Parent config this file builds on. The assignments below are presumably
# merged over it (OpenMMLab-style `_base_` inheritance) — TODO confirm
# against the config loader used by this project.
_base_ = "slipmae_emotion.py"

# Ordered list of transform specs; each entry is a plain dict whose "type"
# names the transform and whose remaining keys are its arguments.
pipeline = [
    # Loader step: reads the "appearance" label and a randomly chosen segment
    # capped at 4 frames. An audio path key and sampling rate are supplied,
    # but video_only is set, so presumably no audio is decoded — TODO confirm.
    {
        "type": "LoadVideoWithLabelSegment",
        "video_path_key": "video_path",
        "audio_path_key": "audio_path",
        "label_key": "appearance",
        "max_num_frames": 4,
        "sampling_rate": 16000,
        "segment_rule": "random",
        "video_only": True,
        "strict_length": True,
    },
    # No caption file is configured (text_path_key is None); a single fixed
    # placeholder caption is provided instead.
    {
        "type": "LoadText",
        "text_path_key": None,
        "dummy_captions": ["A person is talking."],
    },
    # Resize with the aspect ratio kept, then centre-crop to 512x512.
    {
        "type": "ResizeVideo",
        "video_keys": ["video"],
        "size_candidates": [(512, 512)],
        "keep_ratio": True,
    },
    {
        "type": "CenterCropVideo",
        "video_keys": ["video"],
        "crop_size": (512, 512),
    },
    # Per-channel normalisation with mean 0.5 and std 0.5.
    {
        "type": "NormalizeVideo",
        "video_keys": ["video"],
        "mean": [0.5, 0.5, 0.5],
        "std": [0.5, 0.5, 0.5],
    },
]

# Model overrides for the "appearance" label set: 40 classes, trained with the
# "multilabel" task setting. label_key uses the same value ("appearance") as
# the label_key passed to the loader step in `pipeline`.
# NOTE(review): the previous inline comment read "multiclass for appearance,
# multilabel for action and emotion", which contradicts the value set here.
# Presumably it should read "multiclass for emotion, multilabel for appearance
# and action" — confirm against the model's task handling before relying on it.
model = dict(
    task="multilabel",  # value used for appearance in this config; see NOTE above
    num_classes=40,  # 8 for emotion, 40 for appearance, 35 for action
    label_key="appearance",
)

# Training dataloader overrides: batch of 6, 8 workers, and a dataset that
# reads the train annotation file and applies the `pipeline` defined above.
train_dataloader = {
    "batch_size": 6,
    "num_workers": 8,
    "dataset": {
        "anno_file": "data/celebv-hq/annotations/train_anno_meta.json",
        "pipeline": pipeline,
    },
}
