# Training configuration.
train:
    # NOTE(review): presumably the share of training spent on learning-rate
    # warmup -- confirm against the trainer.
    warmup_rate: 0.1
    epochs: 4
    samples_per_epoch: 150000
    # NOTE(review): presumably a maximum sequence length -- confirm the unit
    # (tokens?) with the consumer.
    max_length: 512
    # NOTE(review): presumably selects precomputed LanguageBind features
    # instead of computing them on the fly -- confirm.
    precomputed_languagebind: false
    # Data subsets to train on; one list entry per subset.
    dataset_list:
        - root: worldnet/ag  # root of subset
          # NOTE(review): the key is spelled `annotaion_path` (sic). It is kept
          # unchanged because the loader presumably looks up this exact key.
          annotaion_path: worldnet/ag/state/action_train.json  # path to annotations
          modality: ['image', 'video', 'audio']  # available modalities of subset
          weight: 0.2  # relative chance of this subset being chosen in training
    # Input -> target modality combinations, each with a sampling weight.
    # NOTE(review): the weights below sum to 2.0 (1.0 over the single-modality
    # pairs, 1.0 over the multi-modality entries) -- presumably normalized by
    # the sampler; confirm.
    modality_modes:
        # Same single modality on both sides.
        - inputs: ['image']
          targets: ['image']
          weight: 0.2
        - inputs: ['video']
          targets: ['video']
          weight: 0.2
        - inputs: ['audio']
          targets: ['audio']
          weight: 0.1
        # One modality in, a different modality out.
        - inputs: ['image']
          targets: ['video']
          weight: 0.2
        - inputs: ['image']
          targets: ['audio']
          weight: 0.025
        - inputs: ['video']
          targets: ['image']
          weight: 0.2
        - inputs: ['video']
          targets: ['audio']
          weight: 0.025
        - inputs: ['audio']
          targets: ['image']
          weight: 0.025
        - inputs: ['audio']
          targets: ['video']
          weight: 0.025
        # Several modalities in, the same set out.
        - inputs: ['image', 'video']
          targets: ['image', 'video']
          weight: 0.6
        - inputs: ['image', 'audio']
          targets: ['image', 'audio']
          weight: 0.1
        - inputs: ['video', 'audio']
          targets: ['video', 'audio']
          weight: 0.1
        - inputs: ['image', 'video', 'audio']
          targets: ['image', 'video', 'audio']
          weight: 0.2

# Validation configuration.
validation:
    # NOTE(review): presumably counted in training steps -- confirm the unit.
    validation_interval: 3000
    # NOTE(review): presumably maximum input and target sequence lengths --
    # confirm the unit (tokens?) with the consumer.
    max_length: 512
    max_tgt_length: 512
    # NOTE(review): presumably sampling parameters used when generating
    # outputs during validation -- confirm.
    top_p: 1.0
    temperature: 0.4
    # NOTE(review): presumably selects precomputed LanguageBind features
    # instead of computing them on the fly -- confirm.
    precomputed_languagebind: true
    # Data subsets to validate on; one list entry per subset.
    dataset_list:
        - root: worldnet/ag  # root of subset
          # NOTE(review): the key is spelled `annotaion_path` (sic). It is kept
          # unchanged because the loader presumably looks up this exact key.
          annotaion_path: worldnet/ag/state/action_valid.json  # path to annotations
          modality: ['image', 'video', 'audio']  # available modalities of subset
    # Input -> target modality combinations evaluated during validation
    # (no weights are given here).
    modality_modes:
        # Same single modality on both sides.
        - inputs: ['image']
          targets: ['image']
        - inputs: ['video']
          targets: ['video']
        - inputs: ['audio']
          targets: ['audio']
        # Two modalities in, the remaining visual modality out.
        - inputs: ['image', 'audio']
          targets: ['video']
        - inputs: ['video', 'audio']
          targets: ['image']
        # One visual modality in, the other two modalities out.
        - inputs: ['image']
          targets: ['video', 'audio']
        - inputs: ['video']
          targets: ['image', 'audio']
        # All three modalities on both sides.
        - inputs: ['image', 'video', 'audio']
          targets: ['image', 'video', 'audio']