{
      "output_dir": "proj_1",
      "device": "cuda",
      "resume_from_checkpoint": null,
      "resume_from_checkpoint_reset_steps": false,
      "save_state": true,
      "train_batch_size": 32,
      "eval_batch_size": 32,
      "learning_rate": 1e-4,
      "max_grad_norm": 1.0,
      "weight_decay": 5e-6,
      "adam_beta1": 0.9,
      "adam_beta2": 0.95,
      "adam_epsilon": 1e-8,
      "epochs_l": [5],
      "dataset_size_l": ["full"],
      "warmup_steps": 1000,
      "scheduler": "cosine_warmup",
      "logging_steps": 50,
      "save_steps": 5000,
      "eval_steps": 5000,
      "generate_steps": 200000,
      "eval_at_step_zero": false,
      "generate_at_step_zero": false,
      "seeds": [42],
      "gradient_accumulation_steps": 1,
      "pad_to_multiple_of": 8,
      "debug": false,
      "menc_names_or_paths": [
            "openai/clip-vit-large-patch14",
            "clap-htsat-fused",
            "alibaba-pai/VideoCLIP-XL"
      ],
      "mm_dim": 768,
      "load_extracted_features": [
            true,
            true,
            true
      ],
      "mm_dtype": "float32",
      "lm_name_or_path": "meta-llama/Llama-3.2-1B-Instruct",
      "lm_dtype": "bfloat16",
      "dataset_names_or_paths": [
            "coco",
            "audiocaps",
            "openvid"
      ],
      "proj_name_or_path": "proj_1",
      "proj_arch": "mlp",
      "proj_act": "quick_gelu",
      "proj_n_layers": 2,
      "proj_dropout": 0.1
}
