# URL of the checkpoint (.pth) file associated with this configuration.
checkpoint_path: 'https://huggingface.co/facebook/video_seal/checkpoint.pth'
# Flat argument set stored alongside the checkpoint: data sources, sub-model
# selection, watermark/blending settings, optimisation, loss weights,
# batching, evaluation cadence and runtime flags.
args:
  # --- Data sources and schedule offsets ---
  image_dataset: sa-1b-500k-resized
  video_dataset: sa-v
  prop_img_vid: 0.0
  video_start: 0
  # 251 lies inside the 0..epochs range (epochs is 301 below).
  finetune_detector_start: 251

  # --- Sub-model configuration files and the entries selected from them ---
  # embedder_model / extractor_model carry the same values as the `model`
  # fields of the top-level `embedder` and `extractor` sections of this file.
  embedder_config: configs/embedder.yaml
  extractor_config: configs/extractor.yaml
  attenuation_config: configs/attenuation.yaml
  embedder_model: unet_small2
  extractor_model: sam_small
  augmentation_config: configs/augs.yaml
  num_augs: 2

  # --- Message length and image sizes ---
  # NOTE(review): nbits is 96 here, but 16 under
  # embedder.params.msg_processor and extractor.params.pixel_decoder.
  # Confirm which value the consuming code actually uses.
  nbits: 96
  img_size: 256
  img_size_extractor: 256
  img_size_val: 256

  # --- Attenuation, blending and scaling ---
  # Quoted on purpose: this value is the literal string "None", not a YAML
  # null. Real nulls in this mapping are spelled `null` (scaling_w_schedule,
  # optimizer_d). Unquoted `None` parses to the same string, so the loaded
  # value is unchanged; the quotes only make the type explicit.
  attenuation: 'None'
  blending_method: additive
  scaling_w: 1.0
  scaling_w_schedule: null
  scaling_i: 1.0
  videoseal_chunk_size: 32
  videoseal_step_size: 4

  # --- Optimisation ---
  # optimizer and scheduler are comma-separated spec strings
  # (name followed by key=value pairs); they load as plain strings.
  optimizer: AdamW,lr=1e-6
  optimizer_d: null
  # t_initial=301 equals `epochs` below.
  scheduler: CosineLRScheduler,lr_min=1e-7,t_initial=301,warmup_lr_init=1e-8,warmup_t=10
  epochs: 301
  iter_per_epoch: 50
  sleepwake: false
  iter_per_valid: 20
  temperature: 1.0

  # --- Loss weights and discriminator settings ---
  lambda_det: 0.0
  lambda_dec: 1.0
  lambda_i: 0.5
  lambda_d: 0.1
  balanced: true
  total_gnorm: 1.0
  perceptual_loss: mse
  disc_start: 0
  disc_num_layers: 2
  disc_hinge_on_logits_fake: false

  # --- Batching and video clip sampling ---
  batch_size: 16
  batch_size_eval: 32
  batch_size_video: 1
  batch_size_video_eval: 1
  workers: 0
  frames_per_clip: 32
  frame_step: 1
  num_clips: 2

  # --- Evaluation, logging and checkpoint cadence ---
  only_eval: false
  eval_freq: 5
  full_eval_freq: 5
  saveimg_freq: 5
  saveckpt_freq: 50

  # --- Runtime ---
  seed: 444
  debug_slurm: false
  local_rank: 0
  modality: hyb
# Embedder section: `model` names the architecture entry (same value as
# args.embedder_model); `params` holds its settings, split into
# msg_processor and unet.
embedder:
  model: unet_small2
  params:
    msg_processor:
      # NOTE(review): nbits is 16 here while args.nbits is 96.
      # Confirm which value the consuming code actually uses.
      nbits: 16
      hidden_size: 32
      msg_processor_type: binary+concat
    unet:
      in_channels: 3
      out_channels: 3
      z_channels: 16
      num_blocks: 8
      activation: silu
      normalization: rms
      z_channels_mults: [1, 2, 4, 8]
      last_tanh: true
# Extractor section: `model` names the architecture entry (same value as
# args.extractor_model); `params` holds its settings, split into
# encoder and pixel_decoder.
extractor:
  model: sam_small
  params:
    encoder:
      # Same value as args.img_size / args.img_size_extractor (256).
      img_size: 256
      embed_dim: 384
      out_chans: 384
      depth: 12
      num_heads: 6
      patch_size: 16
      # All four indexes fall inside 0..depth-1 (depth is 12).
      global_attn_indexes: [2, 5, 8, 11]
      window_size: 8
      mlp_ratio: 4
      qkv_bias: true
      use_rel_pos: true
    pixel_decoder:
      pixelwise: false
      upscale_stages: [1]
      # Same value as encoder.out_chans (384).
      embed_dim: 384
      # NOTE(review): nbits is 16 here while args.nbits is 96.
      # Confirm which value the consuming code actually uses.
      nbits: 16
      sigmoid_output: false
      upscale_type: bilinear