# Experiment identity: run name, per-prompt tag, output root directory and RNG seed.
name: "fourdfy_stage_3_low_vram_vc"
# The tag is built from the text prompt through the `rmspace` resolver, with "_" as
# its second argument (presumably spaces are replaced by "_" — confirm against the
# resolver's definition).
tag: "${rmspace:${system.prompt_processor.prompt},_}"
exp_root_dir: "outputs"
seed: 0

# Data module combining a single-view and a multi-view camera configuration.
data_type: "single-multiview-combined-camera-datamodule"
data:
  # Also forwarded to system.prob_multi_view.
  prob_multi_view: 0.75
  single_view:
    batch_size: [1,1]
    # Two-entry lists (batch_size, width, height) presumably hold one value per
    # resolution stage, switching at resolution_milestones (step 5000) — confirm
    # against the data module.
    # NOTE: both stages are 256x256 here, so the resolution does not change at the
    # milestone; no low-resolution warm-up stage is configured.
    width: [256, 256]
    height: [256, 256]
    resolution_milestones: [5000]
    camera_distance_range: [2.5, 3.0]
    fovy_range: [15, 60]
    elevation_range: [0, 30]
    camera_perturb: 0.
    center_perturb: 0.
    up_perturb: 0.
    eval_camera_distance: 3.0
    eval_fovy_deg: 40.
    # static, num_frames and simultan are reused elsewhere in this file via
    # interpolation (geometry.pos_encoding_config, data.multi_view, system.simultan).
    static: false
    num_frames: 16
    simultan: true
    # Also forwarded to system.prob_single_view_video.
    prob_single_view_video: 1.0
    # Video render size; forwarded to system.guidance_video.width_vid/height_vid.
    width_vid: 128
    height_vid: 80
    # Smaller alternative video size (disabled):
    # width_vid: 64
    # height_vid: 40
    sample_rand_frames: t0
    num_frames_factor: 4
    eval_height: 256
    eval_width: 256
    test_traj: 'motion_smooth'

  multi_view:
    batch_size: [4,4] # must be divisible by n_view
    n_view: 4
    # Same resolution in both stages, as in single_view above.
    width: [256, 256]
    height: [256, 256]
    resolution_milestones: [5000]
    camera_distance_range: [2.5, 3.0]
    fovy_range: [15, 60]
    elevation_range: [0, 30]
    camera_perturb: 0.
    center_perturb: 0.
    up_perturb: 0.
    n_val_views: 4
    eval_camera_distance: 3.0
    eval_fovy_deg: 40.
    relative_radius: false
    # The remaining values are kept in sync with single_view through interpolation.
    num_frames: ${data.single_view.num_frames}
    sample_rand_frames: ${data.single_view.sample_rand_frames}
    eval_height: ${data.single_view.eval_height}
    eval_width: ${data.single_view.eval_width}

# System (training module) selection and its top-level switches.
system_type: "fourdfy-system"
system:
  freeze_static_modules: true
  # Sampling probabilities are taken from the data section so both stay in sync.
  prob_multi_view: ${data.prob_multi_view}
  prob_single_view_video: ${data.single_view.prob_single_view_video}
  stage: coarse
  # Geometry representation, density initialisation and positional encoding.
  geometry_type: "implicit-volume"
  geometry:
    # Also used as the renderer radius (system.renderer.base_renderer.radius).
    radius: 1.0
    normal_type: "finite_difference"

    density_bias: "blob_magic3d"
    density_activation: softplus
    density_blob_scale: 10.
    # Also the start value of anneal_density_blob_std_config.
    density_blob_std: 0.5

    pos_encoding_config:
      otype: HashGridSpatialTime
      n_levels: 16
      n_features_per_level: 2
      log2_hashmap_size: 19
      base_resolution: 16
      # 16 * 1.447269237440378^(16 - 1) = 4096
      per_level_scale: 1.447269237440378 # max resolution 4096
      # static and num_frames mirror the single-view data settings.
      static: ${data.single_view.static}
      num_frames: ${data.single_view.num_frames}
    
    # Schedule for density_blob_std between min_anneal_step and max_anneal_step.
    # NOTE(review): start_val resolves to density_blob_std (0.5) and end_val is also
    # 0.5, so the value stays constant as configured — confirm this is intended.
    anneal_density_blob_std_config:
      min_anneal_step: 0
      max_anneal_step: 50000
      start_val: ${system.geometry.density_blob_std}
      end_val: 0.5

  # Appearance: material with a 3-channel output, and a learned background.
  material_type: "no-material"
  material:
    n_output_dims: 3
    color_activation: sigmoid

  # The background has its own learning rate under system.optimizer.params.background.
  background_type: "neural-environment-map-background"
  background:
    color_activation: sigmoid
    random_aug: false

  # Volume renderer: an outer renderer wrapping a configurable base renderer.
  renderer_type: "stable-nerf-volume-renderer"
  renderer:
    base_renderer_type: "mask-nerf-volume-renderer"
    base_renderer:
      # Kept equal to the geometry radius.
      radius: ${system.geometry.radius}
      num_samples_per_ray: 512
      train_max_nums: 100000
      # The static budget reuses train_max_nums; override here to set it separately.
      train_max_nums_static: ${system.renderer.base_renderer.train_max_nums}

  # Mirrors data.single_view.simultan.
  simultan: ${data.single_view.simultan}
  # Text prompt processor for the single-view image guidance.
  # `???` marks a mandatory value that has no default and must be supplied when
  # launching (e.g. as a command-line override).
  prompt_processor_type: "stable-diffusion-prompt-processor"
  prompt_processor:
    pretrained_model_name_or_path: "stabilityai/stable-diffusion-2-1-base"
    # This prompt also determines the experiment tag (top-level `tag`).
    prompt: ???
    front_threshold: 30.
    back_threshold: 30.

  # Text prompt processor for the multi-view guidance; its prompt is a separate
  # mandatory value and additionally carries a negative prompt.
  prompt_processor_type_multi_view: "stable-diffusion-prompt-processor"
  prompt_processor_multi_view:
    pretrained_model_name_or_path: "stabilityai/stable-diffusion-2-1-base"
    prompt: ???
    negative_prompt: "ugly, bad anatomy, blurry, pixelated obscure, unnatural colors, poor lighting, dull, and unclear, cropped, lowres, low quality, artifacts, duplicate, morbid, mutilated, poorly drawn face, deformed, dehydrated, bad proportions"
    front_threshold: 30.
    back_threshold: 30.

  # Single-view image guidance (VSD). It has a learning rate under
  # system.optimizer.params.guidance.
  guidance_type: "stable-diffusion-vsd-guidance"
  guidance:
    # The base and LoRA model paths intentionally name different checkpoints
    # ("-2-1-base" vs "-2-1") — presumably; confirm before changing either.
    pretrained_model_name_or_path: "stabilityai/stable-diffusion-2-1-base"
    pretrained_model_name_or_path_lora: "stabilityai/stable-diffusion-2-1"
    guidance_scale: 7.5
    min_step_percent: 0.02
    # NOTE(review): max_step_percent and max_step_percent_annealed are both 0.5, so
    # the switch at anneal_start_step leaves the value unchanged as configured.
    max_step_percent: 0.5
    max_step_percent_annealed: 0.5
    anneal_start_step: 5000
  
  # Multi-view diffusion guidance (4-view model).
  guidance_type_multi_view: "multiview-diffusion-guidance"
  guidance_multi_view:
    model_name: "sd-v2.1-base-4view"
    ckpt_path: null # path to a pre-downloaded checkpoint file (null for loading from URL)
    guidance_scale: 50.0
    # Both schedules have equal start and end values, so they are constant over
    # iterations 0..8000 as configured.
    min_step_percent: [0, 0.02, 0.02, 8000]  # (start_iter, start_val, end_val, end_iter)
    max_step_percent: [0, 0.5, 0.5, 8000]
    recon_loss: true
    recon_std_rescale: 0.5
  
  # Text prompt processor for the video guidance (VideoCrafter).
  prompt_processor_type_video: "videocrafter-prompt-processor"
  prompt_processor_video:
    config: threestudio/models/guidance/videocrafter/configs/inference_t2v_512_v2.0.yaml
    # Checkpoint https://huggingface.co/VideoCrafter/VideoCrafter2/blob/main/model.ckpt
    pretrained_model_name_or_path: "VideoCrafter/VideoCrafter2"
    negative_prompt: "low motion, static statue, not moving, no motion"
    # Mandatory value (`???`): must be supplied when launching.
    prompt: ???

  # Video guidance; config and checkpoint are shared with the video prompt processor.
  guidance_type_video: "videocrafter-guidance"
  guidance_video:
    config: ${system.prompt_processor_video.config}
    pretrained_model_name_or_path: ${system.prompt_processor_video.pretrained_model_name_or_path}
    guidance_scale: 100.
    weighting_strategy: sds
    use_hifa: false
    # Video size follows the single-view data settings.
    width_vid: ${data.single_view.width_vid}
    height_vid: ${data.single_view.height_vid}
    motion_amp_scale: 30.
    half_precision_weights: false
    fps: 8
    # min_step_percent is constant at 0.02; max_step_percent goes from 0.98 to 0.5
    # between iterations 0 and 5000.
    min_step_percent: [0, 0.02, 0.02, 5000]  # (start_iter, start_val, end_val, end_iter)
    max_step_percent: [0, 0.98, 0.5, 5000]
    # Optional: set to a number between 1 and 16. This saves memory by backpropagating
    # through only low_ram_vae frames instead of all 16.
    # low_ram_vae: 2

  # Experiment loggers; Weights & Biases logging is turned off here.
  loggers:
    wandb:
      enable: false
      project: "threestudio"

  # Loss weights. A weight of 0. leaves the corresponding term without
  # contribution (assuming the weights multiply their loss terms — confirm).
  loss:
    lambda_sds: 1.
    lambda_sds_video: 0.1
    lambda_vsd: 1.
    lambda_lora: 1.
    lambda_orient: 0.
    lambda_sparsity: 10.
    # Presumably the same (start_iter, start_val, end_val, end_iter) form as the
    # guidance step schedules: 0.0 -> 1000.0 between iterations 10000 and 10001 —
    # confirm against the system implementation.
    lambda_opaque: [10000, 0.0, 1000.0, 10001]
    lambda_z_variance: 0.
    lambda_tv: 0.
    lambda_deformation: 0.
  # Optimizer with one learning rate per parameter group; the keys under `params`
  # are parameter-group names.
  optimizer:
    name: AdamW
    args:
      betas: [0.9, 0.99]
      eps: 1.e-15
    params:
      # The time encoding uses a 100x larger learning rate than the other encoding group.
      geometry.encoding.encoding.encoding:
        lr: 0.0001
      geometry.encoding.encoding.encoding_time:
        lr: 0.01
      geometry.density_network:
        lr: 0.001
      geometry.feature_network:
        lr: 0.001
      background:
        lr: 0.001
      guidance:
        lr: 0.0001

# Trainer settings: run length, logging/validation cadence and numeric precision.
trainer:
  # Also used as the checkpoint interval (checkpoint.every_n_train_steps).
  max_steps: 200000
  log_every_n_steps: 1
  num_sanity_val_steps: 0
  val_check_interval: 1000
  enable_progress_bar: true
  precision: 16-mixed

# Checkpointing. The interval equals trainer.max_steps (200000), so the step-based
# trigger coincides with the final training step.
checkpoint:
  save_last: true
  save_top_k: 0 # alternative value: -1
  every_n_train_steps: ${trainer.max_steps}
