# Entry-point object for this run.
# NOTE(review): `path` and `name` look like a Python module path and a class
# name that the project's config loader resolves — confirm against the loader.
__object__:
  path: projects.hfm.humanexpert_s3_mot_dforcing
  name: humanexpertTrainer

# Top-level runtime settings. Commented-out lines are alternative values kept
# next to the key they would replace.
running_dir: /runs

strategy: deepspeed_stage_2
# strategy: auto

devices:
  - 0
# devices: 8
# num_nodes: 5

precision: bf16-mixed

# debug: true

# Evaluation settings.
eval:
  # NOTE(review): the leading `//` is unusual for a filesystem path; it may be
  # a stripped root prefix — confirm it resolves to the intended script.
  render_path: //demo_scripts/pose2vid_motenc_folder_v2.sh
  num_samples: 50
  eval_first_chunk: false

# Run identity and training-loop settings.
training:
  project: human_foundation_model_L1
  # Quoted so the leading digits can never be mistaken for a number.
  name: '0520_humanexpert_MOT_pose_audio_noMeanStd_2B_dforcing_800_timePE_LTC_TrainAllLTC'
  seed: 1234

  # Also referenced by lr_schedule.num_training_steps in this file.
  train_epochs: 999999
  eval_steps: 100000
  save_steps: 1000

  train_batch_size: 2
  eval_batch_size: 1
  gradient_clip: 1.0

  torch_compile: false
  eval_visualize: true

# Diffusion-forcing settings: architecture, context/noise scheduling,
# diffusion process, backbone and sampling.
dforcing:
  # for architecture
  x_shape: [262]
  cond_dropout_prob: 0.2
  # YAML has no `None` literal: a bare `None` is loaded as the string "None",
  # which is truthy and not equal to Python's None. `null` is the real null.
  external_cond_dim: null
  noise_level: random_independent
  scheduling_matrix: autoregressive
  # scheduling_matrix: full_sequence

  uniform_future:
    enabled: false
  fixed_context:
    enabled: false
    indices: null  # defaults to context_frames
    dropout: 0
  variable_context:
    enabled: false
    prob: 0
    dropout: 0

  diffusion:
    is_continuous: false
    timesteps: 1000
    beta_schedule: cosine
    schedule_fn_kwargs:
      shift: 1.0
    clip_noise: 20.0
    # training
    objective: pred_v  # pred_x0 pred_v
    loss_weighting:
      strategy: fused_min_snr
      snr_clip: 5.0
      cum_snr_decay: 0.9
    # sampling
    sampling_timesteps: 50
    ddim_sampling_eta: 0.0
    # (For full sequence diffusion)
    reconstruction_guidance: 0.0
    use_causal_mask: true

  backbone:
    # sample_n_frames: 62
    # tokens_per_frame: 1
    max_tokens_training: 800
    use_fourier_noise_embedding: false

  sampling:
    chunk_size: 52

    from_fsdp: true
    compile: false
    sample_dir: 'samplings'

    # video generation tasks
    tasks:
      prediction:
        enabled: true
        history_guidance:
          name: vanilla
          guidance_scale: 1.5

        # Alternative history_guidance settings, kept for quick switching:
        # history_guidance:
        #   name: fractional
        #   guidance_scale: 4.0
        #   freq_scale: 0.5
        # history_guidance:
        #   name: stabilized_vanilla
        #   guidance_scale: 3.0
        #   stabilization_level: 0.02
        keyframe_density: 1.0
        sliding_context_len: null
      interpolation:
        enabled: false
        # Every child key is commented out, so this key's value is null;
        # written explicitly so the empty value is not mistaken for a mapping.
        history_guidance: null
        #   name: vanilla
        #   guidance_scale: 2.0
        max_batch_size: null

# Optimizer settings.
optimizer:
  name: AdamW
  # Written with a decimal point on purpose: YAML 1.1 loaders such as PyYAML
  # only recognise exponent notation as a float when the mantissa contains a
  # ".", so `1e-04` would be loaded as the string "1e-04" instead of 0.0001.
  lr: 1.0e-4
  weight_decay: 0.01
  betas:
    - 0.9
    - 0.95

# Learning-rate schedule settings.
lr_schedule:
  name: constant_with_warmup
  num_warmup_steps: 100
  # Points at training.train_epochs in this file.
  # NOTE(review): presumably substituted by the config loader; a *steps* field
  # is being fed an *epochs* value here — confirm this is intended.
  num_training_steps: $training.train_epochs
  scheduler_specific_kwargs:
    # NOTE(review): `num_cycles` looks like a cosine-schedule argument —
    # confirm it is accepted (or ignored) by `constant_with_warmup`.
    num_cycles: 0.5
    last_epoch: -1

# Dataset and dataloader settings.
# NOTE(review): `__inherit__` presumably pulls base values from the named file,
# with the keys below overriding them — confirm against the config loader.
data:
  __inherit__: projects/hfm/configs/data/pose_ltc.yaml

  num_workers: 2
  prefetch_factor: 8

  mean_path: null
  std_path: null

  window_size: 2048  # 20s
  dataset_weights:
    - 0
    - 1
  # eval_split: train

# Model settings: modality rates, checkpoint to load, audio tokenizer,
# language model, and the list of task prompts.
model:
  eval_training: true
  streaming_size: 30
  max_length: 1024
  interleave_window: 26
  audio_base_layer: true
  instruction_text: true
  response_text: true
  freeze_lm: true
  audio_bos_eos: false  # Wrap audio with bos and eos token in response

  text_fps: 6.25
  audio_fps: 12.5
  motion_fps: 25

  # Quoted because the timestamp directory contains colons.
  model_path: '/runs/0515_humanexpert_MOT_pose_audio_noMeanStd_2B_dforcing_800_timePE_LTC_TrainAllLTC/output_2025-05-16_06:09:32/step-34000_state_dict.pt'
  # Other checkpoints, kept for quick switching:
  # model_path: /runs/0515_humanexpert_MOT_pose_audio_noMeanStd_2B_dforcing_800_timePE_LTC_TrainAllLTC/output_2025-05-16_06:09:32/step-17000_state_dict.pt
  # model_path: /runs/0513_humanexpert_MOT_pose_audio_noMeanStd_2B_dforcing_800_timePE_LTC_TrainAllLTC/output_2025-05-13_21:13:01/step-80000_state_dict.pt

  audio_tokenizer:
    __object__:
      path: models.speech_tokenizer.modeling_whisper
      name: WhisperVQEncoder
      args: as_params

    cosyvoice_path: /GLM-4-Voice/
    matcha_path: /GLM-4-Voice/third_party/Matcha-TTS
    model_path: /mnt/deps/glm-4-voice-tokenizer
    flow_path: /mnt/deps/glm-4-voice-decoder

  language_model:
    model_path: /mnt/deps/glm-4-voice-9b
    config:
      mot_hidden_size:
        - 2048
      output_conv: true

  # Task prompts (Chinese and English). These strings are data read at
  # runtime, so they are kept verbatim.
  task:
    - 做一个笑脸的表情
    - 做一个丧气的表情
    - 笑着说今天天气真不错
    - 伤心地说今天的天气真糟糕
    - How's the day today?
    - What does a rainbow look like?
    - 你会如何向一个孩子解释互联网？
    - 你知道维生素 C 有什么功效吗？
    - 你能介绍一下上海吗？
    - 早上好
    - 我今天很忙
    - 请列出五种水果
    - 地球的自转周期是多少？
    - 春节是什么时候？
    - 秦始皇是谁？
    - 中国的首都在哪里？
    - 拜年时要注意什么？
    - 用户要求做某事，模型如何礼貌拒绝？
    - 如果人类能飞会怎样？
    - 水是由什么组成的？
    - 帮我规划一天的行程。

# Metric settings.
metric:
  names:
    - loss
  # Empty mapping. Example of an entry, kept for reference:
  #   eval/mse:
  #     abbr: mse
  #     mode: min
  save_monitor: {}

# Logging settings.
logging:
  logging_steps: 5
  # NOTE(review): training.save_steps in this file is 1000 while this is 50 —
  # confirm which of the two governs checkpointing.
  save_steps: 50
  report_to:
    - tensorboard
    - wandb
  # report_to: []
