import os
import warnings
import subprocess
import cv2
import torchaudio

def merge_video_audio(
    video_path: str,
    audio_path: str,
    output_path: str,
    fps: int = 25,
    sr: int = 16000,
    tol_sec: float = 0.1
) -> str:
    """
    Merge a silent video and an audio file into a single video with sound.

    The video stream is copied as-is and the audio is re-encoded to AAC by
    invoking the ``ffmpeg`` executable. Before merging, the durations of both
    inputs are compared; if they differ by more than ``tol_sec`` a
    ``UserWarning`` is emitted (the merge still proceeds).

    Args:
        video_path (str): path to input video (no audio).
        audio_path (str): path to input audio (wav, mp3, etc.).
        output_path (str): path for output video (mp4). Its parent directory
            is created if it does not exist; a bare filename writes to the
            current working directory.
        fps (int): fallback frames per second, used for the duration check
            only when the video container reports a frame rate of 0.
        sr (int): expected sampling rate of the audio. Currently unused: the
            sampling rate reported by ``torchaudio.load`` is always used.
        tol_sec (float): tolerance (in seconds) for duration mismatch.

    Returns:
        str: path to the merged video (``output_path``).

    Raises:
        RuntimeError: if the video cannot be opened.
        subprocess.CalledProcessError: if ffmpeg exits with a non-zero status.
    """
    # 1. Video duration, derived from frame count and frame rate.
    cap = cv2.VideoCapture(video_path)
    try:
        if not cap.isOpened():
            raise RuntimeError(f"Failed to open video: {video_path}")
        frame_count = cap.get(cv2.CAP_PROP_FRAME_COUNT)
        # Fall back to the caller-supplied fps when the container reports 0.
        video_fps = cap.get(cv2.CAP_PROP_FPS) or fps
    finally:
        # Always free the capture handle, including on the error path.
        cap.release()
    video_dur = frame_count / video_fps

    # 2. Audio duration, using the sampling rate actually read from the file.
    audio_waveform, audio_sr = torchaudio.load(audio_path)
    audio_dur = audio_waveform.shape[1] / audio_sr

    # 3. Warn (but do not fail) when the durations differ too much.
    if abs(video_dur - audio_dur) > tol_sec:
        warnings.warn(
            f"Video duration ({video_dur:.3f}s) and audio duration "
            f"({audio_dur:.3f}s) differ by more than {tol_sec}s.",
            UserWarning
        )

    # 4. Merge and write the output.
    # os.path.dirname() returns "" for a bare filename and os.makedirs("")
    # raises FileNotFoundError, so only create the directory when one is given.
    out_dir = os.path.dirname(output_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    cmd = [
        "ffmpeg",
        "-y",                 # overwrite the output file without prompting
        "-i", video_path,
        "-i", audio_path,
        "-c:v", "copy",       # copy the video stream without re-encoding
        "-c:a", "aac",        # encode the audio as AAC
        "-map", "0:v:0",      # first video stream of the first input
        "-map", "1:a:0",      # first audio stream of the second input
        output_path
    ]
    subprocess.run(cmd, check=True)
    return output_path

