import argparse
import os
from clip_model import run_clip_main
from cap_model import run_cap_main
from causal_model import run_causal_main


if __name__ == "__main__":
    # Command-line entry point: parse the options, derive the output
    # directory, then hand the same namespace to the three imported stage
    # functions in a fixed order (clip -> cap -> causal).
    fg_clip_choices = [
        "qihoo360/fg-clip-base",
        "qihoo360/fg-clip-large",
    ]
    llm_choices = [
        "Qwen/Qwen3-1.7B",
        "Qwen/Qwen3-8B",
    ]

    cli = argparse.ArgumentParser(description="LongVideo Understanding")

    # Dataset location; both options are mandatory.
    cli.add_argument(
        "--dataset_name",
        type=str,
        required=True,
        help="Name of the dataset",
    )
    cli.add_argument(
        "--dataset_dir",
        type=str,
        required=True,
        help="Directory containing videos or image folders",
    )

    # Per-video frame budgets.
    cli.add_argument(
        "--keyframe_num",
        type=int,
        default=64,
        help="Number of keyframes to extract from each video",
    )
    cli.add_argument(
        "--causalframe_num",
        type=int,
        default=64,
        help="Number of causalframes to extract from each video",
    )

    # Model identifiers; the CLIP and LLM options only accept the listed
    # choices, while the VLLM option takes any string.
    cli.add_argument(
        "--clip_root",
        type=str,
        default="qihoo360/fg-clip-large",
        choices=fg_clip_choices,
        help="FG-CLIP model used for feature extraction.",
    )
    cli.add_argument(
        "--vllm_root",
        type=str,
        default="Qwen/Qwen2.5-VL-3B-Instruct",
        help="VLLM model used for video scene caption.",
    )
    cli.add_argument(
        "--llm_root",
        type=str,
        default="Qwen/Qwen3-8B",
        choices=llm_choices,
        help="LLM model used for causal inference.",
    )

    # Runtime switches.
    cli.add_argument(
        "--gpus",
        nargs="+",
        type=int,
        default=[0, 1, 2, 3],
        help="List of GPUs to use",
    )
    cli.add_argument(
        "--visualize",
        action="store_true",
        help="Whether to visualize the selected frames",
    )

    args = cli.parse_args()

    # The output directory sits next to this script and is named
    # "<dataset_name>_fn_<keyframe_num>_<causalframe_num>". It is only
    # computed here, not created.
    script_dir = os.path.dirname(os.path.abspath(__file__))
    run_tag = f"{args.dataset_name}_fn_{args.keyframe_num}_{args.causalframe_num}"
    args.output_dir = os.path.join(script_dir, run_tag)

    # Each stage receives the full namespace, including output_dir.
    for stage in (run_clip_main, run_cap_main, run_causal_main):
        stage(args)
