import json

# Paths of the source annotation file and of the converted output file.
input_file = "charades_test.json"
output_file = "formatted_charades_test.json"


def _is_valid_duration(duration):
    """Return True if *duration* is a positive number usable as a divisor."""
    return isinstance(duration, (int, float)) and duration > 0


def convert_samples(data: dict) -> tuple:
    """Flatten per-video annotations into one sample per sentence/timestamp pair.

    Args:
        data: Mapping of video id to a dict that may hold the keys
            ``"duration"``, ``"sentences"`` and ``"timestamps"``.

    Returns:
        A ``(samples, skipped_videos)`` tuple. ``samples`` is the list of
        converted sample dicts; ``skipped_videos`` lists the ids of videos
        that were dropped because their duration is not a positive number.
    """
    samples = []
    skipped_videos = []

    for video_id, content in data.items():
        # A missing duration falls back to 1.0, which leaves the timestamps
        # numerically unchanged by the division below.
        duration = content.get("duration", 1.0)

        # Dividing by a zero / null duration would abort the whole run, so
        # such videos are skipped like every other malformed record.
        if not _is_valid_duration(duration):
            skipped_videos.append(video_id)
            continue

        video_name = f"{video_id}.mp4"
        sentences = content.get("sentences", [])
        timestamps = content.get("timestamps", [])

        # zip() stops at the shorter list, so sentences without a matching
        # timestamp are ignored.
        for sentence, ts in zip(sentences, timestamps):
            # Only [start, end] pairs are accepted; anything else is skipped.
            if not isinstance(ts, (list, tuple)) or len(ts) != 2:
                continue

            start, end = ts
            normalized_solution = [
                round(start / duration, 4),
                round(end / duration, 4),
            ]

            # Drop surrounding double quotes / spaces, then leftover whitespace.
            cleaned_sentence = sentence.strip('" ').strip()

            samples.append({
                "dataset": "charades",
                "text_type": "caption",
                "normal_caption": cleaned_sentence,
                "problem": f"Give the query: '{cleaned_sentence}', when does the described content occur in the video?",
                "video": video_name,
                "solution": normalized_solution,
                "video_length": duration,
            })

    return samples, skipped_videos


def main():
    """Read ``input_file``, convert its annotations and write ``output_file``."""
    # JSON text is UTF-8 by convention; do not depend on the platform locale.
    with open(input_file, 'r', encoding="utf-8") as f:
        data = json.load(f)

    output_data, skipped_videos = convert_samples(data)

    with open(output_file, 'w', encoding="utf-8") as f:
        json.dump(output_data, f, indent=2)

    if skipped_videos:
        print(f"Skipped {len(skipped_videos)} videos with an invalid duration: {skipped_videos}")
    print(f"✅ Converted {len(output_data)} samples and saved to {output_file}")


if __name__ == "__main__":
    main()