from pathlib import Path
import pandas as pd
import json
import tqdm
import yaml
import os
# Filesystem locations used by this script. Each one falls back to a path
# relative to the current working directory unless the named environment
# variable overrides it.

# Directory containing the MACS annotation file (MACS.yaml).
PREPARED_AUDIOCAPS_DIR = Path(os.getenv("MACS_PREPARED_BASE", "./data/macs/raw_data"))

# Output root: receives the "jsonl/<split>" manifests and the "audio" links.
TARGET_AUDIOCAPS_DIR = Path(os.getenv("MACS_TARGET_BASE", "./data/macs"))

# Directory the audio files named in the annotations are looked up in.
ORI_DATA_DIR = Path(os.getenv("MACS_ORI_DATA", "./data/TAU2019/audio"))

# Build the MACS manifests.
#
# For every split this writes two files under TARGET_AUDIOCAPS_DIR/jsonl/<split>:
#   audio.jsonl   - one {"audio_id", "audio"} record per audio file that is
#                   present in ORI_DATA_DIR
#   caption.jsonl - one {"audio_id", "caption"} record per annotation, written
#                   whether or not the audio file is present
# Each present audio file is also symlinked into TARGET_AUDIOCAPS_DIR/audio.
# Paths of missing audio files are printed to stdout.

# The annotation file does not depend on the split, so parse it once and close
# the handle before any output file is opened.
with open(PREPARED_AUDIOCAPS_DIR / "MACS.yaml", "r", encoding="utf-8") as yaml_file:
    annotation_yaml = yaml.safe_load(yaml_file)

# The link directory must exist before any symlink can be created in it.
audio_link_dir = TARGET_AUDIOCAPS_DIR / "audio"
audio_link_dir.mkdir(parents=True, exist_ok=True)

for split in ["dev"]:
    jsonl_dir = TARGET_AUDIOCAPS_DIR / "jsonl" / split
    jsonl_dir.mkdir(parents=True, exist_ok=True)

    with open(jsonl_dir / "audio.jsonl", "w", encoding="utf-8") as audio_writer, \
            open(jsonl_dir / "caption.jsonl", "w", encoding="utf-8") as caption_writer:
        for item in tqdm.tqdm(annotation_yaml["files"]):
            filename = item["filename"]
            tmp_ori_path = ORI_DATA_DIR / filename
            tmp_tgt_path = audio_link_dir / filename

            if tmp_ori_path.exists():
                try:
                    # A symlink target is resolved relative to the directory
                    # holding the link, not the current working directory, so
                    # a relative source path would yield a broken link. Store
                    # the absolute path instead. os.symlink also avoids shell
                    # quoting problems with unusual file names.
                    os.symlink(tmp_ori_path.resolve(), tmp_tgt_path)
                except FileExistsError:
                    # Link left over from a previous run; keep it so the
                    # script can be re-run safely.
                    pass

                audio_writer.write(
                    json.dumps({
                        "audio_id": filename,
                        "audio": f"macs/audio/{filename}"
                    }) + "\n"
                )
            else:
                # Report the missing source; no link and no audio record are
                # produced for it.
                print(tmp_ori_path)

            for caption_item in item["annotations"]:
                caption_writer.write(
                    json.dumps({
                        "audio_id": filename,
                        "caption": caption_item["sentence"]
                    }) + "\n"
                )
