import pandas as pd
import argparse
import os
from tqdm import tqdm

def parse_args(argv=None):
    """Parse the command-line options of this script.

    Args:
        argv: Optional list of argument strings. When ``None`` (the
            default) argparse reads ``sys.argv[1:]``, which is what the
            original zero-argument call did.

    Returns:
        An ``argparse.Namespace`` with the string attributes ``data_path``
        and ``save_path``, both defaulting to ``'./'``.
    """
    # Pass the text as ``description`` rather than positionally: the first
    # positional parameter of ArgumentParser is ``prog``, and an empty
    # string there blanks the program name in usage/error output.
    parser = argparse.ArgumentParser(
        description="Build tab-separated (filepath, title) CSV files "
                    "from image/caption folders.")

    parser.add_argument('--data_path', type=str, default='./',
                        help="root directory containing the 'train' and "
                             "'val' subdirectories")
    parser.add_argument('--save_path', type=str, default='./',
                        help='directory where the output CSV files are '
                             'written')
    return parser.parse_args(argv)

# Parsed once at import time (so importing this module also consumes
# sys.argv); the module-level value is read by the __main__ guard below.
args = parse_args()

def read_txt_file(path):
    """Return the entire contents of the text file at *path* as one string.

    The file is decoded as UTF-8 explicitly so the result does not depend
    on the platform's default locale encoding. The content is returned
    unmodified (no stripping of whitespace or trailing newlines).

    Args:
        path: Path of the text file to read.

    Returns:
        The file contents as a ``str``.

    Raises:
        OSError: If the file cannot be opened (e.g. it does not exist).
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    with open(path, "r", encoding="utf-8") as f:
        return f.read()

def build_csv_files(path, save_path):
    """Write one tab-separated (filepath, title) table per data split.

    For each of the ``train`` and ``val`` subdirectories of *path*, the
    directory tree is walked recursively. Every file whose name ends in
    ``.jpg`` and does not start with ``._`` (presumably macOS AppleDouble
    sidecar files -- confirm) is paired with the ``.txt`` file of the same
    stem in the same directory; the text file's content becomes the row's
    ``title``. The rows are written with pandas to a fixed file name in
    *save_path*, without an index column and with a tab separator.

    Directories and files are visited in sorted order so that the row order
    is reproducible across filesystems.

    Args:
        path: Root directory containing the ``train`` and ``val``
            subdirectories. A missing subdirectory yields an empty table.
        save_path: Directory receiving the two CSV files; it is created
            when it does not exist yet.

    Raises:
        OSError: If the caption ``.txt`` file of an image cannot be opened,
            or if the output directory or file cannot be created.
    """
    output_names = {
        "train": "Train_GCC-training_output.csv",
        "val": "Validation_GCC-1.1.0-Validation_output.csv",
    }

    # Create the destination up front so a bad save_path fails before the
    # (potentially long) directory walk instead of after it. An empty
    # save_path means "current directory", which os.makedirs rejects.
    if save_path:
        os.makedirs(save_path, exist_ok=True)

    for data_type, output_name in output_names.items():
        data_path = os.path.join(path, data_type)
        save_file_path = os.path.join(save_path, output_name)

        csv_list = []
        for root, dirs, files in tqdm(os.walk(data_path)):
            # Sorting dirs in place steers os.walk's top-down traversal.
            dirs.sort()
            for file in sorted(files):
                if not file.endswith(".jpg") or file.startswith("._"):
                    continue
                filepath = os.path.join(root, file)
                # Caption lives next to the image: same stem, .txt suffix.
                txt_file_path = os.path.join(root, file[:-4] + ".txt")
                title = read_txt_file(txt_file_path)
                csv_list.append({"filepath": filepath, "title": title})
        print(len(csv_list))

        # Explicit columns keep the header even when no rows were found.
        df = pd.DataFrame(csv_list, columns=["filepath", "title"])
        print(df)
        df.to_csv(save_file_path, index=False, sep="\t")
        print("Saved to", save_file_path)
    print("Done!")

if __name__ == "__main__":
    # Script entry point: build the CSV tables from the directories given
    # on the command line (read from the module-level ``args``).
    build_csv_files(path=args.data_path, save_path=args.save_path)