## This program creates a json file from a csv file containing the metadata 
## of the queries. 

import pandas as pd
import json 
import argparse
import os 
from pathlib import Path

def parse_arguments():
    """
    Build the command-line parser and parse the process arguments.

    The input CSV path, the output folder and the output file name are
    mandatory. Each column-name option is optional and has a default.

    Returns:
        (argparse.Namespace): the parsed command-line options
    """
    cli = argparse.ArgumentParser(description="Convert CSV to JSON")

    # Mandatory options: (short flag, long flag, help text).
    mandatory_options = (
        ("-i", "--input_csv", "Path to the input CSV file"),
        ("-of", "--output_folder", "Path to the output folder"),
        ("-on", "--output_name", "Name of the output JSON file"),
    )
    for short_flag, long_flag, help_text in mandatory_options:
        cli.add_argument(short_flag, long_flag, type=str, required=True,
                         help=help_text)

    # Column-name options: (short flag, long flag, default, help text).
    column_options = (
        ("-nc", "--name_column", "paper_name",
         "Name of index column in the CSV file"),
        ("-qc", "--query_column", "natural_language_query",
         "Name of the column containing the queries"),
        ("-dc", "--description_column", "data_description",
         "Name of the column containing the descriptions"),
        ("-fc", "--filename_column", "data_files",
         "Name of the column containing the filenames"),
        ("-mc", "--method_column", "method",
         "Name of the column describing method that is used"),
        ("-ec", "--effect_column", "answer",
         "Name of the column containing the causal effect reference values"),
    )
    for short_flag, long_flag, fallback, help_text in column_options:
        cli.add_argument(short_flag, long_flag, type=str, default=fallback,
                         help=help_text)

    return cli.parse_args()


def csv_to_json(df, name_column, query_column, description_column, 
                filename_column, method_column, effect_column, file_path=None, 
                count=25):
    """
    Convert the rows of a DataFrame into a list of JSON-serialisable records.

    Rows are visited in order. For each method only the first `count` rows
    are considered; later rows with the same method are skipped. A row whose
    data file does not exist on disk is skipped with a printed warning, but
    it still counts toward the per-method limit.

    Args:
        df (pd.DataFrame): The input DataFrame 
        name_column (str): The name of the column containing the names associated to the queries
        query_column (str): The name of the column containing the queries
        description_column (str): The name of the column containing the descriptions
        filename_column (str): The name of the column containing the filenames
        method_column (str): The name of the column containing the method names
        effect_column (str): The name of the column containing the causal effect reference values
        file_path (str, optional): Folder prepended to every filename as
            "<file_path>/<filename>". Defaults to None, in which case the
            filename is used as-is.
        count (int, optional): Maximum number of rows considered per method.
            Defaults to 25.
    Returns:
        (list[dict]): one record per kept row with the keys "name", "query",
            "dataset_description", "method", "effect" and "dataset_path"
    Raises:
        ValueError: if an effect value cannot be converted to float
    """
    records = []
    rows_seen_per_method = {}
    for _, row in df.iterrows():

        query = row[query_column]
        description = row[description_column]
        filename = row[filename_column] if file_path is None else f"{file_path}/{row[filename_column]}"
        name = row[name_column]
        method = row[method_column]

        # Enforce the per-method cap before any further processing.
        rows_seen_per_method[method] = rows_seen_per_method.get(method, 0) + 1
        if rows_seen_per_method[method] > count:
            continue

        # float() rejects the Unicode minus sign (U+2212); map it to the
        # ASCII hyphen-minus so such values parse instead of raising.
        raw_effect = row[effect_column]
        if isinstance(raw_effect, str):
            raw_effect = raw_effect.replace("\u2212", "-")
        effect = float(raw_effect)

        if not os.path.exists(filename):
            print(f"Warning: File {filename} does not exist.")
            continue
        records.append({"name": name, "query": query, "dataset_description": description, 
                        "method": method, "effect": effect, "dataset_path": filename})

    return records


if __name__ == "__main__":
    # Script entry point: read the CSV, convert it to JSON records and write
    # them to <output_folder>/<output_name>.

    args = parse_arguments()
    try:
        df = pd.read_csv(args.input_csv)
    except FileNotFoundError as err:
        raise FileNotFoundError(f"Input CSV file {args.input_csv} not found.") from err

    # float() does not accept the Unicode minus sign (U+2212), so replace it
    # with the ASCII hyphen-minus before the values are converted.
    df[args.effect_column] = df[args.effect_column].astype(str).str.replace("\u2212", "-")

    ## the data folder is specific to our use case
    dict_data = csv_to_json(df, args.name_column, args.query_column,
                            args.description_column, args.filename_column,
                            args.method_column, args.effect_column,
                            file_path="data/synthetic_data")
    json_data = json.dumps(dict_data, indent=4)

    output_path = Path(args.output_folder)
    output_path.mkdir(parents=True, exist_ok=True)

    # Append the extension when missing and use the adjusted name for the
    # output path.
    output_name = args.output_name
    if not output_name.endswith(".json"):
        output_name += ".json"
    output_file = output_path / output_name

    with open(output_file, "w", encoding="utf-8") as f:
        f.write(json_data)
    print(f"JSON file saved to {output_file}")

    
