import json 
import os 

import datasets 
import fire 

from common .common_types import Sample 
from common .dataset import create_datasets 


def _parse_json_field(row, field: str):
    """Decode the JSON text stored in ``row[field]``.

    Returns the decoded value, or ``None`` when the field is missing content
    (``None`` or ``""``) or cannot be decoded. Decode failures are reported on
    stdout together with the row's ``problem_id``.
    """
    raw = row[field]
    if raw is None or raw == "":
        return None
    try:
        return json.loads(raw)
    # TypeError covers non-string payloads, which json.loads rejects before
    # it ever gets to raise a JSONDecodeError.
    except (json.JSONDecodeError, TypeError, ValueError) as e:
        print(
            f"Skipping problem_id {row['problem_id']} due to parse error: {str(e)}"
        )
        return None


def load_apps(data_dir, split: str = "test") -> list[Sample]:
    """Build one ``Sample`` per input/output pair of the APPS split on disk.

    The split is read with ``datasets.load_from_disk`` from
    ``<data_dir>/codeparot/apps_cleaned_<split>``. Each row is read through
    its ``problem_id``, ``input_output``, ``solutions`` and ``question``
    fields; ``input_output`` and ``solutions`` hold JSON text.

    A row is skipped when:
      * ``input_output`` or ``solutions`` is empty or fails to decode,
      * the decoded ``input_output`` is not an object, or has no inputs or
        no outputs,
      * the decoded ``input_output`` carries a non-null ``fn_name``
        (presumably function-call style problems rather than stdin/stdout
        ones -- TODO confirm),
      * the decoded ``solutions`` is not a non-empty list.

    Args:
        data_dir: Directory that contains the ``codeparot`` folder.
        split: Split suffix used in the on-disk directory name.

    Returns:
        Samples with ids of the form ``<problem_id>_<pair index>``. Every
        sample of a problem shares that problem's first listed solution as
        its code and the problem's ``question`` as its statement.
    """
    rows = datasets.load_from_disk(
        os.path.join(data_dir, f"codeparot/apps_cleaned_{split}")
    )
    samples = []
    for row in rows:
        io_dict = _parse_json_field(row, "input_output")
        # Anything other than a JSON object (including null) has no test cases.
        if not isinstance(io_dict, dict):
            continue
        input_list = io_dict.get("inputs") or []
        output_list = io_dict.get("outputs") or []
        if not input_list or not output_list:
            continue
        if io_dict.get("fn_name") is not None:
            continue

        solutions = _parse_json_field(row, "solutions")
        # Check the decoded value: the raw text "[]" is truthy but yields no
        # solution to index into.
        if not isinstance(solutions, list) or not solutions:
            continue
        primary_code = solutions[0]

        # zip stops at the shorter list, so unpaired inputs/outputs are dropped.
        for i, (in_str, out_str) in enumerate(zip(input_list, output_list)):
            samples.append(
                Sample(
                    sample_id=f"{row['problem_id']}_{i}",
                    code=primary_code,
                    input=in_str,
                    output=out_str,
                    problem_statement=row["question"],
                )
            )
    return samples


def main(
    data_name: str = "apps",
    input_dir: str = "/path/to/home/data",
    output_dir: str = "/path/to/home/lltm/02_codeexec_etcot/scripts/instruction/convert_datasets",
):
    """Convert the train and test APPS splits into a single dataset.

    Args:
        data_name: Name forwarded to ``create_datasets`` as its first argument.
        input_dir: Directory passed to ``load_apps`` as ``data_dir``.
        output_dir: Directory forwarded to ``create_datasets`` as ``output_dir``.
    """
    # Train samples come first, then test samples, in one combined list.
    combined = []
    for split_name in ("train", "test"):
        combined.extend(load_apps(data_dir=input_dir, split=split_name))
    create_datasets(data_name, combined, output_dir=output_dir)





# Script entry point: hand ``main`` to python-fire only when this file is
# executed directly, not when it is imported.
if __name__ =="__main__":
    fire .Fire (main )
