import re 
from typing import List 
import os 

import datasets 
import fire 

from common .common_types import Sample 
from common .dataset import create_datasets 


def load_atcoder(data_dir) -> List[Sample]:
    """Load accepted Python 3 AtCoder submissions as ``Sample`` objects.

    Reads the ``train`` split of the dataset stored under
    ``<data_dir>/nan_do_datasets/atcoder_contests/`` and keeps a row only if
    all of the following hold:

    * ``lang`` mentions both ``"Python"`` and ``"3."``;
    * ``status`` is ``"AC"``;
    * ``task`` is ``"A"``, ``"B"`` or ``"C"``;
    * a non-empty first "Sample Input" / "Sample Output" pair can be
      extracted from ``problem_statement``;
    * ``code`` does not contain the substring ``".bin"``.

    Args:
        data_dir: Directory containing ``nan_do_datasets/atcoder_contests/``.

    Returns:
        One ``Sample`` per kept row, in dataset order. ``sample_id`` is
        ``"<contest_id>_<task>"``, so every submission to the same task
        shares an id. ``output`` holds only the first line of the first
        sample output.
    """
    ds = datasets.load_dataset(
        os.path.join(data_dir, "nan_do_datasets/atcoder_contests/"),
    )["train"]
    print(f"Total rows in dataset: {len(ds)}")

    # Compiled once: captures the first sample input (group 1) and the first
    # sample output (group 2), stopping at the next "Sample Input" or at the
    # end of the statement.
    sample_io_pattern = re.compile(
        r"Sample Input\s*\d*\s*(.*?)\s*Sample Output\s*\d*\s*(.*?)(?:\s*Sample Input\s*\d*|$)",
        flags=re.DOTALL | re.IGNORECASE,
    )
    accepted_tasks = ("A", "B", "C")

    samples: List[Sample] = []
    for row in ds:
        lang = row["lang"]
        # Both markers are required. Skipping only when *neither* is present
        # would let through any language whose version string happens to
        # contain "3." (e.g. "Ruby (2.3.3)") as well as Python 2.
        if "Python" not in lang or "3." not in lang:
            continue
        if row["status"] != "AC":
            continue
        if row["task"] not in accepted_tasks:
            continue

        match = sample_io_pattern.search(row["problem_statement"])
        if not match:
            continue
        sample_input = match.group(1).strip()
        sample_output = match.group(2).strip()

        # Only the first line of the captured output is kept; the rest of
        # the capture may contain trailing statement text.
        if sample_output:
            sample_output = sample_output.splitlines()[0].strip()

        if not sample_input or not sample_output:
            continue

        # Submissions whose source mentions ".bin" are dropped (and reported).
        if ".bin" in row["code"]:
            print(".bin FOUND!!")
            continue

        samples.append(
            Sample(
                sample_id=f"{row['contest_id']}_{row['task']}",
                code=row["code"],
                input=sample_input,
                output=sample_output,
                problem_statement=row["problem_statement"],
            )
        )

    print(f"Loaded {len(samples)} valid samples from AtCoder dataset.")
    return samples


def main(
    data_name: str = "atcoder",
    input_dir: str = "/path/to/home/data",
    output_dir: str = "/path/to/home/lltm/02_codeexec_etcot/scripts/instruction/convert_datasets",
):
    """Load the AtCoder samples and hand them to ``create_datasets``.

    Args:
        data_name: Name forwarded to ``create_datasets``. It does not select
            the loader; ``load_atcoder`` is always used.
        input_dir: Directory passed to ``load_atcoder`` as ``data_dir``.
        output_dir: Forwarded to ``create_datasets`` as ``output_dir``.
    """
    samples = load_atcoder(data_dir=input_dir)
    create_datasets(
        data_name,
        samples,
        output_dir=output_dir,
    )



# Script entry point: python-fire exposes main()'s keyword arguments
# (data_name, input_dir, output_dir) as command-line flags.
if __name__ =="__main__":
    fire .Fire (main )
