import json 
import re 
import os 


import fire 
from tqdm import tqdm 

from common .common_types import Sample 
from common .dataset import create_datasets 


# A fence explicitly tagged "python"; everything after the tag is the payload.
_PYTHON_FENCE_RE = re.compile(r"```python(.*?)```", re.DOTALL)
# Any other fence: an optional info string (language tag) is skipped only when
# it is terminated by a newline, so inline fences such as ```print(1)``` keep
# their full content.
_ANY_FENCE_RE = re.compile(r"```(?:[\w+\-.]*[ \t]*\r?\n)?(.*?)```", re.DOTALL)


def extract_code_content(code_block: str) -> str:
    """
    Return the code inside the first Markdown fence of ``code_block``.

    A fence tagged ``python`` is preferred; when none is present, the first
    fence of any kind (bare or tagged with another language) is used instead.
    If the text contains no complete fence at all, the whole input is returned.
    Surrounding whitespace is stripped in every case.

    Args:
        code_block: Raw text that may wrap code in triple backticks.

    Returns:
        The stripped inner code of the selected fence, or the stripped input
        when no fence is found.
    """
    # Try the python-tagged fence first so that inputs containing one yield
    # exactly what they always did; the generic pattern is only a fallback.
    for fence_re in (_PYTHON_FENCE_RE, _ANY_FENCE_RE):
        found = fence_re.search(code_block)
        if found:
            return found.group(1).strip()
    return code_block.strip()


def extract_rhs_from_input(input_str: str) -> str:
    """
    Reduce ``name = value`` assignments to just their values.

    The stripped input is processed line by line. Within a line, every
    ``name = value`` pair is located (pairs are separated by a comma followed
    by the next ``name =``), and the stripped values are joined with a single
    space. Lines that contain no assignment are dropped. The per-line results
    are then joined with commas.

    Args:
        input_str: Text holding zero or more variable assignments.

    Returns:
        The condensed values, or ``input_str`` unchanged (not stripped) when
        no line contains an assignment.
    """
    assignment_re = re.compile(r"([\w_]+)\s*=\s*(.*?)(?=,\s*[\w_]+\s*=|$)")

    condensed_lines = []
    for raw_line in input_str.strip().splitlines():
        # Group 2 is the right-hand side; the lookahead stops it before the
        # next ", name =" so commas inside a value are kept.
        values = [hit.group(2).strip() for hit in assignment_re.finditer(raw_line)]
        if values:
            condensed_lines.append(" ".join(values))

    return ",".join(condensed_lines) if condensed_lines else input_str


# Substrings that disqualify a sample: load_pyx() skips any sample whose
# extracted code contains one of these anywhere (plain `in` substring test,
# case-sensitive, not a token match).
# NOTE(review): because matching is by substring, short entries such as "cc",
# "make" and "setup" also hit unrelated text (e.g. "success", "make_list") —
# confirm this over-filtering is intended.
BANNED_KEYWORDS =[
# Version control, editors, placeholder paths and privilege escalation.
"git","vim","Vim","nano","emacs","/path/to","EDITOR","sudo",

# Byte-compilation, executable bundling, packaging and native build tools.
"py_compile","compileall",
"PyInstaller","cx_Freeze","py2exe",
"setuptools","distutils","setup",
"gcc","clang","cc","make",


# Spawning external processes.
"os.system","subprocess","Popen","os.popen",


# Binary serialization and binary data; the last three entries presumably
# target binary-mode writes and bytes literals — TODO confirm.
"pickle","marshal",
"'wb'",
"b'","b\""
]

def load_pyx(data_dir: str) -> list[Sample]:
    """
    Load PyX samples from ``<data_dir>/lltm-input-data/pyx_samples.jsonl``.

    Each non-blank line is parsed as one JSON record. A record is kept only if
    its ``sample_id`` starts with ``"pyx_"`` and its extracted code contains
    none of the ``BANNED_KEYWORDS`` substrings. For kept records the code has
    its Markdown fence removed, the input is condensed to its right-hand-side
    values, and the output is stripped of surrounding whitespace.

    Args:
        data_dir: Directory that contains the ``lltm-input-data`` folder.

    Returns:
        A list of ``Sample`` objects (sample_id, code, input, output,
        function_name) in file order.

    Raises:
        OSError: If the JSONL file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a ``pyx_`` record lacks ``code``, ``input``, ``output``
            or ``function_name``.
    """
    samples = []
    jsonl_path = os.path.join(data_dir, "lltm-input-data/pyx_samples.jsonl")

    # Decode explicitly as UTF-8: without it open() falls back to the platform
    # locale, which can garble or reject non-ASCII text in the dataset.
    with open(jsonl_path, "r", encoding="utf-8") as f:
        for line in tqdm(f, desc="Loading samples"):
            if not line.strip():
                continue

            data = json.loads(line)
            sample_id = data.get("sample_id", "")

            # The file may hold records from other datasets; keep PyX only.
            if not sample_id.startswith("pyx_"):
                continue

            code = extract_code_content(data["code"])
            input_str = extract_rhs_from_input(data["input"])

            # Substring match against the extracted code, not the raw record.
            if any(keyword in code for keyword in BANNED_KEYWORDS):
                print(f"Banned keyword found in sample {sample_id }. Skipping.")
                continue

            samples.append(
                Sample(
                    sample_id=sample_id,
                    code=code,
                    input=input_str,
                    output=data["output"].strip(),
                    function_name=data["function_name"],
                )
            )

    print(f"Loaded {len (samples )} valid samples from PyX dataset.")
    return samples


def main(
    data_name: str = "pyx",
    input_dir: str = "/path/to/home/data",
    output_dir: str = "/path/to/home/lltm/02_codeexec_etcot/scripts/instruction/convert_datasets",
):
    """
    Load the PyX samples and hand them to ``create_datasets``.

    Args:
        data_name: Dataset name forwarded to ``create_datasets``. It does not
            select the loader; ``load_pyx`` is always used.
        input_dir: Directory passed to ``load_pyx`` as ``data_dir``.
        output_dir: Destination forwarded to ``create_datasets``.
    """
    create_datasets(
        data_name,
        load_pyx(data_dir=input_dir),
        output_dir=output_dir,
    )



# Script entry point: python-fire builds the command-line interface from
# main(), so its keyword parameters can be overridden as flags.
if __name__ =="__main__":
    fire .Fire (main )
