
"""
Usage:

uv run convert_stable_to_cr.py \
 --src_dir /path/to/home/lltm-h200/data_stable \
 --dst_dir /path/to/home/lltm/02_codeexec_etcot/scripts/instruction/convert_datasets
"""


import os 
import glob 
import json 
import re 
from pathlib import Path 
import argparse 


# Matches one chat-template block of the form
#   <|im_start|>ROLE\nBODY<|im_end|>
# Group 1 is the text between the start marker and the first newline (the
# role); group 2 is everything after that newline up to the nearest end
# marker. DOTALL lets "." match newlines so a body may span several lines.
BLOCK_RE = re.compile(r"<\|im_start\|\>(.*?)\n(.*?)<\|im_end\|\>", flags=re.DOTALL)

def parse_blocks(text: str):
    """
    Split chat-template text into (role, content) pairs.

    Yields one tuple per ``BLOCK_RE`` match, in order of appearance. The role
    has surrounding whitespace stripped; the content is yielded exactly as
    matched (callers strip it themselves if they need to).
    """
    for match in BLOCK_RE.finditer(text):
        raw_role, content = match.groups()
        yield raw_role.strip(), content

def convert_line(line: str) -> dict:
    """
    Convert one JSON line into the new input/output structure.

    The line is decoded as JSON, the ``content`` of every entry in its
    ``messages`` list is concatenated, and the result is split into
    chat-template blocks with ``parse_blocks``. The FIRST ``user`` block
    becomes the prompt and the LAST ``assistant`` block becomes the answer;
    both are whitespace-stripped.

    Returns:
        ``{"input": [{"role": "user", "content": ...}],
           "output": {"role": "assistant", "content": ...}}``

    Raises:
        ValueError: if no user block or no assistant block was found
            (``json.JSONDecodeError``, raised for invalid JSON, is also a
            ``ValueError``).
    """
    record = json.loads(line)
    joined = "".join(message["content"] for message in record["messages"])

    first_user = None
    last_assistant = None
    for role, body in parse_blocks(joined):
        if role == "assistant":
            # Keep overwriting so the final assistant block wins.
            last_assistant = body.strip()
        elif role == "user" and first_user is None:
            # Only the first user block is kept.
            first_user = body.strip()

    if first_user is None or last_assistant is None:
        raise ValueError("User or assistant content not found.")

    prompt_message = {"role": "user", "content": first_user}
    answer_message = {"role": "assistant", "content": last_assistant}
    return {"input": [prompt_message], "output": answer_message}

def main():
    """
    Command-line entry point: convert every ``*.jsonl`` file in a directory.

    Parses ``--src_dir``, ``--dst_dir`` and ``--dataset_num``, creates the
    destination directory if it does not exist, and then, for each
    ``*.jsonl`` file directly under the source directory (in sorted order),
    writes a file of the same name into the destination directory containing
    one converted record per line (see ``convert_line``).

    At most ``--dataset_num`` records are written per file. Blank lines are
    ignored; lines that fail to convert are reported with their line number
    and skipped. A source file whose destination path refers to the very same
    file is skipped instead of being truncated.
    """
    parser = argparse.ArgumentParser(
        description="Convert stable dataset format to a new instruction format."
    )
    parser.add_argument(
        "--src_dir",
        type=str,
        default="/path/to/home/lltm-h200/data_stable",
        help="Source directory containing .jsonl files to convert.",
    )
    parser.add_argument(
        "--dst_dir",
        type=str,
        default="/path/to/home/lltm/02_codeexec_etcot/scripts/instruction/convert_datasets",
        help="Destination directory to save converted .jsonl files.",
    )
    parser.add_argument(
        "--dataset_num",
        type=int,
        default=1000,
        help="Number of datasets to process per file. Default is 1000.",
    )

    args = parser.parse_args()

    src_dir = Path(args.src_dir)
    dst_dir = Path(args.dst_dir)
    dataset_num = args.dataset_num

    dst_dir.mkdir(parents=True, exist_ok=True)

    processed_total_count = 0
    # glob.glob returns matches in arbitrary order; sort so runs are reproducible.
    file_list = sorted(glob.glob(str(src_dir / "*.jsonl")))

    for src_path in file_list:
        processed_count_in_file = 0
        dst_path = dst_dir / Path(src_path).name

        # Opening the destination with "w" truncates it immediately. If it is
        # the same file as the source (e.g. --src_dir == --dst_dir, or a
        # symlink), that would wipe the input before a single line is read.
        if dst_path.exists() and dst_path.samefile(src_path):
            print(f"[SKIP] {src_path}: destination is the same file as the source; refusing to overwrite.")
            continue

        print(f"🔁 Processing {src_path}...")

        with open(src_path, "r", encoding="utf-8") as fin, open(dst_path, "w", encoding="utf-8") as fout:
            for line_no, line in enumerate(fin, start=1):
                if processed_count_in_file >= dataset_num:
                    break

                # Blank lines (e.g. a trailing newline at EOF) are not records.
                if not line.strip():
                    continue

                # Deliberate best-effort boundary: one bad record must not
                # abort the whole conversion, so report it and move on.
                try:
                    new_item = convert_line(line)
                    fout.write(json.dumps(new_item, ensure_ascii=False) + "\n")
                    processed_count_in_file += 1
                except Exception as e:
                    print(f"[SKIP] {src_path}:{line_no}: {e}")

        processed_total_count += processed_count_in_file
        print(f"✅ Generated {dst_path} ({processed_count_in_file} / {dataset_num})")

    print(f"\n🔥 Processing completed. Total {processed_total_count} data generated.")


# Run the conversion only when this file is executed as a script, not when it
# is imported as a module.
if __name__ =="__main__":
    main ()
