import argparse
import json
import glob
import os

def merge_jsonl(input_paths, output_path):
    """Merge several JSONL files into a single JSONL file.

    Every non-blank line of each input file is parsed as JSON and written
    back, re-serialized with ``ensure_ascii=False``, as one line of
    ``output_path``. Lines that are not valid JSON are reported on stdout and
    skipped.

    Args:
        input_paths: Iterable of paths to the JSONL files to merge. Files are
            processed in iteration order. Paths that are not existing regular
            files are reported and skipped, as is any path that resolves to
            ``output_path`` itself.
        output_path: Path of the merged file. It is created, or truncated if
            it already exists.

    Returns:
        The number of JSON values written to ``output_path``.
    """
    json_count = 0
    merged_files = 0
    # Canonical form of the destination, used to detect an input that is
    # really the output file (e.g. a previous merge result living in the
    # input directory). Reading the file we are writing would duplicate data.
    output_key = os.path.normcase(os.path.realpath(output_path))
    with open(output_path, 'w', encoding='utf-8') as fout:
        for path in input_paths:
            if not os.path.isfile(path):
                print(f"[!] Skipping non-existent file: {path}")
                continue
            if os.path.normcase(os.path.realpath(path)) == output_key:
                print(f"[!] Skipping output file listed as input: {path}")
                continue
            merged_files += 1
            with open(path, 'r', encoding='utf-8') as fin:
                for line in fin:
                    line = line.strip()
                    if not line:
                        continue
                    # Keep the try body limited to the parse so that write
                    # errors are never mistaken for malformed input.
                    try:
                        json_obj = json.loads(line)
                    except json.JSONDecodeError:
                        print(f"Invalid JSON skipped from {path}: {line[:100]}...")
                        continue
                    fout.write(json.dumps(json_obj, ensure_ascii=False) + '\n')
                    json_count += 1
    # Report the files actually read, not the number of paths requested.
    print(f"[✓] Merged {merged_files} files into: {output_path}, having {json_count} valid JSON objects.")
    return json_count

# Command-line entry point: merge every *.jsonl file found directly inside
# --input_dir into the single file --output_dir/--output_name.
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Merge multiple JSONL files from a directory into one.")
    parser.add_argument("--input_dir", type=str, required=True, help="Directory containing JSONL files to merge.")
    parser.add_argument("--output_dir", type=str, required=True, help="Directory to save the merged JSONL file.")
    parser.add_argument("--output_name", type=str, required=True, help="Filename for the merged JSONL file (e.g., merged.jsonl).")

    args = parser.parse_args()

    # Collect all *.jsonl files in the input directory. The directory part is
    # escaped so that glob metacharacters in its name ("[", "*", "?") are
    # matched literally, and the result is sorted because glob returns files
    # in arbitrary order, which would make the merged output irreproducible.
    input_pattern = os.path.join(glob.escape(args.input_dir), "*.jsonl")
    input_files = sorted(glob.glob(input_pattern))

    if not input_files:
        print(f"[!] No .jsonl files found in {args.input_dir}")
        # The bare exit() helper is injected by the site module and is not
        # always available; raising SystemExit gives the same exit status.
        raise SystemExit(1)

    # Ensure output directory exists
    os.makedirs(args.output_dir, exist_ok=True)

    # Final output path
    output_path = os.path.join(args.output_dir, args.output_name)

    merge_jsonl(input_files, output_path)
