import json
import re
import sys

# Matches a whole number with optional thousands separators and an optional
# decimal part, e.g. "7", "1,000", "3.5", "12,345.67".
#   \b            word boundary, so digits glued to letters ("abc1", "2nd")
#                 are not matched
#   \d+           leading run of digits
#   (?:,\d{3})*   zero or more ",ddd" thousands groups
#   (?:\.\d+)?    optional decimal part; digits are required after the dot,
#                 so a sentence-ending period ("has 5.") stays outside
#   \b            word boundary at the end
_NUMBER_PATTERN = re.compile(r'\b\d+(?:,\d{3})*(?:\.\d+)?\b')


def process_question_add_dollars(json_line):
    """
    Wrap every number in the 'question' field of one JSON line in $...$.

    Numbers may contain thousands separators ("1,000") and a decimal part
    ("3.5"); each is wrapped as a single unit ("$1,000$", "$3.5$").

    Args:
        json_line: A string holding one JSON document.

    Returns:
        The parsed JSON value with only the 'question' field modified (and
        only when that field exists and is a string). All other fields are
        left untouched. Returns None when the line cannot be parsed as JSON.
        A parsed value that is not a JSON object is returned unchanged.

    NOTE: a number that is already preceded by a literal '$' in the text
    (e.g. a price written as "$5") is still wrapped, producing "$$5$".
    """
    try:
        data = json.loads(json_line)
    except (json.JSONDecodeError, RecursionError):
        # RecursionError covers pathologically nested input; like malformed
        # JSON, such a line is skipped rather than aborting the caller.
        print(f"Warning: Skipping invalid JSON line: {json_line.strip()}", file=sys.stderr)
        return None

    # Only JSON objects can carry a 'question' field; anything else
    # (list, string, number, ...) is passed through as-is.
    if not isinstance(data, dict):
        return data

    question = data.get("question")
    if isinstance(question, str):
        # \g<0> is the whole match; the surrounding '$' are literal
        # characters in a replacement template.
        data["question"] = _NUMBER_PATTERN.sub(r'$\g<0>$', question)

    return data

# --- Main Script Execution ---

# Default input and output locations. Edit these, or call main() with other
# paths, to convert a different file.
input_filename = '/data/openfed_llm_new/OpenFedLLM/math-evaluation-harness/data/gsm8k/test.jsonl'
# Output file will contain the modified JSON lines
output_filename = '/data/openfed_llm_new/OpenFedLLM/math-evaluation-harness/data/gsm8k/test_latex.jsonl'


def main(input_path=input_filename, output_path=output_filename):
    """
    Convert a JSONL file, wrapping numbers in each 'question' field in $...$.

    Reads input_path line by line, passes every non-blank line through
    process_question_add_dollars and writes each successfully processed
    record to output_path as one JSON document per line. Blank lines and
    lines the helper rejects (returns None for) are skipped.

    Args:
        input_path: Path of the JSONL file to read (UTF-8).
        output_path: Path of the JSONL file to write (UTF-8, overwritten).

    Returns:
        0 on success, 1 if the files could not be opened or processing
        failed, so the value can be used as the process exit status.
    """
    print(f"Adding $...$ around numbers in 'question' field from '{input_path}'...")
    print(f"Output will be written to '{output_path}' preserving JSON structure.")

    processed_count = 0
    total_lines = 0
    try:
        with open(input_path, 'r', encoding='utf-8') as infile, \
             open(output_path, 'w', encoding='utf-8') as outfile:
            for total_lines, line in enumerate(infile, start=1):
                if not line.strip():
                    # Blank lines are counted in the total but ignored.
                    continue
                modified_data = process_question_add_dollars(line)
                if modified_data is None:
                    # The helper already reported why this line was skipped.
                    continue
                # ensure_ascii=False keeps non-ASCII characters as-is
                # instead of emitting \uXXXX escapes.
                outfile.write(json.dumps(modified_data, ensure_ascii=False) + '\n')
                processed_count += 1
    except FileNotFoundError as e:
        # Either path can trigger this (missing input file or missing output
        # directory), so report the path the OS actually complained about.
        missing = e.filename if e.filename is not None else input_path
        print(f"Error: File '{missing}' not found. Please ensure the path exists.", file=sys.stderr)
        return 1
    except Exception as e:
        # Top-level boundary: report any other failure and signal it to the
        # caller through the return value.
        print(f"A critical error occurred during processing: {e}", file=sys.stderr)
        return 1

    print(f"Conversion complete. Processed {processed_count} valid JSON lines out of {total_lines} total lines.")
    print(f"Modified JSON data (only 'question' field affected) written to '{output_path}'.")
    return 0


if __name__ == "__main__":
    # Run the conversion only when executed as a script, so importing this
    # module for its helper does not touch any files.
    sys.exit(main())