import pandas as pd
import copy
import re
from itertools import zip_longest      # pairs last chunk with None

# Delimiter pattern: a period followed by one whitespace character, OR a run
# of two or more newlines. The single capturing group makes `re.split` keep
# each matched delimiter in its output.
SPLIT_RX = re.compile(r'(\.\s|\n{2,})')


def split_with_marks(raw: str):
    """
    Split text into chunks while remembering what separated them.

    Every '$' in `raw` is first replaced by a space. The result is then cut
    at each SPLIT_RX match.

    Returns a list of (chunk, delimiter) tuples in document order. The final
    chunk has no delimiter after it, so its delimiter slot is None. Empty
    input yields [('', None)].
    """
    # Because SPLIT_RX captures its match, the split output alternates:
    # text, delimiter, text, delimiter, ..., text
    tokens = SPLIT_RX.split(raw.replace('$', ' '))

    texts = tokens[::2]     # even positions: the chunks
    marks = tokens[1::2]    # odd positions: the delimiters between them

    # zip_longest pads the shorter `marks` with None for the trailing chunk.
    return [*zip_longest(texts, marks)]

def rebuild_up_to(pairs, k):
    """
    Concatenate the first `k` (chunk, delimiter) pairs back into one string.

    `pairs` is sliced as `pairs[:k]`, so a `k` larger than the list uses
    every pair and a negative `k` drops pairs from the end. A missing
    delimiter (None) contributes nothing to the output.
    """
    pieces = []
    for text, mark in pairs[:k]:
        pieces.append(text)
        if mark:                    # None (or empty) delimiter adds nothing
            pieces.append(mark)
    return "".join(pieces)

df_pandas = pd.read_parquet('/home/ubuntu/TinyZero_math/train.parquet')

# One output list per input column; the loop below appends one entry to every
# list for each piece of each row's answer, so all lists stay the same length.
new_data_dict = {key: [] for key in df_pandas.columns}

for _, original_data in df_pandas.iterrows():
    # Split the answer text into (chunk, delimiter) pairs. The previous code
    # called `clean_and_split`, which is not defined in this file and raised
    # NameError; `split_with_marks` is the helper defined above.
    solution_split = split_with_marks(original_data['answer'])

    for chunk, delim in solution_split:
        # Text of this piece including the delimiter that followed it
        # (the last piece has no delimiter).
        # NOTE(review): if the 'prompt' column is meant to hold the original
        # prompt plus a cumulative prefix of the answer, build it with
        # rebuild_up_to(solution_split, k) instead -- confirm intent.
        piece = chunk + (delim or "")

        for key in df_pandas.columns:
            if key == 'prompt':
                # The 'prompt' column is replaced by the answer piece; the
                # unused deepcopy of the original prompt was dropped.
                new_data_dict[key].append(piece)
            else:
                # Every other column is copied through unchanged.
                new_data_dict[key].append(original_data[key])