"""
Common utilities for preprocessing.
"""
import os
import gzip
import json
from typing import Dict, Iterable, List

import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq


def stream_jsonl(path: str) -> Iterable[Dict]:
    """Lazily yield one parsed JSON value per non-blank line of a JSONL file.

    Paths ending in ``.gz`` are read through ``gzip`` in text mode; any other
    path is opened as a regular text file. Both are decoded as UTF-8.

    Args:
        path: Location of the ``.jsonl`` or ``.jsonl.gz`` file.

    Yields:
        The result of ``json.loads`` for each line that is not
        whitespace-only.
    """
    if path.endswith(".gz"):
        opener, text_mode = gzip.open, "rt"
    else:
        opener, text_mode = open, "r"

    with opener(path, text_mode, encoding="utf-8") as handle:
        for raw in handle:
            # Blank / whitespace-only lines carry no record.
            if not raw.strip():
                continue
            yield json.loads(raw)


def find_parquet_files(root: str) -> List[str]:
    """Collect every ``.parquet`` file located anywhere beneath ``root``.

    Args:
        root: Directory to walk recursively.

    Returns:
        Sorted list of matching file paths, each expressed relative to
        ``root``.
    """
    matches = [
        os.path.relpath(os.path.join(folder, filename), root)
        for folder, _subdirs, filenames in os.walk(root)
        for filename in filenames
        if filename.endswith(".parquet")
    ]
    matches.sort()
    return matches


def save_parquet(df: pd.DataFrame, path: str) -> None:
    """Write a DataFrame to a zstd-compressed parquet file.

    The parent directory of ``path`` is created if it does not exist. The
    DataFrame index is not written (``preserve_index=False``).

    Args:
        df: DataFrame to serialize.
        path: Destination file path. May be a bare filename, in which case
            the file is written to the current working directory.
    """
    parent = os.path.dirname(path)
    # A bare filename has an empty dirname, and os.makedirs("") raises
    # FileNotFoundError, so only create the parent when there is one.
    if parent:
        os.makedirs(parent, exist_ok=True)
    table = pa.Table.from_pandas(df, preserve_index=False)
    pq.write_table(table, path, compression="zstd", compression_level=11)
