#!/usr/bin/env python3
import json
import numpy as np
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
import os

# --------------------
# Hardcoded settings
# --------------------
INPUT_JSONL = "/JSONPATH"
OUT_PARQUET = "/PARQUETPATH"


# --------------------
# Load JSONL → list of (series_id, np.ndarray)
# --------------------
def load_series(path: str, id_prefix: str = "taxi") -> tuple:
    """Read a JSONL file into a list of ``(series_id, values)`` pairs.

    Each non-blank line is expected to be a JSON object whose ``target``
    field holds the series values; ``item_id`` is optional.

    Args:
        path: Path of the JSONL file to read (decoded as UTF-8).
        id_prefix: Prefix for generated ids (``f"{id_prefix}_{n}"``) used
            when a record has no usable ``item_id``; ``n`` is the number of
            series accepted before it.

    Returns:
        A ``(series, n_skipped)`` tuple. ``series`` is a list of
        ``(str, 1-D float32 ndarray)`` pairs with non-finite values removed.
        ``n_skipped`` counts lines that were dropped because they were not
        valid JSON, were not JSON objects, or had a ``target`` that could
        not be converted to floats. Blank lines and records without a
        ``target`` are ignored and not counted.
    """
    series = []
    n_skipped = 0

    with open(path, "r", encoding="utf-8") as f:
        for raw_line in f:
            line = raw_line.strip()
            if not line:
                continue

            try:
                obj = json.loads(line)
            except json.JSONDecodeError:
                n_skipped += 1
                continue

            # A valid JSON line may still be a list/number/string, which
            # has no .get(); treat it like any other malformed record.
            if not isinstance(obj, dict):
                n_skipped += 1
                continue

            # 'target' is the values; some datasets also include 'item_id'
            target = obj.get("target")
            if target is None:
                continue

            # Flatten to 1D float32. Ragged or non-numeric targets cannot be
            # converted; skip them instead of aborting the whole run.
            try:
                arr = np.asarray(target, dtype=np.float32).reshape(-1)
            except (TypeError, ValueError):
                n_skipped += 1
                continue

            # Drop every non-finite value (NaN and +/-inf).
            # NOTE(review): time indices are assigned after this filter, so
            # gaps left by dropped values are not preserved — confirm that
            # is what the downstream loader expects.
            arr = arr[np.isfinite(arr)]

            # Prefer item_id if present; else generate. An explicit JSON
            # null counts as missing so it does not become the id "None".
            item_id = obj.get("item_id")
            if item_id is None:
                item_id = f"{id_prefix}_{len(series)}"

            series.append((str(item_id), arr))

    return series, n_skipped


# --------------------
# Build long-format DataFrame
# --------------------
def build_long_frame(sid_series: list) -> pd.DataFrame:
    """Stack ``(series_id, values)`` pairs into one long-format DataFrame.

    Args:
        sid_series: List of ``(series_id, 1-D array-like)`` pairs.

    Returns:
        A DataFrame with columns ``series_id`` (string), ``time_idx``
        (int64, restarting at 0 for every series) and ``value`` (float32),
        one row per value, in input order. An empty input yields an empty
        frame with the same columns and dtypes.
    """
    if sid_series:
        arrays = [np.asarray(arr, dtype=np.float32).reshape(-1) for _, arr in sid_series]
        lengths = np.array([len(arr) for arr in arrays], dtype=np.int64)
        ids = np.array([sid for sid, _ in sid_series], dtype=object)

        # Build each column once instead of one DataFrame per series.
        out_df = pd.DataFrame(
            {
                "series_id": np.repeat(ids, lengths),
                "time_idx": np.concatenate(
                    [np.arange(n, dtype=np.int64) for n in lengths]
                ),
                "value": np.concatenate(arrays),
            }
        )
    else:
        out_df = pd.DataFrame(
            {
                "series_id": pd.Series(dtype="string"),
                "time_idx": pd.Series(dtype="int64"),
                "value": pd.Series(dtype="float32"),
            }
        )

    # Enforce dtypes expected by your loader
    return out_df.astype(
        {"series_id": "string", "time_idx": "int64", "value": "float32"}
    )


# --------------------
# Write Parquet
# --------------------
def write_parquet(out_df: pd.DataFrame, out_path: str) -> None:
    """Write ``out_df`` to ``out_path`` as Parquet, creating parent dirs.

    Args:
        out_df: Frame to write; its index is not stored.
        out_path: Destination file path.
    """
    out_dir = os.path.dirname(out_path)
    # dirname is "" for a bare filename, and os.makedirs("") raises.
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    table = pa.Table.from_pandas(out_df, preserve_index=False)
    pq.write_table(table, out_path)


def main() -> None:
    """Convert INPUT_JSONL to OUT_PARQUET and print a short summary."""
    sid_series, n_skipped = load_series(INPUT_JSONL)
    if n_skipped:
        print(f"[WARN] Skipped {n_skipped:,} malformed line(s) in {INPUT_JSONL}")

    out_df = build_long_frame(sid_series)
    write_parquet(out_df, OUT_PARQUET)

    print(f"[OK] Wrote {len(out_df):,} rows from {out_df['series_id'].nunique()} series → {OUT_PARQUET}")


if __name__ == "__main__":
    main()
