#!/usr/bin/env python3
import os
import numpy as np
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
import argparse

def _build_parser():
    """Return the argument parser describing this script's command line."""
    # The text is passed as ``description``; the first positional parameter of
    # ArgumentParser is ``prog``, which would put it in the usage line instead.
    ap = argparse.ArgumentParser(
        description="Electricity → (series_id, time_idx, value) parquet")
    ap.add_argument("--csv", required=True,
                    help="Path to ElectricityLoadDiagrams20112014.csv")
    ap.add_argument("--out_parquet", required=True,
                    help="Output parquet path")
    ap.add_argument("--min_len", type=int, default=8,
                    help="Drop meters shorter than this after NaN removal")
    ap.add_argument("--fill", choices=["drop", "ffill", "bfill", "interp"],
                    default="drop",
                    help="How to handle NaNs per meter")
    return ap


def _load_wide(csv_path):
    """Read the semicolon-separated CSV into a wide frame sorted by time.

    The first column is renamed to ``timestamp``; every other column is
    treated as a meter and coerced to numeric (unparseable cells become NaN).

    Returns:
        A ``(df, meter_cols)`` tuple: the wide frame and the list of meter
        column labels.
    """
    df = pd.read_csv(csv_path, sep=";", decimal=",", parse_dates=[0])
    df = df.rename(columns={df.columns[0]: "timestamp"})
    df = df.sort_values("timestamp").reset_index(drop=True)

    meter_cols = [c for c in df.columns if c != "timestamp"]
    # Guard against cells that survived as strings with a comma decimal mark.
    df[meter_cols] = (df[meter_cols]
                      .replace(",", ".", regex=True)
                      .apply(pd.to_numeric, errors="coerce"))
    return df, meter_cols


def _fill_missing(df, meter_cols, how):
    """Apply the NaN strategy ``how`` to the meter columns.

    ``"drop"`` returns ``df`` untouched (remaining NaNs are removed later, once
    the data is in long format). The other strategies work on a copy, so the
    caller's frame is never modified.

    Raises:
        ValueError: if ``how`` is not one of the supported strategies.
    """
    if how == "drop":
        return df

    filled = df.copy()
    if how == "ffill":
        filled[meter_cols] = filled[meter_cols].ffill()
    elif how == "bfill":
        filled[meter_cols] = filled[meter_cols].bfill()
    elif how == "interp":
        # method="time" needs the timestamps as the index.
        filled = filled.set_index("timestamp")
        filled[meter_cols] = filled[meter_cols].interpolate(
            method="time", limit_direction="both")
        filled = filled.reset_index()
    else:
        raise ValueError(f"Unknown fill strategy: {how!r}")
    return filled


def _to_long(df, meter_cols, min_len):
    """Melt the wide frame into ``(series_id, time_idx, value)`` rows.

    Rows whose value is NaN are dropped, ``time_idx`` is the 0-based position
    of each remaining row within its series (ordered by timestamp), and series
    with fewer than ``min_len`` remaining rows are removed.
    """
    long_df = df.melt(id_vars=["timestamp"],
                      value_vars=meter_cols,
                      var_name="series_id",
                      value_name="value")
    long_df = long_df.dropna(subset=["value"])
    long_df = long_df.sort_values(["series_id", "timestamp"])
    long_df["time_idx"] = long_df.groupby("series_id").cumcount()

    lengths = long_df.groupby("series_id").size()
    keep_ids = set(lengths[lengths >= min_len].index)
    long_df = long_df[long_df["series_id"].isin(keep_ids)]

    return long_df[["series_id", "time_idx", "value"]].astype({
        "series_id": "string",
        "time_idx": "int64",
        "value": "float32",
    })


def _write_parquet(out_df, out_path):
    """Write ``out_df`` to ``out_path``, creating the parent directory if any."""
    out_dir = os.path.dirname(out_path)
    # dirname is "" for a bare filename, and os.makedirs("") raises.
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    table = pa.Table.from_pandas(out_df, preserve_index=False)
    pq.write_table(table, out_path)


def main(argv=None):
    """Convert the wide electricity CSV into a long-format parquet file.

    Args:
        argv: Optional list of command-line arguments. ``None`` (the default)
            makes argparse read ``sys.argv[1:]``.
    """
    args = _build_parser().parse_args(argv)

    df, meter_cols = _load_wide(args.csv)
    df_clean = _fill_missing(df, meter_cols, args.fill)
    out_df = _to_long(df_clean, meter_cols, args.min_len)
    _write_parquet(out_df, args.out_parquet)

    print(f"[OK] Wrote {len(out_df):,} rows across {out_df['series_id'].nunique()} series to {args.out_parquet}")

# Run the conversion only when this file is executed as a script; importing
# the module does not trigger it.
if __name__ == "__main__":
    main()
