#!/usr/bin/env python3
import os
import numpy as np
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

# --------------------
# Hardcoded settings
# --------------------
# Input: Oikolab weather dataset in .tsf format.
TSF_PATH = "/PATH/oikolab_weather_dataset.tsf"
# Output: Parquet file holding the daily series in long format.
OUT_PARQUET = "/PATH/oikolab_daily_timeseries.parquet"
# Series whose metadata field contains this substring are skipped.
SKIP_KEYWORD = "mean_sea_level_pressure"
# Reduction applied to every 24-hour window: "sum" or "mean".
DAILY_AGG = "sum"

# --------------------
# Helpers
# --------------------
def parse_oikolab_tsf(
    tsf_path: str, skip_keyword: "str | None" = None
) -> list[tuple[str, np.ndarray]]:
    """
    Parse an Oikolab .tsf file into a list of (series_id, hourly_values).

    Every line up to and including the '@data' marker is ignored, as is any
    later line starting with '@'. Each remaining line must split on ':' into
    exactly four fields: the third field is matched against the skip keyword
    and the fourth holds the comma-separated readings.

    Args:
        tsf_path: Path to the .tsf file.
        skip_keyword: Series whose third field contains this substring are
            skipped. ``None`` (the default) falls back to the module-level
            SKIP_KEYWORD; an empty string disables skipping.

    Returns:
        One ``(series_id, values)`` pair per kept series, in file order.
        ``series_id`` is ``"oikolab_<k>"`` where ``k`` counts kept series
        only, and ``values`` is a float32 array of the parsed readings.

    Notes:
        - Empty and 'nan' tokens are dropped, which compacts the series:
          readings after a gap move to earlier positions.
        - A line containing any other non-numeric token is skipped whole.
        - Series with fewer than 24 readings are discarded.
    """
    keyword = SKIP_KEYWORD if skip_keyword is None else skip_keyword

    out: list[tuple[str, np.ndarray]] = []
    next_idx = 0
    data_started = False
    # Explicit encoding keeps decoding independent of the platform locale;
    # errors="replace" stops a stray byte in the header from aborting the run.
    with open(tsf_path, "r", encoding="utf-8", errors="replace") as f:
        for raw in f:
            line = raw.strip()
            if not line:
                continue
            if line.lower().startswith("@data"):
                data_started = True
                continue
            if not data_started or line.startswith("@"):
                continue

            parts = line.split(":")
            # Expect 4 parts; values in parts[3], variable-ish info in parts[2]
            if len(parts) != 4:
                continue
            if keyword and keyword in parts[2]:
                continue

            tokens = (tok.strip() for tok in parts[3].split(","))
            try:
                readings = [
                    float(tok) for tok in tokens if tok and tok.lower() != "nan"
                ]
            except ValueError:
                # float() is the only step that can fail on malformed input;
                # such lines are skipped quietly, other errors now surface.
                continue
            if len(readings) < 24:
                continue

            out.append(
                (f"oikolab_{next_idx}", np.array(readings, dtype=np.float32))
            )
            next_idx += 1
    return out

def hourly_to_daily(values: np.ndarray, agg: "str | None" = None) -> np.ndarray:
    """
    Collapse an hourly series into daily values, 24 readings per day.

    Args:
        values: 1-D sequence of hourly readings. Trailing readings that do
            not fill a complete 24-hour block are discarded.
        agg: Reduction applied to each block, ``"sum"`` or ``"mean"``.
            ``None`` (the default) falls back to the module-level DAILY_AGG.

    Returns:
        A float32 array with one value per complete day (empty when fewer
        than 24 readings are supplied).

    Raises:
        ValueError: If the selected aggregation is neither 'sum' nor 'mean'.
    """
    method = DAILY_AGG if agg is None else agg
    # Validate before touching the data so a bad setting fails immediately.
    if method not in ("sum", "mean"):
        raise ValueError("DAILY_AGG must be 'sum' or 'mean'")

    hourly = np.asarray(values)
    usable = len(hourly) - (len(hourly) % 24)
    days = hourly[:usable].reshape(-1, 24)
    reduced = days.sum(axis=1) if method == "sum" else days.mean(axis=1)
    return reduced.astype(np.float32)

# --------------------
# Main logic
# --------------------
# Read every hourly series that survives the parser's filters.
hourly_series = parse_oikolab_tsf(TSF_PATH)

# Build one long-format frame per series: (series_id, time_idx, value).
rows = []
for sid, h in hourly_series:
    d = hourly_to_daily(h)
    # Drop the whole series if any daily value is inf or NaN.
    if not np.isfinite(d).all():
        continue
    # time_idx is a 0-based day counter within the series, not a calendar date.
    time_idx = np.arange(len(d), dtype=np.int64)
    rows.append(pd.DataFrame({"series_id": sid, "time_idx": time_idx, "value": d}))

if rows:
    out_df = pd.concat(rows, ignore_index=True)
else:
    # Nothing survived: still emit an empty frame with the expected columns.
    out_df = pd.DataFrame(
        {"series_id": pd.Series(dtype="string"),
         "time_idx": pd.Series(dtype="int64"),
         "value": pd.Series(dtype="float32")}
    )

# Enforce dtypes your loader expects
out_df = out_df.astype({"series_id": "string", "time_idx": "int64", "value": "float32"})

# dirname() is "" when OUT_PARQUET is a bare filename, and os.makedirs("")
# raises FileNotFoundError, so only create the directory when there is one.
out_dir = os.path.dirname(OUT_PARQUET)
if out_dir:
    os.makedirs(out_dir, exist_ok=True)
table = pa.Table.from_pandas(out_df, preserve_index=False)
pq.write_table(table, OUT_PARQUET)

print(f"[OK] Wrote {len(out_df):,} rows across {out_df['series_id'].nunique()} series → {OUT_PARQUET}")
