# complements/myhooks1.py — dataset loading and synthetic-sample hooks
# (load_data, sample_synthetics, accept_synthetics).
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Default location of the prepared CSV. The relative file name is resolved
# against the current working directory at import time (os.path.abspath),
# not against the directory containing this module.
DEFAULT_CSV = os.path.abspath("uganda_tile_dataset.csv")

def load_data(path=None):
    """
    Load the prepared CSV and return a train/test split of numeric features.

    The binary label ``electrified`` is chosen in this order:
      1. ``CEB03`` column present: label is 1 where CEB03 > 0 (NaN counts as 0).
      2. ``electrified`` column present: cast to int.
      3. otherwise: the last column of the CSV, cast to int.

    The column the label was derived from is excluded from the features so the
    label does not leak into X. Columns whose name contains an ID-like pattern
    are dropped as well. Remaining columns are coerced to numeric; values that
    cannot be parsed, NaNs and infinities all become 0.

    Parameters
    ----------
    path : str or None
        CSV file to read. If None, DEFAULT_CSV is used.

    Returns
    -------
    X_train, y_train, X_test, y_test : numpy arrays
        80/20 split with random_state=0 (note the order: both training arrays
        first). The split is stratified on the label when possible.

    Raises
    ------
    FileNotFoundError
        If ``path`` does not exist.
    """
    if path is None:
        path = DEFAULT_CSV
    if not os.path.exists(path):
        raise FileNotFoundError(f"Expected dataset at {path}; please place uganda_tile_dataset.csv there or pass --data_csv")

    df = pd.read_csv(path)

    # choose label; remember which raw column it came from so that column can
    # be kept out of the feature matrix (otherwise the target leaks into X)
    if "CEB03" in df.columns:
        label_col = "CEB03"
        df["electrified"] = (df["CEB03"].fillna(0) > 0).astype(int)
        label_source = "CEB03 (electricity expense)"
    elif "electrified" in df.columns:
        label_col = "electrified"
        df["electrified"] = df["electrified"].astype(int)
        label_source = "existing 'electrified' column"
    else:
        label_col = df.columns[-1]
        df["electrified"] = df.iloc[:, -1].astype(int)
        label_source = "fallback: last column forced"

    # drop ID-like columns
    # NOTE(review): this is a substring match, so "id" also matches names such
    # as "width" or "grid" -- confirm that is intended for this dataset.
    drop_patterns = ["id", "uuid", "serial", "household", "hhid", "cluster"]
    excluded = {"electrified", label_col}
    keep_cols = [
        c
        for c in df.columns
        if c not in excluded and not any(p in c.lower() for p in drop_patterns)
    ]

    # clean features: coerce everything to numeric (unparseable -> NaN), then
    # map NaN and +/-inf to 0
    X = df[keep_cols].apply(pd.to_numeric, errors="coerce")
    X = X.replace([np.inf, -np.inf], np.nan).fillna(0)

    # final numpy arrays
    X = X.values
    y = df["electrified"].values.astype(int)

    # stratified split if possible; train_test_split raises ValueError when a
    # class is too small to stratify, in which case fall back to a plain split
    try:
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0, stratify=y)
    except ValueError as err:
        print(f"[myhooks] stratified split not possible ({err}); using unstratified split")
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

    print(f"[myhooks] Loaded {path} — label from {label_source}; features={len(keep_cols)} rows={len(X)}")
    return X_train, y_train, X_test, y_test


def sample_synthetics(n, mode="naive", meta=None):
    """
    Generate synthetic samples by jittering randomly chosen training samples.

    Parameters
    ----------
    n : int
        Number of synthetic samples to draw (with replacement).
    mode : str
        Accepted for interface compatibility; not used by this implementation.
    meta : dict or None
        If it contains a non-None 'X_train', that array is used (ensuring the
        synthetic samples match its shape) together with the optional
        'y_train'. Otherwise the training split from load_data() is used.

    Returns
    -------
    X_syn : numpy array of shape (n, d)
        Sampled training rows plus Gaussian noise with a per-feature standard
        deviation of 2% of that feature's standard deviation.
    y_syn : numpy array of shape (n,)
        Labels of the sampled rows, or all zeros (int) when no labels are
        available.

    Raises
    ------
    ValueError
        If X_train is not 2-dimensional.
    """
    if meta is not None and "X_train" in meta and meta["X_train"] is not None:
        X_train = meta["X_train"]
        y_train = meta.get("y_train", None)
    else:
        # fallback: call load_data() to get dataset (only when orchestrator didn't pass meta)
        X_train, y_train, _, _ = load_data()

    # ensure X_train is a 2-D numpy array
    X_train = np.asarray(X_train)
    if X_train.ndim != 2:
        raise ValueError(f"X_train must be 2-dimensional, got shape {X_train.shape}")
    n_train, d = X_train.shape

    # Draw the row indices first, then the noise: keeps the sequence of
    # np.random calls stable for callers that seed the global generator.
    idx = np.random.choice(n_train, size=n, replace=True)

    # small gaussian jitter proportional to feature scale (epsilon keeps
    # constant features from getting exactly zero noise scale)
    feature_scale = np.std(X_train, axis=0, keepdims=True) + 1e-8
    noise = 0.02 * feature_scale * np.random.randn(n, d)
    X_syn = X_train[idx] + noise

    if y_train is None:
        # labels are unknown: return zeros (no classifier is needed for that)
        y_syn = np.zeros(n, dtype=int)
    else:
        y_syn = np.asarray(y_train)[idx].copy()
    return X_syn, y_syn

def accept_synthetics(X_syn, y_syn, meta=None):
    """
    Lightweight AWML-like acceptance filter for synthetic samples.

    Each synthetic sample is standardized with the per-feature mean and
    standard deviation of the training data and accepted only if every
    feature has |z| < 2.5. If that rejects every sample, the threshold is
    relaxed once to |z| < 3.5.

    Parameters
    ----------
    X_syn : array-like, 2-dimensional
        Synthetic feature rows to evaluate.
    y_syn : array-like
        Synthetic labels; not used by the acceptance rule.
    meta : dict or None
        If it contains a non-None 'X_train', that array is the reference
        data. Otherwise the training split from load_data() is used.

    Returns
    -------
    numpy boolean array of length len(X_syn)
        True for accepted samples.
    """
    X_syn = np.asarray(X_syn)
    if meta is not None and "X_train" in meta and meta["X_train"] is not None:
        X_train = np.asarray(meta["X_train"])
    else:
        X_train, _, _, _ = load_data()

    # safety: if feature-dimension mismatch, adapt the synthetic samples to
    # the training width instead of failing
    n_features = X_train.shape[1]
    if X_syn.shape[1] > n_features:
        # truncate synthetic features to match training shape (conservative)
        X_syn_used = X_syn[:, :n_features]
    elif X_syn.shape[1] < n_features:
        # pad synthetic features with zeros
        pad = np.zeros((X_syn.shape[0], n_features - X_syn.shape[1]))
        X_syn_used = np.hstack([X_syn, pad])
    else:
        X_syn_used = X_syn

    # z-scores of the synthetic samples relative to the training distribution;
    # only the fitted statistics of X_train are needed, not its transform
    scaler = StandardScaler().fit(X_train)
    z = np.abs(scaler.transform(X_syn_used))

    mask = (z < 2.5).all(axis=1)
    # if too harsh (accept none), relax threshold
    if not mask.any():
        mask = (z < 3.5).all(axis=1)
    return mask
