# -*- coding: utf-8 -*-
from __future__ import annotations

import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression

# -----------------------------
# Config
# -----------------------------
# Input CSV read by main(); one column holds the label, every other column is used as a feature.
CSV_PATH = "spambase_data.csv"
# Output CSV written by main(): the sampled rows stacked on top of their manipulated copies.
OUT_PATH = "spambase_strategic_context.csv"

# Label column names tried in order by detect_label_column();
# when none is present, the last column of the CSV is used instead.
LABEL_CANDIDATES = ["Spam", "y", "spam"]

# Number of label == 1 and label == 0 rows drawn for the base context.
N_POS = 20
N_NEG = 20
# Seed passed to the classifier, the row sampling/shuffle, and the frozen-feature draw.
SEED = 42

FREEZE_FRAC = 0.10                        # fraction of features frozen (excluded from the shift)
LAMBDA_COST = 6.0                         # Mahalanobis cost strength; the shift is divided by this value
COV_RIDGE = 1e-3                          # ridge term added to the covariance diagonal

CLIP_NONNEG = True                        # passed as clip_nonneg: clip manipulated values at 0 from below
ADD_FLAG_COLUMN = True                    # add a 0/1 column marking the manipulated rows
FLAG_NAME = "is_manipulated"              # name of that flag column



def detect_label_column(df: pd.DataFrame, candidates: list[str] | None = None) -> str:
    """Return the name of the label column in *df*.

    The names in *candidates* are tried in order and the first one that is a
    column of *df* wins. If none of them is present, the last column of *df*
    is returned as a fallback.

    Args:
        df: Table whose columns are searched.
        candidates: Column names to try, in priority order. ``None`` (the
            default) uses the module-level ``LABEL_CANDIDATES``.

    Returns:
        The first matching candidate, or the last column name when no
        candidate is present.

    Raises:
        ValueError: If *df* has no columns, so there is nothing to fall
            back to.
    """
    if candidates is None:
        candidates = LABEL_CANDIDATES
    for name in candidates:
        if name in df.columns:
            return name
    if len(df.columns) == 0:
        # Without this guard the fallback below fails with a bare IndexError.
        raise ValueError("Cannot detect a label column: DataFrame has no columns")
    return df.columns[-1]


def make_mutable_mask(d: int, freeze_frac: float, seed: int) -> np.ndarray:
    """Build a boolean mask marking which of *d* features may be changed.

    A generator seeded with *seed* draws ``round(d * freeze_frac)`` distinct
    feature positions to freeze; every other position stays mutable.

    Args:
        d: Total number of features.
        freeze_frac: Fraction of the features to freeze.
        seed: Seed for the random draw, so the mask is reproducible.

    Returns:
        Boolean array of length *d*; ``True`` marks a mutable feature and
        ``False`` a frozen one.
    """
    generator = np.random.default_rng(seed)
    frozen_count = int(np.round(d * freeze_frac))
    positions = np.arange(d)
    frozen_positions = generator.choice(positions, size=frozen_count, replace=False)
    # A position is mutable exactly when it was not drawn for freezing.
    return ~np.isin(positions, frozen_positions)


def sample_balanced(df: pd.DataFrame, label_col: str, n_pos: int, n_neg: int, seed: int) -> pd.DataFrame:
    """Draw a shuffled sample with a fixed number of rows per class.

    Rows are picked by position rather than by index label, so the result
    contains exactly ``n_pos + n_neg`` rows even when the index of *df* has
    duplicate labels. For a default ``RangeIndex`` the positions equal the
    labels, so the rows drawn are the same as with label-based selection.

    Args:
        df: Source table.
        label_col: Name of the label column; rows equal to 1 are positives
            and rows equal to 0 are negatives.
        n_pos: Number of positive rows to draw (without replacement).
        n_neg: Number of negative rows to draw (without replacement).
        seed: Seed used for both the row draw and the final shuffle.

    Returns:
        A new DataFrame with the sampled rows in shuffled order and a fresh
        0..n-1 index.

    Raises:
        ValueError: If *df* has fewer than *n_pos* positives or fewer than
            *n_neg* negatives.
    """
    rng = np.random.default_rng(seed)
    pos_positions = np.flatnonzero((df[label_col] == 1).to_numpy())
    neg_positions = np.flatnonzero((df[label_col] == 0).to_numpy())
    if len(pos_positions) < n_pos or len(neg_positions) < n_neg:
        raise ValueError(
            f"Not enough rows: pos={len(pos_positions)} neg={len(neg_positions)} need pos={n_pos} neg={n_neg}"
        )
    # Positives are drawn before negatives; the order fixes the RNG stream.
    pos_pick = rng.choice(pos_positions, size=n_pos, replace=False)
    neg_pick = rng.choice(neg_positions, size=n_neg, replace=False)
    out = pd.concat([df.iloc[pos_pick], df.iloc[neg_pick]], axis=0)
    return out.sample(frac=1.0, random_state=seed).reset_index(drop=True)


def compute_covariance(X: np.ndarray, ridge: float) -> np.ndarray:
    """Return the ridge-regularized empirical covariance of the columns of *X*.

    Args:
        X: Data matrix with one row per observation and one column per
            variable (``rowvar=False``).
        ridge: Value added to every diagonal entry of the covariance.

    Returns:
        A ``(d, d)`` array equal to the unbiased sample covariance plus
        ``ridge * I``, where ``d`` is the number of columns of *X*.
    """
    # np.cov squeezes its result, so a single-variable input comes back as a
    # 0-d array; atleast_2d restores the (1, 1) matrix shape.
    Sigma = np.atleast_2d(np.cov(X, rowvar=False, bias=False))
    return Sigma + ridge * np.eye(Sigma.shape[0])


def best_response_mahalanobis(
    X: np.ndarray,
    w: np.ndarray,
    Sigma: np.ndarray,
    mutable_mask: np.ndarray,
    lam: float,
    clip_nonneg: bool = True,
) -> np.ndarray:
    """Shift every row of *X* along the Mahalanobis best-response direction.

    On the mutable dimensions ``m``::

        x' = x + (1/lam) * Sigma_mm @ w_m

    Frozen dimensions are returned exactly as given; the optional clipping
    is applied to the mutable dimensions only.

    Args:
        X: ``(n, d)`` feature matrix. It is not modified.
        w: Linear score direction with ``d`` entries.
        Sigma: ``(d, d)`` matrix used in the shift.
        mutable_mask: Length-``d`` boolean mask; ``True`` marks a mutable
            feature.
        lam: Cost strength; must be positive. The shift is divided by it.
        clip_nonneg: If true, shifted values below 0 are raised to 0.

    Returns:
        A new ``(n, d)`` float array holding the manipulated rows.

    Raises:
        ValueError: If the shapes of the inputs are inconsistent or *lam*
            is not positive.
    """
    X = np.asarray(X, dtype=float)
    w = np.asarray(w, dtype=float).reshape(-1)
    Sigma = np.asarray(Sigma, dtype=float)
    mutable_mask = np.asarray(mutable_mask, dtype=bool).reshape(-1)

    # Explicit checks instead of `assert`, which is stripped under `python -O`.
    d = w.shape[0]
    if X.ndim != 2 or X.shape[1] != d:
        raise ValueError(f"X must have shape (n, {d}), got {X.shape}")
    if Sigma.shape != (d, d):
        raise ValueError(f"Sigma must have shape ({d}, {d}), got {Sigma.shape}")
    if mutable_mask.shape[0] != d:
        raise ValueError(f"mutable_mask must have length {d}, got {mutable_mask.shape[0]}")
    if not lam > 0:
        raise ValueError(f"lam must be positive, got {lam}")

    m_idx = np.flatnonzero(mutable_mask)
    # The same shift is applied to every row; shape (m,).
    delta_m = (Sigma[np.ix_(m_idx, m_idx)] @ w[m_idx]) / float(lam)

    shifted = X[:, m_idx] + delta_m[None, :]
    if clip_nonneg:
        # Clip only the shifted columns so frozen features really stay as-is.
        shifted = np.clip(shifted, 0.0, None)

    X_tilde = X.copy()
    X_tilde[:, m_idx] = shifted
    return X_tilde


def build_context_stack(
    X: np.ndarray,
    y: np.ndarray,
    X_tilde: np.ndarray,
    feature_cols: list[str],
    add_flag: bool,
    flag_name: str,
) -> pd.DataFrame:
    """Stack the original rows on top of their manipulated copies.

    Args:
        X: Original feature rows.
        y: Labels for the rows of *X*; repeated for the manipulated copies.
        X_tilde: Manipulated feature rows, in the same order as *X*.
        feature_cols: Column names for the feature columns.
        add_flag: Whether to add a 0/1 column marking the manipulated rows.
        flag_name: Name of that column when *add_flag* is true.

    Returns:
        DataFrame with the feature columns, then the optional flag column
        (0 for the original half, 1 for the manipulated half), then the
        label column ``"y"``.
    """
    base_rows = X.shape[0]
    stacked = pd.DataFrame(np.vstack((X, X_tilde)), columns=feature_cols)
    if add_flag:
        # First half are originals (0), second half manipulated copies (1).
        stacked[flag_name] = np.repeat([0, 1], base_rows)
    stacked["y"] = np.concatenate((y, y), axis=0)
    return stacked


# -----------------------------
# Main
# -----------------------------
def main() -> None:
    """Build the strategic-context CSV from the input dataset.

    Reads ``CSV_PATH``, fits a logistic regression on every row, estimates
    the ridge-regularized feature covariance, samples ``N_POS`` positive and
    ``N_NEG`` negative rows, shifts their mutable features with the
    Mahalanobis best response, and writes the original rows stacked with the
    shifted rows to ``OUT_PATH``. A short summary is printed at the end.
    """
    df = pd.read_csv(CSV_PATH)
    label_col = detect_label_column(df)
    features = [name for name in df.columns if name != label_col]

    X_all = df[features].to_numpy(dtype=float)
    labels_all = df[label_col].to_numpy(dtype=int).reshape(-1)

    # Step 1: fit the linear classifier on the full dataset.
    # No standardization: the weights must live in the same raw feature
    # space as the covariance used for the Mahalanobis cost.
    model = LogisticRegression(solver="liblinear", random_state=SEED, max_iter=2000)
    model.fit(X_all, labels_all)
    weights = model.coef_.reshape(-1)

    # Step 2: covariance for the Mahalanobis cost, also from the full dataset.
    cov = compute_covariance(X_all, ridge=COV_RIDGE)

    # Step 3: balanced base context.
    base = sample_balanced(df, label_col, N_POS, N_NEG, SEED)
    X = base[features].to_numpy(dtype=float)
    y = base[label_col].to_numpy(dtype=int).reshape(-1)

    # Step 4: choose which features are frozen.
    mask = make_mutable_mask(X.shape[1], FREEZE_FRAC, SEED)

    # Step 5: best response under the Mahalanobis cost.
    shifted = best_response_mahalanobis(X, weights, cov, mask, LAMBDA_COST, clip_nonneg=CLIP_NONNEG)

    # Step 6: stack originals and manipulated copies, then save.
    df_ctx = build_context_stack(X, y, shifted, features, ADD_FLAG_COLUMN, FLAG_NAME)
    df_ctx.to_csv(OUT_PATH, index=False)

    frozen = int((~mask).sum())
    summary = (
        f"[OK] label_col={label_col}",
        f"[OK] trained logistic regression on ALL data: n={len(df)} d={X_all.shape[1]}",
        f"[OK] context base: {len(base)} rows (pos={(y==1).sum()}, neg={(y==0).sum()})",
        f"[OK] frozen features: {frozen}/{X.shape[1]} ({FREEZE_FRAC*100:.1f}%)",
        f"[OK] saved strategic context: {OUT_PATH} (rows={len(df_ctx)})",
    )
    for line in summary:
        print(line)


# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
