import pandas as pd
import pdb
from scipy import stats
import decision_infovalue
import numpy as np
import json
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.neighbors import NearestNeighbors
import random

# Load the labelled records. The context manager closes the file handle
# deterministically (the bare ``open`` call left that to the garbage
# collector), and the explicit encoding avoids depending on the platform
# default when the reviews contain non-ASCII text.
with open('label.json', encoding="utf-8") as label_file:
    data = json.load(label_file)
df = pd.DataFrame(data)

# Collapse to one row per distinct review text: keep the first actual label
# and gather every predicted label recorded for that text into a list.
# ``reset_index()`` already yields a fresh 0..n-1 index, so no further reset
# is needed.
review_df = df[["review_text", "actual_label", "predicted_label"]]
review_df = review_df.groupby("review_text").agg({"actual_label": "first", "predicted_label": list}).reset_index()
# Keep the first entry (in np.unique's sorted order) that is not the "N.A."
# placeholder. NOTE: raises IndexError if a review only has "N.A." entries.
review_df["predicted_label"] = review_df["predicted_label"].apply(lambda x: [xx for xx in np.unique(x) if xx != "N.A."][0])
review_df["predicted_label"] = review_df["predicted_label"].astype(int)


# TF-IDF (unigrams + bigrams, L2-normalised rows) feeding a linear SVM.
pipe = Pipeline([
    ("tfidf", TfidfVectorizer(
        lowercase=True,
        ngram_range=(1, 2),
        norm="l2"
    )),
    ("clf", LinearSVC(C=1.0))
])

# The pipeline is fitted on, and then asked to predict, the same reviews.
pipe.fit(review_df["review_text"], review_df["actual_label"])

predictions = pipe.predict(review_df["review_text"])

# Expose the fitted steps so per-feature contributions can be computed below.
vectorizer = pipe.named_steps["tfidf"]
svm = pipe.named_steps["clf"]

feature_names = vectorizer.get_feature_names_out()
weights = svm.coef_[0]        # first row of coef_; shape: (num_features,)
bias = svm.intercept_[0]

X = vectorizer.transform(review_df["review_text"])   # sparse matrix (n_docs, n_features)

def top_k_words_per_instance(
    vectorizer,
    svm,
    texts,
    k=10
):
    """
    Returns a list of top-k words/ngrams (by |x_j * w_j|)
    for each document.

    Args:
        vectorizer: Fitted vectorizer exposing ``transform`` and
            ``get_feature_names_out``. ``transform`` must return a sparse
            matrix whose rows expose ``indices`` and ``data``.
        svm: Fitted linear model; only ``svm.coef_[0]`` is read.
        texts: Documents handed to ``vectorizer.transform``.
        k: Maximum number of features reported per document. Documents with
            fewer than ``k`` stored features report all of them; ``k=0``
            yields empty lists.

    Returns:
        One dict per document with the parallel lists ``"word"``,
        ``"contribution"`` (signed x_j * w_j) and ``"abs_contribution"``,
        ordered by decreasing absolute contribution.

    Raises:
        ValueError: If ``k`` is negative.
    """
    if k < 0:
        raise ValueError("k must be non-negative.")

    X = vectorizer.transform(texts)
    feature_names = vectorizer.get_feature_names_out()
    weights = svm.coef_[0]

    results = []

    for i in range(X.shape[0]):
        row = X[i]
        idx = row.indices          # column ids of the stored entries of row i
        contribs = row.data * weights[idx]
        abs_contribs = np.abs(contribs)

        # Reverse first, then truncate. For k >= 1 this equals
        # ``[-k:][::-1]``, but it also gives an empty result for k == 0,
        # where ``[-0:]`` would have selected every feature.
        top_idx = np.argsort(abs_contribs)[::-1][:k]

        results.append({
            "word": [feature_names[idx[j]] for j in top_idx],
            "contribution": [contribs[j] for j in top_idx],
            "abs_contribution": [abs_contribs[j] for j in top_idx],
        })

    return results
# Ten strongest TF-IDF x weight contributions for every review, computed
# with the fitted pipeline steps.
top_10_words = top_k_words_per_instance(
    vectorizer=vectorizer,
    svm=svm,
    texts=review_df["review_text"],
    k=10,
)


def random_sample_k_words(
    df: pd.DataFrame,
    text_col: str = "review_text",
    k: int = 10
) -> pd.Series:
    """
    Returns, for every row, a list of up to k random words from the text column.

    Words are the whitespace-separated tokens of the text and are drawn
    without replacement using the module-level ``random`` state.

    Args:
        df: Frame holding the texts.
        text_col: Name of the column with the text to sample from.
        k: Number of words to draw per row. Rows with fewer than ``k`` words
            return all of their words (in random order).

    Returns:
        A Series aligned with ``df`` whose values are lists of words.
    """
    def _sample_words(text):
        words = text.split()
        # random.sample raises ValueError when asked for more items than the
        # population holds, so cap the request for short texts.
        return random.sample(words, min(k, len(words)))

    return df[text_col].apply(_sample_words)

# Ten randomly drawn words per review.
random_10_words = random_sample_k_words(
    review_df,
    text_col="review_text",
    k=10,
)



def _nearest_other_in_subset(X, subset_rows):
    """Find, for every row of X, the closest row of X[subset_rows] that is not the row itself.

    Returns a pair ``(neighbor_rows, similarities)`` of arrays with one entry
    per row of ``X``; ``neighbor_rows`` holds row positions into ``X``.
    """
    # Two neighbours are requested so that a query row which is itself a
    # member of the subset can fall back to the second hit.
    nn = NearestNeighbors(metric="cosine", algorithm="brute", n_neighbors=2)
    nn.fit(X[subset_rows])
    dists, inds = nn.kneighbors(X, n_neighbors=2, return_distance=True)

    # ``inds`` index into the subset; map them back to row positions of X.
    first = subset_rows[inds[:, 0]]
    second = subset_rows[inds[:, 1]]
    first_is_self = first == np.arange(X.shape[0])

    neighbor_rows = np.where(first_is_self, second, first)
    distances = np.where(first_is_self, dists[:, 1], dists[:, 0])
    # cosine distance = 1 - cosine similarity
    return neighbor_rows, 1.0 - distances


def nearest_pos_and_neg(
    df: pd.DataFrame,
    text_col: str = "review_text",
    label_col: str = "label",   # expects 1=positive, 0=negative (or change below)
    pos_value=1,
    neg_value=0,
    ngram_range=(1, 2),
    min_df=2,
    max_df=0.9,
    max_features=200_000,
):
    """
    Returns df with columns:
      - nn_pos_idx, nn_pos_sim, nn_pos_text
      - nn_neg_idx, nn_neg_sim, nn_neg_text
    Similarity = cosine similarity in TF-IDF space.

    For every row, the nearest *other* row labelled ``pos_value`` and the
    nearest *other* row labelled ``neg_value`` are looked up in a TF-IDF
    space fitted on ``df[text_col]``.

    ``nn_pos_idx`` / ``nn_neg_idx`` are row positions (0-based) into ``df``.
    For a frame with the default 0..n-1 index these coincide with the index
    labels; frames with any other index are handled correctly as well.

    Args:
        df: Input frame; it is not modified.
        text_col: Column holding the texts (converted with ``astype(str)``
            before vectorising).
        label_col: Column holding the class labels.
        pos_value: Label value identifying positive rows.
        neg_value: Label value identifying negative rows.
        ngram_range, min_df, max_df, max_features: Passed through to
            ``TfidfVectorizer``.

    Returns:
        A copy of ``df`` with the six neighbour columns appended.

    Raises:
        ValueError: If either class is empty, or has a single member (that
            member would have no same-class neighbour other than itself).
    """
    texts = df[text_col].astype(str).tolist()

    # TF-IDF
    vec = TfidfVectorizer(
        lowercase=True,
        stop_words="english",
        ngram_range=ngram_range,
        min_df=min_df,
        max_df=max_df,
        max_features=max_features,
    )
    X = vec.fit_transform(texts)  # sparse (n_docs, n_terms)

    # Row positions (not index labels) of each class, because they are used
    # to slice X and are compared against row positions of the queries.
    pos_idx = np.flatnonzero((df[label_col] == pos_value).to_numpy())
    neg_idx = np.flatnonzero((df[label_col] == neg_value).to_numpy())

    if len(pos_idx) == 0 or len(neg_idx) == 0:
        raise ValueError("Need at least one positive and one negative example.")
    if len(pos_idx) < 2 or len(neg_idx) < 2:
        # kneighbors(n_neighbors=2) would fail on a one-row subset with a
        # much less helpful message; fail early instead.
        raise ValueError(
            "Need at least two positive and two negative examples so that "
            "every row has a neighbor other than itself."
        )

    nn_pos_idx, nn_pos_sim = _nearest_other_in_subset(X, pos_idx)
    nn_neg_idx, nn_neg_sim = _nearest_other_in_subset(X, neg_idx)

    out = df.copy()
    # Positional lookup of the neighbour texts (original, unconverted values).
    text_values = out[text_col].to_numpy()

    out["nn_pos_idx"] = nn_pos_idx
    out["nn_pos_sim"] = nn_pos_sim
    out["nn_pos_text"] = text_values[nn_pos_idx]

    out["nn_neg_idx"] = nn_neg_idx
    out["nn_neg_sim"] = nn_neg_sim
    out["nn_neg_text"] = text_values[nn_neg_idx]

    return out

# Attach each review's nearest positive / negative neighbour, using the
# predicted label to define the two classes.
review_df = nearest_pos_and_neg(
    review_df,
    text_col="review_text",
    label_col="predicted_label",
)

# Place the explanation columns next to the review columns and export.
review_df = pd.concat(
    [
        review_df,
        pd.DataFrame(top_10_words),
        pd.DataFrame({"random_10_words": random_10_words}),
    ],
    axis=1,
)
review_df.to_csv("review_model_pred_and_explanation.csv", index=False)