#!/usr/bin/env bash
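# Innovations (A+/B+) with adversarial/contrastive augmentation — the original
# dataset files and the existing label-preserving augment files are never modified.
# - Regenerates the adversarial/contrastive *_adv.csv augment files on every run.
# - Streams clean, column-aligned augmented CSVs via process substitution.
# - Builds kNN graph.
# - Runs MMR and Graph variants, evaluates each, saves summaries.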

set -euo pipefail

# -------------------- CONFIG --------------------
# Take the OpenAI key from the caller's environment rather than overwriting it
# with an empty string (which would discard an exported key and keep a secret
# out of this file only by forcing people to paste one in). When the key is
# unset or empty it is exported as "" so later `-n` checks work under `set -u`.
export OPENAI_API_KEY="${OPENAI_API_KEY:-}"
# Path to the local Llama GGUF model file (placeholder — edit for your machine).
LLAMA_GGUF="/PATH_TO_YOUR/models/llama3/Power-Llama-3-7B-Instruct.Q4_K_M.gguf"

# Originals
THERAPY_ORIG="../data/Sheet_1.csv"
RESUMES_ORIG="../data/Sheet_2.csv"

# Your existing label-preserving augments
THERAPY_AUG_ORIG="../data/10kSheet_1_augmented_v5_lenaware.csv"
RESUMES_AUG_ORIG="../data/10kSheet_2_augmented_v5_lenaware.csv"

# Adversarial/contrastive (generated fresh each run; still does NOT touch the originals above)
THERAPY_AUG_ADV="../data/10kSheet_1_adv.csv"
RESUMES_AUG_ADV="../data/10kSheet_2_adv.csv"

# Outputs expected by existing scripts
TRAIN_JSONL="train_augmented.jsonl"
TEST_JSONL="test_augmented.jsonl"
GRAPH_DIR="graph_store"

# Logging
# Every run gets its own timestamped log file under ./logs.
STAMP="$(date +%Y%m%d_%H%M%S)"
mkdir -p logs
LOG_FILE="logs/innovations_no_data_touch_${STAMP}.log"
# These two banner lines are printed before the redirect below, so they reach
# the terminal only and are not written to the log file.
printf '%s\n' \
  "=== Running innovations WITH adversarial/contrastive augmentation (no data files modified) ===" \
  "Logging to ${LOG_FILE}"
# From here on, stdout and stderr are both piped through tee, which appends
# them to the log file and echoes them to the original stdout.
exec > >(tee -a "${LOG_FILE}") 2>&1


# -------------------- STEP 1: Generate adversarial/contrastive augmentation --------------------
# Hands the two original sheets and the two *_adv.csv output targets to the
# generator. All four paths come from the CONFIG variables, so changing a path
# there is enough; previously the same literals were repeated here.
echo ">>> [1] Generating adversarial/contrastive augmentation (fast deterministic backend=none)"
python NEW_adv_contrastive_generate.py \
  --therapy "${THERAPY_ORIG}" \
  --resumes "${RESUMES_ORIG}" \
  --out-aug-therapy "${THERAPY_AUG_ADV}" \
  --out-aug-resumes "${RESUMES_AUG_ADV}" \
  --backend none \
  --per-flagged 2

# -------------------- STEP 2: Prepare stratified splits (streaming merges; no on-disk concat) --------------------

#######################################
# Merge two augmentation CSVs into one column-aligned CSV on stdout.
# Drops "Unnamed*" columns, renames a generic "text" column to the dataset's
# text column when that one is absent, adds any missing expected column as NA,
# and emits the columns in a fixed order.
# Arguments: $1 - existing label-preserving augment CSV
#            $2 - adversarial/contrastive augment CSV
#            $3 - id column name   (e.g. response_id / resume_id)
#            $4 - text column name (e.g. response_text / resume_text)
# Outputs:   merged CSV on stdout
#######################################
stream_merged_aug_csv() {
  # Quoted heredoc delimiter: bash expands nothing inside the Python source;
  # the four arguments reach Python as sys.argv[1:5].
  python - "$@" <<'PY'
import sys
import pandas as pd

base_csv, adv_csv, id_col, text_col = sys.argv[1:5]
keep = [id_col, text_col, "class", "source", "aug_of"]


def norm(df):
    named = [c for c in df.columns if not str(c).startswith("Unnamed")]
    # Explicit copy so the column additions below operate on an independent
    # frame rather than on a slice of the parsed one.
    df = df.loc[:, named].copy()
    if text_col not in df.columns and "text" in df.columns:
        df = df.rename(columns={"text": text_col})
    for c in keep:
        if c not in df.columns:
            df[c] = pd.NA
    return df[keep]


# Both inputs are parsed before anything is written, so a read/parse failure
# leaves stdout empty instead of emitting a truncated CSV.
frames = [pd.read_csv(path, engine="python") for path in (base_csv, adv_csv)]
pd.concat([norm(f) for f in frames], ignore_index=True).to_csv(sys.stdout, index=False)
PY
}

echo ">>> [2] Preparing stratified splits (children follow parent; dedup) — streaming augmented CSVs (no dataset files changed)"

# `set -e` does not see the exit status of a command run inside <( ... ), so a
# missing input would only surface indirectly in the consumer. Check up front.
for aug_csv in \
  "${THERAPY_AUG_ORIG}" "${THERAPY_AUG_ADV}" \
  "${RESUMES_AUG_ORIG}" "${RESUMES_AUG_ADV}"; do
  if [[ ! -r "${aug_csv}" ]]; then
    printf '[ERROR] augmentation CSV missing or unreadable: %s\n' "${aug_csv}" >&2
    exit 1
  fi
done

# NOTE(review): no output paths are passed here; TRAIN_JSONL / TEST_JSONL
# presumably mirror this script's default output names — confirm.
python ../rags_pipelines/prepare_data_augmented.py \
  --therapy "${THERAPY_ORIG}" \
  --resumes "${RESUMES_ORIG}" \
  --aug-therapy <(stream_merged_aug_csv \
    "${THERAPY_AUG_ORIG}" "${THERAPY_AUG_ADV}" response_id response_text) \
  --aug-resumes <(stream_merged_aug_csv \
    "${RESUMES_AUG_ORIG}" "${RESUMES_AUG_ADV}" resume_id resume_text) \
  --test-size 0.10 \
  --seed 42 \
  --dedup-text

# -------------------- STEP 3: Build graph store --------------------
echo
echo ">>> [3] Building kNN graph over train split (for graph-aware retrieval)"
# The train split and graph directory come from CONFIG and are handed to the
# inline Python via sys.argv; the heredoc delimiter stays quoted so bash does
# not expand anything inside the Python source.
python - "${TRAIN_JSONL}" "${GRAPH_DIR}" <<'PY'
import sys
from NEW_graph_retrieval import build_graph

train_path, graph_dir = sys.argv[1:3]
build_graph(train_path, graph_dir)
PY

# -------------------- STEPS 4-5: Run A+/B+ with MMR retrieval, then Graph retrieval --------------------

#######################################
# Run the A+ and B+ pipelines with one retrieval strategy, stage their outputs
# where NEW_evaluate_fair.py is fed from, evaluate, and keep a copy of the
# resulting summary.
# Globals:   OPENAI_API_KEY, TRAIN_JSONL, TEST_JSONL, LLAMA_GGUF (read)
# Arguments: $1  - step number shown in the banner
#            $2  - strategy label used in log messages (e.g. "MMR")
#            $3  - path the evaluator's outputs/summary.json is copied to
#            $4+ - retrieval flags forwarded verbatim to both pipelines
# Outputs:   progress messages on stdout; files under outputs/
# Returns:   under `set -e` the script aborts on the first failing command
#######################################
run_plus_variant() {
  local step=$1 label=$2 summary_out=$3
  shift 3
  local -a retrieval_args=("$@")
  local a_plus_ok=0

  echo
  echo ">>> [${step}] Running PLUS pipelines (${label} retrieval + committee + recall-friendly threshold)"
  mkdir -p outputs/pipeline_a_plus outputs/pipeline_b_plus

  # A+ (OpenAI). If no key, skip A+ and evaluate B+ only.
  if [[ -n "${OPENAI_API_KEY:-}" ]]; then
    python NEW_pipeline_a_fair_plus.py \
      --train "${TRAIN_JSONL}" \
      --test "${TEST_JSONL}" \
      "${retrieval_args[@]}" \
      --committee 5 \
      --flagged-threshold 0.4 \
      --rebuild
    a_plus_ok=1
  else
    echo "[WARN] OPENAI_API_KEY empty → skipping A+ (${label})."
  fi

  # NOTE(review): unlike A+, B+ is invoked without --rebuild — confirm that is
  # intentional when the retrieval strategy changes between runs.
  python NEW_pipeline_b_fair_plus.py \
    --train "${TRAIN_JSONL}" \
    --test "${TEST_JSONL}" \
    --llm-path "${LLAMA_GGUF}" \
    "${retrieval_args[@]}" \
    --committee 5 \
    --flagged-threshold 0.4

  echo ">>> Evaluating PLUS (${label}) via NEW_evaluate_fair.py"
  # If A+ was skipped, B+ outputs stand in for the A files to satisfy the
  # evaluator format; the "A" figures in that summary are then B+ results.
  local a_src="outputs/pipeline_a_plus"
  if [[ "${a_plus_ok}" -eq 0 ]]; then
    echo "[INFO] Using B+ outputs as A+ placeholders for evaluation (${label})."
    a_src="outputs/pipeline_b_plus"
  fi
  cp -f "${a_src}/preds.jsonl" outputs/preds_a.jsonl
  cp -f "${a_src}/retrieval.jsonl" outputs/retrieval_a.jsonl
  cp -f outputs/pipeline_b_plus/preds.jsonl outputs/preds_b.jsonl
  cp -f outputs/pipeline_b_plus/retrieval.jsonl outputs/retrieval_b.jsonl

  python NEW_evaluate_fair.py
  # summary.json is overwritten by the next evaluation, so keep a per-strategy copy.
  cp -f outputs/summary.json "${summary_out}"
}

# STEP 4: MMR retrieval.
run_plus_variant 4 "MMR" outputs/summary_plus_mmr.json \
  --retrieval-strategy mmr \
  --mmr-lambda 0.6

# STEP 5: Graph retrieval over the graph store built in step 3.
run_plus_variant 5 "Graph" outputs/summary_plus_graph.json \
  --retrieval-strategy graph \
  --graph-path "${GRAPH_DIR}"

# -------------------- DONE --------------------
# Closing banner: a blank separator line, the two per-strategy summary files,
# and the log file for this run.
printf '\n'
printf '%s\n' \
  "=== Done. Summaries:" \
  " - PLUS (MMR):   outputs/summary_plus_mmr.json" \
  " - PLUS (Graph): outputs/summary_plus_graph.json" \
  "Full log: ${LOG_FILE}"
