{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "e3d36013",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "90f4104b",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(41869,)"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "training_df = pd.read_csv(\"run_max/processed.csv\")\n",
    "\n",
    "training_df['seq'].drop_duplicates().shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "75f2cd13",
   "metadata": {},
   "outputs": [],
   "source": [
    "import csv\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "\n",
    "# =========================\n",
    "# 1) Loaders (your approach)\n",
    "# =========================\n",
    "def load_flat_list_of_strs_from_csv(path):\n",
    "    flat = []\n",
    "    with open(path, newline=\"\") as f:\n",
    "        reader = csv.reader(f)\n",
    "        for row in reader:\n",
    "            cleaned = [tok.strip() for tok in row if tok is not None and tok.strip() != \"\"]\n",
    "            flat.extend(cleaned)\n",
    "    return flat\n",
    "\n",
    "def load_flat_list_of_floats_from_csv(path):\n",
    "    flat = []\n",
    "    with open(path, newline=\"\") as f:\n",
    "        reader = csv.reader(f)\n",
    "        for row in reader:\n",
    "            cleaned = [tok.strip() for tok in row if tok is not None and tok.strip() != \"\"]\n",
    "            for tok in cleaned:\n",
    "                try:\n",
    "                    flat.append(float(tok))\n",
    "                except Exception:\n",
    "                    # skip tokens that aren't numeric\n",
    "                    pass\n",
    "    return flat\n",
    "\n",
    "# =================================\n",
    "# 2) Distances (Hamming/Levenshtein)\n",
    "# =================================\n",
    "def _encode_ascii_matrix(seqs):\n",
    "    \"\"\"Return (N, L) uint8 matrix of ASCII codes for same-length sequences.\"\"\"\n",
    "    N = len(seqs)\n",
    "    L = len(seqs[0])\n",
    "    mat = np.empty((N, L), dtype=np.uint8)\n",
    "    for i, s in enumerate(seqs):\n",
    "        mat[i, :] = np.frombuffer(s.encode('ascii', 'strict'), dtype=np.uint8)\n",
    "    return mat\n",
    "\n",
    "def full_hamming_matrix(seqs):\n",
    "    \"\"\"(N,N) pairwise Hamming distances among seqs (same-length).\"\"\"\n",
    "    A = _encode_ascii_matrix(seqs)  # (N, L)\n",
    "    D = (A[:, None, :] != A[None, :, :]).sum(axis=2)\n",
    "    return D\n",
    "\n",
    "def hamming_to_training(gen, train):\n",
    "    \"\"\"(G,T) Hamming distances from generated seqs to training seqs (same-length).\"\"\"\n",
    "    GA = _encode_ascii_matrix(gen)\n",
    "    TA = _encode_ascii_matrix(train)\n",
    "    D = (GA[:, None, :] != TA[None, :, :]).sum(axis=2)\n",
    "    return D\n",
    "\n",
    "def levenshtein(a, b):\n",
    "    \"\"\"Edit distance (DP).\"\"\"\n",
    "    if len(a) > len(b):\n",
    "        a, b = b, a\n",
    "    prev = list(range(len(a) + 1))\n",
    "    for cb in b:\n",
    "        curr = [prev[0] + 1]\n",
    "        for i, ca in enumerate(a, start=1):\n",
    "            ins = prev[i] + 1\n",
    "            delete = curr[i-1] + 1\n",
    "            replace = prev[i-1] + (ca != cb)\n",
    "            curr.append(min(ins, delete, replace))\n",
    "        prev = curr\n",
    "    return prev[-1]\n",
    "\n",
    "def full_lev_matrix(seqs):\n",
    "    N = len(seqs)\n",
    "    D = np.zeros((N, N), dtype=np.int32)\n",
    "    for i in range(N):\n",
    "        for j in range(i+1, N):\n",
    "            d = levenshtein(seqs[i], seqs[j])\n",
    "            D[i, j] = D[j, i] = d\n",
    "    return D\n",
    "\n",
    "def lev_to_training(gen, train):\n",
    "    G, T = len(gen), len(train)\n",
    "    D = np.zeros((G, T), dtype=np.int32)\n",
    "    for i, s in enumerate(gen):\n",
    "        for j, t in enumerate(train):\n",
    "            D[i, j] = levenshtein(s, t)\n",
    "    return D\n",
    "\n",
    "# ======================\n",
    "# 3) Percentile utility\n",
    "# ======================\n",
    "def percentile_of_value(values, x):\n",
    "    \"\"\"\n",
    "    Percentile of x among 'values' in [0,100]. 'values' is treated as a fixed reference set.\n",
    "    Uses the 'mean' method for ties.\n",
    "    \"\"\"\n",
    "    arr = np.asarray(values, dtype=float)\n",
    "    arr = arr[~np.isnan(arr)]\n",
    "    if arr.size == 0:\n",
    "        return np.nan\n",
    "    arr.sort()\n",
    "    left = np.searchsorted(arr, x, side='left')\n",
    "    right = np.searchsorted(arr, x, side='right')\n",
    "    return 100.0 * (left + right) / (2.0 * len(arr))\n",
    "\n",
    "# ==========================\n",
    "# 4) Bootstrap (percentile CI)\n",
    "# ==========================\n",
    "def bootstrap_indices(n, n_boot=2000, rng=None):\n",
    "    if rng is None:\n",
    "        rng = np.random.default_rng(12345)\n",
    "    return rng.integers(0, n, size=(n_boot, n))\n",
    "\n",
    "def percentile_ci_from_samples(samples, alpha=0.05):\n",
    "    \"\"\"Given an array of bootstrap estimates, return (lo, hi) percentile CI.\"\"\"\n",
    "    lo = np.percentile(samples, 100*(alpha/2))\n",
    "    hi = np.percentile(samples, 100*(1 - alpha/2))\n",
    "    return float(lo), float(hi)\n",
    "\n",
    "# ==========================\n",
    "# 5) Motif utilities\n",
    "# ==========================\n",
    "def contains_motif_flags(seqs, motif=\"CACGTG\", case_insensitive=True):\n",
    "    \"\"\"\n",
    "    Boolean array: True if sequence contains 'motif' as a substring (non-overlap not required).\n",
    "    \"\"\"\n",
    "    if case_insensitive:\n",
    "        motif = motif.lower()\n",
    "        return np.array([motif in s.lower() for s in seqs], dtype=bool)\n",
    "    else:\n",
    "        return np.array([motif in s for s in seqs], dtype=bool)\n",
    "\n",
    "# ======================================================\n",
    "# 6) Main: per-run metrics + 95% bootstrap percentile CIs\n",
    "# ======================================================\n",
    "def compute_metrics_with_ci(\n",
    "    training_df,\n",
    "    seqs_csv_path=\"sequences_per_run_GWG.csv\",\n",
    "    energies_csv_path=\"energies_per_run_GWG.csv\",\n",
    "    n_boot=2000,\n",
    "    random_seed=12345,\n",
    "    motif=\"CACGTG\",\n",
    "    verbose=True,\n",
    "):\n",
    "    rng = np.random.default_rng(random_seed)\n",
    "\n",
    "    # --- Load generated data (flattened per your requirement) ---\n",
    "    gen_seqs = load_flat_list_of_strs_from_csv(seqs_csv_path)\n",
    "    gen_y = load_flat_list_of_floats_from_csv(energies_csv_path)\n",
    "\n",
    "    # Align counts\n",
    "    n = min(len(gen_seqs), len(gen_y))\n",
    "    if len(gen_seqs) != len(gen_y) and verbose:\n",
    "        print(f\"[warn] Mismatched counts: {len(gen_seqs)} sequences vs {len(gen_y)} y-values. Truncating to {n}.\")\n",
    "    gen_seqs = gen_seqs[:n]\n",
    "    gen_y = np.asarray(gen_y[:n], dtype=float)\n",
    "\n",
    "    # Training data\n",
    "    train_seqs = training_df['seq'].dropna().drop_duplicates().tolist()\n",
    "    train_pos = training_df.loc[training_df['y_norm'] > 0, 'y_norm'].astype(float).values\n",
    "\n",
    "    if n == 0:\n",
    "        if verbose:\n",
    "            print(\"[error] No generated samples after loading.\")\n",
    "        return {\n",
    "            \"n_generated\": 0,\n",
    "            \"median_fitness\": np.nan, \"median_fitness_ci95\": (np.nan, np.nan),\n",
    "            \"fitness_percentile_vs_train_pos\": np.nan, \"fitness_percentile_vs_train_pos_ci95\": (np.nan, np.nan),\n",
    "            \"diversity_median\": np.nan, \"diversity_median_ci95\": (np.nan, np.nan),\n",
    "            \"novelty_median\": np.nan, \"novelty_median_ci95\": (np.nan, np.nan),\n",
    "            \"motif\": motif, \"motif_pct\": np.nan, \"motif_pct_ci95\": (np.nan, np.nan), \"motif_count\": 0,\n",
    "        }\n",
    "\n",
    "    # --- Decide distance mode; precompute matrices for fast bootstrapping ---\n",
    "    same_length = (\n",
    "        all(len(s) == len(gen_seqs[0]) for s in gen_seqs) and\n",
    "        len(train_seqs) > 0 and all(len(t) == len(train_seqs[0]) for t in train_seqs) and\n",
    "        len(gen_seqs[0]) == len(train_seqs[0])\n",
    "    )\n",
    "\n",
    "    if same_length:\n",
    "        if verbose:\n",
    "            print(\"[info] Using Hamming distances (same-length sequences).\")\n",
    "        D_gg = full_hamming_matrix(gen_seqs)  # (n, n)\n",
    "        D_gt = hamming_to_training(gen_seqs, train_seqs) if len(train_seqs) else None\n",
    "    else:\n",
    "        if verbose:\n",
    "            print(\"[info] Variable sequence lengths → using Levenshtein distances (slower).\")\n",
    "        D_gg = full_lev_matrix(gen_seqs)\n",
    "        D_gt = lev_to_training(gen_seqs, train_seqs) if len(train_seqs) else None\n",
    "\n",
    "    # --- Point estimates ---\n",
    "    median_fitness = float(np.median(gen_y))\n",
    "    fitness_percentile = percentile_of_value(train_pos, median_fitness)\n",
    "\n",
    "    if n >= 2:\n",
    "        iu = np.triu_indices(n, k=1)\n",
    "        diversity_point = float(np.median(D_gg[iu]))\n",
    "    else:\n",
    "        diversity_point = 0.0\n",
    "\n",
    "    if D_gt is not None and D_gt.shape[1] > 0:\n",
    "        novelty_per_seq = D_gt.min(axis=1)  # min distance to any training seq per generated seq\n",
    "        novelty_point = float(np.median(novelty_per_seq))\n",
    "    else:\n",
    "        novelty_per_seq = np.zeros(n, dtype=np.int32)\n",
    "        novelty_point = 0.0\n",
    "\n",
    "    # Motif presence (percent of sequences containing motif)\n",
    "    motif_flags = contains_motif_flags(gen_seqs, motif=motif, case_insensitive=True)  # bool array\n",
    "    motif_count = int(motif_flags.sum())\n",
    "    motif_pct_point = 100.0 * motif_count / n\n",
    "\n",
    "    # --- Bootstrap (sequence-level resampling) ---\n",
    "    boots = bootstrap_indices(n, n_boot=n_boot, rng=rng)\n",
    "\n",
    "    # 1) Median fitness\n",
    "    boot_med = np.array([np.median(gen_y[idxs]) for idxs in boots], dtype=float)\n",
    "    median_fitness_ci95 = percentile_ci_from_samples(boot_med, alpha=0.05)\n",
    "\n",
    "    # 2) Percentile of the median vs training_pos (map each boot median to its percentile)\n",
    "    boot_pct = np.array([percentile_of_value(train_pos, m) for m in boot_med], dtype=float)\n",
    "    fitness_percentile_ci95 = percentile_ci_from_samples(boot_pct, alpha=0.05)\n",
    "\n",
    "    # 3) Diversity: median of pairwise distances within each bootstrap resample\n",
    "    def diversity_from_indices(idxs):\n",
    "        if len(idxs) < 2:\n",
    "            return 0.0\n",
    "        sub = D_gg[np.ix_(idxs, idxs)]\n",
    "        iu = np.triu_indices(len(idxs), k=1)\n",
    "        return float(np.median(sub[iu])) if iu[0].size else 0.0\n",
    "\n",
    "    boot_div = np.array([diversity_from_indices(idxs) for idxs in boots], dtype=float)\n",
    "    diversity_ci95 = percentile_ci_from_samples(boot_div, alpha=0.05)\n",
    "\n",
    "    # 4) Novelty: median of per-seq min distance to training, within each bootstrap resample\n",
    "    def novelty_from_indices(idxs):\n",
    "        if D_gt is None or D_gt.shape[1] == 0:\n",
    "            return 0.0\n",
    "        mins = D_gt[idxs].min(axis=1)\n",
    "        return float(np.median(mins))\n",
    "\n",
    "    boot_nov = np.array([novelty_from_indices(idxs) for idxs in boots], dtype=float)\n",
    "    novelty_ci95 = percentile_ci_from_samples(boot_nov, alpha=0.05)\n",
    "\n",
    "    # 5) Motif%: fraction with motif in each bootstrap resample\n",
    "    def motif_pct_from_indices(idxs):\n",
    "        if len(idxs) == 0:\n",
    "            return 0.0\n",
    "        # mean of flags in the resample → fraction containing motif\n",
    "        return 100.0 * float(motif_flags[idxs].mean())\n",
    "\n",
    "    boot_motif_pct = np.array([motif_pct_from_indices(idxs) for idxs in boots], dtype=float)\n",
    "    motif_pct_ci95 = percentile_ci_from_samples(boot_motif_pct, alpha=0.05)\n",
    "\n",
    "    metrics = {\n",
    "        \"n_generated\": n,\n",
    "\n",
    "        \"median_fitness\": median_fitness,\n",
    "        \"median_fitness_ci95\": tuple(median_fitness_ci95),\n",
    "\n",
    "        \"fitness_percentile_vs_train_pos\": fitness_percentile,\n",
    "        \"fitness_percentile_vs_train_pos_ci95\": tuple(fitness_percentile_ci95),\n",
    "\n",
    "        \"diversity_median\": diversity_point,\n",
    "        \"diversity_median_ci95\": tuple(diversity_ci95),\n",
    "\n",
    "        \"novelty_median\": novelty_point,\n",
    "        \"novelty_median_ci95\": tuple(novelty_ci95),\n",
    "\n",
    "        \"motif\": motif,\n",
    "        \"motif_count\": motif_count,\n",
    "        \"motif_pct\": motif_pct_point,                # percent of sequences containing motif\n",
    "        \"motif_pct_ci95\": tuple(motif_pct_ci95),     # 95% bootstrap percentile CI (percent units)\n",
    "    }\n",
    "\n",
    "    if verbose:\n",
    "        print(\"\\n=== Per-run metrics with 95% bootstrap percentile CIs ===\")\n",
    "        print(f\"Generated samples: {metrics['n_generated']}\")\n",
    "        print(f\"Median fitness: {metrics['median_fitness']:.6g}  \"\n",
    "              f\"[{metrics['median_fitness_ci95'][0]:.6g}, {metrics['median_fitness_ci95'][1]:.6g}]\")\n",
    "        print(f\"Percentile of median fitness vs train y_norm>0: {metrics['fitness_percentile_vs_train_pos']:.2f}th  \"\n",
    "              f\"[{metrics['fitness_percentile_vs_train_pos_ci95'][0]:.2f}, {metrics['fitness_percentile_vs_train_pos_ci95'][1]:.2f}]\")\n",
    "        print(f\"Diversity (median pairwise): {metrics['diversity_median']:.6g}  \"\n",
    "              f\"[{metrics['diversity_median_ci95'][0]:.6g}, {metrics['diversity_median_ci95'][1]:.6g}]\")\n",
    "        print(f\"Novelty (median min dist to training): {metrics['novelty_median']:.6g}  \"\n",
    "              f\"[{metrics['novelty_median_ci95'][0]:.6g}, {metrics['novelty_median_ci95'][1]:.6g}]\")\n",
    "        print(f\"Motif '{metrics['motif']}' present in: {metrics['motif_count']}/{metrics['n_generated']} \"\n",
    "              f\"({metrics['motif_pct']:.2f}%)  \"\n",
    "              f\"[{metrics['motif_pct_ci95'][0]:.2f}%, {metrics['motif_pct_ci95'][1]:.2f}%]\")\n",
    "\n",
    "    return metrics\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "108600e6",
   "metadata": {},
   "source": [
    "**Metrics for Regular Sampling**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "id": "fbf29b46",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[info] Using Hamming distances (same-length sequences).\n",
      "\n",
      "=== Per-run metrics with 95% bootstrap percentile CIs ===\n",
      "Generated samples: 300\n",
      "Median fitness: 10.0382  [9.73903, 10.2013]\n",
      "Percentile of median fitness vs train y_norm>0: 99.78th  [99.77, 99.80]\n",
      "Diversity (median pairwise): 45  [45, 45]\n",
      "Novelty (median min dist to training): 34  [33, 34]\n",
      "Motif 'CACGTG' present in: 223/300 (74.33%)  [69.67%, 79.33%]\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "{'n_generated': 300,\n",
       " 'median_fitness': 10.038164138793945,\n",
       " 'median_fitness_ci95': (9.739029884338379, 10.201268196105957),\n",
       " 'fitness_percentile_vs_train_pos': np.float64(99.7802617751027),\n",
       " 'fitness_percentile_vs_train_pos_ci95': (99.77070793923761,\n",
       "  99.80414636476546),\n",
       " 'diversity_median': 45.0,\n",
       " 'diversity_median_ci95': (45.0, 45.0),\n",
       " 'novelty_median': 34.0,\n",
       " 'novelty_median_ci95': (33.0, 34.0),\n",
       " 'motif': 'CACGTG',\n",
       " 'motif_count': 223,\n",
       " 'motif_pct': 74.33333333333333,\n",
       " 'motif_pct_ci95': (69.66666666666667, 79.33333333333333)}"
      ]
     },
     "execution_count": 33,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Example call (assumes training_df exists):\n",
    "metrics = compute_metrics_with_ci(training_df,\n",
    "                                   \"sequences_per_run_SMC-GWG.csv\",\n",
    "                                   \"energies_per_run_SMC-GWG.csv\",\n",
    "                                   n_boot=500)\n",
    "\n",
    "metrics"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "id": "d1c12227",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[info] Using Hamming distances (same-length sequences).\n",
      "\n",
      "=== Per-run metrics with 95% bootstrap percentile CIs ===\n",
      "Generated samples: 300\n",
      "Median fitness: 2.72041  [1.59933, 5.53428]\n",
      "Percentile of median fitness vs train y_norm>0: 95.54th  [85.24, 98.83]\n",
      "Diversity (median pairwise): 45  [45, 45]\n",
      "Novelty (median min dist to training): 33  [33, 34]\n",
      "Motif 'CACGTG' present in: 116/300 (38.67%)  [33.33%, 44.00%]\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "{'n_generated': 300,\n",
       " 'median_fitness': 2.720405697822571,\n",
       " 'median_fitness_ci95': (1.5993288666009904, 5.534278488159179),\n",
       " 'fitness_percentile_vs_train_pos': np.float64(95.53835865099838),\n",
       " 'fitness_percentile_vs_train_pos_ci95': (85.23502436228146,\n",
       "  98.82965510652527),\n",
       " 'diversity_median': 45.0,\n",
       " 'diversity_median_ci95': (45.0, 45.0),\n",
       " 'novelty_median': 33.0,\n",
       " 'novelty_median_ci95': (33.0, 34.0),\n",
       " 'motif': 'CACGTG',\n",
       " 'motif_count': 116,\n",
       " 'motif_pct': 38.666666666666664,\n",
       " 'motif_pct_ci95': (33.33333333333333, 44.0)}"
      ]
     },
     "execution_count": 34,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Example call (assumes training_df exists):\n",
    "metrics = compute_metrics_with_ci(training_df,\n",
    "                                   \"sequences_per_run_GWG.csv\",\n",
    "                                   \"energies_per_run_GWG.csv\",\n",
    "                                   n_boot=500)\n",
    "\n",
    "metrics"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "id": "d0bfad01",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[info] Using Hamming distances (same-length sequences).\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "=== Per-run metrics with 95% bootstrap percentile CIs ===\n",
      "Generated samples: 300\n",
      "Median fitness: 0.559624  [0.159678, 0.848621]\n",
      "Percentile of median fitness vs train y_norm>0: 42.86th  [12.99, 59.72]\n",
      "Diversity (median pairwise): 45  [45, 45]\n",
      "Novelty (median min dist to training): 33  [33, 33]\n",
      "Motif 'CACGTG' present in: 12/300 (4.00%)  [2.00%, 6.33%]\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "{'n_generated': 300,\n",
       " 'median_fitness': 0.5596240758895874,\n",
       " 'median_fitness_ci95': (0.15967756658792498, 0.8486205965280532),\n",
       " 'fitness_percentile_vs_train_pos': np.float64(42.85850769083787),\n",
       " 'fitness_percentile_vs_train_pos_ci95': (12.993455622432409,\n",
       "  59.72353587465367),\n",
       " 'diversity_median': 45.0,\n",
       " 'diversity_median_ci95': (45.0, 45.0),\n",
       " 'novelty_median': 33.0,\n",
       " 'novelty_median_ci95': (33.0, 33.0),\n",
       " 'motif': 'CACGTG',\n",
       " 'motif_count': 12,\n",
       " 'motif_pct': 4.0,\n",
       " 'motif_pct_ci95': (2.0, 6.333333333333334)}"
      ]
     },
     "execution_count": 35,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Example call (assumes training_df exists):\n",
    "metrics = compute_metrics_with_ci(training_df,\n",
    "                                   \"sequences_per_run_AISAutoTemp-GWG.csv\",\n",
    "                                   \"energies_per_run_AISAutoTemp-GWG.csv\",\n",
    "                                   n_boot=500)\n",
    "\n",
    "metrics"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "id": "c6e058a9",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[info] Using Hamming distances (same-length sequences).\n",
      "\n",
      "=== Per-run metrics with 95% bootstrap percentile CIs ===\n",
      "Generated samples: 300\n",
      "Median fitness: -0.785504  [-1.16965, -0.251399]\n",
      "Percentile of median fitness vs train y_norm>0: 0.00th  [0.00, 0.00]\n",
      "Diversity (median pairwise): 45  [45, 45]\n",
      "Novelty (median min dist to training): 33  [33, 34]\n",
      "Motif 'CACGTG' present in: 39/300 (13.00%)  [9.33%, 16.67%]\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "{'n_generated': 300,\n",
       " 'median_fitness': -0.785504162311554,\n",
       " 'median_fitness_ci95': (-1.1696517392992973, -0.2513994574546814),\n",
       " 'fitness_percentile_vs_train_pos': np.float64(0.0),\n",
       " 'fitness_percentile_vs_train_pos_ci95': (0.0, 0.0),\n",
       " 'diversity_median': 45.0,\n",
       " 'diversity_median_ci95': (45.0, 45.0),\n",
       " 'novelty_median': 33.0,\n",
       " 'novelty_median_ci95': (33.0, 34.0),\n",
       " 'motif': 'CACGTG',\n",
       " 'motif_count': 39,\n",
       " 'motif_pct': 13.0,\n",
       " 'motif_pct_ci95': (9.333333333333334, 16.666666666666664)}"
      ]
     },
     "execution_count": 36,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Example call (assumes training_df exists):\n",
    "metrics = compute_metrics_with_ci(training_df,\n",
    "                                   \"sequences_per_run_PT-GWG.csv\",\n",
    "                                   \"energies_per_run_PT-GWG.csv\",\n",
    "                                   n_boot=500)\n",
    "\n",
    "metrics"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "09946b07",
   "metadata": {},
   "source": [
    "\"CIs for per-run statistics were computed via sequence-level bootstrap (B=500) with percentile intervals (95%); for diversity/novelty we recomputed the statistic on each resample using precomputed distance matrices.\""
   ]
  },
  {
   "cell_type": "markdown",
   "id": "f2787095",
   "metadata": {},
   "source": [
    "**Metrics for Constrained Sampling**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "id": "bc023bd3",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[info] Using Hamming distances (same-length sequences).\n",
      "\n",
      "=== Per-run metrics with 95% bootstrap percentile CIs ===\n",
      "Generated samples: 300\n",
      "Median fitness: 7.41854  [7.03298, 7.62745]\n",
      "Percentile of median fitness vs train y_norm>0: 99.45th  [99.38, 99.47]\n",
      "Diversity (median pairwise): 45  [45, 45]\n",
      "Novelty (median min dist to training): 33  [33, 33]\n",
      "Motif 'CACGTG' present in: 189/300 (63.00%)  [58.16%, 68.00%]\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "{'n_generated': 300,\n",
       " 'median_fitness': 7.418541669845581,\n",
       " 'median_fitness_ci95': (7.03298282623291, 7.627453088760376),\n",
       " 'fitness_percentile_vs_train_pos': np.float64(99.45065443775675),\n",
       " 'fitness_percentile_vs_train_pos_ci95': (99.38377758670106,\n",
       "  99.46976210948696),\n",
       " 'diversity_median': 45.0,\n",
       " 'diversity_median_ci95': (45.0, 45.0),\n",
       " 'novelty_median': 33.0,\n",
       " 'novelty_median_ci95': (33.0, 33.0),\n",
       " 'motif': 'CACGTG',\n",
       " 'motif_count': 189,\n",
       " 'motif_pct': 63.0,\n",
       " 'motif_pct_ci95': (58.15833333333333, 68.0)}"
      ]
     },
     "execution_count": 37,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Example call (assumes training_df exists):\n",
    "metrics = compute_metrics_with_ci(training_df,\n",
    "                                   \"sequences_per_run_SMC-GWG_constrained.csv\",\n",
    "                                   \"energies_per_run_SMC-GWG_constrained.csv\",\n",
    "                                   n_boot=500)\n",
    "\n",
    "metrics"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "id": "c06721f2",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[info] Using Hamming distances (same-length sequences).\n",
      "\n",
      "=== Per-run metrics with 95% bootstrap percentile CIs ===\n",
      "Generated samples: 300\n",
      "Median fitness: 1.92441  [1.50558, 2.58359]\n",
      "Percentile of median fitness vs train y_norm>0: 90.18th  [83.25, 95.03]\n",
      "Diversity (median pairwise): 45  [45, 45]\n",
      "Novelty (median min dist to training): 33  [33, 34]\n",
      "Motif 'CACGTG' present in: 73/300 (24.33%)  [19.33%, 29.33%]\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "{'n_generated': 300,\n",
       " 'median_fitness': 1.924406886100769,\n",
       " 'median_fitness_ci95': (1.5055827498435974, 2.583585551381111),\n",
       " 'fitness_percentile_vs_train_pos': np.float64(90.18343364860992),\n",
       " 'fitness_percentile_vs_train_pos_ci95': (83.25212572847998, 95.0329607337346),\n",
       " 'diversity_median': 45.0,\n",
       " 'diversity_median_ci95': (45.0, 45.0),\n",
       " 'novelty_median': 33.0,\n",
       " 'novelty_median_ci95': (33.0, 34.0),\n",
       " 'motif': 'CACGTG',\n",
       " 'motif_count': 73,\n",
       " 'motif_pct': 24.333333333333332,\n",
       " 'motif_pct_ci95': (19.333333333333332, 29.333333333333332)}"
      ]
     },
     "execution_count": 38,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Example call (assumes training_df exists):\n",
    "metrics = compute_metrics_with_ci(training_df,\n",
    "                                   \"sequences_per_run_PT-GWG_constrained.csv\",\n",
    "                                   \"energies_per_run_PT-GWG_constrained.csv\",\n",
    "                                   n_boot=500)\n",
    "\n",
    "metrics"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "id": "fb9114bc",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[info] Using Hamming distances (same-length sequences).\n",
      "\n",
      "=== Per-run metrics with 95% bootstrap percentile CIs ===\n",
      "Generated samples: 300\n",
      "Median fitness: 2.09394  [0.984293, 3.06341]\n",
      "Percentile of median fitness vs train y_norm>0: 91.89th  [66.36, 96.65]\n",
      "Diversity (median pairwise): 45  [45, 45]\n",
      "Novelty (median min dist to training): 33  [33, 34]\n",
      "Motif 'CACGTG' present in: 94/300 (31.33%)  [26.00%, 37.33%]\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "{'n_generated': 300,\n",
       " 'median_fitness': 2.09393846988678,\n",
       " 'median_fitness_ci95': (0.9842927917838097, 3.0634071826934814),\n",
       " 'fitness_percentile_vs_train_pos': np.float64(91.88879335053024),\n",
       " 'fitness_percentile_vs_train_pos_ci95': (66.35783892232732,\n",
       "  96.64660361134996),\n",
       " 'diversity_median': 45.0,\n",
       " 'diversity_median_ci95': (45.0, 45.0),\n",
       " 'novelty_median': 33.0,\n",
       " 'novelty_median_ci95': (33.0, 34.0),\n",
       " 'motif': 'CACGTG',\n",
       " 'motif_count': 94,\n",
       " 'motif_pct': 31.333333333333332,\n",
       " 'motif_pct_ci95': (26.0, 37.333333333333336)}"
      ]
     },
     "execution_count": 39,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Example call (assumes training_df exists):\n",
    "metrics = compute_metrics_with_ci(training_df,\n",
    "                                   \"sequences_per_run_GWG_constrained.csv\",\n",
    "                                   \"energies_per_run_GWG_constrained.csv\",\n",
    "                                   n_boot=500)\n",
    "\n",
    "metrics"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "992517da",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[info] Using Hamming distances (same-length sequences).\n",
      "\n",
      "=== Per-run metrics with 95% bootstrap percentile CIs ===\n",
      "Generated samples: 300\n",
      "Median fitness: -0.316922  [-0.788986, -0.0969687]\n",
      "Percentile of median fitness vs train y_norm>0: 0.00th  [0.00, 0.00]\n",
      "Diversity (median pairwise): 45  [45, 45]\n",
      "Novelty (median min dist to training): 33  [33, 34]\n",
      "Motif 'CACGTG' present in: 9/300 (3.00%)  [1.33%, 5.00%]\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "{'n_generated': 300,\n",
       " 'median_fitness': -0.31692206859588623,\n",
       " 'median_fitness_ci95': (-0.7889859214425087, -0.09696873724460608),\n",
       " 'fitness_percentile_vs_train_pos': np.float64(0.0),\n",
       " 'fitness_percentile_vs_train_pos_ci95': (0.0, 0.0),\n",
       " 'diversity_median': 45.0,\n",
       " 'diversity_median_ci95': (45.0, 45.0),\n",
       " 'novelty_median': 33.0,\n",
       " 'novelty_median_ci95': (33.0, 34.0),\n",
       " 'motif': 'CACGTG',\n",
       " 'motif_count': 9,\n",
       " 'motif_pct': 3.0,\n",
       " 'motif_pct_ci95': (1.3333333333333335, 5.0)}"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Example call (assumes training_df exists):\n",
    "metrics = compute_metrics_with_ci(training_df,\n",
    "                                   \"sequences_per_run_AISAutoTemp-GWG_constrained.csv\",\n",
    "                                   \"energies_per_run_AISAutoTemp-GWG_constrained.csv\",\n",
    "                                   n_boot=500)\n",
    "\n",
    "metrics"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "torch-gpu-env",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.13.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
