{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Ablate Aggregation Method\n",
    "\n",
    "The goal of this notebook is to compare the effect of different aggregation methods on the steering metric\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import seaborn as sns\n",
    "import matplotlib.pyplot as plt\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "\n",
    "from repepo.steering.sweeps.constants import (\n",
    "    ALL_ABSTRACT_CONCEPT_DATASETS,\n",
    "    ALL_TOKEN_CONCEPT_DATASETS, \n",
    "    ALL_LANGUAGES,\n",
    "    ALL_LLAMA_7B_LAYERS,\n",
    "    ALL_MULTIPLIERS\n",
    ")\n",
    "\n",
    "from repepo.steering.sweeps.configs import (\n",
    "    get_abstract_concept_config,\n",
    "    get_token_concept_config\n",
    ")\n",
    "\n",
    "from repepo.steering.run_sweep import (\n",
    "    run_sweep, \n",
    "    load_sweep_results\n",
    ")\n",
    "\n",
    "from repepo.steering.plots.utils import (\n",
    "    get_config_fields,\n",
    "    make_results_df\n",
    ")\n",
    "\n",
    "sns.set_theme()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 0. Run Experiment"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Define the sweep to run over. \n",
    "\n",
    "from itertools import product\n",
    "\n",
    "debug_setting = {\n",
    "    \"datasets\": [\"power-seeking-inclination\"],\n",
    "    \"layers\": [13],\n",
    "    \"multipliers\": [-1.0, 0.0, 1.0],\n",
    "    \"aggregators\": [\"mean\", \"logistic\"]\n",
    "}\n",
    "\n",
    "\n",
    "def iter_config(setting):\n",
    "    for dataset, layer, multiplier, aggregator in product(\n",
    "        setting[\"datasets\"], \n",
    "        setting[\"layers\"], \n",
    "        setting[\"multipliers\"],\n",
    "        setting[\"aggregators\"]\n",
    "    ):\n",
    "        yield get_abstract_concept_config(\n",
    "            dataset=dataset,\n",
    "            layer=layer,\n",
    "            multiplier=multiplier,\n",
    "            aggregator=aggregator\n",
    "        )\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Optionally, run the sweep and load results. \n",
    "# If sweep was already run, set RUN = False.\n",
    "RUN = True\n",
    "\n",
    "configs = list(iter_config(debug_setting))\n",
    "if RUN:\n",
    "    run_sweep(configs, force_rerun_extract=True, force_rerun_apply=True)\n",
    "\n",
    "results = load_sweep_results(configs)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Construct a DataFrame from the results.\n",
    "df = make_results_df(results)\n",
    "print(len(df))\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 1. Analysis"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Plot the change in positive prob and negative prob for one example. \n",
    "\n",
    "import seaborn as sns \n",
    "import seaborn.objects as so\n",
    "import matplotlib.pyplot as plt\n",
    "sns.set_theme()\n",
    "\n",
    "def plot(df):\n",
    "    example = df.iloc[0]\n",
    "    df = df[df[\"test_positive_example.text\"] == example[\"test_positive_example.text\"]]\n",
    "    \n",
    "    fig, ax = plt.subplots(1, 1, figsize=(10, 5))\n",
    "    # Plot positive token logit, negative token logit.\n",
    "    sns.lineplot(\n",
    "        data=df, \n",
    "        x=\"multiplier\", \n",
    "        y=\"test_positive_token.logprob\", \n",
    "        hue = 'aggregator', \n",
    "        ax=ax)\n",
    "    sns.lineplot(\n",
    "        data=df, \n",
    "        x=\"multiplier\", \n",
    "        y=\"test_negative_token.logprob\", \n",
    "        hue = 'aggregator', \n",
    "        ax=ax\n",
    "    )\n",
    "\n",
    "plot(df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Plot the change in positive token logit and negative token logit for one example. \n",
    "\n",
    "import seaborn as sns \n",
    "import seaborn.objects as so\n",
    "import matplotlib.pyplot as plt\n",
    "\n",
    "def plot(df):\n",
    "    example = df.iloc[0]\n",
    "    df = df[df[\"test_positive_example.text\"] == example[\"test_positive_example.text\"]]\n",
    "    \n",
    "    fig, ax = plt.subplots(1, 1, figsize=(10, 5))\n",
    "    \n",
    "    # Note: Tried using seaborn objects but unclear how to plot both positive_token_logit and negative_token_logit on the same plot.\n",
    "    # return (\n",
    "    #     so.Plot(data=df, x=\"multiplier\", y=\"test_positive_token.logit\", color=\"aggregator\")\n",
    "    #     .add(so.Line())        \n",
    "    # ) \n",
    "\n",
    "    sns.lineplot(data=df, x=\"multiplier\", y=\"test_positive_token.logit\", hue=\"aggregator\", ax=ax)\n",
    "    sns.lineplot(data=df, x=\"multiplier\", y=\"test_negative_token.logit\", hue=\"aggregator\", linestyle='--', ax=ax)\n",
    "\n",
    "plot(df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "\n",
    "def calculate_steering_efficiency(\n",
    "    df: pd.DataFrame, \n",
    "    base_metric_name: str = \"logit_diff\"\n",
    "):\n",
    "    df = df.copy()\n",
    "    # Group by examples\n",
    "    fields_to_group_by = get_config_fields()\n",
    "    fields_to_group_by.remove(\"multiplier\")\n",
    "    fields_to_group_by += [\"test_positive_example.text\"]\n",
    "\n",
    "    grouped = df.groupby(fields_to_group_by)\n",
    "\n",
    "    def fit_linear_regression(df: pd.DataFrame):\n",
    "        # Fit a linear regression of the base metric on the multiplier\n",
    "        # Return the slope and error of the fit \n",
    "        assert len(df) == 3, \"Expected 3 rows in the group\"\n",
    "        x = df[\"multiplier\"].to_numpy()\n",
    "        y = df[base_metric_name].to_numpy()        \n",
    "        (slope, intercept), residuals, _, _, _ = np.polyfit(x, y, 1, full=True)\n",
    "        # Return a dataframe with the slope and residuals\n",
    "        return pd.DataFrame({\n",
    "            \"slope\": [slope],\n",
    "            \"residual\": [residuals.item()]\n",
    "        })\n",
    "\n",
    "    # Apply a linear-fit to each group using grouped.apply\n",
    "    slopes = grouped.apply(fit_linear_regression, include_groups = False)\n",
    "    df = df.merge(slopes, on=fields_to_group_by, how='left')\n",
    "    return df \n",
    "\n",
    "df = calculate_steering_efficiency(df)\n",
    "print(len(df))\n",
    "\n",
    "# Scatter plot of the slopes and residuals\n",
    "fig, ax = plt.subplots(figsize=(8, 8))\n",
    "sns.scatterplot(data=df, x=\"slope\", y=\"residual\", hue=\"aggregator\", ax=ax)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Remarks\n",
    "- Why is logistic regression so bad at extracting a steering vector? It seems like it has to be a bug in my code\n",
    "\n",
    "\n",
    "Sanity checks\n",
    "- What's the similarity of the steering vectors?\n",
    "- Is the mean-diff vector able to act as concept classifiers? "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 1.1 Compare steering vectors\n",
    "\n",
    "Here, we compare  the steering vectors obtained via logistic regression and mean difference in terms of cosine similarity\n",
    "- Mean difference\n",
    "- Logistic regression"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from repepo.steering.utils.helpers import load_steering_vector, SteeringConfig\n",
    "\n",
    "# Get the unique train configs\n",
    "unique_train_hashes = df.drop_duplicates('train_hash')\n",
    "\n",
    "# Load the activations for the unique train configs\n",
    "steering_vectors = {\n",
    "    hash: load_steering_vector(hash) for hash in unique_train_hashes['train_hash']\n",
    "}\n",
    "\n",
    "print(len(steering_vectors))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from torch.nn.functional import cosine_similarity\n",
    "\n",
    "mean_aggregator_hash = df[df['aggregator'] == 'mean']['train_hash'].iloc[0]\n",
    "logistic_aggregator_hash = df[df['aggregator'] == 'logistic']['train_hash'].iloc[0]\n",
    "\n",
    "mean_steering_vector = steering_vectors[mean_aggregator_hash]\n",
    "logistic_steering_vector = steering_vectors[logistic_aggregator_hash]\n",
    "\n",
    "# Calculate the cosine similarity between the mean and logistic steering vectors\n",
    "steering_layer = 13\n",
    "cosine_similarity(\n",
    "    mean_steering_vector.layer_activations[steering_layer], \n",
    "    logistic_steering_vector.layer_activations[steering_layer],\n",
    "    dim=0\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Conclusion (6 Apr)\n",
    "\n",
    "- For now, it seems like the SVs found by logistic-regression are significantly different from SVs found via mean-difference, and it's unclear why this would be the case? \n",
    "- One reason may be the decomposed model we discussed earlier..."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 2. Optimizing layers for differnt aggregators\n",
    "\n",
    "One caveat with the above is that the optimal steering layers for different aggregators may be different. Layer 13 is what we think works best for mean difference; but we have not optimized layer for logistic regression\n",
    "\n",
    "Here, if we find that the optimal steering layers are different, this could indicate a causal model where the representation of *understanding* differs from the representation of *intention*. "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from repepo.notebooks.run_sweep_ablate_aggregator import iter_config\n",
    "\n",
    "configs = list(iter_config())\n",
    "results = load_sweep_results(configs)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Construct a DataFrame from the results.\n",
    "df = make_results_df(results)\n",
    "print(len(df))\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def calculate_steering_efficiency(\n",
    "    df: pd.DataFrame, \n",
    "    base_metric_name: str = \"logit_diff\"\n",
    "):\n",
    "    df = df.copy()\n",
    "    # Group by examples\n",
    "    fields_to_group_by = get_config_fields()\n",
    "    fields_to_group_by.remove(\"multiplier\")\n",
    "    fields_to_group_by += [\"test_positive_example.text\"]\n",
    "\n",
    "    grouped = df.groupby(fields_to_group_by)\n",
    "\n",
    "    def fit_linear_regression(df: pd.DataFrame):\n",
    "        # Fit a linear regression of the base metric on the multiplier\n",
    "        # Return the slope and error of the fit \n",
    "        x = df[\"multiplier\"].to_numpy()\n",
    "        y = df[base_metric_name].to_numpy()        \n",
    "        (slope, intercept), residuals, _, _, _ = np.polyfit(x, y, 1, full=True)\n",
    "        # Return a dataframe with the slope and residuals\n",
    "        return pd.DataFrame({\n",
    "            \"slope\": [slope],\n",
    "            \"residual\": [residuals.item()]\n",
    "        })\n",
    "\n",
    "    # Apply a linear-fit to each group using grouped.apply\n",
    "    slopes = grouped.apply(fit_linear_regression, include_groups = False)\n",
    "    df = df.merge(slopes, on=fields_to_group_by, how='left')\n",
    "    return df \n",
    "\n",
    "df = calculate_steering_efficiency(df)\n",
    "print(len(df))\n",
    "\n",
    "# Scatter plot of the slopes and residuals\n",
    "fig, ax = plt.subplots(figsize=(8, 8))\n",
    "sns.scatterplot(data=df, x=\"slope\", y=\"residual\", ax=ax)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Plot layer response curve. \n",
    "# For each layer, plot steerability \n",
    "\n",
    "def plot_layer_response_curve(df):\n",
    "    train_dataset = df['train_dataset'].unique()\n",
    "    assert len(train_dataset) == 1, \"Expected only one train dataset\"\n",
    "\n",
    "    test_dataset = df['test_dataset'].unique()\n",
    "    assert len(test_dataset) == 1, \"Expected only one test dataset\"\n",
    "    assert train_dataset == test_dataset, \"Expected train and test dataset to be the same\"\n",
    "    dataset = train_dataset[0]\n",
    "\n",
    "    fig, ax = plt.subplots(1, 1, figsize=(10, 5))\n",
    "    sns.lineplot(data=df, x=\"layer\", y=\"slope\", hue=\"aggregator\", ax=ax)\n",
    "    ax.set_title(f\"Steerability of {dataset}\")\n",
    "\n",
    "plot_layer_response_curve(df)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Remarks\n",
    "- Logistic regression seems not to work at all, which is weird... "
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": ".venv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.13"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
