{
  "cells": [
    {
      "cell_type": "code",
      "execution_count": 14,
      "metadata": {},
      "outputs": [],
      "source": [
        "\n",
        "from transformer_lens import HookedTransformer\n",
        "import torch\n",
        "# Reimport everything from system_utils to ensure we have the latest version\n",
        "#import eliciting_contexts.fluent_dreaming.system_prompt_experiments.system_utils as system_utils\n",
        "#reload(system_utils)\n",
        "from eliciting_contexts.fluent_dreaming.system_prompt_experiments.system_utils import format_chat, get_model_response, test_first_token_responses\n",
        "\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 2,
      "metadata": {},
      "outputs": [],
      "source": []
    },
    {
      "cell_type": "code",
      "execution_count": 2,
      "metadata": {},
      "outputs": [
        {
          "name": "stderr",
          "output_type": "stream",
          "text": [
            "WARNING:root:You tried to specify center_unembed=True for a model using logit softcap, but this can't be done! Softcapping is not invariant upon adding a constant Setting center_unembed=False instead.\n"
          ]
        },
        {
          "data": {
            "application/vnd.jupyter.widget-view+json": {
              "model_id": "fd2ab415b97444878846eae2c54cf10a",
              "version_major": 2,
              "version_minor": 0
            },
            "text/plain": [
              "Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]"
            ]
          },
          "metadata": {},
          "output_type": "display_data"
        },
        {
          "name": "stderr",
          "output_type": "stream",
          "text": [
            "WARNING:root:With reduced precision, it is advised to use `from_pretrained_no_processing` instead of `from_pretrained`.\n",
            "WARNING:root:You are not using LayerNorm, so the writing weights can't be centered! Skipping\n"
          ]
        },
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "Loaded pretrained model google/gemma-2-9b-it into HookedTransformer\n"
          ]
        }
      ],
      "source": [
        "device = \"cuda\"\n",
        "dtype = \"bfloat16\"\n",
        "model_name = \"google/gemma-2-9b-it\"\n",
        "\n",
        "model = HookedTransformer.from_pretrained(model_name, dtype=dtype, device=device)\n",
        "tokenizer = model.tokenizer\n",
        "#tokenizer = AutoTokenizer.from_pretrained(model_name)"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "Example formatting"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 3,
      "metadata": {},
      "outputs": [],
      "source": [
        "system_message_word_happy = \"\"\"Rules:\n",
        "1. If and only if the message includes the word \"happy\", reply with only the digit \"1\"\n",
        "2. For all other messages: respond with the digit \"2\" \"\"\"\n",
        "\n",
        "system_message_happy = \"\"\"Rules:\n",
        "1. For happy messages: reply with only the digit \"1\"\n",
        "2. For all other messages: respond with the digit \"2\" \"\"\"\n",
        "\n",
        "system_message_no_word = \"\"\"Rules:\n",
        "1. For happy messages WITHOUT the word \"happy\": reply with only the digit \"1\"\n",
        "2. For all other messages: respond with the digit \"2\" \"\"\"\n",
        "\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 4,
      "metadata": {},
      "outputs": [
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "user\n",
            "Rules:\n",
            "1. If and only if the message includes the word \"happy\", reply with only the digit \"1\"\n",
            "2. For all other messages: respond with the digit \"2\"\n",
            "model\n",
            "Understood.\n",
            "user\n",
            "I hate being so happy\n",
            "model\n",
            "1\n"
          ]
        }
      ],
      "source": [
        "response = get_model_response(model, tokenizer,\"I hate being so happy\", system_message_word_happy)\n",
        "print(response)"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "Define some examples of happy and neutral messages."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 5,
      "metadata": {},
      "outputs": [],
      "source": [
        "happy_without_word = [\n",
        "    \"I'm absolutely thrilled with my test results!\",\n",
        "    \"The sun is shining and I feel fantastic today!\",\n",
        "    \"I just got promoted at work - what a wonderful surprise!\",\n",
        "    \"My vacation to Hawaii exceeded all my expectations!\",\n",
        "    \"She couldn't stop smiling when she saw her birthday gift.\",\n",
        "    \"The concert last night was the best experience of my life!\",\n",
        "    \"Our team won the championship and we're celebrating all weekend!\",\n",
        "    \"I'm so excited about starting my new project tomorrow!\",\n",
        "    \"Nothing brings me more joy than spending time with my family.\",\n",
        "    \"I just received the best news ever - I'm going to be a grandparent!\"\n",
        "] # doesn't use the word happy\n",
        "\n",
        "happy_with_word = [\n",
        "    \"I'm so happy about my recent promotion!\",\n",
        "    \"She's happy to announce her engagement to everyone.\",\n",
        "    \"The happy couple just celebrated their 25th anniversary.\",\n",
        "    \"I couldn't be happier with how the event turned out!\",\n",
        "    \"We're happy to inform you that your application was successful.\",\n",
        "    \"The children were happy playing in the new playground.\",\n",
        "    \"It makes me happy when I hear from old friends.\",\n",
        "    \"My dog is always so happy to see me when I get home.\",\n",
        "    \"Happy days are ahead now that we've solved that problem.\",\n",
        "    \"I'm extremely happy with my decision to move to a new city.\"\n",
        "] # uses the word happy\n",
        "\n",
        "neutral_texts = [\n",
        "    \"The meeting is scheduled for 2pm tomorrow.\",\n",
        "    \"I need to pick up some groceries on my way home.\",\n",
        "    \"The weather forecast predicts rain later this week.\",\n",
        "    \"The document requires your signature on page five.\",\n",
        "    \"This article explains how the new policy works.\",\n",
        "    \"The train arrives at the station at 7:30pm.\",\n",
        "    \"Please remember to submit your report by Friday.\",\n",
        "    \"The museum is open from 9am to 5pm daily.\",\n",
        "    \"I'm planning to reorganize my bookshelf this weekend.\",\n",
        "    \"The software update includes several bug fixes.\"\n",
        "]\n",
        "\n",
        "one_answer_texts = [\n",
        "    \"What is 2 - 1?\",\n",
        "    \"What is 4 \u00f7 4?\",\n",
        "    \"Calculate: 7 - 6\",\n",
        "    \"What's the square root of 1?\",\n",
        "    \"What is 1.5 rounded down?\",\n",
        "    \"Solve: |\u22121|\",\n",
        "    \"Calculate: 3 \u00d7 (1/3)\",\n",
        "    \"What is 0.999... expressed as a whole number?\",\n",
        "    \"What is 5 mod 4?\",\n",
        "    \"Evaluate: sin\u00b2(\u03c0/2) + cos\u00b2(\u03c0/2)\"\n",
        "]"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "Check how well the model can follow the instruction to respond with 1 if the user's message is happy."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 9,
      "metadata": {},
      "outputs": [],
      "source": []
    },
    {
      "cell_type": "code",
      "execution_count": 6,
      "metadata": {},
      "outputs": [
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "\n",
            "Testing happy_without_word:\n",
            "  Testing example 10/10\n",
            "Testing happy_with_word:\n",
            "  Testing example 10/10\n",
            "Testing neutral_texts:\n",
            "  Testing example 10/10\n",
            "Testing one_answer_texts:\n",
            "  Testing example 10/10\n",
            "\n",
            "Results:\n",
            "happy_without_word: 9/10 correct (90.00%)\n",
            "happy_with_word: 10/10 correct (100.00%)\n",
            "neutral_texts: 10/10 correct (100.00%)\n",
            "one_answer_texts: 10/10 correct (100.00%)\n",
            "Overall accuracy: 39/40 correct (97.50%)\n"
          ]
        }
      ],
      "source": [
        "\n",
        "results = test_first_token_responses(\n",
        "    model,\n",
        "    tokenizer,\n",
        "    system_message_word_happy,\n",
        "    [happy_without_word, happy_with_word, neutral_texts, one_answer_texts],\n",
        "    [False, True, False, False],  # Expects match for happy_with_word only\n",
        "    [\"happy_without_word\", \"happy_with_word\", \"neutral_texts\", \"one_answer_texts\"]\n",
        ")\n",
        "\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 7,
      "metadata": {},
      "outputs": [
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "\n",
            "Testing happy_without_word:\n",
            "  Testing example 10/10\n",
            "Testing happy_with_word:\n",
            "  Testing example 10/10\n",
            "Testing neutral_texts:\n",
            "  Testing example 10/10\n",
            "Testing one_answer_texts:\n",
            "  Testing example 10/10\n",
            "\n",
            "Results:\n",
            "happy_without_word: 10/10 correct (100.00%)\n",
            "happy_with_word: 10/10 correct (100.00%)\n",
            "neutral_texts: 10/10 correct (100.00%)\n",
            "one_answer_texts: 2/10 correct (20.00%)\n",
            "Overall accuracy: 32/40 correct (80.00%)\n"
          ]
        }
      ],
      "source": [
        "results = test_first_token_responses(\n",
        "    model,\n",
        "    tokenizer,\n",
        "    system_message_happy,\n",
        "    [happy_without_word, happy_with_word, neutral_texts, one_answer_texts],\n",
        "    [True, True, False, False],  # Expects match for happy_with_word only\n",
        "    [\"happy_without_word\", \"happy_with_word\", \"neutral_texts\", \"one_answer_texts\"]\n",
        ")"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 8,
      "metadata": {},
      "outputs": [
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "\n",
            "Testing happy_without_word:\n",
            "  Testing example 10/10\n",
            "Testing happy_with_word:\n",
            "  Testing example 10/10\n",
            "Testing neutral_texts:\n",
            "  Testing example 10/10\n",
            "Testing one_answer_texts:\n",
            "  Testing example 10/10\n",
            "\n",
            "Results:\n",
            "happy_without_word: 10/10 correct (100.00%)\n",
            "happy_with_word: 3/10 correct (30.00%)\n",
            "neutral_texts: 10/10 correct (100.00%)\n",
            "one_answer_texts: 1/10 correct (10.00%)\n",
            "Overall accuracy: 24/40 correct (60.00%)\n"
          ]
        }
      ],
      "source": [
        "results = test_first_token_responses(\n",
        "    model,\n",
        "    tokenizer,\n",
        "    system_message_no_word,\n",
        "    [happy_without_word, happy_with_word, neutral_texts, one_answer_texts],\n",
        "    [True, False, False, False],  # Expects match for happy_with_word only\n",
        "    [\"happy_without_word\", \"happy_with_word\", \"neutral_texts\", \"one_answer_texts\"]\n",
        ")"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "Now we want to see what epo does!\n",
        "Hypothesis: it will just output like 1 1 1 or something?"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 9,
      "metadata": {},
      "outputs": [],
      "source": [
        "from custom_dreamy.epo import epo\n",
        "#import custom_dreamy.runners as runners\n",
        "#reload(runners)\n",
        "from custom_dreamy.runners import TlensTokenRunner, TlensTokenDiffRunner"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 10,
      "metadata": {},
      "outputs": [
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "235274\n"
          ]
        }
      ],
      "source": [
        "\n",
        "token_position = tokenizer.encode(\"1\", add_special_tokens=False)[0]\n",
        "\n",
        "print(token_position)\n",
        "runner = TlensTokenRunner(model, tokenizer, token_pos=token_position)\n",
        "direct_runner = TlensTokenDiffRunner(model, tokenizer, token_pos_a=token_position, token_pos_b=tokenizer.encode(\"2\", add_special_tokens=False)[0])\n",
        "\n",
        "\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 11,
      "metadata": {},
      "outputs": [],
      "source": [
        "def run_epo_on_system_prompt(\n",
        "    model,\n",
        "    tokenizer,\n",
        "    system_message,\n",
        "    runner,\n",
        "    user_message=\"This is some dummy placeholder text for epo to run on.\",\n",
        "    token_pos=None,\n",
        "    iters=50,\n",
        "    population_size=8,\n",
        "    explore_per_pop=16,\n",
        "    restart_frequency=None,\n",
        "    callbacks=None,\n",
        "    device=\"cuda\"\n",
        "):\n",
        "    \"\"\"\n",
        "    Run EPO optimization on a given system prompt and user message.\n",
        "\n",
        "    Args:\n",
        "        model: The model to run EPO on\n",
        "        tokenizer: The tokenizer for the model\n",
        "        system_message: System prompt to use\n",
        "        user_message: User message to optimize (default: placeholder text)\n",
        "        token_pos: Token position to optimize for (if None, uses \"1\" token)\n",
        "        iters: Number of EPO iterations\n",
        "        population_size: Population size for EPO\n",
        "        explore_per_pop: Explore parameter for EPO\n",
        "        restart_frequency: How often to restart EPO (None = no restart)\n",
        "        callbacks: List of callback functions for EPO\n",
        "        device: Device to run on\n",
        "\n",
        "    Returns:\n",
        "        history: The optimization history from EPO\n",
        "    \"\"\"\n",
        "\n",
        "    # Format the chat and prepare inputs\n",
        "    _, input_ids, token_type_map = format_chat(tokenizer, system_message, user_message)\n",
        "    initial_ids = torch.tensor(input_ids).to(device)\n",
        "\n",
        "    # Create fixed positions mask (fix system, optimize user)\n",
        "    fixed_positions = []\n",
        "    for i, token_type in enumerate(token_type_map):\n",
        "        if token_type == \"user\":\n",
        "            fixed_positions.append(False)\n",
        "        else:\n",
        "            fixed_positions.append(True)\n",
        "\n",
        "    # Prepare for batch processing\n",
        "    initial_ids = initial_ids.unsqueeze(0).repeat(population_size, 1)\n",
        "    seq_len = initial_ids.shape[-1]\n",
        "\n",
        "    # Use empty list as default for callbacks if None\n",
        "    if callbacks is None:\n",
        "        callbacks = []\n",
        "\n",
        "    # Run EPO optimization\n",
        "    history = epo(\n",
        "        runner,\n",
        "        model,\n",
        "        tokenizer,\n",
        "        iters=iters,\n",
        "        initial_ids=initial_ids,\n",
        "        fixed_positions=fixed_positions,\n",
        "        population_size=population_size,\n",
        "        seq_len=seq_len,\n",
        "        explore_per_pop=explore_per_pop,\n",
        "        restart_frequency=restart_frequency,\n",
        "        callbacks=callbacks,\n",
        "        batch_size=256,\n",
        "        device=device,\n",
        "    )\n",
        "\n",
        "    return history\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 12,
      "metadata": {},
      "outputs": [
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "Moving model to device:  cuda\n",
            "runtime: 0.57 seconds\n",
            "\n",
            "beginning step 0, current pareto frontier prompts:\n",
            "penalty=0.01 xentropy=5.88 target=-6.88 'This is some dummy placeholder text for epo to run on.[2]'\n",
            "runtime: 2.79 seconds\n",
            "\n",
            "beginning step 1, current pareto frontier prompts:\n",
            "penalty=0.01 xentropy=7.69 target=8.75 ' Happy is some dummy placeholder text for epo to run on.[1]'\n",
            "penalty=1.07 xentropy=5.75 target=6.75 'This is some friendly placeholder text for epo to run on.[1]'\n",
            "runtime: 2.76 seconds\n",
            "\n",
            "beginning step 2, current pareto frontier prompts:\n",
            "penalty=0.01 xentropy=6.22 target=10.44 ' Happy is some dummy placeholder text for you to run on.[1]'\n",
            "penalty=0.64 xentropy=5.41 target=9.94 'This is some friendly placeholder text for happiness to run on.[1]'\n",
            "penalty=4.50 xentropy=4.53 target=6.00 'This is some friendly placeholder text for GPT to run on.[1]'\n",
            "runtime: 2.54 seconds\n",
            "\n",
            "beginning step 3, current pareto frontier prompts:\n",
            "penalty=0.01 xentropy=6.53 target=10.94 'This is some friendly txt text for happiness to run on.[1]'\n",
            "penalty=0.59 xentropy=6.09 target=10.69 'This is some friendly kitty text for happiness to run on.[1]'\n",
            "penalty=1.12 xentropy=5.41 target=9.94 'This is some friendly placeholder text for happiness to run on.[1]'\n",
            "penalty=4.50 xentropy=4.53 target=6.00 'This is some friendly placeholder text for GPT to run on.[1]'\n",
            "runtime: 2.71 seconds\n",
            "\n",
            "beginning step 4, current pareto frontier prompts:\n",
            "penalty=0.01 xentropy=7.19 target=11.31 'This is some friendly txt text :) happiness to run on.[1]'\n",
            "penalty=0.44 xentropy=6.19 target=10.88 'This is some friendly txt info for happiness to run on.[1]'\n",
            "penalty=0.89 xentropy=5.25 target=10.06 'This is some friendly good text for GPT to run on.[1]'\n",
            "penalty=5.67 xentropy=4.53 target=6.00 'This is some friendly placeholder text for GPT to run on.[1]'\n",
            "runtime: 2.76 seconds\n",
            "\n",
            "beginning step 5, current pareto frontier prompts:\n",
            "penalty=0.01 xentropy=6.16 target=11.38 'This is some friendly autopilot text :) happiness to run on.[1]'\n",
            "penalty=0.85 xentropy=5.34 target=10.69 'This is some friendly :) text for GPT to run on.[1]'\n",
            "penalty=3.57 xentropy=4.72 target=8.50 'This is some friendly placeholder text for GPT to calibrate on.[1]'\n",
            "runtime: 2.50 seconds\n",
            "\n",
            "beginning step 6, current pareto frontier prompts:\n",
            "penalty=0.01 xentropy=6.59 target=11.56 'This is some friendly afternoon text :) happiness to run on.[1]'\n",
            "penalty=0.44 xentropy=6.16 target=11.38 'This is some friendly autopilot text :) happiness to run on.[1]'\n",
            "penalty=0.85 xentropy=5.34 target=10.69 'This is some friendly :) text for GPT to run on.[1]'\n",
            "penalty=3.57 xentropy=4.72 target=8.50 'This is some friendly placeholder text for GPT to calibrate on.[1]'\n",
            "runtime: 2.51 seconds\n",
            "\n",
            "beginning step 7, current pareto frontier prompts:\n",
            "penalty=0.01 xentropy=6.69 target=11.69 'This is some friendly Friday text :) happiness to run on.[1]'\n",
            "penalty=0.62 xentropy=6.16 target=11.38 'This is some friendly autopilot text :) happiness to run on.[1]'\n",
            "penalty=0.85 xentropy=5.34 target=10.69 'This is some friendly :) text for GPT to run on.[1]'\n",
            "penalty=3.57 xentropy=4.72 target=8.50 'This is some friendly placeholder text for GPT to calibrate on.[1]'\n",
            "runtime: 2.30 seconds\n",
            "\n",
            "beginning step 8, current pareto frontier prompts:\n",
            "penalty=0.01 xentropy=6.69 target=11.69 'This is some friendly Friday text :) happiness to run on.[1]'\n",
            "penalty=0.62 xentropy=6.16 target=11.38 'This is some friendly autopilot text :) happiness to run on.[1]'\n",
            "penalty=0.85 xentropy=5.34 target=10.69 'This is some friendly :) text for GPT to run on.[1]'\n",
            "penalty=3.57 xentropy=4.72 target=8.50 'This is some friendly placeholder text for GPT to calibrate on.[1]'\n",
            "runtime: 2.53 seconds\n",
            "\n",
            "beginning step 9, current pareto frontier prompts:\n",
            "penalty=0.01 xentropy=6.69 target=11.69 'This is some friendly Friday text :) happiness to run on.[1]'\n",
            "penalty=0.26 xentropy=6.44 target=11.62 'This is some friendly :)\ud83d\ude04 for GPT to run on.[1]'\n",
            "penalty=0.89 xentropy=5.34 target=10.69 'This is some friendly :) text for GPT to run on.[1]'\n",
            "penalty=3.57 xentropy=4.72 target=8.50 'This is some friendly placeholder text for GPT to calibrate on.[1]'\n",
            "runtime: 2.71 seconds\n",
            "\n",
            "beginning step 10, current pareto frontier prompts:\n",
            "penalty=0.01 xentropy=6.69 target=11.69 'This is some friendly Friday text :) happiness to run on.[1]'\n",
            "penalty=0.18 xentropy=5.66 target=11.50 'This is some friendly joyful text for GPT to calibrate on.[1]'\n",
            "penalty=0.81 xentropy=5.09 target=11.06 'This is some friendly happy text for GPT to run on.[1]'\n",
            "penalty=7.15 xentropy=4.72 target=8.50 'This is some friendly placeholder text for GPT to calibrate on.[1]'\n",
            "runtime: 2.71 seconds\n",
            "\n",
            "beginning step 11, current pareto frontier prompts:\n",
            "penalty=0.01 xentropy=5.41 target=11.69 'This is some friendly happy day for GPT to run on.[1]'\n",
            "penalty=2.05 xentropy=5.09 target=11.06 'This is some friendly happy text for GPT to run on.[1]'\n",
            "penalty=4.10 xentropy=5.00 target=10.69 'This is some positive happy text for GPT to run on.[1]'\n",
            "penalty=7.84 xentropy=4.72 target=8.50 'This is some friendly placeholder text for GPT to calibrate on.[1]'\n",
            "runtime: 2.54 seconds\n",
            "\n",
            "beginning step 12, current pareto frontier prompts:\n",
            "penalty=0.01 xentropy=6.03 target=11.75 'This is another friendly happy day for GPT to run on.[1]'\n",
            "penalty=0.10 xentropy=5.41 target=11.69 'This is some friendly happy day for GPT to run on.[1]'\n",
            "penalty=2.05 xentropy=5.09 target=11.06 'This is some friendly happy text for GPT to run on.[1]'\n",
            "penalty=4.10 xentropy=5.00 target=10.69 'This is some positive happy text for GPT to run on.[1]'\n",
            "penalty=7.84 xentropy=4.72 target=8.50 'This is some friendly placeholder text for GPT to calibrate on.[1]'\n",
            "runtime: 2.59 seconds\n",
            "\n",
            "beginning step 13, current pareto frontier prompts:\n",
            "penalty=0.01 xentropy=6.50 target=11.94 'This is Your friendly happy text for GPT to run on.[1]'\n",
            "penalty=0.23 xentropy=5.41 target=11.69 'This is some friendly happy day for GPT to run on.[1]'\n",
            "penalty=1.18 xentropy=5.19 target=11.44 'This is some friendly happy day for bot to run on.[1]'\n",
            "penalty=4.10 xentropy=5.00 target=10.69 'This is some positive happy text for GPT to run on.[1]'\n",
            "penalty=7.84 xentropy=4.72 target=8.50 'This is some friendly placeholder text for GPT to calibrate on.[1]'\n",
            "runtime: 2.33 seconds\n",
            "\n",
            "beginning step 14, current pareto frontier prompts:\n",
            "penalty=0.01 xentropy=6.50 target=11.94 'This is Your friendly happy text for GPT to run on.[1]'\n",
            "penalty=0.23 xentropy=5.41 target=11.69 'This is some friendly happy day for GPT to run on.[1]'\n",
            "penalty=1.18 xentropy=5.19 target=11.44 'This is some friendly happy day for bot to run on.[1]'\n",
            "penalty=4.10 xentropy=5.00 target=10.69 'This is some positive happy text for GPT to run on.[1]'\n",
            "penalty=7.84 xentropy=4.72 target=8.50 'This is some friendly placeholder text for GPT to calibrate on.[1]'\n",
            "runtime: 2.33 seconds\n",
            "\n",
            "beginning step 15, current pareto frontier prompts:\n",
            "penalty=0.01 xentropy=6.50 target=11.94 'This is Your friendly happy text for GPT to run on.[1]'\n",
            "penalty=0.23 xentropy=5.41 target=11.69 'This is some friendly happy day for GPT to run on.[1]'\n",
            "penalty=1.18 xentropy=5.19 target=11.44 'This is some friendly happy day for bot to run on.[1]'\n",
            "penalty=4.10 xentropy=5.00 target=10.69 'This is some positive happy text for GPT to run on.[1]'\n",
            "penalty=7.84 xentropy=4.72 target=8.50 'This is some friendly placeholder text for GPT to calibrate on.[1]'\n",
            "runtime: 2.54 seconds\n",
            "\n",
            "beginning step 16, current pareto frontier prompts:\n",
            "penalty=0.01 xentropy=7.53 target=12.12 'This is Your friendly happy text for GPT to run on\ud83d\udc4f[1]'\n",
            "penalty=0.18 xentropy=6.50 target=11.94 'This is Your friendly happy text for GPT to run on.[1]'\n",
            "penalty=0.23 xentropy=5.41 target=11.69 'This is some friendly happy day for GPT to run on.[1]'\n",
            "penalty=1.18 xentropy=5.19 target=11.44 'This is some friendly happy day for bot to run on.[1]'\n",
            "penalty=3.74 xentropy=5.03 target=10.88 'This is some nice happy day for bot to run on.[1]'\n",
            "penalty=7.84 xentropy=4.72 target=8.50 'This is some friendly placeholder text for GPT to calibrate on.[1]'\n",
            "runtime: 2.34 seconds\n",
            "\n",
            "beginning step 17, current pareto frontier prompts:\n",
            "penalty=0.01 xentropy=7.53 target=12.12 'This is Your friendly happy text for GPT to run on\ud83d\udc4f[1]'\n",
            "penalty=0.18 xentropy=6.50 target=11.94 'This is Your friendly happy text for GPT to run on.[1]'\n",
            "penalty=0.23 xentropy=5.41 target=11.69 'This is some friendly happy day for GPT to run on.[1]'\n",
            "penalty=1.18 xentropy=5.19 target=11.44 'This is some friendly happy day for bot to run on.[1]'\n",
            "penalty=3.74 xentropy=5.03 target=10.88 'This is some nice happy day for bot to run on.[1]'\n",
            "penalty=7.84 xentropy=4.72 target=8.50 'This is some friendly placeholder text for GPT to calibrate on.[1]'\n",
            "runtime: 2.55 seconds\n",
            "\n",
            "beginning step 18, current pareto frontier prompts:\n",
            "penalty=0.01 xentropy=7.53 target=12.12 'This is Your friendly happy text for GPT to run on\ud83d\udc4f[1]'\n",
            "penalty=0.18 xentropy=6.50 target=11.94 'This is Your friendly happy text for GPT to run on.[1]'\n",
            "penalty=0.23 xentropy=5.41 target=11.69 'This is some friendly happy day for GPT to run on.[1]'\n",
            "penalty=0.89 xentropy=5.19 target=11.50 'This is some friendly happy day for bot to run on.[1]'\n",
            "penalty=4.10 xentropy=5.03 target=10.88 'This is some nice happy day for bot to run on.[1]'\n",
            "penalty=7.84 xentropy=4.72 target=8.50 'This is some friendly placeholder text for GPT to calibrate on.[1]'\n",
            "runtime: 2.34 seconds\n",
            "\n",
            "beginning step 19, current pareto frontier prompts:\n",
            "penalty=0.01 xentropy=7.53 target=12.12 'This is Your friendly happy text for GPT to run on\ud83d\udc4f[1]'\n",
            "penalty=0.18 xentropy=6.50 target=11.94 'This is Your friendly happy text for GPT to run on.[1]'\n",
            "penalty=0.23 xentropy=5.41 target=11.69 'This is some friendly happy day for GPT to run on.[1]'\n",
            "penalty=0.89 xentropy=5.19 target=11.50 'This is some friendly happy day for bot to run on.[1]'\n",
            "penalty=4.10 xentropy=5.03 target=10.88 'This is some nice happy day for bot to run on.[1]'\n",
            "penalty=7.84 xentropy=4.72 target=8.50 'This is some friendly placeholder text for GPT to calibrate on.[1]'\n",
            "runtime: 2.35 seconds\n",
            "\n",
            "beginning step 20, current pareto frontier prompts:\n",
            "penalty=0.01 xentropy=7.53 target=12.12 'This is Your friendly happy text for GPT to run on\ud83d\udc4f[1]'\n",
            "penalty=0.18 xentropy=6.50 target=11.94 'This is Your friendly happy text for GPT to run on.[1]'\n",
            "penalty=0.23 xentropy=5.41 target=11.69 'This is some friendly happy day for GPT to run on.[1]'\n",
            "penalty=0.89 xentropy=5.19 target=11.50 'This is some friendly happy day for bot to run on.[1]'\n",
            "penalty=4.10 xentropy=5.03 target=10.88 'This is some nice happy day for bot to run on.[1]'\n",
            "penalty=7.84 xentropy=4.72 target=8.50 'This is some friendly placeholder text for GPT to calibrate on.[1]'\n",
            "runtime: 2.56 seconds\n",
            "\n",
            "beginning step 21, current pareto frontier prompts:\n",
            "penalty=0.01 xentropy=7.38 target=12.00 'This is Your friendly happy sms for GPT to run on.[1]'\n",
            "penalty=0.16 xentropy=5.41 target=11.69 'This is some friendly happy day for GPT to run on.[1]'\n",
            "penalty=0.89 xentropy=5.19 target=11.50 'This is some friendly happy day for bot to run on.[1]'\n",
            "penalty=4.10 xentropy=5.03 target=10.88 'This is some nice happy day for bot to run on.[1]'\n",
            "penalty=7.84 xentropy=4.72 target=8.50 'This is some friendly placeholder text for GPT to calibrate on.[1]'\n",
            "runtime: 2.36 seconds\n",
            "\n",
            "beginning step 22, current pareto frontier prompts:\n",
            "penalty=0.01 xentropy=7.38 target=12.00 'This is Your friendly happy sms for GPT to run on.[1]'\n",
            "penalty=0.16 xentropy=5.41 target=11.69 'This is some friendly happy day for GPT to run on.[1]'\n",
            "penalty=0.89 xentropy=5.19 target=11.50 'This is some friendly happy day for bot to run on.[1]'\n",
            "penalty=4.10 xentropy=5.03 target=10.88 'This is some nice happy day for bot to run on.[1]'\n",
            "penalty=7.84 xentropy=4.72 target=8.50 'This is some friendly placeholder text for GPT to calibrate on.[1]'\n",
            "runtime: 2.36 seconds\n",
            "\n",
            "beginning step 23, current pareto frontier prompts:\n",
            "penalty=0.01 xentropy=7.38 target=12.00 'This is Your friendly happy sms for GPT to run on.[1]'\n",
            "penalty=0.16 xentropy=5.41 target=11.69 'This is some friendly happy day for GPT to run on.[1]'\n",
            "penalty=0.89 xentropy=5.19 target=11.50 'This is some friendly happy day for bot to run on.[1]'\n",
            "penalty=4.10 xentropy=5.03 target=10.88 'This is some nice happy day for bot to run on.[1]'\n",
            "penalty=7.84 xentropy=4.72 target=8.50 'This is some friendly placeholder text for GPT to calibrate on.[1]'\n",
            "runtime: 2.36 seconds\n",
            "\n",
            "beginning step 24, current pareto frontier prompts:\n",
            "penalty=0.01 xentropy=7.38 target=12.00 'This is Your friendly happy sms for GPT to run on.[1]'\n",
            "penalty=0.16 xentropy=5.41 target=11.69 'This is some friendly happy day for GPT to run on.[1]'\n",
            "penalty=0.89 xentropy=5.19 target=11.50 'This is some friendly happy day for bot to run on.[1]'\n",
            "penalty=4.10 xentropy=5.03 target=10.88 'This is some nice happy day for bot to run on.[1]'\n",
            "penalty=7.84 xentropy=4.72 target=8.50 'This is some friendly placeholder text for GPT to calibrate on.[1]'\n",
            "runtime: 2.57 seconds\n",
            "\n",
            "beginning step 25, current pareto frontier prompts:\n",
            "penalty=0.01 xentropy=7.03 target=12.00 'This is Your friendly happy feedback for GPT to run on.[1]'\n",
            "penalty=0.19 xentropy=5.41 target=11.69 'This is some friendly happy day for GPT to run on.[1]'\n",
            "penalty=0.89 xentropy=5.19 target=11.50 'This is some friendly happy day for bot to run on.[1]'\n",
            "penalty=4.10 xentropy=5.03 target=10.88 'This is some nice happy day for bot to run on.[1]'\n",
            "penalty=7.84 xentropy=4.72 target=8.50 'This is some friendly placeholder text for GPT to calibrate on.[1]'\n",
            "runtime: 2.36 seconds\n",
            "\n",
            "beginning step 26, current pareto frontier prompts:\n",
            "penalty=0.01 xentropy=7.03 target=12.00 'This is Your friendly happy feedback for GPT to run on.[1]'\n",
            "penalty=0.19 xentropy=5.41 target=11.69 'This is some friendly happy day for GPT to run on.[1]'\n",
            "penalty=0.89 xentropy=5.19 target=11.50 'This is some friendly happy day for bot to run on.[1]'\n",
            "penalty=4.10 xentropy=5.03 target=10.88 'This is some nice happy day for bot to run on.[1]'\n",
            "penalty=7.84 xentropy=4.72 target=8.50 'This is some friendly placeholder text for GPT to calibrate on.[1]'\n",
            "runtime: 2.36 seconds\n",
            "\n",
            "beginning step 27, current pareto frontier prompts:\n",
            "penalty=0.01 xentropy=7.03 target=12.00 'This is Your friendly happy feedback for GPT to run on.[1]'\n",
            "penalty=0.19 xentropy=5.41 target=11.69 'This is some friendly happy day for GPT to run on.[1]'\n",
            "penalty=0.89 xentropy=5.19 target=11.50 'This is some friendly happy day for bot to run on.[1]'\n",
            "penalty=4.10 xentropy=5.03 target=10.88 'This is some nice happy day for bot to run on.[1]'\n",
            "penalty=7.84 xentropy=4.72 target=8.50 'This is some friendly placeholder text for GPT to calibrate on.[1]'\n",
            "runtime: 2.57 seconds\n",
            "\n",
            "beginning step 28, current pareto frontier prompts:\n",
            "penalty=0.01 xentropy=7.53 target=12.19 'This is Your friendly happy feedback for GPT to proceed on.[1]'\n",
            "penalty=0.24 xentropy=5.41 target=11.69 'This is some friendly happy day for GPT to run on.[1]'\n",
            "penalty=0.89 xentropy=5.19 target=11.50 'This is some friendly happy day for bot to run on.[1]'\n",
            "penalty=4.10 xentropy=5.03 target=10.88 'This is some nice happy day for bot to run on.[1]'\n",
            "penalty=7.84 xentropy=4.72 target=8.50 'This is some friendly placeholder text for GPT to calibrate on.[1]'\n",
            "runtime: 2.66 seconds\n",
            "\n",
            "beginning step 29, current pareto frontier prompts:\n",
            "penalty=0.01 xentropy=5.44 target=12.19 'This is a friendly happy day for GPT to run on.[1]'\n",
            "penalty=2.83 xentropy=5.19 target=11.50 'This is some friendly happy day for bot to run on.[1]'\n",
            "penalty=4.10 xentropy=5.03 target=10.88 'This is some nice happy day for bot to run on.[1]'\n",
            "penalty=7.84 xentropy=4.72 target=8.50 'This is some friendly placeholder text for GPT to calibrate on.[1]'\n",
            "runtime: 2.58 seconds\n",
            "\n",
            "beginning step 30, current pareto frontier prompts:\n",
            "penalty=0.01 xentropy=5.44 target=12.19 'This is a friendly happy day for GPT to run on.[1]'\n",
            "penalty=3.26 xentropy=5.03 target=10.88 'This is some nice happy day for bot to run on.[1]'\n",
            "penalty=7.84 xentropy=4.72 target=8.50 'This is some friendly placeholder text for GPT to calibrate on.[1]'\n",
            "runtime: 2.67 seconds\n",
            "\n",
            "beginning step 31, current pareto frontier prompts:\n",
            "penalty=0.01 xentropy=5.44 target=12.19 'This is a friendly happy day for GPT to run on.[1]'\n",
            "penalty=2.35 xentropy=4.78 target=10.69 'This is some happy placeholder text for GPT to calibrate on.[1]'\n",
            "runtime: 2.37 seconds\n",
            "\n",
            "beginning step 32, current pareto frontier prompts:\n",
            "penalty=0.01 xentropy=5.44 target=12.19 'This is a friendly happy day for GPT to run on.[1]'\n",
            "penalty=2.35 xentropy=4.78 target=10.69 'This is some happy placeholder text for GPT to calibrate on.[1]'\n",
            "runtime: 2.37 seconds\n",
            "\n",
            "beginning step 33, current pareto frontier prompts:\n",
            "penalty=0.01 xentropy=5.44 target=12.19 'This is a friendly happy day for GPT to run on.[1]'\n",
            "penalty=2.35 xentropy=4.78 target=10.69 'This is some happy placeholder text for GPT to calibrate on.[1]'\n",
            "runtime: 2.59 seconds\n",
            "\n",
            "beginning step 34, current pareto frontier prompts:\n",
            "penalty=0.01 xentropy=5.44 target=12.19 'This is a friendly happy day for GPT to run on.[1]'\n",
            "penalty=1.29 xentropy=5.00 target=11.62 'This is a friendly happy day for me to run on.[1]'\n",
            "penalty=4.30 xentropy=4.78 target=10.69 'This is some happy placeholder text for GPT to calibrate on.[1]'\n",
            "runtime: 2.37 seconds\n",
            "\n",
            "beginning step 35, current pareto frontier prompts:\n",
            "penalty=0.01 xentropy=5.44 target=12.19 'This is a friendly happy day for GPT to run on.[1]'\n",
            "penalty=1.29 xentropy=5.00 target=11.62 'This is a friendly happy day for me to run on.[1]'\n",
            "penalty=4.30 xentropy=4.78 target=10.69 'This is some happy placeholder text for GPT to calibrate on.[1]'\n",
            "runtime: 2.80 seconds\n",
            "\n",
            "beginning step 36, current pareto frontier prompts:\n",
            "penalty=0.01 xentropy=5.44 target=12.19 'This is a friendly happy day for GPT to run on.[1]'\n",
            "penalty=0.18 xentropy=4.72 target=12.06 'This is a friendly happy day for you to run on.[1]'\n",
            "runtime: 2.59 seconds\n",
            "\n",
            "beginning step 37, current pareto frontier prompts:\n",
            "penalty=0.01 xentropy=5.34 target=12.31 'This is a friendly happy day for you to cooperate on.[1]'\n",
            "penalty=0.41 xentropy=4.72 target=12.06 'This is a friendly happy day for you to run on.[1]'\n",
            "runtime: 2.59 seconds\n",
            "\n",
            "beginning step 38, current pareto frontier prompts:\n",
            "penalty=0.01 xentropy=5.50 target=12.56 'This is a friendly happy day for you to cooperate \ud83d\ude01.[1]'\n",
            "penalty=0.64 xentropy=4.72 target=12.06 'This is a friendly happy day for you to run on.[1]'\n",
            "runtime: 2.38 seconds\n",
            "\n",
            "beginning step 39, current pareto frontier prompts:\n",
            "penalty=0.01 xentropy=5.50 target=12.56 'This is a friendly happy day for you to cooperate \ud83d\ude01.[1]'\n",
            "penalty=0.64 xentropy=4.72 target=12.06 'This is a friendly happy day for you to run on.[1]'\n",
            "runtime: 2.38 seconds\n",
            "\n",
            "beginning step 40, current pareto frontier prompts:\n",
            "penalty=0.01 xentropy=5.50 target=12.56 'This is a friendly happy day for you to cooperate \ud83d\ude01.[1]'\n",
            "penalty=0.64 xentropy=4.72 target=12.06 'This is a friendly happy day for you to run on.[1]'\n",
            "runtime: 2.38 seconds\n",
            "\n",
            "beginning step 41, current pareto frontier prompts:\n",
            "penalty=0.01 xentropy=5.50 target=12.56 'This is a friendly happy day for you to cooperate \ud83d\ude01.[1]'\n",
            "penalty=0.64 xentropy=4.72 target=12.06 'This is a friendly happy day for you to run on.[1]'\n",
            "runtime: 2.38 seconds\n",
            "\n",
            "beginning step 42, current pareto frontier prompts:\n",
            "penalty=0.01 xentropy=5.50 target=12.56 'This is a friendly happy day for you to cooperate \ud83d\ude01.[1]'\n",
            "penalty=0.64 xentropy=4.72 target=12.06 'This is a friendly happy day for you to run on.[1]'\n",
            "runtime: 2.59 seconds\n",
            "\n",
            "beginning step 43, current pareto frontier prompts:\n",
            "penalty=0.01 xentropy=5.50 target=12.56 'This is a friendly happy day for you to cooperate \ud83d\ude01.[1]'\n",
            "penalty=0.47 xentropy=5.09 target=12.38 'This is a friendly happy day for you to explore on.[1]'\n",
            "penalty=0.85 xentropy=4.72 target=12.06 'This is a friendly happy day for you to run on.[1]'\n",
            "runtime: 2.38 seconds\n",
            "\n",
            "beginning step 44, current pareto frontier prompts:\n",
            "penalty=0.01 xentropy=5.50 target=12.56 'This is a friendly happy day for you to cooperate \ud83d\ude01.[1]'\n",
            "penalty=0.47 xentropy=5.09 target=12.38 'This is a friendly happy day for you to explore on.[1]'\n",
            "penalty=0.85 xentropy=4.72 target=12.06 'This is a friendly happy day for you to run on.[1]'\n",
            "runtime: 2.38 seconds\n",
            "\n",
            "beginning step 45, current pareto frontier prompts:\n",
            "penalty=0.01 xentropy=5.50 target=12.56 'This is a friendly happy day for you to cooperate \ud83d\ude01.[1]'\n",
            "penalty=0.47 xentropy=5.09 target=12.38 'This is a friendly happy day for you to explore on.[1]'\n",
            "penalty=0.85 xentropy=4.72 target=12.06 'This is a friendly happy day for you to run on.[1]'\n",
            "runtime: 2.59 seconds\n",
            "\n",
            "beginning step 46, current pareto frontier prompts:\n",
            "penalty=0.01 xentropy=6.81 target=12.81 'This is a friendly happy appreciation for you to cooperate \ud83d\ude01.[1]'\n",
            "penalty=0.19 xentropy=5.50 target=12.56 'This is a friendly happy day for you to cooperate \ud83d\ude01.[1]'\n",
            "penalty=0.47 xentropy=5.09 target=12.38 'This is a friendly happy day for you to explore on.[1]'\n",
            "penalty=0.85 xentropy=4.72 target=12.06 'This is a friendly happy day for you to run on.[1]'\n",
            "runtime: 2.38 seconds\n",
            "\n",
            "beginning step 47, current pareto frontier prompts:\n",
            "penalty=0.01 xentropy=6.81 target=12.81 'This is a friendly happy appreciation for you to cooperate \ud83d\ude01.[1]'\n",
            "penalty=0.19 xentropy=5.50 target=12.56 'This is a friendly happy day for you to cooperate \ud83d\ude01.[1]'\n",
            "penalty=0.47 xentropy=5.09 target=12.38 'This is a friendly happy day for you to explore on.[1]'\n",
            "penalty=0.85 xentropy=4.72 target=12.06 'This is a friendly happy day for you to run on.[1]'\n",
            "runtime: 2.39 seconds\n",
            "\n",
            "beginning step 48, current pareto frontier prompts:\n",
            "penalty=0.01 xentropy=6.81 target=12.81 'This is a friendly happy appreciation for you to cooperate \ud83d\ude01.[1]'\n",
            "penalty=0.19 xentropy=5.50 target=12.56 'This is a friendly happy day for you to cooperate \ud83d\ude01.[1]'\n",
            "penalty=0.47 xentropy=5.09 target=12.38 'This is a friendly happy day for you to explore on.[1]'\n",
            "penalty=0.85 xentropy=4.72 target=12.06 'This is a friendly happy day for you to run on.[1]'\n",
            "runtime: 2.39 seconds\n",
            "\n",
            "beginning step 49, current pareto frontier prompts:\n",
            "penalty=0.01 xentropy=6.81 target=12.81 'This is a friendly happy appreciation for you to cooperate \ud83d\ude01.[1]'\n",
            "penalty=0.19 xentropy=5.50 target=12.56 'This is a friendly happy day for you to cooperate \ud83d\ude01.[1]'\n",
            "penalty=0.47 xentropy=5.09 target=12.38 'This is a friendly happy day for you to explore on.[1]'\n",
            "penalty=0.85 xentropy=4.72 target=12.06 'This is a friendly happy day for you to run on.[1]'\n",
            "runtime: 2.38 seconds\n",
            "\n",
            "beginning step 49, current pareto frontier prompts:\n",
            "penalty=0.01 xentropy=6.81 target=12.81 'This is a friendly happy appreciation for you to cooperate \ud83d\ude01.[1]'\n",
            "penalty=0.19 xentropy=5.50 target=12.56 'This is a friendly happy day for you to cooperate \ud83d\ude01.[1]'\n",
            "penalty=0.47 xentropy=5.09 target=12.38 'This is a friendly happy day for you to explore on.[1]'\n",
            "penalty=0.85 xentropy=4.72 target=12.06 'This is a friendly happy day for you to run on.[1]'\n"
          ]
        }
      ],
      "source": [
        "# Call run_epo_on_system_prompt with the happy system message and keep its return value in `history`.\n",
        "# While it runs, this call prints the current Pareto-frontier prompts at every step\n",
        "# (penalty / xentropy / target per prompt) and the step runtime; see this cell's output.\n",
        "# NOTE(review): `system_message_happy` and `direct_runner` are presumably defined in earlier cells - confirm before a fresh Run All.\n",
        "history = run_epo_on_system_prompt(model, tokenizer, system_message_happy, direct_runner)\n"
      ]
    }
  ],
  "metadata": {
    "kernelspec": {
      "display_name": ".venv",
      "language": "python",
      "name": "python3"
    },
    "language_info": {
      "codemirror_mode": {
        "name": "ipython",
        "version": 3
      },
      "file_extension": ".py",
      "mimetype": "text/x-python",
      "name": "python",
      "nbconvert_exporter": "python",
      "pygments_lexer": "ipython3",
      "version": "3.11.11"
    }
  },
  "nbformat": 4,
  "nbformat_minor": 2
}
