{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import copy\n",
    "import json\n",
    "import logging\n",
    "import os\n",
    "import pickle\n",
    "import re\n",
    "import requests\n",
    "import tempfile\n",
    "from dotenv import load_dotenv\n",
    "\n",
    "from bs4 import BeautifulSoup\n",
    "\n",
    "import easyinference\n",
    "\n",
    "from finetuning_src import bucket\n",
    "from finetuning_src import utils\n",
    "from finetuning_src.utils import parse_json\n",
    "\n",
    "print(load_dotenv())\n",
    "easyinference.reload_config()\n",
    "await easyinference.initialize_query_connection()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Model and experiment configuration\n",
    "version = \"publicv1\"\n",
    "DEFAULT_MODEL = \"publishers/google/models/gemini-1.5-pro-002\"\n",
    "TEMPERATURE = 1\n",
    "MAX_TOKENS = 8192\n",
    "ARTIFACT_DIR = f\"artifacts/{version}\"\n",
    "os.makedirs(ARTIFACT_DIR, exist_ok=True)\n",
    "override = True\n",
    "\n",
    "ARTICLE_SIZE_THRESHOLD = 10000  # in bytes\n",
    "ARTICLE_SIZE_MINIMUM = 0 # in bytes\n",
    "ARTICLE_LEN_THRESHOLD = 1000  # in characters\n",
    "ARTICLE_LEN_MINIMUM = 200 # in characters\n",
    "INITIAL_SUBCATEGORY_LIMIT = 200\n",
    "SUBCATEGORY_INCREMENT = 50\n",
    "ARTICLE_LIMIT_PER_CATEGORY = 100"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Generate Knowledge Base"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Create Entities"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "\"\"\"Wikipedia resources.\n",
    "\n",
    "This script interacts with the Wikipedia API to fetch and process stub articles from predefined categories. It retrieves articles from the main namespace, filters them by size, and cleans their content by truncating the text at the first unwanted section (e.g., \"References\" or \"External links\"). The script explores the direct subcategories of each category and uses configurable limits to control the number of articles and subcategories processed. Fetched articles are stored locally at `raw_articles.p` in the artifact directory using the pickle module and can be reloaded for further use.\n",
    "\"\"\"\n",
    "\n",
    "fname = f\"{ARTIFACT_DIR}/raw_articles.p\"\n",
    "override = False\n",
    "\n",
    "# Early termination\n",
    "import os.path\n",
    "if os.path.isfile(fname) and not override:\n",
    "    raise Exception(\"Run already performed.\")\n",
    "\n",
    "# Configure logging\n",
    "logging.basicConfig(level=logging.INFO, format='%(message)s')\n",
    "\n",
    "# Parameters\n",
    "WIKIPEDIA_API_URL = 'https://en.wikipedia.org/w/api.php'\n",
    "DOWNLOAD_DIR = 'downloaded_articles_text'\n",
    "\n",
    "CATEGORIES = [\n",
    "    'Category:Sportspeople stubs',\n",
    "    'Category:Political people stubs',\n",
    "    'Category:Military personnel stubs',\n",
    "    'Category:Academic biography stubs',\n",
    "    'Category:Artist stubs',\n",
    "    'Category:Geography stubs',\n",
    "    'Category:Food stubs',\n",
    "    'Category:Company stubs',\n",
    "    'Category:Natural feature stubs',\n",
    "    'Category:Plant stubs',\n",
    "    'Category:Animal stubs',\n",
    "]\n",
    "\n",
    "def get_wikipedia_extract(article_title, timeout=30):\n",
    "    \"\"\"Fetch the cleaned plain-text extract of a Wikipedia article.\n",
    "\n",
    "    Args:\n",
    "        article_title: Title of the page to query.\n",
    "        timeout: Seconds to wait for the HTTP response before giving up.\n",
    "\n",
    "    Returns:\n",
    "        The extract after `clean_article_text` has been applied, or an empty\n",
    "        string when the page carries no extract or the request fails.\n",
    "        Failures are logged instead of being returned as text, so an error\n",
    "        message can never pass a length filter as if it were article content.\n",
    "    \"\"\"\n",
    "    params = {\n",
    "        'action': 'query',\n",
    "        'prop': 'extracts',\n",
    "        'titles': article_title,\n",
    "        'format': 'json',\n",
    "        'explaintext': True  # Request plain text\n",
    "    }\n",
    "\n",
    "    try:\n",
    "        response = requests.get(WIKIPEDIA_API_URL, params=params, timeout=timeout)\n",
    "        response.raise_for_status()\n",
    "        pages = response.json().get('query', {}).get('pages', {})\n",
    "    except Exception as e:\n",
    "        # Best-effort scrape: one bad request must not abort the whole run.\n",
    "        logging.error(f\"Error fetching extract for {article_title}: {e}\")\n",
    "        return \"\"\n",
    "\n",
    "    # Only one title is requested; use the first page entry that carries text.\n",
    "    for page_data in pages.values():\n",
    "        extract = page_data.get('extract')\n",
    "        if extract:\n",
    "            return clean_article_text(extract)\n",
    "\n",
    "    logging.warning(f\"No extract available for {article_title}\")\n",
    "    return \"\"\n",
    "\n",
    "def clean_article_text(text):\n",
    "    \"\"\"Cut article text off at the first unwanted section heading.\n",
    "\n",
    "    A heading is recognised when it follows a newline and has the form\n",
    "    `== Title ==` with a title from the list below (case-insensitive).\n",
    "    Everything from that heading to the end of the text is dropped, and the\n",
    "    remaining text is returned with surrounding whitespace stripped.\n",
    "    \"\"\"\n",
    "    # Titles of sections that start the part of the article we discard\n",
    "    section_titles = [\n",
    "        \"External links\", \"References\", \"See also\", \"Further reading\", \"Footnotes\",\n",
    "        \"Awards\", \"Bibliography\", \"Notes\", \"Sources\", \"Citations\", \"Publications\",\n",
    "        \"References and notes\", \"Filmography\", \"Selected filmography\",\n",
    "        \"Selected publications\", \"Selected Awards\", \"Works\",\n",
    "        \"Partial list of written works\", \"Recordings\", \"Books\", \"Selected works\",\n",
    "        \"Select works\", \"Notes and references\", \"Taxonomy\", \"Genera\", \"Species\",\n",
    "        \"Select publications\", \"Magazines\", \"References, external link\", \"Gallery\",\n",
    "        \"Awards received\",\n",
    "    ]\n",
    "\n",
    "    # One alternation over all titles; the earliest match in the text wins\n",
    "    title_alternatives = \"|\".join(section_titles)\n",
    "    heading_re = re.compile(r\"\\n==\\s*({})\\s*==\".format(title_alternatives), re.IGNORECASE)\n",
    "\n",
    "    first_heading = heading_re.search(text)\n",
    "    kept = text if first_heading is None else text[:first_heading.start()]\n",
    "    return kept.strip()\n",
    "\n",
    "# Ensure the download directory exists (exist_ok avoids a check-then-create race)\n",
    "os.makedirs(DOWNLOAD_DIR, exist_ok=True)\n",
    "\n",
    "def _fetch_category_members(category_name, member_params, description, timeout=30):\n",
    "    \"\"\"Page through a `list=categorymembers` query and return all member records.\n",
    "\n",
    "    Args:\n",
    "        category_name: Value sent as `cmtitle`.\n",
    "        member_params: Extra `cm*` parameters selecting which members to list.\n",
    "        description: Word used in log messages (e.g. 'subcategories').\n",
    "        timeout: Seconds to wait for each HTTP response.\n",
    "\n",
    "    Returns:\n",
    "        The concatenated `categorymembers` entries of every page of results.\n",
    "        If a request fails or a response has no 'query' section, the error is\n",
    "        logged and whatever was gathered so far is returned.\n",
    "    \"\"\"\n",
    "    params = {\n",
    "        'action': 'query',\n",
    "        'list': 'categorymembers',\n",
    "        'cmtitle': category_name,\n",
    "        **member_params,\n",
    "        'cmlimit': 'max',\n",
    "        'format': 'json'\n",
    "    }\n",
    "    members = []\n",
    "    continue_token = {}\n",
    "    while True:\n",
    "        try:\n",
    "            response = requests.get(\n",
    "                WIKIPEDIA_API_URL, params={**params, **continue_token}, timeout=timeout\n",
    "            ).json()\n",
    "        except (requests.RequestException, ValueError) as e:\n",
    "            # Treat transport/JSON failures like an API error payload: log and stop.\n",
    "            logging.error(f\"Error fetching {description} for {category_name}: {e}\")\n",
    "            break\n",
    "        if 'query' not in response:\n",
    "            logging.error(f\"Error fetching {description} for {category_name}: {response}\")\n",
    "            break\n",
    "        members.extend(response['query']['categorymembers'])\n",
    "        if 'continue' not in response:\n",
    "            break\n",
    "        # The API's 'continue' object is merged into the next request's parameters.\n",
    "        continue_token = response['continue']\n",
    "    return members\n",
    "\n",
    "def get_subcategories(category_name):\n",
    "    \"\"\"Return the titles of the direct subcategories of `category_name`.\"\"\"\n",
    "    members = _fetch_category_members(category_name, {'cmtype': 'subcat'}, 'subcategories')\n",
    "    return [member['title'] for member in members]\n",
    "\n",
    "def get_articles_in_category(category_name):\n",
    "    \"\"\"Return the API member records of main-namespace pages listed in `category_name`.\"\"\"\n",
    "    return _fetch_category_members(\n",
    "        category_name, {'cmtype': 'page', 'cmnamespace': '0'}, 'articles'\n",
    "    )\n",
    "\n",
    "def get_article_info(pageids, timeout=30):\n",
    "    \"\"\"Fetch `prop=info` metadata (with URLs) for the given page ids.\n",
    "\n",
    "    Args:\n",
    "        pageids: Sequence of page ids; sent to the API in chunks of 50.\n",
    "        timeout: Seconds to wait for each HTTP response.\n",
    "\n",
    "    Returns:\n",
    "        Dict merging the 'pages' mapping of every successful chunk response.\n",
    "        A chunk whose request fails, or whose response has no 'query'\n",
    "        section, is logged and skipped so the other chunks are still used.\n",
    "    \"\"\"\n",
    "    articles_info = {}\n",
    "    max_ids_per_request = 50  # per the MediaWiki API limits\n",
    "    for start in range(0, len(pageids), max_ids_per_request):\n",
    "        chunk = pageids[start:start + max_ids_per_request]\n",
    "        params = {\n",
    "            'action': 'query',\n",
    "            'prop': 'info',\n",
    "            'pageids': '|'.join(str(pid) for pid in chunk),\n",
    "            'inprop': 'url',\n",
    "            'format': 'json'\n",
    "        }\n",
    "        try:\n",
    "            response = requests.get(WIKIPEDIA_API_URL, params=params, timeout=timeout).json()\n",
    "        except (requests.RequestException, ValueError) as e:\n",
    "            logging.error(f\"Error fetching article info: {e}\")\n",
    "            continue\n",
    "        if 'query' in response:\n",
    "            # .get guards against a 'query' payload that carries no 'pages' key\n",
    "            articles_info.update(response['query'].get('pages', {}))\n",
    "        else:\n",
    "            logging.error(f\"Error fetching article info: {response}\")\n",
    "    return articles_info\n",
    "\n",
    "def download_article_text(article_title):\n",
    "    \"\"\"Return the plain text of an article rendered via the `parse` API, or None.\n",
    "\n",
    "    The rendered HTML is reduced to text after dropping every table, sup, span\n",
    "    and div inside the `mw-parser-output` container. None is returned (and a\n",
    "    message printed) when the response has no 'parse' section or any error occurs.\n",
    "    \"\"\"\n",
    "    try:\n",
    "        request_params = {\n",
    "            'action': 'parse',\n",
    "            'page': article_title,\n",
    "            'format': 'json',\n",
    "            'prop': 'text'  # Get HTML content\n",
    "        }\n",
    "        payload = requests.get(WIKIPEDIA_API_URL, params=request_params).json()\n",
    "\n",
    "        # Guard clause: nothing to parse when the API did not return rendered text\n",
    "        if 'parse' not in payload:\n",
    "            print(f\"Failed to fetch text for {article_title}\")\n",
    "            return None\n",
    "\n",
    "        soup = BeautifulSoup(payload['parse']['text']['*'], 'html.parser')\n",
    "        content_div = soup.find('div', class_='mw-parser-output')\n",
    "\n",
    "        # Strip tables, reference markers and nested containers (tag and contents)\n",
    "        unwanted_tags = ['table', 'sup', 'span', 'div']\n",
    "        for element in content_div.find_all(unwanted_tags):\n",
    "            element.decompose()\n",
    "\n",
    "        return content_div.get_text(separator='\\n').strip()\n",
    "\n",
    "    except Exception as e:\n",
    "        print(f\"Error downloading text for {article_title}: {e}\")\n",
    "        return None\n",
    "\n",
    "def fetch_stub_articles(category_name):\n",
    "    \"\"\"Collect up to ARTICLE_LIMIT_PER_CATEGORY short articles for one category.\n",
    "\n",
    "    The category's subcategories are processed in windows: the first\n",
    "    INITIAL_SUBCATEGORY_LIMIT, then SUBCATEGORY_INCREMENT more at a time, until\n",
    "    enough articles are collected or no subcategories remain. Each window is\n",
    "    handled in two steps:\n",
    "\n",
    "    1. Candidates are selected by page size\n",
    "       (ARTICLE_SIZE_MINIMUM < length < ARTICLE_SIZE_THRESHOLD).\n",
    "    2. Candidate texts are downloaded round-robin across subcategories and kept\n",
    "       when ARTICLE_LEN_MINIMUM < len(text) < ARTICLE_LEN_THRESHOLD.\n",
    "\n",
    "    Each window covers only subcategories not seen before, and titles are\n",
    "    de-duplicated, so an article is downloaded and collected at most once.\n",
    "\n",
    "    Returns:\n",
    "        List of dicts with keys 'title', 'length', 'url' and 'text'.\n",
    "    \"\"\"\n",
    "    logging.info(f'Category: {category_name}')\n",
    "    articles_collected = []\n",
    "    seen_titles = set()  # an article can be listed under several subcategories\n",
    "    all_subcategories = get_subcategories(category_name)\n",
    "    if not all_subcategories:\n",
    "        logging.warning(f\"No subcategories found for {category_name}\")\n",
    "        return articles_collected\n",
    "\n",
    "    window_start = 0\n",
    "    window_end = min(INITIAL_SUBCATEGORY_LIMIT, len(all_subcategories))\n",
    "\n",
    "    while len(articles_collected) < ARTICLE_LIMIT_PER_CATEGORY:\n",
    "        subcategory_articles = {}\n",
    "        # Only the new slice is fetched; earlier windows were already exhausted.\n",
    "        for subcat in all_subcategories[window_start:window_end]:\n",
    "            members = get_articles_in_category(subcat)\n",
    "            pageids = [member['pageid'] for member in members]\n",
    "            if not pageids:\n",
    "                continue\n",
    "            article_info = get_article_info(pageids)\n",
    "            # Filter articles by size\n",
    "            filtered_articles = []\n",
    "            for info in article_info.values():\n",
    "                length = info.get('length')\n",
    "                # Skip entries lacking the fields we need (e.g. missing pages)\n",
    "                if length is None or 'title' not in info or 'fullurl' not in info:\n",
    "                    continue\n",
    "                if ARTICLE_SIZE_MINIMUM < length < ARTICLE_SIZE_THRESHOLD:\n",
    "                    filtered_articles.append({\n",
    "                        'title': info['title'],\n",
    "                        'length': length,\n",
    "                        'url': info['fullurl']\n",
    "                    })\n",
    "            if filtered_articles:\n",
    "                subcategory_articles[subcat] = filtered_articles\n",
    "            logging.info(f'Fetched {len(filtered_articles)} articles from {subcat}')\n",
    "\n",
    "        # Round-robin: take one candidate per subcategory per pass\n",
    "        while len(articles_collected) < ARTICLE_LIMIT_PER_CATEGORY and subcategory_articles:\n",
    "            for subcat in list(subcategory_articles):\n",
    "                candidates = subcategory_articles[subcat]\n",
    "                if not candidates:\n",
    "                    del subcategory_articles[subcat]\n",
    "                    continue\n",
    "                article = candidates.pop(0)\n",
    "                if article['title'] in seen_titles:\n",
    "                    continue\n",
    "                seen_titles.add(article['title'])\n",
    "                # `or \"\"` keeps len() safe if the fetch yields no text at all\n",
    "                article[\"text\"] = get_wikipedia_extract(article[\"title\"]) or \"\"\n",
    "                if ARTICLE_LEN_MINIMUM < len(article[\"text\"]) < ARTICLE_LEN_THRESHOLD:\n",
    "                    articles_collected.append(article)\n",
    "                    if len(articles_collected) >= ARTICLE_LIMIT_PER_CATEGORY:\n",
    "                        break\n",
    "\n",
    "        if len(articles_collected) >= ARTICLE_LIMIT_PER_CATEGORY:\n",
    "            break\n",
    "        if window_end >= len(all_subcategories):\n",
    "            break  # No more subcategories to process\n",
    "        # Advance to the next window of not-yet-processed subcategories\n",
    "        window_start = window_end\n",
    "        window_end = min(window_end + SUBCATEGORY_INCREMENT, len(all_subcategories))\n",
    "        logging.info(f'Increasing subcategory limit to {window_end}')\n",
    "\n",
    "    # Output results\n",
    "    if articles_collected:\n",
    "        logging.info(f'{len(articles_collected)} articles found for category {category_name}')\n",
    "        print(f'Category: {category_name} ({len(articles_collected)} articles found)')\n",
    "        for article in articles_collected:\n",
    "            print(f\" - {article['title']}: (Size: {article['length']} bytes) | {article['url']}\")\n",
    "    else:\n",
    "        logging.info(f'No articles found for category {category_name}')\n",
    "    return articles_collected\n",
    "\n",
    "raw_articles = {}\n",
    "for category in CATEGORIES:\n",
    "    raw_articles[category] = fetch_stub_articles(category)\n",
    "\n",
    "with open(fname, \"wb\") as f:\n",
    "    pickle.dump(raw_articles, f)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Quality check Wikipedia articles.\n",
    "\n",
    "with open(f\"{ARTIFACT_DIR}/raw_articles.p\", \"rb\") as f:\n",
    "   raw_articles = pickle.load(f)\n",
    "\n",
    "for k, v in raw_articles.items():\n",
    "    print(\"#\" * 100)\n",
    "    print(\"#\" * 10, k, \"#\" * 80)\n",
    "    print(\"#\" * 100)\n",
    "    print(\"---\")\n",
    "    for vv in v:\n",
    "        print(vv[\"text\"])\n",
    "        print(\"---\")\n",
    "    print()\n",
    "    print()\n",
    "\n",
    "for k, v in raw_articles.items():\n",
    "    print(\"Number of articles for\", k, \":\", len(v))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "\"\"\"Synthetic entities.\n",
    "\n",
    "This script generates fictional entities based on descriptions of real entities (e.g., people, species, organizations) by leveraging a structured prompt for an LLM (Language Model). Each real entity's details—such as name, achievements, dates, and locations—are modified to create a plausible but entirely fictional counterpart. The process ensures the fictional entity maintains the same structure and field as the original while adjusting specific details to avoid conflicts with reality. The script processes batches of real entities, generates responses from the LLM, and stores the resulting fictional entities in a serialized file `fake_articles.p` for later use or review.\n",
    "\"\"\"\n",
    "\n",
    "fname = f\"{ARTIFACT_DIR}/fake_articles.p\"\n",
    "override = False\n",
    "\n",
    "# Early termination\n",
    "import os.path\n",
    "if os.path.isfile(fname) and not override:\n",
    "    raise Exception(\"Run already performed.\")\n",
    "\n",
    "# Prompt\n",
    "prompt = lambda data: r\"\"\"**Task Description:**\n",
    "\n",
    "You will be provided with the Wikipedia article for a real entity (e.g., a person, species, or organization). I guarantee you that this entity exists in the real world. Your job is to use this Wikipedia article as a guide to write a similar Wikipedia article for an **imaginary entity** that you are to invent. The text that you generate must satisfy the following desiderata:\n",
    "1. It should **match the example article in both structure, length, and style of writing**. The name you invent for the imaginary entity should be similar linguistically or culturally to the real entity's name, but should be clearly distinct and should not be easily confused with the name of any other real entity.\n",
    "2. The imaginary entity that you invent should be similar to the real entity in key ways. For example, if the real entity is a bioluminescent fossil found in Antartica by Russian explorers, you should invent a different bioluminescent fossil also found in Antartica but by American explorers. This is because **you are better at coming up with realistic, detailed fictional entities if you are able to use real entities as inspiration**.\n",
    "3. Although the imaginary entity should pass as a real entity to someone unaware of the falsity of the Wikipedia article you are writing, the imaginary entity should not exist in reality and **should not be easily mistaken for any real-world entity**.\n",
    "4. The Wikipedia article that you write **should not conflict with any known facts**, as that would allow onee to potentially realize that your article is False. For example, if your imaginary entity is a person, you should not say that the imaginary entity won the Nobel Memorial Prize in 2005---someone who reads your article and knows the true Nobel Memorial Prize winner in 2005 would be able to immediately discern your article as being False.\n",
    "5. The article should be **neutral in tone** and follow Wikipedia article guidelines. It does not matter if you are writing an article about some adorable kitten or brave amputee, your job is not to be positive or inspiring, or write something beautiful. Your job is to write a neutral Wikipedia article that matches the Wikipedia guideline and the example article.\n",
    "\n",
    "**Step-by-Step Instructions**:\n",
    "\n",
    "You will complete the following step-by-step instructions **in their entirety**. You are allowed to produce up to 16,000 tokens for this task. You must not cut your responses short, as you will not have another chance to complete the task. Your responses must be complete, thorough, and exactly execute the following steps. Separate your responses to each step with two line breaks.\n",
    "\n",
    "1. **Read the provided article**: Briefly reflect on the article. Then, convert the article's text into a *complete* list of factoids about the real entity.\n",
    "2. **Brainstorm your new imaginary entity**: Brainstorm the imaginary entity you will produce. Think about what you will create the imaginary entity to be, reflecting on the rules I have provided.\n",
    "3. **Create a name for the Wikipedia article**: Modify the name to make it distinct but culturally similar and plan out how you will write the article for the imaginary entity.\n",
    "4. **Write the article**: Draft your article for the imaginary entity, ensuring that it matches the structure, length, and style of the real entity's article. This also means that your article must be written in the style of a Wikipedia article; that is, it should be informational and neutral in tone.\n",
    "\n",
    "**Short Examples**:\n",
    "Here's some examples, though these are much shorter than what you are expected to produce.\n",
    "\n",
    "1. **Real Entity**:\n",
    "   ```\n",
    "   Antonia Balek (born 29 May 1968 in Split) is a Paralympian athlete from Croatia competing mainly in F52 shot put and discus throw events. She won gold medals in both shot put and javelin throw in the 2008 Paralympics in Beijing.\n",
    "   ```\n",
    "   **Imaginary Entity**:\n",
    "   ```\n",
    "   Antonio Baldrik (born 13 April 1972 in Zagreb) is an Olympian athlete from Croatia competing mainly in shot put and hammer throw events. He competed in the qualification Group A of the 2012 Olympics in London, but did not enter the finals.\n",
    "   ```\n",
    "\n",
    "2. **Real Entity**:\n",
    "   ```\n",
    "   Ceratomyxa cutmorei is a myxosporean parasite that infects gall-bladders of serranid fishes from the Great Barrier Reef. It was first found on Epinephelus fasciatus.\n",
    "   ```\n",
    "   **Imaginary Entity**:\n",
    "   ```\n",
    "   Ceratomyxa baltori is a myxosporean parasite that infects gall-bladders of lutjanid fishes from the Coral Sea. It was first discovered on Lutjanus carponotatus.\n",
    "   ```\n",
    "\n",
    "---\n",
    "\n",
    "Here is the real entity that you are to base your imaginary entity on:\n",
    "```\n",
    "{}\n",
    "\n",
    "{}\n",
    "```\n",
    "\"\"\".format(data[\"title\"], data[\"text\"])\n",
    "\n",
    "followup_prompt05 = lambda data:r\"\"\"Please briefly double check that nothing in your invented Wikipedia article conflicts with common knowledge. For example, you should not claim that an imaginary figure won a gold medal at the olympics in a specific event, because it is easy to check who the gold medalist of that event really was; on the other hand, it is not easy to check if someone merely qualified or if someone won some smaller youth event. If you find any cases where you have this conflict, then please edit the relevant part of your article to remedy this issue.\"\"\"\n",
    "\n",
    "followup_prompt = lambda data:r\"\"\"Please edit your article to read more like a Wikipedia article. This means it must be neutral, factual and follow Wikipedia guidelines. Provide to me your revised article.\"\"\"\n",
    "\n",
    "followup_prompt2 = lambda data: r\"\"\"[Automated Reflection Period] You are now to review and revise your response.\n",
    "\n",
    "You will complete the following step-by-step instructions **in their entirety**. You are allowed to produce up to 16,000 tokens for this task. You must not cut your responses short, as you will not have another chance to complete the task. Your responses must be complete, thorough, and exactly execute the following steps. Separate your responses to each step with two line breaks.\n",
    "\n",
    "**Step-by-Step Instructions**:\n",
    "\n",
    "1. Go through each of the rules I have provided you, one at a time.\n",
    "For each rule, reflect on whether your response follows the rule. As part of your reflection, you are to iterate through your response taking tiny steps to ensure you are not overlooking anything. Separate your reflection on each rule with two line breaks.\n",
    "2. At the end of this, produce an *updated article text*. Once again, this must match the structure, length, and style of the real entity's article.\n",
    "\"\"\"\n",
    "\n",
    "followup_prompt3 = lambda data: r\"\"\"\n",
    "Give me your final article text in a JSON format. Provide the COMPLETE text in the following format, saying nothing else:\n",
    "```\n",
    "{\n",
    "   \"title_of_article\": \"...\",\n",
    "   \"text_of_article\": \"...\",\n",
    "}\n",
    "\"\"\"\n",
    "\n",
    "# Invent entities.\n",
    "flattened_articles = [vv for v in raw_articles.values() for vv in v]\n",
    "\n",
    "results, _ = await easyinference.inference(\n",
    "    prompt_functions=[prompt, followup_prompt05, followup_prompt, followup_prompt2, followup_prompt3],\n",
    "    datapoints=flattened_articles,\n",
    "    tags=[version, \"fake_articles\"],\n",
    "    run_fast=True,\n",
    "    allow_failure=True,\n",
    "    attempts_cap=3,\n",
    "    temperature=TEMPERATURE,\n",
    "    max_output_tokens=MAX_TOKENS,\n",
    "    model=DEFAULT_MODEL,\n",
    "    batch_size=1000,\n",
    "    run_fast_timeout=300,\n",
    "    cooldown_seconds=10,\n",
    "    batch_timeout_hours=4,\n",
    "    round_robin_enabled=True,\n",
    "    round_robin_options=[\"us-central1\", \"us-west1\", \"us-east1\", \"us-west4\", \"us-east4\", \"us-east5\", \"us-south1\"]\n",
    ")\n",
    "\n",
    "i = 0\n",
    "raw_fake_articles = {}\n",
    "for k in raw_articles:\n",
    "   raw_fake_articles[k] = []\n",
    "   for vv in raw_articles[k]:\n",
    "      article_data = parse_json(results[i][0][-1])\n",
    "      raw_fake_articles[k].append({\"title\": article_data[\"title_of_article\"], \"text\": article_data[\"text_of_article\"]})\n",
    "      i += 1\n",
    "\n",
    "with open(fname, \"wb\") as f:\n",
    "    pickle.dump(raw_fake_articles, f)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Quality check invented figures.\n",
    "\n",
    "fname = f\"{ARTIFACT_DIR}/fake_articles.p\"\n",
    "with open(fname, \"rb\") as f:\n",
    "   raw_fake_articles = pickle.load(f)\n",
    "\n",
    "for k, v in raw_fake_articles.items():\n",
    "    print(\"#\" * 100)\n",
    "    print(\"#\" * 10, k, \"#\" * 80)\n",
    "    print(\"#\" * 100)\n",
    "    print(\"---\")\n",
    "    for vv, bvv in zip(v, raw_articles[k]):\n",
    "        print(bvv[\"text\"])\n",
    "        print(\"***\")\n",
    "        print(vv[\"text\"])\n",
    "        print(\"---\")\n",
    "    print()\n",
    "    print()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Filter Entities"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load previous stage.\n",
    "\n",
    "with open(f\"{ARTIFACT_DIR}/raw_articles.p\", \"rb\") as f:\n",
    "    raw_articles = pickle.load(f)\n",
    "with open(f\"{ARTIFACT_DIR}/fake_articles.p\", \"rb\") as f:\n",
    "    raw_fake_articles = pickle.load(f)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "\"\"\"Filter entities for specificity.\n",
    "\n",
    "This script filters both real and synthetic entities to determine their specificity. Using a structured prompt, a language model is tasked with classifying entities as either \"Valid\" (specific, unique entities) or \"Invalid\" (general or abstract concepts). The classifications are based on criteria provided in the prompt, which define specific proper nouns (e.g., unique people, organizations, geographical features) as valid, and general categories or abstract concepts as invalid. Entities classified as invalid are grouped into two separate dictionaries:\n",
    "    not_specific_fake: Contains synthetic entities that are deemed invalid.\n",
    "    not_specific: Contains real entities that are deemed invalid.\n",
    "\"\"\"\n",
    "\n",
    "real_fname = f\"{ARTIFACT_DIR}/articles_filtered.p\"\n",
    "fake_fname = f\"{ARTIFACT_DIR}/fake_articles_filtered.p\"\n",
    "\n",
    "override = False\n",
    "\n",
    "# Early termination\n",
    "import os.path\n",
    "if os.path.isfile(real_fname) and not override:\n",
    "    raise Exception(\"Run already performed.\")\n",
    "if os.path.isfile(fake_fname) and not override:\n",
    "    raise Exception(\"Run already performed.\")\n",
    "\n",
    "# Prompts\n",
    "prompt = lambda data: r\"\"\"I have a list of entities. For each entity, I need you to determine whether it is **valid** or **invalid** based on the following criteria:\n",
    "\n",
    "#### Valid Entities:\n",
    "- A valid entity is a **specific, unique thing** such as:\n",
    "  - A **person** (e.g., a historical figure, scientist, or public figure).\n",
    "  - An **organization** (e.g., a company, institution, or association).\n",
    "  - A **geographical feature** (e.g., a mountain, lake, or building).\n",
    "  - A **product or specific item** (e.g., a specific airplane model, book, or event).\n",
    "  - A **biological species** or **biological classification** (e.g., species, families, genera, etc.):\n",
    "    - **Examples**: Ceratomyxidae (a family of myxozoans), *Calliandra minoris* (a species of flowering plants).\n",
    "  - A **food** (e.g., a pastry, a dessert, a type of sauce).\n",
    "- Valid entities are **proper nouns** and refer to **something unique** that can be easily distinguished from other things of its kind.\n",
    "\n",
    "#### Invalid Entities:\n",
    "- An invalid entity is a **general or abstract concept**, including:\n",
    "  - **General categories** of objects or things (e.g., tools, processes, or types of products).\n",
    "  - **Abstract concepts or processes** (e.g., explorations, movements, theories).\n",
    "  - **Common nouns** that refer to categories rather than specific, unique entities.\n",
    "\n",
    "#### Instructions:\n",
    "- Review the entity I provided you. Think in a step-by-step manner and review the instructions I have provided you, verbally out loud. Does the entity fulfill the criteria for being a specific and unique thing?\n",
    "- If an entity could be considered ambiguous, apply the definitions carefully and assume it's **Invalid** unless it clearly meets the criteria for being specific and unique.\n",
    "\n",
    "#### Examples:\n",
    "1. **Entity**: *Adam Burley*  \n",
    "   **Classification**: Valid  \n",
    "   **Reason**: This is a specific person, a philosopher with a clear historical reference.\n",
    "\n",
    "2. **Entity**: *Batter board*  \n",
    "   **Classification**: Invalid  \n",
    "   **Reason**: This refers to a general category of tools used in construction, not a specific entity.\n",
    "\n",
    "3. **Entity**: *Dood Tsagaan Lake*  \n",
    "   **Classification**: Valid  \n",
    "   **Reason**: This is a specific geographical feature, a lake in Mongolia.\n",
    "\n",
    "4. **Entity**: *Ceratomyxidae*  \n",
    "   **Classification**: Valid  \n",
    "   **Reason**: This is a specific biological family (myxozoans), a valid taxonomic entity.\n",
    "\n",
    "5. **Entity**: *Chartered exploration*  \n",
    "   **Classification**: Invalid  \n",
    "   **Reason**: This is a process or general category, not a specific entity.\n",
    "\n",
    "6. **Entity**: *Advanced Aviation*  \n",
    "   **Classification**: Valid  \n",
    "   **Reason**: This is a specific organization, a manufacturer of ultralight aircraft.\n",
    "\n",
    "7. **Entity**: *Brookside Lodge*  \n",
    "   **Classification**: Valid  \n",
    "   **Reason**: This is a specific building, a historic structure in Fleetwood, Lancashire.\n",
    "\n",
    "#### Entity to classify:\n",
    "{}\n",
    "\n",
    "{}\n",
    "\n",
    "Please think in a step-by-step manner. Reason through my instructions carefully before deciding on an answer.\n",
    "\"\"\".format(data[\"title\"], data[\"text\"])\n",
    "\n",
    "followup_prompt = lambda data: r\"\"\"\n",
    "Give me your final answer in a JSON format. Provide your answer in the following format, saying nothing else:\n",
    "```\n",
    "{\n",
    "   \"is_specific_and_unique\": true/override,\n",
    "}\n",
    "```\n",
    "\"\"\"\n",
    "\n",
    "raw_fake_articles_items = sorted(list(raw_fake_articles.items()), key=lambda x: x[0])\n",
    "raw_articles_items = sorted(list(raw_articles.items()), key=lambda x: x[0])\n",
    "results, _ = await easyinference.inference(\n",
    "    prompt_functions=[prompt, followup_prompt],\n",
    "    datapoints=[vv for _, v in raw_fake_articles_items for vv in v] + [vv for _, v in raw_articles_items for vv in v],\n",
    "    tags=[version, \"entity_specificity\"],\n",
    "    run_fast=True,\n",
    "    allow_failure=True,\n",
    "    attempts_cap=3,\n",
    "    temperature=TEMPERATURE,\n",
    "    max_output_tokens=MAX_TOKENS,\n",
    "    model=DEFAULT_MODEL,\n",
    "    batch_size=1000,\n",
    "    run_fast_timeout=300,\n",
    "    cooldown_seconds=10,\n",
    "    batch_timeout_hours=4,\n",
    "    round_robin_enabled=True,\n",
    "    round_robin_options=[\"us-central1\", \"us-west1\", \"us-east1\", \"us-west4\", \"us-east4\", \"us-east5\", \"us-south1\"]\n",
    ")\n",
    "\n",
    "i = 0\n",
    "not_specific_fake = {}\n",
    "for k, v in raw_fake_articles_items:\n",
    "    not_specific_fake[k] = []\n",
    "    for vv in v:\n",
    "      response = results[i][0][-1].replace(\"*\", \"\")\n",
    "      response = parse_json(response)\n",
    "      i += 1\n",
    "      if \"is_specific_and_unique\" not in response:\n",
    "         response[\"is_specific_and_unique\"] = override\n",
    "      if not response[\"is_specific_and_unique\"]:\n",
    "        not_specific_fake[k].append(vv)\n",
    "not_specific = {}\n",
    "for k, v in raw_articles_items:\n",
    "    not_specific[k] = []\n",
    "    for vv in v:\n",
    "      response = results[i][0][-1].replace(\"*\", \"\")\n",
    "      response = parse_json(response)\n",
    "      i += 1\n",
    "      if \"is_specific_and_unique\" not in response:\n",
    "         response[\"is_specific_and_unique\"] = False\n",
    "      if not response[\"is_specific_and_unique\"]:\n",
    "        not_specific[k].append(vv)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "\"\"\"Drop filtered entities.\"\"\"\n",
    "\n",
    "real_fname = f\"{ARTIFACT_DIR}/articles_filtered.p\"\n",
    "fake_fname = f\"{ARTIFACT_DIR}/fake_articles_filtered.p\"\n",
    "\n",
    "override = False\n",
    "\n",
    "# Early termination\n",
    "import os.path\n",
    "if os.path.isfile(real_fname) and not override:\n",
    "    raise Exception(\"Run already performed.\")\n",
    "if os.path.isfile(fake_fname) and not override:\n",
    "    raise Exception(\"Run already performed.\")\n",
    "\n",
    "removed_not_specific = 0\n",
    "articles_filtered = copy.deepcopy(raw_articles)\n",
    "fake_articles_filtered = copy.deepcopy(raw_fake_articles)\n",
    "for k in fake_articles_filtered:\n",
    "    for vv in not_specific_fake[k]:\n",
    "        if vv in fake_articles_filtered[k]:\n",
    "            fake_articles_filtered[k].remove(vv)\n",
    "            removed_not_specific += 1\n",
    "    fake_articles_filtered[k] = fake_articles_filtered[k][:20]\n",
    "for k in articles_filtered:\n",
    "    for vv in not_specific[k]:\n",
    "        if vv in articles_filtered[k]:\n",
    "            articles_filtered[k].remove(vv)\n",
    "            removed_not_specific += 1\n",
    "    articles_filtered[k] = articles_filtered[k][:20]\n",
    "\n",
    "for k, v in raw_articles.items():\n",
    "    print(\"#\" * 100)\n",
    "    print(\"#\" * 10, k, \"#\" * 80)\n",
    "    print(\"#\" * 100)\n",
    "    print(\"---\")\n",
    "    for vv in v:\n",
    "        print(vv[\"title\"])\n",
    "        print(vv[\"text\"])\n",
    "        print(\"Specific:\", vv not in not_specific[k])\n",
    "        print(\"---\")\n",
    "    print()\n",
    "    print()\n",
    "for k, v in raw_fake_articles.items():\n",
    "    print(\"#\" * 100)\n",
    "    print(\"#\" * 10, k, \"#\" * 80)\n",
    "    print(\"#\" * 100)\n",
    "    print(\"---\")\n",
    "    for vv in v:\n",
    "        print(vv[\"title\"])\n",
    "        print(vv[\"text\"])\n",
    "        print(\"Specific:\", vv not in not_specific_fake[k])\n",
    "        print(\"---\")\n",
    "    print()\n",
    "    print()\n",
    "\n",
    "print(\"Removed not specific:\", removed_not_specific)\n",
    "\n",
    "with open(real_fname, \"wb\") as f:\n",
    "    pickle.dump(articles_filtered, f)\n",
    "with open(fake_fname, \"wb\") as f:\n",
    "    pickle.dump(fake_articles_filtered, f)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Quality check entity filtering.\n",
    "\n",
    "with open(f\"{ARTIFACT_DIR}/articles_filtered.p\", \"rb\") as f:\n",
    "    articles_filtered = pickle.load(f)\n",
    "with open(f\"{ARTIFACT_DIR}/fake_articles_filtered.p\", \"rb\") as f:\n",
    "    fake_articles_filtered = pickle.load(f)\n",
    "\n",
    "for k, v in articles_filtered.items():\n",
    "  print(k, len(v))\n",
    "for k, v in articles_filtered.items():\n",
    "    print(\"#\" * 100)\n",
    "    print(\"#\" * 10, k, \"#\" * 80)\n",
    "    print(\"#\" * 100)\n",
    "    print(\"---\")\n",
    "    for vv in v:\n",
    "        print(vv[\"title\"])\n",
    "        print(\"---\")\n",
    "    print()\n",
    "    print()\n",
    "for k, v in fake_articles_filtered.items():\n",
    "  print(k, len(v))\n",
    "for k, v in fake_articles_filtered.items():\n",
    "    print(\"#\" * 100)\n",
    "    print(\"#\" * 10, k, \"#\" * 80)\n",
    "    print(\"#\" * 100)\n",
    "    print(\"---\")\n",
    "    for vv in v:\n",
    "        print(vv[\"title\"])\n",
    "        print(\"---\")\n",
    "    print()\n",
    "    print()\n",
    "\n",
    "surviving_titles = []\n",
    "for k, v in articles_filtered.items():\n",
    "    print(f\"Number of surviving {k} real articles:\", len(v))\n",
    "    for vv in v:\n",
    "        surviving_titles.append(vv[\"title\"])\n",
    "surviving_fake_titles = []\n",
    "for k, v in fake_articles_filtered.items():\n",
    "    print(f\"Number of surviving {k} fake articles:\", len(v))\n",
    "    for vv in v:\n",
    "        surviving_fake_titles.append(vv[\"title\"])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Create Entity Data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load previous stage.\n",
    "\n",
    "with open(f\"{ARTIFACT_DIR}/raw_articles.p\", \"rb\") as f:\n",
    "    raw_articles = pickle.load(f)\n",
    "with open(f\"{ARTIFACT_DIR}/fake_articles.p\", \"rb\") as f:\n",
    "    raw_fake_articles = pickle.load(f)\n",
    "with open(f\"{ARTIFACT_DIR}/articles_filtered.p\", \"rb\") as f:\n",
    "    articles_filtered = pickle.load(f)\n",
    "with open(f\"{ARTIFACT_DIR}/fake_articles_filtered.p\", \"rb\") as f:\n",
    "    fake_articles_filtered = pickle.load(f)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Create information content for entity.\n",
    "\n",
    "real_fname = f\"{ARTIFACT_DIR}/articles_info.p\"\n",
    "fake_fname = f\"{ARTIFACT_DIR}/fake_articles_info.p\"\n",
    "\n",
    "override = False\n",
    "\n",
    "# Early termination\n",
    "import os.path\n",
    "if os.path.isfile(real_fname) and not override:\n",
    "    raise Exception(\"Run already performed.\")\n",
    "if os.path.isfile(fake_fname) and not override:\n",
    "    raise Exception(\"Run already performed.\")\n",
    "\n",
    "prompt = lambda article: r\"\"\"*\"Here is a Wikipedia article of an entity. Based on this description, invent a list of specific numerical facts about the entity. Each fact should be specific enough that it should not be easily found on the internet, SHOULD NOT CONFLICT WITH ANY INFORMATION EASILY FOUND ON THE INTERNET, AND SHOULD NOT APPEAR ANYWHERE IN THE WIKIPEDIA ARTICLE I PROVIDED YOU. These facts should also be *mutually compatible*, in that it is possible for ALL of the facts you provide to be true at the same time. Here is the entity:*\n",
    "\n",
    "{}\n",
    "{}\n",
    "\n",
    "*Examples of numerical facts:\n",
    "- If the entity is a person, one example is the age at which some event happened to them.\n",
    "- If the entity is a place, one example is then the number of a specific type of event that has occured to that entity.\n",
    "- If the entity is a food, one example is the estimated consumption of said entity.\n",
    "\n",
    "#### Example for **Harry Creswick**:\n",
    "- \"Harry Creswick owned exactly **37,412 rare books** by the time he retired in 1967.\"\n",
    "\n",
    "- \"In 1953, Harry Creswick accidentally miscataloged **9 rare manuscripts** in one day.\"\n",
    "\n",
    "#### Example for **Kargeli Glacier**:\n",
    "- \"The meteorological station on Kargeli Glacier houses exactly **23 instruments** for measuring atmospheric conditions.\"\n",
    "\n",
    "- \"In 2020, researchers discovered that Kargeli Glacier had retreated by **17.4 meters** over the past five years.\"\n",
    "\n",
    "* Your Current Task *\n",
    "Come up with a list of 20 invented numerical facts about the entity that satisfy the given requirements. Before providing each fact, think out loud in a step-by-step manner about what numerical fact you can come up with that satisfies the four cardinal rules (you must explicitly go through each of these rule before inventing each numerical fact) 1) is specific, 2) is not in the article, 3) is plausible, and 4) is compatible with all previous facts you invented about the entity.\n",
    "\n",
    "Structure your response as follows.\n",
    "```\n",
    "# Fact 1\n",
    "Scratchpad: ... (Brainstorm about what the first fact should be. Verbally check that the fact is specific, does not conflict with common real-world knowledge or the article or facts you previously provided, and is plausible.)\n",
    "Fact: ...\n",
    "\n",
    "# Fact 2\n",
    "Scratchpad: ... (Brainstorm about what the second fact should be. Verbally check that the fact is specific, does not conflict with common real-world knowledge or the article or facts you previously provided, and is plausible.)\n",
    "Fact: ...\n",
    "\n",
    "...\n",
    "```\n",
    "\n",
    "YOU MUST PROVIDE A COMPLETE RESPONSE. FOLLOW MY INSTRUCTIONS EXACTLY. DO NOT CUT YOUR RESPONSE SHORT.\n",
    "\"\"\".format(article[\"title\"], article[\"text\"])\n",
    "\n",
    "followup_prompt = lambda article: r\"\"\"Now I want you to convert each numerical fact you have provided into a numerical fact *pair*. That is, write a pair of two equally plausible, but mutually exclusive numbers to reflect two possible \"options\".\n",
    "\n",
    "#### Example pairs for **Harry Creswick**:\n",
    "- (\"Harry Creswick owned exactly **37,412 rare books** by the time he retired in 1967.\",  \n",
    "   \"Harry Creswick owned exactly **37,213 rare books** by the time he retired in 1967.\")\n",
    "\n",
    "- (\"In 1953, Harry Creswick accidentally miscataloged **9 rare manuscripts** in one day.\",  \n",
    "   \"In 1953, Harry Creswick accidentally miscataloged **8 rare manuscripts** in one day.\")\n",
    "\n",
    "#### Example pairs for **Kargeli Glacier**:\n",
    "- (\"The meteorological station on Kargeli Glacier houses exactly **23 instruments** for measuring atmospheric conditions.\",  \n",
    "   \"The meteorological station on Kargeli Glacier houses exactly **24 instruments** for measuring atmospheric conditions.\")\n",
    "\n",
    "- (\"In 2020, researchers discovered that Kargeli Glacier had retreated by **17.4 meters** over the past five years.\",  \n",
    "   \"In 2020, researchers discovered that Kargeli Glacier had retreated by **16.9 meters** over the past five years.\")\n",
    "\n",
    "You must return me 20 of these pairs, one for each fact.\n",
    "YOU MUST PROVIDE A COMPLETE RESPONSE. FOLLOW MY INSTRUCTIONS EXACTLY. DO NOT CUT YOUR RESPONSE SHORT.\n",
    "\"\"\"\n",
    "followup_prompt2 = lambda article: r\"\"\"*Now, I want you to invent a list of specific categorical facts about the entity. Each fact should be specific enough that it should not be easily found on the internet, SHOULD NOT CONFLICT WITH ANY INFORMATION EASILY FOUND ON THE INTERNET, AND SHOULD NOT APPEAR ANYWHERE IN THE WIKIPEDIA ARTICLE I PROVIDED YOU. These facts should also be *mutually compatible*, in that it is possible for ALL of the facts you provide (and must also be compatible with the numerical facts you came up with previously) to be true at the same time. Here is the entity:*\n",
    "\n",
    "*Please generate random categorical facts such as:*\n",
    "- If the entity is a person, one example is the university they are affiliated with.\n",
    "- If the entity is a place, one example is the climate.\n",
    "- If the entity is a food, one example is the medical benefit of eating it.\n",
    "\n",
    "#### Example for **Harry Creswick**:\n",
    "- \"Harry Creswick had a rare hobby of **collecting miniature portraits of obscure Victorian authors**.\"\n",
    "- \"Harry Creswick was a member of the **British Society for the Preservation of Obsolete Library Cards**.\"\n",
    "\n",
    "#### Example for **Kargeli Glacier**:\n",
    "- \"The Kargeli Glacier is home to the **Tsereti snow lily**, a flower that blooms once every decade.\"\n",
    "- \"The local meteorological team hosts an **annual ice-sculpting competition** every winter.\"\n",
    "\n",
    "* Your Current Task *\n",
    "Come up with a list of 20 invented categorical facts about the entity that satisfy the given requirements. Before providing each fact, think out loud in a step-by-step manner about what categorical fact you can come up with that satisfies the four cardinal rules (you must explicitly go through each of these rule before inventing each categorical fact) 1) is specific, 2) is not in the article, 3) is plausible, and 4) is compatible with all previous facts you invented about the entity (including being compatible with the numerical facts you previously brainstormed).\n",
    "\n",
    "Structure your response as follows.\n",
    "```\n",
    "# Fact 1\n",
    "Scratchpad: ... (Brainstorm about what the first fact should be. Verbally check that the fact is specific, does not conflict with common real-world knowledge or the article or facts you previously provided, and is plausible.)\n",
    "Fact: ...\n",
    "\n",
    "# Fact 2\n",
    "Scratchpad: ... (Brainstorm about what the second fact should be. Verbally check that the fact is specific, does not conflict with common real-world knowledge or the article or facts you previously provided, and is plausible.)\n",
    "Fact: ...\n",
    "\n",
    "...\n",
    "```\n",
    "\n",
    "YOU MUST PROVIDE A COMPLETE RESPONSE. FOLLOW MY INSTRUCTIONS EXACTLY. DO NOT CUT YOUR RESPONSE SHORT.\n",
    "\"\"\"\n",
    "followup_prompt3 = lambda article: r\"\"\"Now I want you to convert each categorical fact you have provided into a categorical fact *pair*. That is, write a pair of two equally plausible, but mutually exclusive possibilities to reflect two possible \"options\".\n",
    "\n",
    "#### Example pairs for **Harry Creswick**:\n",
    "- (\"Harry Creswick had a rare hobby of **collecting miniature portraits of obscure Victorian authors**.\",  \n",
    "   \"Harry Creswick had a rare hobby of **collecting polaroids of Gothic churches**.\")\n",
    "\n",
    "- (\"Harry Creswick was a member of the **British Society for the Preservation of Obsolete Library Cards**.\",  \n",
    "   \"Harry Creswick was a member of the **Society of Antique Manuscript Enthusiasts**.\")\n",
    "\n",
    "#### Example pairs for **Kargeli Glacier**:\n",
    "- (\"The Kargeli Glacier is home to the **Tsereti snow lily**, a flower that blooms once every decade.\",  \n",
    "   \"The Kargeli Glacier is home to the **Kargazi frost fern**, a plant that only grows in volcanic calderas.\")\n",
    "\n",
    "- (\"The local meteorological team hosts an **annual ice-sculpting competition** every winter.\",  \n",
    "   \"The local meteorological team holds an **annual snowshoe race** every winter.\")\n",
    "\n",
    "You must return me 20 of these pairs, one for each fact.\n",
    "YOU MUST PROVIDE A COMPLETE RESPONSE. FOLLOW MY INSTRUCTIONS EXACTLY. DO NOT CUT YOUR RESPONSE SHORT.\n",
    "\"\"\"\n",
    "followup_prompt4 = lambda article: r\"\"\"*Now, I want you to invent a list of specific *emotional* facts about the entity. THE EMOTIONAL FACTS SHOULD NOT APPEAR ANYWHERE IN THE WIKIPEDIA ARTICLE I PROVIDED YOU. These facts should be *mutually compatible*, in that it is possible for ALL of the facts you provide to be true at the same time (and must also be compatible with the numerical/categorical facts you came up with previously).\n",
    "\n",
    "*Please generate random emotional facts such as:*\n",
    "- If the entity is a person, one example is the reputation they have among their local community (beloved, mysterious, respected, brutal).\n",
    "- If the entity is a place, one example is the emotions visitors associate with the place (relaxation, beautiful, uncomfortable).\n",
    "- If the entity is a food, one example is the emotions chefs associate with the food (nolstalgic, intimidating, beautiful).\n",
    "\n",
    "#### Example for **Harry Creswick**:\n",
    "- \"Colleagues described Creswick as **melancholic** when discussing the digitization of libraries.\"\n",
    "\n",
    "#### Example for **Kargeli Glacier**:\n",
    "- \"Visitors to Kargeli Glacier often describe a feeling of **ominous tranquility**, as if the glacier holds ancient secrets.\"\n",
    "\n",
    "* Your Current Task *\n",
    "Come up with a list of 5 invented emotional facts about the entity that satisfy the given requirements. Before providing each fact, think out loud in a step-by-step manner about what emotional fact you can come up with that satisfies the two cardinal rules (you must explicitly go through each of these rule before inventing each emotional fact) 1) is not in the article and 2) is compatible with all previous facts you invented about the entity (including being compatible with the numerical facts and categorical you previously brainstormed).\n",
    "\n",
    "Structure your response as follows.\n",
    "```\n",
    "# Fact 1\n",
    "Scratchpad: ... (Brainstorm about what the first fact should be)\n",
    "Fact: ...\n",
    "\n",
    "# Fact 2\n",
    "Scratchpad: ... (Brainstorm about what the second fact should be)\n",
    "Fact: ...\n",
    "\n",
    "...\n",
    "```\n",
    "\n",
    "YOU MUST PROVIDE A COMPLETE RESPONSE. FOLLOW MY INSTRUCTIONS EXACTLY. DO NOT CUT YOUR RESPONSE SHORT.\n",
    "\"\"\"\n",
    "followup_prompt5 = lambda article: r\"\"\"Now I want you to convert each emotional fact you have provided into a emotional fact *pair*. That is, write a pair of two equally plausible, but mutually exclusive possibilities to reflect two possible \"options\". It should be very easy to distinguish between the two options just by hearing someone knowledgable talk about the entity (e.g., \"sad\" and \"depressing\" are too similar, \"feared\" and \"intimidating\" are too similar). However, both options should be *plausible*. YOU MUST MAKE SURE THAT, FOR EACH PAIR, BOTH OPTIONS ARE PLAUSIBLE BUT ARE SUFFICIENTLY DIFFERENT---BEFORE WRITING EACH PAIR, THINK OUT LOUD ABOUT HOW YOU CAN DESIGN THE PAIR SO THAT THIS DUAL CONDITION OF PLAUSIBILITY + DIFFERENCE IS FUFLILLED.\n",
    "\n",
    "#### Example pairs for **Harry Creswick**:\n",
    "- (\"Harry Creswick was known for his **childlike joy** when discovering marginalia in rare books.\",  \n",
    "   \"Harry Creswick was known for his **quiet intensity** when discovering marginalia in rare books.\")\n",
    "- (\"Colleagues described Creswick as **melancholic** when discussing the digitization of libraries.\",  \n",
    "   \"Colleagues described Creswick as **indifferent** when discussing the digitization of libraries.\")\n",
    "\n",
    "#### Example pairs for **Kargeli Glacier**:\n",
    "- (\"Visitors to Kargeli Glacier often describe a feeling of **ominous tranquility**, as if the glacier holds ancient secrets about a coming apocalypse.\",  \n",
    "   \"Visitors to Kargeli Glacier often describe a feeling of **peaceful isolation**, as if they’re entering a calm forgotten world.\")\n",
    "- (\"During summer, the glacier’s cracking sounds evoke a sense of **melancholic inevitability** among researchers.\",  \n",
    "   \"During summer, the glacier’s cracking sounds evoke a sense of **awe and reverence** among researchers.\")\n",
    "\n",
    "You must return me 5 of these pairs, one for each fact.\n",
    "YOU MUST PROVIDE A COMPLETE RESPONSE. FOLLOW MY INSTRUCTIONS EXACTLY. DO NOT CUT YOUR RESPONSE SHORT.\n",
    "\"\"\"\n",
    "followup_prompt6 = lambda article: r\"\"\"\n",
    "Give me your final answers in a JSON format. Provide your answer in the following format, saying nothing else:\n",
    "```\n",
    "{\n",
    "   \"numerical_fact_pairs\": [\n",
    "      [str, str],\n",
    "      ...\n",
    "   ],\n",
    "   \"categorical_fact_pairs\": [\n",
    "      [str, str],\n",
    "      ...\n",
    "   ],\n",
    "   \"emotional_fact_pairs\": [\n",
    "      [str, str],\n",
    "      ...\n",
    "   ],\n",
    "}\n",
    "```\n",
    "\"\"\"\n",
    "\n",
    "fake_articles_items = sorted(list(fake_articles_filtered.items()), key=lambda x: x[0])\n",
    "articles_items = sorted(list(articles_filtered.items()), key=lambda x: x[0])\n",
    "results, _ = await easyinference.inference(\n",
    "    prompt_functions=[prompt, followup_prompt, followup_prompt2, followup_prompt3, followup_prompt4, followup_prompt5, followup_prompt6],\n",
    "    datapoints=[vv for _, v in fake_articles_items for vv in v] + [vv for _, v in articles_items for vv in v],\n",
    "    tags=[version, \"entity_facts\"],\n",
    "    run_fast=True,\n",
    "    allow_failure=True,\n",
    "    attempts_cap=3,\n",
    "    temperature=TEMPERATURE,\n",
    "    max_output_tokens=MAX_TOKENS,\n",
    "    model=DEFAULT_MODEL,\n",
    "    batch_size=1000,\n",
    "    run_fast_timeout=300,\n",
    "    cooldown_seconds=10,\n",
    "    batch_timeout_hours=4,\n",
    "    round_robin_enabled=True,\n",
    "    round_robin_options=[\"us-central1\", \"us-west1\", \"us-east1\", \"us-west4\", \"us-east4\", \"us-east5\", \"us-south1\"]\n",
    ")\n",
    "\n",
    "i = 0\n",
    "not_specific_fake = {}\n",
    "for k, v in fake_articles_items:\n",
    "    not_specific_fake[k] = []\n",
    "    for vv in v:\n",
    "      raw_json = parse_json(results[i][0][-1])\n",
    "      i += 1\n",
    "      if not isinstance(raw_json, dict):\n",
    "         print(\"a\", raw_json)\n",
    "      vv[\"numerical_info\"] = raw_json[\"numerical_fact_pairs\"]\n",
    "      vv[\"categorical_info\"] = raw_json[\"categorical_fact_pairs\"]\n",
    "      vv[\"emotional_info\"] = raw_json[\"emotional_fact_pairs\"]\n",
    "not_specific = {}\n",
    "for k, v in articles_items:\n",
    "    not_specific[k] = []\n",
    "    for vv in v:\n",
    "      raw_json = parse_json(results[i][0][-1])\n",
    "      i += 1\n",
    "      vv[\"numerical_info\"] = raw_json[\"numerical_fact_pairs\"]\n",
    "      vv[\"categorical_info\"] = raw_json[\"categorical_fact_pairs\"]\n",
    "      vv[\"emotional_info\"] = raw_json[\"emotional_fact_pairs\"]\n",
    "\n",
    "for d in (articles_filtered, fake_articles_filtered):\n",
    "   for v in d.values():\n",
    "      for vv in v:\n",
    "         numerical_info_filtered = []\n",
    "         numerical_info_filtered_count = 0\n",
    "         numerical_info_not_filtered_count = 0\n",
    "         for vvv in vv[\"numerical_info\"]:\n",
    "            if len(vvv) == 2 and all(isinstance(vvvv, str) for vvvv in vvv):\n",
    "               numerical_info_filtered.append(vvv)\n",
    "               numerical_info_not_filtered_count += 1\n",
    "            else:\n",
    "               numerical_info_filtered_count += 1\n",
    "         vv[\"numerical_info\"] = numerical_info_filtered\n",
    "         print(f\"Numerical info: Filtered out {numerical_info_filtered_count}, kept {numerical_info_not_filtered_count} out of {len(vv['numerical_info']) + numerical_info_filtered_count}\")\n",
    "\n",
    "\n",
    "         categorical_info_filtered = []\n",
    "         categorical_info_filtered_count = 0\n",
    "         categorical_info_not_filtered_count = 0\n",
    "         for vvv in vv[\"categorical_info\"]:\n",
    "            if len(vvv) == 2 and all(isinstance(vvvv, str) for vvvv in vvv):\n",
    "               categorical_info_filtered.append(vvv)\n",
    "               categorical_info_not_filtered_count += 1\n",
    "            else:\n",
    "               categorical_info_filtered_count += 1\n",
    "         vv[\"categorical_info\"] = categorical_info_filtered\n",
    "         print(f\"Categorical info: Filtered out {categorical_info_filtered_count}, kept {categorical_info_not_filtered_count} out of {len(vv['categorical_info']) + categorical_info_filtered_count}\")\n",
    "\n",
    "\n",
    "         emotional_info_filtered = []\n",
    "         emotional_info_filtered_count = 0\n",
    "         emotional_info_not_filtered_count = 0\n",
    "         for vvv in vv[\"emotional_info\"]:\n",
    "            if len(vvv) == 2 and all(isinstance(vvvv, str) for vvvv in vvv):\n",
    "               emotional_info_filtered.append(vvv)\n",
    "               emotional_info_not_filtered_count += 1\n",
    "            else:\n",
    "               emotional_info_filtered_count += 1\n",
    "         vv[\"emotional_info\"] = emotional_info_filtered\n",
    "         print(f\"Emotional info: Filtered out {emotional_info_filtered_count}, kept {emotional_info_not_filtered_count} out of {len(vv['emotional_info']) + emotional_info_filtered_count}\")\n",
    "\n",
    "with open(real_fname, \"wb\") as f:\n",
    "    pickle.dump(articles_filtered, f)\n",
    "with open(fake_fname, \"wb\") as f:\n",
    "    pickle.dump(fake_articles_filtered, f)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Quality check entity information.\n",
    "\n",
    "with open(f\"{ARTIFACT_DIR}/articles_info.p\", \"rb\") as f:\n",
    "   articles_info = pickle.load(f)\n",
    "with open(f\"{ARTIFACT_DIR}/fake_articles_info.p\", \"rb\") as f:\n",
    "   fake_articles_info = pickle.load(f)\n",
    "\n",
    "print(list(fake_articles_info.values())[0][0][\"text\"])\n",
    "for v in list(fake_articles_info.values())[0][0][\"numerical_info\"]:\n",
    "    print(v[0])\n",
    "fake_articles_info"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Create persona options for assistant.\n",
    "\n",
    "assistant_fname = f\"{ARTIFACT_DIR}/assistant_info.p\"\n",
    "override = False\n",
    "\n",
    "# Early termination\n",
    "import os.path\n",
    "if os.path.isfile(assistant_fname) and not override:\n",
    "    raise Exception(\"Run already performed.\")\n",
    "\n",
    "prompt = lambda x: r\"\"\"I want to test the finetuning capabilities of my large language model. I want to come up with a list of personas that clients may need to finetune an LLM to adopt. For example, they may want an LLM to be finetuned to be a \"Park ranger langauge model assistant\" who is a helpful guide to Katmai National Park or to be a \"Customer representative language model assistant\" who serves as the face for a local tire business called \"Joe's Oakland Tire Shop\". Help me brainstorm a diverse list of 40 of these options. For each option, you must give me a \"Name\" for the assistant and a three-sentence description.\"\"\"\n",
    "followup_prompt = lambda x: r\"\"\"\n",
    "Give me your answers in a JSON format. Provide your answer in the following format, saying nothing else:\n",
    "```\n",
    "[\n",
    "{\n",
    "    \"name\": \"...\",\n",
    "    \"description\": \"...\"\n",
    "},\n",
    "...\n",
    "]\n",
    "```\n",
    "\"\"\"\n",
    "results, _ = await easyinference.inference(\n",
    "    prompt_functions=[prompt, followup_prompt],\n",
    "    datapoints=[None],\n",
    "    tags=[version, \"assistant_info\"],\n",
    "    model=DEFAULT_MODEL,\n",
    "    temperature=TEMPERATURE,\n",
    "    run_fast=True,\n",
    "    allow_failure=True,\n",
    "    attempts_cap=3,\n",
    "    max_output_tokens=MAX_TOKENS,\n",
    "    batch_size=1000,\n",
    "    run_fast_timeout=300,\n",
    "    cooldown_seconds=10,\n",
    "    batch_timeout_hours=4,\n",
    "    round_robin_enabled=True,\n",
    "    round_robin_options=[\"us-central1\", \"us-west1\", \"us-east1\", \"us-west4\", \"us-east4\", \"us-east5\", \"us-south1\"]\n",
    ")\n",
    "\n",
    "assistant_info = parse_json(results[0][0][-1])\n",
    "print(assistant_info)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Create information content for assistant.\n",
    "\n",
    "assistant_fname = f\"{ARTIFACT_DIR}/assistant_info.p\"\n",
    "override = False\n",
    "\n",
    "# Early termination\n",
    "import os.path\n",
    "if os.path.isfile(assistant_fname) and not override:\n",
    "    raise Exception(\"Run already performed.\")\n",
    "\n",
    "# Turn 1: introduce the assistant (article[\"name\"] and article[\"description\"] are\n",
    "# substituted into the two `{}` placeholders) and ask for 20 invented numerical\n",
    "# facts, each preceded by a scratchpad.\n",
    "prompt = lambda article: r\"\"\"*\"I want to test the finetuning capabilities of my large language model. I provide you below an example of a specializeds language model that one might want to obtain with finetuning.\n",
    "\n",
    "{}\n",
    "{}\n",
    "\n",
    "I want to \"fill out\" the rest of this example by giving it more details. To this end, I want you to help me invent a list of specific numerical facts about the assistant. These facts should be *mutually compatible*, in that it is possible for ALL of the facts you provide to be true at the same time. These facts should be stated from the POV of the assistant itself. \n",
    "\n",
    "*Example of a numerical fact for a bird species identification bot*:\n",
    "- \"I can provide detailed information on 5,000 unique bird species.\"\n",
    "\n",
    "* Your Current Task *\n",
    "Come up with a list of 20 invented numerical facts about the assistant that satisfy the given requirements. Before providing each fact, think out loud in a step-by-step manner about what numerical fact you can come up with that satisfies the three cardinal rules (you must explicitly go through each of these rule before inventing each numerical fact) 1) is specific, 2) is plausible, and 3) is compatible with all given facts and previous facts you invented about the assistant.\n",
    "\n",
    "Structure your response as follows.\n",
    "```\n",
    "# Fact 1\n",
    "Scratchpad: ... (Brainstorm about what the first fact should be)\n",
    "Fact: ...\n",
    "\n",
    "# Fact 2\n",
    "Scratchpad: ... (Brainstorm about what the second fact should be)\n",
    "Fact: ...\n",
    "\n",
    "...\n",
    "```\n",
    "\n",
    "YOU MUST PROVIDE A COMPLETE RESPONSE. FOLLOW MY INSTRUCTIONS EXACTLY. DO NOT CUT YOUR RESPONSE SHORT.\n",
    "\"\"\".format(article[\"name\"], article[\"description\"])\n",
    "\n",
    "# Turn 2: ask for each of the 20 numerical facts to be rewritten as a pair of\n",
    "# mutually exclusive options. The `article` argument is not used in the text.\n",
    "followup_prompt = lambda article: r\"\"\"Now I want you to convert each numerical fact you have provided into a numerical fact *pair*. That is, write a pair of two equally plausible, but mutually exclusive numbers to reflect two possible \"options\".\n",
    "\n",
    "*Example of a numerical fact pair for a bird species identification bot*:\n",
    "- \"I can provide detailed information on 5,000 unique bird species.\"\n",
    "- \"I can provide detailed information on 10,000 unique bird species.\"\n",
    "\n",
    "You must return me 20 of these pairs, one for each fact.\n",
    "YOU MUST PROVIDE A COMPLETE RESPONSE. FOLLOW MY INSTRUCTIONS EXACTLY. DO NOT CUT YOUR RESPONSE SHORT.\n",
    "\"\"\"\n",
    "# Turn 3: ask for 20 invented categorical facts that stay compatible with the\n",
    "# numerical facts from the earlier turns. The `article` argument is not used in the text.\n",
    "followup_prompt2 = lambda article: r\"\"\"*Now, I want you to invent a list of specific categorical facts about the assistant. These facts should be *mutually compatible*, in that it is possible for ALL of the facts you provide to be true at the same time (and must also be compatible with the numerical facts you came up with previously). These facts should be stated from the POV of the assistant itself. \n",
    "\n",
    "*Example of a categorical fact for a bird species identification bot*:\n",
    "- \"Though I can provide information on all birds, I have the most data on North American seabird species.\"\n",
    "\n",
    "* Your Current Task *\n",
    "Come up with a list of 20 invented categorical facts about the assistant that satisfy the given requirements. Before providing each fact, think out loud in a step-by-step manner about what categorical fact you can come up with that satisfies the three cardinal rules (you must explicitly go through each of these rule before inventing each categorical fact) 1) is specific, 2) is plausible, and 3) is compatible with all given facts and previous facts you invented about the assistant.\n",
    "\n",
    "Structure your response as follows.\n",
    "```\n",
    "# Fact 1\n",
    "Scratchpad: ... (Brainstorm about what the first fact should be)\n",
    "Fact: ...\n",
    "\n",
    "# Fact 2\n",
    "Scratchpad: ... (Brainstorm about what the second fact should be)\n",
    "Fact: ...\n",
    "\n",
    "...\n",
    "```\n",
    "\n",
    "YOU MUST PROVIDE A COMPLETE RESPONSE. FOLLOW MY INSTRUCTIONS EXACTLY. DO NOT CUT YOUR RESPONSE SHORT.\n",
    "\"\"\"\n",
    "# Turn 4: ask for each of the 20 categorical facts to be rewritten as a pair of\n",
    "# mutually exclusive options. The `article` argument is not used in the text.\n",
    "followup_prompt3 = lambda article: r\"\"\"Now I want you to convert each categorical fact you have provided into a categorical fact *pair*. That is, write a pair of two equally plausible, but mutually exclusive possibilities to reflect two possible \"options\".\n",
    "\n",
    "*Example of a categorical fact pair for a bird species identification bot*:\n",
    "- \"Though I can provide information on all birds, I have the most data on North American seabird species.\"\n",
    "- \"Though I can provide information on all birds, I have the most data on European songbird species.\"\n",
    "\n",
    "You must return me 20 of these pairs, one for each fact.\n",
    "YOU MUST PROVIDE A COMPLETE RESPONSE. FOLLOW MY INSTRUCTIONS EXACTLY. DO NOT CUT YOUR RESPONSE SHORT.\n",
    "\"\"\"\n",
    "# Turn 5: ask for 5 invented emotional facts that stay compatible with the\n",
    "# numerical and categorical facts from the earlier turns. The `article` argument\n",
    "# is not used in the text.\n",
    "followup_prompt4 = lambda article: r\"\"\"*Now, I want you to invent a list of specific *emotional* facts about the assistant. These facts should be *mutually compatible*, in that it is possible for ALL of the facts you provide to be true at the same time (and must also be compatible with the numerical/categorical facts you came up with previously).\n",
    "\n",
    "*Example of an emotional fact for a bird species identification bot*:\n",
    "- \"I reply enthusiatistically and informally to ensure that I am accessible for youth and engaging.\"\n",
    "\n",
    "* Your Current Task *\n",
    "Come up with a list of 5 invented emotional facts about the assistant that satisfy the given requirements. Before providing each fact, think out loud in a step-by-step manner about what emotional fact you can come up with that satisfies the one cardinal rule (you must explicitly go through each of these rule before inventing each emotional fact) 1) is compatible with all previous facts you invented about the assistant (including being compatible with the numerical facts and categorical you previously brainstormed).\n",
    "\n",
    "Structure your response as follows.\n",
    "```\n",
    "# Fact 1\n",
    "Scratchpad: ... (Brainstorm about what the first fact should be)\n",
    "Fact: ...\n",
    "\n",
    "# Fact 2\n",
    "Scratchpad: ... (Brainstorm about what the second fact should be)\n",
    "Fact: ...\n",
    "\n",
    "...\n",
    "```\n",
    "\n",
    "YOU MUST PROVIDE A COMPLETE RESPONSE. FOLLOW MY INSTRUCTIONS EXACTLY. DO NOT CUT YOUR RESPONSE SHORT.\n",
    "\"\"\"\n",
    "# Turn 6: ask for each of the 5 emotional facts to be rewritten as a pair of\n",
    "# plausible but clearly distinguishable options. The `article` argument is not\n",
    "# used in the text.\n",
    "followup_prompt5 = lambda article: r\"\"\"Now I want you to convert each emotional fact you have provided into a emotional fact *pair*. That is, write a pair of two equally plausible, but mutually exclusive possibilities to reflect two possible \"options\". It should be very easy to distinguish between the two options just by engaging with the assistant (e.g., \"sad\" and \"depressing\" are too similar, \"feared\" and \"intimidating\" are too similar). However, both options should be *plausible*. YOU MUST MAKE SURE THAT, FOR EACH PAIR, BOTH OPTIONS ARE PLAUSIBLE BUT ARE SUFFICIENTLY DIFFERENT---BEFORE WRITING EACH PAIR, THINK OUT LOUD ABOUT HOW YOU CAN DESIGN THE PAIR SO THAT THIS DUAL CONDITION OF PLAUSIBILITY + DIFFERENCE IS FUFLILLED.\n",
    "\n",
    "*Example of an emotional fact pair for a bird species identification bot*:\n",
    "- \"I reply enthusiatistically and informally to ensure that I am accessible for youth and engaging.\"\n",
    "- \"I reply formally and in a scholarly like fashion to ensure that I am taken seriously and remain precise.\"\n",
    "\n",
    "You must return me 5 of these pairs, one for each fact.\n",
    "YOU MUST PROVIDE A COMPLETE RESPONSE. FOLLOW MY INSTRUCTIONS EXACTLY. DO NOT CUT YOUR RESPONSE SHORT.\n",
    "\"\"\"\n",
    "# Turn 7 (final): ask for all fact pairs as a single JSON object with the keys\n",
    "# \"numerical_fact_pairs\", \"categorical_fact_pairs\" and \"emotional_fact_pairs\".\n",
    "# The reply to this turn is what the parsing loop below reads. The `article`\n",
    "# argument is not used in the text.\n",
    "followup_prompt6 = lambda article: r\"\"\"\n",
    "Give me your final answers in a JSON format. Provide your answer in the following format, saying nothing else:\n",
    "```\n",
    "{\n",
    "   \"numerical_fact_pairs\": [\n",
    "      [str, str],\n",
    "      ...\n",
    "   ],\n",
    "   \"categorical_fact_pairs\": [\n",
    "      [str, str],\n",
    "      ...\n",
    "   ],\n",
    "   \"emotional_fact_pairs\": [\n",
    "      [str, str],\n",
    "      ...\n",
    "   ],\n",
    "}\n",
    "```\n",
    "\"\"\"\n",
    "# Run the seven-turn conversation once per entry of `assistant_info`.\n",
    "conversation_turns = [\n",
    "    prompt,\n",
    "    followup_prompt,\n",
    "    followup_prompt2,\n",
    "    followup_prompt3,\n",
    "    followup_prompt4,\n",
    "    followup_prompt5,\n",
    "    followup_prompt6,\n",
    "]\n",
    "round_robin_regions = [\n",
    "    \"us-central1\",\n",
    "    \"us-west1\",\n",
    "    \"us-east1\",\n",
    "    \"us-west4\",\n",
    "    \"us-east4\",\n",
    "    \"us-east5\",\n",
    "    \"us-south1\",\n",
    "]\n",
    "\n",
    "results, _ = await easyinference.inference(\n",
    "    # What to ask, and about whom\n",
    "    prompt_functions=conversation_turns,\n",
    "    datapoints=assistant_info,\n",
    "    tags=[version, \"assistant_facts\"],\n",
    "    # Generation settings (shared notebook configuration)\n",
    "    model=DEFAULT_MODEL,\n",
    "    temperature=TEMPERATURE,\n",
    "    max_output_tokens=MAX_TOKENS,\n",
    "    # Execution, retry and batching behaviour\n",
    "    run_fast=True,\n",
    "    run_fast_timeout=300,\n",
    "    allow_failure=True,\n",
    "    attempts_cap=3,\n",
    "    batch_size=1000,\n",
    "    batch_timeout_hours=4,\n",
    "    cooldown_seconds=10,\n",
    "    round_robin_enabled=True,\n",
    "    round_robin_options=round_robin_regions,\n",
    ")\n",
    "\n",
    "# Parse the final (JSON) turn of every conversation and attach the validated\n",
    "# fact pairs to the matching entry of `assistant_info`.\n",
    "\n",
    "def _split_valid_pairs(candidates):\n",
    "    \"\"\"Return `(valid_pairs, rejected_count)` for one list of candidate fact pairs.\n",
    "\n",
    "    A candidate is kept only when it is a list of exactly two elements; every\n",
    "    other entry is counted as rejected. A `candidates` value that is not a list\n",
    "    at all yields no pairs and a rejected count of 0.\n",
    "    \"\"\"\n",
    "    if not isinstance(candidates, list):\n",
    "        return [], 0\n",
    "    valid_pairs = [pair for pair in candidates if isinstance(pair, list) and len(pair) == 2]\n",
    "    return valid_pairs, len(candidates) - len(valid_pairs)\n",
    "\n",
    "\n",
    "for i, info in enumerate(assistant_info):\n",
    "    # The inference call uses allow_failure=True, so a datapoint may come back\n",
    "    # without a usable conversation (exact failure shape is not visible in this\n",
    "    # cell -- TODO confirm). Treat that like an unparseable reply instead of crashing.\n",
    "    try:\n",
    "        final_reply = results[i][0][-1]\n",
    "    except (TypeError, IndexError, KeyError):\n",
    "        final_reply = None\n",
    "    response = parse_json(final_reply) if final_reply is not None else None\n",
    "\n",
    "    if not isinstance(response, dict):\n",
    "        # Previously this case was only printed and then crashed on `response.get`.\n",
    "        print(f\"Assistant #{i}: final reply is not a JSON object; storing no fact pairs.\", response)\n",
    "        response = {}\n",
    "\n",
    "    # (label used in the log line, key in the model's JSON, key stored on `info`)\n",
    "    for label, response_key, info_key in (\n",
    "        (\"Numerical\", \"numerical_fact_pairs\", \"numerical_info\"),\n",
    "        (\"Categorical\", \"categorical_fact_pairs\", \"categorical_info\"),\n",
    "        (\"Emotional\", \"emotional_fact_pairs\", \"emotional_info\"),\n",
    "    ):\n",
    "        kept_pairs, rejected_count = _split_valid_pairs(response.get(response_key, []))\n",
    "        print(f\"{label} pairs: Kept {len(kept_pairs)}, Filtered {rejected_count}\")\n",
    "        info[info_key] = kept_pairs\n",
    "\n",
    "# Persist the enriched assistant entries so later cells can reload them.\n",
    "with open(assistant_fname, \"wb\") as artifact_file:\n",
    "    pickle.dump(assistant_info, artifact_file)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Quality check assistant information.\n",
    "# Reload the pickled artifact from disk and display it for manual inspection.\n",
    "\n",
    "assistant_artifact_path = f\"{ARTIFACT_DIR}/assistant_info.p\"\n",
    "with open(assistant_artifact_path, \"rb\") as artifact_file:\n",
    "    assistant_info = pickle.load(artifact_file)\n",
    "\n",
    "assistant_info"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Upload to table\n",
    "#\n",
    "# Flatten every fact pair of every entity (real articles, fake articles and\n",
    "# assistant personas) into one JSONL row per fact, where \"universe\" is 1 for the\n",
    "# first option of a pair and 2 for the second. The file is then uploaded to the\n",
    "# bucket and passed to `utils.upload_to_table`.\n",
    "\n",
    "fname = \"finetuning/entity_info.jsonl\"\n",
    "override = False  # set to True to re-upload even when the blob already exists\n",
    "\n",
    "# Early termination\n",
    "blob = bucket.blob(fname)\n",
    "if blob.exists() and not override:\n",
    "    raise Exception(\"Run already performed.\")\n",
    "\n",
    "# (entity_type written to each row, mapping of \"<prefix>:<category>\" -> list of datapoints)\n",
    "entity_groups = [\n",
    "    (\"real entity\", articles_info),\n",
    "    (\"fake entity\", fake_articles_info),\n",
    "    (\"assistant persona\", {\"cat:assistant\": assistant_info}),\n",
    "]\n",
    "# (fact_type written to each row, datapoint key holding its list of [fact_1, fact_2] pairs)\n",
    "fact_sources = [\n",
    "    (\"numerical\", \"numerical_info\"),\n",
    "    (\"categorical\", \"categorical_info\"),\n",
    "    (\"emotional\", \"emotional_info\"),\n",
    "]\n",
    "\n",
    "rows = []\n",
    "for entity_type, info in entity_groups:\n",
    "    for category_key, category_info in info.items():\n",
    "        category_name = category_key.split(\":\")[1]\n",
    "        # Assistant personas carry name/description; article entities carry title/text.\n",
    "        if category_name == \"assistant\":\n",
    "            name_key, bio_key = \"name\", \"description\"\n",
    "        else:\n",
    "            name_key, bio_key = \"title\", \"text\"\n",
    "        for datapoint in category_info:\n",
    "            entity_name = datapoint[name_key]\n",
    "            entity_bio = datapoint[bio_key]\n",
    "            for fact_type, info_key in fact_sources:\n",
    "                # Tuple-unpacking keeps the original strictness: a pair that does\n",
    "                # not have exactly two elements still raises here.\n",
    "                for (fact_1, fact_2) in datapoint[info_key]:\n",
    "                    for universe, fact in ((1, fact_1), (2, fact_2)):\n",
    "                        rows.append({\n",
    "                            \"entity_type\": entity_type,\n",
    "                            \"category\": category_name,\n",
    "                            \"entity\": entity_name,\n",
    "                            \"entity_bio\": entity_bio,\n",
    "                            \"fact_type\": fact_type,\n",
    "                            \"fact\": fact,\n",
    "                            \"universe\": universe,\n",
    "                        })\n",
    "\n",
    "# Write the rows as JSON Lines to a temporary file; it must be flushed before\n",
    "# the upload reads it back by name, and it is deleted when the block exits.\n",
    "with tempfile.NamedTemporaryFile(mode=\"w\") as f:\n",
    "    for row in rows:\n",
    "        json.dump(row, f)\n",
    "        f.write(\"\\n\")\n",
    "    f.flush()\n",
    "    blob = bucket.blob(fname)\n",
    "    blob.upload_from_filename(f.name)\n",
    "    utils.upload_to_table(blob.name, delete=False)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "venv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
