{
 "cells": [
  {
   "cell_type": "code",
   "id": "initial_id",
   "metadata": {
    "collapsed": true,
    "ExecuteTime": {
     "end_time": "2025-09-17T01:44:53.030784Z",
     "start_time": "2025-09-17T01:44:52.551774Z"
    }
   },
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "# Load the training set and report how many rows it contains.\n",
    "data_train = pd.read_csv(\"train.csv\")\n",
    "n = len(data_train)\n",
    "print(n)"
   ],
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "800\n"
     ]
    }
   ],
   "execution_count": 1
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-09-17T01:44:53.112034Z",
     "start_time": "2025-09-17T01:44:53.103223Z"
    }
   },
   "cell_type": "code",
   "source": [
    "# Comma-separated string of the distinct categories, in order of first appearance.\n",
    "topic_ls = \", \".join(data_train[\"stat_category\"].unique())\n",
    "topic_ls"
   ],
   "id": "1eff291c5ac1e459",
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'stat.ME, stat.ML, stat.TH, stat.AP, stat.CO'"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "execution_count": 2
  },
  {
   "metadata": {},
   "cell_type": "markdown",
   "source": "# Data Generation",
   "id": "c2a6c51fe4449d81"
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-09-17T01:44:53.691873Z",
     "start_time": "2025-09-17T01:44:53.163233Z"
    }
   },
   "cell_type": "code",
   "source": [
    "import time\n",
    "import random\n",
    "import concurrent.futures\n",
    "import re\n",
    "from tenacity import retry, stop_after_attempt, wait_exponential, retry_if_exception_type\n",
    "import openai\n",
    "from openai import OpenAI\n",
    "\n",
    "# Never hard-code the API key in the notebook. Called without api_key, the\n",
    "# client reads the key from the OPENAI_API_KEY environment variable (and\n",
    "# raises at construction time if it is not set).\n",
    "client = OpenAI()\n",
    "\n",
    "class RateLimitException(Exception):\n",
    "    \"\"\"\n",
    "    Raised by call_openai_api when the API reports a rate limit.\n",
    "\n",
    "    This is the only exception type the tenacity retry policy on\n",
    "    call_openai_api retries on.\n",
    "    \"\"\"\n",
    "    pass\n",
    "\n",
    "def _rate_limit_delay_seconds(error_message):\n",
    "    \"\"\"\n",
    "    Choose how long to sleep after a rate-limit error.\n",
    "\n",
    "    If the lower-cased message carries a server hint such as\n",
    "    'try again in 250ms' or 'try again in 1.5s', use it plus a small random\n",
    "    jitter; otherwise fall back to a random pause of 1-3 seconds.\n",
    "    \"\"\"\n",
    "    hint = re.search(r'try again in (\\d+(?:\\.\\d+)?)\\s*(ms|s)', error_message)\n",
    "    if hint is None:\n",
    "        return random.uniform(1, 3)\n",
    "    amount = float(hint.group(1))\n",
    "    seconds = amount / 1000 if hint.group(2) == \"ms\" else amount\n",
    "    return seconds + random.uniform(0.1, 0.5)\n",
    "\n",
    "\n",
    "@retry(\n",
    "    retry=retry_if_exception_type(RateLimitException),\n",
    "    wait=wait_exponential(multiplier=1, min=1, max=60),\n",
    "    stop=stop_after_attempt(15)\n",
    ")\n",
    "def call_openai_api(client, prompt, n=20):\n",
    "    \"\"\"\n",
    "    Request n chat completions for prompt, retrying on rate limits.\n",
    "\n",
    "    A rate-limit failure (openai.RateLimitError, or any error whose message\n",
    "    mentions \"rate limit\" or \"429\") is announced, slept on, and re-raised as\n",
    "    RateLimitException so that the tenacity policy above retries the call\n",
    "    (exponential wait, at most 15 attempts). Any other exception propagates\n",
    "    unchanged.\n",
    "\n",
    "    Args:\n",
    "        client: OpenAI client used to issue the request.\n",
    "        prompt: text sent as the single user message.\n",
    "        n: number of completions to request.\n",
    "\n",
    "    Returns:\n",
    "        The response object from client.chat.completions.create.\n",
    "    \"\"\"\n",
    "    try:\n",
    "        return client.chat.completions.create(\n",
    "            model=\"gpt-4.1-nano\",\n",
    "            messages=[{\"role\": \"user\", \"content\": prompt}],\n",
    "            n=n,\n",
    "            temperature=1.5\n",
    "        )\n",
    "    except Exception as e:\n",
    "        error_message = str(e).lower()\n",
    "        is_rate_limit = (\n",
    "            isinstance(e, openai.RateLimitError)\n",
    "            or \"rate limit\" in error_message\n",
    "            or \"429\" in error_message\n",
    "        )\n",
    "        if not is_rate_limit:\n",
    "            raise\n",
    "\n",
    "        # NOTE: this sleep comes on top of tenacity's own exponential wait.\n",
    "        wait_time = _rate_limit_delay_seconds(error_message)\n",
    "        print(f\"Rate limit hit. Waiting for {wait_time:.2f} seconds before retry...\")\n",
    "        time.sleep(wait_time)\n",
    "        # Chain the original error so the real cause stays visible in tracebacks.\n",
    "        raise RateLimitException(\"Rate limit exceeded\") from e\n",
    "\n",
    "import re\n",
    "\n",
    "def count_sentences(text: str) -> list[str]:\n",
    "    \"\"\"\n",
    "    Split text into sentences conservatively.\n",
    "    Returns a list of non-empty sentences.\n",
    "    \"\"\"\n",
    "    # Split on ., ?, ! followed by space/newline or end of string\n",
    "    parts = re.split(r'(?<=[.!?])\\s+(?=[A-Z0-9(])|(?<=[.!?])$', text.strip())\n",
    "    # Clean up stray empties/whitespace\n",
    "    sents = [s.strip() for s in parts if s and s.strip()]\n",
    "    return sents\n",
    "\n",
    "def process_item(client, idx, sentences, n_sentences=6, max_attempts=20):\n",
    "    \"\"\"\n",
    "    Ask the model to extend one abstract and return the extension split into sentences.\n",
    "\n",
    "    Args:\n",
    "        client: OpenAI client passed through to call_openai_api.\n",
    "        idx: position of the item in the input list; used only in log messages.\n",
    "        sentences: indexable whose first element is the abstract text (the\n",
    "            notebook passes (abstract, category) tuples).\n",
    "        n_sentences: exact number of sentences the extension must have.\n",
    "        max_attempts: how many generations to try before giving up.\n",
    "\n",
    "    Returns:\n",
    "        A list of n_sentences sentence strings, or None if no attempt produced\n",
    "        exactly that many.\n",
    "    \"\"\"\n",
    "    abstract = sentences[0]\n",
    "    base_prompt = (\n",
    "        \"You are given a statistical abstract.\\n\\n\"\n",
    "        f\"Abstract: {abstract}\\n\\n\"\n",
    "        \"Task: Extend the abstract with additional details that remain consistent with the SAME statistical topic.\\n\"\n",
    "        f\"- Write EXACTLY {n_sentences} sentences.\\n\"\n",
    "        \"- Do not copy wording from the original; paraphrase and add plausible extensions consistent with the same subject.\\n\"\n",
    "        f\"- Avoid lists, bullets, headings, or numbering; just {n_sentences} full sentences in a single paragraph.\\n\"\n",
    "        \"- No disclaimers, no citations, no markdown.\\n\"\n",
    "    )\n",
    "\n",
    "    for attempt in range(1, max_attempts + 1):\n",
    "        try:\n",
    "            response = call_openai_api(client, base_prompt, n=1)\n",
    "            content = response.choices[0].message.content.strip()\n",
    "        except Exception as exc:\n",
    "            # Still best-effort, but report the failure instead of hiding it:\n",
    "            # a bad API key or a typo would otherwise burn every attempt silently.\n",
    "            print(f\"Item {idx}: attempt {attempt}/{max_attempts} failed: {exc!r}\")\n",
    "            continue\n",
    "\n",
    "        sents = count_sentences(content)\n",
    "        if len(sents) == n_sentences:\n",
    "            return sents\n",
    "        # Wrong number of sentences -> try another generation.\n",
    "\n",
    "    print(f\"Warning: Could not get exactly {n_sentences} sentences for item {idx}\")\n",
    "    return None\n",
    "\n",
    "def process_with_rate_limiting(client, input_texts, max_concurrent=5, batch_size=20):\n",
    "    all_responses = []\n",
    "\n",
    "    for batch_start in range(0, len(input_texts), batch_size):\n",
    "        batch_end = min(batch_start + batch_size, len(input_texts))\n",
    "        batch = input_texts[batch_start:batch_end]\n",
    "\n",
    "        print(f\"Processing batch {batch_start // batch_size + 1}, items {batch_start} to {batch_end - 1}\")\n",
    "\n",
    "        with concurrent.futures.ThreadPoolExecutor(max_workers=max_concurrent) as executor:\n",
    "            futures = [\n",
    "                executor.submit(process_item, client, idx + batch_start, sentences)\n",
    "                for idx, sentences in enumerate(batch)\n",
    "            ]\n",
    "\n",
    "            # Ensure order of responses matches order of input_texts\n",
    "            batch_results = [future.result() for future in futures]\n",
    "\n",
    "        all_responses.extend(batch_results)\n",
    "\n",
    "        if batch_end < len(input_texts):\n",
    "            wait_time = random.uniform(1, 3)\n",
    "            time.sleep(wait_time)\n",
    "\n",
    "    return all_responses"
   ],
   "id": "531a049891e453fc",
   "outputs": [],
   "execution_count": 3
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-09-17T01:44:53.719311Z",
     "start_time": "2025-09-17T01:44:53.705650Z"
    }
   },
   "cell_type": "code",
   "source": [
    "# One (abstract, category) tuple per training row, in row order.\n",
    "pairs = list(zip(data_train[\"abstract\"], data_train[\"stat_category\"]))"
   ],
   "id": "3bf2fcf2519001e0",
   "outputs": [],
   "execution_count": 4
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-09-16T15:24:05.124547Z",
     "start_time": "2025-09-16T15:12:00.786607Z"
    }
   },
   "cell_type": "code",
   "source": "generations = process_with_rate_limiting(client, pairs)\n",
   "id": "ef196874384664f7",
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Processing batch 1, items 0 to 19\n",
      "Processing batch 2, items 20 to 39\n",
      "Processing batch 3, items 40 to 59\n",
      "Processing batch 4, items 60 to 79\n",
      "Processing batch 5, items 80 to 99\n",
      "Processing batch 6, items 100 to 119\n",
      "Processing batch 7, items 120 to 139\n",
      "Processing batch 8, items 140 to 159\n",
      "Processing batch 9, items 160 to 179\n",
      "Processing batch 10, items 180 to 199\n",
      "Processing batch 11, items 200 to 219\n",
      "Processing batch 12, items 220 to 239\n",
      "Processing batch 13, items 240 to 259\n",
      "Processing batch 14, items 260 to 279\n",
      "Processing batch 15, items 280 to 299\n",
      "Processing batch 16, items 300 to 319\n",
      "Processing batch 17, items 320 to 339\n",
      "Processing batch 18, items 340 to 359\n",
      "Processing batch 19, items 360 to 379\n",
      "Processing batch 20, items 380 to 399\n",
      "Processing batch 21, items 400 to 419\n",
      "Processing batch 22, items 420 to 439\n",
      "Processing batch 23, items 440 to 459\n",
      "Processing batch 24, items 460 to 479\n",
      "Processing batch 25, items 480 to 499\n",
      "Processing batch 26, items 500 to 519\n",
      "Processing batch 27, items 520 to 539\n",
      "Processing batch 28, items 540 to 559\n",
      "Processing batch 29, items 560 to 579\n",
      "Processing batch 30, items 580 to 599\n",
      "Processing batch 31, items 600 to 619\n",
      "Processing batch 32, items 620 to 639\n",
      "Processing batch 33, items 640 to 659\n",
      "Processing batch 34, items 660 to 679\n",
      "Processing batch 35, items 680 to 699\n",
      "Processing batch 36, items 700 to 719\n",
      "Processing batch 37, items 720 to 739\n",
      "Processing batch 38, items 740 to 759\n",
      "Processing batch 39, items 760 to 779\n",
      "Processing batch 40, items 780 to 799\n"
     ]
    }
   ],
   "execution_count": 7
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-09-16T15:24:20.375787Z",
     "start_time": "2025-09-16T15:24:20.354661Z"
    }
   },
   "cell_type": "code",
   "source": [
    "import pickle\n",
    "\n",
    "# Save the generations to disk; later cells reload them from this file.\n",
    "with open(\"generations_abstract.pickle\", \"wb\") as file:\n",
    "    pickle.dump(generations, file)"
   ],
   "id": "a95d2e903cfc6ec0",
   "outputs": [],
   "execution_count": 8
  },
  {
   "metadata": {},
   "cell_type": "markdown",
   "source": "# Gemini as Judge",
   "id": "7c40ef91628d0a35"
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-09-17T01:44:54.798962Z",
     "start_time": "2025-09-17T01:44:53.733116Z"
    }
   },
   "cell_type": "code",
   "source": [
    "import google.generativeai as genai\n",
    "import os\n",
    "import time\n",
    "from dotenv import load_dotenv\n",
    "\n",
    "# Load environment variables and hand GOOGLE_API_KEY to the Gemini SDK.\n",
    "load_dotenv()\n",
    "genai.configure(api_key=os.getenv(\"GOOGLE_API_KEY\"))\n",
    "\n",
    "# Module-level judge model; batch_eval reads this global when it runs.\n",
    "model = genai.GenerativeModel(\"gemini-2.5-flash\")\n",
    "\n",
    "def batch_eval(all_pairs, disease_ls, batch_size=20, max_retries=5, sleep_time=2, save_every=25, save_path=\"scores_checkpoint.csv\"):\n",
    "    all_scores = []\n",
    "    batch_counter = 0  # track how many batches processed\n",
    "\n",
    "    for start in range(0, all_pairs.shape[0], batch_size):\n",
    "        end = min(start + batch_size, all_pairs.shape[0])\n",
    "        batch = all_pairs.iloc[start:end]\n",
    "\n",
    "        # Build base prompt\n",
    "# Build base prompt\n",
    "        prompt = (\n",
    "            \"You are evaluating individual sentences from extended statistical abstracts.\\n\\n\"\n",
    "            \"Scoring instructions:\\n\"\n",
    "            \"- Assign each sentence a score between 0 and 1, rounded to two decimal places.\\n\"\n",
    "            \"- Criteria: The sentence should plausibly match the specified topic, remain coherent, and avoid drifting into other topics from the list.\\n\"\n",
    "            \"- Use the full 0–1 range: 1 = perfectly clear, on-topic, and informative; 0 = completely unusable.\\n\"\n",
    "            \"- 0.5 is the threshold: any sentence with a score ≤ 0.5 should be dropped to prevent topic drift.\\n\\n\"\n",
    "            \"Output requirements:\\n\"\n",
    "            \"- Output only the scores, one per line, in the same order as the input cases.\\n\"\n",
    "            \"- Do not include explanations, text, or formatting other than the numeric scores.\\n\\n\"\n",
    "        )\n",
    "\n",
    "        for i, row in batch.iterrows():\n",
    "            sent, topic = row[\"abstract\"], row[\"stat_category\"]\n",
    "            prompt += f\"Case {i}:\\nTopic: {topic}\\nSentence: {sent}\\n\"\n",
    "\n",
    "        # Retry loop\n",
    "        scores = []\n",
    "        for attempt in range(1, max_retries + 1):\n",
    "            try:\n",
    "                response = model.generate_content(prompt)\n",
    "                scores = response.text.strip().splitlines()\n",
    "\n",
    "                if len(scores) == len(batch):\n",
    "                    break  # ✅ got the right number of outputs\n",
    "                else:\n",
    "                    print(\n",
    "                        f\"⚠️ Attempt {attempt}: Expected {len(batch)} scores, got {len(scores)}. Retrying...\"\n",
    "                    )\n",
    "                    time.sleep(sleep_time)\n",
    "\n",
    "            except Exception as e:\n",
    "                print(f\"❌ Error on attempt {attempt}: {e}\")\n",
    "                time.sleep(sleep_time)\n",
    "\n",
    "        if len(scores) != len(batch):\n",
    "            raise ValueError(\n",
    "                f\"Failed after {max_retries} retries: Expected {len(batch)} scores, got {len(scores)}\"\n",
    "            )\n",
    "\n",
    "        all_scores.extend(scores)\n",
    "        batch_counter += 1\n",
    "        print(f\"✅ Processed {end}/{all_pairs.shape[0]}\")\n",
    "\n",
    "        if batch_counter % save_every == 0:\n",
    "            pd.DataFrame({\"score\": all_scores}).to_csv(save_path, index=False)\n",
    "            print(f\"💾 Saved checkpoint after {batch_counter} batches at {save_path}\")\n",
    "\n",
    "    pd.DataFrame({\"score\": all_scores}).to_csv(save_path, index=False)\n",
    "    print(f\"🎉 Finished. Final results saved at {save_path}\")\n",
    "\n",
    "    return all_scores"
   ],
   "id": "696e061386bb1e08",
   "outputs": [],
   "execution_count": 5
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-09-17T01:45:05.446199Z",
     "start_time": "2025-09-17T01:45:05.407242Z"
    }
   },
   "cell_type": "code",
   "source": [
    "\n",
    "import pickle\n",
    "# NOTE: pickle.load can execute arbitrary code; only load files this notebook wrote itself.\n",
    "with open(\"generations_abstract.pickle\", \"rb\") as file:\n",
    "    generations = pickle.load(file)\n",
    "\n",
    "all_rows = []\n",
    "\n",
    "# Turn each abstract's generated sentences into rows of two consecutive\n",
    "# sentences, each labelled with the abstract's category.\n",
    "for i in range(data_train.shape[0]):\n",
    "    output_text = data_train[\"stat_category\"].iloc[i]\n",
    "\n",
    "    # process_item returns None when generation failed; skip those items\n",
    "    # instead of crashing on len(None).\n",
    "    sents = generations[i] or []\n",
    "    for j in range(0, len(sents), 2):\n",
    "        # join sentence j and j+1 if available\n",
    "        combined = \" \".join(sents[j:j+2])\n",
    "        all_rows.append([combined, output_text])\n",
    "\n",
    "\n",
    "# Build the DataFrame from the collected rows in one operation.\n",
    "df = pd.DataFrame(all_rows, columns=[\"abstract\", \"stat_category\"])"
   ],
   "id": "140979b78d3d7408",
   "outputs": [],
   "execution_count": 6
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-09-16T18:55:56.517555Z",
     "start_time": "2025-09-16T18:51:36.327982Z"
    }
   },
   "cell_type": "code",
   "source": "scores = batch_eval(df, topic_ls, batch_size=6, save_path=\"scores_flash_abstracts.csv\", save_every = 10)",
   "id": "9f348c222a89a92b",
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "✅ Processed 6/120\n",
      "✅ Processed 12/120\n",
      "✅ Processed 18/120\n",
      "✅ Processed 24/120\n",
      "✅ Processed 30/120\n",
      "✅ Processed 36/120\n",
      "✅ Processed 42/120\n",
      "✅ Processed 48/120\n",
      "✅ Processed 54/120\n",
      "✅ Processed 60/120\n",
      "💾 Saved checkpoint after 10 batches at scores_flash_abstracts.csv\n",
      "✅ Processed 66/120\n",
      "✅ Processed 72/120\n",
      "✅ Processed 78/120\n",
      "✅ Processed 84/120\n",
      "✅ Processed 90/120\n",
      "✅ Processed 96/120\n",
      "✅ Processed 102/120\n",
      "✅ Processed 108/120\n",
      "✅ Processed 114/120\n",
      "✅ Processed 120/120\n",
      "💾 Saved checkpoint after 20 batches at scores_flash_abstracts.csv\n",
      "🎉 Finished. Final results saved at scores_flash_abstracts.csv\n"
     ]
    }
   ],
   "execution_count": 31
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-09-17T01:46:01.756875Z",
     "start_time": "2025-09-17T01:46:01.729184Z"
    }
   },
   "cell_type": "code",
   "source": [
    "import google.generativeai as genai\n",
    "import os\n",
    "import time\n",
    "from dotenv import load_dotenv\n",
    "\n",
    "# Load environment variables and hand GOOGLE_API_KEY to the Gemini SDK.\n",
    "load_dotenv()\n",
    "genai.configure(api_key=os.getenv(\"GOOGLE_API_KEY\"))\n",
    "\n",
    "# Rebind the module-level judge model; batch_eval reads this global when it runs.\n",
    "model = genai.GenerativeModel(\"gemini-2.5-pro\")\n",
    "\n",
    "def batch_eval(all_pairs, disease_ls, batch_size=20, max_retries=5, sleep_time=2, save_every=25, save_path=\"scores_checkpoint.csv\"):\n",
    "    all_scores = []\n",
    "    batch_counter = 0  # track how many batches processed\n",
    "\n",
    "    for start in range(0, all_pairs.shape[0], batch_size):\n",
    "        end = min(start + batch_size, all_pairs.shape[0])\n",
    "        batch = all_pairs.iloc[start:end]\n",
    "\n",
    "        # Build base prompt\n",
    "# Build base prompt\n",
    "        prompt = (\n",
    "            \"You are evaluating individual sentences from extended statistical abstracts.\\n\\n\"\n",
    "            \"Scoring instructions:\\n\"\n",
    "            \"- Assign each sentence a score between 0 and 1, rounded to two decimal places.\\n\"\n",
    "            \"- Criteria: The sentence should plausibly match the specified topic, remain coherent, and avoid drifting into other topics from the list.\\n\"\n",
    "            \"- Use the full 0–1 range: 1 = perfectly clear, on-topic, and informative; 0 = completely unusable.\\n\"\n",
    "            \"- 0.5 is the threshold: any sentence with a score ≤ 0.5 should be dropped to prevent topic drift.\\n\\n\"\n",
    "            \"Output requirements:\\n\"\n",
    "            \"- Output only the scores, one per line, in the same order as the input cases.\\n\"\n",
    "            \"- Do not include explanations, text, or formatting other than the numeric scores.\\n\\n\"\n",
    "        )\n",
    "\n",
    "        for i, row in batch.iterrows():\n",
    "            sent, topic = row[\"abstract\"], row[\"stat_category\"]\n",
    "            prompt += f\"Case {i}:\\nTopic: {topic}\\nSentence: {sent}\\n\"\n",
    "\n",
    "        # Retry loop\n",
    "        scores = []\n",
    "        for attempt in range(1, max_retries + 1):\n",
    "            try:\n",
    "                response = model.generate_content(prompt)\n",
    "                scores = response.text.strip().splitlines()\n",
    "\n",
    "                if len(scores) == len(batch):\n",
    "                    break  # ✅ got the right number of outputs\n",
    "                else:\n",
    "                    print(\n",
    "                        f\"⚠️ Attempt {attempt}: Expected {len(batch)} scores, got {len(scores)}. Retrying...\"\n",
    "                    )\n",
    "                    time.sleep(sleep_time)\n",
    "\n",
    "            except Exception as e:\n",
    "                print(f\"❌ Error on attempt {attempt}: {e}\")\n",
    "                time.sleep(sleep_time)\n",
    "\n",
    "        if len(scores) != len(batch):\n",
    "            raise ValueError(\n",
    "                f\"Failed after {max_retries} retries: Expected {len(batch)} scores, got {len(scores)}\"\n",
    "            )\n",
    "\n",
    "        all_scores.extend(scores)\n",
    "        batch_counter += 1\n",
    "        print(f\"✅ Processed {end}/{all_pairs.shape[0]}\")\n",
    "\n",
    "        if batch_counter % save_every == 0:\n",
    "            pd.DataFrame({\"score\": all_scores}).to_csv(save_path, index=False)\n",
    "            print(f\"💾 Saved checkpoint after {batch_counter} batches at {save_path}\")\n",
    "\n",
    "    pd.DataFrame({\"score\": all_scores}).to_csv(save_path, index=False)\n",
    "    print(f\"🎉 Finished. Final results saved at {save_path}\")\n",
    "\n",
    "    return all_scores"
   ],
   "id": "b0b8239ec7b05b4d",
   "outputs": [],
   "execution_count": 11
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-09-17T01:58:36.899512Z",
     "start_time": "2025-09-17T01:46:36.917897Z"
    }
   },
   "cell_type": "code",
   "source": "scores = batch_eval(df, topic_ls, batch_size=6, save_path=\"scores_pro_abstracts.csv\", save_every = 10)",
   "id": "5ee5edfb8934d691",
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "✅ Processed 6/300\n",
      "✅ Processed 12/300\n",
      "✅ Processed 18/300\n",
      "✅ Processed 24/300\n",
      "✅ Processed 30/300\n",
      "✅ Processed 36/300\n",
      "✅ Processed 42/300\n",
      "✅ Processed 48/300\n",
      "✅ Processed 54/300\n",
      "✅ Processed 60/300\n",
      "💾 Saved checkpoint after 10 batches at scores_pro_abstracts.csv\n",
      "✅ Processed 66/300\n",
      "✅ Processed 72/300\n",
      "✅ Processed 78/300\n",
      "✅ Processed 84/300\n",
      "✅ Processed 90/300\n",
      "✅ Processed 96/300\n",
      "✅ Processed 102/300\n",
      "✅ Processed 108/300\n",
      "✅ Processed 114/300\n",
      "✅ Processed 120/300\n",
      "💾 Saved checkpoint after 20 batches at scores_pro_abstracts.csv\n",
      "✅ Processed 126/300\n",
      "✅ Processed 132/300\n",
      "✅ Processed 138/300\n",
      "✅ Processed 144/300\n",
      "✅ Processed 150/300\n",
      "✅ Processed 156/300\n",
      "✅ Processed 162/300\n",
      "✅ Processed 168/300\n",
      "✅ Processed 174/300\n",
      "✅ Processed 180/300\n",
      "💾 Saved checkpoint after 30 batches at scores_pro_abstracts.csv\n",
      "✅ Processed 186/300\n",
      "✅ Processed 192/300\n",
      "✅ Processed 198/300\n",
      "✅ Processed 204/300\n",
      "✅ Processed 210/300\n",
      "✅ Processed 216/300\n",
      "✅ Processed 222/300\n",
      "✅ Processed 228/300\n",
      "✅ Processed 234/300\n",
      "✅ Processed 240/300\n",
      "💾 Saved checkpoint after 40 batches at scores_pro_abstracts.csv\n",
      "✅ Processed 246/300\n",
      "✅ Processed 252/300\n",
      "✅ Processed 258/300\n",
      "✅ Processed 264/300\n",
      "✅ Processed 270/300\n",
      "✅ Processed 276/300\n",
      "✅ Processed 282/300\n",
      "✅ Processed 288/300\n",
      "✅ Processed 294/300\n",
      "✅ Processed 300/300\n",
      "💾 Saved checkpoint after 50 batches at scores_pro_abstracts.csv\n",
      "🎉 Finished. Final results saved at scores_pro_abstracts.csv\n"
     ]
    }
   ],
   "execution_count": 12
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-09-17T02:00:36.332400Z",
     "start_time": "2025-09-17T02:00:36.320024Z"
    }
   },
   "cell_type": "code",
   "source": [
    "# Reload the judge scores written by the two batch_eval runs (one \"score\" column per file).\n",
    "scores_flash =  pd.read_csv(\"scores_flash_abstracts.csv\")\n",
    "scores_pro = pd.read_csv(\"scores_pro_abstracts.csv\")"
   ],
   "id": "e1f90433b5c1e5c6",
   "outputs": [],
   "execution_count": 17
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-09-17T02:01:16.000054Z",
     "start_time": "2025-09-17T02:01:15.977277Z"
    }
   },
   "cell_type": "code",
   "source": [
    "import pickle\n",
    "# NOTE: pickle.load can execute arbitrary code; only load files this notebook wrote itself.\n",
    "with open(\"generations_abstract.pickle\", \"rb\") as file:\n",
    "    generations = pickle.load(file)\n",
    "\n",
    "all_rows = []\n",
    "\n",
    "for i in range(data_train.shape[0]):\n",
    "    output_text = data_train[\"stat_category\"].iloc[i]\n",
    "    # process_item returns None when generation failed; skip those items\n",
    "    # instead of crashing on len(None).\n",
    "    sents = generations[i] or []\n",
    "    for j in range(0, len(sents), 2):\n",
    "        # join sentence j and j+1 if available\n",
    "        combined = \" \".join(sents[j:j+2])\n",
    "        all_rows.append([combined, output_text])\n",
    "\n",
    "\n",
    "df = pd.DataFrame(all_rows, columns=[\"abstract\", \"stat_category\"])\n",
    "# Float sentinel: -1.0 marks rows the pro model has not scored. Starting from an\n",
    "# int column would rely on pandas' deprecated upcast when float scores are written.\n",
    "df[\"pro-score\"] = -1.0\n",
    "df[\"flash-score\"] = scores_flash.values\n",
    "# Only the first len(scores_pro) rows were scored by the pro model; address the\n",
    "# column by name instead of a hard-coded row count and position.\n",
    "n_pro = len(scores_pro)\n",
    "df.iloc[:n_pro, df.columns.get_loc(\"pro-score\")] = scores_pro.values.ravel()"
   ],
   "id": "aac71416345ab071",
   "outputs": [],
   "execution_count": 20
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-09-17T02:01:17.859114Z",
     "start_time": "2025-09-17T02:01:17.719807Z"
    }
   },
   "cell_type": "code",
   "source": [
    "import matplotlib.pyplot as plt\n",
    "# Compare the two judges on the first 300 rows (the rows that received a pro score).\n",
    "fig, ax = plt.subplots()\n",
    "ax.scatter(df[\"pro-score\"].iloc[:300], df[\"flash-score\"].iloc[:300])\n",
    "ax.set_xlabel(\"pro-score\")\n",
    "ax.set_ylabel(\"flash-score\")\n",
    "ax.set_title(\"Judge scores: gemini-2.5-pro vs. gemini-2.5-flash\")\n",
    "plt.show()"
   ],
   "id": "57333f28174fb30e",
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<matplotlib.collections.PathCollection at 0x16aef47f0>"
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "text/plain": [
       "<Figure size 640x480 with 1 Axes>"
      ],
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAiMAAAGdCAYAAADAAnMpAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAAuJ0lEQVR4nO3df3SU5Z338c8kIRlQZmygJEEiRLe2hBytCQ0FDa6uRsCNj2d3H9l1EWmBY+ixCBytRnaNcT2bFauy/iC6KPooaLP1Rx/YTVPznBYIQjcFQm0cTvVIILBMzMbUmSCEmMz9/JGTyJAfzExm5prJvF/nzB+5872vfHMlmfuT+577GptlWZYAAAAMSTLdAAAASGyEEQAAYBRhBAAAGEUYAQAARhFGAACAUYQRAABgFGEEAAAYRRgBAABGpZhuIBA+n08nT57UxIkTZbPZTLcDAAACYFmWOjs7NXXqVCUlDX/+Iy7CyMmTJ5WdnW26DQAAEILjx49r2rRpw34+LsLIxIkTJfV9Mw6Hw3A3AAAgEF6vV9nZ2QPH8eHERRjpvzTjcDgIIwAAxJkLvcSCF7ACAACjCCMAAMAowggAADCKMAIAAIwijAAAAKMIIwAAwCjCCAAAMIowAgAAjIqLRc8Q37p7fHpj31Ed6zit6ekTdNfcGUpNIQcD0dLrs9TQ3KG2zi5NmWhXYU66kpN4n69wO9XVo7XVjWr50xld9o3xembxNbrYHp7DbOsXXfrL53bL29Ujhz1F//Hj+cq8xD7qcWc89J+Dth39l1tHPW6wbJZlWcHssHv3bj355JM6cOCA3G633nvvPd1+++0j7rNr1y6tW7dOH330kaZOnaqf/OQnKi0tDfhrer1eOZ1OeTweVmCNM5U1Lm2ub5bvnN+yJJu0sihHZYtyzTUGJIjaJrcqdrjk9nQNbMty2lVekqsFeVkGOxtbbnu+Xh+e8A7aftU0h7bfWzSqsWf+4y915ivfoO3jxyXp8D8tDHncoYJIv3AFkkCP30H/e/rll1/q6quv1vPPPx9QfXNzsxYtWqSioiI1Njbq4Ycf1urVq/XOO+8E+6URZyprXHppt38QkSSfJb20u1mVNS4zjQEJorbJrVVbD/oFEUlq9XRp1daDqm1yG+psbBkuiEjShye8uu35+pDHHi6ISNKZr3ya+Y+/DGnckYJIIJ8Pt6DDyMKFC/X444/rr/7qrwKqf/HFF3XZZZdp48aNmjlzplasWKEf/vCH+ulPfxp0s4gf3T0+ba5vHrFmc32zunuG/iMDMDq9PksVO1wa6tR3/7aKHS71nv/fAoJyqqtn2CDS78MTXp3q6gl67NYvuoYNIv3OfOVT6xddI9acL9CgEc1AEvEL9/v27VNxcbHftltuuUX79+/XV199NeQ+Z8+eldfr9Xsgvryx7+igMyLn81l9dQDCr6G5Y9AZkXNZktyeLjU0d0SvqTFobXVjWOvO9ZfP7Q5rXSyLeBhpbW1VRkaG37aMjAz19PSovb19yH0qKyvldDoHHtnZ2ZFuE2F2rON0WOsABKetM7D/lgOtw9Ba/nQmrHXn8gZ4NiXQulgWlVsazn/r4P7XzA73lsJlZWXyeDwDj+PHj0e8R4TX9PQJYa0DEJwpEwO70yLQOgztsm+MD2vduRwB3okTaF0si3gYyczMVGtrq9+2trY2paSkaNKkSUPuk5aWJofD4fdAfLlr7gxd6M7BJFtfHYDwK8xJV5bTruH+DG3qu6umMCc9mm2NOc8sviasdef6jx/PD2tdLIt4GJk7d67q6ur8tr3//vuaPXu2xo0bF+kvD0NSU5K0sihnxJqVRTmsNwJESHKSTeUlfbfPnx9I+j8uL8llvZFRutieoqumjfwP81XTHCGtN5J5iV3jx438HDl+XFLQ6428tfL7Ya0Lh6CPBKdOndKhQ4d06NAhSX237h46dEgtLS2S+i6xLF26dKC+tLRUx44d07p163T4
8GFt2bJFr7zyiu6///7wfAeIWWWLcnXP/JxBZ0iSbNI981lnBIi0BXlZqlqSr0yn/8Eq02lX1ZJ81hkJk+33Fg0bSEa7zsjhf1o4bCAJdZ2RWHw9UdCLnu3cuVM33HDDoO133323XnvtNS1btkxHjx7Vzp07Bz63a9curV27dmDRswcffJBFzxIIK7ACZrECa3TEywqsv3F9ph+8vv+Cda8una0bcjMuWDeSQI/fQYcREwgjAACEx4r/06D/d/h/Llh308xv6uW7C0f1tSK2AisAAIhfx/8U2OWXQOvCgTACAEACieTtyKEijAAAkEB++r+/G9a6cCCMAACQQFzuwN5iJdC6cCCMAACQQGLx1t74X0MWgCRu3wQCFY9/K+G8bTgW3yqAMAKMAbVNblXscPm9S2uW067yklwWtgLOEY9/K7c9X68PT3x9yeSPrZ3Ke/RXIS+o1v9WASO9q3O03yqAyzRAnKttcmvV1oODnlhaPV1atfWgapvchjoDYks8/q2cH0TO9eEJr257vj7oMZOTbCMGEUlye7qieraIMALEsV6fpYodLg21cmH/toodLvX6Yn5tQyCi4vFv5VRXz7BBpN+HJ7w61dUT1Lh7XBde8CyYunAgjABxrKG5Y8T/cCz1/YfT0NwRvaaAGBSPfytrqxvDWtdvyesNYa0LB8IIEMdi8VXxQCyKx7+Vlj+dCWtdLCOMAHEsFl8VD8SiePxbicWVUiOFMALEsf5XxQ/3MjObov+qeCAWxePfyjOLrwlrXb+tSwN787tA68KBMALEseQkm8pLciVp0JNs/8flJbkxv4YCEGnx+LdysT1FV00b+Z3qr5rmCHq9ketyvxnWunAgjABxbkFelqqW5CvT6X96OdNpV9WS/JhdOwGItnj8W9l+b9GwgSTUdUYk6ei/3Dqqz4ebzbKs2LmPaRher1dOp1Mej0cOx8gpEUhU8biqJGBCPP6thHMF1nPtcf2P310zW5cWhvWMSKDHb8IIAACIiECP31ymAQAARhFGAACAUYQRAABgFGEEAAAYRRgBAABGjf6+IABIIPF4WyjiX3ePT2/sO6pjHac1PX2C7po7Q6kpoz+f8MeTnVr03G71WlKyTar58Xx9e+rEMHQcHG7tBYAA1Ta5VbHD5ffur1lOu8pLcmNywSyMDZU1Lm2ub5bvnKN1kk1aWZSjskW5IY8746H/HPZz4Vr0jFt7ASCMapvcWrX14KC3oW/1dGnV1oOqbXIb6gxjWWWNSy/t9g8ikuSzpJd2N6uyxhXSuCMFkUA+H26EEQC4gF6fpYodLg11Grl/W8UOl3rPP2IAo9Dd49Pm+uYRazbXN6u7xxfUuH882RnWunAgjADABTQ0dww6I3IuS5Lb06WG5o7oNYUx7419RwedETmfz+qrC8ai53aHtS4cCCMAcAFtncMHkVDqgEAc6zgd1rp+vQGewAu0LhwIIwBwAVMm2i9cFEQdEIjp6RPCWtcvOcCbvwKtCwfCCABcQGFOurKcdg333GxT3101hTnp0WwLY9xdc2foQneNJ9n66oJR8+P5Ya0LB8IIAFxAcpJN5SV9t1Cef2zo/7i8JJf1RhBWqSlJWlmUM2LNyqKcoNcbCXQdkWiuN0IYAYAALMjLUtWSfGU6/S/FZDrtqlqSzzojiIiyRbm6Z37OoDMkSTbpnvmhrzNyoXVEwrXOSKBY9AwAgsAKrDAhXldgDfT4TRgBAAARwQqsAAAgLhBGAACAUYQRAABgFGEEAAAYRRgBAABGpZhuAAAAmHGqq0drqxvV8qczuuwb4/XM4mt0sT360YAwAgBAArrt+Xp9eMI78PEfWzuV9+ivdNU0h7bfWxTVXrhMAwBAgjk/iJzrwxNe3fZ8fVT7IYwAAJBATnX1DBtE+n14wqtTXT1R6ogwAgBAQllb3RjWunAgjAAAkEBa/nQmrHXhQBgBACCBXPaN8WGtCwfCCAAACeSZxdeEtS4cCCMAACSQi+0pumra8O+g
K0lXTXNEdb0RwggAAAlm+71FwwYSE+uMsOgZAAAJaPu9RazACgAAzLrYnqLNd3/PdBtcpgEAAGYRRgAAgFGEEQAAYBRhBAAAGEUYAQAARnE3DeJWr89SQ3OH2jq7NGWiXYU56UpOspluCwDCLlLPd7HyPBpSGNm0aZOefPJJud1uzZo1Sxs3blRR0fALpGzbtk0bNmzQJ598IqfTqQULFuinP/2pJk2aFHLjSGy1TW5V7HDJ7eka2JbltKu8JFcL8rIMdgYA4RWp57tYeh4N+jJNdXW11qxZo/Xr16uxsVFFRUVauHChWlpahqzfs2ePli5dquXLl+ujjz7Sz3/+c/3ud7/TihUrRt08ElNtk1urth70+wOSpFZPl1ZtPajaJrehzgAgvCL1fBdrz6NBh5Gnn35ay5cv14oVKzRz5kxt3LhR2dnZqqqqGrL+t7/9rWbMmKHVq1crJydH1113ne655x7t379/1M0j8fT6LFXscMka4nP92yp2uNTrG6oCAOJHpJ7vYvF5NKgw0t3drQMHDqi4uNhve3Fxsfbu3TvkPvPmzdOJEydUU1Mjy7L02Wef6e2339att9467Nc5e/asvF6v3wOQpIbmjkFJ/lyWJLenSw3NHdFrCgAiIFLPd7H4PBpUGGlvb1dvb68yMjL8tmdkZKi1tXXIfebNm6dt27Zp8eLFSk1NVWZmpi655BI999xzw36dyspKOZ3OgUd2dnYwbWIMa+sc/g8olDoAiFWRer6LxefRkG7ttdn8X2lrWdagbf1cLpdWr16tRx55RAcOHFBtba2am5tVWlo67PhlZWXyeDwDj+PHj4fSJsagKRPtYa0DgFgVqee7WHweDepumsmTJys5OXnQWZC2trZBZ0v6VVZW6tprr9UDDzwgSbrqqqt00UUXqaioSI8//riysga/YjctLU1paWnBtIYEUZiTriynXa2eriGvd9okZTr7bk8DgHgWqee7WHweDerMSGpqqgoKClRXV+e3va6uTvPmzRtyn9OnTyspyf/LJCcnS+o7owIEIznJpvKSXEl9fzDn6v+4vCSX9UYAxL1IPd/F4vNo0Jdp1q1bp5dffllbtmzR4cOHtXbtWrW0tAxcdikrK9PSpUsH6ktKSvTuu++qqqpKR44c0QcffKDVq1ersLBQU6dODd93goSxIC9LVUvylen0P4WY6bSrakk+64wAGDMi9XwXa8+jNiuE0xObNm3Shg0b5Ha7lZeXp2eeeUbz58+XJC1btkxHjx7Vzp07B+qfe+45vfjii2pubtYll1yiG2+8UU888YQuvfTSgL6e1+uV0+mUx+ORw+EItl2MUbGyciAARFq8rsAa6PE7pDASbYQRAADiT6DHb94oDwAAGEUYAQAARhFGAACAUYQRAABgVFCLngEAgLEjVu5KJIwAAJCAapvcqtjh8nvTvCynXeUluVFfZ4TLNAAAJJjaJrdWbT046N17Wz1dWrX1oGqb3FHthzACAEAC6fVZqtjhGvJ9afq3VexwqdcXvWXICCMAACSQhuaOQWdEzmVJcnu61NDcEbWeCCMAACSQts7hg0godeFAGAEAIIFMmWi/cFEQdeFAGAEAIIEU5qQry2nXcDfw2tR3V01hTnrUeiKMAACQQJKTbCovyZWkQYGk/+PyktyorjdCGAEAIMEsyMtS1ZJ8ZTr9L8VkOu2qWpIf9XVGWPQMAIAEtCAvSzfnZrICKwAAMCc5yaa5V0wy3QaXaQAAgFmEEQAAYBRhBAAAGEUYAQAARhFGAACAUYQRAABgFGEEAAAYRRgBAABGEUYAAIBRhBEAAGAUYQQAABhFGAEAAEYRRgAAgFGEEQAAYBRhBAAAGEUYAQAARhFGAACAUYQRAABgFGEEAAAYRRgBAABGEUYAAIBRhBEAAGAUYQQAABhFGAEAAEYRRgAAgFGEEQAAYBRhBAAAGEUYAQAARhFGAACAUYQRAABgFGEEAAAYRRgBAABGEUYAAIBRhBEAAGAU
YQQAABhFGAEAAEYRRgAAgFGEEQAAYBRhBAAAGJViugEAAKKp12epoblDbZ1dmjLRrsKcdCUn2Uy3ldBCOjOyadMm5eTkyG63q6CgQPX19SPWnz17VuvXr9f06dOVlpamK664Qlu2bAmpYQAAQlXb5NZ1T/xaf7f5t7rvZ4f0d5t/q+ue+LVqm9ymW0toQZ8Zqa6u1po1a7Rp0yZde+21eumll7Rw4UK5XC5ddtllQ+5zxx136LPPPtMrr7yiP/uzP1NbW5t6enpG3TwAAIGqbXJr1daDss7b3urp0qqtB1W1JF8L8rKM9JbobJZlnf9zGdGcOXOUn5+vqqqqgW0zZ87U7bffrsrKykH1tbW1+tu//VsdOXJE6enpITXp9XrldDrl8XjkcDhCGgMAkLh6fZaue+LXcnu6hvy8TVKm0649D97IJZswCvT4HdRlmu7ubh04cEDFxcV+24uLi7V3794h99m+fbtmz56tDRs26NJLL9WVV16p+++/X2fOnBn265w9e1Zer9fvAQBAqBqaO4YNIpJkSXJ7utTQ3BG9pjAgqMs07e3t6u3tVUZGht/2jIwMtba2DrnPkSNHtGfPHtntdr333ntqb2/Xj370I3V0dAz7upHKykpVVFQE0xoAAMNq6xw+iIRSh/AK6QWsNpv/KSzLsgZt6+fz+WSz2bRt2zYVFhZq0aJFevrpp/Xaa68Ne3akrKxMHo9n4HH8+PFQ2gQAQJI0ZaI9rHUIr6DCyOTJk5WcnDzoLEhbW9ugsyX9srKydOmll8rpdA5smzlzpizL0okTJ4bcJy0tTQ6Hw+8BAECoCnPSleW0a7hXg9gkZTn7bvNF9AUVRlJTU1VQUKC6ujq/7XV1dZo3b96Q+1x77bU6efKkTp06NbDt448/VlJSkqZNmxZCywAABCc5yabyklxJGhRI+j8uL8nlxauGBH2ZZt26dXr55Ze1ZcsWHT58WGvXrlVLS4tKS0sl9V1iWbp06UD9nXfeqUmTJukHP/iBXC6Xdu/erQceeEA//OEPNX78+PB9JwAAjGBBXpaqluQr0+l/KSbTaee2XsOCXmdk8eLF+vzzz/XYY4/J7XYrLy9PNTU1mj59uiTJ7XarpaVloP7iiy9WXV2dfvzjH2v27NmaNGmS7rjjDj3++OPh+y4AAAjAgrws3ZybyQqsMSbodUZMYJ0RAADiT0TWGQEAAAg3wggAADCKMAIAAIwijAAAAKMIIwAAwCjCCAAAMIowAgAAjCKMAAAAowgjAADAKMIIAAAwijACAACMIowAAACjCCMAAMAowggAADCKMAIAAIwijAAAAKMIIwAAwCjCCAAAMIowAgAAjCKMAAAAowgjAADAKMIIAAAwijACAACMIowAAACjCCMAAMAowggAADCKMAIAAIwijAAAAKMIIwAAwCjCCAAAMIowAgAAjCKMAAAAowgjAADAKMIIAAAwijACAACMIowAAACjCCMAAMAowggAADCKMAIAAIwijAAAAKMIIwAAwCjCCAAAMIowAgAAjCKMAAAAowgjAADAKMIIAAAwijACAACMIowAAACjCCMAAMAowggAADCKMAIAAIwijAAAAKMIIwAAwCjCCAAAMIowAgAAjEox3QAAAOfr9VlqaO5QW2eXpky0qzAnXclJtpgeu7vHpzf2HdWxjtOanj5Bd82dodSU8PzPH6mxI9lzMGyWZVnB7rRp0yY9+eSTcrvdmjVrljZu3KiioqIL7vfBBx/o+uuvV15eng4dOhTw1/N6vXI6nfJ4PHI4HMG2CwCII7VNblXscMnt6RrYluW0q7wkVwvysmJy7MoalzbXN8t3zhE1ySatLMpR2aLc0bQcsbEj2XO/QI/fQcef6upqrVmzRuvXr1djY6OKioq0cOFCtbS0jLifx+PR0qVL9Rd/8RfBfkkAQIKobXJr1daDfmFBklo9XVq19aBqm9wxN3ZljUsv7fY/qEuSz5Je2t2syhpXqC1HbOxI9hyKoMPI
008/reXLl2vFihWaOXOmNm7cqOzsbFVVVY243z333KM777xTc+fODblZAMDY1euzVLHDpaFO1/dvq9jhUu/5R1CDY3f3+LS5vnnEms31zeru8QU1biTHjmTPoQoqjHR3d+vAgQMqLi72215cXKy9e/cOu9+rr76qTz/9VOXl5QF9nbNnz8rr9fo9AABjW0Nzx6CzFueyJLk9XWpo7oiZsd/Yd3TQ2YXz+ay+umBFauxI9hyqoMJIe3u7ent7lZGR4bc9IyNDra2tQ+7zySef6KGHHtK2bduUkhLY62UrKyvldDoHHtnZ2cG0CQCIQ22dw4eFUOqiMfaxjtNhrYvG2JHsOVQhvWTWZvN/1bFlWYO2SVJvb6/uvPNOVVRU6Morrwx4/LKyMnk8noHH8ePHQ2kTABBHpky0h7UuGmNPT58Q1rpojB3JnkMVVBiZPHmykpOTB50FaWtrG3S2RJI6Ozu1f/9+3XvvvUpJSVFKSooee+wx/f73v1dKSop+/etfD/l10tLS5HA4/B4AgLGtMCddWU67hrvJ1qa+O18Kc9JjZuy75s7Qhe4KTrL11QUrUmNHsudQBRVGUlNTVVBQoLq6Or/tdXV1mjdv3qB6h8OhP/zhDzp06NDAo7S0VN/+9rd16NAhzZkzZ3TdAwDGjOQkm8pL+m4pPf9Y2f9xeUluSGuCRGrs1JQkrSzKGbFmZVFOSGt3RGrsSPYcqqAXPVu3bp3uuusuzZ49W3PnztW//du/qaWlRaWlpZL6LrH893//t15//XUlJSUpLy/Pb/8pU6bIbrcP2g4AwIK8LFUtyR+0FkhmGNYCidTY/WtyRGLNjkiNHcmeQxHyomcbNmyQ2+1WXl6ennnmGc2fP1+StGzZMh09elQ7d+4cct9HH31Uv/jFL1j0DAAwLFZgjc7YkV6BNdDjd0hhJNoIIwAAxJ+IrcAKAAAQToQRAABgFGEEAAAYRRgBAABGEUYAAIBRQa8zAgAAxoZI3kIdDMIIAAAJqLbJPWgBuKwwLC4XCi7TAACQYGqb3Fq19aBfEJGkVk+XVm09qNomd1T7IYwAAJBAen2WKna4NNSKp/3bKna41OuL3pqohBEAABJIQ3PHoDMi57IkuT1damjuiFpPhBEAABJIW+fwQSSUunAgjAAAkECmTLSHtS4cCCMAACSQwpx0ZTntGu4GXpv67qopzEmPWk+EEQAAEkhykk3lJbmSNCiQ9H9cXpIb1fVGCCMAACSYBXlZqlqSr0yn/6WYTKddVUvyo77OCIueAQCQgBbkZenm3ExWYAUAAOYkJ9k094pJptvgMg0AADCLMAIAAIwijAAAAKMIIwAAwCjCCAAAMCph76bp9VkxcTsTAGBs6O7x6Y19R3Ws47Smp0/QXXNnKDUlPP/zhzJ2IMe5SPYcDJtlWdF7j+AQeb1eOZ1OeTweORyOUY9X2+RWxQ6X37sWZjntKi/JjfpCLwCA+FdZ49Lm+mb5zjmiJtmklUU5KluUG/WxAznORbLnfoEevxMujNQ2ubVq60Gd/033Z0UTK88BAOJXZY1LL+1uHvbz98wP/eAeytiBHOcaW/4UsZ7PFejxO6FeM9Lrs1SxwzXoByRpYFvFDpd6fTGfzwAAMaC7x6fN9cMf1CVpc32zunt8URk7kONc+f9tiljPoUqoMNLQ3OF3yup8liS3p0sNzR3RawoAELfe2HdUF/r/1Wf11UVj7ECOc591dkes51AlVBhp6xz+BxRKHQAgsR3rOB3WutGOHc7jVyg9hyqhwsiUifYLFwVRBwBIbNPTJ4S1brRjh/P4FUrPoUqoMFKYk64sp13D3cBrU9+rjQtz0qPZFgAgTt01d4YutCpEkq2vLhpjB3Kcy5iYGrGeQ5VQYSQ5yabykr5XB5//c+j/uLwkl/VGAAABSU1J0sqinBFrVhblhLR2RyhjB3Kcq/hfeRHrOVQJFUYkaUFelqqW5CvT6X8qK9Np57ZeAEDQyhbl6p75OYPONiTZ
Rn+LbChjB3Kci2TPoUi4dUb6sQIrACCcWIF1MBY9AwAARrHoGQAAiAuEEQAAYBRhBAAAGEUYAQAARhFGAACAUYQRAABgFGEEAAAYRRgBAABGEUYAAIBRhBEAAGAUYQQAABhFGAEAAEYRRgAAgFGEEQAAYBRhBAAAGEUYAQAARhFGAACAUYQRAABgFGEEAAAYRRgBAABGEUYAAIBRKaYbQOB6fZYamjvU1tmlKRPtKsxJV3KSzXRbFxSvfccb5jk6mOf4x8/wa7EyFyGFkU2bNunJJ5+U2+3WrFmztHHjRhUVFQ1Z++6776qqqkqHDh3S2bNnNWvWLD366KO65ZZbRtV4oqltcqtih0tuT9fAtiynXeUluVqQl2Wws5HFa9/xhnmODuY5/vEz/FoszUXQl2mqq6u1Zs0arV+/Xo2NjSoqKtLChQvV0tIyZP3u3bt18803q6amRgcOHNANN9ygkpISNTY2jrr5RFHb5NaqrQf9fmEkqdXTpVVbD6q2yW2os5HFa9/xhnmODuY5/vEz/FqszYXNsiwrmB3mzJmj/Px8VVVVDWybOXOmbr/9dlVWVgY0xqxZs7R48WI98sgjAdV7vV45nU55PB45HI5g2o17vT5L1z3x60G/MP1skjKddu158MaYOs0Yr33HG+Y5Opjn+MfP8GvRnItAj99BnRnp7u7WgQMHVFxc7Le9uLhYe/fuDWgMn8+nzs5OpaenD1tz9uxZeb1ev0eiamjuGPYXRpIsSW5PlxqaO6LXVADite94wzxHB/Mc//gZfi0W5yKoMNLe3q7e3l5lZGT4bc/IyFBra2tAYzz11FP68ssvdccddwxbU1lZKafTOfDIzs4Ops0xpa1z+F+YUOqiJV77jjfMc3Qwz/GPn+HXYnEuQrq112bzP21jWdagbUN566239Oijj6q6ulpTpkwZtq6srEwej2fgcfz48VDaHBOmTLSHtS5a4rXveMM8RwfzHP/4GX4tFuciqDAyefJkJScnDzoL0tbWNuhsyfmqq6u1fPly/fu//7tuuummEWvT0tLkcDj8HomqMCddWU67hot6NvW9+rkwZ/jLXibEa9/xhnmODuY5/vEz/FoszkVQYSQ1NVUFBQWqq6vz215XV6d58+YNu99bb72lZcuW6c0339Stt94aWqcJKjnJpvKSXEka9IvT/3F5SW7MveAqXvuON8xzdDDP8Y+f4ddicS6Cvkyzbt06vfzyy9qyZYsOHz6stWvXqqWlRaWlpZL6LrEsXbp0oP6tt97S0qVL9dRTT+n73/++Wltb1draKo/HE77vYoxbkJelqiX5ynT6nzLLdNpVtSQ/Zu+Nj9e+4w3zHB3Mc/zjZ/i1WJuLoG/tlfoWPduwYYPcbrfy8vL0zDPPaP78+ZKkZcuW6ejRo9q5c6ck6c///M+1a9euQWPcfffdeu211wL6eol8a++5YmWlvGDFa9/xhnmODuY5/vEz/Fqk5yLQ43dIYSTaCCMAAMSfiKwzAgAAEG6EEQAAYBRhBAAAGEUYAQAARhFGAACAUYQRAABgFGEEAAAYRRgBAABGEUYAAIBRhBEAAGAUYQQAABhFGAEAAEYRRgAAgFGEEQAAYBRhBAAAGEUYAQAARhFGAACAUYQRAABgFGEEAAAYRRgBAABGEUYAAIBRhBEAAGBUiukGEBt6fZYamjvU1tmlKRPtKsxJV3KSLebHBgDEP8IIVNvkVsUOl9yeroFtWU67yktytSAvK2bHBgCMDVymSXC1TW6t2nrQLyxIUqunS6u2HlRtkzsmxwYAjB2EkQTW67NUscMla4jP9W+r2OFSr2+oCnNjAwDGFsJIAmto7hh01uJcliS3p0sNzR0xNTYAYGwhjCSwts7hw0IoddEaGwAwthBGEtiUifaw1kVrbADA2EIYSWCFOenKcto13E22NvXd+VKYkx5TYwMAxhbCSAJLTrKpvCRXkgaFhv6Py0tyQ1oTJJJjAwDGFsJIgluQl6WqJfnKdPpf
Lsl02lW1JH9Ua4FEcmwAwNhhsywr5u+t9Hq9cjqd8ng8cjgcptsZk1iBFQAQboEev1mBFZL6LqvMvWJS3I0NAIh/XKYBAABGEUYAAIBRhBEAAGAUYQQAABhFGAEAAEYRRgAAgFGEEQAAYBRhBAAAGEUYAQAARhFGAACAUYQRAABgFGEEAAAYRRgBAABGEUYAAIBRhBEAAGAUYQQAABhFGAEAAEYRRgAAgFGEEQAAYBRhBAAAGEUYAQAARqWYbsCUM929+ucal45+flozJk3Qw4tyNT412XRbAAAknJDOjGzatEk5OTmy2+0qKChQfX39iPW7du1SQUGB7Ha7Lr/8cr344oshNRsuK1//nWY+Uqs3ftui+k/a9cZvWzTzkVqtfP13RvsCACARBR1GqqurtWbNGq1fv16NjY0qKirSwoUL1dLSMmR9c3OzFi1apKKiIjU2Nurhhx/W6tWr9c4774y6+VCsfP13qnO1Dfm5OlcbgQQAgCizWZZlBbPDnDlzlJ+fr6qqqoFtM2fO1O23367KyspB9Q8++KC2b9+uw4cPD2wrLS3V73//e+3bty+gr+n1euV0OuXxeORwOIJp18+Z7l7NfKT2gnWHH1vAJRsAAEYp0ON3UGdGuru7deDAARUXF/ttLy4u1t69e4fcZ9++fYPqb7nlFu3fv19fffXVkPucPXtWXq/X7xEO/1zjCmsdAAAYvaDCSHt7u3p7e5WRkeG3PSMjQ62trUPu09raOmR9T0+P2tvbh9ynsrJSTqdz4JGdnR1Mm8M6+vnpsNYBAIDRC+kFrDabze9jy7IGbbtQ/VDb+5WVlcnj8Qw8jh8/Hkqbg8yYNCGsdQAAYPSCCiOTJ09WcnLyoLMgbW1tg85+9MvMzByyPiUlRZMmTRpyn7S0NDkcDr9HODy8KDesdQAAYPSCCiOpqakqKChQXV2d3/a6ujrNmzdvyH3mzp07qP7999/X7NmzNW7cuCDbHZ3xqcm6OXfKiDU3507hxasAAERR0Jdp1q1bp5dffllbtmzR4cOHtXbtWrW0tKi0tFRS3yWWpUuXDtSXlpbq2LFjWrdunQ4fPqwtW7bolVde0f333x++7yIIm5d+b9hAcnPuFG1e+r0odwQAQGILegXWxYsX6/PPP9djjz0mt9utvLw81dTUaPr06ZIkt9vtt+ZITk6OampqtHbtWr3wwguaOnWqnn32Wf31X/91+L6LIG1e+j1WYAUAIEYEvc6ICeFaZwQAAERPRNYZAQAACDfCCAAAMIowAgAAjCKMAAAAowgjAADAKMIIAAAwijACAACMIowAAACjCCMAAMCooJeDN6F/kViv12u4EwAAEKj+4/aFFnuPizDS2dkpScrOzjbcCQAACFZnZ6ecTuewn4+L96bx+Xw6efKkJk6cKJvNFrZxvV6vsrOzdfz4cd7zJsKY6+hgnqODeY4O5jk6IjnPlmWps7NTU6dOVVLS8K8MiYszI0lJSZo2bVrExnc4HPyiRwlzHR3Mc3Qwz9HBPEdHpOZ5pDMi/XgBKwAAMIowAgAAjEroMJKWlqby8nKlpaWZbmXMY66jg3mODuY5Opjn6IiFeY6LF7ACAICxK6HPjAAAAPMIIwAAwCjCCAAAMIowAgAAjBrzYWTTpk3KycmR3W5XQUGB6uvrR6zftWuXCgoKZLfbdfnll+vFF1+MUqfxLZh5fvfdd3XzzTfrm9/8phwOh+bOnatf/epXUew2vgX7O93vgw8+UEpKir773e9GtsExIth5Pnv2rNavX6/p06crLS1NV1xxhbZs2RKlbuNXsPO8bds2XX311ZowYYKysrL0gx/8QJ9//nmUuo1Pu3fvVklJiaZOnSqbzaZf/OIXF9wn6sdCawz72c9+Zo0bN87avHmz5XK5rPvuu8+66KKLrGPHjg1Zf+TIEWvChAnWfffdZ7lcLmvz5s3WuHHjrLfffjvKnceXYOf5vvvus5544gmroaHB+vjjj62ysjJr3Lhx1sGD
B6PcefwJdq77ffHFF9bll19uFRcXW1dffXV0mo1joczzbbfdZs2ZM8eqq6uzmpubrf/6r/+yPvjggyh2HX+Cnef6+norKSnJ+td//VfryJEjVn19vTVr1izr9ttvj3Ln8aWmpsZav3699c4771iSrPfee2/EehPHwjEdRgoLC63S0lK/bd/5zneshx56aMj6n/zkJ9Z3vvMdv2333HOP9f3vfz9iPY4Fwc7zUHJzc62KiopwtzbmhDrXixcvtv7hH/7BKi8vJ4wEINh5/uUvf2k5nU7r888/j0Z7Y0aw8/zkk09al19+ud+2Z5991po2bVrEehxrAgkjJo6FY/YyTXd3tw4cOKDi4mK/7cXFxdq7d++Q++zbt29Q/S233KL9+/frq6++iliv8SyUeT6fz+dTZ2en0tPTI9HimBHqXL/66qv69NNPVV5eHukWx4RQ5nn79u2aPXu2NmzYoEsvvVRXXnml7r//fp05cyYaLcelUOZ53rx5OnHihGpqamRZlj777DO9/fbbuvXWW6PRcsIwcSyMizfKC0V7e7t6e3uVkZHhtz0jI0Otra1D7tPa2jpkfU9Pj9rb25WVlRWxfuNVKPN8vqeeekpffvml7rjjjki0OGaEMteffPKJHnroIdXX1yslZcz+uYdVKPN85MgR7dmzR3a7Xe+9957a29v1ox/9SB0dHbxuZBihzPO8efO0bds2LV68WF1dXerp6dFtt92m5557LhotJwwTx8Ixe2akn81m8/vYsqxB2y5UP9R2+At2nvu99dZbevTRR1VdXa0pU6ZEqr0xJdC57u3t1Z133qmKigpdeeWV0WpvzAjmd9rn88lms2nbtm0qLCzUokWL9PTTT+u1117j7MgFBDPPLpdLq1ev1iOPPKIDBw6otrZWzc3NKi0tjUarCSXax8Ix+6/S5MmTlZycPChht7W1DUp8/TIzM4esT0lJ0aRJkyLWazwLZZ77VVdXa/ny5fr5z3+um266KZJtjgnBznVnZ6f279+vxsZG3XvvvZL6DpqWZSklJUXvv/++brzxxqj0Hk9C+Z3OysrSpZde6vdW6TNnzpRlWTpx4oS+9a1vRbTneBTKPFdWVuraa6/VAw88IEm66qqrdNFFF6moqEiPP/44Z6/DxMSxcMyeGUlNTVVBQYHq6ur8ttfV1WnevHlD7jN37txB9e+//75mz56tcePGRazXeBbKPEt9Z0SWLVumN998k+u9AQp2rh0Oh/7whz/o0KFDA4/S0lJ9+9vf1qFDhzRnzpxotR5XQvmdvvbaa3Xy5EmdOnVqYNvHH3+spKQkTZs2LaL9xqtQ5vn06dNKSvI/bCUnJ0v6+j93jJ6RY2HEXhobA/pvG3vllVcsl8tlrVmzxrrooouso0ePWpZlWQ899JB11113DdT33860du1ay+VyWa+88gq39gYg2Hl+8803rZSUFOuFF16w3G73wOOLL74w9S3EjWDn+nzcTROYYOe5s7PTmjZtmvU3f/M31kcffWTt2rXL+ta3vmWtWLHC1LcQF4Kd51dffdVKSUmxNm3aZH366afWnj17rNmzZ1uFhYWmvoW40NnZaTU2NlqNjY2WJOvpp5+2GhsbB26hjoVj4ZgOI5ZlWS+88II1ffp0KzU11crPz7d27do18Lm7777buv766/3qd+7caV1zzTVWamqqNWPGDKuqqirKHcenYOb5+uuvtyQNetx9993RbzwOBfs7fS7CSOCCnefDhw9bN910kzV+/Hhr2rRp1rp166zTp09Huev4E+w8P/vss1Zubq41fvx4Kysry/r7v/9768SJE1HuOr785je/GfE5NxaOhTbL4twWAAAwZ8y+ZgQAAMQHwggAADCKMAIAAIwijAAAAKMIIwAAwCjCCAAAMIowAgAAjCKMAAAAowgjAADAKMIIAAAwijACAACMIowAAACj/j8exB4pMAUT6wAAAABJRU5ErkJggg=="
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "execution_count": 21
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-09-17T03:21:27.830695Z",
     "start_time": "2025-09-17T03:21:27.704448Z"
    }
   },
   "cell_type": "code",
   "source": [
    "# Save df to data_extension_abstracts.csv; index=False leaves the row index out of the file.\n",
    "df.to_csv(path_or_buf=\"data_extension_abstracts.csv\", index=False)"
   ],
   "id": "ac8d77320512d130",
   "outputs": [],
   "execution_count": 22
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
