{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "##### Synthetic Paraphrased and Negated Caption Generation using 2 LLMs (Phi-4 & Mistral)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import ollama"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the CSV that holds the original captions (e.g. the CC-Neg dataset).\n",
    "# Later cells read its `caption` column; the file should also contain a filename column.\n",
    "# The actual file name may differ.\n",
    "# NOTE(review): the final cell writes to this same path, so re-running the notebook\n",
    "# overwrites this input file -- confirm that is intended.\n",
    "df = pd.read_csv('ccneg_paraphrased.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "def _is_affirmative(reply):\n",
    "    \"\"\"Return True when the first standalone 'yes'/'no' word in `reply` is 'yes'.\n",
    "\n",
    "    Matching ignores case and punctuation, so 'yes.', 'YES' and 'Yes, it does'\n",
    "    count as approval, while 'No. Yes would be wrong' or words that merely\n",
    "    contain the letters (e.g. 'Yesterday') do not. A reply with no yes/no\n",
    "    word at all is treated as a rejection.\n",
    "    \"\"\"\n",
    "    # Turn every non-letter into a space so punctuation never sticks to the verdict word.\n",
    "    words = \"\".join(ch if ch.isalpha() else \" \" for ch in reply.lower()).split()\n",
    "    verdict = next((word for word in words if word in (\"yes\", \"no\")), None)\n",
    "    return verdict == \"yes\"\n",
    "\n",
    "\n",
    "def _ask_mistral_yes_no(prompt):\n",
    "    \"\"\"Send `prompt` to Mistral through ollama and read the reply as a yes/no verdict.\"\"\"\n",
    "    response = ollama.chat(model=\"mistral\", \n",
    "                           messages=[{\"role\": \"user\", \"content\": prompt}], \n",
    "                           options={\"temperature\": 0.1})\n",
    "    return _is_affirmative(response[\"message\"][\"content\"])\n",
    "\n",
    "\n",
    "def verify_with_mistral(original, paraphrased, false_sentence):\n",
    "    \"\"\"Use Mistral to verify the generated sentences.\n",
    "\n",
    "    Args:\n",
    "        original: The source caption.\n",
    "        paraphrased: Candidate sentence that should keep the meaning of `original`.\n",
    "        false_sentence: Candidate sentence that should contradict `original`.\n",
    "\n",
    "    Returns:\n",
    "        Tuple `(is_paraphrase_valid, is_false_valid)` of booleans. Each one is True\n",
    "        only when Mistral's reply to the corresponding question reads as 'yes'.\n",
    "    \"\"\"\n",
    "    # Verify paraphrased sentence (should be similar)\n",
    "    prompt_verify_paraphrase = f\"\"\"\n",
    "    The original sentence is: \"{original}\"\n",
    "    The generated paraphrase is: \"{paraphrased}\"\n",
    "    Does the paraphrase accurately retain the meaning of the original? Respond with \"Yes\" or \"No\".\n",
    "    \"\"\"\n",
    "    is_paraphrase_valid = _ask_mistral_yes_no(prompt_verify_paraphrase)\n",
    "\n",
    "    # Verify false sentence (should contradict)\n",
    "    prompt_verify_false = f\"\"\"\n",
    "    The original sentence is: \"{original}\"\n",
    "    The generated false sentence is: \"{false_sentence}\"\n",
    "    Does the false sentence contradict or misrepresent the original? Respond with \"Yes\" or \"No\".\n",
    "    \"\"\"\n",
    "    is_false_valid = _ask_mistral_yes_no(prompt_verify_false)\n",
    "\n",
    "    return is_paraphrase_valid, is_false_valid"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "def generate_and_verify(text):\n",
    "    \"\"\"Produce a paraphrase and a false variant of `text` with phi-4, checked by Mistral.\n",
    "\n",
    "    Each phi-4 output that Mistral rejects is replaced by Mistral's own answer to\n",
    "    the same prompt. The returned flags are Mistral's verdicts on the phi-4\n",
    "    outputs, so a False flag marks a sentence that was regenerated by Mistral.\n",
    "\n",
    "    Returns:\n",
    "        (paraphrased_sentence, false_sentence, is_paraphrase_valid, is_false_valid)\n",
    "    \"\"\"\n",
    "    def ask(model_name, prompt):\n",
    "        # Single-turn chat; hand back only the reply text.\n",
    "        reply = ollama.chat(model=model_name,\n",
    "                            messages=[{\"role\": \"user\", \"content\": prompt}],\n",
    "                            options={\"temperature\": 0.1})\n",
    "        return reply[\"message\"][\"content\"]\n",
    "\n",
    "    rephrase_prompt = f\"Rephrase the following sentence with the same meaning:\\n{text}\"\n",
    "    falsify_prompt = f\"Generate a completely false version of this sentence:\\n{text}\"\n",
    "\n",
    "    # First pass: phi-4 writes both candidates.\n",
    "    similar = ask(\"phi4\", rephrase_prompt)\n",
    "    opposite = ask(\"phi4\", falsify_prompt)\n",
    "\n",
    "    # Mistral judges both candidates.\n",
    "    similar_ok, opposite_ok = verify_with_mistral(text, similar, opposite)\n",
    "\n",
    "    # Second pass: Mistral rewrites whatever it rejected.\n",
    "    if not similar_ok:\n",
    "        similar = ask(\"mistral\", rephrase_prompt)\n",
    "    if not opposite_ok:\n",
    "        opposite = ask(\"mistral\", falsify_prompt)\n",
    "\n",
    "    return similar, opposite, similar_ok, opposite_ok"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Apply the function to generate and verify new sentences\n",
    "generated_columns = [\n",
    "    \"paraphrased\",\n",
    "    \"negation\",\n",
    "    \"cap_Mistral_Verified_Similar\",\n",
    "    \"cap_Mistral_Verified_False\",\n",
    "]\n",
    "\n",
    "\n",
    "def _caption_to_row(caption):\n",
    "    \"\"\"Wrap the 4-tuple from generate_and_verify in a Series so apply() expands it into columns.\"\"\"\n",
    "    return pd.Series(generate_and_verify(caption))\n",
    "\n",
    "\n",
    "df[generated_columns] = df[\"caption\"].apply(_caption_to_row)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Save the updated frame: the new caption columns plus the Mistral verification flags.\n",
    "# This is the file used for SemCLIP training.\n",
    "df.to_csv('ccneg_paraphrased.csv', index=False)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "multimodal",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.14"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
