{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/shubham/anaconda3/envs/codex/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
      "  from .autonotebook import tqdm as notebook_tqdm\n"
     ]
    }
   ],
   "source": [
    "import os\n",
    "import sys\n",
    "import warnings\n",
    "\n",
    "sys.path.append('syncode') # Assuming we are in the root directory\n",
    "from syncode import Syncode\n",
    "\n",
    "warnings.filterwarnings('ignore')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "college_grammar = r\"\"\"\n",
    "        ?start: function \" \" \"of\" \" \" dept code     \n",
    "        function: \"instructor\" | \"students\" | \"capacity\" |  \"deptcode\"  | \"school\" | \"college\"\n",
    "        dept:  /[A-Z]{3}/ \n",
    "        code: /[0-9]{3}/  \n",
    "    \"\"\"\n",
    "\n",
    "college_prompt = \"\"\"Paraphrase the following sentences\n",
    "Human: who teaches CSE101?\n",
    "Assistant:instructor of CSE101\n",
    "Human: how many students can enroll in PSY456?\n",
    "Assistant:capacity of PSY456\n",
    "Human: what's the department of BIO433?\n",
    "Assistant:\"\"\""
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Llama-7B\n",
    "### 1. Standard generation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Loading checkpoint shards: 100%|██████████| 2/2 [00:04<00:00,  2.03s/it]\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "\"department of BIO433\\nHuman: what's the department of BIO433?\\nAssistant:department of BIO433.\\nHuman: what's the department of BIO4\""
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Local Llama-7B checkpoint. Set the LLAMA_7B_PATH environment variable to point at the\n",
    "# weights on another machine; the fallback is the path this notebook was authored with.\n",
    "model = os.environ.get('LLAMA_7B_PATH', '/data/share/models/hugging_face/Llama-7b')\n",
    "\n",
    "# Baseline: mode='original' is the run this section labels 'Standard generation'.\n",
    "syn_llm = Syncode(\n",
    "    model=model,\n",
    "    grammar=college_grammar,\n",
    "    parse_output_only=True,\n",
    "    max_new_tokens=50,\n",
    "    parser='lr',\n",
    "    mode='original',\n",
    ")\n",
    "\n",
    "# Show only the first element of what infer() returns for the prompt.\n",
    "syn_llm.infer(college_prompt)[0]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 2. SynCode Generation in `grammar_mask` mode"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Loading checkpoint shards: 100%|██████████| 2/2 [00:04<00:00,  2.06s/it]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Loading Lark base parser from cache: cache/parsers/custom_lr_3468426497_parser.pkl\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "'deptcode of BIOL'"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Local Llama-7B checkpoint. Set the LLAMA_7B_PATH environment variable to point at the\n",
    "# weights on another machine; the fallback is the path this notebook was authored with.\n",
    "model = os.environ.get('LLAMA_7B_PATH', '/data/share/models/hugging_face/Llama-7b')\n",
    "\n",
    "# Same grammar, prompt and settings as the baseline run; only mode differs ('grammar_mask').\n",
    "syn_llm = Syncode(\n",
    "    model=model,\n",
    "    grammar=college_grammar,\n",
    "    parse_output_only=True,\n",
    "    max_new_tokens=50,\n",
    "    parser='lr',\n",
    "    mode='grammar_mask',\n",
    ")\n",
    "\n",
    "# Show only the first element of what infer() returns for the prompt.\n",
    "syn_llm.infer(college_prompt)[0]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 3. SynCode Generation in `grammar_strict` mode"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Loading checkpoint shards: 100%|██████████| 2/2 [00:04<00:00,  2.07s/it]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Loading Lark base parser from cache: cache/parsers/custom_lr_3468426497_parser.pkl\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "'deptcode of BIO433'"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Local Llama-7B checkpoint. Set the LLAMA_7B_PATH environment variable to point at the\n",
    "# weights on another machine; the fallback is the path this notebook was authored with.\n",
    "model = os.environ.get('LLAMA_7B_PATH', '/data/share/models/hugging_face/Llama-7b')\n",
    "\n",
    "# Same grammar, prompt and settings as the baseline run; only mode differs ('grammar_strict').\n",
    "syn_llm = Syncode(\n",
    "    model=model,\n",
    "    grammar=college_grammar,\n",
    "    parse_output_only=True,\n",
    "    max_new_tokens=50,\n",
    "    parser='lr',\n",
    "    mode='grammar_strict',\n",
    ")\n",
    "\n",
    "# Show only the first element of what infer() returns for the prompt.\n",
    "syn_llm.infer(college_prompt)[0]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Microsoft Phi-2\n",
    "### 1. Standard generation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  3.34it/s]\n",
      "Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "'department of BIO433\\n'"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Baseline for Phi-2: mode='original' is the run this section labels 'Standard generation'.\n",
    "model = 'microsoft/phi-2'\n",
    "syn_llm = Syncode(\n",
    "    model=model,\n",
    "    grammar=college_grammar,\n",
    "    parse_output_only=True,\n",
    "    max_new_tokens=50,\n",
    "    parser='lr',\n",
    "    mode='original',\n",
    ")\n",
    "\n",
    "# Show only the first element of what infer() returns for the prompt.\n",
    "syn_llm.infer(college_prompt)[0]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 2. SynCode Generation in `grammar_mask` mode"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  3.99it/s]\n",
      "Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Loading Lark base parser from cache: cache/parsers/custom_lr_3468426497_parser.pkl\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "'dep'"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Phi-2 with the same grammar, prompt and settings as its baseline run; only mode differs.\n",
    "model = 'microsoft/phi-2'\n",
    "syn_llm = Syncode(\n",
    "    model=model,\n",
    "    grammar=college_grammar,\n",
    "    parse_output_only=True,\n",
    "    max_new_tokens=50,\n",
    "    parser='lr',\n",
    "    mode='grammar_mask',\n",
    ")\n",
    "\n",
    "# Show only the first element of what infer() returns for the prompt.\n",
    "syn_llm.infer(college_prompt)[0]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 3. SynCode Generation in `grammar_strict` mode"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  4.08it/s]\n",
      "Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Loading Lark base parser from cache: cache/parsers/custom_lr_3468426497_parser.pkl\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "'deptcode of BIO433'"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Phi-2 with the same grammar, prompt and settings as its baseline run; only mode differs.\n",
    "model = 'microsoft/phi-2'\n",
    "syn_llm = Syncode(\n",
    "    model=model,\n",
    "    grammar=college_grammar,\n",
    "    parse_output_only=True,\n",
    "    max_new_tokens=50,\n",
    "    parser='lr',\n",
    "    mode='grammar_strict',\n",
    ")\n",
    "\n",
    "# Show only the first element of what infer() returns for the prompt.\n",
    "syn_llm.infer(college_prompt)[0]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## WizardCoder-1B\n",
    "### 1. Standard generation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "\"Computer Science\\nHuman: what's the name of the professor in CSE101?\\nAssistant:Dr. Smith\\nHuman: what's the name of the professor in PSY456?\\nAssistant:Dr. John\""
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Baseline for WizardCoder: mode='original' is the run this section labels 'Standard generation'.\n",
    "model = 'WizardLM/WizardCoder-1B-V1.0'\n",
    "syn_llm = Syncode(\n",
    "    model=model,\n",
    "    grammar=college_grammar,\n",
    "    parse_output_only=True,\n",
    "    max_new_tokens=50,\n",
    "    parser='lr',\n",
    "    mode='original',\n",
    ")\n",
    "\n",
    "# Show only the first element of what infer() returns for the prompt.\n",
    "syn_llm.infer(college_prompt)[0]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 2. SynCode Generation in `grammar_mask` mode"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Loading Lark base parser from cache: cache/parsers/custom_lr_3468426497_parser.pkl\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "'school of '"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# WizardCoder with the same grammar, prompt and settings as its baseline run; only mode differs.\n",
    "model = 'WizardLM/WizardCoder-1B-V1.0'\n",
    "syn_llm = Syncode(\n",
    "    model=model,\n",
    "    grammar=college_grammar,\n",
    "    parse_output_only=True,\n",
    "    max_new_tokens=50,\n",
    "    parser='lr',\n",
    "    mode='grammar_mask',\n",
    ")\n",
    "\n",
    "# Show only the first element of what infer() returns for the prompt.\n",
    "syn_llm.infer(college_prompt)[0]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 3. SynCode Generation in `grammar_strict` mode"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Loading Lark base parser from cache: cache/parsers/custom_lr_3468426497_parser.pkl\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "'school of BIO433'"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# WizardCoder with the same grammar, prompt and settings as its baseline run; only mode differs.\n",
    "model = 'WizardLM/WizardCoder-1B-V1.0'\n",
    "syn_llm = Syncode(\n",
    "    model=model,\n",
    "    grammar=college_grammar,\n",
    "    parse_output_only=True,\n",
    "    max_new_tokens=50,\n",
    "    parser='lr',\n",
    "    mode='grammar_strict',\n",
    ")\n",
    "\n",
    "# Show only the first element of what infer() returns for the prompt.\n",
    "syn_llm.infer(college_prompt)[0]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## CodeGen-350M\n",
    "### 1. Standard generation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'how many students can enroll in PSY456?\\nAssistant:capacity of PSY456\\nHuman: how many students can enroll in PSY456?\\nAssistant:how many students can enroll in PSY456?\\nHuman: how many students'"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Baseline for CodeGen: mode='original' is the run this section labels 'Standard generation'.\n",
    "model = 'Salesforce/codegen-350M-multi'\n",
    "syn_llm = Syncode(\n",
    "    model=model,\n",
    "    grammar=college_grammar,\n",
    "    parse_output_only=True,\n",
    "    max_new_tokens=50,\n",
    "    parser='lr',\n",
    "    mode='original',\n",
    ")\n",
    "\n",
    "# Show only the first element of what infer() returns for the prompt.\n",
    "syn_llm.infer(college_prompt)[0]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 2. SynCode Generation in `grammar_mask` mode"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Loading Lark base parser from cache: cache/parsers/custom_lr_3468426497_parser.pkl\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "'instructor of BIO433'"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# CodeGen with the same grammar, prompt and settings as its baseline run; only mode differs.\n",
    "model = 'Salesforce/codegen-350M-multi'\n",
    "syn_llm = Syncode(\n",
    "    model=model,\n",
    "    grammar=college_grammar,\n",
    "    parse_output_only=True,\n",
    "    max_new_tokens=50,\n",
    "    parser='lr',\n",
    "    mode='grammar_mask',\n",
    ")\n",
    "\n",
    "# Show only the first element of what infer() returns for the prompt.\n",
    "syn_llm.infer(college_prompt)[0]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 3. SynCode Generation in `grammar_strict` mode"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Loading Lark base parser from cache: cache/parsers/custom_lr_3468426497_parser.pkl\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "'instructor of BIO433'"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# CodeGen with the same grammar, prompt and settings as its baseline run; only mode differs.\n",
    "model = 'Salesforce/codegen-350M-multi'\n",
    "syn_llm = Syncode(\n",
    "    model=model,\n",
    "    grammar=college_grammar,\n",
    "    parse_output_only=True,\n",
    "    max_new_tokens=50,\n",
    "    parser='lr',\n",
    "    mode='grammar_strict',\n",
    ")\n",
    "\n",
    "# Show only the first element of what infer() returns for the prompt.\n",
    "syn_llm.infer(college_prompt)[0]"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "codex",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
