{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/Users/yash/Desktop/Lacoco/len-gen/.venv/lib/python3.12/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
      "  from .autonotebook import tqdm as notebook_tqdm\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO 05-07 19:02:47 [__init__.py:239] Automatically detected platform cpu.\n"
     ]
    }
   ],
   "source": [
    "from __future__ import annotations\n",
    "\n",
    "import tqdm\n",
    "import random\n",
    "import argparse\n",
    "import jsonlines\n",
    "from pathlib import Path\n",
    "\n",
    "from typing import Dict, List, Any\n",
    "from banks.registries import DirectoryPromptRegistry\n",
    "\n",
    "from vllm import LLM, SamplingParams\n",
    "from transformers import AutoTokenizer\n",
    "\n",
    "\n",
    "# Short model alias -> local checkpoint path.\n",
    "MODELS: Dict[str, str] = {\n",
    "    'llama3_8B': \"/local/common_models/Llama-3.1-8B\", \n",
    "    'llama3_70B': \"/local/common_models/Llama-3.1-70B\",\n",
    "    'gemma3_1B': \"/local/common_models/gemma-3-1b-pt\",\n",
    "    'gemma3_27B': \"/local/common_models/gemma-3-27b-pt\",\n",
    "    'qwen2.5_7B': \"/local/common_models/Qwen2.5-7B\",\n",
    "    'qwen2.5_32B': \"/local/common_models/Qwen2.5-32B\",\n",
    "}\n",
    "\n",
    "# Dataset JSONL path -> task keys to evaluate on that file.\n",
    "# Each task key is also the record field that load_dataset reads as the target.\n",
    "DATA_PATHS: Dict[str, List[str]] = {\n",
    "    # 'datasets/realistic/git_tasks.jsonl': ['revert', 'cherrypick'],\n",
    "    '../datasets/realistic/stacktrace_tasks.jsonl': ['traceb', 'tracef'],\n",
    "}\n",
    "\n",
    "# Per-task line template for one few-shot example. The two \"dummy\" slots\n",
    "# (indices 1 and 4) are overwritten with the example input and target by\n",
    "# PromptBuilder._make_example_line before the lines are joined with newlines.\n",
    "FEW_SHOT_CREATE: Dict[str, List[str]] = {\n",
    "    'traceb': [\"=== NEW TRACEBACK ===\", \"dummy\", \"=== ANSWER ===\", \"<start>\", \"dummy\", \"<end>\"],\n",
    "    'tracef': [\"=== NEW TRACEBACK ===\", \"dummy\", \"=== ANSWER ===\", \"<start>\", \"dummy\", \"<end>\"],\n",
    "    'revert': [\"=== NEW HISTORY ===\", \"dummy\", \"=== ANSWER ===\", \"<start>\", \"dummy\", \"<end>\"],\n",
    "    'cherrypick': [\"=== NEW HISTORY ===\", \"dummy\", \"=== ANSWER ===\", \"<start>\", \"dummy\", \"<end>\"]\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "def load_dataset(data_path: Path, task_key: str) -> List[Dict]:\n",
    "    \"\"\"Read a JSONL file and reduce every record to an input/target pair.\n",
    "\n",
    "    :param data_path: Location of the JSONL file (coerced to ``Path``)\n",
    "    :param task_key: Record field used as the target; must be one of\n",
    "        'traceb', 'tracef', 'revert', 'cherrypick'\n",
    "    :return: One ``{'input': ..., 'target': ...}`` dict per record, where\n",
    "        ``input`` is the record's ``snippet`` field\n",
    "    :raises AssertionError: If ``task_key`` is not one of the supported keys\n",
    "    \"\"\"\n",
    "    with jsonlines.open(Path(data_path), \"r\") as reader:\n",
    "        rows: List[Dict] = list(reader)\n",
    "\n",
    "    # The key is validated only after the whole file has been read.\n",
    "    assert task_key in ['traceb', 'tracef', 'revert', 'cherrypick']\n",
    "    return [{'input': row['snippet'], 'target': row[task_key]} for row in rows]\n",
    "\n",
    "\n",
    "def get_prompts_from_registry(prompt_path: str, config: str = 'exact'):\n",
    "    \"\"\"\n",
    "    Fetch the ``task_<config>`` prompt from a directory-backed prompt registry.\n",
    "\n",
    "    :param prompt_path: Directory holding the prompt files (wrapped in ``Path``)\n",
    "    :param config: Suffix selecting which ``task_*`` prompt is looked up\n",
    "    :return: The object returned by ``DirectoryPromptRegistry.get`` for that name\n",
    "    \"\"\"\n",
    "    prompt_name = f\"task_{config}\"\n",
    "    # The registry is rebuilt with force_reindex=True on every call.\n",
    "    return DirectoryPromptRegistry(Path(prompt_path), force_reindex=True).get(name=prompt_name)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "class PromptBuilder:\n",
    "    \"\"\"Construct few shot prompts according to *PromptVariant*.\"\"\"\n",
    "\n",
    "    def __init__(\n",
    "        self,\n",
    "        shots: int,\n",
    "        prompt_dir: Path,\n",
    "        task_key: str,\n",
    "        test_data_path: Path,\n",
    "        tokenizer: AutoTokenizer,\n",
    "    ) -> None:\n",
    "\n",
    "        self.shots = shots\n",
    "        self.task_key = task_key\n",
    "        self.tokenizer = tokenizer\n",
    "\n",
    "        self.dataset = load_dataset(test_data_path, task_key=task_key)\n",
    "        self.task_prompt = get_prompts_from_registry(prompt_dir, task_key)\n",
    "\n",
    "    def _make_example_line(self, input_str: str, target: str | None) -> str:\n",
    "        \"\"\"Return an *atomised* example line (with or without target).\"\"\"\n",
    "        relevant_list = FEW_SHOT_CREATE[self.task_key].copy()\n",
    "        # Insert the input string into the dummy placeholder\n",
    "        relevant_list[1] = input_str\n",
    "        relevant_list[4] = target\n",
    "        # Each line in the list is joined by a newline character\n",
    "        return \"\\n\".join(relevant_list)\n",
    "\n",
    "    def build_prompt(self, curr_record: Dict) -> List[int]:\n",
    "        \"\"\"Return a fully instantiated prompt string\"\"\"\n",
    "        few_shot_pool = [rec for rec in self.dataset if rec != curr_record]\n",
    "        few_shot_examples = random.sample(few_shot_pool, k=self.shots)\n",
    "        # Add the few‑shot examples\n",
    "        examples = [self._make_example_line(ex[\"input\"], ex[\"target\"]) for ex in few_shot_examples]\n",
    "        # Each example is separated by a \\n\\n\n",
    "        examples_block = '\\n\\n'.join(examples)\n",
    "        # Add the current code snippet to the relevant block : seems correct ! \n",
    "        curr_prompt = self.task_prompt.text({'few_shot_block': examples_block, 'snippet': curr_record['input']})\n",
    "        ids = [self.tokenizer.bos_token_id] + self.tokenizer.encode(curr_prompt, add_special_tokens=False)\n",
    "        print(ids)\n",
    "        return ids\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "class ModelEvaluator:\n",
    "    \"\"\"\n",
    "    Handles vLLM-based inference for batch prompt evaluation.\n",
    "    \"\"\"\n",
    "    def __init__(self, model_path: str, temperature: float, max_tokens: int, seed: int, tensor_parallel_size: int):\n",
    "        # Load tokenizer and LLM engine\n",
    "        print(f\"Loading tokenizer from {model_path}...\")\n",
    "        self.tokenizer = AutoTokenizer.from_pretrained(model_path, add_prefix_space=False)\n",
    "        self.tokenizer.pad_token = self.tokenizer.eos_token\n",
    "\n",
    "        print(f\"Initializing vLLM engine with model {model_path}...\")\n",
    "        self.llm = LLM(model=model_path, tensor_parallel_size=tensor_parallel_size, seed=seed, skip_tokenizer_init=False)\n",
    "        self.sampling_params = SamplingParams(\n",
    "            max_tokens=max_tokens,\n",
    "            temperature=temperature,\n",
    "            top_p=1.0,\n",
    "            stop=[\"\\n<end>\", \"<end>\"],   # both variants, safe on all traces\n",
    "        )\n",
    "\n",
    "\n",
    "    def run(self, builder, batch_size: int) -> List[Dict[str, Any]]:\n",
    "        \"\"\"\n",
    "        Generate outputs for a list of prompt strings in batches.\n",
    "        Returns list of (generated_text, usage_stats).\n",
    "        \"\"\"\n",
    "        records = builder.dataset\n",
    "        results = []\n",
    "        for i in tqdm.tqdm(range(0, len(records), batch_size)):\n",
    "            curr_inp_batch = records[i : i + batch_size]\n",
    "            \n",
    "            token_batches = [builder.build_prompt(record) for record in curr_inp_batch]\n",
    "            responses = self.llm.generate(\n",
    "                prompt_token_ids=token_batches,\n",
    "                sampling_params=self.sampling_params,\n",
    "            )\n",
    "            for resp, curr_input in zip(responses, curr_inp_batch):\n",
    "                gen_ids = resp.outputs[0].token_ids\n",
    "                out_text = self.tokenizer.decode(gen_ids).strip()\n",
    "                results.append({\n",
    "                    'completion_tokens': len(gen_ids),\n",
    "                    'input_text': curr_input['input'],\n",
    "                    'full_answer': out_text,\n",
    "                    'gold_ans': curr_input['target'],\n",
    "                    'exact_match': out_text == curr_input['target']\n",
    "                })\n",
    "                print(results)\n",
    "            break\n",
    "        return results\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "def save_to_jsonl(out_path: Path, records: list[dict]) -> None:\n",
    "    \"\"\"\n",
    "    Write ``records`` to ``out_path`` in JSON Lines format.\n",
    "\n",
    "    Missing parent directories are created before the file is opened.\n",
    "\n",
    "    :param out_path: Destination file, opened in write mode\n",
    "    :param records: Dictionaries handed to the jsonlines writer\n",
    "    \"\"\"\n",
    "    target_dir = out_path.parent\n",
    "    target_dir.mkdir(parents=True, exist_ok=True)\n",
    "\n",
    "    print(f\"Saving results to {out_path}\")\n",
    "    with jsonlines.open(out_path, mode='w') as sink:\n",
    "        sink.write_all(records)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Loading tokenizer from Qwen/Qwen2.5-1.5B...\n",
      "Initializing vLLM engine with model Qwen/Qwen2.5-1.5B...\n",
      "INFO 05-07 19:02:49 [config.py:2673] For macOS with Apple Silicon, currently bfloat16 is not supported. Setting dtype to float16.\n",
      "WARNING 05-07 19:02:49 [config.py:2704] Casting torch.bfloat16 to torch.float16.\n",
      "INFO 05-07 19:02:54 [config.py:600] This model supports multiple tasks: {'classify', 'reward', 'embed', 'generate', 'score'}. Defaulting to 'generate'.\n",
      "WARNING 05-07 19:02:54 [arg_utils.py:1708] device type=cpu is not supported by the V1 Engine. Falling back to V0. \n",
      "WARNING 05-07 19:02:54 [arg_utils.py:1581] The model has a long context length (131072). This may causeOOM during the initial memory profiling phase, or result in low performance due to small KV cache size. Consider setting --max-model-len to a smaller value.\n",
      "INFO 05-07 19:02:54 [config.py:1634] Disabled the custom all-reduce kernel because it is not supported on current platform.\n",
      "WARNING 05-07 19:02:54 [cpu.py:106] Environment variable VLLM_CPU_KVCACHE_SPACE (GiB) for CPU backend is not set, using 4 by default.\n",
      "WARNING 05-07 19:02:54 [cpu.py:119] uni is not supported on CPU, fallback to mp distributed executor backend.\n",
      "WARNING 05-07 19:02:54 [cpu.py:163] Default to spawn method on MacOS. If this is not desired, set VLLM_WORKER_MULTIPROC_METHOD to fork explicitly.\n",
      "INFO 05-07 19:02:54 [llm_engine.py:242] Initializing a V0 LLM engine (v0.8.3) with config: model='Qwen/Qwen2.5-1.5B', speculative_config=None, tokenizer='Qwen/Qwen2.5-1.5B', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float16, max_seq_len=131072, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=True, quantization=None, enforce_eager=True, kv_cache_dtype=auto,  device_config=cpu, decoding_config=DecodingConfig(guided_decoding_backend='xgrammar', reasoning_backend=None), observability_config=ObservabilityConfig(show_hidden_metrics=False, otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_execute_time=False), seed=20, served_model_name=Qwen/Qwen2.5-1.5B, num_scheduler_steps=1, multi_step_stream_outputs=True, enable_prefix_caching=None, chunked_prefill_enabled=False, use_async_output_proc=False, disable_mm_preprocessor_cache=False, mm_processor_kwargs=None, pooler_config=None, compilation_config={\"splitting_ops\":[],\"compile_sizes\":[],\"cudagraph_capture_sizes\":[256,248,240,232,224,216,208,200,192,184,176,168,160,152,144,136,128,120,112,104,96,88,80,72,64,56,48,40,32,24,16,8,4,2,1],\"max_capture_size\":256}, use_cached_outputs=False, \n",
      "INFO 05-07 19:02:56 [cpu.py:45] Using Torch SDPA backend.\n",
      "INFO 05-07 19:02:56 [importing.py:16] Triton not installed or not compatible; certain GPU-related functions will not be available.\n",
      "INFO 05-07 19:02:56 [parallel_state.py:957] rank 0 in world size 1 is assigned as DP rank 0, PP rank 0, TP rank 0\n",
      "INFO 05-07 19:02:56 [weight_utils.py:265] Using model weights format ['*.safetensors']\n",
      "INFO 05-07 19:02:56 [weight_utils.py:315] No model.safetensors.index.json found in remote.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]\n",
      "Loading safetensors checkpoint shards: 100% Completed | 1/1 [00:02<00:00,  2.86s/it]\n",
      "Loading safetensors checkpoint shards: 100% Completed | 1/1 [00:02<00:00,  2.86s/it]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO 05-07 19:02:59 [loader.py:447] Loading weights took 2.87 seconds\n",
      "INFO 05-07 19:02:59 [executor_base.py:112] # cpu blocks: 9362, # CPU blocks: 0\n",
      "INFO 05-07 19:02:59 [executor_base.py:117] Maximum concurrency for 131072 tokens per request: 1.14x\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO 05-07 19:03:00 [llm_engine.py:448] init engine (profile, create kv cache, warmup model) took 0.83 seconds\n"
     ]
    }
   ],
   "source": [
    "\n",
    "# Initialize evaluator: ModelEvaluator.__init__ loads the tokenizer and the vLLM engine.\n",
    "# temperature=0 with a fixed engine seed; completions are capped at 200 new tokens.\n",
    "# NOTE(review): this model id is not one of the MODELS entries from the first cell;\n",
    "# confirm it is the checkpoint intended for this check.\n",
    "evalr = ModelEvaluator(\n",
    "    model_path='Qwen/Qwen2.5-1.5B',\n",
    "    temperature=0,\n",
    "    max_tokens=200,\n",
    "    seed=20,\n",
    "    tensor_parallel_size=1\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "  0%|          | 0/1500 [00:00<?, ?it/s]/var/folders/lk/jzr5f72x3kbchh0b31kq9xh00000gn/T/ipykernel_67097/2278168425.py:11: DeprecationWarning: The keyword arguments {'prompt_token_ids'} are deprecated and will be removed in a future update. Please use the 'prompts' parameter instead.\n",
      "  outputs = evalr.run(builder, 1)\n",
      "  0%|          | 0/1500 [00:00<?, ?it/s]"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    },
    {
     "ename": "TypeError",
     "evalue": "prompt must be a string, array of strings, array of tokens, or array of token arrays",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mTypeError\u001b[0m                                 Traceback (most recent call last)",
      "\u001b[1;32m/Users/yash/Desktop/Lacoco/len-gen/realistic/check_program_qwen.ipynb Cell 7\u001b[0m line \u001b[0;36m1\n\u001b[1;32m      <a href='vscode-notebook-cell:/Users/yash/Desktop/Lacoco/len-gen/realistic/check_program_qwen.ipynb#W6sZmlsZQ%3D%3D?line=2'>3</a>\u001b[0m \u001b[39mfor\u001b[39;00m task_type \u001b[39min\u001b[39;00m task_info_list:\n\u001b[1;32m      <a href='vscode-notebook-cell:/Users/yash/Desktop/Lacoco/len-gen/realistic/check_program_qwen.ipynb#W6sZmlsZQ%3D%3D?line=3'>4</a>\u001b[0m     builder \u001b[39m=\u001b[39m PromptBuilder(\n\u001b[1;32m      <a href='vscode-notebook-cell:/Users/yash/Desktop/Lacoco/len-gen/realistic/check_program_qwen.ipynb#W6sZmlsZQ%3D%3D?line=4'>5</a>\u001b[0m         shots\u001b[39m=\u001b[39m\u001b[39m1\u001b[39m,\n\u001b[1;32m      <a href='vscode-notebook-cell:/Users/yash/Desktop/Lacoco/len-gen/realistic/check_program_qwen.ipynb#W6sZmlsZQ%3D%3D?line=5'>6</a>\u001b[0m         prompt_dir\u001b[39m=\u001b[39mPath(\u001b[39m'\u001b[39m\u001b[39m../prompts/realistic/codeassist/\u001b[39m\u001b[39m'\u001b[39m),\n\u001b[0;32m   (...)\u001b[0m\n\u001b[1;32m      <a href='vscode-notebook-cell:/Users/yash/Desktop/Lacoco/len-gen/realistic/check_program_qwen.ipynb#W6sZmlsZQ%3D%3D?line=8'>9</a>\u001b[0m         tokenizer\u001b[39m=\u001b[39mevalr\u001b[39m.\u001b[39mtokenizer\n\u001b[1;32m     <a href='vscode-notebook-cell:/Users/yash/Desktop/Lacoco/len-gen/realistic/check_program_qwen.ipynb#W6sZmlsZQ%3D%3D?line=9'>10</a>\u001b[0m     )\n\u001b[0;32m---> <a href='vscode-notebook-cell:/Users/yash/Desktop/Lacoco/len-gen/realistic/check_program_qwen.ipynb#W6sZmlsZQ%3D%3D?line=10'>11</a>\u001b[0m     outputs \u001b[39m=\u001b[39m evalr\u001b[39m.\u001b[39;49mrun(builder, \u001b[39m1\u001b[39;49m)\n\u001b[1;32m     <a href='vscode-notebook-cell:/Users/yash/Desktop/Lacoco/len-gen/realistic/check_program_qwen.ipynb#W6sZmlsZQ%3D%3D?line=11'>12</a>\u001b[0m     \u001b[39mbreak\u001b[39;00m\n\u001b[1;32m     <a 
href='vscode-notebook-cell:/Users/yash/Desktop/Lacoco/len-gen/realistic/check_program_qwen.ipynb#W6sZmlsZQ%3D%3D?line=12'>13</a>\u001b[0m     \u001b[39m# out_file = Path('') /f\"{task_type}\" / f\"{args.model}_{data_path.stem}.jsonl\"\u001b[39;00m\n\u001b[1;32m     <a href='vscode-notebook-cell:/Users/yash/Desktop/Lacoco/len-gen/realistic/check_program_qwen.ipynb#W6sZmlsZQ%3D%3D?line=13'>14</a>\u001b[0m     \u001b[39m# save_to_jsonl(out_file, outputs)\u001b[39;00m\n",
      "\u001b[1;32m/Users/yash/Desktop/Lacoco/len-gen/realistic/check_program_qwen.ipynb Cell 7\u001b[0m line \u001b[0;36m3\n\u001b[1;32m     <a href='vscode-notebook-cell:/Users/yash/Desktop/Lacoco/len-gen/realistic/check_program_qwen.ipynb#W6sZmlsZQ%3D%3D?line=28'>29</a>\u001b[0m curr_inp_batch \u001b[39m=\u001b[39m records[i : i \u001b[39m+\u001b[39m batch_size]\n\u001b[1;32m     <a href='vscode-notebook-cell:/Users/yash/Desktop/Lacoco/len-gen/realistic/check_program_qwen.ipynb#W6sZmlsZQ%3D%3D?line=30'>31</a>\u001b[0m token_batches \u001b[39m=\u001b[39m [builder\u001b[39m.\u001b[39mbuild_prompt(record) \u001b[39mfor\u001b[39;00m record \u001b[39min\u001b[39;00m curr_inp_batch]\n\u001b[0;32m---> <a href='vscode-notebook-cell:/Users/yash/Desktop/Lacoco/len-gen/realistic/check_program_qwen.ipynb#W6sZmlsZQ%3D%3D?line=31'>32</a>\u001b[0m responses \u001b[39m=\u001b[39m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mllm\u001b[39m.\u001b[39;49mgenerate(\n\u001b[1;32m     <a href='vscode-notebook-cell:/Users/yash/Desktop/Lacoco/len-gen/realistic/check_program_qwen.ipynb#W6sZmlsZQ%3D%3D?line=32'>33</a>\u001b[0m     prompt_token_ids\u001b[39m=\u001b[39;49mtoken_batches,\n\u001b[1;32m     <a href='vscode-notebook-cell:/Users/yash/Desktop/Lacoco/len-gen/realistic/check_program_qwen.ipynb#W6sZmlsZQ%3D%3D?line=33'>34</a>\u001b[0m     sampling_params\u001b[39m=\u001b[39;49m\u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49msampling_params,\n\u001b[1;32m     <a href='vscode-notebook-cell:/Users/yash/Desktop/Lacoco/len-gen/realistic/check_program_qwen.ipynb#W6sZmlsZQ%3D%3D?line=34'>35</a>\u001b[0m )\n\u001b[1;32m     <a href='vscode-notebook-cell:/Users/yash/Desktop/Lacoco/len-gen/realistic/check_program_qwen.ipynb#W6sZmlsZQ%3D%3D?line=35'>36</a>\u001b[0m \u001b[39mfor\u001b[39;00m resp, curr_input \u001b[39min\u001b[39;00m \u001b[39mzip\u001b[39m(responses, curr_inp_batch):\n\u001b[1;32m     <a 
href='vscode-notebook-cell:/Users/yash/Desktop/Lacoco/len-gen/realistic/check_program_qwen.ipynb#W6sZmlsZQ%3D%3D?line=36'>37</a>\u001b[0m     gen_ids \u001b[39m=\u001b[39m resp\u001b[39m.\u001b[39moutputs[\u001b[39m0\u001b[39m]\u001b[39m.\u001b[39mtoken_ids\n",
      "File \u001b[0;32m~/Desktop/Lacoco/len-gen/.venv/lib/python3.12/site-packages/vllm/utils.py:1131\u001b[0m, in \u001b[0;36mdeprecate_kwargs.<locals>.wrapper.<locals>.inner\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m   1124\u001b[0m             msg \u001b[39m+\u001b[39m\u001b[39m=\u001b[39m \u001b[39mf\u001b[39m\u001b[39m\"\u001b[39m\u001b[39m \u001b[39m\u001b[39m{\u001b[39;00madditional_message\u001b[39m}\u001b[39;00m\u001b[39m\"\u001b[39m\n\u001b[1;32m   1126\u001b[0m         warnings\u001b[39m.\u001b[39mwarn(\n\u001b[1;32m   1127\u001b[0m             \u001b[39mDeprecationWarning\u001b[39;00m(msg),\n\u001b[1;32m   1128\u001b[0m             stacklevel\u001b[39m=\u001b[39m\u001b[39m3\u001b[39m,  \u001b[39m# The inner function takes up one level\u001b[39;00m\n\u001b[1;32m   1129\u001b[0m         )\n\u001b[0;32m-> 1131\u001b[0m \u001b[39mreturn\u001b[39;00m fn(\u001b[39m*\u001b[39;49margs, \u001b[39m*\u001b[39;49m\u001b[39m*\u001b[39;49mkwargs)\n",
      "File \u001b[0;32m~/Desktop/Lacoco/len-gen/.venv/lib/python3.12/site-packages/vllm/entrypoints/llm.py:437\u001b[0m, in \u001b[0;36mLLM.generate\u001b[0;34m(self, prompts, sampling_params, prompt_token_ids, use_tqdm, lora_request, prompt_adapter_request, guided_options_request, priority)\u001b[0m\n\u001b[1;32m    434\u001b[0m     \u001b[39mraise\u001b[39;00m \u001b[39mValueError\u001b[39;00m(\u001b[39m\"\u001b[39m\u001b[39m \u001b[39m\u001b[39m\"\u001b[39m\u001b[39m.\u001b[39mjoin(messages))\n\u001b[1;32m    436\u001b[0m \u001b[39mif\u001b[39;00m prompt_token_ids \u001b[39mis\u001b[39;00m \u001b[39mnot\u001b[39;00m \u001b[39mNone\u001b[39;00m:\n\u001b[0;32m--> 437\u001b[0m     parsed_prompts \u001b[39m=\u001b[39m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_convert_v1_inputs(\n\u001b[1;32m    438\u001b[0m         prompts\u001b[39m=\u001b[39;49mcast(Optional[Union[\u001b[39mstr\u001b[39;49m, \u001b[39mlist\u001b[39;49m[\u001b[39mstr\u001b[39;49m]]], prompts),\n\u001b[1;32m    439\u001b[0m         prompt_token_ids\u001b[39m=\u001b[39;49mprompt_token_ids,\n\u001b[1;32m    440\u001b[0m     )\n\u001b[1;32m    441\u001b[0m \u001b[39melse\u001b[39;00m:\n\u001b[1;32m    442\u001b[0m     parsed_prompts \u001b[39m=\u001b[39m cast(Union[PromptType, Sequence[PromptType]],\n\u001b[1;32m    443\u001b[0m                           prompts)\n",
      "File \u001b[0;32m~/Desktop/Lacoco/len-gen/.venv/lib/python3.12/site-packages/vllm/entrypoints/llm.py:1245\u001b[0m, in \u001b[0;36mLLM._convert_v1_inputs\u001b[0;34m(self, prompts, prompt_token_ids)\u001b[0m\n\u001b[1;32m   1242\u001b[0m     prompts \u001b[39m=\u001b[39m [p[\u001b[39m\"\u001b[39m\u001b[39mcontent\u001b[39m\u001b[39m\"\u001b[39m] \u001b[39mfor\u001b[39;00m p \u001b[39min\u001b[39;00m parse_and_batch_prompt(prompts)]\n\u001b[1;32m   1243\u001b[0m \u001b[39mif\u001b[39;00m prompt_token_ids \u001b[39mis\u001b[39;00m \u001b[39mnot\u001b[39;00m \u001b[39mNone\u001b[39;00m:\n\u001b[1;32m   1244\u001b[0m     prompt_token_ids \u001b[39m=\u001b[39m [\n\u001b[0;32m-> 1245\u001b[0m         p[\u001b[39m\"\u001b[39m\u001b[39mcontent\u001b[39m\u001b[39m\"\u001b[39m] \u001b[39mfor\u001b[39;00m p \u001b[39min\u001b[39;00m parse_and_batch_prompt(prompt_token_ids)\n\u001b[1;32m   1246\u001b[0m     ]\n\u001b[1;32m   1248\u001b[0m num_requests \u001b[39m=\u001b[39m \u001b[39mNone\u001b[39;00m\n\u001b[1;32m   1249\u001b[0m \u001b[39mif\u001b[39;00m prompts \u001b[39mis\u001b[39;00m \u001b[39mnot\u001b[39;00m \u001b[39mNone\u001b[39;00m:\n",
      "File \u001b[0;32m~/Desktop/Lacoco/len-gen/.venv/lib/python3.12/site-packages/vllm/inputs/parse.py:68\u001b[0m, in \u001b[0;36mparse_and_batch_prompt\u001b[0;34m(prompt)\u001b[0m\n\u001b[1;32m     61\u001b[0m         \u001b[39mif\u001b[39;00m is_list_of(prompt[\u001b[39m0\u001b[39m], \u001b[39mint\u001b[39m):\n\u001b[1;32m     62\u001b[0m             \u001b[39m# case 4: array of token arrays\u001b[39;00m\n\u001b[1;32m     63\u001b[0m             \u001b[39mreturn\u001b[39;00m [\n\u001b[1;32m     64\u001b[0m                 ParsedTokens(content\u001b[39m=\u001b[39melem, is_tokens\u001b[39m=\u001b[39m\u001b[39mTrue\u001b[39;00m)\n\u001b[1;32m     65\u001b[0m                 \u001b[39mfor\u001b[39;00m elem \u001b[39min\u001b[39;00m prompt\n\u001b[1;32m     66\u001b[0m             ]\n\u001b[0;32m---> 68\u001b[0m \u001b[39mraise\u001b[39;00m \u001b[39mTypeError\u001b[39;00m(\u001b[39m\"\u001b[39m\u001b[39mprompt must be a string, array of strings, \u001b[39m\u001b[39m\"\u001b[39m\n\u001b[1;32m     69\u001b[0m                 \u001b[39m\"\u001b[39m\u001b[39marray of tokens, or array of token arrays\u001b[39m\u001b[39m\"\u001b[39m)\n",
      "\u001b[0;31mTypeError\u001b[0m: prompt must be a string, array of strings, array of tokens, or array of token arrays"
     ]
    }
   ],
   "source": [
    "\n",
    "# Seed the global RNG used by random.sample in PromptBuilder.build_prompt so\n",
    "# that reruns draw the same few-shot examples; 20 is the seed also given to\n",
    "# ModelEvaluator when `evalr` was created.\n",
    "random.seed(20)\n",
    "\n",
    "for data_path, task_info_list in DATA_PATHS.items():\n",
    "    data_path = Path(data_path)\n",
    "    for task_type in task_info_list:\n",
    "        builder = PromptBuilder(\n",
    "            shots=1,\n",
    "            prompt_dir=Path('../prompts/realistic/codeassist/'),\n",
    "            task_key=task_type,\n",
    "            test_data_path=data_path,\n",
    "            tokenizer=evalr.tokenizer,\n",
    "        )\n",
    "        outputs = evalr.run(builder, 1)\n",
    "        # Smoke test only: stop after the first task of each dataset file.\n",
    "        break\n",
    "        # Saving is disabled: `args` is not defined anywhere in this notebook.\n",
    "        # out_file = Path('') /f\"{task_type}\" / f\"{args.model}_{data_path.stem}.jsonl\"\n",
    "        # save_to_jsonl(out_file, outputs)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": ".venv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
