{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "dcd51bc7",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "from tqdm import tqdm\n",
    "import json\n",
    "import pandas as pd\n",
    "\n",
    "\n",
    "exp = []\n",
    "for filename in tqdm(os.listdir(\"INSERT YOURS/explanations/\")):\n",
    "    with open(os.path.join(\"INSERT YOURS/explanations/\", filename)) as f:\n",
    "        lines = f.readlines()\n",
    "        for line in lines:\n",
    "            exp.append(json.loads(line))\n",
    "\n",
    "exp_df = []\n",
    "for feat in exp:\n",
    "    exp_df.append({\n",
    "        \"index\": int(feat[\"index\"]),\n",
    "        \"desc\": feat[\"description\"],\n",
    "    })\n",
    "\n",
    "exp_df = pd.DataFrame(exp_df)\n",
    "exp_df = exp_df[~ exp_df[\"index\"].duplicated()]\n",
    "\n",
    "exp_df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "fb0326e3",
   "metadata": {},
   "outputs": [],
   "source": [
    "def create_request_line(feature_idx, feature_desc, tag):\n",
    "    messages = [\n",
    "        {\n",
    "            \"role\": \"user\",\n",
    "            \"content\": (\n",
    "                f\"Here is the description of a certain textual property: \\\"{feature_desc.strip()}\\\". \"\n",
    "                f\"Is this property related to {tag}? \"\n",
    "                \"Respond with one word only: yes or no.\"\n",
    "            )\n",
    "        },\n",
    "    ]\n",
    "\n",
    "    return {\n",
    "        \"custom_id\": f\"{feature_idx}-{tag.replace(' ', '_')}\", \n",
    "        \"method\": \"POST\",\n",
    "        \"url\": \"/v1/chat/completions\",\n",
    "        \"body\": {\n",
    "            \"model\": \"gpt-4.1-nano-2025-04-14\", \n",
    "            \"messages\": messages, \n",
    "            \"max_completion_tokens\": 3\n",
    "        }\n",
    "    }\n",
    "\n",
    "tags = [\n",
    "    \"computer code, programming languages, or math\",\n",
    "    \"syntax or text structure\"\n",
    "]\n",
    "\n",
    "lines = []\n",
    "for tag in tags:\n",
    "    for _, row in exp_df.iterrows():\n",
    "        lines.append(create_request_line(row[\"index\"], row[\"desc\"], tag))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "99ff273d",
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "\n",
    "with open(\"INSERT YOURS.jsonl\", \"w\") as f:\n",
    "    for line in lines:\n",
    "        f.write(json.dumps(line) + \"\\n\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "cbc6a70e",
   "metadata": {},
   "outputs": [],
   "source": [
    "OPENAI_KEY = \"INSERT YOURS\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a83bf2c5",
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "from openai import OpenAI\n",
    "client = OpenAI(api_key=OPENAI_KEY)\n",
    "\n",
    "batch_input_file = client.files.create(\n",
    "    file=open(\"INSERT YOURS.jsonl\", \"rb\"),\n",
    "    purpose=\"batch\"\n",
    ")\n",
    "\n",
    "batch_input_file_id = batch_input_file.id\n",
    "batch = client.batches.create(\n",
    "    input_file_id=batch_input_file_id,\n",
    "    endpoint=\"/v1/chat/completions\",\n",
    "    completion_window=\"24h\",\n",
    "    metadata={\n",
    "        \"description\": \"feature tagging\"\n",
    "    }\n",
    ")\n",
    "\n",
    "batch"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "eebd854c",
   "metadata": {},
   "outputs": [],
   "source": [
    "batch = client.batches.retrieve(\"INSERT YOURS\")\n",
    "print(batch)\n",
    "print(batch.status)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6ac2e622",
   "metadata": {},
   "outputs": [],
   "source": [
    "file_response = client.files.content(\"INSERT YOURS\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "47b0e215",
   "metadata": {},
   "outputs": [],
   "source": [
    "def parse_response_line(line):\n",
    "    unjsoned = json.loads(line)\n",
    "    custom_id = unjsoned[\"custom_id\"]\n",
    "    feature_idx = int(custom_id.split(\"-\")[0])\n",
    "    tag = custom_id.split(\"-\")[1].replace(\"_\", \" \")\n",
    "    response = unjsoned[\"response\"][\"body\"][\"choices\"][0][\"message\"][\"content\"]\n",
    "    return {\n",
    "        \"feature_idx\": feature_idx,\n",
    "        \"tag\": tag,\n",
    "        \"response\": response.strip().lower()\n",
    "    }"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "76bce546",
   "metadata": {},
   "outputs": [],
   "source": [
    "results = [parse_response_line(line) for line in file_response.iter_lines()]\n",
    "results = pd.DataFrame(results)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0c0fa7b2",
   "metadata": {},
   "outputs": [],
   "source": [
    "results[\"response\"].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "23c89366",
   "metadata": {},
   "outputs": [],
   "source": [
    "results_is_code = results[results[\"tag\"] == \"computer code, programming languages, or math\"][[\"feature_idx\", \"response\"]]\n",
    "results_is_structure = results[results[\"tag\"] == \"syntax or text structure\"][[\"feature_idx\", \"response\"]]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "617e2dbe",
   "metadata": {},
   "outputs": [],
   "source": [
    "exp_df_w_code = exp_df.join(results_is_code.set_index(\"feature_idx\"), on=\"index\")\n",
    "exp_df_w_code = exp_df_w_code.rename(columns={\"response\": \"is_code\"}).join(results_is_structure.set_index(\"feature_idx\"), on=\"index\")\n",
    "exp_df_final = exp_df_w_code.rename(columns={\"response\": \"is_syntax\"})\n",
    "exp_df_final"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "79e45eb1",
   "metadata": {},
   "outputs": [],
   "source": [
    "exp_df_final[\"is_code\"] = (exp_df_final[\"is_code\"] == \"yes\").astype(int)\n",
    "exp_df_final[\"is_syntax\"] = (exp_df_final[\"is_syntax\"] == \"yes\").astype(int)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "dfa4fe24",
   "metadata": {},
   "outputs": [],
   "source": [
    "exp_df_final.to_csv(\"INSERT YOURS.csv\", index=False)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "dlenv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
