{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 104,
   "id": "30891363-4484-4511-98eb-412d5d5f48ff",
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "\n",
    "# 0. Download openhermes2_5.json from huggingface: https://huggingface.co/datasets/teknium/OpenHermes-2.5\n",
    "\n",
    "with open(\"openhermes2_5.json\", \"r\") as f:\n",
    "    openhermes = json.loads(f.read())\n",
    "df = pd.DataFrame(openhermes)\n",
    "\n",
    "\n",
    "def conversation_to_message_history(conversation):\n",
    "    message_history = []\n",
    "    for text in conversation:\n",
    "        if text[\"from\"] == \"human\":\n",
    "            role=\"user\"\n",
    "        elif text[\"from\"] == \"gpt\":\n",
    "            role=\"assistant\"\n",
    "        elif text[\"from\"] == \"system\":\n",
    "            role=\"system\"\n",
    "        \n",
    "        message_history.append({\"role\": role, \"content\": text[\"value\"]})\n",
    "\n",
    "    return message_history\n",
    "\n",
    "df[\"message_histories\"] = df[\"conversations\"].apply(lambda x: conversation_to_message_history(x))\n",
    "df[\"category\"] = df[\"category\"].apply(lambda x: np.nan_to_num(x))\n",
    "freqs = {category: len(df)/(sum(df[\"category\"] == category)) for category in df[\"category\"].unique()}\n",
    "\n",
    "freqs[\"experience\"] = 0.0\n",
    "freqs[\"stylized_response\"] = 0.0\n",
    "freqs[\"joke\"] = 0.0\n",
    "freqs[\"trivia\"] = 0.0\n",
    "freqs[\"roleplay\"] = 0.0\n",
    "freqs[\"riddle\"] = 0.0\n",
    "freqs[\"greeting\"] = 0.0\n",
    "\n",
    "df[\"weights\"] = df[\"category\"].apply(lambda x: freqs[x])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 105,
   "id": "4cd1149c-37a2-4332-b26c-a6d6cfd32bf5",
   "metadata": {},
   "outputs": [],
   "source": [
    "def randomly_select_n_examples(df, n, seed=42):\n",
    "    return df.sample(n=n,random_state=seed, weights=df[\"weights\"])\n",
    "\n",
    "df_small = randomly_select_n_examples(df, 15500, seed=27)\n",
    "examples = [x[1][\"message_histories\"] for x in df_small.iterrows()]\n",
    "\n",
    "with open(\"openhermes_15500.json\", \"w\") as f:\n",
    "    json.dump(obj=examples, fp=f)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.14"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
