{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e710944c",
   "metadata": {},
   "outputs": [],
   "source": [
    "import random\n",
    "\n",
    "import pandas as pd\n",
    "\n",
    "# Tunable parameters for building the sampled dataset.\n",
    "MAX_CONTEXT_CHARS = 400     # contexts must be non-empty and strictly shorter than this\n",
    "MIN_ENTITY_COUNT = 2000     # a category needs at least this many rows to be eligible\n",
    "MAX_ENTITY_COUNT = 3000     # ... and at most this many\n",
    "N_ENTITIES = 10             # number of categories to draw\n",
    "SAMPLES_PER_ENTITY = 2000   # rows sampled per drawn category\n",
    "SEED = 42                   # shared seed for category selection and row sampling\n",
    "\n",
    "# Row sampling below draws without replacement, so every eligible category\n",
    "# must hold at least SAMPLES_PER_ENTITY rows.\n",
    "assert SAMPLES_PER_ENTITY <= MIN_ENTITY_COUNT\n",
    "\n",
    "# Read the TSV file. dtype=str keeps numeric-looking fields as text so the\n",
    "# .str length checks and the grouping below treat every value as a string.\n",
    "file_path = \"shuffled_dedup_entities.tsv\"\n",
    "raw_data = pd.read_csv(file_path, sep='\\t', header=None, names=[\"text_preceding\", \"text\", \"text_following\"],\n",
    "                       dtype=str, on_bad_lines='skip')\n",
    "\n",
    "# Filter rows where preceding and following text lengths are within the specified range.\n",
    "# Missing contexts have a NaN length, fail the comparisons, and are dropped.\n",
    "preceding_len = raw_data['text_preceding'].str.len()\n",
    "following_len = raw_data['text_following'].str.len()\n",
    "data = raw_data[(preceding_len > 0) & (preceding_len < MAX_CONTEXT_CHARS) &\n",
    "                (following_len > 0) & (following_len < MAX_CONTEXT_CHARS)]\n",
    "\n",
    "# Group by 'text' and count the number of samples in each category\n",
    "grouped = data.groupby(\"text\")\n",
    "entity_counts = grouped.size()\n",
    "print(entity_counts)\n",
    "\n",
    "# Keep only categories whose row count lies in [MIN_ENTITY_COUNT, MAX_ENTITY_COUNT]\n",
    "valid_entities = entity_counts[(entity_counts >= MIN_ENTITY_COUNT) & (entity_counts <= MAX_ENTITY_COUNT)].index\n",
    "print(f\"Number of categories meeting the criteria: {len(valid_entities)}\")\n",
    "\n",
    "# Fail with a descriptive message instead of random.sample's generic\n",
    "# \"Sample larger than population\" error when too few categories qualify.\n",
    "if len(valid_entities) < N_ENTITIES:\n",
    "    raise ValueError(\n",
    "        f\"Need at least {N_ENTITIES} categories with between {MIN_ENTITY_COUNT} and \"\n",
    "        f\"{MAX_ENTITY_COUNT} rows, but only {len(valid_entities)} qualify.\"\n",
    "    )\n",
    "\n",
    "# Randomly select N_ENTITIES categories. A dedicated seeded generator makes the\n",
    "# selection repeatable across runs and leaves the global `random` state untouched.\n",
    "random_sample = random.Random(SEED).sample(valid_entities.tolist(), N_ENTITIES)\n",
    "\n",
    "# Sample SAMPLES_PER_ENTITY context entries for each selected category\n",
    "sampled_data = []\n",
    "for entity in random_sample:\n",
    "    entity_data = grouped.get_group(entity)\n",
    "    sampled_entity_data = entity_data.sample(n=SAMPLES_PER_ENTITY, random_state=SEED)  # Fixed random seed for consistency\n",
    "    sampled_data.append(sampled_entity_data)\n",
    "\n",
    "# Combine all sampled results\n",
    "final_data = pd.concat(sampled_data)\n",
    "\n",
    "# Save to a new file\n",
    "output_path = \"OneShotWikiLinks.csv\"\n",
    "final_data.to_csv(output_path, index=False)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
