{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "542209c5",
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "from collections import Counter\n",
    "import csv\n",
    "\n",
    "\n",
    "def load_json_lines(path):\n",
    "    \"\"\"Parse a file holding one JSON document per line into a list.\n",
    "\n",
    "    Blank lines are ignored. A line that cannot be decoded is reported on\n",
    "    stdout and skipped, so a single corrupt line does not abort the load.\n",
    "\n",
    "    Args:\n",
    "        path: Path of the file to read.\n",
    "\n",
    "    Returns:\n",
    "        A list with one parsed object per successfully decoded line, in\n",
    "        file order.\n",
    "    \"\"\"\n",
    "    records = []\n",
    "    # Decode explicitly as UTF-8 instead of relying on the platform default,\n",
    "    # which differs between systems; the CSV export in this notebook also uses UTF-8.\n",
    "    with open(path, 'r', encoding='utf-8') as file:\n",
    "        for line in file:\n",
    "            if not line.strip():\n",
    "                # A blank line (e.g. a trailing newline) carries no record.\n",
    "                continue\n",
    "            try:\n",
    "                records.append(json.loads(line))\n",
    "            except json.JSONDecodeError as e:\n",
    "                print(f\"Error parsing line: {e}\")\n",
    "    return records\n",
    "\n",
    "\n",
    "# List to store the parsed records of both files: 'tst.json' first, then 'trn.json'\n",
    "data_list = []\n",
    "for path in ('tst.json', 'trn.json'):\n",
    "    data_list.extend(load_json_lines(path))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0b18f1e2",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Step 1: Keep only the entries whose content is at most 500 characters long\n",
    "filtered_by_content = list(filter(lambda entry: len(entry['content']) <= 500, data_list))\n",
    "print(f\"Number of data entries with content not exceeding 500 characters: {len(filtered_by_content)}\")\n",
    "\n",
    "# Step 2: Of those, keep only the entries whose target_ind holds exactly one element\n",
    "single_label_data = list(filter(lambda entry: len(entry['target_ind']) == 1, filtered_by_content))\n",
    "print(f\"Number of single-label data entries: {len(single_label_data)}\")\n",
    "\n",
    "# Step 3: Tally how many entries carry each label (the sole element of target_ind)\n",
    "label_counts = Counter(map(lambda entry: entry['target_ind'][0], single_label_data))\n",
    "print(\"Number of data entries for each label:\")\n",
    "for label in label_counts:\n",
    "    count = label_counts[label]\n",
    "    print(f\"Label {label}: {count} entries\")\n",
    "\n",
    "# Step 4: Pick the 10 labels with the highest entry counts, most frequent first\n",
    "top_10_labels = [ranked[0] for ranked in label_counts.most_common(10)]\n",
    "print(f\"Top 10 labels with the most data entries: {top_10_labels}\")\n",
    "\n",
    "# Step 5: Keep only the single-label entries whose label is one of those 10\n",
    "filtered_data = list(filter(lambda entry: entry['target_ind'][0] in top_10_labels, single_label_data))\n",
    "\n",
    "# Print final results\n",
    "print(f\"Final number of filtered data entries: {len(filtered_data)}\")\n",
    "\n",
    "# Step 6: Tally the entries per label again, now over the final selection\n",
    "final_label_counts = Counter(map(lambda entry: entry['target_ind'][0], filtered_data))\n",
    "print(\"Number of data entries for each label after filtering:\")\n",
    "for label in final_label_counts:\n",
    "    count = final_label_counts[label]\n",
    "    print(f\"Label {label}: {count} entries\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2df7667d",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Step 1: Name of the CSV file that receives the filtered entries\n",
    "csv_file = \"AmazonCat_10arms.csv\"\n",
    "\n",
    "# Step 2: Write the header row, then one row per entry of filtered_data\n",
    "with open(csv_file, mode=\"w\", newline=\"\", encoding=\"utf-8\") as csv_handle:\n",
    "    writer = csv.writer(csv_handle)\n",
    "    writer.writerow([\"uid\", \"target_ind\", \"title\", \"content\"])\n",
    "\n",
    "    # Rows are produced lazily, in the same order as filtered_data\n",
    "    writer.writerows(\n",
    "        (\n",
    "            entry[\"uid\"],\n",
    "            \",\".join(str(index) for index in entry[\"target_ind\"]),  # target_ind as a comma-separated string\n",
    "            entry[\"title\"],\n",
    "            entry[\"content\"],\n",
    "        )\n",
    "        for entry in filtered_data\n",
    "    )\n",
    "\n",
    "print(f\"Filtered data has been successfully saved to '{csv_file}'\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
