{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "648cd1c4-cc39-4957-9d44-f74f5404e3e3",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pyarrow as pa\n",
    "import pandas as pd\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns\n",
    "import numpy as np"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a06957a2-0810-4daf-904a-2834a68da6c4",
   "metadata": {},
   "outputs": [],
   "source": [
    "lengths_path = \"\" # Where is the parquet file with the length of each article in tokens \n",
    "db_name = \"\" # SQL database name, e.g. 'interp'\n",
    "username = \"\" # DB username, e.g. 'username'\n",
    "url = \"\" # DB url/IP address, e.g. 127.0.0.1\n",
    "connection_string = \"\" # SQL connection string, e.g. 'postgresql://username@localhost/interp'\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "bdf5dd59-9f28-481a-9319-1c9ba0b53bb4",
   "metadata": {},
   "outputs": [],
   "source": [
    "with pa.OSFile(lengths_path, 'rb') as source:\n",
    "    with pa.ipc.open_file(source) as reader:\n",
    "       df = reader.read_pandas()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "bb802437-5cd3-4320-a186-ae8800ea57c9",
   "metadata": {},
   "outputs": [],
   "source": [
    "df[\"length\"].sum()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "added949-b23e-4b0e-99cb-92f4f3e08c7f",
   "metadata": {},
   "outputs": [],
   "source": [
    "df = df.sort_values(by=\"length\",ascending=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0af99601-67a2-42c6-9586-f0afc4bdd943",
   "metadata": {},
   "outputs": [],
   "source": [
    "df = df.iloc[0:3921600]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "46f1181c-9b03-4a31-9217-20824572564f",
   "metadata": {},
   "outputs": [],
   "source": [
    "len(df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3f4e64ac-8945-47dc-97d9-dfd408bb4d09",
   "metadata": {},
   "outputs": [],
   "source": [
    "df.head(20)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3f80f744-7b5c-4ec4-9c0a-3e082817d012",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import psycopg2\n",
    "from psycopg2.extras import execute_values\n",
    "from concurrent.futures import ProcessPoolExecutor\n",
    "import os\n",
    "\n",
    "# Database connection parameters\n",
    "DB_PARAMS = {\n",
    "    'dbname': db_name,\n",
    "    'user': username,\n",
    "    'host': url,\n",
    "    'port': 5432  # Change if your PostgreSQL uses a different port\n",
    "}\n",
    "\n",
    "# Number of processes to use\n",
    "NUM_PROCESSES = os.cpu_count()\n",
    "\n",
    "# Batch size for updates\n",
    "BATCH_SIZE = 10000\n",
    "\n",
    "def update_batch(uids):\n",
    "    conn = psycopg2.connect(**DB_PARAMS)\n",
    "    cur = conn.cursor()\n",
    "    \n",
    "    try:\n",
    "        # Prepare the update query\n",
    "        update_query = \"\"\"\n",
    "        UPDATE wikipedia_text\n",
    "        SET acts = -1\n",
    "        WHERE uid = ANY(%s)\n",
    "        \"\"\"\n",
    "        \n",
    "        # Execute the update\n",
    "        cur.execute(update_query, (uids,))\n",
    "        conn.commit()\n",
    "    except Exception as e:\n",
    "        print(f\"Error updating batch: {e}\")\n",
    "        conn.rollback()\n",
    "    finally:\n",
    "        cur.close()\n",
    "        conn.close()\n",
    "\n",
    "def process_uids(uids):\n",
    "    for i in range(0, len(uids), BATCH_SIZE):\n",
    "        batch = uids[i:i+BATCH_SIZE]\n",
    "        update_batch(batch)\n",
    "\n",
    "def main(df):\n",
    "    # Convert Series to list\n",
    "    uid_list = df['uid'].tolist()\n",
    "\n",
    "    # Split UIDs into chunks for each process\n",
    "    chunk_size = len(uid_list) // NUM_PROCESSES\n",
    "    uid_chunks = [uid_list[i:i+chunk_size] for i in range(0, len(uid_list), chunk_size)]\n",
    "\n",
    "    # Use ProcessPoolExecutor to update in parallel\n",
    "    with ProcessPoolExecutor(max_workers=NUM_PROCESSES) as executor:\n",
    "        executor.map(process_uids, uid_chunks)\n",
    "\n",
    "    print(\"Update completed.\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "febc491e-e916-4495-900f-1ca9a66a08cd",
   "metadata": {},
   "outputs": [],
   "source": [
    "main(new_df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e8bcd162-b44d-42d6-9a38-e0c20fb021b5",
   "metadata": {},
   "outputs": [],
   "source": [
    "import connectorx as cx"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "bf8af2cd-7a58-431d-a088-dc4f081e6d4b",
   "metadata": {},
   "outputs": [],
   "source": [
    "def load_wikipedia_text_to_dataframe(k=None,num_threads=64):\n",
    "    # Connection parameters\n",
    "    conn_params = connection_string\n",
    "    if k:\n",
    "        query = f\"\"\"\n",
    "        SELECT uid, lang, lang_id\n",
    "        FROM wikipedia_text\n",
    "        LIMIT {k};\n",
    "        \"\"\"\n",
    "    else:\n",
    "        query = f\"\"\"\n",
    "        SELECT uid, lang, lang_id\n",
    "        FROM wikipedia_text\n",
    "        \"\"\"\n",
    "    if k:\n",
    "        df = cx.read_sql(conn_params, query, return_type=\"pandas\")\n",
    "    else:\n",
    "        df = cx.read_sql(conn_params, query, partition_on=\"uid\", partition_num=num_threads,return_type=\"pandas\")\n",
    "\n",
    "    return df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "53300749-9582-46b5-a895-8b575ee9fc35",
   "metadata": {},
   "outputs": [],
   "source": [
    "lang_df = load_wikipedia_text_to_dataframe()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8a9b9c15-c575-40bb-9614-83c9060cdcf5",
   "metadata": {},
   "outputs": [],
   "source": [
    "lang_df_test = df.join(lang_df,on=\"uid\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7b888cc5-4285-41dd-877c-0c68369f7d67",
   "metadata": {},
   "outputs": [],
   "source": [
    "lang_df = lang_df.sort_values(by=\"uid\").reset_index(drop=True)\n",
    "df = df.sort_values(by=\"uid\").reset_index(drop=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "09aea37b-a0e1-4bdb-93ad-ebd5aa551e60",
   "metadata": {},
   "outputs": [],
   "source": [
    "df[\"lang_id\"] = lang_df[\"lang_id\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "03007b23-469d-466f-bc9f-4cb98d8eeda6",
   "metadata": {},
   "outputs": [],
   "source": [
    "def select_rows(df,id_df):\n",
    "    total_active_users = id_df['active_users'].sum()\n",
    "\n",
    "    # Calculate the number of rows to select for each lang_id\n",
    "    rows_per_lang = id_df.apply(lambda row: int(3921600 * (row['active_users'] / total_active_users)), axis=1)\n",
    "\n",
    "    # Initialize a list to store selected rows and a counter for unselected rows\n",
    "    selected_rows = []\n",
    "    unselected_count = 0\n",
    "\n",
    "    # Process each lang_id\n",
    "    for lang_id in range(128):\n",
    "        # Get the number of rows to select for this lang_id\n",
    "        n = rows_per_lang[lang_id]\n",
    "    \n",
    "        # Filter df for the current lang_id, ensure length >= 255, and sort by length in descending order\n",
    "        lang_df = df[(df['lang_id'] == lang_id) & (df['length'] >= 255)].sort_values('length', ascending=False)\n",
    "    \n",
    "        # Calculate the number of rows to select\n",
    "        num_rows = min(len(lang_df), n)\n",
    "        select_range = min(len(lang_df), 2*n)\n",
    "        rows_to_select = int(0.5 * select_range)\n",
    "    \n",
    "        # Select rows\n",
    "        if num_rows < n:\n",
    "            selected = lang_df.head(num_rows)\n",
    "            unselected_count += n - num_rows\n",
    "        else:\n",
    "            selected = lang_df.head(select_range).sample(n=rows_to_select, random_state=42)\n",
    "    \n",
    "        selected_rows.append(selected)\n",
    "        unselected_count += n - len(selected)\n",
    "\n",
    "    # Combine all selected rows\n",
    "    result_df = pd.concat(selected_rows)\n",
    "\n",
    "    # Calculate how many more rows need to be selected\n",
    "    remaining_rows = 3921600 - len(result_df)\n",
    "\n",
    "    # Select remaining rows from lang_id 0\n",
    "    lang_0_df = df[(df['lang_id'] == 0) & (df['length'] >= 255)].sort_values('length', ascending=False)\n",
    "    lang_0_selected = lang_0_df[~lang_0_df.index.isin(result_df.index)]\n",
    "    additional_rows = lang_0_selected.head(remaining_rows + unselected_count)\n",
    "\n",
    "    # Add the additional rows to the result\n",
    "    result_df = pd.concat([result_df, additional_rows])\n",
    "\n",
    "    # Ensure we have exactly 3921600 rows\n",
    "    result_df = result_df.head(3921600)\n",
    "\n",
    "    print(f\"Total rows selected: {len(result_df)}\")\n",
    "    return result_df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "66ce9614-0717-47f7-a844-ea82bb0f7a50",
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "id_df = pd.read_csv(\"table.csv\")[['WP\\ncode','Active\\nusers']].drop_duplicates().reset_index(drop=True)\n",
    "\n",
    "# Create an 'id' field starting at 0\n",
    "id_df['id'] = range(len(id_df))\n",
    "\n",
    "# Ensure 'en' and 'simple' have the same id value\n",
    "en_index = id_df[id_df['WP\\ncode'] == 'en'].index\n",
    "simple_index = id_df[id_df['WP\\ncode'] == 'simple'].index\n",
    "\n",
    "if not en_index.empty and not simple_index.empty:\n",
    "    en_value = id_df.loc[en_index, 'id'].values[0]\n",
    "    id_df.loc[simple_index, 'id'] = en_value\n",
    "    \n",
    "    # Adjust the ids for rows after 'simple'\n",
    "    rows_to_adjust = id_df.index > simple_index[0]\n",
    "    id_df.loc[rows_to_adjust, 'id'] -= 1\n",
    "\n",
    "id_df = id_df.rename(columns={'Active\\nusers':'active_users','id': 'lang_id'})\n",
    "id_df = id_df.loc[id_df['lang_id'] < 128]\n",
    "id_df['active_users'] = id_df['active_users'].str.replace(',', '').astype(int)\n",
    "id_df = id_df.groupby(['lang_id'], as_index=False)['active_users'].sum()\n",
    "id_df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "459e1aeb-a465-451d-aa4a-9d98e1796f40",
   "metadata": {},
   "outputs": [],
   "source": [
    "df = df.loc[df[\"lang_id\"] < 128]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "617ee6ee-f576-4ca1-ae75-f17cac078c84",
   "metadata": {},
   "outputs": [],
   "source": [
    "new_df = select_rows(df,id_df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6e8bb68b-442c-4b76-b3cd-3111e36c3413",
   "metadata": {},
   "outputs": [],
   "source": [
    "new_df[\"length\"].min()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4d944982-8fa7-484e-8c4a-bd227b93976c",
   "metadata": {},
   "outputs": [],
   "source": [
    "N = 25  # You can adjust this number as needed\n",
    "\n",
    "# Calculate the percentage of each language\n",
    "new_df[\"lang_id_str\"] = new_df[\"lang_id\"].astype(str)\n",
    "lang_percentages = new_df['lang_id_str'].value_counts(normalize=True) * 100\n",
    "\n",
    "# Separate top N languages and group the rest as 'Other'\n",
    "top_n = lang_percentages.nlargest(N)\n",
    "other = pd.Series({'Other': lang_percentages[N:].sum()})\n",
    "lang_percentages_grouped = pd.concat([top_n, other])\n",
    "\n",
    "# Sort the percentages in descending order\n",
    "lang_percentages_sorted = lang_percentages_grouped.sort_values(ascending=False)\n",
    "\n",
    "# Create a figure and axis\n",
    "fig, ax = plt.subplots(figsize=(12, 6))\n",
    "\n",
    "# Create the bar chart\n",
    "bars = ax.bar(lang_percentages_sorted.index, lang_percentages_sorted.values)\n",
    "\n",
    "# Customize the chart\n",
    "ax.set_xlabel('Language')\n",
    "ax.set_ylabel('Percentage')\n",
    "ax.set_title(f'Percentage of Rows by Language (Top {N} + Other)')\n",
    "\n",
    "# Rotate x-axis labels for better readability\n",
    "plt.xticks(rotation=45, ha='right')\n",
    "\n",
    "# Add percentage labels on top of each bar\n",
    "for bar in bars:\n",
    "    height = bar.get_height()\n",
    "    ax.text(bar.get_x() + bar.get_width()/2., height,\n",
    "            f'{height:.1f}%',\n",
    "            ha='center', va='bottom')\n",
    "\n",
    "# Adjust layout to prevent cutting off labels\n",
    "plt.tight_layout()\n",
    "\n",
    "# Show the plot\n",
    "plt.savefig(\"lang_stats.png\")\n",
    "plt.show()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2080cc9c-d225-41db-8266-0e3a60381027",
   "metadata": {},
   "outputs": [],
   "source": [
    "new_df['uid']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2b3f40fa-dcbb-4351-9a2f-f073299a2eb2",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.13.1"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
