{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "af3e30ba",
   "metadata": {},
   "source": [
    "# UN Country/Region Data Cleaning\n",
    "\n",
    "This notebook loads the UN Methodology dataset and outputs a cleaned CSV with columns: country name, ISO 3166-1 alpha-3 code, and region. The region is chosen by backoff: Intermediate Region Name (if present), else Sub-region Name, else Region Name."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "9a4abe21",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "efe25cdd",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the UN Methodology file\n",
    "un_m49_cleaned = pd.read_csv(\n",
    "    'UNSD — Methodology.csv',\n",
    "    delimiter=';',\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "id": "8092d664",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Function to select region with backoff\n",
    "def select_region(row):\n",
    "    if pd.notnull(row['Intermediate Region Name']):\n",
    "        return row['Intermediate Region Name']\n",
    "    elif pd.notnull(row['Sub-region Name']):\n",
    "        return row['Sub-region Name']\n",
    "    else:\n",
    "        return row['Region Name']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e0f1492e",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>country_name</th>\n",
       "      <th>country_code</th>\n",
       "      <th>region</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>62</th>\n",
       "      <td>Aruba</td>\n",
       "      <td>ABW</td>\n",
       "      <td>Caribbean</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>141</th>\n",
       "      <td>Afghanistan</td>\n",
       "      <td>AFG</td>\n",
       "      <td>Southern Asia</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29</th>\n",
       "      <td>Angola</td>\n",
       "      <td>AGO</td>\n",
       "      <td>Middle Africa</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>60</th>\n",
       "      <td>Anguilla</td>\n",
       "      <td>AIA</td>\n",
       "      <td>Caribbean</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>178</th>\n",
       "      <td>Åland Islands</td>\n",
       "      <td>ALA</td>\n",
       "      <td>Northern Europe</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "      country_name country_code           region\n",
       "62           Aruba          ABW        Caribbean\n",
       "141    Afghanistan          AFG    Southern Asia\n",
       "29          Angola          AGO    Middle Africa\n",
       "60        Anguilla          AIA        Caribbean\n",
       "178  Åland Islands          ALA  Northern Europe"
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Clean and select relevant columns\n",
    "cleaned = un_m49_cleaned[\n",
    "    ['Country or Area', 'ISO-alpha3 Code', 'Intermediate Region Name', 'Sub-region Name', 'Region Name']\n",
    "].copy()\n",
    "cleaned['region'] = cleaned.apply(select_region, axis=1)\n",
    "cleaned = cleaned.rename(columns={'Country or Area': 'country_name', 'ISO-alpha3 Code': 'country_code'})[\n",
    "    ['country_name', 'country_code', 'region']\n",
    "]\n",
    "cleaned = cleaned.dropna(subset=['country_code'])  # Drop rows without a country code\n",
    "cleaned = cleaned.drop_duplicates(subset=['country_code'])\n",
    "cleaned = cleaned.sort_values('country_code')  # Sort by country_code\n",
    "cleaned.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "39052cc4",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Output cleaned data to CSV\n",
    "cleaned.to_csv('un_m49_cleaned_cleaned.csv', index=False)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": ".venv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.11"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
