{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "1ee1a4a1",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "\n",
    "def load_train_test(train_path=None, test_path=None):\n",
    "    \"\"\"Load train and test CSVs and print their sizes.\"\"\"\n",
    "    if train_path and test_path:\n",
    "        train_df = pd.read_csv(train_path)\n",
    "        test_df = pd.read_csv(test_path)\n",
    "        print(f\"Train size: {len(train_df)} rows\")\n",
    "        print(f\"Test size:  {len(test_df)} rows\")\n",
    "        return train_df, test_df\n",
    "    else:\n",
    "        print(\"Train/test split not provided.\")\n",
    "        return None, None\n",
    "\n",
    "def load_full_dataset(full_path):\n",
    "    \"\"\"Load full dataset for IR calculation.\"\"\"\n",
    "    return pd.read_csv(full_path)\n",
    "\n",
    "def calculate_imbalance_ratio(df, target_column):\n",
    "    \"\"\"Compute imbalance ratio: (# majority class) / (# minority class)\"\"\"\n",
    "    class_counts = df[target_column].value_counts()\n",
    "    if len(class_counts) < 2:\n",
    "        raise ValueError(\"Only one class present in the dataset.\")\n",
    "    majority = class_counts.iloc[0]\n",
    "    minority = class_counts.iloc[-1]\n",
    "    return round(majority / minority, 1)\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ca6988aa",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Train size: 8000 rows\n",
      "Test size:  2000 rows\n",
      "Imbalance Ratio (IR) for target column 'Exited': 3.9\n"
     ]
    }
   ],
   "source": [
    "# Churn Modelling: print the split sizes and the imbalance ratio (IR).\n",
    "train_path = \"/data/processed/train_churn_modelling.csv\"\n",
    "test_path = \"/data/processed/test_churn_modelling.csv\"\n",
    "full_data_path = \"/data/clean/churn_modelling.csv\"\n",
    "target_column = \"Exited\"  # label column whose class counts define the IR\n",
    "\n",
    "# Load the splits; load_train_test prints each split's row count.\n",
    "# train_df / test_df are not used again in this cell.\n",
    "train_df, test_df = load_train_test(train_path, test_path)\n",
    "\n",
    "# The IR is computed on the file at full_data_path, not on the split frames.\n",
    "full_df = load_full_dataset(full_data_path)\n",
    "ir = calculate_imbalance_ratio(full_df, target_column)\n",
    "\n",
    "print(f\"Imbalance Ratio (IR) for target column '{target_column}': {ir}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "54a2106c",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Train size: 24111 rows\n",
      "Test size:  6028 rows\n",
      "Imbalance Ratio (IR) for target column 'income': 3.0\n"
     ]
    }
   ],
   "source": [
    "# Adult: print the split sizes and the imbalance ratio (IR).\n",
    "train_path = \"/data/processed/train_adult.csv\"\n",
    "test_path = \"/data/processed/test_adult.csv\"\n",
    "full_data_path = \"/data/clean/adult.csv\"\n",
    "target_column = \"income\"  # label column whose class counts define the IR\n",
    "\n",
    "# Load the splits; load_train_test prints each split's row count.\n",
    "# train_df / test_df are not used again in this cell.\n",
    "train_df, test_df = load_train_test(train_path, test_path)\n",
    "\n",
    "# The IR is computed on the file at full_data_path, not on the split frames.\n",
    "full_df = load_full_dataset(full_data_path)\n",
    "ir = calculate_imbalance_ratio(full_df, target_column)\n",
    "\n",
    "print(f\"Imbalance Ratio (IR) for target column '{target_column}': {ir}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1f4a5db7",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Train size: 24000 rows\n",
      "Test size:  6000 rows\n",
      "Imbalance Ratio (IR) for target column 'default.payment.next.month': 3.5\n"
     ]
    }
   ],
   "source": [
    "# Default CC: print the split sizes and the imbalance ratio (IR).\n",
    "train_path = \"/data/processed/train_default_cc.csv\"\n",
    "test_path = \"/data/processed/test_default_cc.csv\"\n",
    "full_data_path = \"/data/clean/default_cc.csv\"\n",
    "target_column = \"default.payment.next.month\"  # label column whose class counts define the IR\n",
    "\n",
    "# Load the splits; load_train_test prints each split's row count.\n",
    "# train_df / test_df are not used again in this cell.\n",
    "train_df, test_df = load_train_test(train_path, test_path)\n",
    "\n",
    "# The IR is computed on the file at full_data_path, not on the split frames.\n",
    "full_df = load_full_dataset(full_data_path)\n",
    "ir = calculate_imbalance_ratio(full_df, target_column)\n",
    "\n",
    "print(f\"Imbalance Ratio (IR) for target column '{target_column}': {ir}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8ef196b4",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Train size: 40378 rows\n",
      "Test size:  10095 rows\n",
      "Imbalance Ratio (IR) for target column 'Class': 105.7\n"
     ]
    }
   ],
   "source": [
    "# Credit Card: print the split sizes and the imbalance ratio (IR).\n",
    "train_path = \"/data/processed/train_creditcard.csv\"\n",
    "test_path = \"/data/processed/test_creditcard.csv\"\n",
    "full_data_path = \"/data/clean/creditcard.csv\"\n",
    "target_column = \"Class\"  # label column whose class counts define the IR\n",
    "\n",
    "# Load the splits; load_train_test prints each split's row count.\n",
    "# train_df / test_df are not used again in this cell.\n",
    "train_df, test_df = load_train_test(train_path, test_path)\n",
    "\n",
    "# The IR is computed on the file at full_data_path, not on the split frames.\n",
    "full_df = load_full_dataset(full_data_path)\n",
    "ir = calculate_imbalance_ratio(full_df, target_column)\n",
    "\n",
    "print(f\"Imbalance Ratio (IR) for target column '{target_column}': {ir}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5a23c47f",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Train size: 8000 rows\n",
      "Test size:  2000 rows\n",
      "Imbalance Ratio (IR) for target column 'Target': 28.5\n"
     ]
    }
   ],
   "source": [
    "# Machine Predictive Maintenance: print the split sizes and the imbalance ratio (IR).\n",
    "train_path = \"/data/processed/train_machine_predictive_maintenance.csv\"\n",
    "test_path = \"/data/processed/test_machine_predictive_maintenance.csv\"\n",
    "full_data_path = \"/data/clean/machine_predictive_maintenance.csv\"\n",
    "target_column = \"Target\"  # label column whose class counts define the IR\n",
    "\n",
    "# Load the splits; load_train_test prints each split's row count.\n",
    "# train_df / test_df are not used again in this cell.\n",
    "train_df, test_df = load_train_test(train_path, test_path)\n",
    "\n",
    "# The IR is computed on the file at full_data_path, not on the split frames.\n",
    "full_df = load_full_dataset(full_data_path)\n",
    "ir = calculate_imbalance_ratio(full_df, target_column)\n",
    "\n",
    "print(f\"Imbalance Ratio (IR) for target column '{target_column}': {ir}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "bca39f97",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Train size: 12080 rows\n",
      "Test size:  3020 rows\n",
      "Imbalance Ratio (IR) for target column 'FraudFound_P': 15.9\n"
     ]
    }
   ],
   "source": [
    "# Vehicle Insurance Claim: print the split sizes and the imbalance ratio (IR).\n",
    "train_path = \"/data/processed/train_vehicle_insurance_claim.csv\"\n",
    "test_path = \"/data/processed/test_vehicle_insurance_claim.csv\"\n",
    "full_data_path = \"/data/clean/vehicle_insurance_claim.csv\"\n",
    "target_column = \"FraudFound_P\"  # label column whose class counts define the IR\n",
    "\n",
    "# Load the splits; load_train_test prints each split's row count.\n",
    "# train_df / test_df are not used again in this cell.\n",
    "train_df, test_df = load_train_test(train_path, test_path)\n",
    "\n",
    "# The IR is computed on the file at full_data_path, not on the split frames.\n",
    "full_df = load_full_dataset(full_data_path)\n",
    "ir = calculate_imbalance_ratio(full_df, target_column)\n",
    "\n",
    "print(f\"Imbalance Ratio (IR) for target column '{target_column}': {ir}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "74bb188e",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "syngen",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.11"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
