{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import torch\n",
    "import torch.nn as nn\n",
    "import torch.nn.functional as F\n",
    "from sklearn.base import BaseEstimator, ClassifierMixin\n",
    "\n",
    "class SyntheticDataGenerator(BaseEstimator):\n",
    "    \"\"\"\n",
    "    Synthetic data generator for benchmarking probabilistic classifiers.\n",
    "\n",
    "    A randomly initialised MLP maps features to logits; the softmax of those\n",
    "    logits is treated as the ground-truth class distribution from which\n",
    "    labels are sampled. Adheres to sklearn's API for integration.\n",
    "    \"\"\"\n",
    "    def __init__(self, input_dim=10, n_classes=3, hidden_layers=(32, 32), random_seed=None):\n",
    "        \"\"\"\n",
    "        Parameters:\n",
    "        - input_dim: Number of input features\n",
    "        - n_classes: Number of output classes\n",
    "        - hidden_layers: Tuple specifying hidden layer sizes\n",
    "        - random_seed: Random seed for reproducibility. When given, it seeds\n",
    "          torch (weight initialisation) and a NumPy stream private to this\n",
    "          instance; when None, draws come from NumPy's global stream.\n",
    "        \"\"\"\n",
    "        self.input_dim = input_dim\n",
    "        self.n_classes = n_classes\n",
    "        self.hidden_layers = hidden_layers\n",
    "        self.random_seed = random_seed\n",
    "        self._initialize_model()\n",
    "\n",
    "    def _initialize_model(self):\n",
    "        \"\"\"Builds the ground-truth network and sets up the sampling stream.\"\"\"\n",
    "        if self.random_seed is not None:\n",
    "            # nn.Linear initialises its weights from torch's global generator.\n",
    "            torch.manual_seed(self.random_seed)\n",
    "            # A private RandomState yields the same numbers np.random.seed()\n",
    "            # would, but other code (or another generator) cannot disturb it.\n",
    "            self._rng = np.random.RandomState(self.random_seed)\n",
    "        else:\n",
    "            self._rng = None\n",
    "\n",
    "        layers = []\n",
    "        last_dim = self.input_dim\n",
    "        for hidden_dim in self.hidden_layers:\n",
    "            layers.append(nn.Linear(last_dim, hidden_dim))\n",
    "            layers.append(nn.ReLU())\n",
    "            last_dim = hidden_dim\n",
    "        layers.append(nn.Linear(last_dim, self.n_classes))  # Output layer\n",
    "        self.network = nn.Sequential(*layers)\n",
    "\n",
    "    def _random_source(self):\n",
    "        \"\"\"Returns this instance's RandomState, or the np.random module if unseeded.\"\"\"\n",
    "        rng = getattr(self, '_rng', None)\n",
    "        return np.random if rng is None else rng\n",
    "\n",
    "    def _sample_labels(self, P):\n",
    "        \"\"\"Draws one class index per row of the probability matrix P.\"\"\"\n",
    "        rng = self._random_source()\n",
    "        # dtype=int keeps an integer label array even when P has no rows.\n",
    "        return np.array([rng.choice(self.n_classes, p=p) for p in P], dtype=int)\n",
    "\n",
    "    def generate_data(self, n_samples=1000):\n",
    "        \"\"\"\n",
    "        Generates synthetic data and ground truth probabilities.\n",
    "\n",
    "        Parameters:\n",
    "        - n_samples: Number of samples to generate\n",
    "\n",
    "        Returns:\n",
    "        - X: Feature matrix (n_samples, input_dim), standard-normal, float32\n",
    "        - y: Class labels (n_samples,) sampled from P\n",
    "        - P: Ground truth class probabilities (n_samples, n_classes)\n",
    "        \"\"\"\n",
    "        rng = self._random_source()\n",
    "        X = rng.randn(n_samples, self.input_dim).astype(np.float32)  # Random features\n",
    "        P = self.predict_proba(X)  # Use predict_proba for probabilities\n",
    "        y = self._sample_labels(P)  # Sample labels\n",
    "        return X, y, P\n",
    "\n",
    "    def fit(self, X, y):\n",
    "        \"\"\"\n",
    "        No-op kept to comply with sklearn's API; the ground-truth network is fixed.\n",
    "\n",
    "        Returns:\n",
    "        - self, as sklearn estimators do, so calls can be chained\n",
    "        \"\"\"\n",
    "        self.classes_ = np.arange(self.n_classes)\n",
    "        return self\n",
    "\n",
    "    def predict_proba(self, X):\n",
    "        \"\"\"\n",
    "        Returns the ground truth probability distribution for the given inputs.\n",
    "\n",
    "        Parameters:\n",
    "        - X: Input features (numpy array of shape [n_samples, input_dim])\n",
    "\n",
    "        Returns:\n",
    "        - P: Ground truth class probabilities (numpy array of shape [n_samples, n_classes])\n",
    "        \"\"\"\n",
    "        X_tensor = torch.tensor(X, dtype=torch.float32)\n",
    "        with torch.no_grad():  # inference only, no autograd graph needed\n",
    "            logits = self.network(X_tensor)\n",
    "            P = F.softmax(logits, dim=1).numpy()\n",
    "        return P\n",
    "\n",
    "    def predict(self, X):\n",
    "        \"\"\"\n",
    "        Predicts class labels by sampling from the ground truth probabilities.\n",
    "\n",
    "        The result is stochastic: repeated calls on the same X can differ.\n",
    "\n",
    "        Parameters:\n",
    "        - X: Input features (numpy array of shape [n_samples, input_dim])\n",
    "\n",
    "        Returns:\n",
    "        - y: Predicted class labels (numpy array of shape [n_samples])\n",
    "        \"\"\"\n",
    "        return self._sample_labels(self.predict_proba(X))\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Build a seeded generator and draw a small batch of features\n",
    "generator = SyntheticDataGenerator(\n",
    "    input_dim=5,\n",
    "    n_classes=3,\n",
    "    hidden_layers=(16, 16),\n",
    "    random_seed=42,\n",
    ")\n",
    "X, _, _ = generator.generate_data(n_samples=10)\n",
    "\n",
    "# Ground-truth class probabilities for those features\n",
    "P = generator.predict_proba(X)\n",
    "print(\"Predicted probabilities:\", P)\n",
    "\n",
    "# Labels sampled from the ground-truth distribution\n",
    "y = generator.predict(X)\n",
    "print(\"Predicted labels:\", y)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.linear_model import LogisticRegression\n",
    "from sklearn.metrics import log_loss\n",
    "\n",
    "# Generate dataset (features, sampled labels, ground-truth probabilities)\n",
    "X, y, P_true = generator.generate_data(n_samples=1000)\n",
    "\n",
    "# Train a classifier\n",
    "clf = LogisticRegression(max_iter=1000)\n",
    "clf.fit(X, y)\n",
    "\n",
    "# Evaluate on the training data. The columns of predict_proba follow\n",
    "# clf.classes_, so those are the labels log_loss must be given; a class\n",
    "# that was never sampled into y would otherwise cause a shape mismatch.\n",
    "P_pred = clf.predict_proba(X)\n",
    "print(\"Log-loss:\", log_loss(y, P_pred, labels=clf.classes_))\n",
    "\n",
    "# Reference point: log-loss of the true distribution the labels came from\n",
    "all_classes = np.arange(generator.n_classes)\n",
    "print(\"Ground-truth log-loss:\", log_loss(y, P_true, labels=all_classes))\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "cp_rank",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.11"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
