{
  "Selected_candidate": {
    "pr_number": 11042,
    "pr_title": "[MRG + 1] Ensuring that the OneHotEncoder outputs sparse matrix with given dtype #11034",
    "pr_body": "#### Reference Issues/PRs\r\nOriginal discussion at #11034\r\n\r\n#### What does this implement/fix? Explain your changes.\r\n",
    "issue_id": 11034,
    "issue_title": "OneHotEncoder does not output scipy sparse matrix of given dtype",
    "issue_body": "#### Description\r\nOneHotEncoder ignores the specified dtype in the construction of the sparse array when mixed input data are passed, i.e with both categorical and real data type\r\n\r\n#### Steps/Code to Reproduce\r\n```python\r\nimport numpy as np\r\n\r\nfrom sklearn.preprocessing import OneHotEncoder\r\nenc = OneHotEncoder(dtype=np.float32, categorical_features=[0, 1])\r\n\r\nx = np.array([[0, 1, 0, 0], [1, 2, 0, 0]], dtype=int)\r\nsparse = enc.fit(x).transform(x)\r\n```\r\n\r\n#### Expected Results\r\n```python\r\nsparse: <2x6 sparse matrix of type '<class 'numpy.float32'>'\r\n\twith 4 stored elements in COOrdinate format>\r\n```\r\n\r\n#### Actual Results\r\n```python\r\nsparse: <2x6 sparse matrix of type '<class 'numpy.float64'>'\r\n\twith 4 stored elements in COOrdinate format>\r\n```\r\n\r\n#### Versions\r\n__Platform__: Linux-4.13.0-38-generic-x86_64-with-debian-stretch-sid\r\n__Python__: 3.6.3 |Anaconda custom (64-bit)| (default, Oct 13 2017, 12:02:49) [GCC 7.2.0]\r\n__NumPy__: NumPy \r\n__SciPy__: SciPy 1.0.1\r\n__Scikit-Learn__: Scikit-Learn 0.19.1\r\n",
    "issue_closed_at": "2018-06-06T09:03:02Z",
    "base_commit": "f049ec72eb70443ec8d7826066c4246035677c11",
    "changes": [
      {
        "file": "sklearn/preprocessing/data.py",
        "type": "function",
        "name": "add_dummy_feature",
        "class_name": null,
        "code": "def add_dummy_feature(X, value=1.0):\n    \"\"\"Augment dataset with an additional dummy feature.\n\n    This is useful for fitting an intercept term with implementations which\n    cannot otherwise fit it directly.\n\n    Parameters\n    ----------\n    X : {array-like, sparse matrix}, shape [n_samples, n_features]\n        Data.\n\n    value : float\n        Value to use for the dummy feature.\n\n    Returns\n    -------\n\n    X : {array, sparse matrix}, shape [n_samples, n_features + 1]\n        Same data with dummy feature added as first column.\n\n    Examples\n    --------\n\n    >>> from sklearn.preprocessing import add_dummy_feature\n    >>> add_dummy_feature([[0, 1], [1, 0]])\n    array([[1., 0., 1.],\n           [1., 1., 0.]])\n    \"\"\"\n    X = check_array(X, accept_sparse=['csc', 'csr', 'coo'], dtype=FLOAT_DTYPES)\n    n_samples, n_features = X.shape\n    shape = (n_samples, n_features + 1)\n    if sparse.issparse(X):\n        if sparse.isspmatrix_coo(X):\n            # Shift columns to the right.\n            col = X.col + 1\n            # Column indices of dummy feature are 0 everywhere.\n            col = np.concatenate((np.zeros(n_samples), col))\n            # Row indices of dummy feature are 0, ..., n_samples-1.\n            row = np.concatenate((np.arange(n_samples), X.row))\n            # Prepend the dummy feature n_samples times.\n            data = np.concatenate((np.ones(n_samples) * value, X.data))\n            return sparse.coo_matrix((data, (row, col)), shape)\n        elif sparse.isspmatrix_csc(X):\n            # Shift index pointers since we need to add n_samples elements.\n            indptr = X.indptr + n_samples\n            # indptr[0] must be 0.\n            indptr = np.concatenate((np.array([0]), indptr))\n            # Row indices of dummy feature are 0, ..., n_samples-1.\n            indices = np.concatenate((np.arange(n_samples), X.indices))\n            # Prepend the dummy feature n_samples times.\n            data = np.concatenate((np.ones(n_samples) * value, X.data))\n            return sparse.csc_matrix((data, indices, indptr), shape)\n        else:\n            klass = X.__class__\n            return klass(add_dummy_feature(X.tocoo(), value))\n    else:\n        return np.hstack((np.ones((n_samples, 1)) * value, X))"
      },
      {
        "file": "sklearn/preprocessing/data.py",
        "type": "function",
        "name": "_transform_selected",
        "class_name": null,
        "code": "def _transform_selected(X, transform, selected=\"all\", copy=True):\n    \"\"\"Apply a transform function to portion of selected features\n\n    Parameters\n    ----------\n    X : {array-like, sparse matrix}, shape [n_samples, n_features]\n        Dense array or sparse matrix.\n\n    transform : callable\n        A callable transform(X) -> X_transformed\n\n    copy : boolean, optional\n        Copy X even if it could be avoided.\n\n    selected: \"all\" or array of indices or mask\n        Specify which features to apply the transform to.\n\n    Returns\n    -------\n    X : array or sparse matrix, shape=(n_samples, n_features_new)\n    \"\"\"\n    X = check_array(X, accept_sparse='csc', copy=copy, dtype=FLOAT_DTYPES)\n\n    if isinstance(selected, six.string_types) and selected == \"all\":\n        return transform(X)\n\n    if len(selected) == 0:\n        return X\n\n    n_features = X.shape[1]\n    ind = np.arange(n_features)\n    sel = np.zeros(n_features, dtype=bool)\n    sel[np.asarray(selected)] = True\n    not_sel = np.logical_not(sel)\n    n_selected = np.sum(sel)\n\n    if n_selected == 0:\n        # No features selected.\n        return X\n    elif n_selected == n_features:\n        # All features selected.\n        return transform(X)\n    else:\n        X_sel = transform(X[:, ind[sel]])\n        X_not_sel = X[:, ind[not_sel]]\n\n        if sparse.issparse(X_sel) or sparse.issparse(X_not_sel):\n            return sparse.hstack((X_sel, X_not_sel))\n        else:\n            return np.hstack((X_sel, X_not_sel))"
      },
      {
        "file": "sklearn/preprocessing/data.py",
        "type": "function",
        "name": "_transform_selected",
        "class_name": null,
        "code": "def _transform_selected(X, transform, selected=\"all\", copy=True):\n    \"\"\"Apply a transform function to portion of selected features\n\n    Parameters\n    ----------\n    X : {array-like, sparse matrix}, shape [n_samples, n_features]\n        Dense array or sparse matrix.\n\n    transform : callable\n        A callable transform(X) -> X_transformed\n\n    copy : boolean, optional\n        Copy X even if it could be avoided.\n\n    selected: \"all\" or array of indices or mask\n        Specify which features to apply the transform to.\n\n    Returns\n    -------\n    X : array or sparse matrix, shape=(n_samples, n_features_new)\n    \"\"\"\n    X = check_array(X, accept_sparse='csc', copy=copy, dtype=FLOAT_DTYPES)\n\n    if isinstance(selected, six.string_types) and selected == \"all\":\n        return transform(X)\n\n    if len(selected) == 0:\n        return X\n\n    n_features = X.shape[1]\n    ind = np.arange(n_features)\n    sel = np.zeros(n_features, dtype=bool)\n    sel[np.asarray(selected)] = True\n    not_sel = np.logical_not(sel)\n    n_selected = np.sum(sel)\n\n    if n_selected == 0:\n        # No features selected.\n        return X\n    elif n_selected == n_features:\n        # All features selected.\n        return transform(X)\n    else:\n        X_sel = transform(X[:, ind[sel]])\n        X_not_sel = X[:, ind[not_sel]]\n\n        if sparse.issparse(X_sel) or sparse.issparse(X_not_sel):\n            return sparse.hstack((X_sel, X_not_sel))\n        else:\n            return np.hstack((X_sel, X_not_sel))"
      },
      {
        "file": "sklearn/preprocessing/data.py",
        "type": "function",
        "name": "fit_transform",
        "class_name": "OneHotEncoder",
        "code": "def fit_transform(self, X, y=None):\n        \"\"\"Fit OneHotEncoder to X, then transform X.\n\n        Equivalent to self.fit(X).transform(X), but more convenient and more\n        efficient. See fit for the parameters, transform for the return value.\n\n        Parameters\n        ----------\n        X : array-like, shape [n_samples, n_feature]\n            Input array of type int.\n        \"\"\"\n        return _transform_selected(X, self._fit_transform,\n                                   self.categorical_features, copy=True)"
      },
      {
        "file": "sklearn/preprocessing/data.py",
        "type": "function",
        "name": "transform",
        "class_name": "CategoricalEncoder",
        "code": "def transform(self, X):\n        \"\"\"Transform X using specified encoding scheme.\n\n        Parameters\n        ----------\n        X : array-like, shape [n_samples, n_features]\n            The data to encode.\n\n        Returns\n        -------\n        X_out : sparse matrix or a 2-d array\n            Transformed input.\n\n        \"\"\"\n        X_temp = check_array(X, dtype=None)\n        if not hasattr(X, 'dtype') and np.issubdtype(X_temp.dtype, np.str_):\n            X = check_array(X, dtype=np.object)\n        else:\n            X = X_temp\n\n        n_samples, n_features = X.shape\n        X_int = np.zeros_like(X, dtype=np.int)\n        X_mask = np.ones_like(X, dtype=np.bool)\n\n        for i in range(n_features):\n            Xi = X[:, i]\n            valid_mask = np.in1d(Xi, self.categories_[i])\n\n            if not np.all(valid_mask):\n                if self.handle_unknown == 'error':\n                    diff = np.unique(X[~valid_mask, i])\n                    msg = (\"Found unknown categories {0} in column {1}\"\n                           \" during transform\".format(diff, i))\n                    raise ValueError(msg)\n                else:\n                    # Set the problematic rows to an acceptable value and\n                    # continue `The rows are marked `X_mask` and will be\n                    # removed later.\n                    X_mask[:, i] = valid_mask\n                    Xi = Xi.copy()\n                    Xi[~valid_mask] = self.categories_[i][0]\n            X_int[:, i] = self._label_encoders_[i].transform(Xi)\n\n        if self.encoding == 'ordinal':\n            return X_int.astype(self.dtype, copy=False)\n\n        mask = X_mask.ravel()\n        n_values = [cats.shape[0] for cats in self.categories_]\n        n_values = np.array([0] + n_values)\n        feature_indices = np.cumsum(n_values)\n\n        indices = (X_int + feature_indices[:-1]).ravel()[mask]\n        indptr = X_mask.sum(axis=1).cumsum()\n        indptr = np.insert(indptr, 0, 0)\n        data = np.ones(n_samples * n_features)[mask]\n\n        out = sparse.csr_matrix((data, indices, indptr),\n                                shape=(n_samples, feature_indices[-1]),\n                                dtype=self.dtype)\n        if self.encoding == 'onehot-dense':\n            return out.toarray()\n        else:\n            return out"
      }
    ]
  },
  "Justification": "Candidate E is the most helpful because it addresses the issue of handling mixed input data types, which is conceptually similar to the CURRENT bug involving nullable pandas dtypes. The problems associated with dtype in pandas and sklearn are also relevant, as both involve the conversion and behavior of data types in functions. This candidate emphasizes the handling of dtypes and provides insights that could be instrumental when ensuring that the `unique_labels` function effectively processes the nullable dtypes without raising errors. The similarities in user expectations concerning the output dtype make this candidate particularly relevant for debugging the CURRENT bug.",
  "instance_id": "scikit-learn__scikit-learn-25638",
  "repo": "scikit-learn/scikit-learn",
  "created_at": "2023-02-17T22:17:50Z",
  "problem_statement": "Support nullable pandas dtypes in `unique_labels`\n### Describe the workflow you want to enable\n\nI would like to be able to pass the nullable pandas dtypes (\"Int64\", \"Float64\", \"boolean\") into sklearn's `unique_labels` function. Because the dtypes become `object` dtype when converted to numpy arrays we get `ValueError: Mix type of y not allowed, got types {'binary', 'unknown'}`:\r\n\r\nRepro with sklearn 1.2.1\r\n```py \r\n    import pandas as pd\r\n    import pytest\r\n    from sklearn.utils.multiclass import unique_labels\r\n    \r\n    for dtype in [\"Int64\", \"Float64\", \"boolean\"]:\r\n        y_true = pd.Series([1, 0, 0, 1, 0, 1, 1, 0, 1], dtype=dtype)\r\n        y_predicted = pd.Series([0, 0, 1, 1, 0, 1, 1, 1, 1], dtype=\"int64\")\r\n\r\n        with pytest.raises(ValueError, match=\"Mix type of y not allowed, got types\"):\r\n            unique_labels(y_true, y_predicted)\r\n```\n\n### Describe your proposed solution\n\nWe should get the same behavior as when `int64`, `float64`, and `bool` dtypes are used, which is no error:  \r\n\r\n```python\r\n    import pandas as pd\r\n    from sklearn.utils.multiclass import unique_labels\r\n    \r\n    for dtype in [\"int64\", \"float64\", \"bool\"]:\r\n        y_true = pd.Series([1, 0, 0, 1, 0, 1, 1, 0, 1], dtype=dtype)\r\n        y_predicted = pd.Series([0, 0, 1, 1, 0, 1, 1, 1, 1], dtype=\"int64\")\r\n\r\n        unique_labels(y_true, y_predicted)\r\n```\n\n### Describe alternatives you've considered, if relevant\n\nOur current workaround is to convert the data to numpy arrays with the corresponding dtype that works prior to passing it into `unique_labels`.\n\n### Additional context\n\n_No response_\n",
  "patch": "diff --git a/sklearn/utils/multiclass.py b/sklearn/utils/multiclass.py\n--- a/sklearn/utils/multiclass.py\n+++ b/sklearn/utils/multiclass.py\n@@ -155,14 +155,25 @@ def is_multilabel(y):\n     if hasattr(y, \"__array__\") or isinstance(y, Sequence) or is_array_api:\n         # DeprecationWarning will be replaced by ValueError, see NEP 34\n         # https://numpy.org/neps/nep-0034-infer-dtype-is-object.html\n+        check_y_kwargs = dict(\n+            accept_sparse=True,\n+            allow_nd=True,\n+            force_all_finite=False,\n+            ensure_2d=False,\n+            ensure_min_samples=0,\n+            ensure_min_features=0,\n+        )\n         with warnings.catch_warnings():\n             warnings.simplefilter(\"error\", np.VisibleDeprecationWarning)\n             try:\n-                y = xp.asarray(y)\n-            except (np.VisibleDeprecationWarning, ValueError):\n+                y = check_array(y, dtype=None, **check_y_kwargs)\n+            except (np.VisibleDeprecationWarning, ValueError) as e:\n+                if str(e).startswith(\"Complex data not supported\"):\n+                    raise\n+\n                 # dtype=object should be provided explicitly for ragged arrays,\n                 # see NEP 34\n-                y = xp.asarray(y, dtype=object)\n+                y = check_array(y, dtype=object, **check_y_kwargs)\n \n     if not (hasattr(y, \"shape\") and y.ndim == 2 and y.shape[1] > 1):\n         return False\n@@ -302,15 +313,27 @@ def type_of_target(y, input_name=\"\"):\n     # https://numpy.org/neps/nep-0034-infer-dtype-is-object.html\n     # We therefore catch both deprecation (NumPy < 1.24) warning and\n     # value error (NumPy >= 1.24).\n+    check_y_kwargs = dict(\n+        accept_sparse=True,\n+        allow_nd=True,\n+        force_all_finite=False,\n+        ensure_2d=False,\n+        ensure_min_samples=0,\n+        ensure_min_features=0,\n+    )\n+\n     with warnings.catch_warnings():\n         warnings.simplefilter(\"error\", np.VisibleDeprecationWarning)\n         if not issparse(y):\n             try:\n-                y = xp.asarray(y)\n-            except (np.VisibleDeprecationWarning, ValueError):\n+                y = check_array(y, dtype=None, **check_y_kwargs)\n+            except (np.VisibleDeprecationWarning, ValueError) as e:\n+                if str(e).startswith(\"Complex data not supported\"):\n+                    raise\n+\n                 # dtype=object should be provided explicitly for ragged arrays,\n                 # see NEP 34\n-                y = xp.asarray(y, dtype=object)\n+                y = check_array(y, dtype=object, **check_y_kwargs)\n \n     # The old sequence of sequences format\n     try:\n"
}