{
  "instance_id": "scikit-learn__scikit-learn-25638",
  "repo": "scikit-learn/scikit-learn",
  "created_at": "2023-02-17T22:17:50Z",
  "problem_statement": "Support nullable pandas dtypes in `unique_labels`\n### Describe the workflow you want to enable\n\nI would like to be able to pass the nullable pandas dtypes (\"Int64\", \"Float64\", \"boolean\") into sklearn's `unique_labels` function. Because the dtypes become `object` dtype when converted to numpy arrays we get `ValueError: Mix type of y not allowed, got types {'binary', 'unknown'}`:\r\n\r\nRepro with sklearn 1.2.1\r\n```py \r\n    import pandas as pd\r\n    import pytest\r\n    from sklearn.utils.multiclass import unique_labels\r\n    \r\n    for dtype in [\"Int64\", \"Float64\", \"boolean\"]:\r\n        y_true = pd.Series([1, 0, 0, 1, 0, 1, 1, 0, 1], dtype=dtype)\r\n        y_predicted = pd.Series([0, 0, 1, 1, 0, 1, 1, 1, 1], dtype=\"int64\")\r\n\r\n        with pytest.raises(ValueError, match=\"Mix type of y not allowed, got types\"):\r\n            unique_labels(y_true, y_predicted)\r\n```\n\n### Describe your proposed solution\n\nWe should get the same behavior as when `int64`, `float64`, and `bool` dtypes are used, which is no error:  \r\n\r\n```python\r\n    import pandas as pd\r\n    from sklearn.utils.multiclass import unique_labels\r\n    \r\n    for dtype in [\"int64\", \"float64\", \"bool\"]:\r\n        y_true = pd.Series([1, 0, 0, 1, 0, 1, 1, 0, 1], dtype=dtype)\r\n        y_predicted = pd.Series([0, 0, 1, 1, 0, 1, 1, 1, 1], dtype=\"int64\")\r\n\r\n        unique_labels(y_true, y_predicted)\r\n```\n\n### Describe alternatives you've considered, if relevant\n\nOur current workaround is to convert the data to numpy arrays with the corresponding dtype that works prior to passing it into `unique_labels`.\n\n### Additional context\n\n_No response_\n",
  "patch": "diff --git a/sklearn/utils/multiclass.py b/sklearn/utils/multiclass.py\n--- a/sklearn/utils/multiclass.py\n+++ b/sklearn/utils/multiclass.py\n@@ -155,14 +155,25 @@ def is_multilabel(y):\n     if hasattr(y, \"__array__\") or isinstance(y, Sequence) or is_array_api:\n         # DeprecationWarning will be replaced by ValueError, see NEP 34\n         # https://numpy.org/neps/nep-0034-infer-dtype-is-object.html\n+        check_y_kwargs = dict(\n+            accept_sparse=True,\n+            allow_nd=True,\n+            force_all_finite=False,\n+            ensure_2d=False,\n+            ensure_min_samples=0,\n+            ensure_min_features=0,\n+        )\n         with warnings.catch_warnings():\n             warnings.simplefilter(\"error\", np.VisibleDeprecationWarning)\n             try:\n-                y = xp.asarray(y)\n-            except (np.VisibleDeprecationWarning, ValueError):\n+                y = check_array(y, dtype=None, **check_y_kwargs)\n+            except (np.VisibleDeprecationWarning, ValueError) as e:\n+                if str(e).startswith(\"Complex data not supported\"):\n+                    raise\n+\n                 # dtype=object should be provided explicitly for ragged arrays,\n                 # see NEP 34\n-                y = xp.asarray(y, dtype=object)\n+                y = check_array(y, dtype=object, **check_y_kwargs)\n \n     if not (hasattr(y, \"shape\") and y.ndim == 2 and y.shape[1] > 1):\n         return False\n@@ -302,15 +313,27 @@ def type_of_target(y, input_name=\"\"):\n     # https://numpy.org/neps/nep-0034-infer-dtype-is-object.html\n     # We therefore catch both deprecation (NumPy < 1.24) warning and\n     # value error (NumPy >= 1.24).\n+    check_y_kwargs = dict(\n+        accept_sparse=True,\n+        allow_nd=True,\n+        force_all_finite=False,\n+        ensure_2d=False,\n+        ensure_min_samples=0,\n+        ensure_min_features=0,\n+    )\n+\n     with warnings.catch_warnings():\n         warnings.simplefilter(\"error\", np.VisibleDeprecationWarning)\n         if not issparse(y):\n             try:\n-                y = xp.asarray(y)\n-            except (np.VisibleDeprecationWarning, ValueError):\n+                y = check_array(y, dtype=None, **check_y_kwargs)\n+            except (np.VisibleDeprecationWarning, ValueError) as e:\n+                if str(e).startswith(\"Complex data not supported\"):\n+                    raise\n+\n                 # dtype=object should be provided explicitly for ragged arrays,\n                 # see NEP 34\n-                y = xp.asarray(y, dtype=object)\n+                y = check_array(y, dtype=object, **check_y_kwargs)\n \n     # The old sequence of sequences format\n     try:\n",
  "similar_bug_items": [
    {
      "pr_number": 19879,
      "pr_title": "FIX Error for sparse matrix in OrdinalEncoder.inverse_transform",
      "pr_body": "closes #19878\r\n\r\n`OrdinalEncoder.inverse_transform` should not support sparse matrix. It was already failing but with an obscure message.\r\nThis PR adds a non-regression test to check for the error message.",
      "issue_id": 19878,
      "issue_title": "OrdinalEncoder accept and failed with sparse matrix in inverse_transform",
      "issue_body": "`OrdinalEncoder` was documented to accept sparse matrix in `inverse_transform`.\r\nA check is internally done to accept sparse matrix. However, the `inverse_transform` will fail with this type of data.\r\nWe should remove the support in the check and make sure that we issue the right error.\r\n",
      "issue_closed_at": "2021-04-13T11:59:30Z",
      "base_commit": "767fd63c9ddddc46e288fdec2cca36a129529a8e",
      "changes": [
        {
          "file": "sklearn/preprocessing/_encoders.py",
          "type": "function",
          "name": "inverse_transform",
          "class_name": "OrdinalEncoder",
          "code": "def inverse_transform(self, X):\n        \"\"\"\n        Convert the data back to the original representation.\n\n        Parameters\n        ----------\n        X : {array-like, sparse matrix} of shape (n_samples, n_features)\n            The transformed data.\n\n        Returns\n        -------\n        X_tr : ndarray of shape (n_samples, n_features)\n            Inverse transformed array.\n        \"\"\"\n        check_is_fitted(self)\n        X = check_array(X, accept_sparse='csr', force_all_finite='allow-nan')\n\n        n_samples, _ = X.shape\n        n_features = len(self.categories_)\n\n        # validate shape of passed X\n        msg = (\"Shape of the passed X data is not correct. Expected {0} \"\n               \"columns, got {1}.\")\n        if X.shape[1] != n_features:\n            raise ValueError(msg.format(n_features, X.shape[1]))\n\n        # create resulting array of appropriate dtype\n        dt = np.find_common_type([cat.dtype for cat in self.categories_], [])\n        X_tr = np.empty((n_samples, n_features), dtype=dt)\n\n        found_unknown = {}\n\n        for i in range(n_features):\n            labels = X[:, i].astype('int64', copy=False)\n\n            # replace values of X[:, i] that were nan with actual indices\n            if i in self._missing_indices:\n                X_i_mask = _get_mask(X[:, i], np.nan)\n                labels[X_i_mask] = self._missing_indices[i]\n\n            if self.handle_unknown == 'use_encoded_value':\n                unknown_labels = labels == self.unknown_value\n                X_tr[:, i] = self.categories_[i][np.where(\n                    unknown_labels, 0, labels)]\n                found_unknown[i] = unknown_labels\n            else:\n                X_tr[:, i] = self.categories_[i][labels]\n\n        # insert None values for unknown values\n        if found_unknown:\n            X_tr = X_tr.astype(object, copy=False)\n\n            for idx, mask in found_unknown.items():\n                X_tr[mask, idx] = None\n\n        return X_tr"
        },
        {
          "file": "sklearn/preprocessing/_encoders.py",
          "type": "function",
          "name": "inverse_transform",
          "class_name": "OrdinalEncoder",
          "code": "def inverse_transform(self, X):\n        \"\"\"\n        Convert the data back to the original representation.\n\n        Parameters\n        ----------\n        X : {array-like, sparse matrix} of shape (n_samples, n_features)\n            The transformed data.\n\n        Returns\n        -------\n        X_tr : ndarray of shape (n_samples, n_features)\n            Inverse transformed array.\n        \"\"\"\n        check_is_fitted(self)\n        X = check_array(X, accept_sparse='csr', force_all_finite='allow-nan')\n\n        n_samples, _ = X.shape\n        n_features = len(self.categories_)\n\n        # validate shape of passed X\n        msg = (\"Shape of the passed X data is not correct. Expected {0} \"\n               \"columns, got {1}.\")\n        if X.shape[1] != n_features:\n            raise ValueError(msg.format(n_features, X.shape[1]))\n\n        # create resulting array of appropriate dtype\n        dt = np.find_common_type([cat.dtype for cat in self.categories_], [])\n        X_tr = np.empty((n_samples, n_features), dtype=dt)\n\n        found_unknown = {}\n\n        for i in range(n_features):\n            labels = X[:, i].astype('int64', copy=False)\n\n            # replace values of X[:, i] that were nan with actual indices\n            if i in self._missing_indices:\n                X_i_mask = _get_mask(X[:, i], np.nan)\n                labels[X_i_mask] = self._missing_indices[i]\n\n            if self.handle_unknown == 'use_encoded_value':\n                unknown_labels = labels == self.unknown_value\n                X_tr[:, i] = self.categories_[i][np.where(\n                    unknown_labels, 0, labels)]\n                found_unknown[i] = unknown_labels\n            else:\n                X_tr[:, i] = self.categories_[i][labels]\n\n        # insert None values for unknown values\n        if found_unknown:\n            X_tr = X_tr.astype(object, copy=False)\n\n            for idx, mask in found_unknown.items():\n                X_tr[mask, idx] = None\n\n        return X_tr"
        }
      ]
    },
    {
      "pr_number": 14878,
      "pr_title": "API Improves error msg when passing non-arrays to SimpleImputer",
      "pr_body": "Fixes #14877.\r\nAn alternative would be calling ``asarray`` but that would be less informative imho.\r\n\r\nStill needs a regression test. Probably one for list and one for dataframe?",
      "issue_id": 14877,
      "issue_title": "DataFrames not properly validated in SimpleImputer",
      "issue_body": "```python\r\nimport pandas as pd\r\nfrom sklearn.impute import SimpleImputer\r\n\r\nSimpleImputer().fit(pd.DataFrame({'a': ['b', 'c']}))\r\n```\r\nis not validated correctly:\r\n\r\n```pythontb\r\n---------------------------------------------------------------------------\r\nValueError                                Traceback (most recent call last)\r\n~/checkout/scikit-learn/sklearn/impute/_base.py in _validate_input(self, X)\r\n    198             X = check_array(X, accept_sparse='csc', dtype=dtype,\r\n--> 199                             force_all_finite=force_all_finite, copy=self.copy)\r\n    200         except ValueError as ve:\r\n\r\n~/checkout/scikit-learn/sklearn/utils/validation.py in check_array(array, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, warn_on_dtype, estimator)\r\n    496                 warnings.simplefilter('error', ComplexWarning)\r\n--> 497                 array = np.asarray(array, dtype=dtype, order=order)\r\n    498             except ComplexWarning:\r\n\r\n~/miniconda3/lib/python3.7/site-packages/numpy/core/numeric.py in asarray(a, dtype, order)\r\n    537     \"\"\"\r\n--> 538     return array(a, dtype, copy=False, order=order)\r\n    539 \r\n\r\nValueError: could not convert string to float: 'b'\r\n\r\nDuring handling of the above exception, another exception occurred:\r\n\r\nAttributeError                            Traceback (most recent call last)\r\n<ipython-input-6-f08c4f6715ce> in <module>\r\n----> 1 SimpleImputer().fit(pd.DataFrame({'a': ['b', 'c']}))\r\n\r\n~/checkout/scikit-learn/sklearn/impute/_base.py in fit(self, X, y)\r\n    230         self : SimpleImputer\r\n    231         \"\"\"\r\n--> 232         X = self._validate_input(X)\r\n    233 \r\n    234         # default fill_value is 0 for numerical input and \"missing_value\"\r\n\r\n~/checkout/scikit-learn/sklearn/impute/_base.py in _validate_input(self, X)\r\n    202                 raise ValueError(\"Cannot use {0} strategy with non-numeric \"\r\n    203                                  \"data. Received datatype :{1}.\"\r\n--> 204                                  \"\".format(self.strategy, X.dtype.kind))\r\n    205             else:\r\n    206                 raise ve\r\n\r\n~/miniconda3/lib/python3.7/site-packages/pandas/core/generic.py in __getattr__(self, name)\r\n   5065             if self._info_axis._can_hold_identifiers_and_holds_name(name):\r\n   5066                 return self[name]\r\n-> 5067             return object.__getattribute__(self, name)\r\n   5068 \r\n   5069     def __setattr__(self, name, value):\r\n\r\nAttributeError: 'DataFrame' object has no attribute 'dtype'\r\n```\r\n",
      "issue_closed_at": "2019-10-18T15:28:54Z",
      "base_commit": "b02217d8a5651760353e310701e749c1eaece6df",
      "changes": [
        {
          "file": "sklearn/impute/_base.py",
          "type": "function",
          "name": "_validate_input",
          "class_name": "MissingIndicator",
          "code": "def _validate_input(self, X):\n        if not is_scalar_nan(self.missing_values):\n            force_all_finite = True\n        else:\n            force_all_finite = \"allow-nan\"\n        X = check_array(X, accept_sparse=('csc', 'csr'), dtype=None,\n                        force_all_finite=force_all_finite)\n        _check_inputs_dtype(X, self.missing_values)\n        if X.dtype.kind not in (\"i\", \"u\", \"f\", \"O\"):\n            raise ValueError(\"MissingIndicator does not support data with \"\n                             \"dtype {0}. Please provide either a numeric array\"\n                             \" (with a floating point or integer dtype) or \"\n                             \"categorical data represented either as an array \"\n                             \"with integer dtype or an array of string values \"\n                             \"with an object dtype.\".format(X.dtype))\n\n        if sparse.issparse(X) and self.missing_values == 0:\n            # missing_values = 0 not allowed with sparse data as it would\n            # force densification\n            raise ValueError(\"Sparse input with missing_values=0 is \"\n                             \"not supported. Provide a dense \"\n                             \"array instead.\")\n\n        return X"
        }
      ]
    },
    {
      "pr_number": 12279,
      "pr_title": "[MRG+1] Add check_is_fitted to non standard functions",
      "pr_body": "<!--\r\nThanks for contributing a pull request! Please ensure you have taken a look at\r\nthe contribution guidelines: https://github.com/scikit-learn/scikit-learn/blob/master/CONTRIBUTING.md#pull-request-checklist\r\n-->\r\n\r\n#### Reference Issues/PRs\r\n<!--\r\nExample: Fixes #1234. See also #3456.\r\nPlease use keywords (e.g., Fixes) to create link to the issues or pull requests\r\nyou resolved, so that they will automatically be closed when your pull request\r\nis merged. See https://github.com/blog/1506-closing-issues-via-pull-requests\r\n-->\r\nFixes #12276 \r\n\r\n\r\n#### What does this implement/fix? Explain your changes.\r\n\r\nAdding `check_is_fitted` method to other non standard functions\r\n\r\n#### Any other comments?\r\n\r\n\r\n<!--\r\nPlease be aware that we are a loose team of volunteers so patience is\r\nnecessary; assistance handling other issues is very welcome. We value\r\nall user contributions, no matter how minor they are. If we are slow to\r\nreview, either the pull request needs some benchmarking, tinkering,\r\nconvincing, etc. or more likely the reviewers are simply busy. In either\r\ncase, we ask for your understanding during the review process.\r\nFor more information, see our FAQ on this topic:\r\nhttp://scikit-learn.org/dev/faq.html#why-is-my-pull-request-not-getting-any-attention.\r\n\r\nThanks for contributing!\r\n-->\r\n",
      "issue_id": 12276,
      "issue_title": "Calling NearestNeighbors.{kneighbors,radius_neighbors}_graph without first fitting should raise NotFittedError",
      "issue_body": "<!--\r\nIf your issue is a usage question, submit it here instead:\r\n- StackOverflow with the scikit-learn tag: http://stackoverflow.com/questions/tagged/scikit-learn\r\n- Mailing List: https://mail.python.org/mailman/listinfo/scikit-learn\r\nFor more information, see User Questions: http://scikit-learn.org/stable/support.html#user-questions\r\n-->\r\n\r\n<!-- Instructions For Filing a Bug: https://github.com/scikit-learn/scikit-learn/blob/master/CONTRIBUTING.md#filing-bugs -->\r\n\r\n#### Description\r\n\r\nRunning prediction methods without running `fit` should raise `NotFittedError`. This is not the case for some non-standard methods.\r\n\r\n`check_is_fitted` should be applied for this purpose.\r\n\r\n#### Steps/Code to Reproduce\r\n\r\n```py\r\nfrom sklearn.neighbors import NearestNeighbors\r\nNearestNeighbors().kneighbors_graph([[1]])\r\n```\r\nor\r\n```py\r\nfrom sklearn.neighbors import NearestNeighbors\r\nNearestNeighbors().radius_neighbors_graph([[1]])\r\n```\r\n\r\n\r\n#### Expected Results\r\n\r\nNotFittedError raised\r\n\r\n#### Actual Results\r\nAttributeError raised\r\n\r\n#### Versions\r\n\r\n```\r\nSystem\r\n------\r\n    python: 3.5.2 |Continuum Analytics, Inc.| (default, Jul  2 2016, 17:52:12)  [GCC 4.2.1 Compatible Apple LLVM 4.2 (clang-425.0.28)]\r\nexecutable: /Users/joel/anaconda3/envs/scipy3k/bin/python\r\n   machine: Darwin-17.7.0-x86_64-i386-64bit\r\n\r\nBLAS\r\n----\r\n  lib_dirs: /Users/joel/anaconda3/envs/scipy3k/lib\r\n    macros: SCIPY_MKL_H=None, HAVE_CBLAS=None\r\ncblas_libs: mkl_rt, pthread\r\n\r\nPython deps\r\n-----------\r\nsetuptools: 37.0.0\r\n     scipy: 1.0.0\r\n       pip: 18.0\r\n     numpy: 1.14.1\r\n   sklearn: 0.21.dev0\r\n    pandas: 0.23.4\r\n    Cython: 0.28.5\r\n```",
      "issue_closed_at": "2018-10-19T13:46:09Z",
      "base_commit": "74b56dbc57d9295df8fb653adccb265da356b670",
      "changes": [
        {
          "file": "sklearn/neighbors/base.py",
          "type": "function",
          "name": "kneighbors_graph",
          "class_name": "KNeighborsMixin",
          "code": "def kneighbors_graph(self, X=None, n_neighbors=None,\n                         mode='connectivity'):\n        \"\"\"Computes the (weighted) graph of k-Neighbors for points in X\n\n        Parameters\n        ----------\n        X : array-like, shape (n_query, n_features), \\\n                or (n_query, n_indexed) if metric == 'precomputed'\n            The query point or points.\n            If not provided, neighbors of each indexed point are returned.\n            In this case, the query point is not considered its own neighbor.\n\n        n_neighbors : int\n            Number of neighbors for each sample.\n            (default is value passed to the constructor).\n\n        mode : {'connectivity', 'distance'}, optional\n            Type of returned matrix: 'connectivity' will return the\n            connectivity matrix with ones and zeros, in 'distance' the\n            edges are Euclidean distance between points.\n\n        Returns\n        -------\n        A : sparse matrix in CSR format, shape = [n_samples, n_samples_fit]\n            n_samples_fit is the number of samples in the fitted data\n            A[i, j] is assigned the weight of edge that connects i to j.\n\n        Examples\n        --------\n        >>> X = [[0], [3], [1]]\n        >>> from sklearn.neighbors import NearestNeighbors\n        >>> neigh = NearestNeighbors(n_neighbors=2)\n        >>> neigh.fit(X) # doctest: +ELLIPSIS\n        NearestNeighbors(algorithm='auto', leaf_size=30, ...)\n        >>> A = neigh.kneighbors_graph(X)\n        >>> A.toarray()\n        array([[1., 0., 1.],\n               [0., 1., 1.],\n               [1., 0., 1.]])\n\n        See also\n        --------\n        NearestNeighbors.radius_neighbors_graph\n        \"\"\"\n        if n_neighbors is None:\n            n_neighbors = self.n_neighbors\n\n        # kneighbors does the None handling.\n        if X is not None:\n            X = check_array(X, accept_sparse='csr')\n            n_samples1 = X.shape[0]\n        else:\n            n_samples1 = self._fit_X.shape[0]\n\n        n_samples2 = self._fit_X.shape[0]\n        n_nonzero = n_samples1 * n_neighbors\n        A_indptr = np.arange(0, n_nonzero + 1, n_neighbors)\n\n        # construct CSR matrix representation of the k-NN graph\n        if mode == 'connectivity':\n            A_data = np.ones(n_samples1 * n_neighbors)\n            A_ind = self.kneighbors(X, n_neighbors, return_distance=False)\n\n        elif mode == 'distance':\n            A_data, A_ind = self.kneighbors(\n                X, n_neighbors, return_distance=True)\n            A_data = np.ravel(A_data)\n\n        else:\n            raise ValueError(\n                'Unsupported mode, must be one of \"connectivity\" '\n                'or \"distance\" but got \"%s\" instead' % mode)\n\n        kneighbors_graph = csr_matrix((A_data, A_ind.ravel(), A_indptr),\n                                      shape=(n_samples1, n_samples2))\n\n        return kneighbors_graph"
        },
        {
          "file": "sklearn/neighbors/base.py",
          "type": "function",
          "name": "radius_neighbors_graph",
          "class_name": "RadiusNeighborsMixin",
          "code": "def radius_neighbors_graph(self, X=None, radius=None, mode='connectivity'):\n        \"\"\"Computes the (weighted) graph of Neighbors for points in X\n\n        Neighborhoods are restricted the points at a distance lower than\n        radius.\n\n        Parameters\n        ----------\n        X : array-like, shape = [n_samples, n_features], optional\n            The query point or points.\n            If not provided, neighbors of each indexed point are returned.\n            In this case, the query point is not considered its own neighbor.\n\n        radius : float\n            Radius of neighborhoods.\n            (default is the value passed to the constructor).\n\n        mode : {'connectivity', 'distance'}, optional\n            Type of returned matrix: 'connectivity' will return the\n            connectivity matrix with ones and zeros, in 'distance' the\n            edges are Euclidean distance between points.\n\n        Returns\n        -------\n        A : sparse matrix in CSR format, shape = [n_samples, n_samples]\n            A[i, j] is assigned the weight of edge that connects i to j.\n\n        Examples\n        --------\n        >>> X = [[0], [3], [1]]\n        >>> from sklearn.neighbors import NearestNeighbors\n        >>> neigh = NearestNeighbors(radius=1.5)\n        >>> neigh.fit(X) # doctest: +ELLIPSIS\n        NearestNeighbors(algorithm='auto', leaf_size=30, ...)\n        >>> A = neigh.radius_neighbors_graph(X)\n        >>> A.toarray()\n        array([[1., 0., 1.],\n               [0., 1., 0.],\n               [1., 0., 1.]])\n\n        See also\n        --------\n        kneighbors_graph\n        \"\"\"\n        if X is not None:\n            X = check_array(X, accept_sparse=['csr', 'csc', 'coo'])\n\n        n_samples2 = self._fit_X.shape[0]\n        if radius is None:\n            radius = self.radius\n\n        # construct CSR matrix representation of the NN graph\n        if mode == 'connectivity':\n            A_ind = self.radius_neighbors(X, radius,\n                                          return_distance=False)\n            A_data = None\n        elif mode == 'distance':\n            dist, A_ind = self.radius_neighbors(X, radius,\n                                                return_distance=True)\n            A_data = np.concatenate(list(dist))\n        else:\n            raise ValueError(\n                'Unsupported mode, must be one of \"connectivity\", '\n                'or \"distance\" but got %s instead' % mode)\n\n        n_samples1 = A_ind.shape[0]\n        n_neighbors = np.array([len(a) for a in A_ind])\n        A_ind = np.concatenate(list(A_ind))\n        if A_data is None:\n            A_data = np.ones(len(A_ind))\n        A_indptr = np.concatenate((np.zeros(1, dtype=int),\n                                   np.cumsum(n_neighbors)))\n\n        return csr_matrix((A_data, A_ind, A_indptr),\n                          shape=(n_samples1, n_samples2))"
        }
      ]
    },
    {
      "pr_number": 11914,
      "pr_title": "[MRG] ENH Better error message for metrics of neighbors",
      "pr_body": "<!--\r\nThanks for contributing a pull request! Please ensure you have taken a look at\r\nthe contribution guidelines: https://github.com/scikit-learn/scikit-learn/blob/master/CONTRIBUTING.md#pull-request-checklist\r\n-->\r\n\r\n#### Reference Issues/PRs\r\n<!--\r\nExample: Fixes #1234. See also #3456.\r\nPlease use keywords (e.g., Fixes) to create link to the issues or pull requests\r\nyou resolved, so that they will automatically be closed when your pull request\r\nis merged. See https://github.com/blog/1506-closing-issues-via-pull-requests\r\n-->\r\n#Fixes #11906 \r\n\r\n#### What does this implement/fix? Explain your changes.\r\n\r\nAdded expression to error message to get list of valid metrics.\r\n\r\n<!--\r\n#### Any other comments?\r\n\r\nPlease be aware that we are a loose team of volunteers so patience is\r\nnecessary; assistance handling other issues is very welcome. We value\r\nall user contributions, no matter how minor they are. If we are slow to\r\nreview, either the pull request needs some benchmarking, tinkering,\r\nconvincing, etc. or more likely the reviewers are simply busy. In either\r\ncase, we ask for your understanding during the review process.\r\nFor more information, see our FAQ on this topic:\r\nhttp://scikit-learn.org/dev/faq.html#why-is-my-pull-request-not-getting-any-attention.\r\n\r\nThanks for contributing!\r\n-->\r\n",
      "issue_id": 11906,
      "issue_title": "Better error message for invalid metric in NearestNeighbors ",
      "issue_body": "<!--\r\nIf your issue is a usage question, submit it here instead:\r\n- StackOverflow with the scikit-learn tag: http://stackoverflow.com/questions/tagged/scikit-learn\r\n- Mailing List: https://mail.python.org/mailman/listinfo/scikit-learn\r\nFor more information, see User Questions: http://scikit-learn.org/stable/support.html#user-questions\r\n-->\r\n\r\n<!-- Instructions For Filing a Bug: https://github.com/scikit-learn/scikit-learn/blob/master/CONTRIBUTING.md#filing-bugs -->\r\n\r\n#### Description\r\n<!-- Example: Joblib Error thrown when calling fit on LatentDirichletAllocation with evaluate_every > 0-->\r\nError message for invalid metric in NearestNeighbors is unclear.\r\n\r\n#### Steps/Code to Reproduce\r\n<!--\r\nExample:\r\n```python\r\nfrom sklearn.feature_extraction.text import CountVectorizer\r\nfrom sklearn.decomposition import LatentDirichletAllocation\r\n\r\ndocs = [\"Help I have a bug\" for i in range(1000)]\r\n\r\nvectorizer = CountVectorizer(input=docs, analyzer='word')\r\nlda_features = vectorizer.fit_transform(docs)\r\n\r\nlda_model = LatentDirichletAllocation(\r\n    n_topics=10,\r\n    learning_method='online',\r\n    evaluate_every=10,\r\n    n_jobs=4,\r\n)\r\nmodel = lda_model.fit(lda_features)\r\n```\r\nIf the code is too long, feel free to put it in a public gist and link\r\nit in the issue: https://gist.github.com\r\n-->\r\n```python\r\nNearestNeighbors(metric='cheybshev')\r\n```\r\n\r\n#### Expected Results\r\n<!-- Example: No error is thrown. Please paste or describe the expected results.-->\r\nError message stating that metric should be 'cityblock', ... or callable rather than metric not valid for algorithm 'auto'. When I initially saw the error message, I did not realize I had a typo in the metric string. I thought it has something to do with the algorithm.\r\n\r\n#### Actual Results\r\n<!-- Please paste or specifically describe the actual output or traceback. -->\r\n```\r\nValueError: Metric 'cheybshev' not valid for algorithm 'auto'\r\n```\r\n\r\n#### Versions\r\n<!--\r\nPlease run the following snippet and paste the output below.\r\nimport platform; print(platform.platform())\r\nimport sys; print(\"Python\", sys.version)\r\nimport numpy; print(\"NumPy\", numpy.__version__)\r\nimport scipy; print(\"SciPy\", scipy.__version__)\r\nimport sklearn; print(\"Scikit-Learn\", sklearn.__version__)\r\n-->\r\nLinux-4.15.0-24-generic-x86_64-with-debian-stretch-sid\r\nPython 3.6.3 |Anaconda custom (64-bit)| (default, Nov  9 2017, 00:19:18) \r\n[GCC 7.2.0]\r\nNumPy 1.13.3\r\nSciPy 0.19.1\r\nScikit-Learn 0.19.1\r\n\r\n<!-- Thanks for contributing! -->\r\n",
      "issue_closed_at": "2018-09-13T15:34:02Z",
      "base_commit": "7ed61a24feb4ffde0bee9342acf4a58e3f946a61",
      "changes": [
        {
          "file": "sklearn/neighbors/__init__.py",
          "type": "line",
          "name": "line 14",
          "code": "from .kde import KernelDensity\nfrom .approximate import LSHForest\nfrom .lof import LocalOutlierFactor\n\n__all__ = ['BallTree',\n           'DistanceMetric',"
        },
        {
          "file": "sklearn/neighbors/__init__.py",
          "type": "line",
          "name": "line 28",
          "code": "           'radius_neighbors_graph',\n           'KernelDensity',\n           'LSHForest',\n           'LocalOutlierFactor']"
        },
        {
          "file": "sklearn/neighbors/base.py",
          "type": "function",
          "name": "_check_algorithm_metric",
          "class_name": "NeighborsBase",
          "code": "def _check_algorithm_metric(self):\n        if self.algorithm not in ['auto', 'brute',\n                                  'kd_tree', 'ball_tree']:\n            raise ValueError(\"unrecognized algorithm: '%s'\" % self.algorithm)\n\n        if self.algorithm == 'auto':\n            if self.metric == 'precomputed':\n                alg_check = 'brute'\n            elif (callable(self.metric) or\n                  self.metric in VALID_METRICS['ball_tree']):\n                alg_check = 'ball_tree'\n            else:\n                alg_check = 'brute'\n        else:\n            alg_check = self.algorithm\n\n        if callable(self.metric):\n            if self.algorithm == 'kd_tree':\n                # callable metric is only valid for brute force and ball_tree\n                raise ValueError(\n                    \"kd_tree algorithm does not support callable metric '%s'\"\n                    % self.metric)\n        elif self.metric not in VALID_METRICS[alg_check]:\n            raise ValueError(\"Metric '%s' not valid for algorithm '%s'\"\n                             % (self.metric, self.algorithm))\n\n        if self.metric_params is not None and 'p' in self.metric_params:\n            warnings.warn(\"Parameter p is found in metric_params. \"\n                          \"The corresponding parameter from __init__ \"\n                          \"is ignored.\", SyntaxWarning, stacklevel=3)\n            effective_p = self.metric_params['p']\n        else:\n            effective_p = self.p\n\n        if self.metric in ['wminkowski', 'minkowski'] and effective_p < 1:\n            raise ValueError(\"p must be greater than one for minkowski metric\")"
        }
      ]
    },
    {
      "pr_number": 11042,
      "pr_title": "[MRG + 1] Ensuring that the OneHotEncoder outputs sparse matrix with given dtype #11034",
      "pr_body": "#### Reference Issues/PRs\r\nOriginal discussion at #11034\r\n\r\n#### What does this implement/fix? Explain your changes.\r\n",
      "issue_id": 11034,
      "issue_title": "OneHotEncoder does not output scipy sparse matrix of given dtype",
      "issue_body": "#### Description\r\nOneHotEncoder ignores the specified dtype in the construction of the sparse array when mixed input data are passed, i.e with both categorical and real data type\r\n\r\n#### Steps/Code to Reproduce\r\n```python\r\nimport numpy as np\r\n\r\nfrom sklearn.preprocessing import OneHotEncoder\r\nenc = OneHotEncoder(dtype=np.float32, categorical_features=[0, 1])\r\n\r\nx = np.array([[0, 1, 0, 0], [1, 2, 0, 0]], dtype=int)\r\nsparse = enc.fit(x).transform(x)\r\n```\r\n\r\n#### Expected Results\r\n```python\r\nsparse: <2x6 sparse matrix of type '<class 'numpy.float32'>'\r\n\twith 4 stored elements in COOrdinate format>\r\n```\r\n\r\n#### Actual Results\r\n```python\r\nsparse: <2x6 sparse matrix of type '<class 'numpy.float64'>'\r\n\twith 4 stored elements in COOrdinate format>\r\n```\r\n\r\n#### Versions\r\n__Platform__: Linux-4.13.0-38-generic-x86_64-with-debian-stretch-sid\r\n__Python__: 3.6.3 |Anaconda custom (64-bit)| (default, Oct 13 2017, 12:02:49) [GCC 7.2.0]\r\n__NumPy__: NumPy \r\n__SciPy__: SciPy 1.0.1\r\n__Scikit-Learn__: Scikit-Learn 0.19.1\r\n",
      "issue_closed_at": "2018-06-06T09:03:02Z",
      "base_commit": "f049ec72eb70443ec8d7826066c4246035677c11",
      "changes": [
        {
          "file": "sklearn/preprocessing/data.py",
          "type": "function",
          "name": "add_dummy_feature",
          "class_name": null,
          "code": "def add_dummy_feature(X, value=1.0):\n    \"\"\"Augment dataset with an additional dummy feature.\n\n    This is useful for fitting an intercept term with implementations which\n    cannot otherwise fit it directly.\n\n    Parameters\n    ----------\n    X : {array-like, sparse matrix}, shape [n_samples, n_features]\n        Data.\n\n    value : float\n        Value to use for the dummy feature.\n\n    Returns\n    -------\n\n    X : {array, sparse matrix}, shape [n_samples, n_features + 1]\n        Same data with dummy feature added as first column.\n\n    Examples\n    --------\n\n    >>> from sklearn.preprocessing import add_dummy_feature\n    >>> add_dummy_feature([[0, 1], [1, 0]])\n    array([[1., 0., 1.],\n           [1., 1., 0.]])\n    \"\"\"\n    X = check_array(X, accept_sparse=['csc', 'csr', 'coo'], dtype=FLOAT_DTYPES)\n    n_samples, n_features = X.shape\n    shape = (n_samples, n_features + 1)\n    if sparse.issparse(X):\n        if sparse.isspmatrix_coo(X):\n            # Shift columns to the right.\n            col = X.col + 1\n            # Column indices of dummy feature are 0 everywhere.\n            col = np.concatenate((np.zeros(n_samples), col))\n            # Row indices of dummy feature are 0, ..., n_samples-1.\n            row = np.concatenate((np.arange(n_samples), X.row))\n            # Prepend the dummy feature n_samples times.\n            data = np.concatenate((np.ones(n_samples) * value, X.data))\n            return sparse.coo_matrix((data, (row, col)), shape)\n        elif sparse.isspmatrix_csc(X):\n            # Shift index pointers since we need to add n_samples elements.\n            indptr = X.indptr + n_samples\n            # indptr[0] must be 0.\n            indptr = np.concatenate((np.array([0]), indptr))\n            # Row indices of dummy feature are 0, ..., n_samples-1.\n            indices = np.concatenate((np.arange(n_samples), X.indices))\n            # Prepend the dummy feature n_samples times.\n            data = np.concatenate((np.ones(n_samples) * value, X.data))\n            return sparse.csc_matrix((data, indices, indptr), shape)\n        else:\n            klass = X.__class__\n            return klass(add_dummy_feature(X.tocoo(), value))\n    else:\n        return np.hstack((np.ones((n_samples, 1)) * value, X))"
        },
        {
          "file": "sklearn/preprocessing/data.py",
          "type": "function",
          "name": "_transform_selected",
          "class_name": null,
          "code": "def _transform_selected(X, transform, selected=\"all\", copy=True):\n    \"\"\"Apply a transform function to portion of selected features\n\n    Parameters\n    ----------\n    X : {array-like, sparse matrix}, shape [n_samples, n_features]\n        Dense array or sparse matrix.\n\n    transform : callable\n        A callable transform(X) -> X_transformed\n\n    copy : boolean, optional\n        Copy X even if it could be avoided.\n\n    selected: \"all\" or array of indices or mask\n        Specify which features to apply the transform to.\n\n    Returns\n    -------\n    X : array or sparse matrix, shape=(n_samples, n_features_new)\n    \"\"\"\n    X = check_array(X, accept_sparse='csc', copy=copy, dtype=FLOAT_DTYPES)\n\n    if isinstance(selected, six.string_types) and selected == \"all\":\n        return transform(X)\n\n    if len(selected) == 0:\n        return X\n\n    n_features = X.shape[1]\n    ind = np.arange(n_features)\n    sel = np.zeros(n_features, dtype=bool)\n    sel[np.asarray(selected)] = True\n    not_sel = np.logical_not(sel)\n    n_selected = np.sum(sel)\n\n    if n_selected == 0:\n        # No features selected.\n        return X\n    elif n_selected == n_features:\n        # All features selected.\n        return transform(X)\n    else:\n        X_sel = transform(X[:, ind[sel]])\n        X_not_sel = X[:, ind[not_sel]]\n\n        if sparse.issparse(X_sel) or sparse.issparse(X_not_sel):\n            return sparse.hstack((X_sel, X_not_sel))\n        else:\n            return np.hstack((X_sel, X_not_sel))"
        },
        {
          "file": "sklearn/preprocessing/data.py",
          "type": "function",
          "name": "_transform_selected",
          "class_name": null,
          "code": "def _transform_selected(X, transform, selected=\"all\", copy=True):\n    \"\"\"Apply a transform function to portion of selected features\n\n    Parameters\n    ----------\n    X : {array-like, sparse matrix}, shape [n_samples, n_features]\n        Dense array or sparse matrix.\n\n    transform : callable\n        A callable transform(X) -> X_transformed\n\n    copy : boolean, optional\n        Copy X even if it could be avoided.\n\n    selected: \"all\" or array of indices or mask\n        Specify which features to apply the transform to.\n\n    Returns\n    -------\n    X : array or sparse matrix, shape=(n_samples, n_features_new)\n    \"\"\"\n    X = check_array(X, accept_sparse='csc', copy=copy, dtype=FLOAT_DTYPES)\n\n    if isinstance(selected, six.string_types) and selected == \"all\":\n        return transform(X)\n\n    if len(selected) == 0:\n        return X\n\n    n_features = X.shape[1]\n    ind = np.arange(n_features)\n    sel = np.zeros(n_features, dtype=bool)\n    sel[np.asarray(selected)] = True\n    not_sel = np.logical_not(sel)\n    n_selected = np.sum(sel)\n\n    if n_selected == 0:\n        # No features selected.\n        return X\n    elif n_selected == n_features:\n        # All features selected.\n        return transform(X)\n    else:\n        X_sel = transform(X[:, ind[sel]])\n        X_not_sel = X[:, ind[not_sel]]\n\n        if sparse.issparse(X_sel) or sparse.issparse(X_not_sel):\n            return sparse.hstack((X_sel, X_not_sel))\n        else:\n            return np.hstack((X_sel, X_not_sel))"
        },
        {
          "file": "sklearn/preprocessing/data.py",
          "type": "function",
          "name": "fit_transform",
          "class_name": "OneHotEncoder",
          "code": "def fit_transform(self, X, y=None):\n        \"\"\"Fit OneHotEncoder to X, then transform X.\n\n        Equivalent to self.fit(X).transform(X), but more convenient and more\n        efficient. See fit for the parameters, transform for the return value.\n\n        Parameters\n        ----------\n        X : array-like, shape [n_samples, n_feature]\n            Input array of type int.\n        \"\"\"\n        return _transform_selected(X, self._fit_transform,\n                                   self.categorical_features, copy=True)"
        },
        {
          "file": "sklearn/preprocessing/data.py",
          "type": "function",
          "name": "transform",
          "class_name": "CategoricalEncoder",
          "code": "def transform(self, X):\n        \"\"\"Transform X using specified encoding scheme.\n\n        Parameters\n        ----------\n        X : array-like, shape [n_samples, n_features]\n            The data to encode.\n\n        Returns\n        -------\n        X_out : sparse matrix or a 2-d array\n            Transformed input.\n\n        \"\"\"\n        X_temp = check_array(X, dtype=None)\n        if not hasattr(X, 'dtype') and np.issubdtype(X_temp.dtype, np.str_):\n            X = check_array(X, dtype=np.object)\n        else:\n            X = X_temp\n\n        n_samples, n_features = X.shape\n        X_int = np.zeros_like(X, dtype=np.int)\n        X_mask = np.ones_like(X, dtype=np.bool)\n\n        for i in range(n_features):\n            Xi = X[:, i]\n            valid_mask = np.in1d(Xi, self.categories_[i])\n\n            if not np.all(valid_mask):\n                if self.handle_unknown == 'error':\n                    diff = np.unique(X[~valid_mask, i])\n                    msg = (\"Found unknown categories {0} in column {1}\"\n                           \" during transform\".format(diff, i))\n                    raise ValueError(msg)\n                else:\n                    # Set the problematic rows to an acceptable value and\n                    # continue `The rows are marked `X_mask` and will be\n                    # removed later.\n                    X_mask[:, i] = valid_mask\n                    Xi = Xi.copy()\n                    Xi[~valid_mask] = self.categories_[i][0]\n            X_int[:, i] = self._label_encoders_[i].transform(Xi)\n\n        if self.encoding == 'ordinal':\n            return X_int.astype(self.dtype, copy=False)\n\n        mask = X_mask.ravel()\n        n_values = [cats.shape[0] for cats in self.categories_]\n        n_values = np.array([0] + n_values)\n        feature_indices = np.cumsum(n_values)\n\n        indices = (X_int + feature_indices[:-1]).ravel()[mask]\n        indptr = X_mask.sum(axis=1).cumsum()\n        indptr = np.insert(indptr, 0, 0)\n        data = np.ones(n_samples * n_features)[mask]\n\n        out = sparse.csr_matrix((data, indices, indptr),\n                                shape=(n_samples, feature_indices[-1]),\n                                dtype=self.dtype)\n        if self.encoding == 'onehot-dense':\n            return out.toarray()\n        else:\n            return out"
        }
      ]
    }
  ]
}