{
  "instance_id": "scikit-learn__scikit-learn-25747",
  "repo": "scikit-learn/scikit-learn",
  "created_at": "2023-03-02T20:38:47Z",
  "problem_statement": "FeatureUnion not working when aggregating data and pandas transform output selected\n### Describe the bug\n\nI would like to use `pandas` transform output and use a custom transformer in a feature union which aggregates data. When I'm using this combination I got an error. When I use default `numpy` output it works fine.\n\n### Steps/Code to Reproduce\n\n```python\r\nimport pandas as pd\r\nfrom sklearn.base import BaseEstimator, TransformerMixin\r\nfrom sklearn import set_config\r\nfrom sklearn.pipeline import make_union\r\n\r\nindex = pd.date_range(start=\"2020-01-01\", end=\"2020-01-05\", inclusive=\"left\", freq=\"H\")\r\ndata = pd.DataFrame(index=index, data=[10] * len(index), columns=[\"value\"])\r\ndata[\"date\"] = index.date\r\n\r\n\r\nclass MyTransformer(BaseEstimator, TransformerMixin):\r\n    def fit(self, X: pd.DataFrame, y: pd.Series | None = None, **kwargs):\r\n        return self\r\n\r\n    def transform(self, X: pd.DataFrame, y: pd.Series | None = None) -> pd.DataFrame:\r\n        return X[\"value\"].groupby(X[\"date\"]).sum()\r\n\r\n\r\n# This works.\r\nset_config(transform_output=\"default\")\r\nprint(make_union(MyTransformer()).fit_transform(data))\r\n\r\n# This does not work.\r\nset_config(transform_output=\"pandas\")\r\nprint(make_union(MyTransformer()).fit_transform(data))\r\n```\n\n### Expected Results\n\nNo error is thrown when using `pandas` transform output.\n\n### Actual Results\n\n```python\r\n---------------------------------------------------------------------------\r\nValueError                                Traceback (most recent call last)\r\nCell In[5], line 25\r\n     23 # This does not work.\r\n     24 set_config(transform_output=\"pandas\")\r\n---> 25 print(make_union(MyTransformer()).fit_transform(data))\r\n\r\nFile ~/.local/share/virtualenvs/3e_VBrf2/lib/python3.10/site-packages/sklearn/utils/_set_output.py:150, in _wrap_method_output.<locals>.wrapped(self, X, *args, **kwargs)\r\n    143 if isinstance(data_to_wrap, tuple):\r\n    144     # only wrap the first output for cross decomposition\r\n    145     return (\r\n    146         _wrap_data_with_container(method, data_to_wrap[0], X, self),\r\n    147         *data_to_wrap[1:],\r\n    148     )\r\n--> 150 return _wrap_data_with_container(method, data_to_wrap, X, self)\r\n\r\nFile ~/.local/share/virtualenvs/3e_VBrf2/lib/python3.10/site-packages/sklearn/utils/_set_output.py:130, in _wrap_data_with_container(method, data_to_wrap, original_input, estimator)\r\n    127     return data_to_wrap\r\n    129 # dense_config == \"pandas\"\r\n--> 130 return _wrap_in_pandas_container(\r\n    131     data_to_wrap=data_to_wrap,\r\n    132     index=getattr(original_input, \"index\", None),\r\n    133     columns=estimator.get_feature_names_out,\r\n    134 )\r\n\r\nFile ~/.local/share/virtualenvs/3e_VBrf2/lib/python3.10/site-packages/sklearn/utils/_set_output.py:59, in _wrap_in_pandas_container(data_to_wrap, columns, index)\r\n     57         data_to_wrap.columns = columns\r\n     58     if index is not None:\r\n---> 59         data_to_wrap.index = index\r\n     60     return data_to_wrap\r\n     62 return pd.DataFrame(data_to_wrap, index=index, columns=columns)\r\n\r\nFile ~/.local/share/virtualenvs/3e_VBrf2/lib/python3.10/site-packages/pandas/core/generic.py:5588, in NDFrame.__setattr__(self, name, value)\r\n   5586 try:\r\n   5587     object.__getattribute__(self, name)\r\n-> 5588     return object.__setattr__(self, name, value)\r\n   5589 except AttributeError:\r\n   5590     pass\r\n\r\nFile ~/.local/share/virtualenvs/3e_VBrf2/lib/python3.10/site-packages/pandas/_libs/properties.pyx:70, in pandas._libs.properties.AxisProperty.__set__()\r\n\r\nFile ~/.local/share/virtualenvs/3e_VBrf2/lib/python3.10/site-packages/pandas/core/generic.py:769, in NDFrame._set_axis(self, axis, labels)\r\n    767 def _set_axis(self, axis: int, labels: Index) -> None:\r\n    768     labels = ensure_index(labels)\r\n--> 769     self._mgr.set_axis(axis, labels)\r\n    770     self._clear_item_cache()\r\n\r\nFile ~/.local/share/virtualenvs/3e_VBrf2/lib/python3.10/site-packages/pandas/core/internals/managers.py:214, in BaseBlockManager.set_axis(self, axis, new_labels)\r\n    212 def set_axis(self, axis: int, new_labels: Index) -> None:\r\n    213     # Caller is responsible for ensuring we have an Index object.\r\n--> 214     self._validate_set_axis(axis, new_labels)\r\n    215     self.axes[axis] = new_labels\r\n\r\nFile ~/.local/share/virtualenvs/3e_VBrf2/lib/python3.10/site-packages/pandas/core/internals/base.py:69, in DataManager._validate_set_axis(self, axis, new_labels)\r\n     66     pass\r\n     68 elif new_len != old_len:\r\n---> 69     raise ValueError(\r\n     70         f\"Length mismatch: Expected axis has {old_len} elements, new \"\r\n     71         f\"values have {new_len} elements\"\r\n     72     )\r\n\r\nValueError: Length mismatch: Expected axis has 4 elements, new values have 96 elements\r\n```\n\n### Versions\n\n```shell\nSystem:\r\n    python: 3.10.6 (main, Aug 30 2022, 05:11:14) [Clang 13.0.0 (clang-1300.0.29.30)]\r\nexecutable: /Users/macbookpro/.local/share/virtualenvs/3e_VBrf2/bin/python\r\n   machine: macOS-11.3-x86_64-i386-64bit\r\n\r\nPython dependencies:\r\n      sklearn: 1.2.1\r\n          pip: 22.3.1\r\n   setuptools: 67.3.2\r\n        numpy: 1.23.5\r\n        scipy: 1.10.1\r\n       Cython: None\r\n       pandas: 1.4.4\r\n   matplotlib: 3.7.0\r\n       joblib: 1.2.0\r\nthreadpoolctl: 3.1.0\r\n\r\nBuilt with OpenMP: True\r\n\r\nthreadpoolctl info:\r\n       user_api: blas\r\n   internal_api: openblas\r\n         prefix: libopenblas\r\n       filepath: /Users/macbookpro/.local/share/virtualenvs/3e_VBrf2/lib/python3.10/site-packages/numpy/.dylibs/libopenblas64_.0.dylib\r\n        version: 0.3.20\r\nthreading_layer: pthreads\r\n   architecture: Haswell\r\n    num_threads: 4\r\n\r\n       user_api: openmp\r\n   internal_api: openmp\r\n         prefix: libomp\r\n       filepath: /Users/macbookpro/.local/share/virtualenvs/3e_VBrf2/lib/python3.10/site-packages/sklearn/.dylibs/libomp.dylib\r\n        version: None\r\n    num_threads: 8\r\n\r\n       user_api: blas\r\n   internal_api: openblas\r\n         prefix: libopenblas\r\n       filepath: /Users/macbookpro/.local/share/virtualenvs/3e_VBrf2/lib/python3.10/site-packages/scipy/.dylibs/libopenblas.0.dylib\r\n        version: 0.3.18\r\nthreading_layer: pthreads\r\n   architecture: Haswell\r\n    num_threads: 4\n```\n\n",
  "patch": "diff --git a/sklearn/utils/_set_output.py b/sklearn/utils/_set_output.py\n--- a/sklearn/utils/_set_output.py\n+++ b/sklearn/utils/_set_output.py\n@@ -34,7 +34,7 @@ def _wrap_in_pandas_container(\n         `range(n_features)`.\n \n     index : array-like, default=None\n-        Index for data.\n+        Index for data. `index` is ignored if `data_to_wrap` is already a DataFrame.\n \n     Returns\n     -------\n@@ -55,8 +55,6 @@ def _wrap_in_pandas_container(\n     if isinstance(data_to_wrap, pd.DataFrame):\n         if columns is not None:\n             data_to_wrap.columns = columns\n-        if index is not None:\n-            data_to_wrap.index = index\n         return data_to_wrap\n \n     return pd.DataFrame(data_to_wrap, index=index, columns=columns)\n",
  "similar_bug_items": [
    {
      "pr_number": 19879,
      "pr_title": "FIX Error for sparse matrix in OrdinalEncoder.inverse_transform",
      "pr_body": "closes #19878\r\n\r\n`OrdinalEncoder.inverse_transform` should not support sparse matrix. It was already failing but with an obscure message.\r\nThis PR adds a non-regression test to check for the error message.",
      "issue_id": 19878,
      "issue_title": "OrdinalEncoder accept and failed with sparse matrix in inverse_transform",
      "issue_body": "`OrdinalEncoder` was documented to accept sparse matrix in `inverse_transform`.\r\nA check is internally done to accept sparse matrix. However, the `inverse_transform` will fail with this type of data.\r\nWe should remove the support in the check and make sure that we issue the right error.\r\n",
      "issue_closed_at": "2021-04-13T11:59:30Z",
      "base_commit": "767fd63c9ddddc46e288fdec2cca36a129529a8e",
      "changes": [
        {
          "file": "sklearn/preprocessing/_encoders.py",
          "type": "function",
          "name": "inverse_transform",
          "class_name": "OrdinalEncoder",
          "code": "def inverse_transform(self, X):\n        \"\"\"\n        Convert the data back to the original representation.\n\n        Parameters\n        ----------\n        X : {array-like, sparse matrix} of shape (n_samples, n_features)\n            The transformed data.\n\n        Returns\n        -------\n        X_tr : ndarray of shape (n_samples, n_features)\n            Inverse transformed array.\n        \"\"\"\n        check_is_fitted(self)\n        X = check_array(X, accept_sparse='csr', force_all_finite='allow-nan')\n\n        n_samples, _ = X.shape\n        n_features = len(self.categories_)\n\n        # validate shape of passed X\n        msg = (\"Shape of the passed X data is not correct. Expected {0} \"\n               \"columns, got {1}.\")\n        if X.shape[1] != n_features:\n            raise ValueError(msg.format(n_features, X.shape[1]))\n\n        # create resulting array of appropriate dtype\n        dt = np.find_common_type([cat.dtype for cat in self.categories_], [])\n        X_tr = np.empty((n_samples, n_features), dtype=dt)\n\n        found_unknown = {}\n\n        for i in range(n_features):\n            labels = X[:, i].astype('int64', copy=False)\n\n            # replace values of X[:, i] that were nan with actual indices\n            if i in self._missing_indices:\n                X_i_mask = _get_mask(X[:, i], np.nan)\n                labels[X_i_mask] = self._missing_indices[i]\n\n            if self.handle_unknown == 'use_encoded_value':\n                unknown_labels = labels == self.unknown_value\n                X_tr[:, i] = self.categories_[i][np.where(\n                    unknown_labels, 0, labels)]\n                found_unknown[i] = unknown_labels\n            else:\n                X_tr[:, i] = self.categories_[i][labels]\n\n        # insert None values for unknown values\n        if found_unknown:\n            X_tr = X_tr.astype(object, copy=False)\n\n            for idx, mask in found_unknown.items():\n                X_tr[mask, idx] = None\n\n        return X_tr"
        },
        {
          "file": "sklearn/preprocessing/_encoders.py",
          "type": "function",
          "name": "inverse_transform",
          "class_name": "OrdinalEncoder",
          "code": "def inverse_transform(self, X):\n        \"\"\"\n        Convert the data back to the original representation.\n\n        Parameters\n        ----------\n        X : {array-like, sparse matrix} of shape (n_samples, n_features)\n            The transformed data.\n\n        Returns\n        -------\n        X_tr : ndarray of shape (n_samples, n_features)\n            Inverse transformed array.\n        \"\"\"\n        check_is_fitted(self)\n        X = check_array(X, accept_sparse='csr', force_all_finite='allow-nan')\n\n        n_samples, _ = X.shape\n        n_features = len(self.categories_)\n\n        # validate shape of passed X\n        msg = (\"Shape of the passed X data is not correct. Expected {0} \"\n               \"columns, got {1}.\")\n        if X.shape[1] != n_features:\n            raise ValueError(msg.format(n_features, X.shape[1]))\n\n        # create resulting array of appropriate dtype\n        dt = np.find_common_type([cat.dtype for cat in self.categories_], [])\n        X_tr = np.empty((n_samples, n_features), dtype=dt)\n\n        found_unknown = {}\n\n        for i in range(n_features):\n            labels = X[:, i].astype('int64', copy=False)\n\n            # replace values of X[:, i] that were nan with actual indices\n            if i in self._missing_indices:\n                X_i_mask = _get_mask(X[:, i], np.nan)\n                labels[X_i_mask] = self._missing_indices[i]\n\n            if self.handle_unknown == 'use_encoded_value':\n                unknown_labels = labels == self.unknown_value\n                X_tr[:, i] = self.categories_[i][np.where(\n                    unknown_labels, 0, labels)]\n                found_unknown[i] = unknown_labels\n            else:\n                X_tr[:, i] = self.categories_[i][labels]\n\n        # insert None values for unknown values\n        if found_unknown:\n            X_tr = X_tr.astype(object, copy=False)\n\n            for idx, mask in found_unknown.items():\n                X_tr[mask, idx] = None\n\n        return X_tr"
        }
      ]
    },
    {
      "pr_number": 22775,
      "pr_title": "FIX Fix ColumnTransformer.get_feature_names_out with slices",
      "pr_body": "<!--\r\nThanks for contributing a pull request! Please ensure you have taken a look at\r\nthe contribution guidelines: https://github.com/scikit-learn/scikit-learn/blob/main/CONTRIBUTING.md\r\n-->\r\n\r\n#### Reference Issues/PRs\r\nFixes #22774\r\n<!--\r\nExample: Fixes #1234. See also #3456.\r\nPlease use keywords (e.g., Fixes) to create link to the issues or pull requests\r\nyou resolved, so that they will automatically be closed when your pull request\r\nis merged. See https://github.com/blog/1506-closing-issues-via-pull-requests\r\n-->\r\n\r\n\r\n#### What does this implement/fix? Explain your changes.\r\nCurrently, ColumnTransformer's get_feature_names_out does not work when the columns are specified as slices. This fix makes get_feature_names_out work properly with slices as well.\r\n\r\n#### Any other comments?\r\n\r\n\r\n<!--\r\nPlease be aware that we are a loose team of volunteers so patience is\r\nnecessary; assistance handling other issues is very welcome. We value\r\nall user contributions, no matter how minor they are. If we are slow to\r\nreview, either the pull request needs some benchmarking, tinkering,\r\nconvincing, etc. or more likely the reviewers are simply busy. In either\r\ncase, we ask for your understanding during the review process.\r\nFor more information, see our FAQ on this topic:\r\nhttp://scikit-learn.org/dev/faq.html#why-is-my-pull-request-not-getting-any-attention.\r\n\r\nThanks for contributing!\r\n-->\r\n",
      "issue_id": 22774,
      "issue_title": "ColumnTransformer's get_feature_names_out does not work properly with slices",
      "issue_body": "### Describe the bug\r\n\r\nSlices are a supported for selecting columns in `ColumnTransformer`. Also, `get_column_names_out` is supported to label generated columns. But get_column_names_out does not work with slices.\r\n\r\n### Steps/Code to Reproduce\r\n\r\n```python\r\nimport numpy as np\r\nimport pandas as pd\r\nfrom sklearn.compose import ColumnTransformer\r\nfrom sklearn.preprocessing import Normalizer as _Normalizer\r\nfrom sklearn.base import _OneToOneFeatureMixin\r\n\r\n\r\nclass Normalizer(_OneToOneFeatureMixin, _Normalizer):\r\n    pass\r\n\r\nct = ColumnTransformer([(\"norm1\", Normalizer(norm='l1'), [0, 1]),\r\n                        (\"norm2\", Normalizer(norm='l1'), slice(2, 4))],\r\n                       verbose_feature_names_out=False)\r\nX = np.array([[0., 1., 2., 2.], [1., 1., 0., 1.]])\r\nct.fit_transform(X)\r\n\r\n#  Expecting array(['x0', 'x1', 'x2', 'x3'], dtype=object)\r\nct.get_feature_names_out()  # get TypeError\r\n\r\ndf = pd.DataFrame(X, columns=['c1', 'c2', 'c3', 'c4'])\r\nct.fit_transform(df)\r\n\r\n# Expecting array(['c0', 'c1', 'c2', 'c3'], dtype=object)\r\nct.get_feature_names_out()  # get TypeError\r\n```\r\n\r\n### Expected Results\r\n\r\nExpecting array(['x0', 'x1', 'x2', 'x3'], dtype=object)\r\n\r\nand \r\n\r\nExpecting array(['c0', 'c1', 'c2', 'c3'], dtype=object)\r\n\r\n### Actual Results\r\n\r\nProduces error\r\n\r\n### Versions\r\n\r\n```shell\r\nSystem:\r\n    python: 3.8.10 (default, Nov 26 2021, 20:14:08)  [GCC 9.3.0]\r\nexecutable: /home/popos/code/sklearn-transformer-extensions/.venv/bin/python\r\n   machine: Linux-5.16.11-76051611-generic-x86_64-with-glibc2.29\r\n\r\nPython dependencies:\r\n          pip: 22.0.4\r\n   setuptools: 60.9.3\r\n      sklearn: 1.0.2\r\n        numpy: 1.22.3\r\n        scipy: 1.6.1\r\n       Cython: None\r\n       pandas: 1.4.1\r\n   matplotlib: None\r\n       joblib: 1.1.0\r\nthreadpoolctl: 3.1.0\r\n\r\nBuilt with OpenMP: True\r\n```\r\n",
      "issue_closed_at": "2022-03-15T19:06:04Z",
      "base_commit": "cd5385e7112c453afff205fa0ce6a67356cbcf32",
      "changes": [
        {
          "file": "sklearn/compose/_column_transformer.py",
          "type": "function",
          "name": "_get_feature_name_out_for_transformer",
          "class_name": "ColumnTransformer",
          "code": "def _get_feature_name_out_for_transformer(\n        self, name, trans, column, feature_names_in\n    ):\n        \"\"\"Gets feature names of transformer.\n\n        Used in conjunction with self._iter(fitted=True) in get_feature_names_out.\n        \"\"\"\n        if trans == \"drop\" or _is_empty_column_selection(column):\n            return\n        elif trans == \"passthrough\":\n            if (not isinstance(column, slice)) and all(\n                isinstance(col, str) for col in column\n            ):\n                # selection was already strings\n                return column\n            else:\n                return feature_names_in[column]\n\n        # An actual transformer\n        if not hasattr(trans, \"get_feature_names_out\"):\n            raise AttributeError(\n                f\"Transformer {name} (type {type(trans).__name__}) does \"\n                \"not provide get_feature_names_out.\"\n            )\n        if isinstance(column, Iterable) and not all(\n            isinstance(col, str) for col in column\n        ):\n            column = _safe_indexing(feature_names_in, column)\n        return trans.get_feature_names_out(column)"
        }
      ]
    },
    {
      "pr_number": 11042,
      "pr_title": "[MRG + 1] Ensuring that the OneHotEncoder outputs sparse matrix with given dtype #11034",
      "pr_body": "#### Reference Issues/PRs\r\nOriginal discussion at #11034\r\n\r\n#### What does this implement/fix? Explain your changes.\r\n",
      "issue_id": 11034,
      "issue_title": "OneHotEncoder does not output scipy sparse matrix of given dtype",
      "issue_body": "#### Description\r\nOneHotEncoder ignores the specified dtype in the construction of the sparse array when mixed input data are passed, i.e with both categorical and real data type\r\n\r\n#### Steps/Code to Reproduce\r\n```python\r\nimport numpy as np\r\n\r\nfrom sklearn.preprocessing import OneHotEncoder\r\nenc = OneHotEncoder(dtype=np.float32, categorical_features=[0, 1])\r\n\r\nx = np.array([[0, 1, 0, 0], [1, 2, 0, 0]], dtype=int)\r\nsparse = enc.fit(x).transform(x)\r\n```\r\n\r\n#### Expected Results\r\n```python\r\nsparse: <2x6 sparse matrix of type '<class 'numpy.float32'>'\r\n\twith 4 stored elements in COOrdinate format>\r\n```\r\n\r\n#### Actual Results\r\n```python\r\nsparse: <2x6 sparse matrix of type '<class 'numpy.float64'>'\r\n\twith 4 stored elements in COOrdinate format>\r\n```\r\n\r\n#### Versions\r\n__Platform__: Linux-4.13.0-38-generic-x86_64-with-debian-stretch-sid\r\n__Python__: 3.6.3 |Anaconda custom (64-bit)| (default, Oct 13 2017, 12:02:49) [GCC 7.2.0]\r\n__NumPy__: NumPy \r\n__SciPy__: SciPy 1.0.1\r\n__Scikit-Learn__: Scikit-Learn 0.19.1\r\n",
      "issue_closed_at": "2018-06-06T09:03:02Z",
      "base_commit": "f049ec72eb70443ec8d7826066c4246035677c11",
      "changes": [
        {
          "file": "sklearn/preprocessing/data.py",
          "type": "function",
          "name": "add_dummy_feature",
          "class_name": null,
          "code": "def add_dummy_feature(X, value=1.0):\n    \"\"\"Augment dataset with an additional dummy feature.\n\n    This is useful for fitting an intercept term with implementations which\n    cannot otherwise fit it directly.\n\n    Parameters\n    ----------\n    X : {array-like, sparse matrix}, shape [n_samples, n_features]\n        Data.\n\n    value : float\n        Value to use for the dummy feature.\n\n    Returns\n    -------\n\n    X : {array, sparse matrix}, shape [n_samples, n_features + 1]\n        Same data with dummy feature added as first column.\n\n    Examples\n    --------\n\n    >>> from sklearn.preprocessing import add_dummy_feature\n    >>> add_dummy_feature([[0, 1], [1, 0]])\n    array([[1., 0., 1.],\n           [1., 1., 0.]])\n    \"\"\"\n    X = check_array(X, accept_sparse=['csc', 'csr', 'coo'], dtype=FLOAT_DTYPES)\n    n_samples, n_features = X.shape\n    shape = (n_samples, n_features + 1)\n    if sparse.issparse(X):\n        if sparse.isspmatrix_coo(X):\n            # Shift columns to the right.\n            col = X.col + 1\n            # Column indices of dummy feature are 0 everywhere.\n            col = np.concatenate((np.zeros(n_samples), col))\n            # Row indices of dummy feature are 0, ..., n_samples-1.\n            row = np.concatenate((np.arange(n_samples), X.row))\n            # Prepend the dummy feature n_samples times.\n            data = np.concatenate((np.ones(n_samples) * value, X.data))\n            return sparse.coo_matrix((data, (row, col)), shape)\n        elif sparse.isspmatrix_csc(X):\n            # Shift index pointers since we need to add n_samples elements.\n            indptr = X.indptr + n_samples\n            # indptr[0] must be 0.\n            indptr = np.concatenate((np.array([0]), indptr))\n            # Row indices of dummy feature are 0, ..., n_samples-1.\n            indices = np.concatenate((np.arange(n_samples), X.indices))\n            # Prepend the dummy feature n_samples times.\n            data = np.concatenate((np.ones(n_samples) * value, X.data))\n            return sparse.csc_matrix((data, indices, indptr), shape)\n        else:\n            klass = X.__class__\n            return klass(add_dummy_feature(X.tocoo(), value))\n    else:\n        return np.hstack((np.ones((n_samples, 1)) * value, X))"
        },
        {
          "file": "sklearn/preprocessing/data.py",
          "type": "function",
          "name": "_transform_selected",
          "class_name": null,
          "code": "def _transform_selected(X, transform, selected=\"all\", copy=True):\n    \"\"\"Apply a transform function to portion of selected features\n\n    Parameters\n    ----------\n    X : {array-like, sparse matrix}, shape [n_samples, n_features]\n        Dense array or sparse matrix.\n\n    transform : callable\n        A callable transform(X) -> X_transformed\n\n    copy : boolean, optional\n        Copy X even if it could be avoided.\n\n    selected: \"all\" or array of indices or mask\n        Specify which features to apply the transform to.\n\n    Returns\n    -------\n    X : array or sparse matrix, shape=(n_samples, n_features_new)\n    \"\"\"\n    X = check_array(X, accept_sparse='csc', copy=copy, dtype=FLOAT_DTYPES)\n\n    if isinstance(selected, six.string_types) and selected == \"all\":\n        return transform(X)\n\n    if len(selected) == 0:\n        return X\n\n    n_features = X.shape[1]\n    ind = np.arange(n_features)\n    sel = np.zeros(n_features, dtype=bool)\n    sel[np.asarray(selected)] = True\n    not_sel = np.logical_not(sel)\n    n_selected = np.sum(sel)\n\n    if n_selected == 0:\n        # No features selected.\n        return X\n    elif n_selected == n_features:\n        # All features selected.\n        return transform(X)\n    else:\n        X_sel = transform(X[:, ind[sel]])\n        X_not_sel = X[:, ind[not_sel]]\n\n        if sparse.issparse(X_sel) or sparse.issparse(X_not_sel):\n            return sparse.hstack((X_sel, X_not_sel))\n        else:\n            return np.hstack((X_sel, X_not_sel))"
        },
        {
          "file": "sklearn/preprocessing/data.py",
          "type": "function",
          "name": "_transform_selected",
          "class_name": null,
          "code": "def _transform_selected(X, transform, selected=\"all\", copy=True):\n    \"\"\"Apply a transform function to portion of selected features\n\n    Parameters\n    ----------\n    X : {array-like, sparse matrix}, shape [n_samples, n_features]\n        Dense array or sparse matrix.\n\n    transform : callable\n        A callable transform(X) -> X_transformed\n\n    copy : boolean, optional\n        Copy X even if it could be avoided.\n\n    selected: \"all\" or array of indices or mask\n        Specify which features to apply the transform to.\n\n    Returns\n    -------\n    X : array or sparse matrix, shape=(n_samples, n_features_new)\n    \"\"\"\n    X = check_array(X, accept_sparse='csc', copy=copy, dtype=FLOAT_DTYPES)\n\n    if isinstance(selected, six.string_types) and selected == \"all\":\n        return transform(X)\n\n    if len(selected) == 0:\n        return X\n\n    n_features = X.shape[1]\n    ind = np.arange(n_features)\n    sel = np.zeros(n_features, dtype=bool)\n    sel[np.asarray(selected)] = True\n    not_sel = np.logical_not(sel)\n    n_selected = np.sum(sel)\n\n    if n_selected == 0:\n        # No features selected.\n        return X\n    elif n_selected == n_features:\n        # All features selected.\n        return transform(X)\n    else:\n        X_sel = transform(X[:, ind[sel]])\n        X_not_sel = X[:, ind[not_sel]]\n\n        if sparse.issparse(X_sel) or sparse.issparse(X_not_sel):\n            return sparse.hstack((X_sel, X_not_sel))\n        else:\n            return np.hstack((X_sel, X_not_sel))"
        },
        {
          "file": "sklearn/preprocessing/data.py",
          "type": "function",
          "name": "fit_transform",
          "class_name": "OneHotEncoder",
          "code": "def fit_transform(self, X, y=None):\n        \"\"\"Fit OneHotEncoder to X, then transform X.\n\n        Equivalent to self.fit(X).transform(X), but more convenient and more\n        efficient. See fit for the parameters, transform for the return value.\n\n        Parameters\n        ----------\n        X : array-like, shape [n_samples, n_feature]\n            Input array of type int.\n        \"\"\"\n        return _transform_selected(X, self._fit_transform,\n                                   self.categorical_features, copy=True)"
        },
        {
          "file": "sklearn/preprocessing/data.py",
          "type": "function",
          "name": "transform",
          "class_name": "CategoricalEncoder",
          "code": "def transform(self, X):\n        \"\"\"Transform X using specified encoding scheme.\n\n        Parameters\n        ----------\n        X : array-like, shape [n_samples, n_features]\n            The data to encode.\n\n        Returns\n        -------\n        X_out : sparse matrix or a 2-d array\n            Transformed input.\n\n        \"\"\"\n        X_temp = check_array(X, dtype=None)\n        if not hasattr(X, 'dtype') and np.issubdtype(X_temp.dtype, np.str_):\n            X = check_array(X, dtype=np.object)\n        else:\n            X = X_temp\n\n        n_samples, n_features = X.shape\n        X_int = np.zeros_like(X, dtype=np.int)\n        X_mask = np.ones_like(X, dtype=np.bool)\n\n        for i in range(n_features):\n            Xi = X[:, i]\n            valid_mask = np.in1d(Xi, self.categories_[i])\n\n            if not np.all(valid_mask):\n                if self.handle_unknown == 'error':\n                    diff = np.unique(X[~valid_mask, i])\n                    msg = (\"Found unknown categories {0} in column {1}\"\n                           \" during transform\".format(diff, i))\n                    raise ValueError(msg)\n                else:\n                    # Set the problematic rows to an acceptable value and\n                    # continue `The rows are marked `X_mask` and will be\n                    # removed later.\n                    X_mask[:, i] = valid_mask\n                    Xi = Xi.copy()\n                    Xi[~valid_mask] = self.categories_[i][0]\n            X_int[:, i] = self._label_encoders_[i].transform(Xi)\n\n        if self.encoding == 'ordinal':\n            return X_int.astype(self.dtype, copy=False)\n\n        mask = X_mask.ravel()\n        n_values = [cats.shape[0] for cats in self.categories_]\n        n_values = np.array([0] + n_values)\n        feature_indices = np.cumsum(n_values)\n\n        indices = (X_int + feature_indices[:-1]).ravel()[mask]\n        indptr = X_mask.sum(axis=1).cumsum()\n        indptr = np.insert(indptr, 0, 0)\n        data = np.ones(n_samples * n_features)[mask]\n\n        out = sparse.csr_matrix((data, indices, indptr),\n                                shape=(n_samples, feature_indices[-1]),\n                                dtype=self.dtype)\n        if self.encoding == 'onehot-dense':\n            return out.toarray()\n        else:\n            return out"
        }
      ]
    },
    {
      "pr_number": 11043,
      "pr_title": "[MRG+2] ENH Passthrough DataFrame in FunctionTransformer",
      "pr_body": "<!--\r\nThanks for contributing a pull request! Please ensure you have taken a look at\r\nthe contribution guidelines: https://github.com/scikit-learn/scikit-learn/blob/master/CONTRIBUTING.md#pull-request-checklist\r\n-->\r\n\r\n#### Reference Issues/PRs\r\n<!--\r\nExample: Fixes #1234. See also #3456.\r\nPlease use keywords (e.g., Fixes) to create link to the issues or pull requests\r\nyou resolved, so that they will automatically be closed when your pull request\r\nis merged. See https://github.com/blog/1506-closing-issues-via-pull-requests\r\n-->\r\n\r\ncloses #10655 \r\n\r\n#### What does this implement/fix? Explain your changes.\r\n\r\nAdded the following option\r\n\r\n- [x] Raise FutureWarning that `validate=False` will be the default in the future.\r\n- [x] Convert list to array when `validate=False`.\r\n- [x] Make a what's new entry for the change of behaviour.\r\n\r\n#### Any other comments?\r\n\r\n\r\n<!--\r\nPlease be aware that we are a loose team of volunteers so patience is\r\nnecessary; assistance handling other issues is very welcome. We value\r\nall user contributions, no matter how minor they are. If we are slow to\r\nreview, either the pull request needs some benchmarking, tinkering,\r\nconvincing, etc. or more likely the reviewers are simply busy. In either\r\ncase, we ask for your understanding during the review process.\r\nFor more information, see our FAQ on this topic:\r\nhttp://scikit-learn.org/dev/faq.html#why-is-my-pull-request-not-getting-any-attention.\r\n\r\nThanks for contributing!\r\n-->\r\n",
      "issue_id": 10655,
      "issue_title": "FunctionTransformer should not convert DataFrames to arrays by default",
      "issue_body": "I would expect a common use of FunctionTransformer is to apply some function to a Pandas DataFrame, ideally using its own methods or accessors. As noted in #10648, it can be easy for users to miss that they need to set validate=False to pass through a DataFrame without converting it to a NumPy array. I think it would be more user-friendly to have `validate='array-or-frame'` by default, which would pass through DataFrames to the function, but otherwise convert its input to a 2d array. For strict backwards compatibility, the default should be changed through a deprecation cycle, warning whenever using the default validation means a DataFrame is currently converted to an array.\r\n\r\nDo others agree?",
      "issue_closed_at": "2018-07-10T09:48:04Z",
      "base_commit": "19bc7e8af6ec3468b6c7f4718a31cd5f508528cd",
      "changes": [
        {
          "file": "sklearn/preprocessing/_function_transformer.py",
          "type": "class",
          "name": "FunctionTransformer",
          "code": "class FunctionTransformer(BaseEstimator, TransformerMixin):\n    \"\"\"Constructs a transformer from an arbitrary callable.\n\n    A FunctionTransformer forwards its X (and optionally y) arguments to a\n    user-defined function or function object and returns the result of this\n    function. This is useful for stateless transformations such as taking the\n    log of frequencies, doing custom scaling, etc.\n\n    Note: If a lambda is used as the function, then the resulting\n    transformer will not be pickleable.\n\n    .. versionadded:: 0.17\n\n    Read more in the :ref:`User Guide <function_transformer>`.\n\n    Parameters\n    ----------\n    func : callable, optional default=None\n        The callable to use for the transformation. This will be passed\n        the same arguments as transform, with args and kwargs forwarded.\n        If func is None, then func will be the identity function.\n\n    inverse_func : callable, optional default=None\n        The callable to use for the inverse transformation. This will be\n        passed the same arguments as inverse transform, with args and\n        kwargs forwarded. If inverse_func is None, then inverse_func\n        will be the identity function.\n\n    validate : bool, optional default=True\n        Indicate that the input X array should be checked before calling\n        func. If validate is false, there will be no input validation.\n        If it is true, then X will be converted to a 2-dimensional NumPy\n        array or sparse matrix. If this conversion is not possible or X\n        contains NaN or infinity, an exception is raised.\n\n    accept_sparse : boolean, optional\n        Indicate that func accepts a sparse matrix as input. If validate is\n        False, this has no effect. Otherwise, if accept_sparse is false,\n        sparse matrix inputs will cause an exception to be raised.\n\n    pass_y : bool, optional default=False\n        Indicate that transform should forward the y argument to the\n        inner callable.\n\n        .. deprecated::0.19\n\n    check_inverse : bool, default=True\n       Whether to check that or ``func`` followed by ``inverse_func`` leads to\n       the original inputs. It can be used for a sanity check, raising a\n       warning when the condition is not fulfilled.\n\n       .. versionadded:: 0.20\n\n    kw_args : dict, optional\n        Dictionary of additional keyword arguments to pass to func.\n\n    inv_kw_args : dict, optional\n        Dictionary of additional keyword arguments to pass to inverse_func.\n\n    \"\"\"\n    def __init__(self, func=None, inverse_func=None, validate=True,\n                 accept_sparse=False, pass_y='deprecated', check_inverse=True,\n                 kw_args=None, inv_kw_args=None):\n        self.func = func\n        self.inverse_func = inverse_func\n        self.validate = validate\n        self.accept_sparse = accept_sparse\n        self.pass_y = pass_y\n        self.check_inverse = check_inverse\n        self.kw_args = kw_args\n        self.inv_kw_args = inv_kw_args\n\n    def _check_inverse_transform(self, X):\n        \"\"\"Check that func and inverse_func are the inverse.\"\"\"\n        idx_selected = slice(None, None, max(1, X.shape[0] // 100))\n        try:\n            assert_allclose_dense_sparse(\n                X[idx_selected],\n                self.inverse_transform(self.transform(X[idx_selected])))\n        except AssertionError:\n            warnings.warn(\"The provided functions are not strictly\"\n                          \" inverse of each other. If you are sure you\"\n                          \" want to proceed regardless, set\"\n                          \" 'check_inverse=False'.\", UserWarning)\n\n    def fit(self, X, y=None):\n        \"\"\"Fit transformer by checking X.\n\n        If ``validate`` is ``True``, ``X`` will be checked.\n\n        Parameters\n        ----------\n        X : array-like, shape (n_samples, n_features)\n            Input array.\n\n        Returns\n        -------\n        self\n        \"\"\"\n        if self.validate:\n            X = check_array(X, self.accept_sparse)\n        if (self.check_inverse and not (self.func is None or\n                                        self.inverse_func is None)):\n            self._check_inverse_transform(X)\n        return self\n\n    def transform(self, X, y='deprecated'):\n        \"\"\"Transform X using the forward function.\n\n        Parameters\n        ----------\n        X : array-like, shape (n_samples, n_features)\n            Input array.\n\n        y : (ignored)\n            .. deprecated::0.19\n\n        Returns\n        -------\n        X_out : array-like, shape (n_samples, n_features)\n            Transformed input.\n        \"\"\"\n        if not isinstance(y, string_types) or y != 'deprecated':\n            warnings.warn(\"The parameter y on transform() is \"\n                          \"deprecated since 0.19 and will be removed in 0.21\",\n                          DeprecationWarning)\n\n        return self._transform(X, y=y, func=self.func, kw_args=self.kw_args)\n\n    def inverse_transform(self, X, y='deprecated'):\n        \"\"\"Transform X using the inverse function.\n\n        Parameters\n        ----------\n        X : array-like, shape (n_samples, n_features)\n            Input array.\n\n        y : (ignored)\n            .. deprecated::0.19\n\n        Returns\n        -------\n        X_out : array-like, shape (n_samples, n_features)\n            Transformed input.\n        \"\"\"\n        if not isinstance(y, string_types) or y != 'deprecated':\n            warnings.warn(\"The parameter y on inverse_transform() is \"\n                          \"deprecated since 0.19 and will be removed in 0.21\",\n                          DeprecationWarning)\n        return self._transform(X, y=y, func=self.inverse_func,\n                               kw_args=self.inv_kw_args)\n\n    def _transform(self, X, y=None, func=None, kw_args=None):\n        if self.validate:\n            X = check_array(X, self.accept_sparse)\n\n        if func is None:\n            func = _identity\n\n        if (not isinstance(self.pass_y, string_types) or\n                self.pass_y != 'deprecated'):\n            # We do this to know if pass_y was set to False / True\n            pass_y = self.pass_y\n            warnings.warn(\"The parameter pass_y is deprecated since 0.19 and \"\n                          \"will be removed in 0.21\", DeprecationWarning)\n        else:\n            pass_y = False\n\n        return func(X, *((y,) if pass_y else ()),\n                    **(kw_args if kw_args else {}))"
        },
        {
          "file": "sklearn/preprocessing/_function_transformer.py",
          "type": "class",
          "name": "FunctionTransformer",
          "code": "class FunctionTransformer(BaseEstimator, TransformerMixin):\n    \"\"\"Constructs a transformer from an arbitrary callable.\n\n    A FunctionTransformer forwards its X (and optionally y) arguments to a\n    user-defined function or function object and returns the result of this\n    function. This is useful for stateless transformations such as taking the\n    log of frequencies, doing custom scaling, etc.\n\n    Note: If a lambda is used as the function, then the resulting\n    transformer will not be pickleable.\n\n    .. versionadded:: 0.17\n\n    Read more in the :ref:`User Guide <function_transformer>`.\n\n    Parameters\n    ----------\n    func : callable, optional default=None\n        The callable to use for the transformation. This will be passed\n        the same arguments as transform, with args and kwargs forwarded.\n        If func is None, then func will be the identity function.\n\n    inverse_func : callable, optional default=None\n        The callable to use for the inverse transformation. This will be\n        passed the same arguments as inverse transform, with args and\n        kwargs forwarded. If inverse_func is None, then inverse_func\n        will be the identity function.\n\n    validate : bool, optional default=True\n        Indicate that the input X array should be checked before calling\n        func. If validate is false, there will be no input validation.\n        If it is true, then X will be converted to a 2-dimensional NumPy\n        array or sparse matrix. If this conversion is not possible or X\n        contains NaN or infinity, an exception is raised.\n\n    accept_sparse : boolean, optional\n        Indicate that func accepts a sparse matrix as input. If validate is\n        False, this has no effect. Otherwise, if accept_sparse is false,\n        sparse matrix inputs will cause an exception to be raised.\n\n    pass_y : bool, optional default=False\n        Indicate that transform should forward the y argument to the\n        inner callable.\n\n        .. deprecated::0.19\n\n    check_inverse : bool, default=True\n       Whether to check that or ``func`` followed by ``inverse_func`` leads to\n       the original inputs. It can be used for a sanity check, raising a\n       warning when the condition is not fulfilled.\n\n       .. versionadded:: 0.20\n\n    kw_args : dict, optional\n        Dictionary of additional keyword arguments to pass to func.\n\n    inv_kw_args : dict, optional\n        Dictionary of additional keyword arguments to pass to inverse_func.\n\n    \"\"\"\n    def __init__(self, func=None, inverse_func=None, validate=True,\n                 accept_sparse=False, pass_y='deprecated', check_inverse=True,\n                 kw_args=None, inv_kw_args=None):\n        self.func = func\n        self.inverse_func = inverse_func\n        self.validate = validate\n        self.accept_sparse = accept_sparse\n        self.pass_y = pass_y\n        self.check_inverse = check_inverse\n        self.kw_args = kw_args\n        self.inv_kw_args = inv_kw_args\n\n    def _check_inverse_transform(self, X):\n        \"\"\"Check that func and inverse_func are the inverse.\"\"\"\n        idx_selected = slice(None, None, max(1, X.shape[0] // 100))\n        try:\n            assert_allclose_dense_sparse(\n                X[idx_selected],\n                self.inverse_transform(self.transform(X[idx_selected])))\n        except AssertionError:\n            warnings.warn(\"The provided functions are not strictly\"\n                          \" inverse of each other. If you are sure you\"\n                          \" want to proceed regardless, set\"\n                          \" 'check_inverse=False'.\", UserWarning)\n\n    def fit(self, X, y=None):\n        \"\"\"Fit transformer by checking X.\n\n        If ``validate`` is ``True``, ``X`` will be checked.\n\n        Parameters\n        ----------\n        X : array-like, shape (n_samples, n_features)\n            Input array.\n\n        Returns\n        -------\n        self\n        \"\"\"\n        if self.validate:\n            X = check_array(X, self.accept_sparse)\n        if (self.check_inverse and not (self.func is None or\n                                        self.inverse_func is None)):\n            self._check_inverse_transform(X)\n        return self\n\n    def transform(self, X, y='deprecated'):\n        \"\"\"Transform X using the forward function.\n\n        Parameters\n        ----------\n        X : array-like, shape (n_samples, n_features)\n            Input array.\n\n        y : (ignored)\n            .. deprecated::0.19\n\n        Returns\n        -------\n        X_out : array-like, shape (n_samples, n_features)\n            Transformed input.\n        \"\"\"\n        if not isinstance(y, string_types) or y != 'deprecated':\n            warnings.warn(\"The parameter y on transform() is \"\n                          \"deprecated since 0.19 and will be removed in 0.21\",\n                          DeprecationWarning)\n\n        return self._transform(X, y=y, func=self.func, kw_args=self.kw_args)\n\n    def inverse_transform(self, X, y='deprecated'):\n        \"\"\"Transform X using the inverse function.\n\n        Parameters\n        ----------\n        X : array-like, shape (n_samples, n_features)\n            Input array.\n\n        y : (ignored)\n            .. deprecated::0.19\n\n        Returns\n        -------\n        X_out : array-like, shape (n_samples, n_features)\n            Transformed input.\n        \"\"\"\n        if not isinstance(y, string_types) or y != 'deprecated':\n            warnings.warn(\"The parameter y on inverse_transform() is \"\n                          \"deprecated since 0.19 and will be removed in 0.21\",\n                          DeprecationWarning)\n        return self._transform(X, y=y, func=self.inverse_func,\n                               kw_args=self.inv_kw_args)\n\n    def _transform(self, X, y=None, func=None, kw_args=None):\n        if self.validate:\n            X = check_array(X, self.accept_sparse)\n\n        if func is None:\n            func = _identity\n\n        if (not isinstance(self.pass_y, string_types) or\n                self.pass_y != 'deprecated'):\n            # We do this to know if pass_y was set to False / True\n            pass_y = self.pass_y\n            warnings.warn(\"The parameter pass_y is deprecated since 0.19 and \"\n                          \"will be removed in 0.21\", DeprecationWarning)\n        else:\n            pass_y = False\n\n        return func(X, *((y,) if pass_y else ()),\n                    **(kw_args if kw_args else {}))"
        },
        {
          "file": "sklearn/preprocessing/_function_transformer.py",
          "type": "function",
          "name": "__init__",
          "class_name": "FunctionTransformer",
          "code": "def __init__(self, func=None, inverse_func=None, validate=True,\n                 accept_sparse=False, pass_y='deprecated', check_inverse=True,\n                 kw_args=None, inv_kw_args=None):\n        self.func = func\n        self.inverse_func = inverse_func\n        self.validate = validate\n        self.accept_sparse = accept_sparse\n        self.pass_y = pass_y\n        self.check_inverse = check_inverse\n        self.kw_args = kw_args\n        self.inv_kw_args = inv_kw_args"
        },
        {
          "file": "sklearn/preprocessing/_function_transformer.py",
          "type": "function",
          "name": "fit",
          "class_name": "FunctionTransformer",
          "code": "def fit(self, X, y=None):\n        \"\"\"Fit transformer by checking X.\n\n        If ``validate`` is ``True``, ``X`` will be checked.\n\n        Parameters\n        ----------\n        X : array-like, shape (n_samples, n_features)\n            Input array.\n\n        Returns\n        -------\n        self\n        \"\"\"\n        if self.validate:\n            X = check_array(X, self.accept_sparse)\n        if (self.check_inverse and not (self.func is None or\n                                        self.inverse_func is None)):\n            self._check_inverse_transform(X)\n        return self"
        },
        {
          "file": "sklearn/preprocessing/_function_transformer.py",
          "type": "function",
          "name": "inverse_transform",
          "class_name": "FunctionTransformer",
          "code": "def inverse_transform(self, X, y='deprecated'):\n        \"\"\"Transform X using the inverse function.\n\n        Parameters\n        ----------\n        X : array-like, shape (n_samples, n_features)\n            Input array.\n\n        y : (ignored)\n            .. deprecated::0.19\n\n        Returns\n        -------\n        X_out : array-like, shape (n_samples, n_features)\n            Transformed input.\n        \"\"\"\n        if not isinstance(y, string_types) or y != 'deprecated':\n            warnings.warn(\"The parameter y on inverse_transform() is \"\n                          \"deprecated since 0.19 and will be removed in 0.21\",\n                          DeprecationWarning)\n        return self._transform(X, y=y, func=self.inverse_func,\n                               kw_args=self.inv_kw_args)"
        }
      ]
    },
    {
      "pr_number": 18528,
      "pr_title": "FIX TruncatedSVD.fit_transform returns the same as fit.transform",
      "pr_body": "Fix #15144\r\nSupersede and close #16421",
      "issue_id": 15144,
      "issue_title": "TruncatedSVD.fit(X).transform(X) is not the same as .fit_transform(X)",
      "issue_body": "<!-- Instructions For Filing a Bug: https://github.com/scikit-learn/scikit-learn/blob/master/CONTRIBUTING.md#filing-bugs -->\r\n\r\n#### Description\r\nRecall that SVD(X) decomposes X into three matrices, U, Sigma, and V^t. \r\n\r\nIn scikit-learn, TruncatedSVD treats .fit().transform() differently from .fit_transform(). On the one hand, .fit(X).transform(X) will return X @ V. On the other hand, .fit_transform(X) will return U * Sigma. If this is intended behaviour, I believe it should be detailed in the documentation.\r\n\r\n#### Steps/Code to Reproduce\r\n\r\nExample:\r\n```python\r\nimport numpy as np\r\nfrom sklearn.decomposition import TruncatedSVD\r\nfrom sklearn.random_projection import sparse_random_matrix\r\nX = sparse_random_matrix(100, 100, density=0.01, random_state=42)\r\n\r\nx1 =  TruncatedSVD(n_components=5, n_iter=7, random_state=42).fit_transform(X)\r\nx2 =  TruncatedSVD(n_components=5, n_iter=7, random_state=42).fit_transform(X)\r\nx3 =  TruncatedSVD(n_components=5, n_iter=7, random_state=42).fit(X).transform(X)\r\n\r\nnp.linalg.norm(x1 - x2)  # >>> equals 0, as desired\r\nnp.linalg.norm(x1 - x3)  # >>> equals 0.0248, implies the result is different -- not desirable!\r\n```\r\n\r\n#### Expected Results\r\nWe should have np.linalg.norm(x1 - x3) == 0 to be True.\r\n\r\n#### Actual Results\r\nWe get that np.linalg.norm(x1 - x3) equals 0.0248, meaning that the result of .fit(X).transform(X) is different from .fit_transform(X).\r\n\r\n#### Versions\r\nPython deps:\r\n       pip: 19.2.3\r\nsetuptools: 41.2.0\r\n   sklearn: 0.21.3\r\n     numpy: 1.17.2\r\n     scipy: 1.3.1\r\n    Cython: None\r\n    pandas: 0.25.1\r\n",
      "issue_closed_at": "2020-10-06T20:47:11Z",
      "base_commit": "aa4a10dbfaee9fd52af06f0f0c8e8ae77f243ef6",
      "changes": [
        {
          "file": "sklearn/decomposition/_truncated_svd.py",
          "type": "function",
          "name": "__init__",
          "class_name": "TruncatedSVD",
          "code": "def __init__(self, n_components=2, *, algorithm=\"randomized\", n_iter=5,\n                 random_state=None, tol=0.):\n        self.algorithm = algorithm\n        self.n_components = n_components\n        self.n_iter = n_iter\n        self.random_state = random_state\n        self.tol = tol"
        },
        {
          "file": "sklearn/decomposition/_truncated_svd.py",
          "type": "function",
          "name": "fit",
          "class_name": "TruncatedSVD",
          "code": "def fit(self, X, y=None):\n        \"\"\"Fit LSI model on training data X.\n\n        Parameters\n        ----------\n        X : {array-like, sparse matrix} of shape (n_samples, n_features)\n            Training data.\n\n        y : Ignored\n\n        Returns\n        -------\n        self : object\n            Returns the transformer object.\n        \"\"\"\n        self.fit_transform(X)\n        return self"
        },
        {
          "file": "sklearn/decomposition/_truncated_svd.py",
          "type": "function",
          "name": "fit_transform",
          "class_name": "TruncatedSVD",
          "code": "def fit_transform(self, X, y=None):\n        \"\"\"Fit LSI model to X and perform dimensionality reduction on X.\n\n        Parameters\n        ----------\n        X : {array-like, sparse matrix} of shape (n_samples, n_features)\n            Training data.\n\n        y : Ignored\n\n        Returns\n        -------\n        X_new : ndarray of shape (n_samples, n_components)\n            Reduced version of X. This will always be a dense array.\n        \"\"\"\n        X = self._validate_data(X, accept_sparse=['csr', 'csc'],\n                                ensure_min_features=2)\n        random_state = check_random_state(self.random_state)\n\n        if self.algorithm == \"arpack\":\n            U, Sigma, VT = svds(X, k=self.n_components, tol=self.tol)\n            # svds doesn't abide by scipy.linalg.svd/randomized_svd\n            # conventions, so reverse its outputs.\n            Sigma = Sigma[::-1]\n            U, VT = svd_flip(U[:, ::-1], VT[::-1])\n\n        elif self.algorithm == \"randomized\":\n            k = self.n_components\n            n_features = X.shape[1]\n            if k >= n_features:\n                raise ValueError(\"n_components must be < n_features;\"\n                                 \" got %d >= %d\" % (k, n_features))\n            U, Sigma, VT = randomized_svd(X, self.n_components,\n                                          n_iter=self.n_iter,\n                                          random_state=random_state)\n        else:\n            raise ValueError(\"unknown algorithm %r\" % self.algorithm)\n\n        self.components_ = VT\n\n        # Calculate explained variance & explained variance ratio\n        X_transformed = U * Sigma\n        self.explained_variance_ = exp_var = np.var(X_transformed, axis=0)\n        if sp.issparse(X):\n            _, full_var = mean_variance_axis(X, axis=0)\n            full_var = full_var.sum()\n        else:\n            full_var = np.var(X, axis=0).sum()\n        self.explained_variance_ratio_ = exp_var / full_var\n        self.singular_values_ = Sigma  # Store the singular values.\n\n        return X_transformed"
        }
      ]
    }
  ]
}