{
  "Selected_candidate": {
    "pr_number": 5416,
    "pr_title": "[MRG+1] BUG: reset internal state of scaler before fitting",
    "pr_body": "Fixes #5408.\n",
    "issue_id": 5408,
    "issue_title": "Broken example: examples/svm/plot_rbf_parameters.py",
    "issue_body": "```\n--------------------------------------------------------------------------\nValueError                                Traceback (most recent call last)\n/home/le243287/dev/scikit-learn/examples/svm/plot_rbf_parameters.py in <module>()\n    117 scaler = StandardScaler()\n    118 X = scaler.fit_transform(X)\n--> 119 X_2d = scaler.fit_transform(X_2d)\n    120 \n    121 ##############################################################################\n\n/home/le243287/dev/scikit-learn/sklearn/base.pyc in fit_transform(self, X, y, **fit_params)\n    453         if y is None:\n    454             # fit method of arity 1 (unsupervised transformation)\n--> 455             return self.fit(X, **fit_params).transform(X)\n    456         else:\n    457             # fit method of arity 2 (supervised transformation)\n\n/home/le243287/dev/scikit-learn/sklearn/preprocessing/data.pyc in fit(self, X, y)\n    501         y: Passthrough for ``Pipeline`` compatibility.\n    502         \"\"\"\n--> 503         return self.partial_fit(X, y)\n    504 \n    505     def partial_fit(self, X, y=None):\n\n/home/le243287/dev/scikit-learn/sklearn/preprocessing/data.pyc in partial_fit(self, X, y)\n    565             self.mean_, self.var_, self.n_samples_seen_ = \\\n    566                 _incremental_mean_and_var(X, self.mean_, self.var_,\n--> 567                                           self.n_samples_seen_)\n    568 \n    569         if self.with_std:\n\n/home/le243287/dev/scikit-learn/sklearn/utils/extmath.pyc in _incremental_mean_and_var(X, last_mean, last_variance, last_sample_count)\n    730     updated_sample_count = last_sample_count + new_sample_count\n    731 \n--> 732     updated_mean = (last_sum + new_sum) / updated_sample_count\n    733 \n    734     if last_variance is None:\n\nValueError: operands could not be broadcast together with shapes (4,) (2,)\n```\n",
    "issue_closed_at": "2015-10-16T15:47:27Z",
    "base_commit": "2300bdd86503fad0130c1ab433e6ecce2dbff2f1",
    "changes": [
      {
        "file": "sklearn/preprocessing/data.py",
        "type": "function",
        "name": "data_range",
        "class_name": "MinMaxScaler",
        "code": "def data_range(self):\n        return self.data_range_"
      },
      {
        "file": "sklearn/preprocessing/data.py",
        "type": "function",
        "name": "__init__",
        "class_name": "OneHotEncoder",
        "code": "def __init__(self, n_values=\"auto\", categorical_features=\"all\",\n                 dtype=np.float, sparse=True, handle_unknown='error'):\n        self.n_values = n_values\n        self.categorical_features = categorical_features\n        self.dtype = dtype\n        self.sparse = sparse\n        self.handle_unknown = handle_unknown"
      },
      {
        "file": "sklearn/preprocessing/data.py",
        "type": "function",
        "name": "fit",
        "class_name": "OneHotEncoder",
        "code": "def fit(self, X, y=None):\n        \"\"\"Fit OneHotEncoder to X.\n\n        Parameters\n        ----------\n        X : array-like, shape [n_samples, n_feature]\n            Input array of type int.\n\n        Returns\n        -------\n        self\n        \"\"\"\n        self.fit_transform(X)\n        return self"
      },
      {
        "file": "sklearn/preprocessing/data.py",
        "type": "class",
        "name": "MaxAbsScaler",
        "code": "class MaxAbsScaler(BaseEstimator, TransformerMixin):\n    \"\"\"Scale each feature by its maximum absolute value.\n\n    This estimator scales and translates each feature individually such\n    that the maximal absolute value of each feature in the\n    training set will be 1.0. It does not shift/center the data, and\n    thus does not destroy any sparsity.\n\n    This scaler can also be applied to sparse CSR or CSC matrices.\n\n    Parameters\n    ----------\n    copy : boolean, optional, default is True\n        Set to False to perform inplace scaling and avoid a copy (if the input\n        is already a numpy array).\n\n    Attributes\n    ----------\n    scale_ : ndarray, shape (n_features,)\n        Per feature relative scaling of the data.\n\n    max_abs_ : ndarray, shape (n_features,)\n        Per feature maximum absolute value.\n\n    n_samples_seen_ : int\n        The number of samples processed by the estimator. Will be reset on\n        new calls to fit, but increments across ``partial_fit`` calls.\n    \"\"\"\n\n    def __init__(self, copy=True):\n        self.copy = copy\n\n    def fit(self, X, y=None):\n        \"\"\"Compute the maximum absolute value to be used for later scaling.\n\n        Parameters\n        ----------\n        X : {array-like, sparse matrix}, shape [n_samples, n_features]\n            The data used to compute the per-feature minimum and maximum\n            used for later scaling along the features axis.\n        \"\"\"\n        return self.partial_fit(X, y)\n\n    def partial_fit(self, X, y=None):\n        \"\"\"Online computation of max absolute value of X for later scaling.\n        All of X is processed as a single batch. This is intended for cases\n        when `fit` is not feasible due to very large number of `n_samples`\n        or because X is read from a continuous stream.\n\n        Parameters\n        ----------\n        X : {array-like, sparse matrix}, shape [n_samples_, n_features]\n            The data used to compute the mean and standard deviation\n            used for later scaling along the features axis.\n\n        y: Passthrough for ``Pipeline`` compatibility.\n        \"\"\"\n        X = check_array(X, accept_sparse=('csr', 'csc'), copy=self.copy,\n                        ensure_2d=False, estimator=self, dtype=FLOAT_DTYPES)\n\n        if X.ndim == 1:\n            warnings.warn(DEPRECATION_MSG_1D, DeprecationWarning)\n\n        if sparse.issparse(X):\n            mins, maxs = min_max_axis(X, axis=0)\n            max_abs = np.maximum(np.abs(mins), np.abs(maxs))\n        else:\n            max_abs = np.abs(X).max(axis=0)\n\n        # First pass\n        if not hasattr(self, 'n_samples_seen_'):\n            self.n_samples_seen_ = X.shape[0]\n        # Next passes\n        else:\n            max_abs = np.maximum(self.max_abs_, max_abs)\n            self.n_samples_seen_ += X.shape[0]\n\n        self.max_abs_ = max_abs\n        self.scale_ = _handle_zeros_in_scale(max_abs)\n        return self\n\n    def transform(self, X, y=None):\n        \"\"\"Scale the data\n\n        Parameters\n        ----------\n        X : {array-like, sparse matrix}\n            The data that should be scaled.\n        \"\"\"\n        check_is_fitted(self, 'scale_')\n        X = check_array(X, accept_sparse=('csr', 'csc'), copy=self.copy,\n                        ensure_2d=False, estimator=self, dtype=FLOAT_DTYPES)\n\n        if X.ndim == 1:\n            warnings.warn(DEPRECATION_MSG_1D, DeprecationWarning)\n\n        if sparse.issparse(X):\n            if X.shape[0] == 1:\n                inplace_row_scale(X, 1.0 / self.scale_)\n            else:\n                inplace_column_scale(X, 1.0 / self.scale_)\n        else:\n            X /= self.scale_\n        return X\n\n    def inverse_transform(self, X):\n        \"\"\"Scale back the data to the original representation\n\n        Parameters\n        ----------\n        X : {array-like, sparse matrix}\n            The data that should be transformed back.\n        \"\"\"\n        check_is_fitted(self, 'scale_')\n        X = check_array(X, accept_sparse=('csr', 'csc'), copy=self.copy,\n                        ensure_2d=False, estimator=self, dtype=FLOAT_DTYPES)\n        if X.ndim == 1:\n            warnings.warn(DEPRECATION_MSG_1D, DeprecationWarning)\n\n        if sparse.issparse(X):\n            if X.shape[0] == 1:\n                inplace_row_scale(X, self.scale_)\n            else:\n                inplace_column_scale(X, self.scale_)\n        else:\n            X *= self.scale_\n        return X"
      }
    ]
  },
  "Justification": "Candidate D addresses a user-facing issue with model fitting and transformation processes, which aligns with the current bug's concern about how mixture models should conform to clusterer interfaces. Although both bugs are not identical, they share a focus on model compatibility and user functionality in the sklearn library. The structural aspect of the failures listed in the traceback may provide insights into ensuring correct behavior of the fit and transform methods, similar to how adjustments are needed for mixture models to behave more like clusterers. The fixing approach in Candidate D may yield applicable strategies for enhancing the Mixture Models' interface.",
  "instance_id": "scikit-learn__scikit-learn-11281",
  "repo": "scikit-learn/scikit-learn",
  "created_at": "2018-06-15T17:15:25Z",
  "problem_statement": "Should mixture models have a clusterer-compatible interface\nMixture models are currently a bit different. They are basically clusterers, except they are probabilistic, and are applied to inductive problems unlike many clusterers. But they are unlike clusterers in API:\r\n* they have an `n_components` parameter, with identical purpose to `n_clusters`\r\n* they do not store the `labels_` of the training data\r\n* they do not have a `fit_predict` method\r\n\r\nAnd they are almost entirely documented separately.\r\n\r\nShould we make the MMs more like clusterers?\n",
  "patch": "diff --git a/sklearn/mixture/base.py b/sklearn/mixture/base.py\n--- a/sklearn/mixture/base.py\n+++ b/sklearn/mixture/base.py\n@@ -172,7 +172,7 @@ def _initialize(self, X, resp):\n     def fit(self, X, y=None):\n         \"\"\"Estimate model parameters with the EM algorithm.\n \n-        The method fit the model `n_init` times and set the parameters with\n+        The method fits the model `n_init` times and set the parameters with\n         which the model has the largest likelihood or lower bound. Within each\n         trial, the method iterates between E-step and M-step for `max_iter`\n         times until the change of likelihood or lower bound is less than\n@@ -188,6 +188,32 @@ def fit(self, X, y=None):\n         -------\n         self\n         \"\"\"\n+        self.fit_predict(X, y)\n+        return self\n+\n+    def fit_predict(self, X, y=None):\n+        \"\"\"Estimate model parameters using X and predict the labels for X.\n+\n+        The method fits the model n_init times and sets the parameters with\n+        which the model has the largest likelihood or lower bound. Within each\n+        trial, the method iterates between E-step and M-step for `max_iter`\n+        times until the change of likelihood or lower bound is less than\n+        `tol`, otherwise, a `ConvergenceWarning` is raised. After fitting, it\n+        predicts the most probable label for the input data points.\n+\n+        .. versionadded:: 0.20\n+\n+        Parameters\n+        ----------\n+        X : array-like, shape (n_samples, n_features)\n+            List of n_features-dimensional data points. Each row\n+            corresponds to a single data point.\n+\n+        Returns\n+        -------\n+        labels : array, shape (n_samples,)\n+            Component labels.\n+        \"\"\"\n         X = _check_X(X, self.n_components, ensure_min_samples=2)\n         self._check_initial_parameters(X)\n \n@@ -240,7 +266,7 @@ def fit(self, X, y=None):\n         self._set_parameters(best_params)\n         self.n_iter_ = best_n_iter\n \n-        return self\n+        return log_resp.argmax(axis=1)\n \n     def _e_step(self, X):\n         \"\"\"E step.\n"
}