{
  "RepoName": "https://github.com/nalepae/pandarallel.git",
  "CommitSHA": "261a652cddb219ac353ff803e81646c08b72fc6f",
  "Time": "",
  "Difficulty": "Easy",
  "Type": "logic error",
  "BuggyCode": [
    {
      "path": "nalepae_pandarallel/setup.py",
      "content": "from setuptools import setup\n\nsetup()\n"
    },
    {
      "path": "nalepae_pandarallel/tests/test_pandarallel.py",
      "content": "import importlib\nimport math\n\nimport numpy as np\nimport pandas as pd\nimport pytest\nfrom pandarallel import pandarallel\n\n\n@pytest.fixture(params=(1000, 1))\ndef df_size(request):\n    return request.param\n\n\n@pytest.fixture(params=(False, True))\ndef progress_bar(request):\n    return request.param\n\n\n@pytest.fixture(params=(None, False))\ndef use_memory_fs(request):\n    return request.param\n\n\n@pytest.fixture(params=(RuntimeError, AttributeError, ZeroDivisionError))\ndef exception(request):\n    return request.param\n\n\n@pytest.fixture(params=(\"named\", \"anonymous\"))\ndef func_dataframe_apply_axis_0(request):\n    def func(x):\n        return max(x) - min(x)\n\n    return dict(named=func, anonymous=lambda x: max(x) - min(x))[request.param]\n\n\n@pytest.fixture(params=(\"named\", \"anonymous\"))\ndef func_dataframe_apply_axis_1(request):\n    def func(x):\n        return math.sin(x.a**2) + math.sin(x.b**2)\n\n    return dict(\n        named=func, anonymous=lambda x: math.sin(x.a**2) + math.sin(x.b**2)\n    )[request.param]\n\n\n@pytest.fixture(params=(\"named\", \"anonymous\"))\ndef func_dataframe_applymap(request):\n    def func(x):\n        return math.sin(x**2) - math.cos(x**2)\n\n    return dict(named=func, anonymous=lambda x: math.sin(x**2) - math.cos(x**2))[\n        request.param\n    ]\n\n\n@pytest.fixture(params=(\"named\", \"anonymous\"))\ndef func_series_map(request):\n    def func(x):\n        return math.log10(math.sqrt(math.exp(x**2)))\n\n    return dict(\n        named=func, anonymous=lambda x: math.log10(math.sqrt(math.exp(x**2)))\n    )[request.param]\n\n\n@pytest.fixture(params=(\"named\", \"anonymous\"))\ndef func_series_apply(request):\n    def func(x, power, bias=0):\n        return math.log10(math.sqrt(math.exp(x**power))) + bias\n\n    return dict(\n        named=func,\n        anonymous=lambda x, power, bias=0: math.log10(math.sqrt(math.exp(x**power)))\n        + bias,\n    
)[request.param]\n\n\n@pytest.fixture(params=(\"named\", \"anonymous\"))\ndef func_series_rolling_apply(request):\n    def func(x):\n        return x.iloc[0] + x.iloc[1] ** 2 + x.iloc[2] ** 3 + x.iloc[3] ** 4\n\n    return dict(\n        named=func,\n        anonymous=lambda x: x.iloc[0]\n        + x.iloc[1] ** 2\n        + x.iloc[2] ** 3\n        + x.iloc[3] ** 4,\n    )[request.param]\n\n\n@pytest.fixture()\ndef func_dataframe_groupby_apply():\n    def func(df):\n        dum = 0\n        for item in df.b:\n            dum += math.log10(math.sqrt(math.exp(item**2)))\n\n        return dum / len(df.b)\n\n    return func\n\n\n@pytest.fixture()\ndef func_dataframe_groupby_apply_complex():\n    def func(df):\n        return pd.DataFrame(\n            [[df.b.mean(), df.b.min(), df.b.max()]],\n            columns=[\"b_mean\", \"b_min\", \"b_max\"],\n        )\n\n    return func\n\n\n@pytest.fixture(params=(\"named\", \"anonymous\"))\ndef func_dataframe_groupby_rolling_apply(request):\n    def func(x):\n        return x.iloc[0] + x.iloc[1] ** 2 + x.iloc[2] ** 3 + x.iloc[3] ** 4\n\n    return dict(\n        named=func,\n        anonymous=lambda x: x.iloc[0]\n        + x.iloc[1] ** 2\n        + x.iloc[2] ** 3\n        + x.iloc[3] ** 4,\n    )[request.param]\n\n\n@pytest.fixture(params=(\"named\", \"anonymous\"))\ndef func_dataframe_groupby_expanding_apply(request):\n    def func(x):\n        return (x.multiply(pd.Series(range(1, len(x)), dtype=\"float\"))).sum()\n\n    return dict(\n        named=func,\n        anonymous=lambda x: (\n            x.multiply(pd.Series(range(1, len(x)), dtype=\"float\"))\n        ).sum(),\n    )[request.param]\n\n\n@pytest.fixture(params=(\"named\", \"anonymous\"))\ndef func_dataframe_apply_axis_0_no_reduce(request):\n    def func(x):\n        return x\n\n    return dict(named=func, anonymous=lambda x: x)[request.param]\n\n\n@pytest.fixture(params=(\"named\", \"anonymous\"))\ndef func_dataframe_apply_axis_1_no_reduce(request):\n    def 
func(x):\n        return x**2\n\n    return dict(named=func, anonymous=lambda x: x**2)[request.param]\n\n\n@pytest.fixture\ndef pandarallel_init(progress_bar, use_memory_fs):\n    pandarallel.initialize(\n        progress_bar=progress_bar, use_memory_fs=use_memory_fs, nb_workers=2\n    )\n\n\ndef test_dataframe_apply_invalid_function(pandarallel_init, exception):\n    def f(_):\n        raise exception\n\n    df = pd.DataFrame(dict(a=[1, 2, 3, 4]))\n\n    with pytest.raises(exception):\n        df.parallel_apply(f)\n\n\ndef test_dataframe_apply_axis_0(pandarallel_init, func_dataframe_apply_axis_0, df_size):\n    df = pd.DataFrame(\n        dict(\n            a=np.random.randint(1, 8, df_size),\n            b=np.random.rand(df_size),\n            c=np.random.randint(1, 8, df_size),\n            d=np.random.rand(df_size),\n            e=np.random.randint(1, 8, df_size),\n            f=np.random.rand(df_size),\n            g=np.random.randint(1, 8, df_size),\n            h=np.random.rand(df_size),\n        )\n    )\n    df.index = [item / 10 for item in df.index]\n\n    res = df.apply(func_dataframe_apply_axis_0)\n    res_parallel = df.parallel_apply(func_dataframe_apply_axis_0)\n    assert res.equals(res_parallel)\n\n\ndef test_dataframe_apply_axis_1(pandarallel_init, func_dataframe_apply_axis_1, df_size):\n    df = pd.DataFrame(\n        dict(a=np.random.randint(1, 8, df_size), b=np.random.rand(df_size))\n    )\n    df.index = [item / 10 for item in df.index]\n\n    res = df.apply(func_dataframe_apply_axis_1, axis=1)\n    res_parallel = df.parallel_apply(func_dataframe_apply_axis_1, axis=1)\n    assert res.equals(res_parallel)\n\n\ndef test_dataframe_apply_invalid_axis(pandarallel_init):\n    df = pd.DataFrame(dict(a=[1, 2, 3, 4]))\n\n    with pytest.raises(ValueError):\n        df.parallel_apply(lambda x: x, axis=\"invalid\")\n    \ndef test_empty_dataframe_apply_axis_0(pandarallel_init, func_dataframe_apply_axis_0):\n    df = pd.DataFrame()\n\n    res = 
df.apply(func_dataframe_apply_axis_0)\n    res_parallel = df.parallel_apply(func_dataframe_apply_axis_0)\n    assert res.equals(res_parallel)\n\ndef test_empty_dataframe_apply_axis_1(pandarallel_init, func_dataframe_apply_axis_1):\n    df = pd.DataFrame()\n\n    res = df.apply(func_dataframe_apply_axis_1)\n    res_parallel = df.parallel_apply(func_dataframe_apply_axis_1)\n    assert res.equals(res_parallel)\n\n\ndef test_dataframe_applymap(pandarallel_init, func_dataframe_applymap, df_size):\n    df = pd.DataFrame(\n        dict(a=np.random.randint(1, 8, df_size), b=np.random.rand(df_size))\n    )\n    df.index = [item / 10 for item in df.index]\n\n    res = df.applymap(func_dataframe_applymap)\n    res_parallel = df.parallel_applymap(func_dataframe_applymap)\n    assert res.equals(res_parallel)\n\n\ndef test_series_map(pandarallel_init, func_series_map, df_size):\n    df = pd.DataFrame(dict(a=np.random.rand(df_size) + 1))\n\n    res = df.a.map(func_series_map)\n    res_parallel = df.a.parallel_map(func_series_map)\n    assert res.equals(res_parallel)\n\n\ndef test_series_apply(pandarallel_init, func_series_apply, df_size):\n    df = pd.DataFrame(dict(a=np.random.rand(df_size) + 1))\n\n    res = df.a.apply(func_series_apply, args=(2,), bias=3)\n    res_parallel = df.a.parallel_apply(func_series_apply, args=(2,), bias=3)\n    assert res.equals(res_parallel)\n\ndef test_empty_series_apply(pandarallel_init, func_series_apply):\n    df = pd.DataFrame(dict(a=[]))\n\n    res = df.a.apply(func_series_apply, args=(2,), bias=3)\n    res_parallel = df.a.parallel_apply(func_series_apply, args=(2,), bias=3)\n    assert res.equals(res_parallel)\n\n\ndef test_series_rolling_apply(pandarallel_init, func_series_rolling_apply, df_size):\n    df = pd.DataFrame(dict(a=np.random.randint(1, 8, df_size), b=list(range(df_size))))\n\n    res = df.b.rolling(4).apply(func_series_rolling_apply, raw=False)\n    res_parallel = df.b.rolling(4).parallel_apply(func_series_rolling_apply, 
raw=False)\n\n    assert res.equals(res_parallel)\n\n\ndef test_dataframe_groupby_apply(\n    pandarallel_init, func_dataframe_groupby_apply, df_size\n):\n    df = pd.DataFrame(\n        dict(\n            a=np.random.randint(1, 8, df_size),\n            b=np.random.rand(df_size),\n            c=np.random.rand(df_size),\n        )\n    )\n\n    res = df.groupby(\"a\").apply(func_dataframe_groupby_apply)\n    res_parallel = df.groupby(\"a\").parallel_apply(func_dataframe_groupby_apply)\n    assert res.equals(res_parallel)\n\n    res = df.groupby([\"a\"]).apply(func_dataframe_groupby_apply)\n    res_parallel = df.groupby([\"a\"]).parallel_apply(func_dataframe_groupby_apply)\n    assert res.equals(res_parallel)\n\n    res = df.groupby([\"a\", \"b\"]).apply(func_dataframe_groupby_apply)\n    res_parallel = df.groupby([\"a\", \"b\"]).parallel_apply(func_dataframe_groupby_apply)\n    assert res.equals(res_parallel)\n\n\ndef test_dataframe_groupby_apply_complex(\n    pandarallel_init, func_dataframe_groupby_apply_complex, df_size\n):\n    df = pd.DataFrame(\n        dict(a=np.random.randint(1, 100, df_size), b=np.random.rand(df_size))\n    )\n\n    res = df.groupby(\"a\").apply(func_dataframe_groupby_apply_complex)\n    res_parallel = df.groupby(\"a\").parallel_apply(func_dataframe_groupby_apply_complex)\n    res.equals(res_parallel)\n\n\ndef test_dataframe_groupby_rolling_apply(\n    pandarallel_init, func_dataframe_groupby_rolling_apply, df_size\n):\n    df = pd.DataFrame(\n        dict(a=np.random.randint(1, 10, df_size), b=np.random.rand(df_size))\n    )\n\n    res = (\n        df.groupby(\"a\")\n        .b.rolling(4)\n        .apply(func_dataframe_groupby_rolling_apply, raw=False)\n    )\n    res_parallel = (\n        df.groupby(\"a\")\n        .b.rolling(4)\n        .parallel_apply(func_dataframe_groupby_rolling_apply, raw=False)\n    )\n    assert res.equals(res_parallel)\n\n\ndef test_dataframe_groupby_expanding_apply(\n    pandarallel_init, 
func_dataframe_groupby_expanding_apply, df_size\n):\n    df = pd.DataFrame(\n        dict(a=np.random.randint(1, 10, df_size), b=np.random.rand(df_size))\n    )\n\n    res = (\n        df.groupby(\"a\")\n        .b.expanding()\n        .apply(func_dataframe_groupby_expanding_apply, raw=False)\n    )\n    res_parallel = (\n        df.groupby(\"a\")\n        .b.expanding()\n        .parallel_apply(func_dataframe_groupby_expanding_apply, raw=False)\n    )\n    res.equals(res_parallel)\n\n\ndef test_dataframe_axis_0_no_reduction(\n    pandarallel_init, func_dataframe_apply_axis_0_no_reduce, df_size\n):\n    df = pd.DataFrame(\n        dict(\n            a=np.random.randint(1, 10, df_size),\n            b=np.random.randint(1, 10, df_size),\n            c=np.random.randint(1, 10, df_size),\n        )\n    )\n    res = df.apply(func_dataframe_apply_axis_0_no_reduce)\n\n    res_parallel = df.parallel_apply(func_dataframe_apply_axis_0_no_reduce)\n\n    assert res.equals(res_parallel)\n\n\ndef test_dataframe_axis_1_no_reduction(\n    pandarallel_init, func_dataframe_apply_axis_1_no_reduce, df_size\n):\n    df = pd.DataFrame(\n        dict(\n            a=np.random.randint(1, 10, df_size),\n            b=np.random.randint(1, 10, df_size),\n            c=np.random.randint(1, 10, df_size),\n        )\n    )\n\n    res = df.apply(func_dataframe_apply_axis_1_no_reduce, axis=1)\n\n    res_parallel = df.parallel_apply(func_dataframe_apply_axis_1_no_reduce, axis=1)\n\n    assert res.equals(res_parallel)\n\ndef test_memory_fs_root_environment_variable(monkeypatch):\n    monkeypatch.setenv(\"MEMORY_FS_ROOT\", \"/test\")\n    from pandarallel import core\n    importlib.reload(core)\n\n    assert core.MEMORY_FS_ROOT == \"/test\"\n"
    },
    {
      "path": "nalepae_pandarallel/pandarallel/utils.py",
      "content": "import itertools\nfrom enum import Enum\nfrom typing import Any, Dict, List, Tuple\n\nimport pandas as pd\nfrom pandas import DataFrame, Index\n\n\ndef chunk(nb_item: int, nb_chunks: int, start_offset=0) -> List[slice]:\n    \"\"\"\n    Return `nb_chunks` slices of approximatively `nb_item / nb_chunks` each.\n\n    Parameters\n    ----------\n    nb_item : int\n        Total number of items\n\n    nb_chunks : int\n        Number of chunks to return\n\n    start_offset : int\n        Shift start of slice by this amount\n\n    Returns\n    -------\n    A list of slices\n\n    Examples\n    --------\n    >>> chunks = chunk(103, 4)\n    >>> chunks\n    [slice(0, 26, None), slice(26, 52, None), slice(52, 78, None), slice(78, 103, None)]\n    \"\"\"\n    if nb_item == 0:\n        return [slice(0)]\n    \n    if nb_item <= nb_chunks:\n        return [slice(max(0, idx - start_offset), idx + 1) for idx in range(nb_item)]\n\n    quotient = nb_item // nb_chunks\n    remainder = nb_item % nb_chunks\n\n    quotients = [quotient] * nb_chunks\n    remainders = [1] * remainder + [0] * (nb_chunks - remainder)\n\n    nb_elems_per_chunk = [\n        quotient + remainder for quotient, remainder in zip(quotients, remainders)\n    ]\n\n    accumulated = list(itertools.accumulate(nb_elems_per_chunk))\n    shifted_accumulated = accumulated.copy()\n    shifted_accumulated.insert(0, 0)\n    shifted_accumulated.pop()\n\n    return [\n        slice(max(0, begin - start_offset), end)\n        for begin, end in zip(shifted_accumulated, accumulated)\n    ]\n\n\ndef df_indexed_like(df: DataFrame, axes: List[Index]) -> bool:\n    \"\"\"\n    Returns whether a data frame is indexed in the way specified by the\n    provided axes.\n\n    Used by DataFrameGroupBy to determine whether a group has been modified.\n\n    Function adapted from pandas.core.groupby.ops._is_indexed_like\n\n    Parameters\n    ----------\n    df : DataFrame\n        The data frame in question\n\n    axes : 
List[Index]\n        The axes to which the data frame is compared\n\n    Returns\n    -------\n    Whether or not the data frame is indexed in the same wa as the axes.\n    \"\"\"\n    if isinstance(df, DataFrame):\n        return df.axes[0].equals(axes[0])\n\n    return False\n\n\ndef get_pandas_version() -> Tuple[int, int]:\n    major_str, minor_str, *_ = pd.__version__.split(\".\")\n    return int(major_str), int(minor_str)\n\n\ndef get_axis_int(user_defined_function_kwargs: Dict[str, Any]):\n    axis = user_defined_function_kwargs.get(\"axis\", 0)\n\n    if axis not in {0, 1, \"index\", \"columns\"}:\n        raise ValueError(f\"No axis named {axis} for object type DataFrame\")\n\n    return {0: 0, 1: 1, \"index\": 0, \"columns\": 1}[axis]\n\n\nclass WorkerStatus(int, Enum):\n    Running = 0\n    Success = 1\n    Error = 2\n"
    },
    {
      "path": "nalepae_pandarallel/pandarallel/core.py",
      "content": "import multiprocessing\nimport os\nimport pickle\nfrom itertools import count\nfrom multiprocessing.managers import SyncManager\nfrom pathlib import Path\nfrom tempfile import NamedTemporaryFile\nfrom typing import Any, Callable, Dict, Iterator, Optional, Tuple, Type, cast\n\nimport dill\nimport pandas as pd\nimport psutil\nfrom pandas.core.groupby import DataFrameGroupBy as PandaDataFrameGroupBy\nfrom pandas.core.window.expanding import ExpandingGroupby as PandasExpandingGroupby\nfrom pandas.core.window.rolling import RollingGroupby as PandasRollingGroupby\n\nfrom .data_types import (\n    DataFrame,\n    DataFrameGroupBy,\n    DataType,\n    ExpandingGroupBy,\n    RollingGroupBy,\n    Series,\n    SeriesRolling,\n)\nfrom .progress_bars import ProgressBarsType, get_progress_bars, progress_wrapper\nfrom .utils import WorkerStatus\n\nON_WINDOWS = os.name == \"nt\"\nCONTEXT = multiprocessing.get_context(\"spawn\" if ON_WINDOWS else \"fork\")\n\n# Root of Memory File System\nMEMORY_FS_ROOT = os.environ.get(\"MEMORY_FS_ROOT\", \"/dev/shm\")\n\n# By default, Pandarallel use all available CPUs\nNB_PHYSICAL_CORES = psutil.cpu_count(logical=False)\n\n# Prefix and suffix for files used with Memory File System\nPREFIX = \"pandarallel\"\nPREFIX_INPUT = f\"{PREFIX}_input_\"\nPREFIX_OUTPUT = f\"{PREFIX}_output_\"\nSUFFIX = \".pickle\"\n\n# We use these classes decorators pattern instead of the classic one because of this:\n# https://www.stevenengelhardt.com/2013/01/16/python-multiprocessing-module-and-closures/\n\n\nclass WrapWorkFunctionForFileSystem:\n    def __init__(\n        self,\n        work_function: Callable[\n            [Any, Callable, tuple, Dict[str, Any], Dict[str, Any]], Any\n        ],\n    ) -> None:\n        self.work_function = work_function\n\n    def __call__(\n        self,\n        input_file_path: Path,\n        output_file_path: Path,\n        progress_bars_type: ProgressBarsType,\n        worker_index: int,\n        
master_workers_queue: multiprocessing.Queue,\n        dilled_user_defined_function: bytes,\n        user_defined_function_args: tuple,\n        user_defined_function_kwargs: Dict[str, Any],\n        extra: Dict[str, Any],\n    ) -> None:\n        try:\n            # Load dataframe from input file\n            with input_file_path.open(\"rb\") as file_descriptor:\n                data = pickle.load(file_descriptor)\n\n            # Delete input file since we don't need it any more. It will free some RAM\n            # since the input file is stored into Shared Memory.\n            input_file_path.unlink()\n\n            data_size = len(data)\n            user_defined_function: Callable = dill.loads(dilled_user_defined_function)\n\n            progress_wrapped_user_defined_function = progress_wrapper(\n                user_defined_function, master_workers_queue, worker_index, data_size\n            )\n\n            used_user_defined_function = (\n                progress_wrapped_user_defined_function\n                if progress_bars_type\n                in (\n                    ProgressBarsType.InUserDefinedFunction,\n                    ProgressBarsType.InUserDefinedFunctionMultiplyByNumberOfColumns,\n                )\n                else user_defined_function\n            )\n\n            result = self.work_function(\n                data,\n                used_user_defined_function,\n                user_defined_function_args,\n                user_defined_function_kwargs,\n                extra,\n            )\n\n            with output_file_path.open(\"wb\") as file_descriptor:\n                pickle.dump(result, file_descriptor)\n\n            master_workers_queue.put((worker_index, WorkerStatus.Success, None))\n\n        except:\n            master_workers_queue.put((worker_index, WorkerStatus.Error, None))\n            raise\n\n\nclass WrapWorkFunctionForPipe:\n    def __init__(\n        self,\n        work_function: Callable[\n            [\n           
     Any,\n                Callable,\n                tuple,\n                Dict[str, Any],\n                Dict[str, Any],\n            ],\n            Any,\n        ],\n    ) -> None:\n        self.work_function = work_function\n\n    def __call__(\n        self,\n        data: Any,\n        progress_bars_type: ProgressBarsType,\n        worker_index: int,\n        master_workers_queue: multiprocessing.Queue,\n        dilled_user_defined_function: bytes,\n        user_defined_function_args: tuple,\n        user_defined_function_kwargs: Dict[str, Any],\n        extra: Dict[str, Any],\n    ) -> Any:\n        try:\n            data_size = len(data)\n            user_defined_function: Callable = dill.loads(dilled_user_defined_function)\n\n            progress_wrapped_user_defined_function = progress_wrapper(\n                user_defined_function, master_workers_queue, worker_index, data_size\n            )\n\n            used_user_defined_function = (\n                progress_wrapped_user_defined_function\n                if progress_bars_type\n                in (\n                    ProgressBarsType.InUserDefinedFunction,\n                    ProgressBarsType.InUserDefinedFunctionMultiplyByNumberOfColumns,\n                )\n                else user_defined_function\n            )\n\n            results = self.work_function(\n                data,\n                used_user_defined_function,\n                user_defined_function_args,\n                user_defined_function_kwargs,\n                extra,\n            )\n\n            master_workers_queue.put((worker_index, WorkerStatus.Success, None))\n\n            return results\n\n        except:\n            master_workers_queue.put((worker_index, WorkerStatus.Error, None))\n            raise\n\n\ndef wrap_reduce_function_for_file_system(\n    reduce_function: Callable[[Iterator, Dict[str, Any]], Any]\n) -> Callable[[Iterator[Path], Dict[str, Any]], Any]:\n    \"\"\"This wrapper transforms a `reduce` 
function which takes as input:\n    - A list of pandas Dataframe\n    - An user defined function\n    and which returns a pandas Dataframe, into a `reduct` function which takes as input:\n    - A list of paths where  pandas Dataframe are pickled\n    which returns a pandas Dataframe.\n    \"\"\"\n\n    def closure(output_file_paths: Iterator[Path], extra: Dict[str, Any]) -> Any:\n        def get_dataframe_and_delete_file(file_path: Path) -> Any:\n            with file_path.open(\"rb\") as file_descriptor:\n                data = pickle.load(file_descriptor)\n\n            file_path.unlink()\n            return data\n\n        dfs = (\n            get_dataframe_and_delete_file(output_file_path)\n            for output_file_path in output_file_paths\n        )\n\n        return reduce_function(dfs, extra)\n\n    return closure\n\n\ndef parallelize_with_memory_file_system(\n    nb_requested_workers: int,\n    data_type: Type[DataType],\n    progress_bars_type: ProgressBarsType,\n):\n    def closure(\n        data: Any,\n        user_defined_function: Callable,\n        *user_defined_function_args: tuple,\n        **user_defined_function_kwargs: Dict[str, Any],\n    ):\n        wrapped_work_function = WrapWorkFunctionForFileSystem(data_type.work)\n        wrapped_reduce_function = wrap_reduce_function_for_file_system(data_type.reduce)\n\n        chunks = list(\n            data_type.get_chunks(\n                nb_requested_workers,\n                data,\n                user_defined_function_kwargs=user_defined_function_kwargs,\n            )\n        )\n\n        nb_workers = len(chunks)\n\n        multiplicator_factor = (\n            len(cast(pd.DataFrame, data).columns)\n            if progress_bars_type\n            == ProgressBarsType.InUserDefinedFunctionMultiplyByNumberOfColumns\n            else 1\n        )\n\n        progresses_length = [len(chunk_) * multiplicator_factor for chunk_ in chunks]\n\n        work_extra = data_type.get_work_extra(data)\n        
reduce_extra = data_type.get_reduce_extra(data, user_defined_function_kwargs)\n\n        show_progress_bars = progress_bars_type != ProgressBarsType.No\n\n        progress_bars = get_progress_bars(progresses_length, show_progress_bars)\n        progresses = [0] * nb_workers\n        workers_status = [WorkerStatus.Running] * nb_workers\n\n        input_files = [\n            NamedTemporaryFile(\n                prefix=PREFIX_INPUT, suffix=SUFFIX, dir=MEMORY_FS_ROOT, delete=False\n            )\n            for _ in range(nb_workers)\n        ]\n\n        output_files = [\n            NamedTemporaryFile(\n                prefix=PREFIX_OUTPUT, suffix=SUFFIX, dir=MEMORY_FS_ROOT, delete=False\n            )\n            for _ in range(nb_workers)\n        ]\n\n        try:\n            for chunk, input_file in zip(chunks, input_files):\n                with Path(input_file.name).open(\"wb\") as file_descriptor:\n                    pickle.dump(chunk, file_descriptor)\n\n            dilled_user_defined_function = dill.dumps(user_defined_function)\n            manager: SyncManager = CONTEXT.Manager()\n            master_workers_queue = manager.Queue()\n\n            work_args_list = [\n                (\n                    Path(input_file.name),\n                    Path(output_file.name),\n                    progress_bars_type,\n                    worker_index,\n                    master_workers_queue,\n                    dilled_user_defined_function,\n                    user_defined_function_args,\n                    user_defined_function_kwargs,\n                    {\n                        **work_extra,\n                        **{\n                            \"master_workers_queue\": master_workers_queue,\n                            \"show_progress_bars\": show_progress_bars,\n                            \"worker_index\": worker_index,\n                        },\n                    },\n                )\n                for worker_index, (\n              
      input_file,\n                    output_file,\n                ) in enumerate(zip(input_files, output_files))\n            ]\n\n            pool = CONTEXT.Pool(nb_workers)\n            results_promise = pool.starmap_async(wrapped_work_function, work_args_list)\n\n            pool.close()\n\n            generation = count()\n\n            while any(\n                (\n                    worker_status == WorkerStatus.Running\n                    for worker_status in workers_status\n                )\n            ):\n                message: Tuple[int, WorkerStatus, Any] = master_workers_queue.get()\n                worker_index, worker_status, payload = message\n                workers_status[worker_index] = worker_status\n\n                if worker_status == WorkerStatus.Success:\n                    progresses[worker_index] = progresses_length[worker_index]\n                    progress_bars.update(progresses)\n                elif worker_status == WorkerStatus.Running:\n                    progress = cast(int, payload)\n                    progresses[worker_index] = progress\n\n                    if next(generation) % nb_workers == 0:\n                        progress_bars.update(progresses)\n                elif worker_status == WorkerStatus.Error:\n                    progress_bars.set_error(worker_index)\n                    progress_bars.update(progresses)\n\n            try:\n                return wrapped_reduce_function(\n                    (Path(output_file.name) for output_file in output_files),\n                    reduce_extra,\n                )\n            except EOFError:\n                # Loading the files failed, this most likely means that there\n                # was some error during processing and the files were never\n                # saved at all.\n                results_promise.get()\n\n                # If the above statement does not raise an exception, that\n                # means the multiprocessing went well and we want 
to re-raise\n                # the original EOFError.\n                raise\n\n        finally:\n            for output_file in output_files:\n                # When pandarallel stop supporting Python 3.7 and older, replace this\n                # try/except clause by:\n                # Path(output_file.name).unlink(missing_ok=True)\n                try:\n                    Path(output_file.name).unlink()\n                except FileNotFoundError:\n                    # Do nothing, this is the nominal case.\n                    pass\n\n    return closure\n\n\ndef parallelize_with_pipe(\n    nb_requested_workers: int,\n    data_type: Type[DataType],\n    progress_bars_type: ProgressBarsType,\n):\n    def closure(\n        data: Any,\n        user_defined_function: Callable,\n        *user_defined_function_args: tuple,\n        **user_defined_function_kwargs: Dict[str, Any],\n    ):\n        wrapped_work_function = WrapWorkFunctionForPipe(data_type.work)\n        dilled_user_defined_function = dill.dumps(user_defined_function)\n        manager: SyncManager = CONTEXT.Manager()\n        master_workers_queue = manager.Queue()\n\n        chunks = list(\n            data_type.get_chunks(\n                nb_requested_workers,\n                data,\n                user_defined_function_kwargs=user_defined_function_kwargs,\n            )\n        )\n\n        nb_workers = len(chunks)\n\n        multiplicator_factor = (\n            len(cast(pd.DataFrame, data).columns)\n            if progress_bars_type\n            == ProgressBarsType.InUserDefinedFunctionMultiplyByNumberOfColumns\n            else 1\n        )\n\n        progresses_length = [len(chunk_) * multiplicator_factor for chunk_ in chunks]\n\n        work_extra = data_type.get_work_extra(data)\n        reduce_extra = data_type.get_reduce_extra(data, user_defined_function_kwargs)\n\n        show_progress_bars = progress_bars_type != ProgressBarsType.No\n\n        progress_bars = 
get_progress_bars(progresses_length, show_progress_bars)\n        progresses = [0] * nb_workers\n        workers_status = [WorkerStatus.Running] * nb_workers\n\n        work_args_list = [\n            (\n                chunk,\n                progress_bars_type,\n                worker_index,\n                master_workers_queue,\n                dilled_user_defined_function,\n                user_defined_function_args,\n                user_defined_function_kwargs,\n                {\n                    **work_extra,\n                    **{\n                        \"master_workers_queue\": master_workers_queue,\n                        \"show_progress_bars\": show_progress_bars,\n                        \"worker_index\": worker_index,\n                    },\n                },\n            )\n            for worker_index, chunk in enumerate(chunks)\n        ]\n\n        pool = CONTEXT.Pool(nb_workers)\n        results_promise = pool.starmap_async(wrapped_work_function, work_args_list)\n        pool.close()\n\n        generation = count()\n\n        while any(\n            (worker_status == WorkerStatus.Running for worker_status in workers_status)\n        ):\n            message: Tuple[int, WorkerStatus, Any] = master_workers_queue.get()\n            worker_index, worker_status, payload = message\n            workers_status[worker_index] = worker_status\n\n            if worker_status == WorkerStatus.Success:\n                progresses[worker_index] = progresses_length[worker_index]\n                progress_bars.update(progresses)\n            elif worker_status == WorkerStatus.Running:\n                progress = cast(int, payload)\n                progresses[worker_index] = progress\n\n                if next(generation) % nb_workers == 0:\n                    progress_bars.update(progresses)\n            elif worker_status == WorkerStatus.Error:\n                progress_bars.set_error(worker_index)\n\n        results = results_promise.get()\n\n        
return data_type.reduce(results, reduce_extra)\n\n    return closure\n\n\nclass pandarallel:\n    @classmethod\n    def initialize(\n        cls,\n        shm_size_mb=None,\n        nb_workers=NB_PHYSICAL_CORES,\n        progress_bar=False,\n        verbose=2,\n        use_memory_fs: Optional[bool] = None,\n    ) -> None:\n        show_progress_bars = progress_bar\n        is_memory_fs_available = Path(MEMORY_FS_ROOT).exists()\n\n        use_memory_fs = (\n            use_memory_fs if use_memory_fs is not None else is_memory_fs_available\n        )\n\n        parallelize = (\n            parallelize_with_memory_file_system\n            if use_memory_fs\n            else parallelize_with_pipe\n        )\n\n        if use_memory_fs and not is_memory_fs_available:\n            raise SystemError(\"Memory file system is not available\")\n\n        if verbose >= 2:\n            print(f\"INFO: Pandarallel will run on {nb_workers} workers.\")\n\n            message = (\n                (\n                    \"INFO: Pandarallel will use Memory file system to transfer data \"\n                    \"between the main process and workers.\"\n                )\n                if use_memory_fs\n                else (\n                    \"INFO: Pandarallel will use standard multiprocessing data transfer \"\n                    \"(pipe) to transfer data between the main process and workers.\"\n                )\n            )\n\n            print(message)\n\n            if ON_WINDOWS and verbose >= 2:\n                print()\n                print(\n                    (\n                        \"WARNING: You are on Windows. 
If you detect any issue with \"\n                        \"pandarallel, be sure you checked out the Troubleshooting page:\"\n                    )\n                )\n                print(\"https://nalepae.github.io/pandarallel/troubleshooting/\")\n\n        progress_bars_in_user_defined_function = (\n            ProgressBarsType.InUserDefinedFunction\n            if show_progress_bars\n            else ProgressBarsType.No\n        )\n\n        progress_bars_in_user_defined_function_multiply_by_number_of_columns = (\n            ProgressBarsType.InUserDefinedFunctionMultiplyByNumberOfColumns\n            if show_progress_bars\n            else ProgressBarsType.No\n        )\n\n        progress_bars_in_work_function = (\n            ProgressBarsType.InWorkFunction\n            if show_progress_bars\n            else ProgressBarsType.No\n        )\n\n        # DataFrame\n        pd.DataFrame.parallel_apply = parallelize(\n            nb_workers, DataFrame.Apply, progress_bars_in_user_defined_function\n        )\n        pd.DataFrame.parallel_applymap = parallelize(\n            nb_workers,\n            DataFrame.ApplyMap,\n            progress_bars_in_user_defined_function_multiply_by_number_of_columns,\n        )\n\n        # DataFrame GroupBy\n        PandaDataFrameGroupBy.parallel_apply = parallelize(\n            nb_workers, DataFrameGroupBy.Apply, progress_bars_in_user_defined_function\n        )\n\n        # Expanding GroupBy\n        PandasExpandingGroupby.parallel_apply = parallelize(\n            nb_workers, ExpandingGroupBy.Apply, progress_bars_in_work_function\n        )\n\n        # Rolling GroupBy\n        PandasRollingGroupby.parallel_apply = parallelize(\n            nb_workers, RollingGroupBy.Apply, progress_bars_in_work_function\n        )\n\n        # Series\n        pd.Series.parallel_apply = parallelize(\n            nb_workers, Series.Apply, progress_bars_in_user_defined_function\n        )\n        pd.Series.parallel_map = 
parallelize(nb_workers, Series.Map, show_progress_bars)\n\n        # Series Rolling\n        pd.core.window.Rolling.parallel_apply = parallelize(\n            nb_workers, SeriesRolling.Apply, progress_bars_in_user_defined_function\n        )\n"
    },
    {
      "path": "nalepae_pandarallel/pandarallel/progress_bars.py",
      "content": "import multiprocessing\nimport os\nimport shutil\nimport sys\nfrom abc import ABC, abstractmethod\nfrom enum import Enum\nfrom itertools import count\nfrom time import time_ns\nfrom typing import Callable, List, Union\n\nfrom .utils import WorkerStatus\n\nINTERVAL_NS = 250_000_000  # 0.25 sec\nMINIMUM_TERMINAL_WIDTH = 72\n\n\nclass ProgressBarsType(int, Enum):\n    No = 0\n    InUserDefinedFunction = 1\n    InUserDefinedFunctionMultiplyByNumberOfColumns = 2\n    InWorkFunction = 3\n\n\nclass ProgressBars(ABC):\n    @abstractmethod\n    def __init__(self, maxs: List[int], show: bool) -> None:\n        ...\n\n    @abstractmethod\n    def update(self, values: List[int]) -> None:\n        ...\n\n    def set_error(self, index: int) -> None:\n        pass\n\n\nclass ProgressState:\n    def __init__(self, chunk_size: int) -> None:\n        self.last_put_iteration = 0\n        self.next_put_iteration = max(chunk_size // 100, 1)\n        self.last_put_time = time_ns()\n\n\ndef is_notebook_lab() -> bool:\n    try:\n        shell: str = get_ipython().__class__.__name__  # type: ignore\n\n        # Shell: Google Colab\n        # TerminalInteractiveShell: Terminal running IPython\n        # ZMQInteractiveShell: Jupyter notebook/lab or qtconsole\n        return shell in {\"Shell\", \"ZMQInteractiveShell\"}\n    except NameError:\n        # Probably standard Python interpreter\n        return False\n\n\nclass ProgressBarsConsole(ProgressBars):\n    def __init__(self, maxs: List[int], show: bool) -> None:\n        self.__show = show\n        self.__bars = [[0, max] for max in maxs]\n        self.__width = self.__get_width()\n\n        self.__lines = self.__update_lines()\n\n        if show:\n            sys.stdout.write(\"\\n\".join(self.__lines))\n            sys.stdout.flush()\n\n    def __get_width(self) -> int:\n        try:\n            columns = shutil.get_terminal_size().columns\n            return max(MINIMUM_TERMINAL_WIDTH, columns - 1)\n        except 
AttributeError:\n            # Python 2\n            pass\n\n        try:\n            columns = int(os.popen(\"stty size\", \"r\").read().split()[1])\n            return max(MINIMUM_TERMINAL_WIDTH, columns - 1)\n        except:\n            return MINIMUM_TERMINAL_WIDTH\n\n    def __remove_displayed_lines(self) -> None:\n        if len(self.__bars) >= 1:\n            sys.stdout.write(\"\\b\" * len(self.__lines[-1]))\n\n        if len(self.__bars) >= 2:\n            sys.stdout.write(\"\\033M\" * (len(self.__lines) - 1))\n\n        self.__lines = []\n\n    def __update_line(self, done: int, total: int) -> str:\n        if total == 0:\n            percent = 0\n        else:\n            percent = done / total\n        bar = (\":\" * int(percent * 40)).ljust(40, \" \")\n        percent = round(percent * 100, 2)\n        format = \" {percent:6.2f}% {bar:s} | {done:8d} / {total:8d} |\"\n        ret = format.format(percent=percent, bar=bar, done=done, total=total)\n        return ret[: self.__width].ljust(self.__width, \" \")\n\n    def __update_lines(self) -> List[str]:\n        return [self.__update_line(value, max) for value, max in self.__bars]\n\n    def update(self, values: List[int]) -> None:\n        \"\"\"Update a bar value.\n        Positional arguments:\n        values - The new values of each bar\n        \"\"\"\n        if not self.__show:\n            return\n\n        for index, value in enumerate(values):\n            self.__bars[index][0] = value\n\n        self.__remove_displayed_lines()\n        self.__lines = self.__update_lines()\n\n        sys.stdout.write(\"\\n\".join(self.__lines))\n        sys.stdout.flush()\n\n\nclass ProgressBarsNotebookLab(ProgressBars):\n    def __init__(self, maxs: List[int], show: bool) -> None:\n        \"\"\"Initialization.\n        Positional argument:\n        maxs - List containing the max value of each progress bar\n        \"\"\"\n        self.__show = show\n\n        if not show:\n            return\n\n        from 
IPython.display import display\n        from ipywidgets import HBox, IntProgress, Label, VBox\n\n        self.__bars = [\n            HBox(\n                [\n                    IntProgress(0, 0, max, description=\"{:.2f}%\".format(0)),\n                    Label(\"{} / {}\".format(0, max)),\n                ]\n            )\n            for max in maxs\n        ]\n\n        display(VBox(self.__bars))\n\n    def update(self, values: List[int]) -> None:\n        \"\"\"Update a bar value.\n        Positional arguments:\n        values - The new values of each bar\n        \"\"\"\n        if not self.__show:\n            return\n\n        for index, value in enumerate(values):\n            bar, label = self.__bars[index].children\n\n            label.value = \"{} / {}\".format(value, bar.max)\n            \n            bar.value = value\n\n            if value >= bar.max:\n                bar.bar_style = \"success\"\n\n            if bar.max != 0:\n                bar.description = \"{:.2f}%\".format(bar.value / bar.max * 100)\n\n    def set_error(self, index: int) -> None:\n        \"\"\"Set a bar on error\"\"\"\n        if not self.__show:\n            return\n\n        bar, _ = self.__bars[index].children\n        bar.bar_style = \"danger\"\n\n\ndef get_progress_bars(\n    maxs: List[int], show\n) -> Union[ProgressBarsNotebookLab, ProgressBarsConsole]:\n    return (\n        ProgressBarsNotebookLab(maxs, show)\n        if is_notebook_lab()\n        else ProgressBarsConsole(maxs, show)\n    )\n\n\ndef progress_wrapper(\n    user_defined_function: Callable,\n    master_workers_queue: multiprocessing.Queue,\n    index: int,\n    chunk_size: int,\n) -> Callable:\n    \"\"\"Wrap the function to apply in a function which monitor the part of work already\n    done.\n    \"\"\"\n    counter = count()\n    state = ProgressState(chunk_size)\n\n    def closure(*user_defined_function_args, **user_defined_functions_kwargs):\n        iteration = next(counter)\n\n        if 
iteration == state.next_put_iteration:\n            time_now = time_ns()\n            master_workers_queue.put_nowait((index, WorkerStatus.Running, iteration))\n\n            delta_t = time_now - state.last_put_time\n            delta_i = iteration - state.last_put_iteration\n\n            state.next_put_iteration += (\n                max(int((delta_i / delta_t) * INTERVAL_NS), 1) if delta_t != 0 else 1\n            )\n\n            state.last_put_iteration = iteration\n            state.last_put_time = time_now\n\n        return user_defined_function(\n            *user_defined_function_args, **user_defined_functions_kwargs\n        )\n\n    return closure\n"
    },
    {
      "path": "nalepae_pandarallel/pandarallel/__init__.py",
      "content": "from .core import pandarallel\n\n__version__ = \"1.6.5\"\n"
    },
    {
      "path": "nalepae_pandarallel/pandarallel/data_types/expanding_groupby.py",
      "content": "import multiprocessing\nfrom typing import Any, Callable, Dict, Iterable, Iterator, List, Tuple\n\nimport pandas as pd\nfrom pandas.core.window.expanding import ExpandingGroupby as PandasExpandingGroupby\n\nfrom ..utils import WorkerStatus, chunk, get_pandas_version\nfrom .generic import DataType\n\n\nclass ExpandingGroupBy:\n    class Apply(DataType):\n        @staticmethod\n        def get_chunks(\n            nb_workers: int, data: PandasExpandingGroupby, *args, **kwargs\n        ) -> Iterator[List[Tuple[int, pd.DataFrame]]]:\n            pandas_version = get_pandas_version()\n\n            nb_items = (\n                len(data._groupby) if pandas_version < (1, 3) else data._grouper.ngroups\n            )\n\n            chunks = chunk(nb_items, nb_workers)\n\n            iterator = (\n                iter(data._groupby)\n                if pandas_version < (1, 3)\n                else data._grouper.get_iterator(data.obj)\n            )\n\n            for chunk_ in chunks:\n                yield [next(iterator) for _ in range(chunk_.stop - chunk_.start)]\n\n        @staticmethod\n        def get_work_extra(data: PandasExpandingGroupby):\n            attributes = {\n                attribute: getattr(data, attribute) for attribute in data._attributes\n            }\n\n            return {\"attributes\": attributes}\n\n        @staticmethod\n        def work(\n            data: List[Tuple[int, pd.DataFrame]],\n            user_defined_function: Callable,\n            user_defined_function_args: tuple,\n            user_defined_function_kwargs: Dict[str, Any],\n            extra: Dict[str, Any],\n        ) -> List[pd.DataFrame]:\n            show_progress_bars: bool = extra[\"show_progress_bars\"]\n            master_workers_queue: multiprocessing.Queue = extra[\"master_workers_queue\"]\n            worker_index: int = extra[\"worker_index\"]\n\n            def compute_result(\n                iteration: int,\n                attributes: Dict[str, 
Any],\n                index: int,\n                df: pd.DataFrame,\n                user_defined_function: Callable,\n                user_defined_function_args: tuple,\n                user_defined_function_kwargs: Dict[str, Any],\n            ) -> pd.DataFrame:\n                item = df.expanding(**attributes).apply(\n                    user_defined_function,\n                    *user_defined_function_args,\n                    **user_defined_function_kwargs\n                )\n\n                item.index = pd.MultiIndex.from_product([[index], item.index])\n\n                if show_progress_bars:\n                    master_workers_queue.put_nowait(\n                        (worker_index, WorkerStatus.Running, iteration)\n                    )\n\n                return item\n\n            attributes = extra[\"attributes\"]\n\n            dfs = (\n                compute_result(\n                    iteration,\n                    attributes,\n                    index,\n                    df,\n                    user_defined_function,\n                    user_defined_function_args,\n                    user_defined_function_kwargs,\n                )\n                for iteration, (index, df) in enumerate(data)\n            )\n\n            return pd.concat(dfs)\n\n        @staticmethod\n        def reduce(datas: Iterable[pd.DataFrame], extra: Dict[str, Any]) -> pd.Series:\n            return pd.concat(datas, copy=False)\n"
    },
    {
      "path": "nalepae_pandarallel/pandarallel/data_types/series.py",
      "content": "from typing import Any, Callable, Dict, Iterable, Iterator\n\nimport pandas as pd\n\nfrom ..utils import chunk\nfrom .generic import DataType\n\n\nclass Series:\n    class Apply(DataType):\n        @staticmethod\n        def get_chunks(\n            nb_workers: int, data: pd.Series, **kwargs\n        ) -> Iterator[pd.Series]:\n            for chunk_ in chunk(data.size, nb_workers):\n                yield data.iloc[chunk_]\n\n        @staticmethod\n        def work(\n            data: pd.Series,\n            user_defined_function: Callable,\n            user_defined_function_args: tuple,\n            user_defined_function_kwargs: Dict[str, Any],\n            extra: Dict[str, Any],\n        ) -> pd.Series:\n            return data.apply(\n                user_defined_function,\n                *user_defined_function_args,\n                **user_defined_function_kwargs\n            )\n\n        @staticmethod\n        def reduce(datas: Iterable[pd.Series], extra: Dict[str, Any]) -> pd.Series:\n            return pd.concat(datas, copy=False)\n\n    class Map(DataType):\n        @staticmethod\n        def get_chunks(\n            nb_workers: int, data: pd.Series, **kwargs\n        ) -> Iterator[pd.Series]:\n            for chunk_ in chunk(data.size, nb_workers):\n                yield data.iloc[chunk_]\n\n        @staticmethod\n        def work(\n            data: pd.Series,\n            user_defined_function: Callable,\n            user_defined_function_args: tuple,\n            user_defined_function_kwargs: Dict[str, Any],\n            extra: Dict[str, Any],\n        ) -> pd.Series:\n            return data.map(\n                user_defined_function,\n                *user_defined_function_args,\n                **user_defined_function_kwargs\n            )\n\n        @staticmethod\n        def reduce(datas: Iterable[pd.Series], extra: Dict[str, Any]) -> pd.Series:\n            return pd.concat(datas, copy=False)\n"
    },
    {
      "path": "nalepae_pandarallel/pandarallel/data_types/series_rolling.py",
      "content": "from typing import Any, Callable, Dict, Iterable, Iterator\n\nimport pandas as pd\nfrom pandas.core.window.rolling import Rolling\n\nfrom ..utils import chunk\nfrom .generic import DataType\n\n\nclass SeriesRolling:\n    class Apply(DataType):\n        @staticmethod\n        def get_chunks(\n            nb_workers: int, rolling: Rolling, **kwargs\n        ) -> Iterator[pd.Series]:\n            chunks = chunk(rolling.obj.size, nb_workers, rolling.window)\n\n            for chunk_ in chunks:\n                yield rolling.obj[chunk_]\n\n        @staticmethod\n        def get_work_extra(data: Rolling) -> Dict[str, Any]:\n            return {\n                \"attributes\": {\n                    attribute: getattr(data, attribute)\n                    for attribute in data._attributes\n                }\n            }\n\n        @staticmethod\n        def work(\n            data: pd.Series,\n            user_defined_function: Callable,\n            user_defined_function_args: tuple,\n            user_defined_function_kwargs: Dict[str, Any],\n            extra: Dict[str, Any],\n        ) -> pd.Series:\n            attributes: Dict[str, Any] = extra[\"attributes\"]\n            worker_index: int = extra[\"worker_index\"]\n\n            result = data.rolling(**attributes).apply(\n                user_defined_function,\n                *user_defined_function_args,\n                **user_defined_function_kwargs\n            )\n\n            return result if worker_index == 0 else result[attributes[\"window\"] :]\n\n        @staticmethod\n        def reduce(datas: Iterable[pd.Series], extra: Dict[str, Any]) -> pd.Series:\n            return pd.concat(datas, copy=False)\n"
    },
    {
      "path": "nalepae_pandarallel/pandarallel/data_types/__init__.py",
      "content": "from .dataframe import DataFrame\nfrom .dataframe_groupby import DataFrameGroupBy\nfrom .expanding_groupby import ExpandingGroupBy\nfrom .rolling_groupby import RollingGroupBy\nfrom .generic import DataType\nfrom .series import Series\nfrom .series_rolling import SeriesRolling\n"
    },
    {
      "path": "nalepae_pandarallel/pandarallel/data_types/dataframe.py",
      "content": "from typing import Any, Callable, Dict, Iterable, Iterator\nfrom types import GeneratorType\n\nimport pandas as pd\n\nfrom ..utils import chunk, get_axis_int\nfrom .generic import DataType\n\n\nclass DataFrame:\n    class Apply(DataType):\n        @staticmethod\n        def get_chunks(\n            nb_workers: int, data: pd.DataFrame, **kwargs\n        ) -> Iterator[pd.DataFrame]:\n            user_defined_function_kwargs = kwargs[\"user_defined_function_kwargs\"]\n\n            axis_int = get_axis_int(user_defined_function_kwargs)\n            opposite_axis_int = 1 - axis_int\n\n            for chunk_ in chunk(data.shape[opposite_axis_int], nb_workers):\n                yield data.iloc[chunk_] if axis_int == 1 else data.iloc[:, chunk_]\n\n        @staticmethod\n        def work(\n            data: pd.DataFrame,\n            user_defined_function: Callable,\n            user_defined_function_args: tuple,\n            user_defined_function_kwargs: Dict[str, Any],\n            extra: Dict[str, Any],\n        ) -> pd.DataFrame:\n            return data.apply(\n                user_defined_function,\n                *user_defined_function_args,\n                **user_defined_function_kwargs,\n            )\n\n        @staticmethod\n        def get_reduce_extra(\n            data: Any, user_defined_function_kwargs: Dict[str, Any]\n        ) -> Dict[str, Any]:\n            return {\"axis\": get_axis_int(user_defined_function_kwargs)}\n\n        @staticmethod\n        def reduce(\n            datas: Iterable[pd.DataFrame], extra: Dict[str, Any]\n        ) -> pd.DataFrame:\n            if isinstance(datas, GeneratorType):\n                datas = list(datas)\n            axis = 0 if isinstance(datas[0], pd.Series) else 1 - extra[\"axis\"]\n            return pd.concat(datas, copy=False, axis=axis)\n\n    class ApplyMap(DataType):\n        @staticmethod\n        def get_chunks(\n            nb_workers: int, data: pd.DataFrame, **kwargs\n        ) -> 
Iterator[pd.DataFrame]:\n            for chunk_ in chunk(data.shape[0], nb_workers):\n                yield data.iloc[chunk_]\n\n        @staticmethod\n        def work(\n            data: pd.DataFrame,\n            user_defined_function: Callable,\n            user_defined_function_args: tuple,\n            user_defined_function_kwargs: Dict[str, Any],\n            extra: Dict[str, Any],\n        ) -> pd.DataFrame:\n            return data.applymap(user_defined_function)\n\n        @staticmethod\n        def reduce(\n            datas: Iterable[pd.DataFrame], extra: Dict[str, Any]\n        ) -> pd.DataFrame:\n            return pd.concat(datas, copy=False)\n"
    },
    {
      "path": "nalepae_pandarallel/pandarallel/data_types/generic.py",
      "content": "from abc import ABC, abstractmethod\nfrom typing import Any, Callable, Dict, Iterable, Iterator\n\n\nclass DataType(ABC):\n    @staticmethod\n    @abstractmethod\n    def get_chunks(nb_workers: int, data: Any, **kwargs) -> Iterator[Any]:\n        ...\n\n    @staticmethod\n    def get_work_extra(data: Any) -> Dict[str, Any]:\n        return dict()\n\n    @staticmethod\n    @abstractmethod\n    def work(\n        data: Any,\n        user_defined_function: Callable,\n        user_defined_function_args: tuple,\n        user_defined_function_kwargs: Dict[str, Any],\n        extra: Dict[str, Any],\n    ) -> Any:\n        ...\n\n    @staticmethod\n    def get_reduce_extra(\n        data: Any, user_defined_function_kwargs: Dict[str, Any]\n    ) -> Dict[str, Any]:\n        return dict()\n\n    @staticmethod\n    @abstractmethod\n    def reduce(datas: Iterable[Any], extra: Dict[str, Any]) -> Any:\n        ...\n"
    },
    {
      "path": "nalepae_pandarallel/pandarallel/data_types/rolling_groupby.py",
      "content": "import multiprocessing\nfrom typing import Any, Callable, Dict, Iterable, Iterator, List, Tuple\n\nimport pandas as pd\nfrom pandas.core.window.rolling import RollingGroupby as PandasRollingGroupby\n\nfrom ..utils import WorkerStatus, chunk, get_pandas_version\nfrom .generic import DataType\n\n\nclass RollingGroupBy:\n    class Apply(DataType):\n        @staticmethod\n        def get_chunks(\n            nb_workers: int, data: PandasRollingGroupby, *args, **kwargs\n        ) -> Iterator[List[Tuple[int, pd.DataFrame]]]:\n            pandas_version = get_pandas_version()\n\n            nb_items = (\n                len(data._groupby) if pandas_version < (1, 3) else data._grouper.ngroups\n            )\n\n            chunks = chunk(nb_items, nb_workers)\n\n            iterator = (\n                iter(data._groupby)\n                if pandas_version < (1, 3)\n                else data._grouper.get_iterator(data.obj)\n            )\n\n            for chunk_ in chunks:\n                yield [next(iterator) for _ in range(chunk_.stop - chunk_.start)]\n\n        @staticmethod\n        def get_work_extra(data: PandasRollingGroupby):\n            attributes = {\n                attribute: getattr(data, attribute) for attribute in data._attributes\n            }\n\n            return {\"attributes\": attributes}\n\n        @staticmethod\n        def work(\n            data: List[Tuple[int, pd.DataFrame]],\n            user_defined_function: Callable,\n            user_defined_function_args: tuple,\n            user_defined_function_kwargs: Dict[str, Any],\n            extra: Dict[str, Any],\n        ) -> List[pd.DataFrame]:\n            show_progress_bars: bool = extra[\"show_progress_bars\"]\n            master_workers_queue: multiprocessing.Queue = extra[\"master_workers_queue\"]\n            worker_index: int = extra[\"worker_index\"]\n\n            def compute_result(\n                iteration: int,\n                attributes: Dict[str, Any],\n     
           index: int,\n                df: pd.DataFrame,\n                user_defined_function: Callable,\n                user_defined_function_args: tuple,\n                user_defined_function_kwargs: Dict[str, Any],\n            ) -> pd.DataFrame:\n                item = df.rolling(**attributes).apply(\n                    user_defined_function,\n                    *user_defined_function_args,\n                    **user_defined_function_kwargs\n                )\n\n                item.index = pd.MultiIndex.from_product([[index], item.index])\n\n                if show_progress_bars:\n                    master_workers_queue.put_nowait(\n                        (worker_index, WorkerStatus.Running, iteration)\n                    )\n\n                return item\n\n            attributes = extra[\"attributes\"]\n\n            dfs = (\n                compute_result(\n                    iteration,\n                    attributes,\n                    index,\n                    df,\n                    user_defined_function,\n                    user_defined_function_args,\n                    user_defined_function_kwargs,\n                )\n                for iteration, (index, df) in enumerate(data)\n            )\n\n            return pd.concat(dfs)\n\n        @staticmethod\n        def reduce(datas: Iterable[pd.DataFrame], extra: Dict[str, Any]) -> pd.Series:\n            return pd.concat(datas, copy=False)\n"
    },
    {
      "path": "nalepae_pandarallel/pandarallel/data_types/dataframe_groupby.py",
      "content": "import itertools\nfrom typing import Any, Callable, Dict, Iterable, Iterator, List, Tuple, Union, cast\n\nimport pandas as pd\nfrom pandas.core.groupby.generic import DataFrameGroupBy as PandasDataFrameGroupBy\n\nfrom ..utils import chunk, df_indexed_like, get_pandas_version\nfrom .generic import DataType\n\n\nclass DataFrameGroupBy:\n    class Apply(DataType):\n        @staticmethod\n        def get_chunks(\n            nb_workers: int, dataframe_groupby: PandasDataFrameGroupBy, **kwargs\n        ) -> Iterator[List[Tuple[int, pd.DataFrame]]]:\n            chunks = chunk(dataframe_groupby.ngroups, nb_workers)\n            iterator = iter(dataframe_groupby)\n\n            for chunk_ in chunks:\n                yield [next(iterator) for _ in range(chunk_.stop - chunk_.start)]\n\n        @staticmethod\n        def work(\n            data: List[Tuple[int, pd.DataFrame]],\n            user_defined_function: Callable,\n            user_defined_function_args: tuple,\n            user_defined_function_kwargs: Dict[str, Any],\n            extra: Dict[str, Any],\n        ) -> List[Tuple[int, pd.DataFrame, bool]]:\n            def compute_result(\n                key: int, df: pd.DataFrame\n            ) -> Tuple[int, pd.DataFrame, bool]:\n                result = user_defined_function(\n                    df, *user_defined_function_args, **user_defined_function_kwargs\n                )\n                mutated = not df_indexed_like(result, df.axes)\n                return key, result, mutated\n\n            return [compute_result(key, df) for key, df in data]\n\n        @staticmethod\n        def get_reduce_extra(\n            data: PandasDataFrameGroupBy, user_defined_function_kwargs: Dict[str, Any]\n        ) -> Dict[str, Any]:\n            return {\"df_groupby\": data}\n\n        @staticmethod\n        def reduce(\n            datas: Iterable[List[Tuple[int, pd.DataFrame, bool]]], extra: Dict[str, Any]\n        ) -> pd.Series:\n            def 
get_args(\n                keys: List[int],\n                values: List[pd.DataFrame],\n                df_groupby: PandasDataFrameGroupBy,\n            ) -> Union[\n                Tuple[List[int], List[pd.DataFrame]],\n                Tuple[pd.DataFrame, List[int], List[pd.DataFrame]],\n                Tuple[pd.DataFrame, List[pd.DataFrame]],\n            ]:\n                pandas_version = get_pandas_version()\n\n                if pandas_version < (1, 3):\n                    return keys, values\n                elif pandas_version < (1, 4):\n                    return df_groupby._selected_obj, keys, values\n                else:\n                    return df_groupby._selected_obj, values\n\n            df_groupby: PandasDataFrameGroupBy = extra[\"df_groupby\"]\n\n            results = itertools.chain.from_iterable(datas)\n            keys, values, mutated = zip(*results)\n\n            keys = cast(List[int], keys)\n            values = cast(List[pd.DataFrame], values)\n            mutated = cast(List[bool], mutated)\n\n            args = get_args(keys, values, df_groupby)\n  \n            return df_groupby._wrap_applied_output(*args, not_indexed_same=mutated)\n"
    }
  ],
  "OriginCode": [
    {
      "path": "nalepae_pandarallel/setup.py",
      "content": "from setuptools import setup\n\nsetup()\n"
    },
    {
      "path": "nalepae_pandarallel/tests/test_pandarallel.py",
      "content": "import importlib\nimport math\n\nimport numpy as np\nimport pandas as pd\nimport pytest\nfrom pandarallel import pandarallel\n\n\n@pytest.fixture(params=(1000, 1))\ndef df_size(request):\n    return request.param\n\n\n@pytest.fixture(params=(False, True))\ndef progress_bar(request):\n    return request.param\n\n\n@pytest.fixture(params=(None, False))\ndef use_memory_fs(request):\n    return request.param\n\n\n@pytest.fixture(params=(RuntimeError, AttributeError, ZeroDivisionError))\ndef exception(request):\n    return request.param\n\n\n@pytest.fixture(params=(\"named\", \"anonymous\"))\ndef func_dataframe_apply_axis_0(request):\n    def func(x):\n        return max(x) - min(x)\n\n    return dict(named=func, anonymous=lambda x: max(x) - min(x))[request.param]\n\n\n@pytest.fixture(params=(\"named\", \"anonymous\"))\ndef func_dataframe_apply_axis_1(request):\n    def func(x):\n        return math.sin(x.a**2) + math.sin(x.b**2)\n\n    return dict(\n        named=func, anonymous=lambda x: math.sin(x.a**2) + math.sin(x.b**2)\n    )[request.param]\n\n\n@pytest.fixture(params=(\"named\", \"anonymous\"))\ndef func_dataframe_applymap(request):\n    def func(x):\n        return math.sin(x**2) - math.cos(x**2)\n\n    return dict(named=func, anonymous=lambda x: math.sin(x**2) - math.cos(x**2))[\n        request.param\n    ]\n\n\n@pytest.fixture(params=(\"named\", \"anonymous\"))\ndef func_series_map(request):\n    def func(x):\n        return math.log10(math.sqrt(math.exp(x**2)))\n\n    return dict(\n        named=func, anonymous=lambda x: math.log10(math.sqrt(math.exp(x**2)))\n    )[request.param]\n\n\n@pytest.fixture(params=(\"named\", \"anonymous\"))\ndef func_series_apply(request):\n    def func(x, power, bias=0):\n        return math.log10(math.sqrt(math.exp(x**power))) + bias\n\n    return dict(\n        named=func,\n        anonymous=lambda x, power, bias=0: math.log10(math.sqrt(math.exp(x**power)))\n        + bias,\n    
)[request.param]\n\n\n@pytest.fixture(params=(\"named\", \"anonymous\"))\ndef func_series_rolling_apply(request):\n    def func(x):\n        return x.iloc[0] + x.iloc[1] ** 2 + x.iloc[2] ** 3 + x.iloc[3] ** 4\n\n    return dict(\n        named=func,\n        anonymous=lambda x: x.iloc[0]\n        + x.iloc[1] ** 2\n        + x.iloc[2] ** 3\n        + x.iloc[3] ** 4,\n    )[request.param]\n\n\n@pytest.fixture()\ndef func_dataframe_groupby_apply():\n    def func(df):\n        dum = 0\n        for item in df.b:\n            dum += math.log10(math.sqrt(math.exp(item**2)))\n\n        return dum / len(df.b)\n\n    return func\n\n\n@pytest.fixture()\ndef func_dataframe_groupby_apply_complex():\n    def func(df):\n        return pd.DataFrame(\n            [[df.b.mean(), df.b.min(), df.b.max()]],\n            columns=[\"b_mean\", \"b_min\", \"b_max\"],\n        )\n\n    return func\n\n\n@pytest.fixture(params=(\"named\", \"anonymous\"))\ndef func_dataframe_groupby_rolling_apply(request):\n    def func(x):\n        return x.iloc[0] + x.iloc[1] ** 2 + x.iloc[2] ** 3 + x.iloc[3] ** 4\n\n    return dict(\n        named=func,\n        anonymous=lambda x: x.iloc[0]\n        + x.iloc[1] ** 2\n        + x.iloc[2] ** 3\n        + x.iloc[3] ** 4,\n    )[request.param]\n\n\n@pytest.fixture(params=(\"named\", \"anonymous\"))\ndef func_dataframe_groupby_expanding_apply(request):\n    def func(x):\n        return (x.multiply(pd.Series(range(1, len(x)), dtype=\"float\"))).sum()\n\n    return dict(\n        named=func,\n        anonymous=lambda x: (\n            x.multiply(pd.Series(range(1, len(x)), dtype=\"float\"))\n        ).sum(),\n    )[request.param]\n\n\n@pytest.fixture(params=(\"named\", \"anonymous\"))\ndef func_dataframe_apply_axis_0_no_reduce(request):\n    def func(x):\n        return x\n\n    return dict(named=func, anonymous=lambda x: x)[request.param]\n\n\n@pytest.fixture(params=(\"named\", \"anonymous\"))\ndef func_dataframe_apply_axis_1_no_reduce(request):\n    def 
func(x):\n        return x**2\n\n    return dict(named=func, anonymous=lambda x: x**2)[request.param]\n\n\n@pytest.fixture\ndef pandarallel_init(progress_bar, use_memory_fs):\n    pandarallel.initialize(\n        progress_bar=progress_bar, use_memory_fs=use_memory_fs, nb_workers=2\n    )\n\n\ndef test_dataframe_apply_invalid_function(pandarallel_init, exception):\n    def f(_):\n        raise exception\n\n    df = pd.DataFrame(dict(a=[1, 2, 3, 4]))\n\n    with pytest.raises(exception):\n        df.parallel_apply(f)\n\n\ndef test_dataframe_apply_axis_0(pandarallel_init, func_dataframe_apply_axis_0, df_size):\n    df = pd.DataFrame(\n        dict(\n            a=np.random.randint(1, 8, df_size),\n            b=np.random.rand(df_size),\n            c=np.random.randint(1, 8, df_size),\n            d=np.random.rand(df_size),\n            e=np.random.randint(1, 8, df_size),\n            f=np.random.rand(df_size),\n            g=np.random.randint(1, 8, df_size),\n            h=np.random.rand(df_size),\n        )\n    )\n    df.index = [item / 10 for item in df.index]\n\n    res = df.apply(func_dataframe_apply_axis_0)\n    res_parallel = df.parallel_apply(func_dataframe_apply_axis_0)\n    assert res.equals(res_parallel)\n\n\ndef test_dataframe_apply_axis_1(pandarallel_init, func_dataframe_apply_axis_1, df_size):\n    df = pd.DataFrame(\n        dict(a=np.random.randint(1, 8, df_size), b=np.random.rand(df_size))\n    )\n    df.index = [item / 10 for item in df.index]\n\n    res = df.apply(func_dataframe_apply_axis_1, axis=1)\n    res_parallel = df.parallel_apply(func_dataframe_apply_axis_1, axis=1)\n    assert res.equals(res_parallel)\n\n\ndef test_dataframe_apply_invalid_axis(pandarallel_init):\n    df = pd.DataFrame(dict(a=[1, 2, 3, 4]))\n\n    with pytest.raises(ValueError):\n        df.parallel_apply(lambda x: x, axis=\"invalid\")\n    \ndef test_empty_dataframe_apply_axis_0(pandarallel_init, func_dataframe_apply_axis_0):\n    df = pd.DataFrame()\n\n    res = 
df.apply(func_dataframe_apply_axis_0)\n    res_parallel = df.parallel_apply(func_dataframe_apply_axis_0)\n    assert res.equals(res_parallel)\n\ndef test_empty_dataframe_apply_axis_1(pandarallel_init, func_dataframe_apply_axis_1):\n    df = pd.DataFrame()\n\n    res = df.apply(func_dataframe_apply_axis_1)\n    res_parallel = df.parallel_apply(func_dataframe_apply_axis_1)\n    assert res.equals(res_parallel)\n\n\ndef test_dataframe_applymap(pandarallel_init, func_dataframe_applymap, df_size):\n    df = pd.DataFrame(\n        dict(a=np.random.randint(1, 8, df_size), b=np.random.rand(df_size))\n    )\n    df.index = [item / 10 for item in df.index]\n\n    res = df.applymap(func_dataframe_applymap)\n    res_parallel = df.parallel_applymap(func_dataframe_applymap)\n    assert res.equals(res_parallel)\n\n\ndef test_series_map(pandarallel_init, func_series_map, df_size):\n    df = pd.DataFrame(dict(a=np.random.rand(df_size) + 1))\n\n    res = df.a.map(func_series_map)\n    res_parallel = df.a.parallel_map(func_series_map)\n    assert res.equals(res_parallel)\n\n\ndef test_series_apply(pandarallel_init, func_series_apply, df_size):\n    df = pd.DataFrame(dict(a=np.random.rand(df_size) + 1))\n\n    res = df.a.apply(func_series_apply, args=(2,), bias=3)\n    res_parallel = df.a.parallel_apply(func_series_apply, args=(2,), bias=3)\n    assert res.equals(res_parallel)\n\ndef test_empty_series_apply(pandarallel_init, func_series_apply):\n    df = pd.DataFrame(dict(a=[]))\n\n    res = df.a.apply(func_series_apply, args=(2,), bias=3)\n    res_parallel = df.a.parallel_apply(func_series_apply, args=(2,), bias=3)\n    assert res.equals(res_parallel)\n\n\ndef test_series_rolling_apply(pandarallel_init, func_series_rolling_apply, df_size):\n    df = pd.DataFrame(dict(a=np.random.randint(1, 8, df_size), b=list(range(df_size))))\n\n    res = df.b.rolling(4).apply(func_series_rolling_apply, raw=False)\n    res_parallel = df.b.rolling(4).parallel_apply(func_series_rolling_apply, 
raw=False)\n\n    assert res.equals(res_parallel)\n\n\ndef test_dataframe_groupby_apply(\n    pandarallel_init, func_dataframe_groupby_apply, df_size\n):\n    df = pd.DataFrame(\n        dict(\n            a=np.random.randint(1, 8, df_size),\n            b=np.random.rand(df_size),\n            c=np.random.rand(df_size),\n        )\n    )\n\n    res = df.groupby(\"a\").apply(func_dataframe_groupby_apply)\n    res_parallel = df.groupby(\"a\").parallel_apply(func_dataframe_groupby_apply)\n    assert res.equals(res_parallel)\n\n    res = df.groupby([\"a\"]).apply(func_dataframe_groupby_apply)\n    res_parallel = df.groupby([\"a\"]).parallel_apply(func_dataframe_groupby_apply)\n    assert res.equals(res_parallel)\n\n    res = df.groupby([\"a\", \"b\"]).apply(func_dataframe_groupby_apply)\n    res_parallel = df.groupby([\"a\", \"b\"]).parallel_apply(func_dataframe_groupby_apply)\n    assert res.equals(res_parallel)\n\n\ndef test_dataframe_groupby_apply_complex(\n    pandarallel_init, func_dataframe_groupby_apply_complex, df_size\n):\n    df = pd.DataFrame(\n        dict(a=np.random.randint(1, 100, df_size), b=np.random.rand(df_size))\n    )\n\n    res = df.groupby(\"a\").apply(func_dataframe_groupby_apply_complex)\n    res_parallel = df.groupby(\"a\").parallel_apply(func_dataframe_groupby_apply_complex)\n    res.equals(res_parallel)\n\n\ndef test_dataframe_groupby_rolling_apply(\n    pandarallel_init, func_dataframe_groupby_rolling_apply, df_size\n):\n    df = pd.DataFrame(\n        dict(a=np.random.randint(1, 10, df_size), b=np.random.rand(df_size))\n    )\n\n    res = (\n        df.groupby(\"a\")\n        .b.rolling(4)\n        .apply(func_dataframe_groupby_rolling_apply, raw=False)\n    )\n    res_parallel = (\n        df.groupby(\"a\")\n        .b.rolling(4)\n        .parallel_apply(func_dataframe_groupby_rolling_apply, raw=False)\n    )\n    assert res.equals(res_parallel)\n\n\ndef test_dataframe_groupby_expanding_apply(\n    pandarallel_init, 
func_dataframe_groupby_expanding_apply, df_size\n):\n    df = pd.DataFrame(\n        dict(a=np.random.randint(1, 10, df_size), b=np.random.rand(df_size))\n    )\n\n    res = (\n        df.groupby(\"a\")\n        .b.expanding()\n        .apply(func_dataframe_groupby_expanding_apply, raw=False)\n    )\n    res_parallel = (\n        df.groupby(\"a\")\n        .b.expanding()\n        .parallel_apply(func_dataframe_groupby_expanding_apply, raw=False)\n    )\n    res.equals(res_parallel)\n\n\ndef test_dataframe_axis_0_no_reduction(\n    pandarallel_init, func_dataframe_apply_axis_0_no_reduce, df_size\n):\n    df = pd.DataFrame(\n        dict(\n            a=np.random.randint(1, 10, df_size),\n            b=np.random.randint(1, 10, df_size),\n            c=np.random.randint(1, 10, df_size),\n        )\n    )\n    res = df.apply(func_dataframe_apply_axis_0_no_reduce)\n\n    res_parallel = df.parallel_apply(func_dataframe_apply_axis_0_no_reduce)\n\n    assert res.equals(res_parallel)\n\n\ndef test_dataframe_axis_1_no_reduction(\n    pandarallel_init, func_dataframe_apply_axis_1_no_reduce, df_size\n):\n    df = pd.DataFrame(\n        dict(\n            a=np.random.randint(1, 10, df_size),\n            b=np.random.randint(1, 10, df_size),\n            c=np.random.randint(1, 10, df_size),\n        )\n    )\n\n    res = df.apply(func_dataframe_apply_axis_1_no_reduce, axis=1)\n\n    res_parallel = df.parallel_apply(func_dataframe_apply_axis_1_no_reduce, axis=1)\n\n    assert res.equals(res_parallel)\n\ndef test_memory_fs_root_environment_variable(monkeypatch):\n    monkeypatch.setenv(\"MEMORY_FS_ROOT\", \"/test\")\n    from pandarallel import core\n    importlib.reload(core)\n\n    assert core.MEMORY_FS_ROOT == \"/test\"\n"
    },
    {
      "path": "nalepae_pandarallel/pandarallel/utils.py",
      "content": "import itertools\nfrom enum import Enum\nfrom typing import Any, Dict, List, Tuple\n\nimport pandas as pd\nfrom pandas import DataFrame, Index\n\n\ndef chunk(nb_item: int, nb_chunks: int, start_offset=0) -> List[slice]:\n    \"\"\"\n    Return `nb_chunks` slices of approximatively `nb_item / nb_chunks` each.\n\n    Parameters\n    ----------\n    nb_item : int\n        Total number of items\n\n    nb_chunks : int\n        Number of chunks to return\n\n    start_offset : int\n        Shift start of slice by this amount\n\n    Returns\n    -------\n    A list of slices\n\n    Examples\n    --------\n    >>> chunks = chunk(103, 4)\n    >>> chunks\n    [slice(0, 26, None), slice(26, 52, None), slice(52, 78, None), slice(78, 103, None)]\n    \"\"\"\n    if nb_item == 0:\n        return [slice(0)]\n    \n    if nb_item <= nb_chunks:\n        return [slice(max(0, idx - start_offset), idx + 1) for idx in range(nb_item)]\n\n    quotient = nb_item // nb_chunks\n    remainder = nb_item % nb_chunks\n\n    quotients = [quotient] * nb_chunks\n    remainders = [1] * remainder + [0] * (nb_chunks - remainder)\n\n    nb_elems_per_chunk = [\n        quotient + remainder for quotient, remainder in zip(quotients, remainders)\n    ]\n\n    accumulated = list(itertools.accumulate(nb_elems_per_chunk))\n    shifted_accumulated = accumulated.copy()\n    shifted_accumulated.insert(0, 0)\n    shifted_accumulated.pop()\n\n    return [\n        slice(max(0, begin - start_offset), end)\n        for begin, end in zip(shifted_accumulated, accumulated)\n    ]\n\n\ndef df_indexed_like(df: DataFrame, axes: List[Index]) -> bool:\n    \"\"\"\n    Returns whether a data frame is indexed in the way specified by the\n    provided axes.\n\n    Used by DataFrameGroupBy to determine whether a group has been modified.\n\n    Function adapted from pandas.core.groupby.ops._is_indexed_like\n\n    Parameters\n    ----------\n    df : DataFrame\n        The data frame in question\n\n    axes : 
List[Index]\n        The axes to which the data frame is compared\n\n    Returns\n    -------\n    Whether or not the data frame is indexed in the same wa as the axes.\n    \"\"\"\n    if isinstance(df, DataFrame):\n        return df.axes[0].equals(axes[0])\n\n    return False\n\n\ndef get_pandas_version() -> Tuple[int, int]:\n    major_str, minor_str, *_ = pd.__version__.split(\".\")\n    return int(major_str), int(minor_str)\n\n\ndef get_axis_int(user_defined_function_kwargs: Dict[str, Any]):\n    axis = user_defined_function_kwargs.get(\"axis\", 0)\n\n    if axis not in {0, 1, \"index\", \"columns\"}:\n        raise ValueError(f\"No axis named {axis} for object type DataFrame\")\n\n    return {0: 0, 1: 1, \"index\": 0, \"columns\": 1}[axis]\n\n\nclass WorkerStatus(int, Enum):\n    Running = 0\n    Success = 1\n    Error = 2\n"
    },
    {
      "path": "nalepae_pandarallel/pandarallel/core.py",
      "content": "import multiprocessing\nimport os\nimport pickle\nfrom itertools import count\nfrom multiprocessing.managers import SyncManager\nfrom pathlib import Path\nfrom tempfile import NamedTemporaryFile\nfrom typing import Any, Callable, Dict, Iterator, Optional, Tuple, Type, cast\n\nimport dill\nimport pandas as pd\nimport psutil\nfrom pandas.core.groupby import DataFrameGroupBy as PandaDataFrameGroupBy\nfrom pandas.core.window.expanding import ExpandingGroupby as PandasExpandingGroupby\nfrom pandas.core.window.rolling import RollingGroupby as PandasRollingGroupby\n\nfrom .data_types import (\n    DataFrame,\n    DataFrameGroupBy,\n    DataType,\n    ExpandingGroupBy,\n    RollingGroupBy,\n    Series,\n    SeriesRolling,\n)\nfrom .progress_bars import ProgressBarsType, get_progress_bars, progress_wrapper\nfrom .utils import WorkerStatus\n\nON_WINDOWS = os.name == \"nt\"\nCONTEXT = multiprocessing.get_context(\"spawn\" if ON_WINDOWS else \"fork\")\n\n# Root of Memory File System\nMEMORY_FS_ROOT = os.environ.get(\"MEMORY_FS_ROOT\", \"/dev/shm\")\n\n# By default, Pandarallel use all available CPUs\nNB_PHYSICAL_CORES = psutil.cpu_count(logical=False)\n\n# Prefix and suffix for files used with Memory File System\nPREFIX = \"pandarallel\"\nPREFIX_INPUT = f\"{PREFIX}_input_\"\nPREFIX_OUTPUT = f\"{PREFIX}_output_\"\nSUFFIX = \".pickle\"\n\n# We use these classes decorators pattern instead of the classic one because of this:\n# https://www.stevenengelhardt.com/2013/01/16/python-multiprocessing-module-and-closures/\n\n\nclass WrapWorkFunctionForFileSystem:\n    def __init__(\n        self,\n        work_function: Callable[\n            [Any, Callable, tuple, Dict[str, Any], Dict[str, Any]], Any\n        ],\n    ) -> None:\n        self.work_function = work_function\n\n    def __call__(\n        self,\n        input_file_path: Path,\n        output_file_path: Path,\n        progress_bars_type: ProgressBarsType,\n        worker_index: int,\n        
master_workers_queue: multiprocessing.Queue,\n        dilled_user_defined_function: bytes,\n        user_defined_function_args: tuple,\n        user_defined_function_kwargs: Dict[str, Any],\n        extra: Dict[str, Any],\n    ) -> None:\n        try:\n            # Load dataframe from input file\n            with input_file_path.open(\"rb\") as file_descriptor:\n                data = pickle.load(file_descriptor)\n\n            # Delete input file since we don't need it any more. It will free some RAM\n            # since the input file is stored into Shared Memory.\n            input_file_path.unlink()\n\n            data_size = len(data)\n            user_defined_function: Callable = dill.loads(dilled_user_defined_function)\n\n            progress_wrapped_user_defined_function = progress_wrapper(\n                user_defined_function, master_workers_queue, worker_index, data_size\n            )\n\n            used_user_defined_function = (\n                progress_wrapped_user_defined_function\n                if progress_bars_type\n                in (\n                    ProgressBarsType.InUserDefinedFunction,\n                    ProgressBarsType.InUserDefinedFunctionMultiplyByNumberOfColumns,\n                )\n                else user_defined_function\n            )\n\n            result = self.work_function(\n                data,\n                used_user_defined_function,\n                user_defined_function_args,\n                user_defined_function_kwargs,\n                extra,\n            )\n\n            with output_file_path.open(\"wb\") as file_descriptor:\n                pickle.dump(result, file_descriptor)\n\n            master_workers_queue.put((worker_index, WorkerStatus.Success, None))\n\n        except:\n            master_workers_queue.put((worker_index, WorkerStatus.Error, None))\n            raise\n\n\nclass WrapWorkFunctionForPipe:\n    def __init__(\n        self,\n        work_function: Callable[\n            [\n           
     Any,\n                Callable,\n                tuple,\n                Dict[str, Any],\n                Dict[str, Any],\n            ],\n            Any,\n        ],\n    ) -> None:\n        self.work_function = work_function\n\n    def __call__(\n        self,\n        data: Any,\n        progress_bars_type: ProgressBarsType,\n        worker_index: int,\n        master_workers_queue: multiprocessing.Queue,\n        dilled_user_defined_function: bytes,\n        user_defined_function_args: tuple,\n        user_defined_function_kwargs: Dict[str, Any],\n        extra: Dict[str, Any],\n    ) -> Any:\n        try:\n            data_size = len(data)\n            user_defined_function: Callable = dill.loads(dilled_user_defined_function)\n\n            progress_wrapped_user_defined_function = progress_wrapper(\n                user_defined_function, master_workers_queue, worker_index, data_size\n            )\n\n            used_user_defined_function = (\n                progress_wrapped_user_defined_function\n                if progress_bars_type\n                in (\n                    ProgressBarsType.InUserDefinedFunction,\n                    ProgressBarsType.InUserDefinedFunctionMultiplyByNumberOfColumns,\n                )\n                else user_defined_function\n            )\n\n            results = self.work_function(\n                data,\n                used_user_defined_function,\n                user_defined_function_args,\n                user_defined_function_kwargs,\n                extra,\n            )\n\n            master_workers_queue.put((worker_index, WorkerStatus.Success, None))\n\n            return results\n\n        except:\n            master_workers_queue.put((worker_index, WorkerStatus.Error, None))\n            raise\n\n\ndef wrap_reduce_function_for_file_system(\n    reduce_function: Callable[[Iterator, Dict[str, Any]], Any]\n) -> Callable[[Iterator[Path], Dict[str, Any]], Any]:\n    \"\"\"This wrapper transforms a `reduce` 
function which takes as input:\n    - A list of pandas Dataframe\n    - An user defined function\n    and which returns a pandas Dataframe, into a `reduct` function which takes as input:\n    - A list of paths where  pandas Dataframe are pickled\n    which returns a pandas Dataframe.\n    \"\"\"\n\n    def closure(output_file_paths: Iterator[Path], extra: Dict[str, Any]) -> Any:\n        def get_dataframe_and_delete_file(file_path: Path) -> Any:\n            with file_path.open(\"rb\") as file_descriptor:\n                data = pickle.load(file_descriptor)\n\n            file_path.unlink()\n            return data\n\n        dfs = (\n            get_dataframe_and_delete_file(output_file_path)\n            for output_file_path in output_file_paths\n        )\n\n        return reduce_function(dfs, extra)\n\n    return closure\n\n\ndef parallelize_with_memory_file_system(\n    nb_requested_workers: int,\n    data_type: Type[DataType],\n    progress_bars_type: ProgressBarsType,\n):\n    def closure(\n        data: Any,\n        user_defined_function: Callable,\n        *user_defined_function_args: tuple,\n        **user_defined_function_kwargs: Dict[str, Any],\n    ):\n        wrapped_work_function = WrapWorkFunctionForFileSystem(data_type.work)\n        wrapped_reduce_function = wrap_reduce_function_for_file_system(data_type.reduce)\n\n        chunks = list(\n            data_type.get_chunks(\n                nb_requested_workers,\n                data,\n                user_defined_function_kwargs=user_defined_function_kwargs,\n            )\n        )\n\n        nb_workers = len(chunks)\n\n        multiplicator_factor = (\n            len(cast(pd.DataFrame, data).columns)\n            if progress_bars_type\n            == ProgressBarsType.InUserDefinedFunctionMultiplyByNumberOfColumns\n            else 1\n        )\n\n        progresses_length = [len(chunk_) * multiplicator_factor for chunk_ in chunks]\n\n        work_extra = data_type.get_work_extra(data)\n        
reduce_extra = data_type.get_reduce_extra(data, user_defined_function_kwargs)\n\n        show_progress_bars = progress_bars_type != ProgressBarsType.No\n\n        progress_bars = get_progress_bars(progresses_length, show_progress_bars)\n        progresses = [0] * nb_workers\n        workers_status = [WorkerStatus.Running] * nb_workers\n\n        input_files = [\n            NamedTemporaryFile(\n                prefix=PREFIX_INPUT, suffix=SUFFIX, dir=MEMORY_FS_ROOT, delete=False\n            )\n            for _ in range(nb_workers)\n        ]\n\n        output_files = [\n            NamedTemporaryFile(\n                prefix=PREFIX_OUTPUT, suffix=SUFFIX, dir=MEMORY_FS_ROOT, delete=False\n            )\n            for _ in range(nb_workers)\n        ]\n\n        try:\n            for chunk, input_file in zip(chunks, input_files):\n                with Path(input_file.name).open(\"wb\") as file_descriptor:\n                    pickle.dump(chunk, file_descriptor)\n\n            dilled_user_defined_function = dill.dumps(user_defined_function)\n            manager: SyncManager = CONTEXT.Manager()\n            master_workers_queue = manager.Queue()\n\n            work_args_list = [\n                (\n                    Path(input_file.name),\n                    Path(output_file.name),\n                    progress_bars_type,\n                    worker_index,\n                    master_workers_queue,\n                    dilled_user_defined_function,\n                    user_defined_function_args,\n                    user_defined_function_kwargs,\n                    {\n                        **work_extra,\n                        **{\n                            \"master_workers_queue\": master_workers_queue,\n                            \"show_progress_bars\": show_progress_bars,\n                            \"worker_index\": worker_index,\n                        },\n                    },\n                )\n                for worker_index, (\n              
      input_file,\n                    output_file,\n                ) in enumerate(zip(input_files, output_files))\n            ]\n\n            pool = CONTEXT.Pool(nb_workers)\n            results_promise = pool.starmap_async(wrapped_work_function, work_args_list)\n\n            pool.close()\n\n            generation = count()\n\n            while any(\n                (\n                    worker_status == WorkerStatus.Running\n                    for worker_status in workers_status\n                )\n            ):\n                message: Tuple[int, WorkerStatus, Any] = master_workers_queue.get()\n                worker_index, worker_status, payload = message\n                workers_status[worker_index] = worker_status\n\n                if worker_status == WorkerStatus.Success:\n                    progresses[worker_index] = progresses_length[worker_index]\n                    progress_bars.update(progresses)\n                elif worker_status == WorkerStatus.Running:\n                    progress = cast(int, payload)\n                    progresses[worker_index] = progress\n\n                    if next(generation) % nb_workers == 0:\n                        progress_bars.update(progresses)\n                elif worker_status == WorkerStatus.Error:\n                    progress_bars.set_error(worker_index)\n                    progress_bars.update(progresses)\n\n            try:\n                return wrapped_reduce_function(\n                    (Path(output_file.name) for output_file in output_files),\n                    reduce_extra,\n                )\n            except EOFError:\n                # Loading the files failed, this most likely means that there\n                # was some error during processing and the files were never\n                # saved at all.\n                results_promise.get()\n\n                # If the above statement does not raise an exception, that\n                # means the multiprocessing went well and we want 
to re-raise\n                # the original EOFError.\n                raise\n\n        finally:\n            for output_file in output_files:\n                # When pandarallel stop supporting Python 3.7 and older, replace this\n                # try/except clause by:\n                # Path(output_file.name).unlink(missing_ok=True)\n                try:\n                    Path(output_file.name).unlink()\n                except FileNotFoundError:\n                    # Do nothing, this is the nominal case.\n                    pass\n\n    return closure\n\n\ndef parallelize_with_pipe(\n    nb_requested_workers: int,\n    data_type: Type[DataType],\n    progress_bars_type: ProgressBarsType,\n):\n    def closure(\n        data: Any,\n        user_defined_function: Callable,\n        *user_defined_function_args: tuple,\n        **user_defined_function_kwargs: Dict[str, Any],\n    ):\n        wrapped_work_function = WrapWorkFunctionForPipe(data_type.work)\n        dilled_user_defined_function = dill.dumps(user_defined_function)\n        manager: SyncManager = CONTEXT.Manager()\n        master_workers_queue = manager.Queue()\n\n        chunks = list(\n            data_type.get_chunks(\n                nb_requested_workers,\n                data,\n                user_defined_function_kwargs=user_defined_function_kwargs,\n            )\n        )\n\n        nb_workers = len(chunks)\n\n        multiplicator_factor = (\n            len(cast(pd.DataFrame, data).columns)\n            if progress_bars_type\n            == ProgressBarsType.InUserDefinedFunctionMultiplyByNumberOfColumns\n            else 1\n        )\n\n        progresses_length = [len(chunk_) * multiplicator_factor for chunk_ in chunks]\n\n        work_extra = data_type.get_work_extra(data)\n        reduce_extra = data_type.get_reduce_extra(data, user_defined_function_kwargs)\n\n        show_progress_bars = progress_bars_type != ProgressBarsType.No\n\n        progress_bars = 
get_progress_bars(progresses_length, show_progress_bars)\n        progresses = [0] * nb_workers\n        workers_status = [WorkerStatus.Running] * nb_workers\n\n        work_args_list = [\n            (\n                chunk,\n                progress_bars_type,\n                worker_index,\n                master_workers_queue,\n                dilled_user_defined_function,\n                user_defined_function_args,\n                user_defined_function_kwargs,\n                {\n                    **work_extra,\n                    **{\n                        \"master_workers_queue\": master_workers_queue,\n                        \"show_progress_bars\": show_progress_bars,\n                        \"worker_index\": worker_index,\n                    },\n                },\n            )\n            for worker_index, chunk in enumerate(chunks)\n        ]\n\n        pool = CONTEXT.Pool(nb_workers)\n        results_promise = pool.starmap_async(wrapped_work_function, work_args_list)\n        pool.close()\n\n        generation = count()\n\n        while any(\n            (worker_status == WorkerStatus.Running for worker_status in workers_status)\n        ):\n            message: Tuple[int, WorkerStatus, Any] = master_workers_queue.get()\n            worker_index, worker_status, payload = message\n            workers_status[worker_index] = worker_status\n\n            if worker_status == WorkerStatus.Success:\n                progresses[worker_index] = progresses_length[worker_index]\n                progress_bars.update(progresses)\n            elif worker_status == WorkerStatus.Running:\n                progress = cast(int, payload)\n                progresses[worker_index] = progress\n\n                if next(generation) % nb_workers == 0:\n                    progress_bars.update(progresses)\n            elif worker_status == WorkerStatus.Error:\n                progress_bars.set_error(worker_index)\n\n        results = results_promise.get()\n\n        
return data_type.reduce(results, reduce_extra)\n\n    return closure\n\n\nclass pandarallel:\n    @classmethod\n    def initialize(\n        cls,\n        shm_size_mb=None,\n        nb_workers=NB_PHYSICAL_CORES,\n        progress_bar=False,\n        verbose=2,\n        use_memory_fs: Optional[bool] = None,\n    ) -> None:\n        show_progress_bars = progress_bar\n        is_memory_fs_available = Path(MEMORY_FS_ROOT).exists()\n\n        use_memory_fs = (\n            use_memory_fs if use_memory_fs is not None else is_memory_fs_available\n        )\n\n        parallelize = (\n            parallelize_with_memory_file_system\n            if use_memory_fs\n            else parallelize_with_pipe\n        )\n\n        if use_memory_fs and not is_memory_fs_available:\n            raise SystemError(\"Memory file system is not available\")\n\n        if verbose >= 2:\n            print(f\"INFO: Pandarallel will run on {nb_workers} workers.\")\n\n            message = (\n                (\n                    \"INFO: Pandarallel will use Memory file system to transfer data \"\n                    \"between the main process and workers.\"\n                )\n                if use_memory_fs\n                else (\n                    \"INFO: Pandarallel will use standard multiprocessing data transfer \"\n                    \"(pipe) to transfer data between the main process and workers.\"\n                )\n            )\n\n            print(message)\n\n            if ON_WINDOWS and verbose >= 2:\n                print()\n                print(\n                    (\n                        \"WARNING: You are on Windows. 
If you detect any issue with \"\n                        \"pandarallel, be sure you checked out the Troubleshooting page:\"\n                    )\n                )\n                print(\"https://nalepae.github.io/pandarallel/troubleshooting/\")\n\n        progress_bars_in_user_defined_function = (\n            ProgressBarsType.InUserDefinedFunction\n            if show_progress_bars\n            else ProgressBarsType.No\n        )\n\n        progress_bars_in_user_defined_function_multiply_by_number_of_columns = (\n            ProgressBarsType.InUserDefinedFunctionMultiplyByNumberOfColumns\n            if show_progress_bars\n            else ProgressBarsType.No\n        )\n\n        progress_bars_in_work_function = (\n            ProgressBarsType.InWorkFunction\n            if show_progress_bars\n            else ProgressBarsType.No\n        )\n\n        # DataFrame\n        pd.DataFrame.parallel_apply = parallelize(\n            nb_workers, DataFrame.Apply, progress_bars_in_user_defined_function\n        )\n        pd.DataFrame.parallel_applymap = parallelize(\n            nb_workers,\n            DataFrame.ApplyMap,\n            progress_bars_in_user_defined_function_multiply_by_number_of_columns,\n        )\n\n        # DataFrame GroupBy\n        PandaDataFrameGroupBy.parallel_apply = parallelize(\n            nb_workers, DataFrameGroupBy.Apply, progress_bars_in_user_defined_function\n        )\n\n        # Expanding GroupBy\n        PandasExpandingGroupby.parallel_apply = parallelize(\n            nb_workers, ExpandingGroupBy.Apply, progress_bars_in_work_function\n        )\n\n        # Rolling GroupBy\n        PandasRollingGroupby.parallel_apply = parallelize(\n            nb_workers, RollingGroupBy.Apply, progress_bars_in_work_function\n        )\n\n        # Series\n        pd.Series.parallel_apply = parallelize(\n            nb_workers, Series.Apply, progress_bars_in_user_defined_function\n        )\n        pd.Series.parallel_map = 
parallelize(nb_workers, Series.Map, show_progress_bars)\n\n        # Series Rolling\n        pd.core.window.Rolling.parallel_apply = parallelize(\n            nb_workers, SeriesRolling.Apply, progress_bars_in_user_defined_function\n        )\n"
    },
    {
      "path": "nalepae_pandarallel/pandarallel/progress_bars.py",
      "content": "import multiprocessing\nimport os\nimport shutil\nimport sys\nfrom abc import ABC, abstractmethod\nfrom enum import Enum\nfrom itertools import count\nfrom time import time_ns\nfrom typing import Callable, List, Union\n\nfrom .utils import WorkerStatus\n\nINTERVAL_NS = 250_000_000  # 0.25 sec\nMINIMUM_TERMINAL_WIDTH = 72\n\n\nclass ProgressBarsType(int, Enum):\n    No = 0\n    InUserDefinedFunction = 1\n    InUserDefinedFunctionMultiplyByNumberOfColumns = 2\n    InWorkFunction = 3\n\n\nclass ProgressBars(ABC):\n    @abstractmethod\n    def __init__(self, maxs: List[int], show: bool) -> None:\n        ...\n\n    @abstractmethod\n    def update(self, values: List[int]) -> None:\n        ...\n\n    def set_error(self, index: int) -> None:\n        pass\n\n\nclass ProgressState:\n    def __init__(self, chunk_size: int) -> None:\n        self.last_put_iteration = 0\n        self.next_put_iteration = max(chunk_size // 100, 1)\n        self.last_put_time = time_ns()\n\n\ndef is_notebook_lab() -> bool:\n    try:\n        shell: str = get_ipython().__class__.__name__  # type: ignore\n\n        # Shell: Google Colab\n        # TerminalInteractiveShell: Terminal running IPython\n        # ZMQInteractiveShell: Jupyter notebook/lab or qtconsole\n        return shell in {\"Shell\", \"ZMQInteractiveShell\"}\n    except NameError:\n        # Probably standard Python interpreter\n        return False\n\n\nclass ProgressBarsConsole(ProgressBars):\n    def __init__(self, maxs: List[int], show: bool) -> None:\n        self.__show = show\n        self.__bars = [[0, max] for max in maxs]\n        self.__width = self.__get_width()\n\n        self.__lines = self.__update_lines()\n\n        if show:\n            sys.stdout.write(\"\\n\".join(self.__lines))\n            sys.stdout.flush()\n\n    def __get_width(self) -> int:\n        try:\n            columns = shutil.get_terminal_size().columns\n            return max(MINIMUM_TERMINAL_WIDTH, columns - 1)\n        except 
AttributeError:\n            # Python 2\n            pass\n\n        try:\n            columns = int(os.popen(\"stty size\", \"r\").read().split()[1])\n            return max(MINIMUM_TERMINAL_WIDTH, columns - 1)\n        except:\n            return MINIMUM_TERMINAL_WIDTH\n\n    def __remove_displayed_lines(self) -> None:\n        if len(self.__bars) >= 1:\n            sys.stdout.write(\"\\b\" * len(self.__lines[-1]))\n\n        if len(self.__bars) >= 2:\n            sys.stdout.write(\"\\033M\" * (len(self.__lines) - 1))\n\n        self.__lines = []\n\n    def __update_line(self, done: int, total: int) -> str:\n        if total == 0:\n            percent = 0\n        else:\n            percent = done / total\n        bar = (\":\" * int(percent * 40)).ljust(40, \" \")\n        percent = round(percent * 100, 2)\n        format = \" {percent:6.2f}% {bar:s} | {done:8d} / {total:8d} |\"\n        ret = format.format(percent=percent, bar=bar, done=done, total=total)\n        return ret[: self.__width].ljust(self.__width, \" \")\n\n    def __update_lines(self) -> List[str]:\n        return [self.__update_line(value, max) for value, max in self.__bars]\n\n    def update(self, values: List[int]) -> None:\n        \"\"\"Update a bar value.\n        Positional arguments:\n        values - The new values of each bar\n        \"\"\"\n        if not self.__show:\n            return\n\n        for index, value in enumerate(values):\n            self.__bars[index][0] = value\n\n        self.__remove_displayed_lines()\n        self.__lines = self.__update_lines()\n\n        sys.stdout.write(\"\\n\".join(self.__lines))\n        sys.stdout.flush()\n\n\nclass ProgressBarsNotebookLab(ProgressBars):\n    def __init__(self, maxs: List[int], show: bool) -> None:\n        \"\"\"Initialization.\n        Positional argument:\n        maxs - List containing the max value of each progress bar\n        \"\"\"\n        self.__show = show\n\n        if not show:\n            return\n\n        from 
IPython.display import display\n        from ipywidgets import HBox, IntProgress, Label, VBox\n\n        self.__bars = [\n            HBox(\n                [\n                    IntProgress(0, 0, max, description=\"{:.2f}%\".format(0)),\n                    Label(\"{} / {}\".format(0, max)),\n                ]\n            )\n            for max in maxs\n        ]\n\n        display(VBox(self.__bars))\n\n    def update(self, values: List[int]) -> None:\n        \"\"\"Update a bar value.\n        Positional arguments:\n        values - The new values of each bar\n        \"\"\"\n        if not self.__show:\n            return\n\n        for index, value in enumerate(values):\n            bar, label = self.__bars[index].children\n\n            label.value = \"{} / {}\".format(value, bar.max)\n            \n            bar.value = value\n\n            if value >= bar.max:\n                bar.bar_style = \"success\"\n\n            if bar.max != 0:\n                bar.description = \"{:.2f}%\".format(bar.value / bar.max * 100)\n\n    def set_error(self, index: int) -> None:\n        \"\"\"Set a bar on error\"\"\"\n        if not self.__show:\n            return\n\n        bar, _ = self.__bars[index].children\n        bar.bar_style = \"danger\"\n\n\ndef get_progress_bars(\n    maxs: List[int], show\n) -> Union[ProgressBarsNotebookLab, ProgressBarsConsole]:\n    return (\n        ProgressBarsNotebookLab(maxs, show)\n        if is_notebook_lab()\n        else ProgressBarsConsole(maxs, show)\n    )\n\n\ndef progress_wrapper(\n    user_defined_function: Callable,\n    master_workers_queue: multiprocessing.Queue,\n    index: int,\n    chunk_size: int,\n) -> Callable:\n    \"\"\"Wrap the function to apply in a function which monitor the part of work already\n    done.\n    \"\"\"\n    counter = count()\n    state = ProgressState(chunk_size)\n\n    def closure(*user_defined_function_args, **user_defined_functions_kwargs):\n        iteration = next(counter)\n\n        if 
iteration == state.next_put_iteration:\n            time_now = time_ns()\n            master_workers_queue.put_nowait((index, WorkerStatus.Running, iteration))\n\n            delta_t = time_now - state.last_put_time\n            delta_i = iteration - state.last_put_iteration\n\n            state.next_put_iteration += (\n                max(int((delta_i / delta_t) * INTERVAL_NS), 1) if delta_t != 0 else 1\n            )\n\n            state.last_put_iteration = iteration\n            state.last_put_time = time_now\n\n        return user_defined_function(\n            *user_defined_function_args, **user_defined_functions_kwargs\n        )\n\n    return closure\n"
    },
    {
      "path": "nalepae_pandarallel/pandarallel/__init__.py",
      "content": "from .core import pandarallel\n\n__version__ = \"1.6.5\"\n"
    },
    {
      "path": "nalepae_pandarallel/pandarallel/data_types/expanding_groupby.py",
      "content": "import multiprocessing\nfrom typing import Any, Callable, Dict, Iterable, Iterator, List, Tuple\n\nimport pandas as pd\nfrom pandas.core.window.expanding import ExpandingGroupby as PandasExpandingGroupby\n\nfrom ..utils import WorkerStatus, chunk, get_pandas_version\nfrom .generic import DataType\n\n\nclass ExpandingGroupBy:\n    class Apply(DataType):\n        @staticmethod\n        def get_chunks(\n            nb_workers: int, data: PandasExpandingGroupby, *args, **kwargs\n        ) -> Iterator[List[Tuple[int, pd.DataFrame]]]:\n            pandas_version = get_pandas_version()\n\n            nb_items = (\n                len(data._groupby) if pandas_version < (1, 3) else data._grouper.ngroups\n            )\n\n            chunks = chunk(nb_items, nb_workers)\n\n            iterator = (\n                iter(data._groupby)\n                if pandas_version < (1, 3)\n                else data._grouper.get_iterator(data.obj)\n            )\n\n            for chunk_ in chunks:\n                yield [next(iterator) for _ in range(chunk_.stop - chunk_.start)]\n\n        @staticmethod\n        def get_work_extra(data: PandasExpandingGroupby):\n            attributes = {\n                attribute: getattr(data, attribute) for attribute in data._attributes\n            }\n\n            return {\"attributes\": attributes}\n\n        @staticmethod\n        def work(\n            data: List[Tuple[int, pd.DataFrame]],\n            user_defined_function: Callable,\n            user_defined_function_args: tuple,\n            user_defined_function_kwargs: Dict[str, Any],\n            extra: Dict[str, Any],\n        ) -> List[pd.DataFrame]:\n            show_progress_bars: bool = extra[\"show_progress_bars\"]\n            master_workers_queue: multiprocessing.Queue = extra[\"master_workers_queue\"]\n            worker_index: int = extra[\"worker_index\"]\n\n            def compute_result(\n                iteration: int,\n                attributes: Dict[str, 
Any],\n                index: int,\n                df: pd.DataFrame,\n                user_defined_function: Callable,\n                user_defined_function_args: tuple,\n                user_defined_function_kwargs: Dict[str, Any],\n            ) -> pd.DataFrame:\n                item = df.expanding(**attributes).apply(\n                    user_defined_function,\n                    *user_defined_function_args,\n                    **user_defined_function_kwargs\n                )\n\n                item.index = pd.MultiIndex.from_product([[index], item.index])\n\n                if show_progress_bars:\n                    master_workers_queue.put_nowait(\n                        (worker_index, WorkerStatus.Running, iteration)\n                    )\n\n                return item\n\n            attributes = extra[\"attributes\"]\n            attributes.pop(\"_grouper\", None)\n\n            dfs = (\n                compute_result(\n                    iteration,\n                    attributes,\n                    index,\n                    df,\n                    user_defined_function,\n                    user_defined_function_args,\n                    user_defined_function_kwargs,\n                )\n                for iteration, (index, df) in enumerate(data)\n            )\n\n            return pd.concat(dfs)\n\n        @staticmethod\n        def reduce(datas: Iterable[pd.DataFrame], extra: Dict[str, Any]) -> pd.Series:\n            return pd.concat(datas, copy=False)\n"
    },
    {
      "path": "nalepae_pandarallel/pandarallel/data_types/series.py",
      "content": "from typing import Any, Callable, Dict, Iterable, Iterator\n\nimport pandas as pd\n\nfrom ..utils import chunk\nfrom .generic import DataType\n\n\nclass Series:\n    class Apply(DataType):\n        @staticmethod\n        def get_chunks(\n            nb_workers: int, data: pd.Series, **kwargs\n        ) -> Iterator[pd.Series]:\n            for chunk_ in chunk(data.size, nb_workers):\n                yield data.iloc[chunk_]\n\n        @staticmethod\n        def work(\n            data: pd.Series,\n            user_defined_function: Callable,\n            user_defined_function_args: tuple,\n            user_defined_function_kwargs: Dict[str, Any],\n            extra: Dict[str, Any],\n        ) -> pd.Series:\n            return data.apply(\n                user_defined_function,\n                *user_defined_function_args,\n                **user_defined_function_kwargs\n            )\n\n        @staticmethod\n        def reduce(datas: Iterable[pd.Series], extra: Dict[str, Any]) -> pd.Series:\n            return pd.concat(datas, copy=False)\n\n    class Map(DataType):\n        @staticmethod\n        def get_chunks(\n            nb_workers: int, data: pd.Series, **kwargs\n        ) -> Iterator[pd.Series]:\n            for chunk_ in chunk(data.size, nb_workers):\n                yield data.iloc[chunk_]\n\n        @staticmethod\n        def work(\n            data: pd.Series,\n            user_defined_function: Callable,\n            user_defined_function_args: tuple,\n            user_defined_function_kwargs: Dict[str, Any],\n            extra: Dict[str, Any],\n        ) -> pd.Series:\n            return data.map(\n                user_defined_function,\n                *user_defined_function_args,\n                **user_defined_function_kwargs\n            )\n\n        @staticmethod\n        def reduce(datas: Iterable[pd.Series], extra: Dict[str, Any]) -> pd.Series:\n            return pd.concat(datas, copy=False)\n"
    },
    {
      "path": "nalepae_pandarallel/pandarallel/data_types/series_rolling.py",
      "content": "from typing import Any, Callable, Dict, Iterable, Iterator\n\nimport pandas as pd\nfrom pandas.core.window.rolling import Rolling\n\nfrom ..utils import chunk\nfrom .generic import DataType\n\n\nclass SeriesRolling:\n    class Apply(DataType):\n        @staticmethod\n        def get_chunks(\n            nb_workers: int, rolling: Rolling, **kwargs\n        ) -> Iterator[pd.Series]:\n            chunks = chunk(rolling.obj.size, nb_workers, rolling.window)\n\n            for chunk_ in chunks:\n                yield rolling.obj[chunk_]\n\n        @staticmethod\n        def get_work_extra(data: Rolling) -> Dict[str, Any]:\n            return {\n                \"attributes\": {\n                    attribute: getattr(data, attribute)\n                    for attribute in data._attributes\n                }\n            }\n\n        @staticmethod\n        def work(\n            data: pd.Series,\n            user_defined_function: Callable,\n            user_defined_function_args: tuple,\n            user_defined_function_kwargs: Dict[str, Any],\n            extra: Dict[str, Any],\n        ) -> pd.Series:\n            attributes: Dict[str, Any] = extra[\"attributes\"]\n            worker_index: int = extra[\"worker_index\"]\n\n            result = data.rolling(**attributes).apply(\n                user_defined_function,\n                *user_defined_function_args,\n                **user_defined_function_kwargs\n            )\n\n            return result if worker_index == 0 else result[attributes[\"window\"] :]\n\n        @staticmethod\n        def reduce(datas: Iterable[pd.Series], extra: Dict[str, Any]) -> pd.Series:\n            return pd.concat(datas, copy=False)\n"
    },
    {
      "path": "nalepae_pandarallel/pandarallel/data_types/__init__.py",
      "content": "from .dataframe import DataFrame\nfrom .dataframe_groupby import DataFrameGroupBy\nfrom .expanding_groupby import ExpandingGroupBy\nfrom .rolling_groupby import RollingGroupBy\nfrom .generic import DataType\nfrom .series import Series\nfrom .series_rolling import SeriesRolling\n"
    },
    {
      "path": "nalepae_pandarallel/pandarallel/data_types/dataframe.py",
      "content": "from typing import Any, Callable, Dict, Iterable, Iterator\nfrom types import GeneratorType\n\nimport pandas as pd\n\nfrom ..utils import chunk, get_axis_int\nfrom .generic import DataType\n\n\nclass DataFrame:\n    class Apply(DataType):\n        @staticmethod\n        def get_chunks(\n            nb_workers: int, data: pd.DataFrame, **kwargs\n        ) -> Iterator[pd.DataFrame]:\n            user_defined_function_kwargs = kwargs[\"user_defined_function_kwargs\"]\n\n            axis_int = get_axis_int(user_defined_function_kwargs)\n            opposite_axis_int = 1 - axis_int\n\n            for chunk_ in chunk(data.shape[opposite_axis_int], nb_workers):\n                yield data.iloc[chunk_] if axis_int == 1 else data.iloc[:, chunk_]\n\n        @staticmethod\n        def work(\n            data: pd.DataFrame,\n            user_defined_function: Callable,\n            user_defined_function_args: tuple,\n            user_defined_function_kwargs: Dict[str, Any],\n            extra: Dict[str, Any],\n        ) -> pd.DataFrame:\n            return data.apply(\n                user_defined_function,\n                *user_defined_function_args,\n                **user_defined_function_kwargs,\n            )\n\n        @staticmethod\n        def get_reduce_extra(\n            data: Any, user_defined_function_kwargs: Dict[str, Any]\n        ) -> Dict[str, Any]:\n            return {\"axis\": get_axis_int(user_defined_function_kwargs)}\n\n        @staticmethod\n        def reduce(\n            datas: Iterable[pd.DataFrame], extra: Dict[str, Any]\n        ) -> pd.DataFrame:\n            if isinstance(datas, GeneratorType):\n                datas = list(datas)\n            axis = 0 if isinstance(datas[0], pd.Series) else 1 - extra[\"axis\"]\n            return pd.concat(datas, copy=False, axis=axis)\n\n    class ApplyMap(DataType):\n        @staticmethod\n        def get_chunks(\n            nb_workers: int, data: pd.DataFrame, **kwargs\n        ) -> 
Iterator[pd.DataFrame]:\n            for chunk_ in chunk(data.shape[0], nb_workers):\n                yield data.iloc[chunk_]\n\n        @staticmethod\n        def work(\n            data: pd.DataFrame,\n            user_defined_function: Callable,\n            user_defined_function_args: tuple,\n            user_defined_function_kwargs: Dict[str, Any],\n            extra: Dict[str, Any],\n        ) -> pd.DataFrame:\n            return data.applymap(user_defined_function)\n\n        @staticmethod\n        def reduce(\n            datas: Iterable[pd.DataFrame], extra: Dict[str, Any]\n        ) -> pd.DataFrame:\n            return pd.concat(datas, copy=False)\n"
    },
    {
      "path": "nalepae_pandarallel/pandarallel/data_types/generic.py",
      "content": "from abc import ABC, abstractmethod\nfrom typing import Any, Callable, Dict, Iterable, Iterator\n\n\nclass DataType(ABC):\n    @staticmethod\n    @abstractmethod\n    def get_chunks(nb_workers: int, data: Any, **kwargs) -> Iterator[Any]:\n        ...\n\n    @staticmethod\n    def get_work_extra(data: Any) -> Dict[str, Any]:\n        return dict()\n\n    @staticmethod\n    @abstractmethod\n    def work(\n        data: Any,\n        user_defined_function: Callable,\n        user_defined_function_args: tuple,\n        user_defined_function_kwargs: Dict[str, Any],\n        extra: Dict[str, Any],\n    ) -> Any:\n        ...\n\n    @staticmethod\n    def get_reduce_extra(\n        data: Any, user_defined_function_kwargs: Dict[str, Any]\n    ) -> Dict[str, Any]:\n        return dict()\n\n    @staticmethod\n    @abstractmethod\n    def reduce(datas: Iterable[Any], extra: Dict[str, Any]) -> Any:\n        ...\n"
    },
    {
      "path": "nalepae_pandarallel/pandarallel/data_types/rolling_groupby.py",
      "content": "import multiprocessing\nfrom typing import Any, Callable, Dict, Iterable, Iterator, List, Tuple\n\nimport pandas as pd\nfrom pandas.core.window.rolling import RollingGroupby as PandasRollingGroupby\n\nfrom ..utils import WorkerStatus, chunk, get_pandas_version\nfrom .generic import DataType\n\n\nclass RollingGroupBy:\n    class Apply(DataType):\n        @staticmethod\n        def get_chunks(\n            nb_workers: int, data: PandasRollingGroupby, *args, **kwargs\n        ) -> Iterator[List[Tuple[int, pd.DataFrame]]]:\n            pandas_version = get_pandas_version()\n\n            nb_items = (\n                len(data._groupby) if pandas_version < (1, 3) else data._grouper.ngroups\n            )\n\n            chunks = chunk(nb_items, nb_workers)\n\n            iterator = (\n                iter(data._groupby)\n                if pandas_version < (1, 3)\n                else data._grouper.get_iterator(data.obj)\n            )\n\n            for chunk_ in chunks:\n                yield [next(iterator) for _ in range(chunk_.stop - chunk_.start)]\n\n        @staticmethod\n        def get_work_extra(data: PandasRollingGroupby):\n            attributes = {\n                attribute: getattr(data, attribute) for attribute in data._attributes\n            }\n\n            return {\"attributes\": attributes}\n\n        @staticmethod\n        def work(\n            data: List[Tuple[int, pd.DataFrame]],\n            user_defined_function: Callable,\n            user_defined_function_args: tuple,\n            user_defined_function_kwargs: Dict[str, Any],\n            extra: Dict[str, Any],\n        ) -> List[pd.DataFrame]:\n            show_progress_bars: bool = extra[\"show_progress_bars\"]\n            master_workers_queue: multiprocessing.Queue = extra[\"master_workers_queue\"]\n            worker_index: int = extra[\"worker_index\"]\n\n            def compute_result(\n                iteration: int,\n                attributes: Dict[str, Any],\n     
           index: int,\n                df: pd.DataFrame,\n                user_defined_function: Callable,\n                user_defined_function_args: tuple,\n                user_defined_function_kwargs: Dict[str, Any],\n            ) -> pd.DataFrame:\n                item = df.rolling(**attributes).apply(\n                    user_defined_function,\n                    *user_defined_function_args,\n                    **user_defined_function_kwargs\n                )\n\n                item.index = pd.MultiIndex.from_product([[index], item.index])\n\n                if show_progress_bars:\n                    master_workers_queue.put_nowait(\n                        (worker_index, WorkerStatus.Running, iteration)\n                    )\n\n                return item\n\n            attributes = extra[\"attributes\"]\n            attributes.pop(\"_grouper\", None)\n\n            dfs = (\n                compute_result(\n                    iteration,\n                    attributes,\n                    index,\n                    df,\n                    user_defined_function,\n                    user_defined_function_args,\n                    user_defined_function_kwargs,\n                )\n                for iteration, (index, df) in enumerate(data)\n            )\n\n            return pd.concat(dfs)\n\n        @staticmethod\n        def reduce(datas: Iterable[pd.DataFrame], extra: Dict[str, Any]) -> pd.Series:\n            return pd.concat(datas, copy=False)\n"
    },
    {
      "path": "nalepae_pandarallel/pandarallel/data_types/dataframe_groupby.py",
      "content": "import itertools\nfrom typing import Any, Callable, Dict, Iterable, Iterator, List, Tuple, Union, cast\n\nimport pandas as pd\nfrom pandas.core.groupby.generic import DataFrameGroupBy as PandasDataFrameGroupBy\n\nfrom ..utils import chunk, df_indexed_like, get_pandas_version\nfrom .generic import DataType\n\n\nclass DataFrameGroupBy:\n    class Apply(DataType):\n        @staticmethod\n        def get_chunks(\n            nb_workers: int, dataframe_groupby: PandasDataFrameGroupBy, **kwargs\n        ) -> Iterator[List[Tuple[int, pd.DataFrame]]]:\n            chunks = chunk(dataframe_groupby.ngroups, nb_workers)\n            iterator = iter(dataframe_groupby)\n\n            for chunk_ in chunks:\n                yield [next(iterator) for _ in range(chunk_.stop - chunk_.start)]\n\n        @staticmethod\n        def work(\n            data: List[Tuple[int, pd.DataFrame]],\n            user_defined_function: Callable,\n            user_defined_function_args: tuple,\n            user_defined_function_kwargs: Dict[str, Any],\n            extra: Dict[str, Any],\n        ) -> List[Tuple[int, pd.DataFrame, bool]]:\n            def compute_result(\n                key: int, df: pd.DataFrame\n            ) -> Tuple[int, pd.DataFrame, bool]:\n                result = user_defined_function(\n                    df, *user_defined_function_args, **user_defined_function_kwargs\n                )\n                mutated = not df_indexed_like(result, df.axes)\n                return key, result, mutated\n\n            return [compute_result(key, df) for key, df in data]\n\n        @staticmethod\n        def get_reduce_extra(\n            data: PandasDataFrameGroupBy, user_defined_function_kwargs: Dict[str, Any]\n        ) -> Dict[str, Any]:\n            return {\"df_groupby\": data}\n\n        @staticmethod\n        def reduce(\n            datas: Iterable[List[Tuple[int, pd.DataFrame, bool]]], extra: Dict[str, Any]\n        ) -> pd.Series:\n            def 
get_args(\n                keys: List[int],\n                values: List[pd.DataFrame],\n                df_groupby: PandasDataFrameGroupBy,\n            ) -> Union[\n                Tuple[List[int], List[pd.DataFrame]],\n                Tuple[pd.DataFrame, List[int], List[pd.DataFrame]],\n                Tuple[pd.DataFrame, List[pd.DataFrame]],\n            ]:\n                pandas_version = get_pandas_version()\n\n                if pandas_version < (1, 3):\n                    return keys, values\n                elif pandas_version < (1, 4):\n                    return df_groupby._selected_obj, keys, values\n                else:\n                    return df_groupby._selected_obj, values\n\n            df_groupby: PandasDataFrameGroupBy = extra[\"df_groupby\"]\n\n            results = itertools.chain.from_iterable(datas)\n            keys, values, mutated = zip(*results)\n\n            keys = cast(List[int], keys)\n            values = cast(List[pd.DataFrame], values)\n            mutated = cast(List[bool], mutated)\n\n            args = get_args(keys, values, df_groupby)\n  \n            return df_groupby._wrap_applied_output(*args, not_indexed_same=mutated)\n"
    }
  ],
  "ErrorMessage": "--------------------------------------------------------------------------------------- Captured stdout setup ---------------------------------------------------------------------------------------\nINFO: Pandarallel will run on 2 workers.\nINFO: Pandarallel will use Memory file system to transfer data between the main process and workers.\n__________________________________________________________________ test_dataframe_groupby_expanding_apply[anonymous-1-False-False] __________________________________________________________________\nmultiprocessing.pool.RemoteTraceback: \n\"\"\"\nTraceback (most recent call last):\n  File \"/home/user/anaconda3/envs/py39/lib/python3.9/multiprocessing/pool.py\", line 125, in worker\n    result = (True, func(*args, **kwds))\n  File \"/home/user/anaconda3/envs/py39/lib/python3.9/multiprocessing/pool.py\", line 51, in starmapstar\n    return list(itertools.starmap(args[0], args[1]))\n  File \"/home/user/Documents/repoben/buggycode/nalepae_pandarallel/pandarallel/core.py\", line 158, in __call__\n    results = self.work_function(\n  File \"/home/user/Documents/repoben/buggycode/nalepae_pandarallel/pandarallel/data_types/expanding_groupby.py\", line 93, in work\n    return pd.concat(dfs)\n  File \"/home/user/anaconda3/envs/py39/lib/python3.9/site-packages/pandas/core/reshape/concat.py\", line 380, in concat\n    op = _Concatenator(\n  File \"/home/user/anaconda3/envs/py39/lib/python3.9/site-packages/pandas/core/reshape/concat.py\", line 443, in __init__\n    objs, keys = self._clean_keys_and_objs(objs, keys)\n  File \"/home/user/anaconda3/envs/py39/lib/python3.9/site-packages/pandas/core/reshape/concat.py\", line 502, in _clean_keys_and_objs\n    objs_list = list(objs)\n  File \"/home/user/Documents/repoben/buggycode/nalepae_pandarallel/pandarallel/data_types/expanding_groupby.py\", line 81, in <genexpr>\n    compute_result(\n  File 
\"/home/user/Documents/repoben/buggycode/nalepae_pandarallel/pandarallel/data_types/expanding_groupby.py\", line 63, in compute_result\n    item = df.expanding(**attributes).apply(\nTypeError: expanding() got an unexpected keyword argument '_grouper'\n\"\"\"\n\nThe above exception was the direct cause of the following exception:\n\npandarallel_init = None, func_dataframe_groupby_expanding_apply = <function func_dataframe_groupby_expanding_apply.<locals>.<lambda> at 0x74cbdaad0af0>, df_size = 1\n\n    def test_dataframe_groupby_expanding_apply(\n        pandarallel_init, func_dataframe_groupby_expanding_apply, df_size\n    ):\n        df = pd.DataFrame(\n            dict(a=np.random.randint(1, 10, df_size), b=np.random.rand(df_size))\n        )\n    \n        res = (\n            df.groupby(\"a\")\n            .b.expanding()\n            .apply(func_dataframe_groupby_expanding_apply, raw=False)\n        )\n        res_parallel = (\n>           df.groupby(\"a\")\n            .b.expanding()\n            .parallel_apply(func_dataframe_groupby_expanding_apply, raw=False)\n        )\n\ntests/test_pandarallel.py:341: \n_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _\npandarallel/core.py:444: in closure\n    results = results_promise.get()\n_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _\n\nself = <multiprocessing.pool.MapResult object at 0x74cbdc671040>, timeout = None\n\n    def get(self, timeout=None):\n        self.wait(timeout)\n        if not self.ready():\n            raise TimeoutError\n        if self._success:\n            return self._value\n        else:\n>           raise self._value\nE           TypeError: expanding() got an unexpected keyword argument 
'_grouper'\n\n../../../../anaconda3/envs/py39/lib/python3.9/multiprocessing/pool.py:771: TypeError\n--------------------------------------------------------------------------------------- Captured stdout setup ---------------------------------------------------------------------------------------\nINFO: Pandarallel will run on 2 workers.\nINFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.\n___________________________________________________________________ test_dataframe_groupby_expanding_apply[anonymous-1-True-None] ___________________________________________________________________\nmultiprocessing.pool.RemoteTraceback: \n\"\"\"\nTraceback (most recent call last):\n  File \"/home/user/anaconda3/envs/py39/lib/python3.9/multiprocessing/pool.py\", line 125, in worker\n    result = (True, func(*args, **kwds))\n  File \"/home/user/anaconda3/envs/py39/lib/python3.9/multiprocessing/pool.py\", line 51, in starmapstar\n    return list(itertools.starmap(args[0], args[1]))\n  File \"/home/user/Documents/repoben/buggycode/nalepae_pandarallel/pandarallel/core.py\", line 95, in __call__\n    result = self.work_function(\n  File \"/home/user/Documents/repoben/buggycode/nalepae_pandarallel/pandarallel/data_types/expanding_groupby.py\", line 93, in work\n    return pd.concat(dfs)\n  File \"/home/user/anaconda3/envs/py39/lib/python3.9/site-packages/pandas/core/reshape/concat.py\", line 380, in concat\n    op = _Concatenator(\n  File \"/home/user/anaconda3/envs/py39/lib/python3.9/site-packages/pandas/core/reshape/concat.py\", line 443, in __init__\n    objs, keys = self._clean_keys_and_objs(objs, keys)\n  File \"/home/user/anaconda3/envs/py39/lib/python3.9/site-packages/pandas/core/reshape/concat.py\", line 502, in _clean_keys_and_objs\n    objs_list = list(objs)\n  File \"/home/user/Documents/repoben/buggycode/nalepae_pandarallel/pandarallel/data_types/expanding_groupby.py\", line 81, in <genexpr>\n    
compute_result(\n  File \"/home/user/Documents/repoben/buggycode/nalepae_pandarallel/pandarallel/data_types/expanding_groupby.py\", line 63, in compute_result\n    item = df.expanding(**attributes).apply(\nTypeError: expanding() got an unexpected keyword argument '_grouper'\n\"\"\"\n\nThe above exception was the direct cause of the following exception:\n\npandarallel_init = None, func_dataframe_groupby_expanding_apply = <function func_dataframe_groupby_expanding_apply.<locals>.<lambda> at 0x74cbdadaf790>, df_size = 1\n\n    def test_dataframe_groupby_expanding_apply(\n        pandarallel_init, func_dataframe_groupby_expanding_apply, df_size\n    ):\n        df = pd.DataFrame(\n            dict(a=np.random.randint(1, 10, df_size), b=np.random.rand(df_size))\n        )\n    \n        res = (\n            df.groupby(\"a\")\n            .b.expanding()\n            .apply(func_dataframe_groupby_expanding_apply, raw=False)\n        )\n        res_parallel = (\n>           df.groupby(\"a\")\n            .b.expanding()\n            .parallel_apply(func_dataframe_groupby_expanding_apply, raw=False)\n        )\n\ntests/test_pandarallel.py:341: \n_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _\npandarallel/core.py:333: in closure\n    results_promise.get()\n_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _\n\nself = <multiprocessing.pool.MapResult object at 0x74cbdac11610>, timeout = None\n\n    def get(self, timeout=None):\n        self.wait(timeout)\n        if not self.ready():\n            raise TimeoutError\n        if self._success:\n            return self._value\n        else:\n>           raise self._value\nE           TypeError: expanding() got an unexpected keyword 
argument '_grouper'\n\n../../../../anaconda3/envs/py39/lib/python3.9/multiprocessing/pool.py:771: TypeError\n--------------------------------------------------------------------------------------- Captured stdout setup ---------------------------------------------------------------------------------------\nINFO: Pandarallel will run on 2 workers.\nINFO: Pandarallel will use Memory file system to transfer data between the main process and workers.\n--------------------------------------------------------------------------------------- Captured stdout call ----------------------------------------------------------------------------------------\n   0.00%                                          |        0 /        1 |      \n__________________________________________________________________ test_dataframe_groupby_expanding_apply[anonymous-1-True-False] ___________________________________________________________________\nmultiprocessing.pool.RemoteTraceback: \n\"\"\"\nTraceback (most recent call last):\n  File \"/home/user/anaconda3/envs/py39/lib/python3.9/multiprocessing/pool.py\", line 125, in worker\n    result = (True, func(*args, **kwds))\n  File \"/home/user/anaconda3/envs/py39/lib/python3.9/multiprocessing/pool.py\", line 51, in starmapstar\n    return list(itertools.starmap(args[0], args[1]))\n  File \"/home/user/Documents/repoben/buggycode/nalepae_pandarallel/pandarallel/core.py\", line 158, in __call__\n    results = self.work_function(\n  File \"/home/user/Documents/repoben/buggycode/nalepae_pandarallel/pandarallel/data_types/expanding_groupby.py\", line 93, in work\n    return pd.concat(dfs)\n  File \"/home/user/anaconda3/envs/py39/lib/python3.9/site-packages/pandas/core/reshape/concat.py\", line 380, in concat\n    op = _Concatenator(\n  File \"/home/user/anaconda3/envs/py39/lib/python3.9/site-packages/pandas/core/reshape/concat.py\", line 443, in __init__\n    objs, keys = self._clean_keys_and_objs(objs, keys)\n  File 
\"/home/user/anaconda3/envs/py39/lib/python3.9/site-packages/pandas/core/reshape/concat.py\", line 502, in _clean_keys_and_objs\n    objs_list = list(objs)\n  File \"/home/user/Documents/repoben/buggycode/nalepae_pandarallel/pandarallel/data_types/expanding_groupby.py\", line 81, in <genexpr>\n    compute_result(\n  File \"/home/user/Documents/repoben/buggycode/nalepae_pandarallel/pandarallel/data_types/expanding_groupby.py\", line 63, in compute_result\n    item = df.expanding(**attributes).apply(\nTypeError: expanding() got an unexpected keyword argument '_grouper'\n\"\"\"\n\nThe above exception was the direct cause of the following exception:\n\npandarallel_init = None, func_dataframe_groupby_expanding_apply = <function func_dataframe_groupby_expanding_apply.<locals>.<lambda> at 0x74cbdab66940>, df_size = 1\n\n    def test_dataframe_groupby_expanding_apply(\n        pandarallel_init, func_dataframe_groupby_expanding_apply, df_size\n    ):\n        df = pd.DataFrame(\n            dict(a=np.random.randint(1, 10, df_size), b=np.random.rand(df_size))\n        )\n    \n        res = (\n            df.groupby(\"a\")\n            .b.expanding()\n            .apply(func_dataframe_groupby_expanding_apply, raw=False)\n        )\n        res_parallel = (\n>           df.groupby(\"a\")\n            .b.expanding()\n            .parallel_apply(func_dataframe_groupby_expanding_apply, raw=False)\n        )\n\ntests/test_pandarallel.py:341: \n_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _\npandarallel/core.py:444: in closure\n    results = results_promise.get()\n_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _\n\nself = <multiprocessing.pool.MapResult object at 0x74cbdabdcd90>, 
timeout = None\n\n    def get(self, timeout=None):\n        self.wait(timeout)\n        if not self.ready():\n            raise TimeoutError\n        if self._success:\n            return self._value\n        else:\n>           raise self._value\nE           TypeError: expanding() got an unexpected keyword argument '_grouper'\n\n../../../../anaconda3/envs/py39/lib/python3.9/multiprocessing/pool.py:771: TypeError\n--------------------------------------------------------------------------------------- Captured stdout setup ---------------------------------------------------------------------------------------\nINFO: Pandarallel will run on 2 workers.\nINFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.\n--------------------------------------------------------------------------------------- Captured stdout call ----------------------------------------------------------------------------------------\n   0.00%                                          |        0 /        1 |      \n========================================================================================= warnings summary ==========================================================================================\ntests/test_pandarallel.py: 16 warnings\n  /home/user/Documents/repoben/buggycode/nalepae_pandarallel/tests/test_pandarallel.py:235: FutureWarning: DataFrame.applymap has been deprecated. 
Use DataFrame.map instead.\n    res = df.applymap(func_dataframe_applymap)\n\n-- Docs: https://docs.pytest.org/en/stable/how-to/capture-warnings.html\n====================================================================================== short test summary info ======================================================================================\nFAILED tests/test_pandarallel.py::test_dataframe_groupby_rolling_apply[named-1000-False-None] - TypeError: rolling() got an unexpected keyword argument '_grouper'\nFAILED tests/test_pandarallel.py::test_dataframe_groupby_rolling_apply[named-1000-False-False] - TypeError: rolling() got an unexpected keyword argument '_grouper'\nFAILED tests/test_pandarallel.py::test_dataframe_groupby_rolling_apply[named-1000-True-None] - TypeError: rolling() got an unexpected keyword argument '_grouper'\nFAILED tests/test_pandarallel.py::test_dataframe_groupby_rolling_apply[named-1000-True-False] - TypeError: rolling() got an unexpected keyword argument '_grouper'\nFAILED tests/test_pandarallel.py::test_dataframe_groupby_rolling_apply[named-1-False-None] - TypeError: rolling() got an unexpected keyword argument '_grouper'\nFAILED tests/test_pandarallel.py::test_dataframe_groupby_rolling_apply[named-1-False-False] - TypeError: rolling() got an unexpected keyword argument '_grouper'\nFAILED tests/test_pandarallel.py::test_dataframe_groupby_rolling_apply[named-1-True-None] - TypeError: rolling() got an unexpected keyword argument '_grouper'\nFAILED tests/test_pandarallel.py::test_dataframe_groupby_rolling_apply[named-1-True-False] - TypeError: rolling() got an unexpected keyword argument '_grouper'\nFAILED tests/test_pandarallel.py::test_dataframe_groupby_rolling_apply[anonymous-1000-False-None] - TypeError: rolling() got an unexpected keyword argument '_grouper'\nFAILED tests/test_pandarallel.py::test_dataframe_groupby_rolling_apply[anonymous-1000-False-False] - TypeError: rolling() got an unexpected keyword argument '_grouper'\nFAILED 
tests/test_pandarallel.py::test_dataframe_groupby_rolling_apply[anonymous-1000-True-None] - TypeError: rolling() got an unexpected keyword argument '_grouper'\nFAILED tests/test_pandarallel.py::test_dataframe_groupby_rolling_apply[anonymous-1000-True-False] - TypeError: rolling() got an unexpected keyword argument '_grouper'\nFAILED tests/test_pandarallel.py::test_dataframe_groupby_rolling_apply[anonymous-1-False-None] - TypeError: rolling() got an unexpected keyword argument '_grouper'\nFAILED tests/test_pandarallel.py::test_dataframe_groupby_rolling_apply[anonymous-1-False-False] - TypeError: rolling() got an unexpected keyword argument '_grouper'\nFAILED tests/test_pandarallel.py::test_dataframe_groupby_rolling_apply[anonymous-1-True-None] - TypeError: rolling() got an unexpected keyword argument '_grouper'\nFAILED tests/test_pandarallel.py::test_dataframe_groupby_rolling_apply[anonymous-1-True-False] - TypeError: rolling() got an unexpected keyword argument '_grouper'\nFAILED tests/test_pandarallel.py::test_dataframe_groupby_expanding_apply[named-1000-False-None] - TypeError: expanding() got an unexpected keyword argument '_grouper'\nFAILED tests/test_pandarallel.py::test_dataframe_groupby_expanding_apply[named-1000-False-False] - TypeError: expanding() got an unexpected keyword argument '_grouper'\nFAILED tests/test_pandarallel.py::test_dataframe_groupby_expanding_apply[named-1000-True-None] - TypeError: expanding() got an unexpected keyword argument '_grouper'\nFAILED tests/test_pandarallel.py::test_dataframe_groupby_expanding_apply[named-1000-True-False] - TypeError: expanding() got an unexpected keyword argument '_grouper'\nFAILED tests/test_pandarallel.py::test_dataframe_groupby_expanding_apply[named-1-False-None] - TypeError: expanding() got an unexpected keyword argument '_grouper'\nFAILED tests/test_pandarallel.py::test_dataframe_groupby_expanding_apply[named-1-False-False] - TypeError: expanding() got an unexpected keyword argument '_grouper'\nFAILED 
tests/test_pandarallel.py::test_dataframe_groupby_expanding_apply[named-1-True-None] - TypeError: expanding() got an unexpected keyword argument '_grouper'\nFAILED tests/test_pandarallel.py::test_dataframe_groupby_expanding_apply[named-1-True-False] - TypeError: expanding() got an unexpected keyword argument '_grouper'\nFAILED tests/test_pandarallel.py::test_dataframe_groupby_expanding_apply[anonymous-1000-False-None] - TypeError: expanding() got an unexpected keyword argument '_grouper'\nFAILED tests/test_pandarallel.py::test_dataframe_groupby_expanding_apply[anonymous-1000-False-False] - TypeError: expanding() got an unexpected keyword argument '_grouper'\nFAILED tests/test_pandarallel.py::test_dataframe_groupby_expanding_apply[anonymous-1000-True-None] - TypeError: expanding() got an unexpected keyword argument '_grouper'\nFAILED tests/test_pandarallel.py::test_dataframe_groupby_expanding_apply[anonymous-1000-True-False] - TypeError: expanding() got an unexpected keyword argument '_grouper'\nFAILED tests/test_pandarallel.py::test_dataframe_groupby_expanding_apply[anonymous-1-False-None] - TypeError: expanding() got an unexpected keyword argument '_grouper'\nFAILED tests/test_pandarallel.py::test_dataframe_groupby_expanding_apply[anonymous-1-False-False] - TypeError: expanding() got an unexpected keyword argument '_grouper'\nFAILED tests/test_pandarallel.py::test_dataframe_groupby_expanding_apply[anonymous-1-True-None] - TypeError: expanding() got an unexpected keyword argument '_grouper'\nFAILED tests/test_pandarallel.py::test_dataframe_groupby_expanding_apply[anonymous-1-True-False] - TypeError: expanding() got an unexpected keyword argument '_grouper'\n============================================================================ 32 failed, 185 passed, 16 warnings in 7.71s ============================================================================",
  "Patch": "--- a/nalepae_pandarallel/pandarallel/data_types/expanding_groupby.py\n+++ b/nalepae_pandarallel/pandarallel/data_types/expanding_groupby.py\n@@ -76,6 +76,7 @@\n                 return item\n \n             attributes = extra[\"attributes\"]\n+            attributes.pop(\"_grouper\", None)\n \n             dfs = (\n                 compute_result(\n--- a/nalepae_pandarallel/pandarallel/data_types/rolling_groupby.py\n+++ b/nalepae_pandarallel/pandarallel/data_types/rolling_groupby.py\n@@ -76,6 +76,7 @@\n                 return item\n \n             attributes = extra[\"attributes\"]\n+            attributes.pop(\"_grouper\", None)\n \n             dfs = (\n                 compute_result(\n",
  "BuggyCodeLocation": [
    {
      "file": "nalepae_pandarallel/pandarallel/data_types/expanding_groupby.py",
      "function": null,
      "content_all": {},
      "content_change": {}
    },
    {
      "file": "nalepae_pandarallel/pandarallel/data_types/rolling_groupby.py",
      "function": null,
      "content_all": {},
      "content_change": {}
    }
  ],
  "Issue": {
    "title": "TypeError: unexpected keyword argument '_grouper' in ExpandingGroupBy and RollingGroupBy parallel_apply",
    "description": "Users have reported encountering a `TypeError` (`rolling() got an unexpected keyword argument '_grouper'` / `expanding() got an unexpected keyword argument '_grouper'`) when using `ExpandingGroupBy` and `RollingGroupBy` operations in the `pandarallel` library. The error is triggered by the presence of a `_grouper` key in the attributes dictionary. This key is not properly handled: it is forwarded as a keyword argument to `rolling()` / `expanding()` inside the worker processes, leading to crashes during parallel operations. To reproduce the issue, try using `parallel_apply` on `ExpandingGroupBy` or `RollingGroupBy` objects with any user-defined function. The expected behavior is that the operations should run without triggering a `TypeError`. This issue needs to be addressed to ensure smooth parallel processing without errors.",
    "explanation": "### Summary of the Issue\n\nThe issue reported is a `TypeError` (`rolling() got an unexpected keyword argument '_grouper'` / `expanding() got an unexpected keyword argument '_grouper'`) encountered when using `ExpandingGroupBy` and `RollingGroupBy` operations in the `pandarallel` library. This error is triggered due to the presence of a `_grouper` key in the attributes dictionary, which is not handled properly, leading to crashes during parallel operations. The error commonly manifests when attempting to use the `parallel_apply` method on `ExpandingGroupBy` or `RollingGroupBy` objects with a user-defined function.\n\n### Detailed Explanation\n\n#### Cause of the Issue\n\nThe `TypeError` is caused by the `_grouper` key in the attributes dictionary of `ExpandingGroupBy` and `RollingGroupBy` objects. In the context of the `pandarallel` library, these objects store certain attributes that describe how the data should be grouped and processed. The `_grouper` attribute is part of the internals of Pandas' groupby functionality and is not intended to be modified or used directly. However, during the execution of `parallel_apply`, this attribute is inadvertently included in the attributes dictionary, and the worker then unpacks that dictionary as keyword arguments (`df.rolling(**attributes)` / `df.expanding(**attributes)`), which do not accept `_grouper`.\n\n#### Content of the Commit\n\nTo resolve this issue, the developer made changes to the handling of the attributes dictionary in the `ExpandingGroupBy` and `RollingGroupBy` components of the library. Specifically, the commit involves the removal of the `_grouper` key from the attributes dictionary if it exists. This is achieved by adding a line to pop the `_grouper` key (`attributes.pop(\"_grouper\", None)`):\n\n1. **Location:** The fix is applied in the files handling `ExpandingGroupBy` and `RollingGroupBy`.\n2. **Action:** \n   - Before operating further with the attributes, the `_grouper` key is removed if it is present.\n   - This ensures that the subsequent operations do not encounter a `TypeError` due to the presence of the `_grouper` key.\n\n### How the Commit Solves the Issue\n\nBy removing the `_grouper` key from the attributes dictionary, the developer ensures that this internal and potentially problematic attribute does not interfere with the functioning of `parallel_apply`. Here's a summarized breakdown of how the solution works:\n\n1. **Identification of Problem Area:** The commit identifies that the issue stems from handling the attributes dictionary in `ExpandingGroupBy` and `RollingGroupBy`.\n2. **Safe Removal:** The solution involves safely removing the `_grouper` key using a pop operation. This method is chosen because it can remove the key if it exists, and do nothing if it doesn't, thus preventing any further errors.\n3. **Prevention of Errors:** With the `_grouper` key removed, the attributes dictionary no longer contains keys that trigger a `TypeError` during parallel operations. This allows the `parallel_apply` method to run smoothly.\n\n### Solution Explanation\n\nIn essence, the solution involves a preventative measure to remove a specific key from a dictionary before it causes a problem. The developer implemented this by adding an additional step in the processing of the attributes dictionary to ensure that `_grouper` is not included. This simple yet effective approach prevents the crashing of `parallel_apply` and is a good example of handling internal dependencies safely within a library.\n\n### Conclusion\n\nThe issue is caused by the mishandling of internal attributes that are not meant to be modified or used directly. The commit solves this by removing the `_grouper` key from the attributes dictionary, thus preventing the `TypeError`. This solution ensures that `ExpandingGroupBy` and `RollingGroupBy` operations can proceed without encountering errors related to the internal workings of Pandas' groupby functionality. This fix improves the robustness and reliability of the `pandarallel` library when handling parallel operations on grouped data."
  },
  "Explain": "### Summary of the Issue\n\nThe issue reported is a `TypeError` (`rolling() got an unexpected keyword argument '_grouper'` / `expanding() got an unexpected keyword argument '_grouper'`) encountered when using `ExpandingGroupBy` and `RollingGroupBy` operations in the `pandarallel` library. This error is triggered due to the presence of a `_grouper` key in the attributes dictionary, which is not handled properly, leading to crashes during parallel operations. The error commonly manifests when attempting to use the `parallel_apply` method on `ExpandingGroupBy` or `RollingGroupBy` objects with a user-defined function.\n\n### Detailed Explanation\n\n#### Cause of the Issue\n\nThe `TypeError` is caused by the `_grouper` key in the attributes dictionary of `ExpandingGroupBy` and `RollingGroupBy` objects. In the context of the `pandarallel` library, these objects store certain attributes that describe how the data should be grouped and processed. The `_grouper` attribute is part of the internals of Pandas' groupby functionality and is not intended to be modified or used directly. However, during the execution of `parallel_apply`, this attribute is inadvertently included in the attributes dictionary, and the worker then unpacks that dictionary as keyword arguments (`df.rolling(**attributes)` / `df.expanding(**attributes)`), which do not accept `_grouper`.\n\n#### Content of the Commit\n\nTo resolve this issue, the developer made changes to the handling of the attributes dictionary in the `ExpandingGroupBy` and `RollingGroupBy` components of the library. Specifically, the commit involves the removal of the `_grouper` key from the attributes dictionary if it exists. This is achieved by adding a line to pop the `_grouper` key (`attributes.pop(\"_grouper\", None)`):\n\n1. **Location:** The fix is applied in the files handling `ExpandingGroupBy` and `RollingGroupBy`.\n2. **Action:** \n   - Before operating further with the attributes, the `_grouper` key is removed if it is present.\n   - This ensures that the subsequent operations do not encounter a `TypeError` due to the presence of the `_grouper` key.\n\n### How the Commit Solves the Issue\n\nBy removing the `_grouper` key from the attributes dictionary, the developer ensures that this internal and potentially problematic attribute does not interfere with the functioning of `parallel_apply`. Here's a summarized breakdown of how the solution works:\n\n1. **Identification of Problem Area:** The commit identifies that the issue stems from handling the attributes dictionary in `ExpandingGroupBy` and `RollingGroupBy`.\n2. **Safe Removal:** The solution involves safely removing the `_grouper` key using a pop operation. This method is chosen because it can remove the key if it exists, and do nothing if it doesn't, thus preventing any further errors.\n3. **Prevention of Errors:** With the `_grouper` key removed, the attributes dictionary no longer contains keys that trigger a `TypeError` during parallel operations. This allows the `parallel_apply` method to run smoothly.\n\n### Solution Explanation\n\nIn essence, the solution involves a preventative measure to remove a specific key from a dictionary before it causes a problem. The developer implemented this by adding an additional step in the processing of the attributes dictionary to ensure that `_grouper` is not included. This simple yet effective approach prevents the crashing of `parallel_apply` and is a good example of handling internal dependencies safely within a library.\n\n### Conclusion\n\nThe issue is caused by the mishandling of internal attributes that are not meant to be modified or used directly. The commit solves this by removing the `_grouper` key from the attributes dictionary, thus preventing the `TypeError`. This solution ensures that `ExpandingGroupBy` and `RollingGroupBy` operations can proceed without encountering errors related to the internal workings of Pandas' groupby functionality. This fix improves the robustness and reliability of the `pandarallel` library when handling parallel operations on grouped data.",
  "Source": "Human",
  "Token": 1126,
  "Command": [
    "pytest tests"
  ],
  "FilteredCode": [
    {
      "path": "nalepae_pandarallel/tests/test_pandarallel.py",
      "content": "1 import importlib\n2 import math\n3 \n4 import numpy as np\n5 import pandas as pd\n6 import pytest\n7 from pandarallel import pandarallel\n8 \n9 \n10 @pytest.fixture(params=(1000, 1))\n11 def df_size(request):\n12     return request.param\n13 \n14 \n15 @pytest.fixture(params=(False, True))\n16 def progress_bar(request):\n17     return request.param\n18 \n19 \n20 @pytest.fixture(params=(None, False))\n21 def use_memory_fs(request):\n22     return request.param\n23 \n24 \n25 @pytest.fixture(params=(RuntimeError, AttributeError, ZeroDivisionError))\n26 def exception(request):\n27     return request.param\n28 \n29 \n30 @pytest.fixture(params=(\"named\", \"anonymous\"))\n31 def func_dataframe_apply_axis_0(request):\n32     def func(x):\n33         return max(x) - min(x)\n34 \n35     return dict(named=func, anonymous=lambda x: max(x) - min(x))[request.param]\n36 \n37 \n38 @pytest.fixture(params=(\"named\", \"anonymous\"))\n39 def func_dataframe_apply_axis_1(request):\n40     def func(x):\n41         return math.sin(x.a**2) + math.sin(x.b**2)\n42 \n43     return dict(\n44         named=func, anonymous=lambda x: math.sin(x.a**2) + math.sin(x.b**2)\n45     )[request.param]\n46 \n47 \n48 @pytest.fixture(params=(\"named\", \"anonymous\"))\n49 def func_dataframe_applymap(request):\n50     def func(x):\n51         return math.sin(x**2) - math.cos(x**2)\n52 \n53     return dict(named=func, anonymous=lambda x: math.sin(x**2) - math.cos(x**2))[\n54         request.param\n55     ]\n56 \n57 \n58 @pytest.fixture(params=(\"named\", \"anonymous\"))\n59 def func_series_map(request):\n60     def func(x):\n61         return math.log10(math.sqrt(math.exp(x**2)))\n62 \n63     return dict(\n64         named=func, anonymous=lambda x: math.(...truncated)"
    },
    {
      "path": "nalepae_pandarallel/pandarallel/core.py",
      "content": "1 import multiprocessing\n2 import os\n3 import pickle\n4 from itertools import count\n5 from multiprocessing.managers import SyncManager\n6 from pathlib import Path\n7 from tempfile import NamedTemporaryFile\n8 from typing import Any, Callable, Dict, Iterator, Optional, Tuple, Type, cast\n9 \n10 import dill\n11 import pandas as pd\n12 import psutil\n13 from pandas.core.groupby import DataFrameGroupBy as PandaDataFrameGroupBy\n14 from pandas.core.window.expanding import ExpandingGroupby as PandasExpandingGroupby\n15 from pandas.core.window.rolling import RollingGroupby as PandasRollingGroupby\n16 \n17 from .data_types import (\n18     DataFrame,\n19     DataFrameGroupBy,\n20     DataType,\n21     ExpandingGroupBy,\n22     RollingGroupBy,\n23     Series,\n24     SeriesRolling,\n25 )\n26 from .progress_bars import ProgressBarsType, get_progress_bars, progress_wrapper\n27 from .utils import WorkerStatus\n28 \n29 ON_WINDOWS = os.name == \"nt\"\n30 CONTEXT = multiprocessing.get_context(\"spawn\" if ON_WINDOWS else \"fork\")\n31 \n32 # Root of Memory File System\n33 MEMORY_FS_ROOT = os.environ.get(\"MEMORY_FS_ROOT\", \"/dev/shm\")\n34 \n35 # By default, Pand(...truncated)"
    },
    {
      "path": "nalepae_pandarallel/pandarallel/progress_bars.py",
      "content": "1 import multiprocessing\n2 import os\n3 import shutil\n4 import sys\n5 from abc import ABC, abstractmethod\n6 from enum import Enum\n7 from itertools import count\n8 from time import time_ns\n9 from typing import Callable, List, Union\n10 \n11 from .utils import WorkerStatus\n12 \n13 INTERVAL_NS = 250_000_000  # 0.25 sec\n14 MINIMUM_TERMINAL_WIDTH = 72\n15 \n16 \n17 class ProgressBarsType(int, Enum):\n18     No = 0\n19     InUserDefinedFunction = 1\n20     InUserDefinedFunctionMultiplyByNumberOfColumns = 2\n21     InWorkFunction = 3\n22 \n23 \n24 class ProgressBars(ABC):\n25     @abstractmethod\n26     def __init__(self, maxs: List[int], show: bool) -> None:\n27         ...\n28 \n29     @abstractmethod\n30     def update(self, values: List[int]) -> None:\n31         ...\n32 \n33     def set_error(self, index: int) -> None:\n34         pass\n35 \n36 \n37 class ProgressState(...truncated)"
    },
    {
      "path": "nalepae_pandarallel/pandarallel/data_types/expanding_groupby.py",
      "content": "1 import multiprocessing\n2 from typing import Any, Callable, Dict, Iterable, Iterator, List, Tuple\n3 \n4 import pandas as pd\n5 from pandas.core.window.expanding import ExpandingGroupby as PandasExpandingGroupby\n6 \n7 from ..utils import WorkerStatus, chunk, get_pandas_version\n8 from .generic import DataType\n9 \n10 \n11 class ExpandingGroupBy:\n12     class Apply(DataType):\n13         @staticmethod\n14         def get_chunks(\n15             nb_workers: int, data: PandasExpandingGroupby, *args, **kwargs\n16         ) -> Iterator[List[Tuple[int,(...truncated)"
    }
  ],
  "TokenAll": 14220,
  "FilteredLevel": 1500,
  "Results": {
    "model": "GPT-4o",
    "Difficulty": "Easy",
    "issue_origin": {
      "title": "Incomplete lambda expression",
      "description": "The code in 'nalepae_pandarallel/tests/test_pandarallel.py' at line 64 ends with 'math.' which leads to an incomplete lambda expression. The function dictionary entry 'anonymous' under fixture 'func_series_map' is incomplete and will cause a syntax error.",
      "explanation": "### Summary of the Issue\n\nThe reported issue is an \"Incomplete lambda expression\" in the `nalepae_pandarallel/tests/test_pandarallel.py` file at line 64. Specifically, the expression for the lambda function under the \"anonymous\" dictionary entry is incomplete, stopping at `math.`, which leads to a syntax error. The incomplete lambda function is supposed to be returned by the fixture `func_series_map`.\n\n### Content of the Commit\n\nThe commit necessary to fix this issue presumably corrects the lambda expression by completing its definition. Below is a hypothetical content of the commit to address the issue:\n\n```diff\n@@ -63,7 +63,7 @@ def func_series_map(request):\n         return math.log10(math.sqrt(math.exp(x**2)))\n     \n     return dict(\n-        named=func, anonymous=lambda x: math.(...truncated)\n+        named=func, anonymous=lambda x: math.log10(math.sqrt(math.exp(x**2)))\n     )[request.param]\n```\n\n### Explanation of How the Commit Solves the Issue\n\n1. **Identifying the Issue:**\n   - The lambda expression ends prematurely as `math.`. This statement is syntactically incorrect because `math.` is not followed by a valid math function or property.\n   \n2. **Purpose of the Lambda Expression:**\n   - According to the named function implementation (`func`), the purpose of the lambda is to apply several mathematical operations: `log10`, `sqrt`, and `exp` on the variable `x`.\n   \n3. **Correcting the Lambda Expression:**\n   - The commit replaces the incomplete lambda function with a fully defined one. The complete expression is: `lambda x: math.log10(math.sqrt(math.exp(x**2)))`. This makes the lambda function perform the same operations as the named function.\n   - By fixing this expression, the `anonymous` entry in the dictionary becomes valid and no longer causes a syntax error.\n   \n4. 
**Ensuring Consistency:**\n   - Both the named and anonymous functions now compute the same mathematical expression, ensuring that the tests using this fixture can proceed without error.\n   \n5. **Impact on Fixtures and Tests:**\n   - The fixture `func_series_map` returns either the named function or the anonymous lambda function based on the test parameter. With the corrected lambda expression, any tests using this fixture will execute without encountering syntax errors.\n\n### Detailed Explanation\n\n1. **Imports:**\n   - `import math` makes mathematical functions available to the script, which are used in both the named and lambda functions.\n\n2. **Definition of Named Function:**\n   - The named function `func` is defined as:\n     ```python\n     def func(x):\n         return math.log10(math.sqrt(math.exp(x**2)))\n     ```\n\n3. **Return Dictionary:**\n   - The `return` statement creates a dictionary with two entries: `named` and `anonymous`. The `anonymous` entry previously pointed to an incomplete lambda function:\n     ```python\n     return dict(named=func, anonymous=lambda x: math.(...truncated)\n     )[request.param]\n     ```\n\n4. **Correction Applied:**\n   - The corrected return dictionary as per the hypothetical commit:\n     ```python\n     return dict(named=func, anonymous=lambda x: math.log10(math.sqrt(math.exp(x**2)))\n     )[request.param]\n     ```\n\n5. **How the Correction Fixes the Syntax Error:**\n   - The original lambda `lambda x: math.` is invalid because `math.` requires a method or property to follow (e.g., `math.log10`). By completing it to `lambda x: math.log10(math.sqrt(math.exp(x**2)))`, the lambda now correctly mirrors the named function.\n  \n6. 
**Outcome:**\n   - Tests using `func_series_map` will now receive a valid lambda function under the `anonymous` key, thus resolving the syntax error and ensuring the expected behavior during test execution.\n\nIn summary, the commit resolves the issue by completing the lambda expression, ensuring that it correctly performs the intended mathematical operations and eliminating the syntax error caused by the incomplete statement."
    },
    "issue_message": {
      "title": "TypeError in Expanding and Rolling GroupBy Apply Functions",
      "description": "The pandas 'expanding()' and 'rolling()' calls made by the 'ExpandingGroupBy' and 'RollingGroupBy' apply implementations (defined in 'nalepae_pandarallel/pandarallel/data_types/expanding_groupby.py' and 'nalepae_pandarallel/pandarallel/data_types/rolling_groupby.py', and exercised by the tests in 'nalepae_pandarallel/tests/test_pandarallel.py') are receiving an unexpected keyword argument '_grouper'. Please update the calls to these methods to remove the '_grouper' argument or handle it appropriately.",
      "explanation": "### Summary of the Issue\n\nThe given issue is related to the `TypeError` caused by the `expanding` and `rolling` methods in the `ExpandingGroupBy` and `RollingGroupBy` classes. Specifically, these methods are receiving an unexpected keyword argument `_grouper`. The error occurs in tests located in `nalepae_pandarallel/tests/test_pandarallel.py`.\n\n### Detailed Analysis\n\n#### Codebase Analysis\n1. **Tests Code (`nalepae_pandarallel/tests/test_pandarallel.py`):**\n   The relevant part of the test that triggers the error:\n   ```python\n   df.groupby(\"a\").b.expanding().apply(func_dataframe_groupby_expanding_apply, raw=False)\n   df.groupby(\"a\").b.expanding().parallel_apply(func_dataframe_groupby_expanding_apply, raw=False)\n   ```\n   These lines attempt to perform operations using `expanding` and `rolling` methods from `ExpandingGroupBy` and `RollingGroupBy` classes.\n\n2. **Core Code (`nalepae_pandarallel/core.py`):**\n   This file contains core logic for parallel execution of tasks, which likely includes invoking the `expanding` and `rolling` methods in a parallel manner.\n\n3. **ExpandingGroupBy Class (`nalepae_pandarallel/pandarallel/data_types/expanding_groupby.py`):**\n   The `ExpandingGroupBy` class includes a nested `Apply` class with an `apply` method that receives various arguments. This method seems to involve a call to the `expanding` method:\n   ```python\n   item = df.expanding(**attributes).apply(func, raw=raw)\n   ```\n\n4. 
**RollingGroupBy Class (implied structure similar to `ExpandingGroupBy`):**\n   The `RollingGroupBy` class would contain similar logic to `ExpandingGroupBy`, invoking `rolling` with the given attributes.\n\n#### Error Traceback\nThe error traceback shows the specific point of failure:\n   ```\n   File \"/home/user/Documents/repoben/buggycode/nalepae_pandarallel/pandarallel/data_types/expanding_groupby.py\", line 63, in compute_result\n       item = df.expanding(**attributes).apply(...)\n   \n   TypeError: expanding() got an unexpected keyword argument '_grouper'\n   ```\n\n### Cause & Solution\n\n#### Cause\nThe root cause is that the `expanding` and `rolling` methods from Pandas do not accept an argument named `_grouper`. Thus, passing this argument causes a `TypeError`.\n\n#### Commit Content\n\nTo solve this issue, the commit would presumably involve modifying the call site of `expanding` and `rolling` methods to exclude the `_grouper` argument or handle it appropriately.\n\n#### Example Commit (hypothetical)\nThe commit would look something like this:\n\n```diff\n# Inside ExpandingGroupBy and RollingGroupBy classes, in the respective methods:\ndef expanding_apply(df, func, **attributes):\n-    result = df.expanding(**attributes).apply(func)\n+    if '_grouper' in attributes:\n+        attributes.pop('_grouper')\n+    result = df.expanding(**attributes).apply(func)\n    return result\n\ndef rolling_apply(df, func, **attributes):\n-    result = df.rolling(**attributes).apply(func)\n+    if '_grouper' in attributes:\n+        attributes.pop('_grouper')\n+    result = df.rolling(**attributes).apply(func)\n    return result\n```\n\n#### Explanation of the Solution\n1. **Removal of `_grouper`:** Before calling the `expanding` or `rolling` method, the dictionary `attributes` is checked. If the `_grouper` key is found, it is removed using `pop('_grouper')`.\n\n2. 
**Safeguard Modification:** This update ensures that the `expanding` and `rolling` methods are called without unexpected keyword arguments, preventing the `TypeError`.\n\n3. **Maintain Functionality:** By only removing the `_grouper` argument, other required attributes are still passed appropriately, ensuring the original functionality is preserved but without causing the error.\n\n#### Tests Verification\nAfter these changes, running the tests in `nalepae_pandarallel/tests/test_pandarallel.py` should no longer produce the `TypeError`, confirming that the solution is effective."
    },
    "issue_ground": {
      "title": "TypeError: Unexpected '_grouper' Keyword Argument in ExpandingGroupBy and RollingGroupBy Operations",
      "description": "Users have reported encountering a `TypeError` (for example `expanding() got an unexpected keyword argument '_grouper'`) when using `ExpandingGroupBy` and `RollingGroupBy` operations in the `pandarallel` library. The error is triggered by the presence of a `_grouper` key in the attributes dictionary that is forwarded as keyword arguments to `expanding()` / `rolling()`. This key is not filtered out, leading to crashes during parallel operations. To reproduce the issue, try using `parallel_apply` on `ExpandingGroupBy` or `RollingGroupBy` objects with any user-defined function. The expected behavior is that the operations should run without raising an exception. This issue needs to be addressed to ensure smooth parallel processing without errors.",
      "explanation": "### Summary of the Issue\n\nThe reported issue pertains to the `pandarallel` library, where users encounter an `AttributeError` during `ExpandingGroupBy` and `RollingGroupBy` operations. The error message indicates that the `expanding` and `rolling` methods receive an unexpected keyword argument `_grouper`, causing parallel operations to crash. The expectation is that these operations should run without triggering any exceptions.\n\n### Detailed Analysis of the Provided Code\n\n#### Related Code Files\n\n1. **nalepae_pandarallel/tests/test_pandarallel.py**: Contains tests for the `pandarallel` library, including those for `ExpandingGroupBy` and `RollingGroupBy`.\n2. **nalepae_pandarallel/pandarallel/core.py**: Core functionality of the `pandarallel` library, likely where parallel operation implementations reside.\n3. **nalepae_pandarallel/pandarallel/progress_bars.py**: Contains the progress bar functionality; not directly related to the issue but involved in the library.\n4. **nalepae_pandarallel/pandarallel/data_types/expanding_groupby.py**: Contains the `ExpandingGroupBy` class, which likely handles `expanding` operations in parallel.\n\n#### Error Details\n\nFrom the error logs provided, the key lines are:\n```python\nFile \"/home/user/Documents/repoben/buggycode/nalepae_pandarallel/pandarallel/data_types/expanding_groupby.py\", line 63, in compute_result\n    item = df.expanding(**attributes).apply(\nTypeError: expanding() got an unexpected keyword argument '_grouper'\n```\nThe error signifies that a `_grouper` attribute is being passed inadvertently to the `expanding` function, which does not expect it.\n\n### Commit Analysis to Resolve the Issue\n\nThe commit likely addresses the improper handling of `_grouper` in the attributes dictionary during the `expanding` and `rolling` operations within the `ExpandingGroupBy` and `RollingGroupBy` classes respectively. 
\n\n#### Potential Fix\n\nA probable fix would involve filtering out the `_grouper` key from the attributes dictionary before passing it to the `expanding` and `rolling` functions. Here's how this fix might be implemented:\n\n1. **Update to ExpandingGroupBy**:\n   - Modify the `compute_result` method to exclude the `_grouper` attribute.\n   - For instance:\n     ```python\n     def compute_result(df, func, *args, **attributes):\n         if '_grouper' in attributes:\n             del attributes['_grouper']\n         return df.expanding(**attributes).apply(func, *args)\n     ```\n\n#### Expected Impact of the Fix\n\n- **Filtering `_grouper` Key**:\n  - When `compute_result` is called, it ensures that the `_grouper` key is removed from the attributes dictionary before calling `expanding`.\n  - This prevents the `TypeError` from being raised for unexpected keyword arguments.\n\n### Explanation of the Fix and How It Solves the Issue\n\n- **Cause of the Error**:\n  - The error occurs because the `_grouper` key, which is part of the attributes dictionary, is being passed to the `expanding` and `rolling` functions. 
These functions do not recognize `_grouper` as a valid argument, leading to the `TypeError`.\n\n- **Commit Content**:\n  - The fix likely involves filtering the attributes dictionary to remove the `_grouper` key when passing arguments to `expanding` or `rolling`.\n  - The updated code in the `ExpandingGroupBy` and `RollingGroupBy` classes would ensure that only valid keyword arguments are supplied.\n\n- **Solution Validation**:\n  - By ensuring `_grouper` is not included in the dictionary passed to `expanding` or `rolling`, the functions are invoked with acceptable parameters.\n  - This change would be reflected in both the classes within their respective `compute_result` methods.\n\n### Detailed Example\n\n**Before the Commit**:\n```python\ndef compute_result(df, func, *args, **attributes):\n    return df.expanding(**attributes).apply(func, *args)  # Raises TypeError: unexpected keyword argument '_grouper'\n```\n\n**After the Commit**:\n```python\ndef compute_result(df, func, *args, **attributes):\n    # Remove _grouper from attributes if present\n    attributes.pop('_grouper', None)\n    return df.expanding(**attributes).apply(func, *args)  # No TypeError, as _grouper is not passed\n```\n- The above change ensures that `_grouper` is excluded, thus preventing the error.\n\n### Conclusion\n\nThe `AttributeError` is resolved by ensuring `_grouper` is excluded from the attributes passed to the `expanding` and `rolling` functions. This change allows parallel `Expanding"
    },
    "issue_ground_truth": {
      "title": "TypeError: Unexpected '_grouper' Keyword Argument in ExpandingGroupBy and RollingGroupBy Operations",
      "description": "Users have reported encountering a `TypeError` (for example `expanding() got an unexpected keyword argument '_grouper'`) when using `ExpandingGroupBy` and `RollingGroupBy` operations in the `pandarallel` library. The error is triggered by the presence of a `_grouper` key in the attributes dictionary that is forwarded as keyword arguments to `expanding()` / `rolling()`. This key is not filtered out, leading to crashes during parallel operations. To reproduce the issue, try using `parallel_apply` on `ExpandingGroupBy` or `RollingGroupBy` objects with any user-defined function. The expected behavior is that the operations should run without raising an exception. This issue needs to be addressed to ensure smooth parallel processing without errors.",
      "explanation": "### Summary of the Issue\n\nThe issue reported is an `AttributeError` encountered when using `ExpandingGroupBy` and `RollingGroupBy` operations in the `pandarallel` library. This error is triggered due to the presence of an `_grouper` key in the attributes dictionary, which is not handled properly, leading to crashes during parallel operations. The error commonly manifests when attempting to use the `parallel_apply` method on `ExpandingGroupBy` or `RollingGroupBy` objects with a user-defined function.\n\n### Detailed Explanation\n\n#### Cause of the Issue\n\nThe `AttributeError` is caused by the `_grouper` key in the attributes dictionary of `ExpandingGroupBy` and `RollingGroupBy` objects. In the context of the `pandarallel` library, these objects store certain attributes that describe how the data should be grouped and processed. The `_grouper` attribute is part of the internals of Pandas' groupby functionality and is not intended to be modified or used directly. However, during the execution of `parallel_apply`, this attribute is inadvertently included in the attributes dictionary, leading to errors when the library attempts to use or manipulate it.\n\n#### Content of the Commit\n\nTo resolve this issue, the developer made changes to the handling of the attributes dictionary in the `ExpandingGroupBy` and `RollingGroupBy` components of the library. Specifically, the commit involves the removal of the `_grouper` key from the attributes dictionary if it exists. This is achieved by adding a line to pop the `_grouper` key:\n\n1. **Location:** The fix is applied in the files handling `ExpandingGroupBy` and `RollingGroupBy`.\n2. 
**Action:** \n   - Before operating further with the attributes, the `_grouper` key is removed if it is present.\n   - This ensures that the subsequent operations do not encounter an `AttributeError` due to the presence of the `_grouper` key.\n\n### How the Commit Solves the Issue\n\nBy removing the `_grouper` key from the attributes dictionary, the developer ensures that this internal and potentially problematic attribute does not interfere with the functioning of `parallel_apply`. Here's a summarized breakdown of how the solution works:\n\n1. **Identification of Problem Area:** The commit identifies that the issue stems from handling the attributes dictionary in `ExpandingGroupBy` and `RollingGroupBy`.\n2. **Safe Removal:** The solution involves safely removing the `_grouper` key using a pop operation. This method is chosen because it can remove the key if it exists, and do nothing if it doesn't, thus preventing any further errors.\n3. **Prevention of Errors:** With the `_grouper` key removed, the attributes dictionary no longer contains keys that trigger `AttributeError` during parallel operations. This allows the `parallel_apply` method to run smoothly.\n\n### Solution Explanation\n\nIn essence, the solution involves a preventative measure to remove a specific key from a dictionary before it causes a problem. The developer implemented this by adding an additional step in the processing of the attributes dictionary to ensure that `_grouper` is not included. This simple yet effective approach prevents the crashing of `parallel_apply` and is a good example of handling internal dependencies safely within a library.\n\n### Conclusion\n\nThe issue is caused by the mishandling of internal attributes that are not meant to be modified or used directly. The commit solves this by removing the `_grouper` key from the attributes dictionary, thus preventing the `AttributeError`. 
This solution ensures that `ExpandingGroupBy` and `RollingGroupBy` operations can proceed without encountering errors related to the internal workings of Pandas' groupby functionality. This fix improves the robustness and reliability of the `pandarallel` library when handling parallel operations on grouped data."
    },
    "location_origin": [
      {
        "file": "nalepae_pandarallel/tests/test_pandarallel.py",
        "function": {
          "59": "func_series_map"
        },
        "content_all": {
          "56": "    )\n",
          "57": "\n",
          "58": "    @pytest.fixture(params=(\"named\", \"anonymous\"))\n",
          "59": "    def func_series_map(request):\n",
          "60": "        def func(x):\n",
          "61": "            return math.log10(math.sqrt(math.exp(x**2)))\n",
          "62": "\n",
          "63": "        return dict(\n",
          "64": "            named=func, anonymous=lambda x: math.\n",
          "65": "        )[request.param]\n",
          "66": "\n",
          "67": "\n"
        },
        "content_change": {
          "64": "            named=func, anonymous=lambda x: math.log10(math.sqrt(math.exp(x**2)))\n"
        }
      }
    ],
    "location_message": [
      {
        "file": "nalepae_pandarallel/pandarallel/data_types/expanding_groupby.py",
        "function": {
          "63": "compute_result"
        },
        "content_all": {
          "60": "            )",
          "61": "\n",
          "62": "        )",
          "63": "        item = df.expanding(**attributes).apply(",
          "64": "            lambda x: x,",
          "65": "            raw=raw,",
          "66": "        )",
          "67": "\n"
        },
        "content_change": {
          "63": "        if '_grouper' in attributes:\n            attributes.pop('_grouper')\n        item = df.expanding(**attributes).apply("
        }
      },
      {
        "file": "nalepae_pandarallel/pandarallel/data_types/rolling_groupby.py",
        "function": {
          "63": "compute_result"
        },
        "content_all": {
          "60": "            )",
          "61": "\n",
          "62": "        )",
          "63": "        item = df.rolling(**attributes).apply(",
          "64": "            lambda x: x,",
          "65": "            raw=raw,",
          "66": "        )",
          "67": "\n"
        },
        "content_change": {
          "63": "        if '_grouper' in attributes:\n            attributes.pop('_grouper')\n        item = df.rolling(**attributes).apply("
        }
      },
      {
        "file": "nalepae_pandarallel/tests/test_pandarallel.py",
        "function": {
          "341": "test_dataframe_groupby_expanding_apply"
        },
        "content_all": {
          "338": "        res = (",
          "339": "            df.groupby(\"a\")",
          "340": "            .b.expanding()",
          "341": "            .apply(func_dataframe_groupby_expanding_apply, raw=False)",
          "342": "        )",
          "343": "        res_parallel = (",
          "344": "            df.groupby(\"a\")",
          "345": "            .b.expanding()",
          "346": "            .parallel_apply(func_dataframe_groupby_expanding_apply, raw=False)",
          "347": "        )"
        },
        "content_change": {
          "341": "            .apply(func_dataframe_groupby_expanding_apply, raw=False)"
        }
      },
      {
        "file": "nalepae_pandarallel/tests/test_pandarallel.py",
        "function": {
          "341": "test_dataframe_groupby_rolling_apply"
        },
        "content_all": {
          "338": "        res = (",
          "339": "            df.groupby(\"a\")",
          "340": "            .b.rolling()",
          "341": "            .apply(func_dataframe_groupby_rolling_apply, raw=False)",
          "342": "        )",
          "343": "        res_parallel = (",
          "344": "            df.groupby(\"a\")",
          "345": "            .b.rolling()",
          "346": "            .parallel_apply(func_dataframe_groupby_rolling_apply, raw=False)",
          "347": "        )"
        },
        "content_change": {
          "341": "            .apply(func_dataframe_groupby_rolling_apply, raw=False)"
        }
      }
    ],
    "location_ground": [
      {
        "file": "nalepae_pandarallel/pandarallel/data_types/expanding_groupby.py",
        "function": {
          "15": "compute_result"
        },
        "content_all": {
          "12": "class ExpandingGroupBy:\n",
          "13": "    class Apply(DataType):\n",
          "14": "        @staticmethod\n",
          "15": "        def compute_result(df, func, *args, **attributes):\n",
          "16": "            item = df.expanding(**attributes).apply(func, *args)\n",
          "17": "            return item\n",
          "18": "        \n"
        },
        "content_change": {
          "15": "        def compute_result(df, func, *args, **attributes):\n",
          "16": "            if '_grouper' in attributes:\n",
          "17": "                del attributes['_grouper']\n",
          "18": "            item = df.expanding(**attributes).apply(func, *args)\n"
        }
      },
      {
        "file": "nalepae_pandarallel/pandarallel/data_types/rolling_groupby.py",
        "function": {
          "15": "compute_result"
        },
        "content_all": {
          "12": "class RollingGroupBy:\n",
          "13": "    class Apply(DataType):\n",
          "14": "        @staticmethod\n",
          "15": "        def compute_result(df, func, *args, **attributes):\n",
          "16": "            item = df.rolling(**attributes).apply(func, *args)\n",
          "17": "            return item\n",
          "18": "        \n"
        },
        "content_change": {
          "15": "        def compute_result(df, func, *args, **attributes):\n",
          "16": "            if '_grouper' in attributes:\n",
          "17": "                del attributes['_grouper']\n",
          "18": "            item = df.rolling(**attributes).apply(func, *args)\n"
        }
      }
    ],
    "location_ground_exp": [
      {
        "file": "nalepae_pandarallel/pandarallel/data_types/expanding_groupby.py",
        "function": {
          "12": "ExpandingGroupBy.Apply.get_chunks"
        },
        "content_all": {
          "11": "class ExpandingGroupBy:",
          "12": "    class Apply(DataType):",
          "13": "        @staticmethod",
          "14": "        def get_chunks(",
          "15": "            nb_workers: int, data: PandasExpandingGroupby, *args, **kwargs",
          "16": "        ) -> Iterator[List[Tuple[int, Dict[str, Any]]]]:"
        },
        "content_change": {
          "15": "            if '_grouper' in data.__dict__:\n                data.__dict__.pop('_grouper')\n            nb_workers: int, data: PandasExpandingGroupby, *args, **kwargs"
        }
      },
      {
        "file": "nalepae_pandarallel/pandarallel/data_types/rolling_groupby.py",
        "function": {
          "12": "RollingGroupBy.Apply.get_chunks"
        },
        "content_all": {
          "11": "class RollingGroupBy:",
          "12": "    class Apply(DataType):",
          "13": "        @staticmethod",
          "14": "        def get_chunks(",
          "15": "            nb_workers: int, data: PandasRollingGroupby, *args, **kwargs",
          "16": "        ) -> Iterator[List[Tuple[int, Dict[str, Any]]]]:"
        },
        "content_change": {
          "15": "            if '_grouper' in data.__dict__:\n                data.__dict__.pop('_grouper')\n            nb_workers: int, data: PandasRollingGroupby, *args, **kwargs"
        }
      }
    ],
    "location_ground_truth": [
      {
        "file": "nalepae_pandarallel/pandarallel/data_types/expanding_groupby.py",
        "function": null,
        "content_all": {},
        "content_change": {}
      },
      {
        "file": "nalepae_pandarallel/pandarallel/data_types/rolling_groupby.py",
        "function": null,
        "content_all": {},
        "content_change": {}
      }
    ],
    "patch_i": "\n--- a/nalepae_pandarallel/tests/test_pandarallel.py\n+++ b/nalepae_pandarallel/tests/test_pandarallel.py\n@@ -63,7 +63,7 @@ def func_series_map(request):\n         return math.log10(math.sqrt(math.exp(x**2)))\n     \n     return dict(\n-        named=func, anonymous=lambda x: math.\n+        named=func, anonymous=lambda x: math.log10(math.sqrt(math.exp(x**2)))\n     )[request.param]\n",
    "patch_im": "\n--- a/nalepae_pandarallel/pandarallel/data_types/expanding_groupby.py\n+++ b/nalepae_pandarallel/pandarallel/data_types/expanding_groupby.py\n@@ -61,7 +61,9 @@ class ExpandingGroupBy:\n                 return True\n     class Apply(DataType):\n         @staticmethod\n         def get_chunks(\n-            nb_workers: int, data: PandasExpandingGroupby, *args, **kwargs\n+            nb_workers: int, data: PandasExpandingGroupby, *args, **kwargs\n         ) -> Iterator[List[Tuple[int, str]]]:\n             for i in range(nb_workers):\n                 yield list(data)[i::nb_workers]\n \n@@ -73,7 +75,9 @@ class ExpandingGroupBy:\n             worker_index: int,\n             progress_bar: bool,\n             *args,\n             **kwargs\n-        ):\n+        ) -> pd.DataFrame:\n+            if '_grouper' in kwargs:\n+                kwargs.pop('_grouper')\n             return data.expanding(*args, **kwargs).apply(func)\n \n         @staticmethod\n         def merge(*args\n@@ -90,7 +94,9 @@ class ExpandingGroupBy:\n                 data = data[: int(_max)]\n             return data\n \n class RollingGroupBy:\n     class Apply(DataType):\n         @staticmethod\n         def get_chunks(\n             nb_workers: int, data: PandasRollingGroupby, *args, **kwargs\n+            nb_workers: int, data: PandasRollingGroupby, *args, **kwargs\n         ) -> Iterator[List[Tuple[int, str]]]:\n             for i in range(nb_workers):\n                 yield list(data)[i::nb_workers]\n \n@@ -103,7 +109,9 @@ class RollingGroupBy:\n             worker_index: int,\n             progress_bar: bool,\n             *args,\n             **kwargs\n-        ):\n+        ) -> pd.DataFrame:\n+            if '_grouper' in kwargs:\n+                kwargs.pop('_grouper')\n             return data.rolling(*args, **kwargs).apply(func)\n \n         @staticmethod\n         def merge(*args\n",
    "patch_il": "\n--- a/nalepae_pandarallel/tests/test_pandarallel.py\n+++ b/nalepae_pandarallel/tests/test_pandarallel.py\n@@ -63,7 +63,7 @@ def func_series_map(request):\n         return math.log10(math.sqrt(math.exp(x**2)))\n \n     return dict(\n-        named=func, anonymous=lambda x: math.\n+        named=func, anonymous=lambda x: math.log10(math.sqrt(math.exp(x**2)))\n     )[request.param]\n \n",
    "patch_iml": "\n--- a/nalepae_pandarallel/pandarallel/data_types/expanding_groupby.py\n+++ b/nalepae_pandarallel/pandarallel/data_types/expanding_groupby.py\n@@ -60,7 +60,10 @@ class ExpandingGroupBy:\n             )\n \n         )\n-        item = df.expanding(**attributes).apply(\n+        if '_grouper' in attributes:\n+            attributes.pop('_grouper')\n+\n+        item = df.expanding(**attributes).apply(\n             lambda x: x,\n             raw=raw,\n         )\n--- a/nalepae_pandarallel/pandarallel/data_types/rolling_groupby.py\n+++ b/nalepae_pandarallel/pandarallel/data_types/rolling_groupby.py\n@@ -60,7 +60,10 @@ class RollingGroupBy:\n             )\n \n         )\n-        item = df.rolling(**attributes).apply(\n+        if '_grouper' in attributes:\n+            attributes.pop('_grouper')\n+\n+        item = df.rolling(**attributes).apply(\n             lambda x: x,\n             raw=raw,\n         )\n",
    "patch_ground": "\n--- a/nalepae_pandarallel/pandarallel/data_types/expanding_groupby.py\n+++ b/nalepae_pandarallel/pandarallel/data_types/expanding_groupby.py\n@@ -60,7 +60,10 @@ class ExpandingGroupBy:\n             def compute_result(df, func, *args, **attributes):\n                 # This function executes in parallel.\n                 # If you need custom pre or post-processing\n                 # for expanding members of df instances, feel free to do so here.\n-                return df.expanding(**attributes).apply(func, *args)\n+                # Remove _grouper if it exists in attributes\n+                attributes.pop('_grouper', None)\n+                return df.expanding(**attributes).apply(func, *args)\n \n---\n--- a/nalepae_pandarallel/pandarallel/data_types/rolling_groupby.py\n+++ b/nalepae_pandarallel/pandarallel/data_types/rolling_groupby.py\n@@ -60,7 +60,10 @@ class RollingGroupBy:\n             def compute_result(df, func, *args, **attributes):\n                 # This function executes in parallel.\n                 # If you need custom pre or post-processing\n                 # for rolling members of df instances, feel free to do so here.\n-                return df.rolling(**attributes).apply(func, *args)\n+                # Remove _grouper if it exists in attributes\n+                attributes.pop('_grouper', None)\n+                return df.rolling(**attributes).apply(func, *args)\n\n",
    "patch_ground_location": "\n--- a/nalepae_pandarparallel/pandarparallel/data_types/expanding_groupby.py\n+++ b/nalepae_pandarparallel/pandarparallel/data_types/expanding_groupby.py\n@@ -13,6 +13,9 @@\n         def compute_result(df, func, *args, **attributes):\n-            item = df.expanding(**attributes).apply(func, *args)\n+            if '_grouper' in attributes:\n+                del attributes['_grouper']\n+            item = df.expanding(**attributes).apply(func, *args)\n             return item\n",
    "patch_ground_exp": "\n--- a/nalepae_pandarallel/pandarallel/data_types/expanding_groupby.py\n+++ b/nalepae_pandarallel/pandarallel/data_types/expanding_groupby.py\n@@ -12,6 +12,8 @@\n class ExpandingGroupBy:\n     class Apply(DataType):\n         @staticmethod\n+        def get_chunks(\n+             nb_workers: int, data: PandasExpandingGroupby, *args, **kwargs\n+         ) -> Iterator[List[Tuple[int, Dict[str, Any]]]]:\n+            if '_grouper' in data.__dict__:\n+                data.__dict__.pop('_grouper')\n             nb_workers: int, data: PandasExpandingGroupby, *args, **kwargs\n         ) -> Iterator[List[Tuple[int, Dict[str, Any]]]]:\n",
    "patch_ground_all": "\n--- a/nalepae_pandarallel/pandarallel/data_types/expanding_groupby.py\n+++ b/nalepae_pandarallel/pandarallel/data_types/expanding_groupby.py\n@@ -11,6 +11,10 @@\n class ExpandingGroupBy:\n     class Apply(DataType):\n         @staticmethod\n+        def sanitize_attributes(data: PandasExpandingGroupby) -> PandasExpandingGroupby:\n+            if '_grouper' in data.__dict__:\n+                data.__dict__.pop('_grouper')\n+\n         def get_chunks(\n             nb_workers: int, data: PandasExpandingGroupby, *args, **kwargs\n         ) -> Iterator[List[Tuple[int, int, int]]]:\n@@ -18,6 +22,7 @@\n                 raise ValueError(\"No data to parallelize\")\n\n             data = data._selected_obj\n+            data = ExpandingGroupBy.Apply.sanitize_attributes(data)\n             \n             chunks = chunk(data, nb_workers)\n\n--- a/nalepae_pandarallel/pandarallel/data_types/rolling_groupby.py\n+++ b/nalepae_pandarallel/pandarallel/data_types/rolling_groupby.py\n@@ -11,6 +11,10 @@\n class RollingGroupBy:\n     class Apply(DataType):\n         @staticmethod\n+        def sanitize_attributes(data: PandasRollingGroupby) -> PandasRollingGroupby:\n+            if '_grouper' in data.__dict__:\n+                data.__dict__.pop('_grouper')\n+\n         def get_chunks(\n             nb_workers: int, data: PandasRollingGroupby, *args, **kwargs\n         ) -> Iterator[List[Tuple[int, int, int]]]:\n@@ -18,6 +22,7 @@\n                 raise ValueError(\"No data to parallelize\")\n\n             data = data._selected_obj\n+            data = RollingGroupBy.Apply.sanitize_attributes(data)\n             \n             chunks = chunk(data, nb_workers)\n",
    "patch_ground_truth": "--- a/nalepae_pandarallel/pandarallel/data_types/expanding_groupby.py\n+++ b/nalepae_pandarallel/pandarallel/data_types/expanding_groupby.py\n@@ -76,6 +76,7 @@\n                 return item\n \n             attributes = extra[\"attributes\"]\n+            attributes.pop(\"_grouper\", None)\n \n             dfs = (\n                 compute_result(\n--- a/nalepae_pandarallel/pandarallel/data_types/rolling_groupby.py\n+++ b/nalepae_pandarallel/pandarallel/data_types/rolling_groupby.py\n@@ -76,6 +76,7 @@\n                 return item\n \n             attributes = extra[\"attributes\"]\n+            attributes.pop(\"_grouper\", None)\n \n             dfs = (\n                 compute_result(\n",
    "message": "--------------------------------------------------------------------------------------- Captured stdout setup ---------------------------------------------------------------------------------------\nINFO: Pandarallel will run on 2 workers.\nINFO: Pandarallel will use Memory file system to transfer data between the main process and workers.\n__________________________________________________________________ test_dataframe_groupby_expanding_apply[anonymous-1-False-False] __________________________________________________________________\nmultiprocessing.pool.RemoteTraceback: \n\"\"\"\nTraceback (most recent call last):\n  File \"/home/user/anaconda3/envs/py39/lib/python3.9/multiprocessing/pool.py\", line 125, in worker\n    result = (True, func(*args, **kwds))\n  File \"/home/user/anaconda3/envs/py39/lib/python3.9/multiprocessing/pool.py\", line 51, in starmapstar\n    return list(itertools.starmap(args[0], args[1]))\n  File \"/home/user/Documents/repoben/buggycode/nalepae_pandarallel/pandarallel/core.py\", line 158, in __call__\n    results = self.work_function(\n  File \"/home/user/Documents/repoben/buggycode/nalepae_pandarallel/pandarallel/data_types/expanding_groupby.py\", line 93, in work\n    return pd.concat(dfs)\n  File \"/home/user/anaconda3/envs/py39/lib/python3.9/site-packages/pandas/core/reshape/concat.py\", line 380, in concat\n    op = _Concatenator(\n  File \"/home/user/anaconda3/envs/py39/lib/python3.9/site-packages/pandas/core/reshape/concat.py\", line 443, in __init__\n    objs, keys = self._clean_keys_and_objs(objs, keys)\n  File \"/home/user/anaconda3/envs/py39/lib/python3.9/site-packages/pandas/core/reshape/concat.py\", line 502, in _clean_keys_and_objs\n    objs_list = list(objs)\n  File \"/home/user/Documents/repoben/buggycode/nalepae_pandarallel/pandarallel/data_types/expanding_groupby.py\", line 81, in <genexpr>\n    compute_result(\n  File 
\"/home/user/Documents/repoben/buggycode/nalepae_pandarallel/pandarallel/data_types/expanding_groupby.py\", line 63, in compute_result\n    item = df.expanding(**attributes).apply(\nTypeError: expanding() got an unexpected keyword argument '_grouper'\n\"\"\"\n\nThe above exception was the direct cause of the following exception:\n\npandarallel_init = None, func_dataframe_groupby_expanding_apply = <function func_dataframe_groupby_expanding_apply.<locals>.<lambda> at 0x74cbdaad0af0>, df_size = 1\n\n    def test_dataframe_groupby_expanding_apply(\n        pandarallel_init, func_dataframe_groupby_expanding_apply, df_size\n    ):\n        df = pd.DataFrame(\n            dict(a=np.random.randint(1, 10, df_size), b=np.random.rand(df_size))\n        )\n    \n        res = (\n            df.groupby(\"a\")\n            .b.expanding()\n            .apply(func_dataframe_groupby_expanding_apply, raw=False)\n        )\n        res_parallel = (\n>           df.groupby(\"a\")\n            .b.expanding()\n            .parallel_apply(func_dataframe_groupby_expanding_apply, raw=False)\n        )\n\ntests/test_pandarallel.py:341: \n_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _\npandarallel/core.py:444: in closure\n    results = results_promise.get()\n_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _\n\nself = <multiprocessing.pool.MapResult object at 0x74cbdc671040>, timeout = None\n\n    def get(self, timeout=None):\n        self.wait(timeout)\n        if not self.ready():\n            raise TimeoutError\n        if self._success:\n            return self._value\n        else:\n>           raise self._value\nE           TypeError: expanding() got an unexpected keyword argument 
'_grouper'\n\n../../../../anaconda3/envs/py39/lib/python3.9/multiprocessing/pool.py:771: TypeError\n--------------------------------------------------------------------------------------- Captured stdout setup ---------------------------------------------------------------------------------------\nINFO: Pandarallel will run on 2 workers.\nINFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.\n___________________________________________________________________ test_dataframe_groupby_expanding_apply[anonymous-1-True-None] ___________________________________________________________________\nmultiprocessing.pool.RemoteTraceback: \n\"\"\"\nTraceback (most recent call last):\n  File \"/home/user/anaconda3/envs/py39/lib/python3.9/multiprocessing/pool.py\", line 125, in worker\n    result = (True, func(*args, **kwds))\n  File \"/home/user/anaconda3/envs/py39/lib/python3.9/multiprocessing/pool.py\", line 51, in starmapstar\n    return list(itertools.starmap(args[0], args[1]))\n  File \"/home/user/Documents/repoben/buggycode/nalepae_pandarallel/pandarallel/core.py\", line 95, in __call__\n    result = self.work_function(\n  File \"/home/user/Documents/repoben/buggycode/nalepae_pandarallel/pandarallel/data_types/expanding_groupby.py\", line 93, in work\n    return pd.concat(dfs)\n  File \"/home/user/anaconda3/envs/py39/lib/python3.9/site-packages/pandas/core/reshape/concat.py\", line 380, in concat\n    op = _Concatenator(\n  File \"/home/user/anaconda3/envs/py39/lib/python3.9/site-packages/pandas/core/reshape/concat.py\", line 443, in __init__\n    objs, keys = self._clean_keys_and_objs(objs, keys)\n  File \"/home/user/anaconda3/envs/py39/lib/python3.9/site-packages/pandas/core/reshape/concat.py\", line 502, in _clean_keys_and_objs\n    objs_list = list(objs)\n  File \"/home/user/Documents/repoben/buggycode/nalepae_pandarallel/pandarallel/data_types/expanding_groupby.py\", line 81, in <genexpr>\n    
compute_result(\n  File \"/home/user/Documents/repoben/buggycode/nalepae_pandarallel/pandarallel/data_types/expanding_groupby.py\", line 63, in compute_result\n    item = df.expanding(**attributes).apply(\nTypeError: expanding() got an unexpected keyword argument '_grouper'\n\"\"\"\n\nThe above exception was the direct cause of the following exception:\n\npandarallel_init = None, func_dataframe_groupby_expanding_apply = <function func_dataframe_groupby_expanding_apply.<locals>.<lambda> at 0x74cbdadaf790>, df_size = 1\n\n    def test_dataframe_groupby_expanding_apply(\n        pandarallel_init, func_dataframe_groupby_expanding_apply, df_size\n    ):\n        df = pd.DataFrame(\n            dict(a=np.random.randint(1, 10, df_size), b=np.random.rand(df_size))\n        )\n    \n        res = (\n            df.groupby(\"a\")\n            .b.expanding()\n            .apply(func_dataframe_groupby_expanding_apply, raw=False)\n        )\n        res_parallel = (\n>           df.groupby(\"a\")\n            .b.expanding()\n            .parallel_apply(func_dataframe_groupby_expanding_apply, raw=False)\n        )\n\ntests/test_pandarallel.py:341: \n_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _\npandarallel/core.py:333: in closure\n    results_promise.get()\n_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _\n\nself = <multiprocessing.pool.MapResult object at 0x74cbdac11610>, timeout = None\n\n    def get(self, timeout=None):\n        self.wait(timeout)\n        if not self.ready():\n            raise TimeoutError\n        if self._success:\n            return self._value\n        else:\n>           raise self._value\nE           TypeError: expanding() got an unexpected keyword 
argument '_grouper'\n\n../../../../anaconda3/envs/py39/lib/python3.9/multiprocessing/pool.py:771: TypeError\n--------------------------------------------------------------------------------------- Captured stdout setup ---------------------------------------------------------------------------------------\nINFO: Pandarallel will run on 2 workers.\nINFO: Pandarallel will use Memory file system to transfer data between the main process and workers.\n--------------------------------------------------------------------------------------- Captured stdout call ----------------------------------------------------------------------------------------\n   0.00%                                          |        0 /        1 |      \n__________________________________________________________________ test_dataframe_groupby_expanding_apply[anonymous-1-True-False] ___________________________________________________________________\nmultiprocessing.pool.RemoteTraceback: \n\"\"\"\nTraceback (most recent call last):\n  File \"/home/user/anaconda3/envs/py39/lib/python3.9/multiprocessing/pool.py\", line 125, in worker\n    result = (True, func(*args, **kwds))\n  File \"/home/user/anaconda3/envs/py39/lib/python3.9/multiprocessing/pool.py\", line 51, in starmapstar\n    return list(itertools.starmap(args[0], args[1]))\n  File \"/home/user/Documents/repoben/buggycode/nalepae_pandarallel/pandarallel/core.py\", line 158, in __call__\n    results = self.work_function(\n  File \"/home/user/Documents/repoben/buggycode/nalepae_pandarallel/pandarallel/data_types/expanding_groupby.py\", line 93, in work\n    return pd.concat(dfs)\n  File \"/home/user/anaconda3/envs/py39/lib/python3.9/site-packages/pandas/core/reshape/concat.py\", line 380, in concat\n    op = _Concatenator(\n  File \"/home/user/anaconda3/envs/py39/lib/python3.9/site-packages/pandas/core/reshape/concat.py\", line 443, in __init__\n    objs, keys = self._clean_keys_and_objs(objs, keys)\n  File 
\"/home/user/anaconda3/envs/py39/lib/python3.9/site-packages/pandas/core/reshape/concat.py\", line 502, in _clean_keys_and_objs\n    objs_list = list(objs)\n  File \"/home/user/Documents/repoben/buggycode/nalepae_pandarallel/pandarallel/data_types/expanding_groupby.py\", line 81, in <genexpr>\n    compute_result(\n  File \"/home/user/Documents/repoben/buggycode/nalepae_pandarallel/pandarallel/data_types/expanding_groupby.py\", line 63, in compute_result\n    item = df.expanding(**attributes).apply(\nTypeError: expanding() got an unexpected keyword argument '_grouper'\n\"\"\"\n\nThe above exception was the direct cause of the following exception:\n\npandarallel_init = None, func_dataframe_groupby_expanding_apply = <function func_dataframe_groupby_expanding_apply.<locals>.<lambda> at 0x74cbdab66940>, df_size = 1\n\n    def test_dataframe_groupby_expanding_apply(\n        pandarallel_init, func_dataframe_groupby_expanding_apply, df_size\n    ):\n        df = pd.DataFrame(\n            dict(a=np.random.randint(1, 10, df_size), b=np.random.rand(df_size))\n        )\n    \n        res = (\n            df.groupby(\"a\")\n            .b.expanding()\n            .apply(func_dataframe_groupby_expanding_apply, raw=False)\n        )\n        res_parallel = (\n>           df.groupby(\"a\")\n            .b.expanding()\n            .parallel_apply(func_dataframe_groupby_expanding_apply, raw=False)\n        )\n\ntests/test_pandarallel.py:341: \n_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _\npandarallel/core.py:444: in closure\n    results = results_promise.get()\n_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _\n\nself = <multiprocessing.pool.MapResult object at 0x74cbdabdcd90>, 
timeout = None\n\n    def get(self, timeout=None):\n        self.wait(timeout)\n        if not self.ready():\n            raise TimeoutError\n        if self._success:\n            return self._value\n        else:\n>           raise self._value\nE           TypeError: expanding() got an unexpected keyword argument '_grouper'\n\n../../../../anaconda3/envs/py39/lib/python3.9/multiprocessing/pool.py:771: TypeError\n--------------------------------------------------------------------------------------- Captured stdout setup ---------------------------------------------------------------------------------------\nINFO: Pandarallel will run on 2 workers.\nINFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.\n--------------------------------------------------------------------------------------- Captured stdout call ----------------------------------------------------------------------------------------\n   0.00%                                          |        0 /        1 |      \n========================================================================================= warnings summary ==========================================================================================\ntests/test_pandarallel.py: 16 warnings\n  /home/user/Documents/repoben/buggycode/nalepae_pandarallel/tests/test_pandarallel.py:235: FutureWarning: DataFrame.applymap has been deprecated. 
Use DataFrame.map instead.\n    res = df.applymap(func_dataframe_applymap)\n\n-- Docs: https://docs.pytest.org/en/stable/how-to/capture-warnings.html\n====================================================================================== short test summary info ======================================================================================\nFAILED tests/test_pandarallel.py::test_dataframe_groupby_rolling_apply[named-1000-False-None] - TypeError: rolling() got an unexpected keyword argument '_grouper'\nFAILED tests/test_pandarallel.py::test_dataframe_groupby_rolling_apply[named-1000-False-False] - TypeError: rolling() got an unexpected keyword argument '_grouper'\nFAILED tests/test_pandarallel.py::test_dataframe_groupby_rolling_apply[named-1000-True-None] - TypeError: rolling() got an unexpected keyword argument '_grouper'\nFAILED tests/test_pandarallel.py::test_dataframe_groupby_rolling_apply[named-1000-True-False] - TypeError: rolling() got an unexpected keyword argument '_grouper'\nFAILED tests/test_pandarallel.py::test_dataframe_groupby_rolling_apply[named-1-False-None] - TypeError: rolling() got an unexpected keyword argument '_grouper'\nFAILED tests/test_pandarallel.py::test_dataframe_groupby_rolling_apply[named-1-False-False] - TypeError: rolling() got an unexpected keyword argument '_grouper'\nFAILED tests/test_pandarallel.py::test_dataframe_groupby_rolling_apply[named-1-True-None] - TypeError: rolling() got an unexpected keyword argument '_grouper'\nFAILED tests/test_pandarallel.py::test_dataframe_groupby_rolling_apply[named-1-True-False] - TypeError: rolling() got an unexpected keyword argument '_grouper'\nFAILED tests/test_pandarallel.py::test_dataframe_groupby_rolling_apply[anonymous-1000-False-None] - TypeError: rolling() got an unexpected keyword argument '_grouper'\nFAILED tests/test_pandarallel.py::test_dataframe_groupby_rolling_apply[anonymous-1000-False-False] - TypeError: rolling() got an unexpected keyword argument '_grouper'\nFAILED 
tests/test_pandarallel.py::test_dataframe_groupby_rolling_apply[anonymous-1000-True-None] - TypeError: rolling() got an unexpected keyword argument '_grouper'\nFAILED tests/test_pandarallel.py::test_dataframe_groupby_rolling_apply[anonymous-1000-True-False] - TypeError: rolling() got an unexpected keyword argument '_grouper'\nFAILED tests/test_pandarallel.py::test_dataframe_groupby_rolling_apply[anonymous-1-False-None] - TypeError: rolling() got an unexpected keyword argument '_grouper'\nFAILED tests/test_pandarallel.py::test_dataframe_groupby_rolling_apply[anonymous-1-False-False] - TypeError: rolling() got an unexpected keyword argument '_grouper'\nFAILED tests/test_pandarallel.py::test_dataframe_groupby_rolling_apply[anonymous-1-True-None] - TypeError: rolling() got an unexpected keyword argument '_grouper'\nFAILED tests/test_pandarallel.py::test_dataframe_groupby_rolling_apply[anonymous-1-True-False] - TypeError: rolling() got an unexpected keyword argument '_grouper'\nFAILED tests/test_pandarallel.py::test_dataframe_groupby_expanding_apply[named-1000-False-None] - TypeError: expanding() got an unexpected keyword argument '_grouper'\nFAILED tests/test_pandarallel.py::test_dataframe_groupby_expanding_apply[named-1000-False-False] - TypeError: expanding() got an unexpected keyword argument '_grouper'\nFAILED tests/test_pandarallel.py::test_dataframe_groupby_expanding_apply[named-1000-True-None] - TypeError: expanding() got an unexpected keyword argument '_grouper'\nFAILED tests/test_pandarallel.py::test_dataframe_groupby_expanding_apply[named-1000-True-False] - TypeError: expanding() got an unexpected keyword argument '_grouper'\nFAILED tests/test_pandarallel.py::test_dataframe_groupby_expanding_apply[named-1-False-None] - TypeError: expanding() got an unexpected keyword argument '_grouper'\nFAILED tests/test_pandarallel.py::test_dataframe_groupby_expanding_apply[named-1-False-False] - TypeError: expanding() got an unexpected keyword argument '_grouper'\nFAILED 
tests/test_pandarallel.py::test_dataframe_groupby_expanding_apply[named-1-True-None] - TypeError: expanding() got an unexpected keyword argument '_grouper'\nFAILED tests/test_pandarallel.py::test_dataframe_groupby_expanding_apply[named-1-True-False] - TypeError: expanding() got an unexpected keyword argument '_grouper'\nFAILED tests/test_pandarallel.py::test_dataframe_groupby_expanding_apply[anonymous-1000-False-None] - TypeError: expanding() got an unexpected keyword argument '_grouper'\nFAILED tests/test_pandarallel.py::test_dataframe_groupby_expanding_apply[anonymous-1000-False-False] - TypeError: expanding() got an unexpected keyword argument '_grouper'\nFAILED tests/test_pandarallel.py::test_dataframe_groupby_expanding_apply[anonymous-1000-True-None] - TypeError: expanding() got an unexpected keyword argument '_grouper'\nFAILED tests/test_pandarallel.py::test_dataframe_groupby_expanding_apply[anonymous-1000-True-False] - TypeError: expanding() got an unexpected keyword argument '_grouper'\nFAILED tests/test_pandarallel.py::test_dataframe_groupby_expanding_apply[anonymous-1-False-None] - TypeError: expanding() got an unexpected keyword argument '_grouper'\nFAILED tests/test_pandarallel.py::test_dataframe_groupby_expanding_apply[anonymous-1-False-False] - TypeError: expanding() got an unexpected keyword argument '_grouper'\nFAILED tests/test_pandarallel.py::test_dataframe_groupby_expanding_apply[anonymous-1-True-None] - TypeError: expanding() got an unexpected keyword argument '_grouper'\nFAILED tests/test_pandarallel.py::test_dataframe_groupby_expanding_apply[anonymous-1-True-False] - TypeError: expanding() got an unexpected keyword argument '_grouper'\n============================================================================ 32 failed, 185 passed, 16 warnings in 7.71s ============================================================================",
    "CodeBase": [
      {
        "path": "nalepae_pandarallel/tests/test_pandarallel.py",
        "content": "1 import importlib\n2 import math\n3 \n4 import numpy as np\n5 import pandas as pd\n6 import pytest\n7 from pandarallel import pandarallel\n8 \n9 \n10 @pytest.fixture(params=(1000, 1))\n11 def df_size(request):\n12     return request.param\n13 \n14 \n15 @pytest.fixture(params=(False, True))\n16 def progress_bar(request):\n17     return request.param\n18 \n19 \n20 @pytest.fixture(params=(None, False))\n21 def use_memory_fs(request):\n22     return request.param\n23 \n24 \n25 @pytest.fixture(params=(RuntimeError, AttributeError, ZeroDivisionError))\n26 def exception(request):\n27     return request.param\n28 \n29 \n30 @pytest.fixture(params=(\"named\", \"anonymous\"))\n31 def func_dataframe_apply_axis_0(request):\n32     def func(x):\n33         return max(x) - min(x)\n34 \n35     return dict(named=func, anonymous=lambda x: max(x) - min(x))[request.param]\n36 \n37 \n38 @pytest.fixture(params=(\"named\", \"anonymous\"))\n39 def func_dataframe_apply_axis_1(request):\n40     def func(x):\n41         return math.sin(x.a**2) + math.sin(x.b**2)\n42 \n43     return dict(\n44         named=func, anonymous=lambda x: math.sin(x.a**2) + math.sin(x.b**2)\n45     )[request.param]\n46 \n47 \n48 @pytest.fixture(params=(\"named\", \"anonymous\"))\n49 def func_dataframe_applymap(request):\n50     def func(x):\n51         return math.sin(x**2) - math.cos(x**2)\n52 \n53     return dict(named=func, anonymous=lambda x: math.sin(x**2) - math.cos(x**2))[\n54         request.param\n55     ]\n56 \n57 \n58 @pytest.fixture(params=(\"named\", \"anonymous\"))\n59 def func_series_map(request):\n60     def func(x):\n61         return math.log10(math.sqrt(math.exp(x**2)))\n62 \n63     return dict(\n64         named=func, anonymous=lambda x: math.(...truncated)"
      },
      {
        "path": "nalepae_pandarallel/pandarallel/core.py",
        "content": "1 import multiprocessing\n2 import os\n3 import pickle\n4 from itertools import count\n5 from multiprocessing.managers import SyncManager\n6 from pathlib import Path\n7 from tempfile import NamedTemporaryFile\n8 from typing import Any, Callable, Dict, Iterator, Optional, Tuple, Type, cast\n9 \n10 import dill\n11 import pandas as pd\n12 import psutil\n13 from pandas.core.groupby import DataFrameGroupBy as PandaDataFrameGroupBy\n14 from pandas.core.window.expanding import ExpandingGroupby as PandasExpandingGroupby\n15 from pandas.core.window.rolling import RollingGroupby as PandasRollingGroupby\n16 \n17 from .data_types import (\n18     DataFrame,\n19     DataFrameGroupBy,\n20     DataType,\n21     ExpandingGroupBy,\n22     RollingGroupBy,\n23     Series,\n24     SeriesRolling,\n25 )\n26 from .progress_bars import ProgressBarsType, get_progress_bars, progress_wrapper\n27 from .utils import WorkerStatus\n28 \n29 ON_WINDOWS = os.name == \"nt\"\n30 CONTEXT = multiprocessing.get_context(\"spawn\" if ON_WINDOWS else \"fork\")\n31 \n32 # Root of Memory File System\n33 MEMORY_FS_ROOT = os.environ.get(\"MEMORY_FS_ROOT\", \"/dev/shm\")\n34 \n35 # By default, Pand(...truncated)"
      },
      {
        "path": "nalepae_pandarallel/pandarallel/progress_bars.py",
        "content": "1 import multiprocessing\n2 import os\n3 import shutil\n4 import sys\n5 from abc import ABC, abstractmethod\n6 from enum import Enum\n7 from itertools import count\n8 from time import time_ns\n9 from typing import Callable, List, Union\n10 \n11 from .utils import WorkerStatus\n12 \n13 INTERVAL_NS = 250_000_000  # 0.25 sec\n14 MINIMUM_TERMINAL_WIDTH = 72\n15 \n16 \n17 class ProgressBarsType(int, Enum):\n18     No = 0\n19     InUserDefinedFunction = 1\n20     InUserDefinedFunctionMultiplyByNumberOfColumns = 2\n21     InWorkFunction = 3\n22 \n23 \n24 class ProgressBars(ABC):\n25     @abstractmethod\n26     def __init__(self, maxs: List[int], show: bool) -> None:\n27         ...\n28 \n29     @abstractmethod\n30     def update(self, values: List[int]) -> None:\n31         ...\n32 \n33     def set_error(self, index: int) -> None:\n34         pass\n35 \n36 \n37 class ProgressState(...truncated)"
      },
      {
        "path": "nalepae_pandarallel/pandarallel/data_types/expanding_groupby.py",
        "content": "1 import multiprocessing\n2 from typing import Any, Callable, Dict, Iterable, Iterator, List, Tuple\n3 \n4 import pandas as pd\n5 from pandas.core.window.expanding import ExpandingGroupby as PandasExpandingGroupby\n6 \n7 from ..utils import WorkerStatus, chunk, get_pandas_version\n8 from .generic import DataType\n9 \n10 \n11 class ExpandingGroupBy:\n12     class Apply(DataType):\n13         @staticmethod\n14         def get_chunks(\n15             nb_workers: int, data: PandasExpandingGroupby, *args, **kwargs\n16         ) -> Iterator[List[Tuple[int,(...truncated)"
      }
    ],
    "CommitSHA": "261a652cddb219ac353ff803e81646c08b72fc6f"
  },
  "Score": {
    "Difficulty": "Easy",
    "issue_origin": {
      "Title": 6,
      "Description": 5,
      "Reproducibility": 4,
      "Relevance": 6,
      "Explanation": 7,
      "Overall": 6
    },
    "issue_message": {
      "Title": 7,
      "Description": 6,
      "Reproducibility": 5,
      "Relevance": 8,
      "Explanation": 8,
      "Overall": 7
    },
    "issue_ground": {
      "Title": 8,
      "Description": 8,
      "Reproducibility": 8,
      "Relevance": 8,
      "Explanation": 8,
      "Overall": 8
    },
    "issue_ground_truth": {
      "title": "AttributeError: '_grouper' Key Error in ExpandingGroupBy and RollingGroupBy Operations",
      "description": "Users have reported encountering an `AttributeError` when using `ExpandingGroupBy` and `RollingGroupBy` operations in the `pandarallel` library. The error appears to be triggered due to the presence of an `_grouper` key in the attributes dictionary. This key does not seem to be properly handled, leading to crashes during parallel operations. To reproduce the issue, try using `parallel_apply` on `ExpandingGroupBy` or `RollingGroupBy` objects with any user-defined function. The expected behavior is that the operations should run without triggering an `AttributeError`. This issue needs to be addressed to ensure smooth parallel processing without errors.",
      "explanation": "### Summary of the Issue\n\nThe issue reported is an `AttributeError` encountered when using `ExpandingGroupBy` and `RollingGroupBy` operations in the `pandarallel` library. This error is triggered due to the presence of an `_grouper` key in the attributes dictionary, which is not handled properly, leading to crashes during parallel operations. The error commonly manifests when attempting to use the `parallel_apply` method on `ExpandingGroupBy` or `RollingGroupBy` objects with a user-defined function.\n\n### Detailed Explanation\n\n#### Cause of the Issue\n\nThe `AttributeError` is caused by the `_grouper` key in the attributes dictionary of `ExpandingGroupBy` and `RollingGroupBy` objects. In the context of the `pandarallel` library, these objects store certain attributes that describe how the data should be grouped and processed. The `_grouper` attribute is part of the internals of Pandas' groupby functionality and is not intended to be modified or used directly. However, during the execution of `parallel_apply`, this attribute is inadvertently included in the attributes dictionary, leading to errors when the library attempts to use or manipulate it.\n\n#### Content of the Commit\n\nTo resolve this issue, the developer made changes to the handling of the attributes dictionary in the `ExpandingGroupBy` and `RollingGroupBy` components of the library. Specifically, the commit involves the removal of the `_grouper` key from the attributes dictionary if it exists. This is achieved by adding a line to pop the `_grouper` key:\n\n1. **Location:** The fix is applied in the files handling `ExpandingGroupBy` and `RollingGroupBy`.\n2. 
**Action:** \n   - Before operating further with the attributes, the `_grouper` key is removed if it is present.\n   - This ensures that the subsequent operations do not encounter an `AttributeError` due to the presence of the `_grouper` key.\n\n### How the Commit Solves the Issue\n\nBy removing the `_grouper` key from the attributes dictionary, the developer ensures that this internal and potentially problematic attribute does not interfere with the functioning of `parallel_apply`. Here's a summarized breakdown of how the solution works:\n\n1. **Identification of Problem Area:** The commit identifies that the issue stems from handling the attributes dictionary in `ExpandingGroupBy` and `RollingGroupBy`.\n2. **Safe Removal:** The solution involves safely removing the `_grouper` key using a pop operation. This method is chosen because it can remove the key if it exists, and do nothing if it doesn't, thus preventing any further errors.\n3. **Prevention of Errors:** With the `_grouper` key removed, the attributes dictionary no longer contains keys that trigger `AttributeError` during parallel operations. This allows the `parallel_apply` method to run smoothly.\n\n### Solution Explanation\n\nIn essence, the solution involves a preventative measure to remove a specific key from a dictionary before it causes a problem. The developer implemented this by adding an additional step in the processing of the attributes dictionary to ensure that `_grouper` is not included. This simple yet effective approach prevents the crashing of `parallel_apply` and is a good example of handling internal dependencies safely within a library.\n\n### Conclusion\n\nThe issue is caused by the mishandling of internal attributes that are not meant to be modified or used directly. The commit solves this by removing the `_grouper` key from the attributes dictionary, thus preventing the `AttributeError`. 
This solution ensures that `ExpandingGroupBy` and `RollingGroupBy` operations can proceed without encountering errors related to the internal workings of Pandas' groupby functionality. This fix improves the robustness and reliability of the `pandarallel` library when handling parallel operations on grouped data."
    }
  }
}