{
  "RepoName": "https://github.com/nalepae/pandarallel.git",
  "CommitSHA": "261a652cddb219ac353ff803e81646c08b72fc6f",
  "Time": "",
  "Difficulty": "Easy",
  "Type": "logic error",
  "BuggyCode": [
    {
      "path": "nalepae_pandarallel/setup.py",
      "content": "from setuptools import setup\n\nsetup()\n"
    },
    {
      "path": "nalepae_pandarallel/tests/test_pandarallel.py",
      "content": "import importlib\nimport math\n\nimport numpy as np\nimport pandas as pd\nimport pytest\nfrom pandarallel import pandarallel\n\n\n@pytest.fixture(params=(1000, 1))\ndef df_size(request):\n    return request.param\n\n\n@pytest.fixture(params=(False, True))\ndef progress_bar(request):\n    return request.param\n\n\n@pytest.fixture(params=(None, False))\ndef use_memory_fs(request):\n    return request.param\n\n\n@pytest.fixture(params=(RuntimeError, AttributeError, ZeroDivisionError))\ndef exception(request):\n    return request.param\n\n\n@pytest.fixture(params=(\"named\", \"anonymous\"))\ndef func_dataframe_apply_axis_0(request):\n    def func(x):\n        return max(x) - min(x)\n\n    return dict(named=func, anonymous=lambda x: max(x) - min(x))[request.param]\n\n\n@pytest.fixture(params=(\"named\", \"anonymous\"))\ndef func_dataframe_apply_axis_1(request):\n    def func(x):\n        return math.sin(x.a**2) + math.sin(x.b**2)\n\n    return dict(\n        named=func, anonymous=lambda x: math.sin(x.a**2) + math.sin(x.b**2)\n    )[request.param]\n\n\n@pytest.fixture(params=(\"named\", \"anonymous\"))\ndef func_dataframe_applymap(request):\n    def func(x):\n        return math.sin(x**2) - math.cos(x**2)\n\n    return dict(named=func, anonymous=lambda x: math.sin(x**2) - math.cos(x**2))[\n        request.param\n    ]\n\n\n@pytest.fixture(params=(\"named\", \"anonymous\"))\ndef func_series_map(request):\n    def func(x):\n        return math.log10(math.sqrt(math.exp(x**2)))\n\n    return dict(\n        named=func, anonymous=lambda x: math.log10(math.sqrt(math.exp(x**2)))\n    )[request.param]\n\n\n@pytest.fixture(params=(\"named\", \"anonymous\"))\ndef func_series_apply(request):\n    def func(x, power, bias=0):\n        return math.log10(math.sqrt(math.exp(x**power))) + bias\n\n    return dict(\n        named=func,\n        anonymous=lambda x, power, bias=0: math.log10(math.sqrt(math.exp(x**power)))\n        + bias,\n    
)[request.param]\n\n\n@pytest.fixture(params=(\"named\", \"anonymous\"))\ndef func_series_rolling_apply(request):\n    def func(x):\n        return x.iloc[0] + x.iloc[1] ** 2 + x.iloc[2] ** 3 + x.iloc[3] ** 4\n\n    return dict(\n        named=func,\n        anonymous=lambda x: x.iloc[0]\n        + x.iloc[1] ** 2\n        + x.iloc[2] ** 3\n        + x.iloc[3] ** 4,\n    )[request.param]\n\n\n@pytest.fixture()\ndef func_dataframe_groupby_apply():\n    def func(df):\n        dum = 0\n        for item in df.b:\n            dum += math.log10(math.sqrt(math.exp(item**2)))\n\n        return dum / len(df.b)\n\n    return func\n\n\n@pytest.fixture()\ndef func_dataframe_groupby_apply_complex():\n    def func(df):\n        return pd.DataFrame(\n            [[df.b.mean(), df.b.min(), df.b.max()]],\n            columns=[\"b_mean\", \"b_min\", \"b_max\"],\n        )\n\n    return func\n\n\n@pytest.fixture(params=(\"named\", \"anonymous\"))\ndef func_dataframe_groupby_rolling_apply(request):\n    def func(x):\n        return x.iloc[0] + x.iloc[1] ** 2 + x.iloc[2] ** 3 + x.iloc[3] ** 4\n\n    return dict(\n        named=func,\n        anonymous=lambda x: x.iloc[0]\n        + x.iloc[1] ** 2\n        + x.iloc[2] ** 3\n        + x.iloc[3] ** 4,\n    )[request.param]\n\n\n@pytest.fixture(params=(\"named\", \"anonymous\"))\ndef func_dataframe_groupby_expanding_apply(request):\n    def func(x):\n        return (x.multiply(pd.Series(range(1, len(x)), dtype=\"float\"))).sum()\n\n    return dict(\n        named=func,\n        anonymous=lambda x: (\n            x.multiply(pd.Series(range(1, len(x)), dtype=\"float\"))\n        ).sum(),\n    )[request.param]\n\n\n@pytest.fixture(params=(\"named\", \"anonymous\"))\ndef func_dataframe_apply_axis_0_no_reduce(request):\n    def func(x):\n        return x\n\n    return dict(named=func, anonymous=lambda x: x)[request.param]\n\n\n@pytest.fixture(params=(\"named\", \"anonymous\"))\ndef func_dataframe_apply_axis_1_no_reduce(request):\n    def 
func(x):\n        return x**2\n\n    return dict(named=func, anonymous=lambda x: x**2)[request.param]\n\n\n@pytest.fixture\ndef pandarallel_init(progress_bar, use_memory_fs):\n    pandarallel.initialize(\n        progress_bar=progress_bar, use_memory_fs=use_memory_fs, nb_workers=2\n    )\n\n\ndef test_dataframe_apply_invalid_function(pandarallel_init, exception):\n    def f(_):\n        raise exception\n\n    df = pd.DataFrame(dict(a=[1, 2, 3, 4]))\n\n    with pytest.raises(exception):\n        df.parallel_apply(f)\n\n\ndef test_dataframe_apply_axis_0(pandarallel_init, func_dataframe_apply_axis_0, df_size):\n    df = pd.DataFrame(\n        dict(\n            a=np.random.randint(1, 8, df_size),\n            b=np.random.rand(df_size),\n            c=np.random.randint(1, 8, df_size),\n            d=np.random.rand(df_size),\n            e=np.random.randint(1, 8, df_size),\n            f=np.random.rand(df_size),\n            g=np.random.randint(1, 8, df_size),\n            h=np.random.rand(df_size),\n        )\n    )\n    df.index = [item / 10 for item in df.index]\n\n    res = df.apply(func_dataframe_apply_axis_0)\n    res_parallel = df.parallel_apply(func_dataframe_apply_axis_0)\n    assert res.equals(res_parallel)\n\n\ndef test_dataframe_apply_axis_1(pandarallel_init, func_dataframe_apply_axis_1, df_size):\n    df = pd.DataFrame(\n        dict(a=np.random.randint(1, 8, df_size), b=np.random.rand(df_size))\n    )\n    df.index = [item / 10 for item in df.index]\n\n    res = df.apply(func_dataframe_apply_axis_1, axis=1)\n    res_parallel = df.parallel_apply(func_dataframe_apply_axis_1, axis=1)\n    assert res.equals(res_parallel)\n\n\ndef test_dataframe_apply_invalid_axis(pandarallel_init):\n    df = pd.DataFrame(dict(a=[1, 2, 3, 4]))\n\n    with pytest.raises(ValueError):\n        df.parallel_apply(lambda x: x, axis=\"invalid\")\n    \ndef test_empty_dataframe_apply_axis_0(pandarallel_init, func_dataframe_apply_axis_0):\n    df = pd.DataFrame()\n\n    res = 
df.apply(func_dataframe_apply_axis_0)\n    res_parallel = df.parallel_apply(func_dataframe_apply_axis_0)\n    assert res.equals(res_parallel)\n\ndef test_empty_dataframe_apply_axis_1(pandarallel_init, func_dataframe_apply_axis_1):\n    df = pd.DataFrame()\n\n    res = df.apply(func_dataframe_apply_axis_1)\n    res_parallel = df.parallel_apply(func_dataframe_apply_axis_1)\n    assert res.equals(res_parallel)\n\n\ndef test_dataframe_applymap(pandarallel_init, func_dataframe_applymap, df_size):\n    df = pd.DataFrame(\n        dict(a=np.random.randint(1, 8, df_size), b=np.random.rand(df_size))\n    )\n    df.index = [item / 10 for item in df.index]\n\n    res = df.applymap(func_dataframe_applymap)\n    res_parallel = df.parallel_applymap(func_dataframe_applymap)\n    assert res.equals(res_parallel)\n\n\ndef test_series_map(pandarallel_init, func_series_map, df_size):\n    df = pd.DataFrame(dict(a=np.random.rand(df_size) + 1))\n\n    res = df.a.map(func_series_map)\n    res_parallel = df.a.parallel_map(func_series_map)\n    assert res.equals(res_parallel)\n\n\ndef test_series_apply(pandarallel_init, func_series_apply, df_size):\n    df = pd.DataFrame(dict(a=np.random.rand(df_size) + 1))\n\n    res = df.a.apply(func_series_apply, args=(2,), bias=3)\n    res_parallel = df.a.parallel_apply(func_series_apply, args=(2,), bias=3)\n    assert res.equals(res_parallel)\n\ndef test_empty_series_apply(pandarallel_init, func_series_apply):\n    df = pd.DataFrame(dict(a=[]))\n\n    res = df.a.apply(func_series_apply, args=(2,), bias=3)\n    res_parallel = df.a.parallel_apply(func_series_apply, args=(2,), bias=3)\n    assert res.equals(res_parallel)\n\n\ndef test_series_rolling_apply(pandarallel_init, func_series_rolling_apply, df_size):\n    df = pd.DataFrame(dict(a=np.random.randint(1, 8, df_size), b=list(range(df_size))))\n\n    res = df.b.rolling(4).apply(func_series_rolling_apply, raw=False)\n    res_parallel = df.b.rolling(4).parallel_apply(func_series_rolling_apply, 
raw=False)\n\n    assert res.equals(res_parallel)\n\n\ndef test_dataframe_groupby_apply(\n    pandarallel_init, func_dataframe_groupby_apply, df_size\n):\n    df = pd.DataFrame(\n        dict(\n            a=np.random.randint(1, 8, df_size),\n            b=np.random.rand(df_size),\n            c=np.random.rand(df_size),\n        )\n    )\n\n    res = df.groupby(\"a\").apply(func_dataframe_groupby_apply)\n    res_parallel = df.groupby(\"a\").parallel_apply(func_dataframe_groupby_apply)\n    assert res.equals(res_parallel)\n\n    res = df.groupby([\"a\"]).apply(func_dataframe_groupby_apply)\n    res_parallel = df.groupby([\"a\"]).parallel_apply(func_dataframe_groupby_apply)\n    assert res.equals(res_parallel)\n\n    res = df.groupby([\"a\", \"b\"]).apply(func_dataframe_groupby_apply)\n    res_parallel = df.groupby([\"a\", \"b\"]).parallel_apply(func_dataframe_groupby_apply)\n    assert res.equals(res_parallel)\n\n\ndef test_dataframe_groupby_apply_complex(\n    pandarallel_init, func_dataframe_groupby_apply_complex, df_size\n):\n    df = pd.DataFrame(\n        dict(a=np.random.randint(1, 100, df_size), b=np.random.rand(df_size))\n    )\n\n    res = df.groupby(\"a\").apply(func_dataframe_groupby_apply_complex)\n    res_parallel = df.groupby(\"a\").parallel_apply(func_dataframe_groupby_apply_complex)\n    res.equals(res_parallel)\n\n\ndef test_dataframe_groupby_rolling_apply(\n    pandarallel_init, func_dataframe_groupby_rolling_apply, df_size\n):\n    df = pd.DataFrame(\n        dict(a=np.random.randint(1, 10, df_size), b=np.random.rand(df_size))\n    )\n\n    res = (\n        df.groupby(\"a\")\n        .b.rolling(4)\n        .apply(func_dataframe_groupby_rolling_apply, raw=False)\n    )\n    res_parallel = (\n        df.groupby(\"a\")\n        .b.rolling(4)\n        .parallel_apply(func_dataframe_groupby_rolling_apply, raw=False)\n    )\n    assert res.equals(res_parallel)\n\n\ndef test_dataframe_groupby_expanding_apply(\n    pandarallel_init, 
func_dataframe_groupby_expanding_apply, df_size\n):\n    df = pd.DataFrame(\n        dict(a=np.random.randint(1, 10, df_size), b=np.random.rand(df_size))\n    )\n\n    res = (\n        df.groupby(\"a\")\n        .b.expanding()\n        .apply(func_dataframe_groupby_expanding_apply, raw=False)\n    )\n    res_parallel = (\n        df.groupby(\"a\")\n        .b.expanding()\n        .parallel_apply(func_dataframe_groupby_expanding_apply, raw=False)\n    )\n    res.equals(res_parallel)\n\n\ndef test_dataframe_axis_0_no_reduction(\n    pandarallel_init, func_dataframe_apply_axis_0_no_reduce, df_size\n):\n    df = pd.DataFrame(\n        dict(\n            a=np.random.randint(1, 10, df_size),\n            b=np.random.randint(1, 10, df_size),\n            c=np.random.randint(1, 10, df_size),\n        )\n    )\n    res = df.apply(func_dataframe_apply_axis_0_no_reduce)\n\n    res_parallel = df.parallel_apply(func_dataframe_apply_axis_0_no_reduce)\n\n    assert res.equals(res_parallel)\n\n\ndef test_dataframe_axis_1_no_reduction(\n    pandarallel_init, func_dataframe_apply_axis_1_no_reduce, df_size\n):\n    df = pd.DataFrame(\n        dict(\n            a=np.random.randint(1, 10, df_size),\n            b=np.random.randint(1, 10, df_size),\n            c=np.random.randint(1, 10, df_size),\n        )\n    )\n\n    res = df.apply(func_dataframe_apply_axis_1_no_reduce, axis=1)\n\n    res_parallel = df.parallel_apply(func_dataframe_apply_axis_1_no_reduce, axis=1)\n\n    assert res.equals(res_parallel)\n\ndef test_memory_fs_root_environment_variable(monkeypatch):\n    monkeypatch.setenv(\"MEMORY_FS_ROOT\", \"/test\")\n    from pandarallel import core\n    importlib.reload(core)\n\n    assert core.MEMORY_FS_ROOT == \"/test\"\n"
    },
    {
      "path": "nalepae_pandarallel/pandarallel/utils.py",
      "content": "import itertools\nfrom enum import Enum\nfrom typing import Any, Dict, List, Tuple\n\nimport pandas as pd\nfrom pandas import DataFrame, Index\n\n\ndef chunk(nb_item: int, nb_chunks: int, start_offset=0) -> List[slice]:\n    \"\"\"\n    Return `nb_chunks` slices of approximatively `nb_item / nb_chunks` each.\n\n    Parameters\n    ----------\n    nb_item : int\n        Total number of items\n\n    nb_chunks : int\n        Number of chunks to return\n\n    start_offset : int\n        Shift start of slice by this amount\n\n    Returns\n    -------\n    A list of slices\n\n    Examples\n    --------\n    >>> chunks = chunk(103, 4)\n    >>> chunks\n    [slice(0, 26, None), slice(26, 52, None), slice(52, 78, None), slice(78, 103, None)]\n    \"\"\"\n    if nb_item == 0:\n        return [slice(0)]\n    \n    if nb_item <= nb_chunks:\n        return [slice(max(0, idx - start_offset), idx + 1) for idx in range(nb_item)]\n\n    quotient = nb_item // nb_chunks\n    remainder = nb_item % nb_chunks\n\n    quotients = [quotient] * nb_chunks\n    remainders = [1] * remainder + [0] * (nb_chunks - remainder)\n\n    nb_elems_per_chunk = [\n        quotient + remainder for quotient, remainder in zip(quotients, remainders)\n    ]\n\n    accumulated = list(itertools.accumulate(nb_elems_per_chunk))\n    shifted_accumulated = accumulated.copy()\n    shifted_accumulated.insert(0, 0)\n    shifted_accumulated.pop()\n\n    return [\n        slice(max(0, begin - start_offset), end)\n        for begin, end in zip(shifted_accumulated, accumulated)\n    ]\n\n\ndef df_indexed_like(df: DataFrame, axes: List[Index]) -> bool:\n    \"\"\"\n    Returns whether a data frame is indexed in the way specified by the\n    provided axes.\n\n    Used by DataFrameGroupBy to determine whether a group has been modified.\n\n    Function adapted from pandas.core.groupby.ops._is_indexed_like\n\n    Parameters\n    ----------\n    df : DataFrame\n        The data frame in question\n\n    axes : 
List[Index]\n        The axes to which the data frame is compared\n\n    Returns\n    -------\n    Whether or not the data frame is indexed in the same way as the axes.\n    \"\"\"\n    if isinstance(df, DataFrame):\n        return df.axes[0].equals(axes[0])\n\n    return False\n\n\ndef get_pandas_version() -> Tuple[int, int]:\n    major_str, minor_str, *_ = pd.__version__.split(\".\")\n    return int(major_str), int(minor_str)\n\n\ndef get_axis_int(user_defined_function_kwargs: Dict[str, Any]):\n    axis = user_defined_function_kwargs.get(\"axis\", 0)\n\n    if axis not in {0, 1, \"index\", \"columns\"}:\n        raise ValueError(f\"No axis named {axis} for object type DataFrame\")\n\n    return {0: 0, 1: 1, \"index\": 0, \"columns\": 1}[axis]\n\n\nclass WorkerStatus(int, Enum):\n    Running = 0\n    Success = 1\n    Error = 2\n"
    },
    {
      "path": "nalepae_pandarallel/pandarallel/core.py",
      "content": "import multiprocessing\nimport os\nimport pickle\nfrom itertools import count\nfrom multiprocessing.managers import SyncManager\nfrom pathlib import Path\nfrom tempfile import NamedTemporaryFile\nfrom typing import Any, Callable, Dict, Iterator, Optional, Tuple, Type, cast\n\nimport dill\nimport pandas as pd\nimport psutil\nfrom pandas.core.groupby import DataFrameGroupBy as PandaDataFrameGroupBy\nfrom pandas.core.window.expanding import ExpandingGroupby as PandasExpandingGroupby\nfrom pandas.core.window.rolling import RollingGroupby as PandasRollingGroupby\n\nfrom .data_types import (\n    DataFrame,\n    DataFrameGroupBy,\n    DataType,\n    ExpandingGroupBy,\n    RollingGroupBy,\n    Series,\n    SeriesRolling,\n)\nfrom .progress_bars import ProgressBarsType, get_progress_bars, progress_wrapper\nfrom .utils import WorkerStatus\n\nON_WINDOWS = os.name == \"nt\"\nCONTEXT = multiprocessing.get_context(\"spawn\" if ON_WINDOWS else \"fork\")\n\n# Root of Memory File System\nMEMORY_FS_ROOT = os.environ.get(\"MEMORY_FS_ROOT\", \"/dev/shm\")\n\n# By default, Pandarallel use all available CPUs\nNB_PHYSICAL_CORES = psutil.cpu_count(logical=False)\n\n# Prefix and suffix for files used with Memory File System\nPREFIX = \"pandarallel\"\nPREFIX_INPUT = f\"{PREFIX}_input_\"\nPREFIX_OUTPUT = f\"{PREFIX}_output_\"\nSUFFIX = \".pickle\"\n\n# We use these classes decorators pattern instead of the classic one because of this:\n# https://www.stevenengelhardt.com/2013/01/16/python-multiprocessing-module-and-closures/\n\n\nclass WrapWorkFunctionForFileSystem:\n    def __init__(\n        self,\n        work_function: Callable[\n            [Any, Callable, tuple, Dict[str, Any], Dict[str, Any]], Any\n        ],\n    ) -> None:\n        self.work_function = work_function\n\n    def __call__(\n        self,\n        input_file_path: Path,\n        output_file_path: Path,\n        progress_bars_type: ProgressBarsType,\n        worker_index: int,\n        
master_workers_queue: multiprocessing.Queue,\n        dilled_user_defined_function: bytes,\n        user_defined_function_args: tuple,\n        user_defined_function_kwargs: Dict[str, Any],\n        extra: Dict[str, Any],\n    ) -> None:\n        try:\n            # Load dataframe from input file\n            with input_file_path.open(\"rb\") as file_descriptor:\n                data = pickle.load(file_descriptor)\n\n            # Delete input file since we don't need it any more. It will free some RAM\n            # since the input file is stored into Shared Memory.\n            input_file_path.unlink()\n\n            data_size = len(data)\n            user_defined_function: Callable = dill.loads(dilled_user_defined_function)\n\n            progress_wrapped_user_defined_function = progress_wrapper(\n                user_defined_function, master_workers_queue, worker_index, data_size\n            )\n\n            used_user_defined_function = (\n                progress_wrapped_user_defined_function\n                if progress_bars_type\n                in (\n                    ProgressBarsType.InUserDefinedFunction,\n                    ProgressBarsType.InUserDefinedFunctionMultiplyByNumberOfColumns,\n                )\n                else user_defined_function\n            )\n\n            result = self.work_function(\n                data,\n                used_user_defined_function,\n                user_defined_function_args,\n                user_defined_function_kwargs,\n                extra,\n            )\n\n            with output_file_path.open(\"wb\") as file_descriptor:\n                pickle.dump(result, file_descriptor)\n\n            master_workers_queue.put((worker_index, WorkerStatus.Success, None))\n\n        except:\n            master_workers_queue.put((worker_index, WorkerStatus.Error, None))\n            raise\n\n\nclass WrapWorkFunctionForPipe:\n    def __init__(\n        self,\n        work_function: Callable[\n            [\n           
     Any,\n                Callable,\n                tuple,\n                Dict[str, Any],\n                Dict[str, Any],\n            ],\n            Any,\n        ],\n    ) -> None:\n        self.work_function = work_function\n\n    def __call__(\n        self,\n        data: Any,\n        progress_bars_type: ProgressBarsType,\n        worker_index: int,\n        master_workers_queue: multiprocessing.Queue,\n        dilled_user_defined_function: bytes,\n        user_defined_function_args: tuple,\n        user_defined_function_kwargs: Dict[str, Any],\n        extra: Dict[str, Any],\n    ) -> Any:\n        try:\n            data_size = len(data)\n            user_defined_function: Callable = dill.loads(dilled_user_defined_function)\n\n            progress_wrapped_user_defined_function = progress_wrapper(\n                user_defined_function, master_workers_queue, worker_index, data_size\n            )\n\n            used_user_defined_function = (\n                progress_wrapped_user_defined_function\n                if progress_bars_type\n                in (\n                    ProgressBarsType.InUserDefinedFunction,\n                    ProgressBarsType.InUserDefinedFunctionMultiplyByNumberOfColumns,\n                )\n                else user_defined_function\n            )\n\n            results = self.work_function(\n                data,\n                used_user_defined_function,\n                user_defined_function_args,\n                user_defined_function_kwargs,\n                extra,\n            )\n\n            master_workers_queue.put((worker_index, WorkerStatus.Success, None))\n\n            return results\n\n        except:\n            master_workers_queue.put((worker_index, WorkerStatus.Error, None))\n            raise\n\n\ndef wrap_reduce_function_for_file_system(\n    reduce_function: Callable[[Iterator, Dict[str, Any]], Any]\n) -> Callable[[Iterator[Path], Dict[str, Any]], Any]:\n    \"\"\"This wrapper transforms a `reduce` 
function which takes as input:\n    - A list of pandas Dataframe\n    - An user defined function\n    and which returns a pandas Dataframe, into a `reduct` function which takes as input:\n    - A list of paths where  pandas Dataframe are pickled\n    which returns a pandas Dataframe.\n    \"\"\"\n\n    def closure(output_file_paths: Iterator[Path], extra: Dict[str, Any]) -> Any:\n        def get_dataframe_and_delete_file(file_path: Path) -> Any:\n            with file_path.open(\"rb\") as file_descriptor:\n                data = pickle.load(file_descriptor)\n\n            file_path.unlink()\n            return data\n\n        dfs = (\n            get_dataframe_and_delete_file(output_file_path)\n            for output_file_path in output_file_paths\n        )\n\n        return reduce_function(dfs, extra)\n\n    return closure\n\n\ndef parallelize_with_memory_file_system(\n    nb_requested_workers: int,\n    data_type: Type[DataType],\n    progress_bars_type: ProgressBarsType,\n):\n    def closure(\n        data: Any,\n        user_defined_function: Callable,\n        *user_defined_function_args: tuple,\n        **user_defined_function_kwargs: Dict[str, Any],\n    ):\n        wrapped_work_function = WrapWorkFunctionForFileSystem(data_type.work)\n        wrapped_reduce_function = wrap_reduce_function_for_file_system(data_type.reduce)\n\n        chunks = list(\n            data_type.get_chunks(\n                nb_requested_workers,\n                data,\n                user_defined_function_kwargs=user_defined_function_kwargs,\n            )\n        )\n\n        nb_workers = len(chunks)\n\n        multiplicator_factor = (\n            len(cast(pd.DataFrame, data).columns)\n            if progress_bars_type\n            == ProgressBarsType.InUserDefinedFunctionMultiplyByNumberOfColumns\n            else 1\n        )\n\n        progresses_length = [len(chunk_) * multiplicator_factor for chunk_ in chunks]\n\n        work_extra = data_type.get_work_extra(data)\n        
reduce_extra = data_type.get_reduce_extra(data, user_defined_function_kwargs)\n\n        show_progress_bars = progress_bars_type != ProgressBarsType.No\n\n        progress_bars = get_progress_bars(progresses_length, show_progress_bars)\n        progresses = [0] * nb_workers\n        workers_status = [WorkerStatus.Running] * nb_workers\n\n        input_files = [\n            NamedTemporaryFile(\n                prefix=PREFIX_INPUT, suffix=SUFFIX, dir=MEMORY_FS_ROOT, delete=False\n            )\n            for _ in range(nb_workers)\n        ]\n\n        output_files = [\n            NamedTemporaryFile(\n                prefix=PREFIX_OUTPUT, suffix=SUFFIX, dir=MEMORY_FS_ROOT, delete=False\n            )\n            for _ in range(nb_workers)\n        ]\n\n        try:\n            for chunk, input_file in zip(chunks, input_files):\n                with Path(input_file.name).open(\"wb\") as file_descriptor:\n                    pickle.dump(chunk, file_descriptor)\n\n            dilled_user_defined_function = dill.dumps(user_defined_function)\n            manager: SyncManager = CONTEXT.Manager()\n            master_workers_queue = manager.Queue()\n\n            work_args_list = [\n                (\n                    Path(input_file.name),\n                    Path(output_file.name),\n                    progress_bars_type,\n                    worker_index,\n                    master_workers_queue,\n                    dilled_user_defined_function,\n                    user_defined_function_args,\n                    user_defined_function_kwargs,\n                    {\n                        **work_extra,\n                        **{\n                            \"master_workers_queue\": master_workers_queue,\n                            \"show_progress_bars\": show_progress_bars,\n                            \"worker_index\": worker_index,\n                        },\n                    },\n                )\n                for worker_index, (\n              
      input_file,\n                    output_file,\n                ) in enumerate(zip(input_files, output_files))\n            ]\n\n            pool = CONTEXT.Pool(nb_workers)\n            results_promise = pool.starmap_async(wrapped_work_function, work_args_list)\n\n            pool.close()\n\n            generation = count()\n\n            while any(\n                (\n                    worker_status == WorkerStatus.Running\n                    for worker_status in workers_status\n                )\n            ):\n                message: Tuple[int, WorkerStatus, Any] = master_workers_queue.get()\n                worker_index, worker_status, payload = message\n                workers_status[worker_index] = worker_status\n\n                if worker_status == WorkerStatus.Success:\n                    progresses[worker_index] = progresses_length[worker_index]\n                    progress_bars.update(progresses)\n                elif worker_status == WorkerStatus.Running:\n                    progress = cast(int, payload)\n                    progresses[worker_index] = progress\n\n                    if next(generation) % nb_workers == 0:\n                        progress_bars.update(progresses)\n                elif worker_status == WorkerStatus.Error:\n                    progress_bars.set_error(worker_index)\n                    progress_bars.update(progresses)\n\n            try:\n                return wrapped_reduce_function(\n                    (Path(output_file.name) for output_file in output_files),\n                    reduce_extra,\n                )\n            except EOFError:\n                # Loading the files failed, this most likely means that there\n                # was some error during processing and the files were never\n                # saved at all.\n                results_promise.get()\n\n                # If the above statement does not raise an exception, that\n                # means the multiprocessing went well and we want 
to re-raise\n                # the original EOFError.\n                raise\n\n        finally:\n            for output_file in output_files:\n                # When pandarallel stop supporting Python 3.7 and older, replace this\n                # try/except clause by:\n                # Path(output_file.name).unlink(missing_ok=True)\n                try:\n                    Path(output_file.name).unlink()\n                except FileNotFoundError:\n                    # Do nothing, this is the nominal case.\n                    pass\n\n    return closure\n\n\ndef parallelize_with_pipe(\n    nb_requested_workers: int,\n    data_type: Type[DataType],\n    progress_bars_type: ProgressBarsType,\n):\n    def closure(\n        data: Any,\n        user_defined_function: Callable,\n        *user_defined_function_args: tuple,\n        **user_defined_function_kwargs: Dict[str, Any],\n    ):\n        wrapped_work_function = WrapWorkFunctionForPipe(data_type.work)\n        dilled_user_defined_function = dill.dumps(user_defined_function)\n        manager: SyncManager = CONTEXT.Manager()\n        master_workers_queue = manager.Queue()\n\n        chunks = list(\n            data_type.get_chunks(\n                nb_requested_workers,\n                data,\n                user_defined_function_kwargs=user_defined_function_kwargs,\n            )\n        )\n\n        nb_workers = len(chunks)\n\n        multiplicator_factor = (\n            len(cast(pd.DataFrame, data).columns)\n            if progress_bars_type\n            == ProgressBarsType.InUserDefinedFunctionMultiplyByNumberOfColumns\n            else 1\n        )\n\n        progresses_length = [len(chunk_) * multiplicator_factor for chunk_ in chunks]\n\n        work_extra = data_type.get_work_extra(data)\n        reduce_extra = data_type.get_reduce_extra(data, user_defined_function_kwargs)\n\n        show_progress_bars = progress_bars_type != ProgressBarsType.No\n\n        progress_bars = 
get_progress_bars(progresses_length, show_progress_bars)\n        progresses = [0] * nb_workers\n        workers_status = [WorkerStatus.Running] * nb_workers\n\n        work_args_list = [\n            (\n                chunk,\n                progress_bars_type,\n                worker_index,\n                master_workers_queue,\n                dilled_user_defined_function,\n                user_defined_function_args,\n                user_defined_function_kwargs,\n                {\n                    **work_extra,\n                    **{\n                        \"master_workers_queue\": master_workers_queue,\n                        \"show_progress_bars\": show_progress_bars,\n                        \"worker_index\": worker_index,\n                    },\n                },\n            )\n            for worker_index, chunk in enumerate(chunks)\n        ]\n\n        pool = CONTEXT.Pool(nb_workers)\n        results_promise = pool.starmap_async(wrapped_work_function, work_args_list)\n        pool.close()\n\n        generation = count()\n\n        while any(\n            (worker_status == WorkerStatus.Running for worker_status in workers_status)\n        ):\n            message: Tuple[int, WorkerStatus, Any] = master_workers_queue.get()\n            worker_index, worker_status, payload = message\n            workers_status[worker_index] = worker_status\n\n            if worker_status == WorkerStatus.Success:\n                progresses[worker_index] = progresses_length[worker_index]\n                progress_bars.update(progresses)\n            elif worker_status == WorkerStatus.Running:\n                progress = cast(int, payload)\n                progresses[worker_index] = progress\n\n                if next(generation) % nb_workers == 0:\n                    progress_bars.update(progresses)\n            elif worker_status == WorkerStatus.Error:\n                progress_bars.set_error(worker_index)\n\n        results = results_promise.get()\n\n        
return data_type.reduce(results, reduce_extra)\n\n    return closure\n\n\nclass pandarallel:\n    @classmethod\n    def initialize(\n        cls,\n        shm_size_mb=None,\n        nb_workers=NB_PHYSICAL_CORES,\n        progress_bar=False,\n        verbose=2,\n        use_memory_fs: Optional[bool] = None,\n    ) -> None:\n        show_progress_bars = progress_bar\n        is_memory_fs_available = Path(MEMORY_FS_ROOT).exists()\n\n        use_memory_fs = (\n            use_memory_fs if use_memory_fs is not None else is_memory_fs_available\n        )\n\n        parallelize = (\n            parallelize_with_memory_file_system\n            if use_memory_fs\n            else parallelize_with_pipe\n        )\n\n        if use_memory_fs and not is_memory_fs_available:\n            raise SystemError(\"Memory file system is not available\")\n\n        if verbose >= 2:\n            print(f\"INFO: Pandarallel will run on {nb_workers} workers.\")\n\n            message = (\n                (\n                    \"INFO: Pandarallel will use Memory file system to transfer data \"\n                    \"between the main process and workers.\"\n                )\n                if use_memory_fs\n                else (\n                    \"INFO: Pandarallel will use standard multiprocessing data transfer \"\n                    \"(pipe) to transfer data between the main process and workers.\"\n                )\n            )\n\n            print(message)\n\n            if ON_WINDOWS and verbose >= 2:\n                print()\n                print(\n                    (\n                        \"WARNING: You are on Windows. 
If you detect any issue with \"\n                        \"pandarallel, be sure you checked out the Troubleshooting page:\"\n                    )\n                )\n                print(\"https://nalepae.github.io/pandarallel/troubleshooting/\")\n\n        progress_bars_in_user_defined_function = (\n            ProgressBarsType.InUserDefinedFunction\n            if show_progress_bars\n            else ProgressBarsType.No\n        )\n\n        progress_bars_in_user_defined_function_multiply_by_number_of_columns = (\n            ProgressBarsType.InUserDefinedFunctionMultiplyByNumberOfColumns\n            if show_progress_bars\n            else ProgressBarsType.No\n        )\n\n        progress_bars_in_work_function = (\n            ProgressBarsType.InWorkFunction\n            if show_progress_bars\n            else ProgressBarsType.No\n        )\n\n        # DataFrame\n        pd.DataFrame.parallel_apply = parallelize(\n            nb_workers, DataFrame.Apply, progress_bars_in_user_defined_function\n        )\n        pd.DataFrame.parallel_applymap = parallelize(\n            nb_workers,\n            DataFrame.ApplyMap,\n            progress_bars_in_user_defined_function_multiply_by_number_of_columns,\n        )\n\n        # DataFrame GroupBy\n        PandaDataFrameGroupBy.parallel_apply = parallelize(\n            nb_workers, DataFrameGroupBy.Apply, progress_bars_in_user_defined_function\n        )\n\n        # Expanding GroupBy\n        PandasExpandingGroupby.parallel_apply = parallelize(\n            nb_workers, ExpandingGroupBy.Apply, progress_bars_in_work_function\n        )\n\n        # Rolling GroupBy\n        PandasRollingGroupby.parallel_apply = parallelize(\n            nb_workers, RollingGroupBy.Apply, progress_bars_in_work_function\n        )\n\n        # Series\n        pd.Series.parallel_apply = parallelize(\n            nb_workers, Series.Apply, progress_bars_in_user_defined_function\n        )\n        pd.Series.parallel_map = 
parallelize(nb_workers, Series.Map, show_progress_bars)\n\n        # Series Rolling\n        pd.core.window.Rolling.parallel_apply = parallelize(\n            nb_workers, SeriesRolling.Apply, progress_bars_in_user_defined_function\n        )\n"
    },
    {
      "path": "nalepae_pandarallel/pandarallel/progress_bars.py",
      "content": "import multiprocessing\nimport os\nimport shutil\nimport sys\nfrom abc import ABC, abstractmethod\nfrom enum import Enum\nfrom itertools import count\nfrom time import time_ns\nfrom typing import Callable, List, Union\n\nfrom .utils import WorkerStatus\n\nINTERVAL_NS = 250_000_000  # 0.25 sec\nMINIMUM_TERMINAL_WIDTH = 72\n\n\nclass ProgressBarsType(int, Enum):\n    No = 0\n    InUserDefinedFunction = 1\n    InUserDefinedFunctionMultiplyByNumberOfColumns = 2\n    InWorkFunction = 3\n\n\nclass ProgressBars(ABC):\n    @abstractmethod\n    def __init__(self, maxs: List[int], show: bool) -> None:\n        ...\n\n    @abstractmethod\n    def update(self, values: List[int]) -> None:\n        ...\n\n    def set_error(self, index: int) -> None:\n        pass\n\n\nclass ProgressState:\n    def __init__(self, chunk_size: int) -> None:\n        self.last_put_iteration = 0\n        self.next_put_iteration = max(chunk_size // 100, 1)\n        self.last_put_time = time_ns()\n\n\ndef is_notebook_lab() -> bool:\n    try:\n        shell: str = get_ipython().__class__.__name__  # type: ignore\n\n        # Shell: Google Colab\n        # TerminalInteractiveShell: Terminal running IPython\n        # ZMQInteractiveShell: Jupyter notebook/lab or qtconsole\n        return shell in {\"Shell\", \"ZMQInteractiveShell\"}\n    except NameError:\n        # Probably standard Python interpreter\n        return False\n\n\nclass ProgressBarsConsole(ProgressBars):\n    def __init__(self, maxs: List[int], show: bool) -> None:\n        self.__show = show\n        self.__bars = [[0, max] for max in maxs]\n        self.__width = self.__get_width()\n\n        self.__lines = self.__update_lines()\n\n        if show:\n            sys.stdout.write(\"\\n\".join(self.__lines))\n            sys.stdout.flush()\n\n    def __get_width(self) -> int:\n        try:\n            columns = shutil.get_terminal_size().columns\n            return max(MINIMUM_TERMINAL_WIDTH, columns - 1)\n        except 
AttributeError:\n            # Python 2\n            pass\n\n        try:\n            columns = int(os.popen(\"stty size\", \"r\").read().split()[1])\n            return max(MINIMUM_TERMINAL_WIDTH, columns - 1)\n        except:\n            return MINIMUM_TERMINAL_WIDTH\n\n    def __remove_displayed_lines(self) -> None:\n        if len(self.__bars) >= 1:\n            sys.stdout.write(\"\\b\" * len(self.__lines[-1]))\n\n        if len(self.__bars) >= 2:\n            sys.stdout.write(\"\\033M\" * (len(self.__lines) - 1))\n\n        self.__lines = []\n\n    def __update_line(self, done: int, total: int) -> str:\n        if total == 0:\n            percent = 0\n        else:\n            percent = done / total\n        bar = (\":\" * int(percent * 40)).ljust(40, \" \")\n        percent = round(percent * 100, 2)\n        format = \" {percent:6.2f}% {bar:s} | {done:8d} / {total:8d} |\"\n        ret = format.format(percent=percent, bar=bar, done=done, total=total)\n        return ret[: self.__width].ljust(self.__width, \" \")\n\n    def __update_lines(self) -> List[str]:\n        return [self.__update_line(value, max) for value, max in self.__bars]\n\n    def update(self, values: List[int]) -> None:\n        \"\"\"Update a bar value.\n        Positional arguments:\n        values - The new values of each bar\n        \"\"\"\n        if not self.__show:\n            return\n\n        for index, value in enumerate(values):\n            self.__bars[index][0] = value\n\n        self.__remove_displayed_lines()\n        self.__lines = self.__update_lines()\n\n        sys.stdout.write(\"\\n\".join(self.__lines))\n        sys.stdout.flush()\n\n\nclass ProgressBarsNotebookLab(ProgressBars):\n    def __init__(self, maxs: List[int], show: bool) -> None:\n        \"\"\"Initialization.\n        Positional argument:\n        maxs - List containing the max value of each progress bar\n        \"\"\"\n        self.__show = show\n\n        if not show:\n            return\n\n        from 
IPython.display import display\n        from ipywidgets import HBox, IntProgress, Label, VBox\n\n        self.__bars = [\n            HBox(\n                [\n                    IntProgress(0, 0, max, description=\"{:.2f}%\".format(0)),\n                    Label(\"{} / {}\".format(0, max)),\n                ]\n            )\n            for max in maxs\n        ]\n\n        display(VBox(self.__bars))\n\n    def update(self, values: List[int]) -> None:\n        \"\"\"Update a bar value.\n        Positional arguments:\n        values - The new values of each bar\n        \"\"\"\n        if not self.__show:\n            return\n\n        for index, value in enumerate(values):\n            bar, label = self.__bars[index].children\n\n            label.value = \"{} / {}\".format(value, bar.max)\n            \n            bar.value = value\n\n            if value >= bar.max:\n                bar.bar_style = \"success\"\n\n            if bar.max != 0:\n                bar.description = \"{:.2f}%\".format(bar.value / bar.max * 100)\n\n    def set_error(self, index: int) -> None:\n        \"\"\"Set a bar on error\"\"\"\n        if not self.__show:\n            return\n\n        bar, _ = self.__bars[index].children\n        bar.bar_style = \"danger\"\n\n\ndef get_progress_bars(\n    maxs: List[int], show\n) -> Union[ProgressBarsNotebookLab, ProgressBarsConsole]:\n    return (\n        ProgressBarsNotebookLab(maxs, show)\n        if is_notebook_lab()\n        else ProgressBarsConsole(maxs, show)\n    )\n\n\ndef progress_wrapper(\n    user_defined_function: Callable,\n    master_workers_queue: multiprocessing.Queue,\n    index: int,\n    chunk_size: int,\n) -> Callable:\n    \"\"\"Wrap the function to apply in a function which monitor the part of work already\n    done.\n    \"\"\"\n    counter = count()\n    state = ProgressState(chunk_size)\n\n    def closure(*user_defined_function_args, **user_defined_functions_kwargs):\n        iteration = next(counter)\n\n        if 
iteration == state.next_put_iteration:\n            time_now = time_ns()\n            master_workers_queue.put_nowait((index, WorkerStatus.Running, iteration))\n\n            delta_t = time_now - state.last_put_time\n            delta_i = iteration - state.last_put_iteration\n\n            state.next_put_iteration += (\n                max(int((delta_i / delta_t) * INTERVAL_NS), 1) if delta_t != 0 else 1\n            )\n\n            state.last_put_iteration = iteration\n            state.last_put_time = time_now\n\n        return user_defined_function(\n            *user_defined_function_args, **user_defined_functions_kwargs\n        )\n\n    return closure\n"
    },
    {
      "path": "nalepae_pandarallel/pandarallel/__init__.py",
      "content": "from .core import pandarallel\n\n__version__ = \"1.6.5\"\n"
    },
    {
      "path": "nalepae_pandarallel/pandarallel/data_types/expanding_groupby.py",
      "content": "import multiprocessing\nfrom typing import Any, Callable, Dict, Iterable, Iterator, List, Tuple\n\nimport pandas as pd\nfrom pandas.core.window.expanding import ExpandingGroupby as PandasExpandingGroupby\n\nfrom ..utils import WorkerStatus, chunk, get_pandas_version\nfrom .generic import DataType\n\n\nclass ExpandingGroupBy:\n    class Apply(DataType):\n        @staticmethod\n        def get_chunks(\n            nb_workers: int, data: PandasExpandingGroupby, *args, **kwargs\n        ) -> Iterator[List[Tuple[int, pd.DataFrame]]]:\n            pandas_version = get_pandas_version()\n\n            nb_items = (\n                len(data._groupby) if pandas_version < (1, 3) else data._grouper.ngroups\n            )\n\n            chunks = chunk(nb_items, nb_workers)\n\n            iterator = (\n                iter(data._groupby)\n                if pandas_version < (1, 3)\n                else data._grouper.get_iterator(data.obj)\n            )\n\n            for chunk_ in chunks:\n                yield [next(iterator) for _ in range(chunk_.stop - chunk_.start)]\n\n        @staticmethod\n        def get_work_extra(data: PandasExpandingGroupby):\n            attributes = {\n                attribute: getattr(data, attribute) for attribute in data._attributes\n            }\n\n            return {\"attributes\": attributes}\n\n        @staticmethod\n        def work(\n            data: List[Tuple[int, pd.DataFrame]],\n            user_defined_function: Callable,\n            user_defined_function_args: tuple,\n            user_defined_function_kwargs: Dict[str, Any],\n            extra: Dict[str, Any],\n        ) -> List[pd.DataFrame]:\n            show_progress_bars: bool = extra[\"show_progress_bars\"]\n            master_workers_queue: multiprocessing.Queue = extra[\"master_workers_queue\"]\n            worker_index: int = extra[\"worker_index\"]\n\n            def compute_result(\n                iteration: int,\n                attributes: Dict[str, 
Any],\n                index: int,\n                df: pd.DataFrame,\n                user_defined_function: Callable,\n                user_defined_function_args: tuple,\n                user_defined_function_kwargs: Dict[str, Any],\n            ) -> pd.DataFrame:\n                item = df.expanding(**attributes).apply(\n                    user_defined_function,\n                    *user_defined_function_args,\n                    **user_defined_function_kwargs\n                )\n\n                item.index = pd.MultiIndex.from_product([[index], item.index])\n\n                if show_progress_bars:\n                    master_workers_queue.put_nowait(\n                        (worker_index, WorkerStatus.Running, iteration)\n                    )\n\n                return item\n\n            attributes = extra[\"attributes\"]\n            attributes.pop(\"_grouper\", None)\n\n            dfs = (\n                compute_result(\n                    iteration,\n                    attributes,\n                    index,\n                    df,\n                    user_defined_function,\n                    user_defined_function_args,\n                    user_defined_function_kwargs,\n                )\n                for iteration, (index, df) in enumerate(data)\n            )\n\n            return pd.concat(dfs)\n\n        @staticmethod\n        def reduce(datas: Iterable[pd.DataFrame], extra: Dict[str, Any]) -> pd.Series:\n            return pd.concat(datas, copy=False)\n"
    },
    {
      "path": "nalepae_pandarallel/pandarallel/data_types/series.py",
      "content": "from typing import Any, Callable, Dict, Iterable, Iterator\n\nimport pandas as pd\n\nfrom ..utils import chunk\nfrom .generic import DataType\n\n\nclass Series:\n    class Apply(DataType):\n        @staticmethod\n        def get_chunks(\n            nb_workers: int, data: pd.Series, **kwargs\n        ) -> Iterator[pd.Series]:\n            for chunk_ in chunk(data.size, nb_workers):\n                yield data.iloc[chunk_]\n\n        @staticmethod\n        def work(\n            data: pd.Series,\n            user_defined_function: Callable,\n            user_defined_function_args: tuple,\n            user_defined_function_kwargs: Dict[str, Any],\n            extra: Dict[str, Any],\n        ) -> pd.Series:\n            return data.apply(\n                user_defined_function,\n                *user_defined_function_args,\n                **user_defined_function_kwargs\n            )\n\n        @staticmethod\n        def reduce(datas: Iterable[pd.Series], extra: Dict[str, Any]) -> pd.Series:\n            return pd.concat(datas, copy=False)\n\n    class Map(DataType):\n        @staticmethod\n        def get_chunks(\n            nb_workers: int, data: pd.Series, **kwargs\n        ) -> Iterator[pd.Series]:\n            for chunk_ in chunk(data.size, nb_workers):\n                yield data.iloc[chunk_]\n\n        @staticmethod\n        def work(\n            data: pd.Series,\n            user_defined_function: Callable,\n            user_defined_function_args: tuple,\n            user_defined_function_kwargs: Dict[str, Any],\n            extra: Dict[str, Any],\n        ) -> pd.Series:\n            return data.map(\n                user_defined_function,\n                *user_defined_function_args,\n                **user_defined_function_kwargs\n            )\n\n        @staticmethod\n        def reduce(datas: Iterable[pd.Series], extra: Dict[str, Any]) -> pd.Series:\n            return pd.concat(datas, copy=False)\n"
    },
    {
      "path": "nalepae_pandarallel/pandarallel/data_types/series_rolling.py",
      "content": "from typing import Any, Callable, Dict, Iterable, Iterator\n\nimport pandas as pd\nfrom pandas.core.window.rolling import Rolling\n\nfrom ..utils import chunk\nfrom .generic import DataType\n\n\nclass SeriesRolling:\n    class Apply(DataType):\n        @staticmethod\n        def get_chunks(\n            nb_workers: int, rolling: Rolling, **kwargs\n        ) -> Iterator[pd.Series]:\n            chunks = chunk(rolling.obj.size, nb_workers, rolling.window)\n\n            for chunk_ in chunks:\n                yield rolling.obj[chunk_]\n\n        @staticmethod\n        def get_work_extra(data: Rolling) -> Dict[str, Any]:\n            return {\n                \"attributes\": {\n                    attribute: getattr(data, attribute)\n                    for attribute in data._attributes\n                }\n            }\n\n        @staticmethod\n        def work(\n            data: pd.Series,\n            user_defined_function: Callable,\n            user_defined_function_args: tuple,\n            user_defined_function_kwargs: Dict[str, Any],\n            extra: Dict[str, Any],\n        ) -> pd.Series:\n            attributes: Dict[str, Any] = extra[\"attributes\"]\n            worker_index: int = extra[\"worker_index\"]\n\n            result = data.rolling(**attributes).apply(\n                user_defined_function,\n                *user_defined_function_args,\n                **user_defined_function_kwargs\n            )\n\n            return result if worker_index == 0 else result[attributes[\"window\"] :]\n\n        @staticmethod\n        def reduce(datas: Iterable[pd.Series], extra: Dict[str, Any]) -> pd.Series:\n            return pd.concat(datas, copy=False)\n"
    },
    {
      "path": "nalepae_pandarallel/pandarallel/data_types/__init__.py",
      "content": "from .dataframe import DataFrame\nfrom .dataframe_groupby import DataFrameGroupBy\nfrom .expanding_groupby import ExpandingGroupBy\nfrom .rolling_groupby import RollingGroupBy\nfrom .generic import DataType\nfrom .series import Series\nfrom .series_rolling import SeriesRolling\n"
    },
    {
      "path": "nalepae_pandarallel/pandarallel/data_types/dataframe.py",
      "content": "from typing import Any, Callable, Dict, Iterable, Iterator\nfrom types import GeneratorType\n\nimport pandas as pd\n\nfrom ..utils import chunk, get_axis_int\nfrom .generic import DataType\n\n\nclass DataFrame:\n    class Apply(DataType):\n        @staticmethod\n        def get_chunks(\n            nb_workers: int, data: pd.DataFrame, **kwargs\n        ) -> Iterator[pd.DataFrame]:\n            user_defined_function_kwargs = kwargs[\"user_defined_function_kwargs\"]\n\n            axis_int = get_axis_int(user_defined_function_kwargs)\n            opposite_axis_int = 1 - axis_int\n\n            for chunk_ in chunk(data.shape[opposite_axis_int], nb_workers):\n                yield data.iloc[chunk_] if axis_int == 1 else data.iloc[:, chunk_]\n\n        @staticmethod\n        def work(\n            data: pd.DataFrame,\n            user_defined_function: Callable,\n            user_defined_function_args: tuple,\n            user_defined_function_kwargs: Dict[str, Any],\n            extra: Dict[str, Any],\n        ) -> pd.DataFrame:\n            return data.apply(\n                user_defined_function,\n                *user_defined_function_args,\n                **user_defined_function_kwargs,\n            )\n\n        @staticmethod\n        def get_reduce_extra(\n            data: Any, user_defined_function_kwargs: Dict[str, Any]\n        ) -> Dict[str, Any]:\n            return {\"axis\": get_axis_int(user_defined_function_kwargs)}\n\n        @staticmethod\n        def reduce(\n            datas: Iterable[pd.DataFrame], extra: Dict[str, Any]\n        ) -> pd.DataFrame:\n            if isinstance(datas, GeneratorType):\n                datas = list(datas)\n            axis = 0 if isinstance(datas[0], pd.Series) else 1 - extra[\"axis\"]\n            return pd.concat(datas, copy=False, axis=axis)\n\n    class ApplyMap(DataType):\n        @staticmethod\n        def get_chunks(\n            nb_workers: int, data: pd.DataFrame, **kwargs\n        ) -> 
Iterator[pd.DataFrame]:\n            for chunk_ in chunk(data.shape[0], nb_workers):\n                yield data.iloc[chunk_]\n\n        @staticmethod\n        def work(\n            data: pd.DataFrame,\n            user_defined_function: Callable,\n            user_defined_function_args: tuple,\n            user_defined_function_kwargs: Dict[str, Any],\n            extra: Dict[str, Any],\n        ) -> pd.DataFrame:\n            return data.applymap(user_defined_function)\n\n        @staticmethod\n        def reduce(\n            datas: Iterable[pd.DataFrame], extra: Dict[str, Any]\n        ) -> pd.DataFrame:\n            return pd.concat(datas, copy=False)\n"
    },
    {
      "path": "nalepae_pandarallel/pandarallel/data_types/generic.py",
      "content": "from abc import ABC, abstractmethod\nfrom typing import Any, Callable, Dict, Iterable, Iterator\n\n\nclass DataType(ABC):\n    @staticmethod\n    @abstractmethod\n    def get_chunks(nb_workers: int, data: Any, **kwargs) -> Iterator[Any]:\n        ...\n\n    @staticmethod\n    def get_work_extra(data: Any) -> Dict[str, Any]:\n        return dict()\n\n    @staticmethod\n    @abstractmethod\n    def work(\n        data: Any,\n        user_defined_function: Callable,\n        user_defined_function_args: tuple,\n        user_defined_function_kwargs: Dict[str, Any],\n        extra: Dict[str, Any],\n    ) -> Any:\n        ...\n\n    @staticmethod\n    def get_reduce_extra(\n        data: Any, user_defined_function_kwargs: Dict[str, Any]\n    ) -> Dict[str, Any]:\n        return dict()\n\n    @staticmethod\n    @abstractmethod\n    def reduce(datas: Iterable[Any], extra: Dict[str, Any]) -> Any:\n        ...\n"
    },
    {
      "path": "nalepae_pandarallel/pandarallel/data_types/rolling_groupby.py",
      "content": "import multiprocessing\nfrom typing import Any, Callable, Dict, Iterable, Iterator, List, Tuple\n\nimport pandas as pd\nfrom pandas.core.window.rolling import RollingGroupby as PandasRollingGroupby\n\nfrom ..utils import WorkerStatus, chunk, get_pandas_version\nfrom .generic import DataType\n\n\nclass RollingGroupBy:\n    class Apply(DataType):\n        @staticmethod\n        def get_chunks(\n            nb_workers: int, data: PandasRollingGroupby, *args, **kwargs\n        ) -> Iterator[List[Tuple[int, pd.DataFrame]]]:\n            pandas_version = get_pandas_version()\n\n            nb_items = (\n                len(data._groupby) if pandas_version < (1, 3) else data._grouper.ngroups\n            )\n\n            chunks = chunk(nb_items, nb_workers)\n\n            iterator = (\n                iter(data._groupby)\n                if pandas_version < (1, 3)\n                else data._grouper.get_iterator(data.obj)\n            )\n\n            for chunk_ in chunks:\n                yield [next(iterator) for _ in range(chunk_.stop)]\n\n        @staticmethod\n        def get_work_extra(data: PandasRollingGroupby):\n            attributes = {\n                attribute: getattr(data, attribute) for attribute in data._attributes\n            }\n\n            return {\"attributes\": attributes}\n\n        @staticmethod\n        def work(\n            data: List[Tuple[int, pd.DataFrame]],\n            user_defined_function: Callable,\n            user_defined_function_args: tuple,\n            user_defined_function_kwargs: Dict[str, Any],\n            extra: Dict[str, Any],\n        ) -> List[pd.DataFrame]:\n            show_progress_bars: bool = extra[\"show_progress_bars\"]\n            master_workers_queue: multiprocessing.Queue = extra[\"master_workers_queue\"]\n            worker_index: int = extra[\"worker_index\"]\n\n            def compute_result(\n                iteration: int,\n                attributes: Dict[str, Any],\n                
index: int,\n                df: pd.DataFrame,\n                user_defined_function: Callable,\n                user_defined_function_args: tuple,\n                user_defined_function_kwargs: Dict[str, Any],\n            ) -> pd.DataFrame:\n                item = df.rolling(**attributes).apply(\n                    user_defined_function,\n                    *user_defined_function_args,\n                    **user_defined_function_kwargs\n                )\n\n                item.index = pd.MultiIndex.from_product([[index], item.index])\n\n                if show_progress_bars:\n                    master_workers_queue.put_nowait(\n                        (worker_index, WorkerStatus.Running, iteration)\n                    )\n\n                return item\n\n            attributes = extra[\"attributes\"]\n            attributes.pop(\"_grouper\", None)\n\n            dfs = (\n                compute_result(\n                    iteration,\n                    attributes,\n                    index,\n                    df,\n                    user_defined_function,\n                    user_defined_function_args,\n                    user_defined_function_kwargs,\n                )\n                for iteration, (index, df) in enumerate(data)\n            )\n\n            return pd.concat(dfs)\n\n        @staticmethod\n        def reduce(datas: Iterable[pd.DataFrame], extra: Dict[str, Any]) -> pd.Series:\n            return pd.concat(datas, copy=False)\n"
    },
    {
      "path": "nalepae_pandarallel/pandarallel/data_types/dataframe_groupby.py",
      "content": "import itertools\nfrom typing import Any, Callable, Dict, Iterable, Iterator, List, Tuple, Union, cast\n\nimport pandas as pd\nfrom pandas.core.groupby.generic import DataFrameGroupBy as PandasDataFrameGroupBy\n\nfrom ..utils import chunk, df_indexed_like, get_pandas_version\nfrom .generic import DataType\n\n\nclass DataFrameGroupBy:\n    class Apply(DataType):\n        @staticmethod\n        def get_chunks(\n            nb_workers: int, dataframe_groupby: PandasDataFrameGroupBy, **kwargs\n        ) -> Iterator[List[Tuple[int, pd.DataFrame]]]:\n            chunks = chunk(dataframe_groupby.ngroups, nb_workers)\n            iterator = iter(dataframe_groupby)\n\n            for chunk_ in chunks:\n                yield [next(iterator) for _ in range(chunk_.stop - chunk_.start)]\n\n        @staticmethod\n        def work(\n            data: List[Tuple[int, pd.DataFrame]],\n            user_defined_function: Callable,\n            user_defined_function_args: tuple,\n            user_defined_function_kwargs: Dict[str, Any],\n            extra: Dict[str, Any],\n        ) -> List[Tuple[int, pd.DataFrame, bool]]:\n            def compute_result(\n                key: int, df: pd.DataFrame\n            ) -> Tuple[int, pd.DataFrame, bool]:\n                result = user_defined_function(\n                    df, *user_defined_function_args, **user_defined_function_kwargs\n                )\n                mutated = not df_indexed_like(result, df.axes)\n                return key, result, mutated\n\n            return [compute_result(key, df) for key, df in data]\n\n        @staticmethod\n        def get_reduce_extra(\n            data: PandasDataFrameGroupBy, user_defined_function_kwargs: Dict[str, Any]\n        ) -> Dict[str, Any]:\n            return {\"df_groupby\": data}\n\n        @staticmethod\n        def reduce(\n            datas: Iterable[List[Tuple[int, pd.DataFrame, bool]]], extra: Dict[str, Any]\n        ) -> pd.Series:\n            def 
get_args(\n                keys: List[int],\n                values: List[pd.DataFrame],\n                df_groupby: PandasDataFrameGroupBy,\n            ) -> Union[\n                Tuple[List[int], List[pd.DataFrame]],\n                Tuple[pd.DataFrame, List[int], List[pd.DataFrame]],\n                Tuple[pd.DataFrame, List[pd.DataFrame]],\n            ]:\n                pandas_version = get_pandas_version()\n\n                if pandas_version < (1, 3):\n                    return keys, values\n                elif pandas_version < (1, 4):\n                    return df_groupby._selected_obj, keys, values\n                else:\n                    return df_groupby._selected_obj, values\n\n            df_groupby: PandasDataFrameGroupBy = extra[\"df_groupby\"]\n\n            results = itertools.chain.from_iterable(datas)\n            keys, values, mutated = zip(*results)\n\n            keys = cast(List[int], keys)\n            values = cast(List[pd.DataFrame], values)\n            mutated = cast(List[bool], mutated)\n\n            args = get_args(keys, values, df_groupby)\n  \n            return df_groupby._wrap_applied_output(*args, not_indexed_same=mutated)\n"
    }
  ],
  "OriginCode": [
    {
      "path": "nalepae_pandarallel/setup.py",
      "content": "from setuptools import setup\n\nsetup()\n"
    },
    {
      "path": "nalepae_pandarallel/tests/test_pandarallel.py",
      "content": "import importlib\nimport math\n\nimport numpy as np\nimport pandas as pd\nimport pytest\nfrom pandarallel import pandarallel\n\n\n@pytest.fixture(params=(1000, 1))\ndef df_size(request):\n    return request.param\n\n\n@pytest.fixture(params=(False, True))\ndef progress_bar(request):\n    return request.param\n\n\n@pytest.fixture(params=(None, False))\ndef use_memory_fs(request):\n    return request.param\n\n\n@pytest.fixture(params=(RuntimeError, AttributeError, ZeroDivisionError))\ndef exception(request):\n    return request.param\n\n\n@pytest.fixture(params=(\"named\", \"anonymous\"))\ndef func_dataframe_apply_axis_0(request):\n    def func(x):\n        return max(x) - min(x)\n\n    return dict(named=func, anonymous=lambda x: max(x) - min(x))[request.param]\n\n\n@pytest.fixture(params=(\"named\", \"anonymous\"))\ndef func_dataframe_apply_axis_1(request):\n    def func(x):\n        return math.sin(x.a**2) + math.sin(x.b**2)\n\n    return dict(\n        named=func, anonymous=lambda x: math.sin(x.a**2) + math.sin(x.b**2)\n    )[request.param]\n\n\n@pytest.fixture(params=(\"named\", \"anonymous\"))\ndef func_dataframe_applymap(request):\n    def func(x):\n        return math.sin(x**2) - math.cos(x**2)\n\n    return dict(named=func, anonymous=lambda x: math.sin(x**2) - math.cos(x**2))[\n        request.param\n    ]\n\n\n@pytest.fixture(params=(\"named\", \"anonymous\"))\ndef func_series_map(request):\n    def func(x):\n        return math.log10(math.sqrt(math.exp(x**2)))\n\n    return dict(\n        named=func, anonymous=lambda x: math.log10(math.sqrt(math.exp(x**2)))\n    )[request.param]\n\n\n@pytest.fixture(params=(\"named\", \"anonymous\"))\ndef func_series_apply(request):\n    def func(x, power, bias=0):\n        return math.log10(math.sqrt(math.exp(x**power))) + bias\n\n    return dict(\n        named=func,\n        anonymous=lambda x, power, bias=0: math.log10(math.sqrt(math.exp(x**power)))\n        + bias,\n    
)[request.param]\n\n\n@pytest.fixture(params=(\"named\", \"anonymous\"))\ndef func_series_rolling_apply(request):\n    def func(x):\n        return x.iloc[0] + x.iloc[1] ** 2 + x.iloc[2] ** 3 + x.iloc[3] ** 4\n\n    return dict(\n        named=func,\n        anonymous=lambda x: x.iloc[0]\n        + x.iloc[1] ** 2\n        + x.iloc[2] ** 3\n        + x.iloc[3] ** 4,\n    )[request.param]\n\n\n@pytest.fixture()\ndef func_dataframe_groupby_apply():\n    def func(df):\n        dum = 0\n        for item in df.b:\n            dum += math.log10(math.sqrt(math.exp(item**2)))\n\n        return dum / len(df.b)\n\n    return func\n\n\n@pytest.fixture()\ndef func_dataframe_groupby_apply_complex():\n    def func(df):\n        return pd.DataFrame(\n            [[df.b.mean(), df.b.min(), df.b.max()]],\n            columns=[\"b_mean\", \"b_min\", \"b_max\"],\n        )\n\n    return func\n\n\n@pytest.fixture(params=(\"named\", \"anonymous\"))\ndef func_dataframe_groupby_rolling_apply(request):\n    def func(x):\n        return x.iloc[0] + x.iloc[1] ** 2 + x.iloc[2] ** 3 + x.iloc[3] ** 4\n\n    return dict(\n        named=func,\n        anonymous=lambda x: x.iloc[0]\n        + x.iloc[1] ** 2\n        + x.iloc[2] ** 3\n        + x.iloc[3] ** 4,\n    )[request.param]\n\n\n@pytest.fixture(params=(\"named\", \"anonymous\"))\ndef func_dataframe_groupby_expanding_apply(request):\n    def func(x):\n        return (x.multiply(pd.Series(range(1, len(x)), dtype=\"float\"))).sum()\n\n    return dict(\n        named=func,\n        anonymous=lambda x: (\n            x.multiply(pd.Series(range(1, len(x)), dtype=\"float\"))\n        ).sum(),\n    )[request.param]\n\n\n@pytest.fixture(params=(\"named\", \"anonymous\"))\ndef func_dataframe_apply_axis_0_no_reduce(request):\n    def func(x):\n        return x\n\n    return dict(named=func, anonymous=lambda x: x)[request.param]\n\n\n@pytest.fixture(params=(\"named\", \"anonymous\"))\ndef func_dataframe_apply_axis_1_no_reduce(request):\n    def 
func(x):\n        return x**2\n\n    return dict(named=func, anonymous=lambda x: x**2)[request.param]\n\n\n@pytest.fixture\ndef pandarallel_init(progress_bar, use_memory_fs):\n    pandarallel.initialize(\n        progress_bar=progress_bar, use_memory_fs=use_memory_fs, nb_workers=2\n    )\n\n\ndef test_dataframe_apply_invalid_function(pandarallel_init, exception):\n    def f(_):\n        raise exception\n\n    df = pd.DataFrame(dict(a=[1, 2, 3, 4]))\n\n    with pytest.raises(exception):\n        df.parallel_apply(f)\n\n\ndef test_dataframe_apply_axis_0(pandarallel_init, func_dataframe_apply_axis_0, df_size):\n    df = pd.DataFrame(\n        dict(\n            a=np.random.randint(1, 8, df_size),\n            b=np.random.rand(df_size),\n            c=np.random.randint(1, 8, df_size),\n            d=np.random.rand(df_size),\n            e=np.random.randint(1, 8, df_size),\n            f=np.random.rand(df_size),\n            g=np.random.randint(1, 8, df_size),\n            h=np.random.rand(df_size),\n        )\n    )\n    df.index = [item / 10 for item in df.index]\n\n    res = df.apply(func_dataframe_apply_axis_0)\n    res_parallel = df.parallel_apply(func_dataframe_apply_axis_0)\n    assert res.equals(res_parallel)\n\n\ndef test_dataframe_apply_axis_1(pandarallel_init, func_dataframe_apply_axis_1, df_size):\n    df = pd.DataFrame(\n        dict(a=np.random.randint(1, 8, df_size), b=np.random.rand(df_size))\n    )\n    df.index = [item / 10 for item in df.index]\n\n    res = df.apply(func_dataframe_apply_axis_1, axis=1)\n    res_parallel = df.parallel_apply(func_dataframe_apply_axis_1, axis=1)\n    assert res.equals(res_parallel)\n\n\ndef test_dataframe_apply_invalid_axis(pandarallel_init):\n    df = pd.DataFrame(dict(a=[1, 2, 3, 4]))\n\n    with pytest.raises(ValueError):\n        df.parallel_apply(lambda x: x, axis=\"invalid\")\n    \ndef test_empty_dataframe_apply_axis_0(pandarallel_init, func_dataframe_apply_axis_0):\n    df = pd.DataFrame()\n\n    res = 
df.apply(func_dataframe_apply_axis_0)\n    res_parallel = df.parallel_apply(func_dataframe_apply_axis_0)\n    assert res.equals(res_parallel)\n\ndef test_empty_dataframe_apply_axis_1(pandarallel_init, func_dataframe_apply_axis_1):\n    df = pd.DataFrame()\n\n    res = df.apply(func_dataframe_apply_axis_1)\n    res_parallel = df.parallel_apply(func_dataframe_apply_axis_1)\n    assert res.equals(res_parallel)\n\n\ndef test_dataframe_applymap(pandarallel_init, func_dataframe_applymap, df_size):\n    df = pd.DataFrame(\n        dict(a=np.random.randint(1, 8, df_size), b=np.random.rand(df_size))\n    )\n    df.index = [item / 10 for item in df.index]\n\n    res = df.applymap(func_dataframe_applymap)\n    res_parallel = df.parallel_applymap(func_dataframe_applymap)\n    assert res.equals(res_parallel)\n\n\ndef test_series_map(pandarallel_init, func_series_map, df_size):\n    df = pd.DataFrame(dict(a=np.random.rand(df_size) + 1))\n\n    res = df.a.map(func_series_map)\n    res_parallel = df.a.parallel_map(func_series_map)\n    assert res.equals(res_parallel)\n\n\ndef test_series_apply(pandarallel_init, func_series_apply, df_size):\n    df = pd.DataFrame(dict(a=np.random.rand(df_size) + 1))\n\n    res = df.a.apply(func_series_apply, args=(2,), bias=3)\n    res_parallel = df.a.parallel_apply(func_series_apply, args=(2,), bias=3)\n    assert res.equals(res_parallel)\n\ndef test_empty_series_apply(pandarallel_init, func_series_apply):\n    df = pd.DataFrame(dict(a=[]))\n\n    res = df.a.apply(func_series_apply, args=(2,), bias=3)\n    res_parallel = df.a.parallel_apply(func_series_apply, args=(2,), bias=3)\n    assert res.equals(res_parallel)\n\n\ndef test_series_rolling_apply(pandarallel_init, func_series_rolling_apply, df_size):\n    df = pd.DataFrame(dict(a=np.random.randint(1, 8, df_size), b=list(range(df_size))))\n\n    res = df.b.rolling(4).apply(func_series_rolling_apply, raw=False)\n    res_parallel = df.b.rolling(4).parallel_apply(func_series_rolling_apply, 
raw=False)\n\n    assert res.equals(res_parallel)\n\n\ndef test_dataframe_groupby_apply(\n    pandarallel_init, func_dataframe_groupby_apply, df_size\n):\n    df = pd.DataFrame(\n        dict(\n            a=np.random.randint(1, 8, df_size),\n            b=np.random.rand(df_size),\n            c=np.random.rand(df_size),\n        )\n    )\n\n    res = df.groupby(\"a\").apply(func_dataframe_groupby_apply)\n    res_parallel = df.groupby(\"a\").parallel_apply(func_dataframe_groupby_apply)\n    assert res.equals(res_parallel)\n\n    res = df.groupby([\"a\"]).apply(func_dataframe_groupby_apply)\n    res_parallel = df.groupby([\"a\"]).parallel_apply(func_dataframe_groupby_apply)\n    assert res.equals(res_parallel)\n\n    res = df.groupby([\"a\", \"b\"]).apply(func_dataframe_groupby_apply)\n    res_parallel = df.groupby([\"a\", \"b\"]).parallel_apply(func_dataframe_groupby_apply)\n    assert res.equals(res_parallel)\n\n\ndef test_dataframe_groupby_apply_complex(\n    pandarallel_init, func_dataframe_groupby_apply_complex, df_size\n):\n    df = pd.DataFrame(\n        dict(a=np.random.randint(1, 100, df_size), b=np.random.rand(df_size))\n    )\n\n    res = df.groupby(\"a\").apply(func_dataframe_groupby_apply_complex)\n    res_parallel = df.groupby(\"a\").parallel_apply(func_dataframe_groupby_apply_complex)\n    res.equals(res_parallel)\n\n\ndef test_dataframe_groupby_rolling_apply(\n    pandarallel_init, func_dataframe_groupby_rolling_apply, df_size\n):\n    df = pd.DataFrame(\n        dict(a=np.random.randint(1, 10, df_size), b=np.random.rand(df_size))\n    )\n\n    res = (\n        df.groupby(\"a\")\n        .b.rolling(4)\n        .apply(func_dataframe_groupby_rolling_apply, raw=False)\n    )\n    res_parallel = (\n        df.groupby(\"a\")\n        .b.rolling(4)\n        .parallel_apply(func_dataframe_groupby_rolling_apply, raw=False)\n    )\n    assert res.equals(res_parallel)\n\n\ndef test_dataframe_groupby_expanding_apply(\n    pandarallel_init, 
func_dataframe_groupby_expanding_apply, df_size\n):\n    df = pd.DataFrame(\n        dict(a=np.random.randint(1, 10, df_size), b=np.random.rand(df_size))\n    )\n\n    res = (\n        df.groupby(\"a\")\n        .b.expanding()\n        .apply(func_dataframe_groupby_expanding_apply, raw=False)\n    )\n    res_parallel = (\n        df.groupby(\"a\")\n        .b.expanding()\n        .parallel_apply(func_dataframe_groupby_expanding_apply, raw=False)\n    )\n    res.equals(res_parallel)\n\n\ndef test_dataframe_axis_0_no_reduction(\n    pandarallel_init, func_dataframe_apply_axis_0_no_reduce, df_size\n):\n    df = pd.DataFrame(\n        dict(\n            a=np.random.randint(1, 10, df_size),\n            b=np.random.randint(1, 10, df_size),\n            c=np.random.randint(1, 10, df_size),\n        )\n    )\n    res = df.apply(func_dataframe_apply_axis_0_no_reduce)\n\n    res_parallel = df.parallel_apply(func_dataframe_apply_axis_0_no_reduce)\n\n    assert res.equals(res_parallel)\n\n\ndef test_dataframe_axis_1_no_reduction(\n    pandarallel_init, func_dataframe_apply_axis_1_no_reduce, df_size\n):\n    df = pd.DataFrame(\n        dict(\n            a=np.random.randint(1, 10, df_size),\n            b=np.random.randint(1, 10, df_size),\n            c=np.random.randint(1, 10, df_size),\n        )\n    )\n\n    res = df.apply(func_dataframe_apply_axis_1_no_reduce, axis=1)\n\n    res_parallel = df.parallel_apply(func_dataframe_apply_axis_1_no_reduce, axis=1)\n\n    assert res.equals(res_parallel)\n\ndef test_memory_fs_root_environment_variable(monkeypatch):\n    monkeypatch.setenv(\"MEMORY_FS_ROOT\", \"/test\")\n    from pandarallel import core\n    importlib.reload(core)\n\n    assert core.MEMORY_FS_ROOT == \"/test\"\n"
    },
    {
      "path": "nalepae_pandarallel/pandarallel/utils.py",
      "content": "import itertools\nfrom enum import Enum\nfrom typing import Any, Dict, List, Tuple\n\nimport pandas as pd\nfrom pandas import DataFrame, Index\n\n\ndef chunk(nb_item: int, nb_chunks: int, start_offset=0) -> List[slice]:\n    \"\"\"\n    Return `nb_chunks` slices of approximatively `nb_item / nb_chunks` each.\n\n    Parameters\n    ----------\n    nb_item : int\n        Total number of items\n\n    nb_chunks : int\n        Number of chunks to return\n\n    start_offset : int\n        Shift start of slice by this amount\n\n    Returns\n    -------\n    A list of slices\n\n    Examples\n    --------\n    >>> chunks = chunk(103, 4)\n    >>> chunks\n    [slice(0, 26, None), slice(26, 52, None), slice(52, 78, None), slice(78, 103, None)]\n    \"\"\"\n    if nb_item == 0:\n        return [slice(0)]\n    \n    if nb_item <= nb_chunks:\n        return [slice(max(0, idx - start_offset), idx + 1) for idx in range(nb_item)]\n\n    quotient = nb_item // nb_chunks\n    remainder = nb_item % nb_chunks\n\n    quotients = [quotient] * nb_chunks\n    remainders = [1] * remainder + [0] * (nb_chunks - remainder)\n\n    nb_elems_per_chunk = [\n        quotient + remainder for quotient, remainder in zip(quotients, remainders)\n    ]\n\n    accumulated = list(itertools.accumulate(nb_elems_per_chunk))\n    shifted_accumulated = accumulated.copy()\n    shifted_accumulated.insert(0, 0)\n    shifted_accumulated.pop()\n\n    return [\n        slice(max(0, begin - start_offset), end)\n        for begin, end in zip(shifted_accumulated, accumulated)\n    ]\n\n\ndef df_indexed_like(df: DataFrame, axes: List[Index]) -> bool:\n    \"\"\"\n    Returns whether a data frame is indexed in the way specified by the\n    provided axes.\n\n    Used by DataFrameGroupBy to determine whether a group has been modified.\n\n    Function adapted from pandas.core.groupby.ops._is_indexed_like\n\n    Parameters\n    ----------\n    df : DataFrame\n        The data frame in question\n\n    axes : 
List[Index]\n        The axes to which the data frame is compared\n\n    Returns\n    -------\n    Whether or not the data frame is indexed in the same wa as the axes.\n    \"\"\"\n    if isinstance(df, DataFrame):\n        return df.axes[0].equals(axes[0])\n\n    return False\n\n\ndef get_pandas_version() -> Tuple[int, int]:\n    major_str, minor_str, *_ = pd.__version__.split(\".\")\n    return int(major_str), int(minor_str)\n\n\ndef get_axis_int(user_defined_function_kwargs: Dict[str, Any]):\n    axis = user_defined_function_kwargs.get(\"axis\", 0)\n\n    if axis not in {0, 1, \"index\", \"columns\"}:\n        raise ValueError(f\"No axis named {axis} for object type DataFrame\")\n\n    return {0: 0, 1: 1, \"index\": 0, \"columns\": 1}[axis]\n\n\nclass WorkerStatus(int, Enum):\n    Running = 0\n    Success = 1\n    Error = 2\n"
    },
    {
      "path": "nalepae_pandarallel/pandarallel/core.py",
      "content": "import multiprocessing\nimport os\nimport pickle\nfrom itertools import count\nfrom multiprocessing.managers import SyncManager\nfrom pathlib import Path\nfrom tempfile import NamedTemporaryFile\nfrom typing import Any, Callable, Dict, Iterator, Optional, Tuple, Type, cast\n\nimport dill\nimport pandas as pd\nimport psutil\nfrom pandas.core.groupby import DataFrameGroupBy as PandaDataFrameGroupBy\nfrom pandas.core.window.expanding import ExpandingGroupby as PandasExpandingGroupby\nfrom pandas.core.window.rolling import RollingGroupby as PandasRollingGroupby\n\nfrom .data_types import (\n    DataFrame,\n    DataFrameGroupBy,\n    DataType,\n    ExpandingGroupBy,\n    RollingGroupBy,\n    Series,\n    SeriesRolling,\n)\nfrom .progress_bars import ProgressBarsType, get_progress_bars, progress_wrapper\nfrom .utils import WorkerStatus\n\nON_WINDOWS = os.name == \"nt\"\nCONTEXT = multiprocessing.get_context(\"spawn\" if ON_WINDOWS else \"fork\")\n\n# Root of Memory File System\nMEMORY_FS_ROOT = os.environ.get(\"MEMORY_FS_ROOT\", \"/dev/shm\")\n\n# By default, Pandarallel use all available CPUs\nNB_PHYSICAL_CORES = psutil.cpu_count(logical=False)\n\n# Prefix and suffix for files used with Memory File System\nPREFIX = \"pandarallel\"\nPREFIX_INPUT = f\"{PREFIX}_input_\"\nPREFIX_OUTPUT = f\"{PREFIX}_output_\"\nSUFFIX = \".pickle\"\n\n# We use these classes decorators pattern instead of the classic one because of this:\n# https://www.stevenengelhardt.com/2013/01/16/python-multiprocessing-module-and-closures/\n\n\nclass WrapWorkFunctionForFileSystem:\n    def __init__(\n        self,\n        work_function: Callable[\n            [Any, Callable, tuple, Dict[str, Any], Dict[str, Any]], Any\n        ],\n    ) -> None:\n        self.work_function = work_function\n\n    def __call__(\n        self,\n        input_file_path: Path,\n        output_file_path: Path,\n        progress_bars_type: ProgressBarsType,\n        worker_index: int,\n        
master_workers_queue: multiprocessing.Queue,\n        dilled_user_defined_function: bytes,\n        user_defined_function_args: tuple,\n        user_defined_function_kwargs: Dict[str, Any],\n        extra: Dict[str, Any],\n    ) -> None:\n        try:\n            # Load dataframe from input file\n            with input_file_path.open(\"rb\") as file_descriptor:\n                data = pickle.load(file_descriptor)\n\n            # Delete input file since we don't need it any more. It will free some RAM\n            # since the input file is stored into Shared Memory.\n            input_file_path.unlink()\n\n            data_size = len(data)\n            user_defined_function: Callable = dill.loads(dilled_user_defined_function)\n\n            progress_wrapped_user_defined_function = progress_wrapper(\n                user_defined_function, master_workers_queue, worker_index, data_size\n            )\n\n            used_user_defined_function = (\n                progress_wrapped_user_defined_function\n                if progress_bars_type\n                in (\n                    ProgressBarsType.InUserDefinedFunction,\n                    ProgressBarsType.InUserDefinedFunctionMultiplyByNumberOfColumns,\n                )\n                else user_defined_function\n            )\n\n            result = self.work_function(\n                data,\n                used_user_defined_function,\n                user_defined_function_args,\n                user_defined_function_kwargs,\n                extra,\n            )\n\n            with output_file_path.open(\"wb\") as file_descriptor:\n                pickle.dump(result, file_descriptor)\n\n            master_workers_queue.put((worker_index, WorkerStatus.Success, None))\n\n        except:\n            master_workers_queue.put((worker_index, WorkerStatus.Error, None))\n            raise\n\n\nclass WrapWorkFunctionForPipe:\n    def __init__(\n        self,\n        work_function: Callable[\n            [\n           
     Any,\n                Callable,\n                tuple,\n                Dict[str, Any],\n                Dict[str, Any],\n            ],\n            Any,\n        ],\n    ) -> None:\n        self.work_function = work_function\n\n    def __call__(\n        self,\n        data: Any,\n        progress_bars_type: ProgressBarsType,\n        worker_index: int,\n        master_workers_queue: multiprocessing.Queue,\n        dilled_user_defined_function: bytes,\n        user_defined_function_args: tuple,\n        user_defined_function_kwargs: Dict[str, Any],\n        extra: Dict[str, Any],\n    ) -> Any:\n        try:\n            data_size = len(data)\n            user_defined_function: Callable = dill.loads(dilled_user_defined_function)\n\n            progress_wrapped_user_defined_function = progress_wrapper(\n                user_defined_function, master_workers_queue, worker_index, data_size\n            )\n\n            used_user_defined_function = (\n                progress_wrapped_user_defined_function\n                if progress_bars_type\n                in (\n                    ProgressBarsType.InUserDefinedFunction,\n                    ProgressBarsType.InUserDefinedFunctionMultiplyByNumberOfColumns,\n                )\n                else user_defined_function\n            )\n\n            results = self.work_function(\n                data,\n                used_user_defined_function,\n                user_defined_function_args,\n                user_defined_function_kwargs,\n                extra,\n            )\n\n            master_workers_queue.put((worker_index, WorkerStatus.Success, None))\n\n            return results\n\n        except:\n            master_workers_queue.put((worker_index, WorkerStatus.Error, None))\n            raise\n\n\ndef wrap_reduce_function_for_file_system(\n    reduce_function: Callable[[Iterator, Dict[str, Any]], Any]\n) -> Callable[[Iterator[Path], Dict[str, Any]], Any]:\n    \"\"\"This wrapper transforms a `reduce` 
function which takes as input:\n    - A list of pandas Dataframe\n    - An user defined function\n    and which returns a pandas Dataframe, into a `reduct` function which takes as input:\n    - A list of paths where  pandas Dataframe are pickled\n    which returns a pandas Dataframe.\n    \"\"\"\n\n    def closure(output_file_paths: Iterator[Path], extra: Dict[str, Any]) -> Any:\n        def get_dataframe_and_delete_file(file_path: Path) -> Any:\n            with file_path.open(\"rb\") as file_descriptor:\n                data = pickle.load(file_descriptor)\n\n            file_path.unlink()\n            return data\n\n        dfs = (\n            get_dataframe_and_delete_file(output_file_path)\n            for output_file_path in output_file_paths\n        )\n\n        return reduce_function(dfs, extra)\n\n    return closure\n\n\ndef parallelize_with_memory_file_system(\n    nb_requested_workers: int,\n    data_type: Type[DataType],\n    progress_bars_type: ProgressBarsType,\n):\n    def closure(\n        data: Any,\n        user_defined_function: Callable,\n        *user_defined_function_args: tuple,\n        **user_defined_function_kwargs: Dict[str, Any],\n    ):\n        wrapped_work_function = WrapWorkFunctionForFileSystem(data_type.work)\n        wrapped_reduce_function = wrap_reduce_function_for_file_system(data_type.reduce)\n\n        chunks = list(\n            data_type.get_chunks(\n                nb_requested_workers,\n                data,\n                user_defined_function_kwargs=user_defined_function_kwargs,\n            )\n        )\n\n        nb_workers = len(chunks)\n\n        multiplicator_factor = (\n            len(cast(pd.DataFrame, data).columns)\n            if progress_bars_type\n            == ProgressBarsType.InUserDefinedFunctionMultiplyByNumberOfColumns\n            else 1\n        )\n\n        progresses_length = [len(chunk_) * multiplicator_factor for chunk_ in chunks]\n\n        work_extra = data_type.get_work_extra(data)\n        
reduce_extra = data_type.get_reduce_extra(data, user_defined_function_kwargs)\n\n        show_progress_bars = progress_bars_type != ProgressBarsType.No\n\n        progress_bars = get_progress_bars(progresses_length, show_progress_bars)\n        progresses = [0] * nb_workers\n        workers_status = [WorkerStatus.Running] * nb_workers\n\n        input_files = [\n            NamedTemporaryFile(\n                prefix=PREFIX_INPUT, suffix=SUFFIX, dir=MEMORY_FS_ROOT, delete=False\n            )\n            for _ in range(nb_workers)\n        ]\n\n        output_files = [\n            NamedTemporaryFile(\n                prefix=PREFIX_OUTPUT, suffix=SUFFIX, dir=MEMORY_FS_ROOT, delete=False\n            )\n            for _ in range(nb_workers)\n        ]\n\n        try:\n            for chunk, input_file in zip(chunks, input_files):\n                with Path(input_file.name).open(\"wb\") as file_descriptor:\n                    pickle.dump(chunk, file_descriptor)\n\n            dilled_user_defined_function = dill.dumps(user_defined_function)\n            manager: SyncManager = CONTEXT.Manager()\n            master_workers_queue = manager.Queue()\n\n            work_args_list = [\n                (\n                    Path(input_file.name),\n                    Path(output_file.name),\n                    progress_bars_type,\n                    worker_index,\n                    master_workers_queue,\n                    dilled_user_defined_function,\n                    user_defined_function_args,\n                    user_defined_function_kwargs,\n                    {\n                        **work_extra,\n                        **{\n                            \"master_workers_queue\": master_workers_queue,\n                            \"show_progress_bars\": show_progress_bars,\n                            \"worker_index\": worker_index,\n                        },\n                    },\n                )\n                for worker_index, (\n              
      input_file,\n                    output_file,\n                ) in enumerate(zip(input_files, output_files))\n            ]\n\n            pool = CONTEXT.Pool(nb_workers)\n            results_promise = pool.starmap_async(wrapped_work_function, work_args_list)\n\n            pool.close()\n\n            generation = count()\n\n            while any(\n                (\n                    worker_status == WorkerStatus.Running\n                    for worker_status in workers_status\n                )\n            ):\n                message: Tuple[int, WorkerStatus, Any] = master_workers_queue.get()\n                worker_index, worker_status, payload = message\n                workers_status[worker_index] = worker_status\n\n                if worker_status == WorkerStatus.Success:\n                    progresses[worker_index] = progresses_length[worker_index]\n                    progress_bars.update(progresses)\n                elif worker_status == WorkerStatus.Running:\n                    progress = cast(int, payload)\n                    progresses[worker_index] = progress\n\n                    if next(generation) % nb_workers == 0:\n                        progress_bars.update(progresses)\n                elif worker_status == WorkerStatus.Error:\n                    progress_bars.set_error(worker_index)\n                    progress_bars.update(progresses)\n\n            try:\n                return wrapped_reduce_function(\n                    (Path(output_file.name) for output_file in output_files),\n                    reduce_extra,\n                )\n            except EOFError:\n                # Loading the files failed, this most likely means that there\n                # was some error during processing and the files were never\n                # saved at all.\n                results_promise.get()\n\n                # If the above statement does not raise an exception, that\n                # means the multiprocessing went well and we want 
to re-raise\n                # the original EOFError.\n                raise\n\n        finally:\n            for output_file in output_files:\n                # When pandarallel stop supporting Python 3.7 and older, replace this\n                # try/except clause by:\n                # Path(output_file.name).unlink(missing_ok=True)\n                try:\n                    Path(output_file.name).unlink()\n                except FileNotFoundError:\n                    # Do nothing, this is the nominal case.\n                    pass\n\n    return closure\n\n\ndef parallelize_with_pipe(\n    nb_requested_workers: int,\n    data_type: Type[DataType],\n    progress_bars_type: ProgressBarsType,\n):\n    def closure(\n        data: Any,\n        user_defined_function: Callable,\n        *user_defined_function_args: tuple,\n        **user_defined_function_kwargs: Dict[str, Any],\n    ):\n        wrapped_work_function = WrapWorkFunctionForPipe(data_type.work)\n        dilled_user_defined_function = dill.dumps(user_defined_function)\n        manager: SyncManager = CONTEXT.Manager()\n        master_workers_queue = manager.Queue()\n\n        chunks = list(\n            data_type.get_chunks(\n                nb_requested_workers,\n                data,\n                user_defined_function_kwargs=user_defined_function_kwargs,\n            )\n        )\n\n        nb_workers = len(chunks)\n\n        multiplicator_factor = (\n            len(cast(pd.DataFrame, data).columns)\n            if progress_bars_type\n            == ProgressBarsType.InUserDefinedFunctionMultiplyByNumberOfColumns\n            else 1\n        )\n\n        progresses_length = [len(chunk_) * multiplicator_factor for chunk_ in chunks]\n\n        work_extra = data_type.get_work_extra(data)\n        reduce_extra = data_type.get_reduce_extra(data, user_defined_function_kwargs)\n\n        show_progress_bars = progress_bars_type != ProgressBarsType.No\n\n        progress_bars = 
get_progress_bars(progresses_length, show_progress_bars)\n        progresses = [0] * nb_workers\n        workers_status = [WorkerStatus.Running] * nb_workers\n\n        work_args_list = [\n            (\n                chunk,\n                progress_bars_type,\n                worker_index,\n                master_workers_queue,\n                dilled_user_defined_function,\n                user_defined_function_args,\n                user_defined_function_kwargs,\n                {\n                    **work_extra,\n                    **{\n                        \"master_workers_queue\": master_workers_queue,\n                        \"show_progress_bars\": show_progress_bars,\n                        \"worker_index\": worker_index,\n                    },\n                },\n            )\n            for worker_index, chunk in enumerate(chunks)\n        ]\n\n        pool = CONTEXT.Pool(nb_workers)\n        results_promise = pool.starmap_async(wrapped_work_function, work_args_list)\n        pool.close()\n\n        generation = count()\n\n        while any(\n            (worker_status == WorkerStatus.Running for worker_status in workers_status)\n        ):\n            message: Tuple[int, WorkerStatus, Any] = master_workers_queue.get()\n            worker_index, worker_status, payload = message\n            workers_status[worker_index] = worker_status\n\n            if worker_status == WorkerStatus.Success:\n                progresses[worker_index] = progresses_length[worker_index]\n                progress_bars.update(progresses)\n            elif worker_status == WorkerStatus.Running:\n                progress = cast(int, payload)\n                progresses[worker_index] = progress\n\n                if next(generation) % nb_workers == 0:\n                    progress_bars.update(progresses)\n            elif worker_status == WorkerStatus.Error:\n                progress_bars.set_error(worker_index)\n\n        results = results_promise.get()\n\n        
return data_type.reduce(results, reduce_extra)\n\n    return closure\n\n\nclass pandarallel:\n    @classmethod\n    def initialize(\n        cls,\n        shm_size_mb=None,\n        nb_workers=NB_PHYSICAL_CORES,\n        progress_bar=False,\n        verbose=2,\n        use_memory_fs: Optional[bool] = None,\n    ) -> None:\n        show_progress_bars = progress_bar\n        is_memory_fs_available = Path(MEMORY_FS_ROOT).exists()\n\n        use_memory_fs = (\n            use_memory_fs if use_memory_fs is not None else is_memory_fs_available\n        )\n\n        parallelize = (\n            parallelize_with_memory_file_system\n            if use_memory_fs\n            else parallelize_with_pipe\n        )\n\n        if use_memory_fs and not is_memory_fs_available:\n            raise SystemError(\"Memory file system is not available\")\n\n        if verbose >= 2:\n            print(f\"INFO: Pandarallel will run on {nb_workers} workers.\")\n\n            message = (\n                (\n                    \"INFO: Pandarallel will use Memory file system to transfer data \"\n                    \"between the main process and workers.\"\n                )\n                if use_memory_fs\n                else (\n                    \"INFO: Pandarallel will use standard multiprocessing data transfer \"\n                    \"(pipe) to transfer data between the main process and workers.\"\n                )\n            )\n\n            print(message)\n\n            if ON_WINDOWS and verbose >= 2:\n                print()\n                print(\n                    (\n                        \"WARNING: You are on Windows. 
If you detect any issue with \"\n                        \"pandarallel, be sure you checked out the Troubleshooting page:\"\n                    )\n                )\n                print(\"https://nalepae.github.io/pandarallel/troubleshooting/\")\n\n        progress_bars_in_user_defined_function = (\n            ProgressBarsType.InUserDefinedFunction\n            if show_progress_bars\n            else ProgressBarsType.No\n        )\n\n        progress_bars_in_user_defined_function_multiply_by_number_of_columns = (\n            ProgressBarsType.InUserDefinedFunctionMultiplyByNumberOfColumns\n            if show_progress_bars\n            else ProgressBarsType.No\n        )\n\n        progress_bars_in_work_function = (\n            ProgressBarsType.InWorkFunction\n            if show_progress_bars\n            else ProgressBarsType.No\n        )\n\n        # DataFrame\n        pd.DataFrame.parallel_apply = parallelize(\n            nb_workers, DataFrame.Apply, progress_bars_in_user_defined_function\n        )\n        pd.DataFrame.parallel_applymap = parallelize(\n            nb_workers,\n            DataFrame.ApplyMap,\n            progress_bars_in_user_defined_function_multiply_by_number_of_columns,\n        )\n\n        # DataFrame GroupBy\n        PandaDataFrameGroupBy.parallel_apply = parallelize(\n            nb_workers, DataFrameGroupBy.Apply, progress_bars_in_user_defined_function\n        )\n\n        # Expanding GroupBy\n        PandasExpandingGroupby.parallel_apply = parallelize(\n            nb_workers, ExpandingGroupBy.Apply, progress_bars_in_work_function\n        )\n\n        # Rolling GroupBy\n        PandasRollingGroupby.parallel_apply = parallelize(\n            nb_workers, RollingGroupBy.Apply, progress_bars_in_work_function\n        )\n\n        # Series\n        pd.Series.parallel_apply = parallelize(\n            nb_workers, Series.Apply, progress_bars_in_user_defined_function\n        )\n        pd.Series.parallel_map = 
parallelize(nb_workers, Series.Map, show_progress_bars)\n\n        # Series Rolling\n        pd.core.window.Rolling.parallel_apply = parallelize(\n            nb_workers, SeriesRolling.Apply, progress_bars_in_user_defined_function\n        )\n"
    },
    {
      "path": "nalepae_pandarallel/pandarallel/progress_bars.py",
      "content": "import multiprocessing\nimport os\nimport shutil\nimport sys\nfrom abc import ABC, abstractmethod\nfrom enum import Enum\nfrom itertools import count\nfrom time import time_ns\nfrom typing import Callable, List, Union\n\nfrom .utils import WorkerStatus\n\nINTERVAL_NS = 250_000_000  # 0.25 sec\nMINIMUM_TERMINAL_WIDTH = 72\n\n\nclass ProgressBarsType(int, Enum):\n    No = 0\n    InUserDefinedFunction = 1\n    InUserDefinedFunctionMultiplyByNumberOfColumns = 2\n    InWorkFunction = 3\n\n\nclass ProgressBars(ABC):\n    @abstractmethod\n    def __init__(self, maxs: List[int], show: bool) -> None:\n        ...\n\n    @abstractmethod\n    def update(self, values: List[int]) -> None:\n        ...\n\n    def set_error(self, index: int) -> None:\n        pass\n\n\nclass ProgressState:\n    def __init__(self, chunk_size: int) -> None:\n        self.last_put_iteration = 0\n        self.next_put_iteration = max(chunk_size // 100, 1)\n        self.last_put_time = time_ns()\n\n\ndef is_notebook_lab() -> bool:\n    try:\n        shell: str = get_ipython().__class__.__name__  # type: ignore\n\n        # Shell: Google Colab\n        # TerminalInteractiveShell: Terminal running IPython\n        # ZMQInteractiveShell: Jupyter notebook/lab or qtconsole\n        return shell in {\"Shell\", \"ZMQInteractiveShell\"}\n    except NameError:\n        # Probably standard Python interpreter\n        return False\n\n\nclass ProgressBarsConsole(ProgressBars):\n    def __init__(self, maxs: List[int], show: bool) -> None:\n        self.__show = show\n        self.__bars = [[0, max] for max in maxs]\n        self.__width = self.__get_width()\n\n        self.__lines = self.__update_lines()\n\n        if show:\n            sys.stdout.write(\"\\n\".join(self.__lines))\n            sys.stdout.flush()\n\n    def __get_width(self) -> int:\n        try:\n            columns = shutil.get_terminal_size().columns\n            return max(MINIMUM_TERMINAL_WIDTH, columns - 1)\n        except 
AttributeError:\n            # Python 2\n            pass\n\n        try:\n            columns = int(os.popen(\"stty size\", \"r\").read().split()[1])\n            return max(MINIMUM_TERMINAL_WIDTH, columns - 1)\n        except:\n            return MINIMUM_TERMINAL_WIDTH\n\n    def __remove_displayed_lines(self) -> None:\n        if len(self.__bars) >= 1:\n            sys.stdout.write(\"\\b\" * len(self.__lines[-1]))\n\n        if len(self.__bars) >= 2:\n            sys.stdout.write(\"\\033M\" * (len(self.__lines) - 1))\n\n        self.__lines = []\n\n    def __update_line(self, done: int, total: int) -> str:\n        if total == 0:\n            percent = 0\n        else:\n            percent = done / total\n        bar = (\":\" * int(percent * 40)).ljust(40, \" \")\n        percent = round(percent * 100, 2)\n        format = \" {percent:6.2f}% {bar:s} | {done:8d} / {total:8d} |\"\n        ret = format.format(percent=percent, bar=bar, done=done, total=total)\n        return ret[: self.__width].ljust(self.__width, \" \")\n\n    def __update_lines(self) -> List[str]:\n        return [self.__update_line(value, max) for value, max in self.__bars]\n\n    def update(self, values: List[int]) -> None:\n        \"\"\"Update a bar value.\n        Positional arguments:\n        values - The new values of each bar\n        \"\"\"\n        if not self.__show:\n            return\n\n        for index, value in enumerate(values):\n            self.__bars[index][0] = value\n\n        self.__remove_displayed_lines()\n        self.__lines = self.__update_lines()\n\n        sys.stdout.write(\"\\n\".join(self.__lines))\n        sys.stdout.flush()\n\n\nclass ProgressBarsNotebookLab(ProgressBars):\n    def __init__(self, maxs: List[int], show: bool) -> None:\n        \"\"\"Initialization.\n        Positional argument:\n        maxs - List containing the max value of each progress bar\n        \"\"\"\n        self.__show = show\n\n        if not show:\n            return\n\n        from 
IPython.display import display\n        from ipywidgets import HBox, IntProgress, Label, VBox\n\n        self.__bars = [\n            HBox(\n                [\n                    IntProgress(0, 0, max, description=\"{:.2f}%\".format(0)),\n                    Label(\"{} / {}\".format(0, max)),\n                ]\n            )\n            for max in maxs\n        ]\n\n        display(VBox(self.__bars))\n\n    def update(self, values: List[int]) -> None:\n        \"\"\"Update a bar value.\n        Positional arguments:\n        values - The new values of each bar\n        \"\"\"\n        if not self.__show:\n            return\n\n        for index, value in enumerate(values):\n            bar, label = self.__bars[index].children\n\n            label.value = \"{} / {}\".format(value, bar.max)\n            \n            bar.value = value\n\n            if value >= bar.max:\n                bar.bar_style = \"success\"\n\n            if bar.max != 0:\n                bar.description = \"{:.2f}%\".format(bar.value / bar.max * 100)\n\n    def set_error(self, index: int) -> None:\n        \"\"\"Set a bar on error\"\"\"\n        if not self.__show:\n            return\n\n        bar, _ = self.__bars[index].children\n        bar.bar_style = \"danger\"\n\n\ndef get_progress_bars(\n    maxs: List[int], show\n) -> Union[ProgressBarsNotebookLab, ProgressBarsConsole]:\n    return (\n        ProgressBarsNotebookLab(maxs, show)\n        if is_notebook_lab()\n        else ProgressBarsConsole(maxs, show)\n    )\n\n\ndef progress_wrapper(\n    user_defined_function: Callable,\n    master_workers_queue: multiprocessing.Queue,\n    index: int,\n    chunk_size: int,\n) -> Callable:\n    \"\"\"Wrap the function to apply in a function which monitor the part of work already\n    done.\n    \"\"\"\n    counter = count()\n    state = ProgressState(chunk_size)\n\n    def closure(*user_defined_function_args, **user_defined_functions_kwargs):\n        iteration = next(counter)\n\n        if 
iteration == state.next_put_iteration:\n            time_now = time_ns()\n            master_workers_queue.put_nowait((index, WorkerStatus.Running, iteration))\n\n            delta_t = time_now - state.last_put_time\n            delta_i = iteration - state.last_put_iteration\n\n            state.next_put_iteration += (\n                max(int((delta_i / delta_t) * INTERVAL_NS), 1) if delta_t != 0 else 1\n            )\n\n            state.last_put_iteration = iteration\n            state.last_put_time = time_now\n\n        return user_defined_function(\n            *user_defined_function_args, **user_defined_functions_kwargs\n        )\n\n    return closure\n"
    },
    {
      "path": "nalepae_pandarallel/pandarallel/__init__.py",
      "content": "from .core import pandarallel\n\n__version__ = \"1.6.5\"\n"
    },
    {
      "path": "nalepae_pandarallel/pandarallel/data_types/expanding_groupby.py",
      "content": "import multiprocessing\nfrom typing import Any, Callable, Dict, Iterable, Iterator, List, Tuple\n\nimport pandas as pd\nfrom pandas.core.window.expanding import ExpandingGroupby as PandasExpandingGroupby\n\nfrom ..utils import WorkerStatus, chunk, get_pandas_version\nfrom .generic import DataType\n\n\nclass ExpandingGroupBy:\n    class Apply(DataType):\n        @staticmethod\n        def get_chunks(\n            nb_workers: int, data: PandasExpandingGroupby, *args, **kwargs\n        ) -> Iterator[List[Tuple[int, pd.DataFrame]]]:\n            pandas_version = get_pandas_version()\n\n            nb_items = (\n                len(data._groupby) if pandas_version < (1, 3) else data._grouper.ngroups\n            )\n\n            chunks = chunk(nb_items, nb_workers)\n\n            iterator = (\n                iter(data._groupby)\n                if pandas_version < (1, 3)\n                else data._grouper.get_iterator(data.obj)\n            )\n\n            for chunk_ in chunks:\n                yield [next(iterator) for _ in range(chunk_.stop - chunk_.start)]\n\n        @staticmethod\n        def get_work_extra(data: PandasExpandingGroupby):\n            attributes = {\n                attribute: getattr(data, attribute) for attribute in data._attributes\n            }\n\n            return {\"attributes\": attributes}\n\n        @staticmethod\n        def work(\n            data: List[Tuple[int, pd.DataFrame]],\n            user_defined_function: Callable,\n            user_defined_function_args: tuple,\n            user_defined_function_kwargs: Dict[str, Any],\n            extra: Dict[str, Any],\n        ) -> List[pd.DataFrame]:\n            show_progress_bars: bool = extra[\"show_progress_bars\"]\n            master_workers_queue: multiprocessing.Queue = extra[\"master_workers_queue\"]\n            worker_index: int = extra[\"worker_index\"]\n\n            def compute_result(\n                iteration: int,\n                attributes: Dict[str, 
Any],\n                index: int,\n                df: pd.DataFrame,\n                user_defined_function: Callable,\n                user_defined_function_args: tuple,\n                user_defined_function_kwargs: Dict[str, Any],\n            ) -> pd.DataFrame:\n                item = df.expanding(**attributes).apply(\n                    user_defined_function,\n                    *user_defined_function_args,\n                    **user_defined_function_kwargs\n                )\n\n                item.index = pd.MultiIndex.from_product([[index], item.index])\n\n                if show_progress_bars:\n                    master_workers_queue.put_nowait(\n                        (worker_index, WorkerStatus.Running, iteration)\n                    )\n\n                return item\n\n            attributes = extra[\"attributes\"]\n            attributes.pop(\"_grouper\", None)\n\n            dfs = (\n                compute_result(\n                    iteration,\n                    attributes,\n                    index,\n                    df,\n                    user_defined_function,\n                    user_defined_function_args,\n                    user_defined_function_kwargs,\n                )\n                for iteration, (index, df) in enumerate(data)\n            )\n\n            return pd.concat(dfs)\n\n        @staticmethod\n        def reduce(datas: Iterable[pd.DataFrame], extra: Dict[str, Any]) -> pd.Series:\n            return pd.concat(datas, copy=False)\n"
    },
    {
      "path": "nalepae_pandarallel/pandarallel/data_types/series.py",
      "content": "from typing import Any, Callable, Dict, Iterable, Iterator\n\nimport pandas as pd\n\nfrom ..utils import chunk\nfrom .generic import DataType\n\n\nclass Series:\n    class Apply(DataType):\n        @staticmethod\n        def get_chunks(\n            nb_workers: int, data: pd.Series, **kwargs\n        ) -> Iterator[pd.Series]:\n            for chunk_ in chunk(data.size, nb_workers):\n                yield data.iloc[chunk_]\n\n        @staticmethod\n        def work(\n            data: pd.Series,\n            user_defined_function: Callable,\n            user_defined_function_args: tuple,\n            user_defined_function_kwargs: Dict[str, Any],\n            extra: Dict[str, Any],\n        ) -> pd.Series:\n            return data.apply(\n                user_defined_function,\n                *user_defined_function_args,\n                **user_defined_function_kwargs\n            )\n\n        @staticmethod\n        def reduce(datas: Iterable[pd.Series], extra: Dict[str, Any]) -> pd.Series:\n            return pd.concat(datas, copy=False)\n\n    class Map(DataType):\n        @staticmethod\n        def get_chunks(\n            nb_workers: int, data: pd.Series, **kwargs\n        ) -> Iterator[pd.Series]:\n            for chunk_ in chunk(data.size, nb_workers):\n                yield data.iloc[chunk_]\n\n        @staticmethod\n        def work(\n            data: pd.Series,\n            user_defined_function: Callable,\n            user_defined_function_args: tuple,\n            user_defined_function_kwargs: Dict[str, Any],\n            extra: Dict[str, Any],\n        ) -> pd.Series:\n            return data.map(\n                user_defined_function,\n                *user_defined_function_args,\n                **user_defined_function_kwargs\n            )\n\n        @staticmethod\n        def reduce(datas: Iterable[pd.Series], extra: Dict[str, Any]) -> pd.Series:\n            return pd.concat(datas, copy=False)\n"
    },
    {
      "path": "nalepae_pandarallel/pandarallel/data_types/series_rolling.py",
      "content": "from typing import Any, Callable, Dict, Iterable, Iterator\n\nimport pandas as pd\nfrom pandas.core.window.rolling import Rolling\n\nfrom ..utils import chunk\nfrom .generic import DataType\n\n\nclass SeriesRolling:\n    class Apply(DataType):\n        @staticmethod\n        def get_chunks(\n            nb_workers: int, rolling: Rolling, **kwargs\n        ) -> Iterator[pd.Series]:\n            chunks = chunk(rolling.obj.size, nb_workers, rolling.window)\n\n            for chunk_ in chunks:\n                yield rolling.obj[chunk_]\n\n        @staticmethod\n        def get_work_extra(data: Rolling) -> Dict[str, Any]:\n            return {\n                \"attributes\": {\n                    attribute: getattr(data, attribute)\n                    for attribute in data._attributes\n                }\n            }\n\n        @staticmethod\n        def work(\n            data: pd.Series,\n            user_defined_function: Callable,\n            user_defined_function_args: tuple,\n            user_defined_function_kwargs: Dict[str, Any],\n            extra: Dict[str, Any],\n        ) -> pd.Series:\n            attributes: Dict[str, Any] = extra[\"attributes\"]\n            worker_index: int = extra[\"worker_index\"]\n\n            result = data.rolling(**attributes).apply(\n                user_defined_function,\n                *user_defined_function_args,\n                **user_defined_function_kwargs\n            )\n\n            return result if worker_index == 0 else result[attributes[\"window\"] :]\n\n        @staticmethod\n        def reduce(datas: Iterable[pd.Series], extra: Dict[str, Any]) -> pd.Series:\n            return pd.concat(datas, copy=False)\n"
    },
    {
      "path": "nalepae_pandarallel/pandarallel/data_types/__init__.py",
      "content": "from .dataframe import DataFrame\nfrom .dataframe_groupby import DataFrameGroupBy\nfrom .expanding_groupby import ExpandingGroupBy\nfrom .rolling_groupby import RollingGroupBy\nfrom .generic import DataType\nfrom .series import Series\nfrom .series_rolling import SeriesRolling\n"
    },
    {
      "path": "nalepae_pandarallel/pandarallel/data_types/dataframe.py",
      "content": "from typing import Any, Callable, Dict, Iterable, Iterator\nfrom types import GeneratorType\n\nimport pandas as pd\n\nfrom ..utils import chunk, get_axis_int\nfrom .generic import DataType\n\n\nclass DataFrame:\n    class Apply(DataType):\n        @staticmethod\n        def get_chunks(\n            nb_workers: int, data: pd.DataFrame, **kwargs\n        ) -> Iterator[pd.DataFrame]:\n            user_defined_function_kwargs = kwargs[\"user_defined_function_kwargs\"]\n\n            axis_int = get_axis_int(user_defined_function_kwargs)\n            opposite_axis_int = 1 - axis_int\n\n            for chunk_ in chunk(data.shape[opposite_axis_int], nb_workers):\n                yield data.iloc[chunk_] if axis_int == 1 else data.iloc[:, chunk_]\n\n        @staticmethod\n        def work(\n            data: pd.DataFrame,\n            user_defined_function: Callable,\n            user_defined_function_args: tuple,\n            user_defined_function_kwargs: Dict[str, Any],\n            extra: Dict[str, Any],\n        ) -> pd.DataFrame:\n            return data.apply(\n                user_defined_function,\n                *user_defined_function_args,\n                **user_defined_function_kwargs,\n            )\n\n        @staticmethod\n        def get_reduce_extra(\n            data: Any, user_defined_function_kwargs: Dict[str, Any]\n        ) -> Dict[str, Any]:\n            return {\"axis\": get_axis_int(user_defined_function_kwargs)}\n\n        @staticmethod\n        def reduce(\n            datas: Iterable[pd.DataFrame], extra: Dict[str, Any]\n        ) -> pd.DataFrame:\n            if isinstance(datas, GeneratorType):\n                datas = list(datas)\n            axis = 0 if isinstance(datas[0], pd.Series) else 1 - extra[\"axis\"]\n            return pd.concat(datas, copy=False, axis=axis)\n\n    class ApplyMap(DataType):\n        @staticmethod\n        def get_chunks(\n            nb_workers: int, data: pd.DataFrame, **kwargs\n        ) -> 
Iterator[pd.DataFrame]:\n            for chunk_ in chunk(data.shape[0], nb_workers):\n                yield data.iloc[chunk_]\n\n        @staticmethod\n        def work(\n            data: pd.DataFrame,\n            user_defined_function: Callable,\n            user_defined_function_args: tuple,\n            user_defined_function_kwargs: Dict[str, Any],\n            extra: Dict[str, Any],\n        ) -> pd.DataFrame:\n            return data.applymap(user_defined_function)\n\n        @staticmethod\n        def reduce(\n            datas: Iterable[pd.DataFrame], extra: Dict[str, Any]\n        ) -> pd.DataFrame:\n            return pd.concat(datas, copy=False)\n"
    },
    {
      "path": "nalepae_pandarallel/pandarallel/data_types/generic.py",
      "content": "from abc import ABC, abstractmethod\nfrom typing import Any, Callable, Dict, Iterable, Iterator\n\n\nclass DataType(ABC):\n    @staticmethod\n    @abstractmethod\n    def get_chunks(nb_workers: int, data: Any, **kwargs) -> Iterator[Any]:\n        ...\n\n    @staticmethod\n    def get_work_extra(data: Any) -> Dict[str, Any]:\n        return dict()\n\n    @staticmethod\n    @abstractmethod\n    def work(\n        data: Any,\n        user_defined_function: Callable,\n        user_defined_function_args: tuple,\n        user_defined_function_kwargs: Dict[str, Any],\n        extra: Dict[str, Any],\n    ) -> Any:\n        ...\n\n    @staticmethod\n    def get_reduce_extra(\n        data: Any, user_defined_function_kwargs: Dict[str, Any]\n    ) -> Dict[str, Any]:\n        return dict()\n\n    @staticmethod\n    @abstractmethod\n    def reduce(datas: Iterable[Any], extra: Dict[str, Any]) -> Any:\n        ...\n"
    },
    {
      "path": "nalepae_pandarallel/pandarallel/data_types/rolling_groupby.py",
      "content": "import multiprocessing\nfrom typing import Any, Callable, Dict, Iterable, Iterator, List, Tuple\n\nimport pandas as pd\nfrom pandas.core.window.rolling import RollingGroupby as PandasRollingGroupby\n\nfrom ..utils import WorkerStatus, chunk, get_pandas_version\nfrom .generic import DataType\n\n\nclass RollingGroupBy:\n    class Apply(DataType):\n        @staticmethod\n        def get_chunks(\n            nb_workers: int, data: PandasRollingGroupby, *args, **kwargs\n        ) -> Iterator[List[Tuple[int, pd.DataFrame]]]:\n            pandas_version = get_pandas_version()\n\n            nb_items = (\n                len(data._groupby) if pandas_version < (1, 3) else data._grouper.ngroups\n            )\n\n            chunks = chunk(nb_items, nb_workers)\n\n            iterator = (\n                iter(data._groupby)\n                if pandas_version < (1, 3)\n                else data._grouper.get_iterator(data.obj)\n            )\n\n            for chunk_ in chunks:\n                yield [next(iterator) for _ in range(chunk_.stop - chunk_.start)]\n\n        @staticmethod\n        def get_work_extra(data: PandasRollingGroupby):\n            attributes = {\n                attribute: getattr(data, attribute) for attribute in data._attributes\n            }\n\n            return {\"attributes\": attributes}\n\n        @staticmethod\n        def work(\n            data: List[Tuple[int, pd.DataFrame]],\n            user_defined_function: Callable,\n            user_defined_function_args: tuple,\n            user_defined_function_kwargs: Dict[str, Any],\n            extra: Dict[str, Any],\n        ) -> List[pd.DataFrame]:\n            show_progress_bars: bool = extra[\"show_progress_bars\"]\n            master_workers_queue: multiprocessing.Queue = extra[\"master_workers_queue\"]\n            worker_index: int = extra[\"worker_index\"]\n\n            def compute_result(\n                iteration: int,\n                attributes: Dict[str, Any],\n     
           index: int,\n                df: pd.DataFrame,\n                user_defined_function: Callable,\n                user_defined_function_args: tuple,\n                user_defined_function_kwargs: Dict[str, Any],\n            ) -> pd.DataFrame:\n                item = df.rolling(**attributes).apply(\n                    user_defined_function,\n                    *user_defined_function_args,\n                    **user_defined_function_kwargs\n                )\n\n                item.index = pd.MultiIndex.from_product([[index], item.index])\n\n                if show_progress_bars:\n                    master_workers_queue.put_nowait(\n                        (worker_index, WorkerStatus.Running, iteration)\n                    )\n\n                return item\n\n            attributes = extra[\"attributes\"]\n            attributes.pop(\"_grouper\", None)\n\n            dfs = (\n                compute_result(\n                    iteration,\n                    attributes,\n                    index,\n                    df,\n                    user_defined_function,\n                    user_defined_function_args,\n                    user_defined_function_kwargs,\n                )\n                for iteration, (index, df) in enumerate(data)\n            )\n\n            return pd.concat(dfs)\n\n        @staticmethod\n        def reduce(datas: Iterable[pd.DataFrame], extra: Dict[str, Any]) -> pd.Series:\n            return pd.concat(datas, copy=False)\n"
    },
    {
      "path": "nalepae_pandarallel/pandarallel/data_types/dataframe_groupby.py",
      "content": "import itertools\nfrom typing import Any, Callable, Dict, Iterable, Iterator, List, Tuple, Union, cast\n\nimport pandas as pd\nfrom pandas.core.groupby.generic import DataFrameGroupBy as PandasDataFrameGroupBy\n\nfrom ..utils import chunk, df_indexed_like, get_pandas_version\nfrom .generic import DataType\n\n\nclass DataFrameGroupBy:\n    class Apply(DataType):\n        @staticmethod\n        def get_chunks(\n            nb_workers: int, dataframe_groupby: PandasDataFrameGroupBy, **kwargs\n        ) -> Iterator[List[Tuple[int, pd.DataFrame]]]:\n            chunks = chunk(dataframe_groupby.ngroups, nb_workers)\n            iterator = iter(dataframe_groupby)\n\n            for chunk_ in chunks:\n                yield [next(iterator) for _ in range(chunk_.stop - chunk_.start)]\n\n        @staticmethod\n        def work(\n            data: List[Tuple[int, pd.DataFrame]],\n            user_defined_function: Callable,\n            user_defined_function_args: tuple,\n            user_defined_function_kwargs: Dict[str, Any],\n            extra: Dict[str, Any],\n        ) -> List[Tuple[int, pd.DataFrame, bool]]:\n            def compute_result(\n                key: int, df: pd.DataFrame\n            ) -> Tuple[int, pd.DataFrame, bool]:\n                result = user_defined_function(\n                    df, *user_defined_function_args, **user_defined_function_kwargs\n                )\n                mutated = not df_indexed_like(result, df.axes)\n                return key, result, mutated\n\n            return [compute_result(key, df) for key, df in data]\n\n        @staticmethod\n        def get_reduce_extra(\n            data: PandasDataFrameGroupBy, user_defined_function_kwargs: Dict[str, Any]\n        ) -> Dict[str, Any]:\n            return {\"df_groupby\": data}\n\n        @staticmethod\n        def reduce(\n            datas: Iterable[List[Tuple[int, pd.DataFrame, bool]]], extra: Dict[str, Any]\n        ) -> pd.Series:\n            def 
get_args(\n                keys: List[int],\n                values: List[pd.DataFrame],\n                df_groupby: PandasDataFrameGroupBy,\n            ) -> Union[\n                Tuple[List[int], List[pd.DataFrame]],\n                Tuple[pd.DataFrame, List[int], List[pd.DataFrame]],\n                Tuple[pd.DataFrame, List[pd.DataFrame]],\n            ]:\n                pandas_version = get_pandas_version()\n\n                if pandas_version < (1, 3):\n                    return keys, values\n                elif pandas_version < (1, 4):\n                    return df_groupby._selected_obj, keys, values\n                else:\n                    return df_groupby._selected_obj, values\n\n            df_groupby: PandasDataFrameGroupBy = extra[\"df_groupby\"]\n\n            results = itertools.chain.from_iterable(datas)\n            keys, values, mutated = zip(*results)\n\n            keys = cast(List[int], keys)\n            values = cast(List[pd.DataFrame], values)\n            mutated = cast(List[bool], mutated)\n\n            args = get_args(keys, values, df_groupby)\n  \n            return df_groupby._wrap_applied_output(*args, not_indexed_same=mutated)\n"
    }
  ],
  "ErrorMessage": "__________________________________________________________________ test_dataframe_groupby_rolling_apply[anonymous-1000-True-None] ___________________________________________________________________\n\nnb_workers = 2, data = RollingGroupby [window=4,center=False,axis=0,method=single], args = (), kwargs = {'user_defined_function_kwargs': {'raw': False}}, pandas_version = (2, 1), nb_items = 9\nchunks = [slice(0, 5, None), slice(5, 9, None)], chunk_ = slice(5, 9, None)\n\n    @staticmethod\n    def get_chunks(\n        nb_workers: int, data: PandasRollingGroupby, *args, **kwargs\n    ) -> Iterator[List[Tuple[int, pd.DataFrame]]]:\n        pandas_version = get_pandas_version()\n    \n        nb_items = (\n            len(data._groupby) if pandas_version < (1, 3) else data._grouper.ngroups\n        )\n    \n        chunks = chunk(nb_items, nb_workers)\n    \n        iterator = (\n            iter(data._groupby)\n            if pandas_version < (1, 3)\n            else data._grouper.get_iterator(data.obj)\n        )\n    \n        for chunk_ in chunks:\n>           yield [next(iterator) for _ in range(chunk_.stop)]\n\npandarallel/data_types/rolling_groupby.py:32: \n_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _\n\n.0 = <range_iterator object at 0x795bd66eb570>\n\n>   yield [next(iterator) for _ in range(chunk_.stop)]\nE   StopIteration\n\npandarallel/data_types/rolling_groupby.py:32: StopIteration\n\nThe above exception was the direct cause of the following exception:\n\npandarallel_init = None, func_dataframe_groupby_rolling_apply = <function func_dataframe_groupby_rolling_apply.<locals>.<lambda> at 0x795bd4c4f0d0>, df_size = 1000\n\n    def test_dataframe_groupby_rolling_apply(\n        pandarallel_init, func_dataframe_groupby_rolling_apply, df_size\n    ):\n        df = pd.DataFrame(\n            
dict(a=np.random.randint(1, 10, df_size), b=np.random.rand(df_size))\n        )\n    \n        res = (\n            df.groupby(\"a\")\n            .b.rolling(4)\n            .apply(func_dataframe_groupby_rolling_apply, raw=False)\n        )\n        res_parallel = (\n>           df.groupby(\"a\")\n            .b.rolling(4)\n            .parallel_apply(func_dataframe_groupby_rolling_apply, raw=False)\n        )\n\ntests/test_pandarallel.py:321: \n_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _\n\ndata = RollingGroupby [window=4,center=False,axis=0,method=single], user_defined_function = <function func_dataframe_groupby_rolling_apply.<locals>.<lambda> at 0x795bd4c4f0d0>\nuser_defined_function_args = (), user_defined_function_kwargs = {'raw': False}, wrapped_work_function = <pandarallel.core.WrapWorkFunctionForFileSystem object at 0x795bd4e32d90>\nwrapped_reduce_function = <function wrap_reduce_function_for_file_system.<locals>.closure at 0x795bd675b790>\n\n    def closure(\n        data: Any,\n        user_defined_function: Callable,\n        *user_defined_function_args: tuple,\n        **user_defined_function_kwargs: Dict[str, Any],\n    ):\n        wrapped_work_function = WrapWorkFunctionForFileSystem(data_type.work)\n        wrapped_reduce_function = wrap_reduce_function_for_file_system(data_type.reduce)\n    \n>       chunks = list(\n            data_type.get_chunks(\n                nb_requested_workers,\n                data,\n                user_defined_function_kwargs=user_defined_function_kwargs,\n            )\n        )\nE       RuntimeError: generator raised StopIteration\n\npandarallel/core.py:218: RuntimeError\n--------------------------------------------------------------------------------------- Captured stdout setup 
---------------------------------------------------------------------------------------\nINFO: Pandarallel will run on 2 workers.\nINFO: Pandarallel will use Memory file system to transfer data between the main process and workers.\n__________________________________________________________________ test_dataframe_groupby_rolling_apply[anonymous-1000-True-False] __________________________________________________________________\n\nnb_workers = 2, data = RollingGroupby [window=4,center=False,axis=0,method=single], args = (), kwargs = {'user_defined_function_kwargs': {'raw': False}}, pandas_version = (2, 1), nb_items = 9\nchunks = [slice(0, 5, None), slice(5, 9, None)], chunk_ = slice(5, 9, None)\n\n    @staticmethod\n    def get_chunks(\n        nb_workers: int, data: PandasRollingGroupby, *args, **kwargs\n    ) -> Iterator[List[Tuple[int, pd.DataFrame]]]:\n        pandas_version = get_pandas_version()\n    \n        nb_items = (\n            len(data._groupby) if pandas_version < (1, 3) else data._grouper.ngroups\n        )\n    \n        chunks = chunk(nb_items, nb_workers)\n    \n        iterator = (\n            iter(data._groupby)\n            if pandas_version < (1, 3)\n            else data._grouper.get_iterator(data.obj)\n        )\n    \n        for chunk_ in chunks:\n>           yield [next(iterator) for _ in range(chunk_.stop)]\n\npandarallel/data_types/rolling_groupby.py:32: \n_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _\n\n.0 = <range_iterator object at 0x795bd4d8c3c0>\n\n>   yield [next(iterator) for _ in range(chunk_.stop)]\nE   StopIteration\n\npandarallel/data_types/rolling_groupby.py:32: StopIteration\n\nThe above exception was the direct cause of the following exception:\n\npandarallel_init = None, func_dataframe_groupby_rolling_apply = <function 
func_dataframe_groupby_rolling_apply.<locals>.<lambda> at 0x795bd4c4f9d0>, df_size = 1000\n\n    def test_dataframe_groupby_rolling_apply(\n        pandarallel_init, func_dataframe_groupby_rolling_apply, df_size\n    ):\n        df = pd.DataFrame(\n            dict(a=np.random.randint(1, 10, df_size), b=np.random.rand(df_size))\n        )\n    \n        res = (\n            df.groupby(\"a\")\n            .b.rolling(4)\n            .apply(func_dataframe_groupby_rolling_apply, raw=False)\n        )\n        res_parallel = (\n>           df.groupby(\"a\")\n            .b.rolling(4)\n            .parallel_apply(func_dataframe_groupby_rolling_apply, raw=False)\n        )\n\ntests/test_pandarallel.py:321: \n_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _\n\ndata = RollingGroupby [window=4,center=False,axis=0,method=single], user_defined_function = <function func_dataframe_groupby_rolling_apply.<locals>.<lambda> at 0x795bd4c4f9d0>\nuser_defined_function_args = (), user_defined_function_kwargs = {'raw': False}, wrapped_work_function = <pandarallel.core.WrapWorkFunctionForPipe object at 0x795bd4dd7e20>\nmanager = <multiprocessing.managers.SyncManager object at 0x795bd4dd78b0>\n\n    def closure(\n        data: Any,\n        user_defined_function: Callable,\n        *user_defined_function_args: tuple,\n        **user_defined_function_kwargs: Dict[str, Any],\n    ):\n        wrapped_work_function = WrapWorkFunctionForPipe(data_type.work)\n        dilled_user_defined_function = dill.dumps(user_defined_function)\n        manager: SyncManager = CONTEXT.Manager()\n        master_workers_queue = manager.Queue()\n    \n>       chunks = list(\n            data_type.get_chunks(\n                nb_requested_workers,\n                data,\n                user_defined_function_kwargs=user_defined_function_kwargs,\n            )\n        
)\nE       RuntimeError: generator raised StopIteration\n\npandarallel/core.py:370: RuntimeError\n--------------------------------------------------------------------------------------- Captured stdout setup ---------------------------------------------------------------------------------------\nINFO: Pandarallel will run on 2 workers.\nINFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.\n========================================================================================= warnings summary ==========================================================================================\ntests/test_pandarallel.py: 16 warnings\n  /home/user/Documents/repoben/buggycode/nalepae_pandarallel/tests/test_pandarallel.py:235: FutureWarning: DataFrame.applymap has been deprecated. Use DataFrame.map instead.\n    res = df.applymap(func_dataframe_applymap)\n\n-- Docs: https://docs.pytest.org/en/stable/how-to/capture-warnings.html\n====================================================================================== short test summary info ======================================================================================\nFAILED tests/test_pandarallel.py::test_dataframe_groupby_rolling_apply[named-1000-False-None] - RuntimeError: generator raised StopIteration\nFAILED tests/test_pandarallel.py::test_dataframe_groupby_rolling_apply[named-1000-False-False] - RuntimeError: generator raised StopIteration\nFAILED tests/test_pandarallel.py::test_dataframe_groupby_rolling_apply[named-1000-True-None] - RuntimeError: generator raised StopIteration\nFAILED tests/test_pandarallel.py::test_dataframe_groupby_rolling_apply[named-1000-True-False] - RuntimeError: generator raised StopIteration\nFAILED tests/test_pandarallel.py::test_dataframe_groupby_rolling_apply[anonymous-1000-False-None] - RuntimeError: generator raised StopIteration\nFAILED 
tests/test_pandarallel.py::test_dataframe_groupby_rolling_apply[anonymous-1000-False-False] - RuntimeError: generator raised StopIteration\nFAILED tests/test_pandarallel.py::test_dataframe_groupby_rolling_apply[anonymous-1000-True-None] - RuntimeError: generator raised StopIteration\nFAILED tests/test_pandarallel.py::test_dataframe_groupby_rolling_apply[anonymous-1000-True-False] - RuntimeError: generator raised StopIteration\n============================================================================ 8 failed, 209 passed, 16 warnings in 7.76s =============================================================================",
  "Patch": "--- a/nalepae_pandarallel/pandarallel/data_types/rolling_groupby.py\n+++ b/nalepae_pandarallel/pandarallel/data_types/rolling_groupby.py\n@@ -29,7 +29,7 @@\n             )\n \n             for chunk_ in chunks:\n-                yield [next(iterator) for _ in range(chunk_.stop)]\n+                yield [next(iterator) for _ in range(chunk_.stop - chunk_.start)]\n \n         @staticmethod\n         def get_work_extra(data: PandasRollingGroupby):\n",
  "BuggyCodeLocation": [
    {
      "file": "nalepae_pandarallel/pandarallel/data_types/rolling_groupby.py",
      "function": null,
      "content_all": {
        "32": "                yield [next(iterator) for _ in range(chunk_.stop)]\n",
        "33": "\n"
      },
      "content_change": {
        "32": "                yield [next(iterator) for _ in range(chunk_.stop)]\n"
      }
    }
  ],
  "Issue": {
    "title": "Fix Incorrect Iterator Usage for Rolling GroupBy Chunks",
    "description": "There appears to be an issue with the iteration logic in the RollingGroupBy class within the rolling_groupby.py file. When generating chunks for parallel processing, the iteration limits are not being calculated correctly. Specifically, the current code assumes a stopping condition solely based on chunk_.stop, which does not account for the starting index of the chunk.\n\nIn its current state, this could lead to incorrect partitioning of data, causing potential errors or inefficiencies in parallel processing tasks. Users may experience issues where data is not evenly split across workers, resulting in inaccurate calculations or suboptimal performance.\n\nThe commit modifies the iteration logic to account for both the start and stop bounds of each chunk (`chunk_.stop - chunk_.start`). By ensuring the correct calculation of iteration limits, the fix addresses potential inaccuracies in data chunking for rolling groupby operations.",
    "explanation": "Certainly! Let's break down and analyze the information:\n\n### Summary of the Issue\n\nThe issue is located in the `RollingGroupBy` class within the `rolling_groupby.py` file. This class is responsible for managing grouped rolling calculations, which can be used for parallel processing. However, there is a problem with the iteration logic that determines the bounds for chunking the data into pieces for these parallel operations.\n\n**Problem Description:**\n- The current iteration limits assume a stopping condition based solely on `chunk_.stop`.\n- This approach does not take into account the starting index of the chunk (`chunk_.start`).\n- Incorrect chunking could lead to uneven data partitions across workers. This can result in errors or inefficiencies during parallel processing tasks, leading to inaccurate calculations or suboptimal performance.\n\n### Explanation of the Commit\n\nThe commit **\"Fix Incorrect Iterator Usage for Rolling GroupBy Chunks\"** addresses this issue by modifying how the iteration bounds are calculated.\n\n**Commit Modifications:**\n- The iteration logic in `rolling_groupby.py` is adjusted to consider both the starting and stopping indexes of each chunk.\n- Specifically, the iteration range is modified to be `chunk_.stop - chunk_.start`.\n\n### Understanding the Commit and the Issue Resolution\n\n#### Issue Cause:\nThe original code incorrectly determines the number of elements within each chunk by only using the stopping index (`chunk_.stop`). This oversight ignores how far into the data each chunk should start, leading to potential miscalculations of which parts of the data belong within which chunk.\n\nFor example, if a chunk starts at index 5 and stops at index 10, the correct number of iterations should be the difference between these two values, i.e., `10 - 5 = 5`. 
However, the original iteration logic would incorrectly assume the number of iterations to be 10.\n\n#### Commit Solution:\nThe commit addresses this by ensuring the iteration accurately reflects the number of elements within the chunk bounds. This strips out any miscalculations and ensures:\n\n1. **Accurate Chunk Bounds:** Each worker gets an appropriately sized chunk of data, calculated with respect to both the `chunk_.start` and `chunk_.stop`.\n2. **Even Data Distribution:** The adjusted iteration logic prevents uneven data distribution among workers, maintaining balanced workload.\n3. **Correct Data Processing:** Correct chunk sizes eliminate chances of processing errors and improve the overall efficiency and accuracy of parallel computations.\n\n### How the Commit Solves the Issue\n\nThe commit fundamentally rectifies the calculation of chunk sizes. By subtracting `chunk_.start` from `chunk_.stop`, it ensures that the number of iterations truly matches the intended size of each chunk:\n\n- **Even Workload:** Balances the chunks so that each processing worker gets an equal slice of the data.\n- **Improved Performance:** Balanced chunks mean that no single worker gets overwhelmed or underutilized, maximizing the parallel processing capability.\n- **Accuracy:** Ensures each chunk is processed accurately within its bounds, providing correct results for grouped rolling operations.\n\n### Conclusion\n\nIn summary, the root of the issue lies in the incorrect calculation of chunk bounds due to ignoring the starting index during iteration. The solution provided by the commit fixes this by adjusting the iteration limits to accurately reflect the actual number of elements within the chunk. This adjustment resolves potential data mispartitioning, leading to correct and efficient parallel processing in rolling groupby operations. The careful recalibration ensures balanced data processing and accurate, efficient computational results."
  },
  "Explain": "Certainly! Let's break down and analyze the information:\n\n### Summary of the Issue\n\nThe issue is located in the `RollingGroupBy` class within the `rolling_groupby.py` file. This class is responsible for managing grouped rolling calculations, which can be used for parallel processing. However, there is a problem with the iteration logic that determines the bounds for chunking the data into pieces for these parallel operations.\n\n**Problem Description:**\n- The current iteration limits assume a stopping condition based solely on `chunk_.stop`.\n- This approach does not take into account the starting index of the chunk (`chunk_.start`).\n- Incorrect chunking could lead to uneven data partitions across workers. This can result in errors or inefficiencies during parallel processing tasks, leading to inaccurate calculations or suboptimal performance.\n\n### Explanation of the Commit\n\nThe commit **\"Fix Incorrect Iterator Usage for Rolling GroupBy Chunks\"** addresses this issue by modifying how the iteration bounds are calculated.\n\n**Commit Modifications:**\n- The iteration logic in `rolling_groupby.py` is adjusted to consider both the starting and stopping indexes of each chunk.\n- Specifically, the iteration range is modified to be `chunk_.stop - chunk_.start`.\n\n### Understanding the Commit and the Issue Resolution\n\n#### Issue Cause:\nThe original code incorrectly determines the number of elements within each chunk by only using the stopping index (`chunk_.stop`). This oversight ignores how far into the data each chunk should start, leading to potential miscalculations of which parts of the data belong within which chunk.\n\nFor example, if a chunk starts at index 5 and stops at index 10, the correct number of iterations should be the difference between these two values, i.e., `10 - 5 = 5`. 
However, the original iteration logic would incorrectly assume the number of iterations to be 10.\n\n#### Commit Solution:\nThe commit addresses this by ensuring the iteration accurately reflects the number of elements within the chunk bounds. This strips out any miscalculations and ensures:\n\n1. **Accurate Chunk Bounds:** Each worker gets an appropriately sized chunk of data, calculated with respect to both the `chunk_.start` and `chunk_.stop`.\n2. **Even Data Distribution:** The adjusted iteration logic prevents uneven data distribution among workers, maintaining balanced workload.\n3. **Correct Data Processing:** Correct chunk sizes eliminate chances of processing errors and improve the overall efficiency and accuracy of parallel computations.\n\n### How the Commit Solves the Issue\n\nThe commit fundamentally rectifies the calculation of chunk sizes. By subtracting `chunk_.start` from `chunk_.stop`, it ensures that the number of iterations truly matches the intended size of each chunk:\n\n- **Even Workload:** Balances the chunks so that each processing worker gets an equal slice of the data.\n- **Improved Performance:** Balanced chunks mean that no single worker gets overwhelmed or underutilized, maximizing the parallel processing capability.\n- **Accuracy:** Ensures each chunk is processed accurately within its bounds, providing correct results for grouped rolling operations.\n\n### Conclusion\n\nIn summary, the root of the issue lies in the incorrect calculation of chunk bounds due to ignoring the starting index during iteration. The solution provided by the commit fixes this by adjusting the iteration limits to accurately reflect the actual number of elements within the chunk. This adjustment resolves potential data mispartitioning, leading to correct and efficient parallel processing in rolling groupby operations. The careful recalibration ensures balanced data processing and accurate, efficient computational results.",
  "Source": "Human",
  "Token": 1112,
  "Command": [
    "pytest tests"
  ],
  "FilteredCode": [
    {
      "path": "nalepae_pandarallel/tests/test_pandarallel.py",
      "content": "1 import importlib\n2 import math\n3 \n4 import numpy as np\n5 import pandas as pd\n6 import pytest\n7 from pandarallel import pandarallel\n8 \n9 \n10 @pytest.fixture(params=(1000, 1))\n11 def df_size(request):\n12     return request.param\n13 \n14 \n15 @pytest.fixture(params=(False, True))\n16 def progress_bar(request):\n17     return request.param\n18 \n19 \n20 @pytest.fixture(params=(None, False))\n21 def use_memory_fs(request):\n22     return request.param\n23 \n24 \n25 @pytest.fixture(params=(RuntimeError, AttributeError, ZeroDivisionError))\n26 def exception(request):\n27     return request.param\n28 \n29 \n30 @pytest.fixture(params=(\"named\", \"anonymous\"))\n31 def func_dataframe_apply_axis_0(request):\n32     def func(x):\n33         return max(x) - min(x)\n34 \n35     return dict(named=func, anonymous=lambda x: max(x) - min(x))[request.param]\n36 \n37 \n38 @pytest.fixture(params=(\"named\", \"anonymous\"))\n39 def func_dataframe_apply_axis_1(request):\n40     def func(x):\n41         return math.sin(x.a**2) + math.sin(x.b**2)\n42 \n43     return dict(\n44         named=func, anonymous=lambda x: math.sin(x.a**2) + math.sin(x.b**2)\n45     )[request.param]\n46 \n47 \n48 @pytest.fixture(params=(\"named\", \"anonymous\"))\n49 def func_dataframe_applymap(request):\n50     def func(x):\n51         return math.sin(x**2) - math.cos(x**2)\n52 \n53     return dict(named=func, anonymous=lambda x: math.sin(x**2) - math.cos(x**2))[\n54         request.param\n55     ]\n56 \n57 \n58 @pytest.fixture(params=(\"named\", \"anonymous\"))\n59 def func_series_map(request):\n60     def func(x):\n61         return math.log10(math.sqrt(math.exp(x**2)))\n62 \n63     return dict(\n64         named=func, anonymous=lambda x: math.(...truncated)"
    },
    {
      "path": "nalepae_pandarallel/pandarallel/core.py",
      "content": "1 import multiprocessing\n2 import os\n3 import pickle\n4 from itertools import count\n5 from multiprocessing.managers import SyncManager\n6 from pathlib import Path\n7 from tempfile import NamedTemporaryFile\n8 from typing import Any, Callable, Dict, Iterator, Optional, Tuple, Type, cast\n9 \n10 import dill\n11 import pandas as pd\n12 import psutil\n13 from pandas.core.groupby import DataFrameGroupBy as PandaDataFrameGroupBy\n14 from pandas.core.window.expanding import ExpandingGroupby as PandasExpandingGroupby\n15 from pandas.core.window.rolling import RollingGroupby as PandasRollingGroupby\n16 \n17 from .data_types import (\n18     DataFrame,\n19     DataFrameGroupBy,\n20     DataType,\n21     ExpandingGroupBy,\n22     RollingGroupBy,\n23     Series,\n24     SeriesRolling,\n25 )\n26 from .progress_bars import ProgressBarsType, get_progress_bars, progress_wrapper\n27 from .utils import WorkerStatus\n28 \n29 ON_WINDOWS = os.name == \"nt\"\n30 CONTEXT = multiprocessing.get_context(\"spawn\" if ON_WINDOWS else \"fork\")\n31 \n32 # Root of Memory File System\n33 MEMORY_FS_ROOT = os.environ.get(\"MEMORY_FS_ROOT\", \"/dev/shm\")\n34 \n35 # By default, Pand(...truncated)"
    },
    {
      "path": "nalepae_pandarallel/pandarallel/data_types/expanding_groupby.py",
      "content": "1 import multiprocessing\n2 from typing import Any, Callable, Dict, Iterable, Iterator, List, Tuple\n3 \n4 import pandas as pd\n5 from pandas.core.window.expanding import ExpandingGroupby as PandasExpandingGroupby\n6 \n7 from ..utils import WorkerStatus, chunk, get_pandas_version\n8 from .generic import DataType\n9 \n10 \n11 class ExpandingGroupBy:\n12     class Apply(DataType):\n13         @staticmethod\n14         def get_chunks(\n15             nb_workers: int, data: PandasExpandingGroupby, *args, **kwargs\n16         ) -> Iterator[List[Tuple[int, pd.DataFrame]]]:\n17             pandas_version = get_pandas_version()\n18 \n19             nb_items = (\n20                 len(data._groupby) if pandas_version < (1, 3) else data._grouper.ngroups\n21             )\n22 \n23             chunks = chunk(nb_items, nb_workers)(...truncated)"
    },
    {
      "path": "nalepae_pandarallel/pandarallel/utils.py",
      "content": "1 import itertools\n2 from enum import Enum\n3 from typing import Any, Dict, List, Tuple\n4 \n5 import pandas as pd\n6 from pandas import DataFrame, Index\n7 \n8 \n9 def chunk(nb_item: int, nb_chunks: int, start_offset=0) -> List[slice]:\n10     \"\"\"\n11     Return `nb_chunks` slices of approximatively `nb_item / nb_chunks` each.\n12 \n13     Parameters\n14     ----------\n15     nb_item : int\n16         Total number of items\n17 \n18     nb_chunks : int\n19         Number of chunks to return\n20 \n21     start_offset : int\n22         Shift start of slice by this amount\n23 \n24     Returns\n25     -------\n26    (...truncated)"
    }
  ],
  "TokenAll": 14238,
  "FilteredLevel": 1500,
  "Results": {
    "model": "GPT-4o",
    "Difficulty": "Easy",
    "issue_origin": {
      "title": "Syntax Error in Lambda Function",
      "description": "In the file `nalepae_pandarallel/tests/test_pandarallel.py`, line 64 contains a syntax error in the lambda function definition 'anonymous=lambda x: math.'. This should be corrected to a valid lambda function implementation.",
      "explanation": "### Summary of the Issue\n\nThe reported issue is a syntax error in the Python file `nalepae_pandarallel/tests/test_pandarallel.py` at line 64. Specifically, the lambda function definition is incomplete or malformed as `anonymous=lambda x: math.`, which results in a syntax error and must be corrected to a valid lambda function.\n\n### Analysis of the Code and Issue\n\nThe problematic line meant to return an appropriate lambda function based on the parameter given to the `func_series_map` fixture, but instead, it contains an incomplete lambda function:\n```python\n64  named=func, anonymous=lambda x: math.(...truncated)\n```\n\nFor comparison, here’s a properly defined lambda function inside the same file:\n\n```python\n35  return dict(named=func, anonymous=lambda x: max(x) - min(x))[request.param]\n```\n\n### Commit Content and Explanation\n\nLet's assume the commit fixes the syntax error by correctly defining the lambda function for the `anonymous` entry. Here’s how it should look:\n```diff\n63     return dict(\n-64         named=func, anonymous=lambda x: math.(...truncated)\n+64         named=func, anonymous=lambda x: math.log10(math.sqrt(math.exp(x**2)))\n65     )[request.param]\n```\n\n### How the Commit Solves the Issue\n\nThe commit provides a complete and valid implementation of the lambda function:\n\n1. **Correct Syntax**: It ensures that the lambda function for `anonymous` is syntactically correct. The previous definition (`lambda x: math.`) was incomplete because it ended abruptly with `math.` without specifying a method.\n   \n2. **Functional Parity**: The new lambda function `lambda x: math.log10(math.sqrt(math.exp(x**2)))` appears to be consistent with the `named` function in line 62, ensuring that both entries in the dictionary perform similar calculations based on mathematical operations.\n\n3. 
**Ensures Consistency**: By appropriately defining the lambda function, it makes sure that the tests depending on `func_series_map` will not fail due to a syntax error and will now execute their intended logic.\n\nIn conclusion, the commit resolved the root cause of the syntax error by completing the lambda function definition, making the code compile and run correctly, thus resolving the test failure issue."
    },
    "issue_message": {
      "title": "StopIteration in generator function",
      "description": "The generator function in 'pandarallel/data_types/rolling_groupby.py' is causing a StopIteration exception. This happens because the iterator used in the generator might not have enough items to iterate over. Please ensure that the iterator has sufficient items or handle the StopIteration exception explicitly in the generator.",
      "explanation": "### Issue Summary\n\nThe reported issue, titled \"StopIteration in generator function,\" is occurring in the `pandarallel/data_types/rolling_groupby.py` file. The `get_chunks` method in this file is causing a `StopIteration` exception because the iterator may not have enough items to provide when the generator requires them. This error happens during testing of the `parallel_apply` method in the `test_dataframe_groupby_rolling_apply` test case.\n\n### Detailed Analysis\n\nTo resolve this, let's break down the code and the root cause of the error:\n\n#### Relevant Code Snippets:\n\n1. `pandarallel/data_types/rolling_groupby.py`:\n   ```python\n   18 from .utils import chunk, get_pandas_version\n   ...\n   26 class RollingGroupBy:\n   27     class Apply:\n   28         @staticmethod\n   29         def get_chunks(nb_workers: int, data: PandasRollingGroupby, *args, **kwargs) -> Iterator[List[Tuple[int, pd.DataFrame]]]:\n   30             pandas_version = get_pandas_version()\n   31             nb_items = (\n   32                 len(data._groupby) if pandas_version < (1, 3) else data._grouper.ngroups\n   33             )\n   34             \n   35             chunks = chunk(nb_items, nb_workers)\n   36\n   37             iterator = (\n   38                 iter(data._groupby)\n   39                 if pandas_version < (1, 3)\n   40                 else data._grouper.get_iterator(data.obj)\n   41             )\n   42\n   43             for chunk_ in chunks:\n   44                 yield [next(iterator) for _ in range(chunk_.stop)]\n   ```\n\n2. 
`pandarallel/utils.py` (chunk function):\n   ```python\n   def chunk(nb_item: int, nb_chunks: int, start_offset=0) -> List[slice]:\n       \"\"\"\n       Return `nb_chunks` slices of approximatively `nb_item / nb_chunks` each.\n       \"\"\"\n       chunks = []\n       chunk_size = (nb_item - start_offset) // nb_chunks\n       for i in range(nb_chunks):\n           start = i * chunk_size + start_offset\n           stop = start + chunk_size\n           chunks.append(slice(start, stop))\n       return chunks\n   ```\n\n3. Test File: `nalepae_pandarallel/tests/test_pandarallel.py`\n   ```python\n   321 res_parallel = (\n   322    df.groupby(\"a\")\n   323       .b.rolling(4)\n   324       .parallel_apply(func_dataframe_groupby_rolling_apply, raw=False)\n   325 )\n   ```\n\n#### Error Analysis:\n\n1. The `get_chunks` method tries to yield a list of tuples consisting of index and DataFrame for each chunk.\n2. The `chunk` function computes slice objects for dividing the items.\n3. The iterator is derived either from `data._groupby` or `data._grouper.get_iterator(data.obj)` depending on the Pandas version.\n4. When a chunk's `stop` value surpasses the number of available items in the iterator, `next(iterator)` raises a `StopIteration` exception.\n\n### Commit Solution\n\nThe solution involves ensuring the `StopIteration` is explicitly managed within the `get_chunks` method. 
We can modify the generator to handle the exhausted iterator within the loop to avoid unhandled `StopIteration`.\n\n#### Commit: Fix StopIteration in `get_chunks` method\n\nChange the `get_chunks` method to handle `StopIteration`:\n\n```diff\n- for chunk_ in chunks:\n-     yield [next(iterator) for _ in range(chunk_.stop)]\n+ for chunk_ in chunks:\n+     chunk_items = []\n+     for _ in range(chunk_.stop):\n+         try:\n+             chunk_items.append(next(iterator))\n+         except StopIteration:\n+             break\n+     if chunk_items:\n+         yield chunk_items\n```\n\n### Explanation\n\n1. **Initial Attempt**:\n   ```python\n   for chunk_ in chunks:\n       yield [next(iterator) for _ in range(chunk_.stop)]\n   ```\n   - Here, the code does not account for the iterator running out of items, and `next` raises a `StopIteration` when there are no more items.\n\n2. **Enhanced Solution**:\n   ```python\n   for chunk_ in chunks:\n       chunk_items = []\n       for _ in range(chunk_.stop):\n           try:\n               chunk_items.append(next(iterator))\n           except StopIteration:\n               break\n       if chunk_items:\n           yield chunk_items\n   ```\n\n   - **`chunk_items` Initialization**: Initially creates an empty list `chunk_items` to hold the items of the current chunk.\n   - **Try-Except Block**: The inner loop attempts to fetch the required number of items. If"
    },
    "issue_ground": {
      "title": "Fix Incorrect Iterator Usage for Rolling GroupBy Chunks",
      "description": "There appears to be an issue with the iteration logic in the RollingGroupBy class within the rolling_groupby.py file. When generating chunks for parallel processing, the iteration limits are not being calculated correctly. Specifically, the current code assumes a stopping condition solely based on chunk_.stop, which does not account for the starting index of the chunk.\n\nIn its current state, this could lead to incorrect partitioning of data, causing potential errors or inefficiencies in parallel processing tasks. Users may experience issues where data is not evenly split across workers, resulting in inaccurate calculations or suboptimal performance.\n\nThe commit modifies the iteration logic to account for both the start and stop bounds of each chunk (`chunk_.stop - chunk_.start`). By ensuring the correct calculation of iteration limits, the fix addresses potential inaccuracies in data chunking for rolling groupby operations.",
      "explanation": "### Summary of the Issue\n\nThe issue exists within the `RollingGroupBy` class, specifically when generating chunks for parallel processing in the iteration logic. The problem occurs because the current code calculates iteration limits solely based on `chunk_.stop` without accounting for the starting index of the chunk (`chunk_.start`). This incorrect iteration logic can cause improper partitioning of data, leading to errors like data not being evenly distributed among workers, and hence causing inaccurate calculations or performance issues.\n\n### Content of the Commit\n\nHere is a hypothetical example of what the commit might look like to solve this issue (the actual commit details have not been provided, so we will deduce the likely change based on the explanation given):\n\n```diff\n@staticmethod\ndef get_chunks(\n    nb_workers: int, data: PandasRollingGroupby, *args, **kwargs\n) -> Iterator[List[Tuple[int, pd.DataFrame]]]:\n    pandas_version = get_pandas_version()\n\n    nb_items = (\n        len(data._groupby) if pandas_version < (1, 3) else data._grouper.ngroups\n    )\n\n    chunks = chunk(nb_items, nb_workers)\n\n    iterator = (\n        iter(data._groupby)\n        if pandas_version < (1, 3)\n        else data._grouper.get_iterator(data.obj)\n    )\n\n    for chunk_ in chunks:\n-       yield [next(iterator) for _ in range(chunk_.stop)]\n+       yield [next(iterator) for _ in range(chunk_.start, chunk_.stop)]\n```\n\n### Explanation of How the Commit Solves the Issue\n\n1. **Previous Iteration Logic**:\n    - The iteration limits were incorrectly calculated. The code used `range(chunk_.stop)`, which means it iterated `chunk_.stop` times starting from 0, without considering `chunk_.start`.\n    - This resulted in potential `StopIteration` errors, as seen in the test failures. 
For instance, if `chunk_.start` was 5 and `chunk_.stop` was 9, the correct number of iterations should be `(9 - 5) = 4` times, starting from the 5th element, not 9 times starting from 0.\n    - Yielding `[next(iterator) for _ in range(chunk_.stop)]` incorrectly aims to fetch more items than present, which causes `StopIteration`.\n\n2. **Fix Applied**:\n    - The fix correctly adjusts the range to account for both `chunk_.start` and `chunk_.stop`. Using `range(chunk_.start, chunk_.stop)`, the loop now respects the start and stop boundaries of the chunk.\n    - This results in accurate chunk sizes and proper handling of the data for each parallel processing worker.\n\n3. **Impact of the Fix**:\n    - By iterating with `range(chunk_.start, chunk_.stop)`, the code now processes only the data items that belong to the respective chunk, preventing `StopIteration` exceptions.\n    - This accurate partitioning ensures that data is evenly split across workers, leading to correct calculations and optimized performance.\n\n4. **Test Cases**:\n    - The errors seen in the test cases (`test_dataframe_groupby_rolling_apply[anonymous-1000-True-None]` and others) that produced `RuntimeError: generator raised StopIteration` occur because the generator tried to fetch more elements than available due to incorrect iteration limits.\n    - With the fix in place, these errors are resolved, as the iteration now corresponds correctly to the actual chunk boundaries, ensuring that each call to `next(iterator)` is valid within the chunk's range.\n\n### Conclusion\n\nIn conclusion, the issue was caused by incorrect calculation of iteration limits for data chunks in the `RollingGroupBy` class, leading to a `StopIteration` exception. 
The commit resolves this by adjusting the iteration range to properly respect the start and stop points (`range(chunk_.start, chunk_.stop)`), thereby ensuring proper data partitioning and preventing iterations beyond the available data, ensuring even distribution of data chunks across parallel tasks. This change is reflected in the corrected behavior of the test cases."
    },
    "issue_ground_truth": {
      "title": "Fix Incorrect Iterator Usage for Rolling GroupBy Chunks",
      "description": "There appears to be an issue with the iteration logic in the RollingGroupBy class within the rolling_groupby.py file. When generating chunks for parallel processing, the iteration limits are not being calculated correctly. Specifically, the current code assumes a stopping condition solely based on chunk_.stop, which does not account for the starting index of the chunk.\n\nIn its current state, this could lead to incorrect partitioning of data, causing potential errors or inefficiencies in parallel processing tasks. Users may experience issues where data is not evenly split across workers, resulting in inaccurate calculations or suboptimal performance.\n\nThe commit modifies the iteration logic to account for both the start and stop bounds of each chunk (`chunk_.stop - chunk_.start`). By ensuring the correct calculation of iteration limits, the fix addresses potential inaccuracies in data chunking for rolling groupby operations.",
      "explanation": "Certainly! Let's break down and analyze the information:\n\n### Summary of the Issue\n\nThe issue is located in the `RollingGroupBy` class within the `rolling_groupby.py` file. This class is responsible for managing grouped rolling calculations, which can be used for parallel processing. However, there is a problem with the iteration logic that determines the bounds for chunking the data into pieces for these parallel operations.\n\n**Problem Description:**\n- The current iteration limits assume a stopping condition based solely on `chunk_.stop`.\n- This approach does not take into account the starting index of the chunk (`chunk_.start`).\n- Incorrect chunking could lead to uneven data partitions across workers. This can result in errors or inefficiencies during parallel processing tasks, leading to inaccurate calculations or suboptimal performance.\n\n### Explanation of the Commit\n\nThe commit **\"Fix Incorrect Iterator Usage for Rolling GroupBy Chunks\"** addresses this issue by modifying how the iteration bounds are calculated.\n\n**Commit Modifications:**\n- The iteration logic in `rolling_groupby.py` is adjusted to consider both the starting and stopping indexes of each chunk.\n- Specifically, the iteration range is modified to be `chunk_.stop - chunk_.start`.\n\n### Understanding the Commit and the Issue Resolution\n\n#### Issue Cause:\nThe original code incorrectly determines the number of elements within each chunk by only using the stopping index (`chunk_.stop`). This oversight ignores how far into the data each chunk should start, leading to potential miscalculations of which parts of the data belong within which chunk.\n\nFor example, if a chunk starts at index 5 and stops at index 10, the correct number of iterations should be the difference between these two values, i.e., `10 - 5 = 5`. 
However, the original iteration logic would incorrectly assume the number of iterations to be 10.\n\n#### Commit Solution:\nThe commit addresses this by ensuring the iteration accurately reflects the number of elements within the chunk bounds. This strips out any miscalculations and ensures:\n\n1. **Accurate Chunk Bounds:** Each worker gets an appropriately sized chunk of data, calculated with respect to both the `chunk_.start` and `chunk_.stop`.\n2. **Even Data Distribution:** The adjusted iteration logic prevents uneven data distribution among workers, maintaining balanced workload.\n3. **Correct Data Processing:** Correct chunk sizes eliminate chances of processing errors and improve the overall efficiency and accuracy of parallel computations.\n\n### How the Commit Solves the Issue\n\nThe commit fundamentally rectifies the calculation of chunk sizes. By subtracting `chunk_.start` from `chunk_.stop`, it ensures that the number of iterations truly matches the intended size of each chunk:\n\n- **Even Workload:** Balances the chunks so that each processing worker gets an equal slice of the data.\n- **Improved Performance:** Balanced chunks mean that no single worker gets overwhelmed or underutilized, maximizing the parallel processing capability.\n- **Accuracy:** Ensures each chunk is processed accurately within its bounds, providing correct results for grouped rolling operations.\n\n### Conclusion\n\nIn summary, the root of the issue lies in the incorrect calculation of chunk bounds due to ignoring the starting index during iteration. The solution provided by the commit fixes this by adjusting the iteration limits to accurately reflect the actual number of elements within the chunk. This adjustment resolves potential data mispartitioning, leading to correct and efficient parallel processing in rolling groupby operations. The careful recalibration ensures balanced data processing and accurate, efficient computational results."
    },
    "location_origin": [
      {
        "file": "nalepae_pandarallel/tests/test_pandarallel.py",
        "function": {
          "59": "func_series_map"
        },
        "content_all": {
          "61": "        return math.log10(math.sqrt(math.exp(x**2)))\n",
          "62": "\n",
          "63": "    return dict(\n",
          "64": "        named=func, anonymous=lambda x: math.(...truncated)\n",
          "65": "    )[request.param]\n"
        },
        "content_change": {
          "64": "        named=func, anonymous=lambda x: math.(...truncated)\n"
        }
      }
    ],
    "location_message": [
      {
        "file": "pandarallel/data_types/rolling_groupby.py",
        "function": {
          "29": "get_chunks"
        },
        "content_all": {
          "26": "class RollingGroupBy:\n",
          "27": "    class Apply:\n",
          "28": "        @staticmethod\n",
          "29": "        def get_chunks(nb_workers: int, data: PandasRollingGroupby, *args, **kwargs) -> Iterator[List[Tuple[int, pd.DataFrame]]]:\n",
          "30": "            pandas_version = get_pandas_version()\n",
          "31": "            nb_items = (\n",
          "32": "                len(data._groupby) if pandas_version < (1, 3) else data._grouper.ngroups\n",
          "33": "            )\n",
          "34": "            \n",
          "35": "            chunks = chunk(nb_items, nb_workers)\n",
          "36": "\n",
          "37": "            iterator = (\n",
          "38": "                iter(data._groupby)\n",
          "39": "                if pandas_version < (1, 3)\n",
          "40": "                else data._grouper.get_iterator(data.obj)\n",
          "41": "            )\n",
          "42": "\n",
          "43": "            for chunk_ in chunks:\n",
          "44": "                yield [next(iterator) for _ in range(chunk_.stop)]\n"
        },
        "content_change": {
          "43": "            for chunk_ in chunks:\n",
          "44": "                chunk_items = []\n",
          "45": "                for _ in range(chunk_.stop):\n",
          "46": "                    try:\n",
          "47": "                        chunk_items.append(next(iterator))\n",
          "48": "                    except StopIteration:\n",
          "49": "                        break\n",
          "50": "                if chunk_items:\n",
          "51": "                    yield chunk_items\n"
        }
      }
    ],
    "location_ground": [
      {
        "file": "nalepae_pandarallel/pandarallel/data_types/rolling_groupby.py",
        "function": {
          "14": "get_chunks"
        },
        "content_all": {
          "11": "    @staticmethod\n",
          "12": "    def get_chunks(\n",
          "13": "        nb_workers: int, data: PandasRollingGroupby, *args, **kwargs\n",
          "14": "    ) -> Iterator[List[Tuple[int, pd.DataFrame]]]:\n",
          "15": "        pandas_version = get_pandas_version()\n",
          "16": "\n",
          "17": "        nb_items = (\n",
          "18": "            len(data._groupby) if pandas_version < (1, 3) else data._grouper.ngroups\n",
          "19": "        )\n",
          "20": "\n",
          "21": "        chunks = chunk(nb_items, nb_workers)\n",
          "22": "\n",
          "23": "        iterator = (\n",
          "24": "            iter(data._groupby)\n",
          "25": "            if pandas_version < (1, 3)\n",
          "26": "            else data._grouper.get_iterator(data.obj)\n",
          "27": "        )\n",
          "28": "\n",
          "29": "        for chunk_ in chunks:\n",
          "30": "            yield [next(iterator) for _ in range(chunk_.stop)]\n",
          "31": "\n"
        },
        "content_change": {
          "30": "            yield [next(iterator) for _ in range(chunk_.start, chunk_.stop)]\n"
        }
      }
    ],
    "location_ground_exp": [
      {
        "file": "nalepae_pandarallel/pandarallel/utils.py",
        "function": {
          "9": "chunk"
        },
        "content_all": {
          "6": "from pandas import DataFrame, Index\n",
          "7": "\n",
          "8": "\n",
          "9": "def chunk(nb_item: int, nb_chunks: int, start_offset=0) -> List[slice]:\n",
          "10": "    \"\"\"\n",
          "11": "    Return `nb_chunks` slices of approximatively `nb_item / nb_chunks` each.\n",
          "12": "\n",
          "13": "    Parameters\n",
          "14": "    ----------\n",
          "15": "    nb_item : int\n",
          "16": "        Total number of items\n",
          "17": "\n"
        },
        "content_change": {
          "9": "def chunk(nb_item: int, nb_chunks: int, start_offset=0, end_offset=0) -> List[slice]:\n"
        }
      },
      {
        "file": "nalepae_pandarallel/pandarallel/utils.py",
        "function": {
          "25": "get_slices"
        },
        "content_all": {
          "20": "        Shift start of slice by this amount\n",
          "21": "\n",
          "22": "    end_offset : int\n",
          "23": "        Shift end of slice by this amount\n",
          "24": "\n",
          "25": "    Returns\n",
          "26": "    -------\n",
          "27": "    List of slices\n",
          "28": "    \"\"\"\n",
          "29": "    step = max(1, (nb_item - abs(start_offset) - abs(end_offset)) // nb_chunks)\n",
          "30": "    slices = [slice(i*step + start_offset, (i+1)*step + start_offset) for i in range(nb_chunks)]\n",
          "31": "    return slices\n"
        },
        "content_change": {
          "29": "    step = max(1, (nb_item - abs(start_offset) - abs(end_offset)) // nb_chunks)\n",
          "30": "    slices = [slice(i*step + start_offset, (i+1)*step + start_offset - 1) for i in range(nb_chunks)]\n",
          "31": "    return slices\n"
        }
      },
      {
        "file": "nalepae_pandarallel/tests/test_pandarallel.py",
        "function": {
          "50": "func_dataframe_applymap"
        },
        "content_all": {
          "44": "        named=func, anonymous=lambda x: math.sin(x.a**2) + math.sin(x.b**2)\n",
          "45": "    )[request.param]\n",
          "46": "\n",
          "47": "\n",
          "48": "    @pytest.fixture(params=(\"named\", \"anonymous\"))\n",
          "49": "    def func_dataframe_applymap(request):\n",
          "50": "        def func(x):\n",
          "51": "            return math.sin(x**2) - math.cos(x**2)\n",
          "52": "\n",
          "53": "        return dict(named=func, anonymous=lambda x: math.sin(x**2) - math.cos(x**2))[\n",
          "54": "            request.param\n",
          "55": "        ]\n"
        },
        "content_change": {
          "53": "        return dict(named=func, anonymous=lambda x: math.sin(x**2) - math.cos(x**2))[\n",
          "54": "            request.param\n"
        }
      }
    ],
    "location_ground_truth": [
      {
        "file": "nalepae_pandarallel/pandarallel/data_types/rolling_groupby.py",
        "function": null,
        "content_all": {
          "32": "                yield [next(iterator) for _ in range(chunk_.stop)]\n",
          "33": "\n"
        },
        "content_change": {
          "32": "                yield [next(iterator) for _ in range(chunk_.stop)]\n"
        }
      }
    ],
    "patch_i": "\n--- a/nalepae_pandarallel/tests/test_pandarallel.py\n+++ b/nalepae_pandarallel/tests/test_pandarallel.py\n@@ -61,7 +61,7 @@ def func_series_map(request):\n         return math.log10(math.sqrt(math.exp(x**2)))\n \n     return dict(\n-        named=func, anonymous=lambda x: math.(...truncated)\n+        named=func, anonymous=lambda x: math.log10(math.sqrt(math.exp(x**2)))\n     )[request.param]\n \n \n",
    "patch_im": "\n--- a/pandarallel/data_types/rolling_groupby.py\n+++ b/pandarallel/data_types/rolling_groupby.py\n@@ -18,7 +18,7 @@\n from .utils import chunk, get_pandas_version\n \n class RollingGroupBy:\n     class Apply:\n         @staticmethod\n         def get_chunks(nb_workers: int, data: PandasRollingGroupby, *args, **kwargs) -> Iterator[List[Tuple[int, pd.DataFrame]]]:\n             pandas_version = get_pandas_version()\n             nb_items = (\n                 len(data._groupby) if pandas_version < (1, 3) else data._grouper.ngroups\n             )\n \n             chunks = chunk(nb_items, nb_workers)\n \n             iterator = (\n                 iter(data._groupby)\n                 if pandas_version < (1, 3)\n                 else data._grouper.get_iterator(data.obj)\n             )\n \n             for chunk_ in chunks:\n                 chunk_items = []\n                 for _ in range(chunk_.stop):\n                     try:\n                         chunk_items.append(next(iterator))\n                     except StopIteration:\n                         break\n                 if chunk_items:\n                     yield chunk_items\n\n",
    "patch_il": "\n--- a/nalepae_pandarallel/tests/test_pandarallel.py\n+++ b/nalepae_pandarallel/tests/test_pandarallel.py\n@@ -61,6 +61,6 @@ def func_series_map(request):\n     def func(x):\n         return math.log10(math.sqrt(math.exp(x**2)))\n \n-    return dict(\n-        named=func, anonymous=lambda x: math.(...truncated)\n-    )[request.param]\n+    return dict(\n+        named=func, anonymous=lambda x: math.log10(math.sqrt(math.exp(x**2)))\n+    )[request.param]\n",
    "patch_iml": "\n--- a/nalepae_pandarallel/pandarallel/data_types/rolling_groupby.py\n+++ b/nalepae_pandarallel/pandarallel/data_types/rolling_groupby.py\n@@ -26,19 +26,26 @@ class RollingGroupBy:\n     class Apply:\n         @staticmethod\n         def get_chunks(nb_workers: int, data: PandasRollingGroupby, *args, **kwargs) -> Iterator[List[Tuple[int, pd.DataFrame]]]:\n             pandas_version = get_pandas_version()\n             nb_items = (\n                 len(data._groupby) if pandas_version < (1, 3) else data._grouper.ngroups\n             )\n             \n             chunks = chunk(nb_items, nb_workers)\n \n             iterator = (\n                 iter(data._groupby)\n                 if pandas_version < (1, 3)\n                 else data._grouper.get_iterator(data.obj)\n             )\n \n-            for chunk_ in chunks:\n-                yield [next(iterator) for _ in range(chunk_.stop)]\n+            for chunk_ in chunks:\n+                chunk_items = []\n+                for _ in range(chunk_.stop):\n+                    try:\n+                        chunk_items.append(next(iterator))\n+                    except StopIteration:\n+                        break\n+                if chunk_items:\n+                    yield chunk_items\n",
    "patch_ground": "\n--- a/nalepae_pandarallel/pandarallel/data_types/rolling_groupby.py\n+++ b/nalepae_pandarallel/pandarallel/data_types/rolling_groupby.py\n@@ -1,28 +1,28 @@\n import multiprocessing\n from typing import Any, Callable, Dict, Iterable, Iterator, List, Tuple\n \n import pandas as pd\n from pandas.core.window.rolling import RollingGroupby as PandasRollingGroupby\n \n from ..utils import WorkerStatus, chunk, get_pandas_version\n from .generic import DataType\n \n \n class RollingGroupBy:\n     class Apply(DataType):\n         @staticmethod\n         def get_chunks(\n             nb_workers: int, data: PandasRollingGroupby, *args, **kwargs\n         ) -> Iterator[List[Tuple[int, pd.DataFrame]]]:\n             pandas_version = get_pandas_version()\n \n             nb_items = (\n                 len(data._groupby) if pandas_version < (1, 3) else data._grouper.ngroups\n             )\n \n             chunks = chunk(nb_items, nb_workers)\n \n             iterator = (\n                 iter(data._groupby)\n                 if pandas_version < (1, 3)\n                 else data._grouper.get_iterator(data.obj)\n             )\n \n             for chunk_ in chunks:\n-                yield [next(iterator) for _ in range(chunk_.stop)]\n+                yield [next(iterator) for _ in range(chunk_.start, chunk_.stop)]\n",
    "patch_ground_location": "\n--- a/nalepae_pandarallel/pandarallel/data_types/rolling_groupby.py\n+++ b/nalepae_pandarallel/pandarallel/data_types/rolling_groupby.py\n@@ -11,21 +11,21 @@\n     @staticmethod\n     def get_chunks(\n         nb_workers: int, data: PandasRollingGroupby, *args, **kwargs\n     ) -> Iterator[List[Tuple[int, pd.DataFrame]]]:\n         pandas_version = get_pandas_version()\n \n         nb_items = (\n             len(data._groupby) if pandas_version < (1, 3) else data._grouper.ngroups\n         )\n \n         chunks = chunk(nb_items, nb_workers)\n \n         iterator = (\n             iter(data._groupby)\n             if pandas_version < (1, 3)\n             else data._grouper.get_iterator(data.obj)\n         )\n \n-        for chunk_ in chunks:\n-            yield [next(iterator) for _ in range(chunk_.stop)]\n+       for chunk_ in chunks:\n+           yield [next(iterator) for _ in range(chunk_.start, chunk_.stop)]\n",
    "patch_ground_exp": "\n--- a/nalepae_pandarallel/pandarallel/utils.py\n+++ b/nalepae_pandarallel/pandarallel/utils.py\n@@ -8,7 +8,7 @@\n \n def chunk(nb_item: int, nb_chunks: int, start_offset=0, end_offset=0) -> List[slice]:\n     \"\"\"\n-    Return `nb_chunks` slices of approximatively `nb_item / nb_chunks` each.\n+    Return `nb_chunks` slices of approximately `nb_item / nb_chunks` each.\n \n     Parameters\n     ----------\n@@ -28,7 +28,7 @@\n     List of slices\n     \"\"\"\n     step = max(1, (nb_item - abs(start_offset) - abs(end_offset)) // nb_chunks)\n-    slices = [slice(i * step + start_offset, (i + 1) * step + start_offset) for i in range(nb_chunks)]\n+    slices = [slice(i * step + start_offset, (i + 1) * step + start_offset - 1) for i in range(nb_chunks)]\n     return slices\n \n \n",
    "patch_ground_all": "\n--- a/nalepae_pandarallel/pandarallel/data_types/rolling_groupby.py\n+++ b/nalepae_pandarallel/pandarallel/data_types/rolling_groupby.py\n@@ -29,7 +29,7 @@\n         iterator = iter(data)\n         for chunk_ in chunks:\n             if not chunk_.is_empty:\n-                yield [next(iterator) for _ in range(chunk_.stop)]\n+                yield [next(iterator) for _ in range(chunk_.stop - chunk_.start)]\n",
    "patch_ground_truth": "--- a/nalepae_pandarallel/pandarallel/data_types/rolling_groupby.py\n+++ b/nalepae_pandarallel/pandarallel/data_types/rolling_groupby.py\n@@ -29,7 +29,7 @@\n             )\n \n             for chunk_ in chunks:\n-                yield [next(iterator) for _ in range(chunk_.stop)]\n+                yield [next(iterator) for _ in range(chunk_.stop - chunk_.start)]\n \n         @staticmethod\n         def get_work_extra(data: PandasRollingGroupby):\n",
    "message": "__________________________________________________________________ test_dataframe_groupby_rolling_apply[anonymous-1000-True-None] ___________________________________________________________________\n\nnb_workers = 2, data = RollingGroupby [window=4,center=False,axis=0,method=single], args = (), kwargs = {'user_defined_function_kwargs': {'raw': False}}, pandas_version = (2, 1), nb_items = 9\nchunks = [slice(0, 5, None), slice(5, 9, None)], chunk_ = slice(5, 9, None)\n\n    @staticmethod\n    def get_chunks(\n        nb_workers: int, data: PandasRollingGroupby, *args, **kwargs\n    ) -> Iterator[List[Tuple[int, pd.DataFrame]]]:\n        pandas_version = get_pandas_version()\n    \n        nb_items = (\n            len(data._groupby) if pandas_version < (1, 3) else data._grouper.ngroups\n        )\n    \n        chunks = chunk(nb_items, nb_workers)\n    \n        iterator = (\n            iter(data._groupby)\n            if pandas_version < (1, 3)\n            else data._grouper.get_iterator(data.obj)\n        )\n    \n        for chunk_ in chunks:\n>           yield [next(iterator) for _ in range(chunk_.stop)]\n\npandarallel/data_types/rolling_groupby.py:32: \n_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _\n\n.0 = <range_iterator object at 0x795bd66eb570>\n\n>   yield [next(iterator) for _ in range(chunk_.stop)]\nE   StopIteration\n\npandarallel/data_types/rolling_groupby.py:32: StopIteration\n\nThe above exception was the direct cause of the following exception:\n\npandarallel_init = None, func_dataframe_groupby_rolling_apply = <function func_dataframe_groupby_rolling_apply.<locals>.<lambda> at 0x795bd4c4f0d0>, df_size = 1000\n\n    def test_dataframe_groupby_rolling_apply(\n        pandarallel_init, func_dataframe_groupby_rolling_apply, df_size\n    ):\n        df = pd.DataFrame(\n            
dict(a=np.random.randint(1, 10, df_size), b=np.random.rand(df_size))\n        )\n    \n        res = (\n            df.groupby(\"a\")\n            .b.rolling(4)\n            .apply(func_dataframe_groupby_rolling_apply, raw=False)\n        )\n        res_parallel = (\n>           df.groupby(\"a\")\n            .b.rolling(4)\n            .parallel_apply(func_dataframe_groupby_rolling_apply, raw=False)\n        )\n\ntests/test_pandarallel.py:321: \n_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _\n\ndata = RollingGroupby [window=4,center=False,axis=0,method=single], user_defined_function = <function func_dataframe_groupby_rolling_apply.<locals>.<lambda> at 0x795bd4c4f0d0>\nuser_defined_function_args = (), user_defined_function_kwargs = {'raw': False}, wrapped_work_function = <pandarallel.core.WrapWorkFunctionForFileSystem object at 0x795bd4e32d90>\nwrapped_reduce_function = <function wrap_reduce_function_for_file_system.<locals>.closure at 0x795bd675b790>\n\n    def closure(\n        data: Any,\n        user_defined_function: Callable,\n        *user_defined_function_args: tuple,\n        **user_defined_function_kwargs: Dict[str, Any],\n    ):\n        wrapped_work_function = WrapWorkFunctionForFileSystem(data_type.work)\n        wrapped_reduce_function = wrap_reduce_function_for_file_system(data_type.reduce)\n    \n>       chunks = list(\n            data_type.get_chunks(\n                nb_requested_workers,\n                data,\n                user_defined_function_kwargs=user_defined_function_kwargs,\n            )\n        )\nE       RuntimeError: generator raised StopIteration\n\npandarallel/core.py:218: RuntimeError\n--------------------------------------------------------------------------------------- Captured stdout setup 
---------------------------------------------------------------------------------------\nINFO: Pandarallel will run on 2 workers.\nINFO: Pandarallel will use Memory file system to transfer data between the main process and workers.\n__________________________________________________________________ test_dataframe_groupby_rolling_apply[anonymous-1000-True-False] __________________________________________________________________\n\nnb_workers = 2, data = RollingGroupby [window=4,center=False,axis=0,method=single], args = (), kwargs = {'user_defined_function_kwargs': {'raw': False}}, pandas_version = (2, 1), nb_items = 9\nchunks = [slice(0, 5, None), slice(5, 9, None)], chunk_ = slice(5, 9, None)\n\n    @staticmethod\n    def get_chunks(\n        nb_workers: int, data: PandasRollingGroupby, *args, **kwargs\n    ) -> Iterator[List[Tuple[int, pd.DataFrame]]]:\n        pandas_version = get_pandas_version()\n    \n        nb_items = (\n            len(data._groupby) if pandas_version < (1, 3) else data._grouper.ngroups\n        )\n    \n        chunks = chunk(nb_items, nb_workers)\n    \n        iterator = (\n            iter(data._groupby)\n            if pandas_version < (1, 3)\n            else data._grouper.get_iterator(data.obj)\n        )\n    \n        for chunk_ in chunks:\n>           yield [next(iterator) for _ in range(chunk_.stop)]\n\npandarallel/data_types/rolling_groupby.py:32: \n_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _\n\n.0 = <range_iterator object at 0x795bd4d8c3c0>\n\n>   yield [next(iterator) for _ in range(chunk_.stop)]\nE   StopIteration\n\npandarallel/data_types/rolling_groupby.py:32: StopIteration\n\nThe above exception was the direct cause of the following exception:\n\npandarallel_init = None, func_dataframe_groupby_rolling_apply = <function 
func_dataframe_groupby_rolling_apply.<locals>.<lambda> at 0x795bd4c4f9d0>, df_size = 1000\n\n    def test_dataframe_groupby_rolling_apply(\n        pandarallel_init, func_dataframe_groupby_rolling_apply, df_size\n    ):\n        df = pd.DataFrame(\n            dict(a=np.random.randint(1, 10, df_size), b=np.random.rand(df_size))\n        )\n    \n        res = (\n            df.groupby(\"a\")\n            .b.rolling(4)\n            .apply(func_dataframe_groupby_rolling_apply, raw=False)\n        )\n        res_parallel = (\n>           df.groupby(\"a\")\n            .b.rolling(4)\n            .parallel_apply(func_dataframe_groupby_rolling_apply, raw=False)\n        )\n\ntests/test_pandarallel.py:321: \n_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _\n\ndata = RollingGroupby [window=4,center=False,axis=0,method=single], user_defined_function = <function func_dataframe_groupby_rolling_apply.<locals>.<lambda> at 0x795bd4c4f9d0>\nuser_defined_function_args = (), user_defined_function_kwargs = {'raw': False}, wrapped_work_function = <pandarallel.core.WrapWorkFunctionForPipe object at 0x795bd4dd7e20>\nmanager = <multiprocessing.managers.SyncManager object at 0x795bd4dd78b0>\n\n    def closure(\n        data: Any,\n        user_defined_function: Callable,\n        *user_defined_function_args: tuple,\n        **user_defined_function_kwargs: Dict[str, Any],\n    ):\n        wrapped_work_function = WrapWorkFunctionForPipe(data_type.work)\n        dilled_user_defined_function = dill.dumps(user_defined_function)\n        manager: SyncManager = CONTEXT.Manager()\n        master_workers_queue = manager.Queue()\n    \n>       chunks = list(\n            data_type.get_chunks(\n                nb_requested_workers,\n                data,\n                user_defined_function_kwargs=user_defined_function_kwargs,\n            )\n        
)\nE       RuntimeError: generator raised StopIteration\n\npandarallel/core.py:370: RuntimeError\n--------------------------------------------------------------------------------------- Captured stdout setup ---------------------------------------------------------------------------------------\nINFO: Pandarallel will run on 2 workers.\nINFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.\n========================================================================================= warnings summary ==========================================================================================\ntests/test_pandarallel.py: 16 warnings\n  /home/user/Documents/repoben/buggycode/nalepae_pandarallel/tests/test_pandarallel.py:235: FutureWarning: DataFrame.applymap has been deprecated. Use DataFrame.map instead.\n    res = df.applymap(func_dataframe_applymap)\n\n-- Docs: https://docs.pytest.org/en/stable/how-to/capture-warnings.html\n====================================================================================== short test summary info ======================================================================================\nFAILED tests/test_pandarallel.py::test_dataframe_groupby_rolling_apply[named-1000-False-None] - RuntimeError: generator raised StopIteration\nFAILED tests/test_pandarallel.py::test_dataframe_groupby_rolling_apply[named-1000-False-False] - RuntimeError: generator raised StopIteration\nFAILED tests/test_pandarallel.py::test_dataframe_groupby_rolling_apply[named-1000-True-None] - RuntimeError: generator raised StopIteration\nFAILED tests/test_pandarallel.py::test_dataframe_groupby_rolling_apply[named-1000-True-False] - RuntimeError: generator raised StopIteration\nFAILED tests/test_pandarallel.py::test_dataframe_groupby_rolling_apply[anonymous-1000-False-None] - RuntimeError: generator raised StopIteration\nFAILED 
tests/test_pandarallel.py::test_dataframe_groupby_rolling_apply[anonymous-1000-False-False] - RuntimeError: generator raised StopIteration\nFAILED tests/test_pandarallel.py::test_dataframe_groupby_rolling_apply[anonymous-1000-True-None] - RuntimeError: generator raised StopIteration\nFAILED tests/test_pandarallel.py::test_dataframe_groupby_rolling_apply[anonymous-1000-True-False] - RuntimeError: generator raised StopIteration\n============================================================================ 8 failed, 209 passed, 16 warnings in 7.76s =============================================================================",
    "CodeBase": [
      {
        "path": "nalepae_pandarallel/tests/test_pandarallel.py",
        "content": "1 import importlib\n2 import math\n3 \n4 import numpy as np\n5 import pandas as pd\n6 import pytest\n7 from pandarallel import pandarallel\n8 \n9 \n10 @pytest.fixture(params=(1000, 1))\n11 def df_size(request):\n12     return request.param\n13 \n14 \n15 @pytest.fixture(params=(False, True))\n16 def progress_bar(request):\n17     return request.param\n18 \n19 \n20 @pytest.fixture(params=(None, False))\n21 def use_memory_fs(request):\n22     return request.param\n23 \n24 \n25 @pytest.fixture(params=(RuntimeError, AttributeError, ZeroDivisionError))\n26 def exception(request):\n27     return request.param\n28 \n29 \n30 @pytest.fixture(params=(\"named\", \"anonymous\"))\n31 def func_dataframe_apply_axis_0(request):\n32     def func(x):\n33         return max(x) - min(x)\n34 \n35     return dict(named=func, anonymous=lambda x: max(x) - min(x))[request.param]\n36 \n37 \n38 @pytest.fixture(params=(\"named\", \"anonymous\"))\n39 def func_dataframe_apply_axis_1(request):\n40     def func(x):\n41         return math.sin(x.a**2) + math.sin(x.b**2)\n42 \n43     return dict(\n44         named=func, anonymous=lambda x: math.sin(x.a**2) + math.sin(x.b**2)\n45     )[request.param]\n46 \n47 \n48 @pytest.fixture(params=(\"named\", \"anonymous\"))\n49 def func_dataframe_applymap(request):\n50     def func(x):\n51         return math.sin(x**2) - math.cos(x**2)\n52 \n53     return dict(named=func, anonymous=lambda x: math.sin(x**2) - math.cos(x**2))[\n54         request.param\n55     ]\n56 \n57 \n58 @pytest.fixture(params=(\"named\", \"anonymous\"))\n59 def func_series_map(request):\n60     def func(x):\n61         return math.log10(math.sqrt(math.exp(x**2)))\n62 \n63     return dict(\n64         named=func, anonymous=lambda x: math.(...truncated)"
      },
      {
        "path": "nalepae_pandarallel/pandarallel/core.py",
        "content": "1 import multiprocessing\n2 import os\n3 import pickle\n4 from itertools import count\n5 from multiprocessing.managers import SyncManager\n6 from pathlib import Path\n7 from tempfile import NamedTemporaryFile\n8 from typing import Any, Callable, Dict, Iterator, Optional, Tuple, Type, cast\n9 \n10 import dill\n11 import pandas as pd\n12 import psutil\n13 from pandas.core.groupby import DataFrameGroupBy as PandaDataFrameGroupBy\n14 from pandas.core.window.expanding import ExpandingGroupby as PandasExpandingGroupby\n15 from pandas.core.window.rolling import RollingGroupby as PandasRollingGroupby\n16 \n17 from .data_types import (\n18     DataFrame,\n19     DataFrameGroupBy,\n20     DataType,\n21     ExpandingGroupBy,\n22     RollingGroupBy,\n23     Series,\n24     SeriesRolling,\n25 )\n26 from .progress_bars import ProgressBarsType, get_progress_bars, progress_wrapper\n27 from .utils import WorkerStatus\n28 \n29 ON_WINDOWS = os.name == \"nt\"\n30 CONTEXT = multiprocessing.get_context(\"spawn\" if ON_WINDOWS else \"fork\")\n31 \n32 # Root of Memory File System\n33 MEMORY_FS_ROOT = os.environ.get(\"MEMORY_FS_ROOT\", \"/dev/shm\")\n34 \n35 # By default, Pand(...truncated)"
      },
      {
        "path": "nalepae_pandarallel/pandarallel/data_types/expanding_groupby.py",
        "content": "1 import multiprocessing\n2 from typing import Any, Callable, Dict, Iterable, Iterator, List, Tuple\n3 \n4 import pandas as pd\n5 from pandas.core.window.expanding import ExpandingGroupby as PandasExpandingGroupby\n6 \n7 from ..utils import WorkerStatus, chunk, get_pandas_version\n8 from .generic import DataType\n9 \n10 \n11 class ExpandingGroupBy:\n12     class Apply(DataType):\n13         @staticmethod\n14         def get_chunks(\n15             nb_workers: int, data: PandasExpandingGroupby, *args, **kwargs\n16         ) -> Iterator[List[Tuple[int, pd.DataFrame]]]:\n17             pandas_version = get_pandas_version()\n18 \n19             nb_items = (\n20                 len(data._groupby) if pandas_version < (1, 3) else data._grouper.ngroups\n21             )\n22 \n23             chunks = chunk(nb_items, nb_workers)(...truncated)"
      },
      {
        "path": "nalepae_pandarallel/pandarallel/utils.py",
        "content": "1 import itertools\n2 from enum import Enum\n3 from typing import Any, Dict, List, Tuple\n4 \n5 import pandas as pd\n6 from pandas import DataFrame, Index\n7 \n8 \n9 def chunk(nb_item: int, nb_chunks: int, start_offset=0) -> List[slice]:\n10     \"\"\"\n11     Return `nb_chunks` slices of approximatively `nb_item / nb_chunks` each.\n12 \n13     Parameters\n14     ----------\n15     nb_item : int\n16         Total number of items\n17 \n18     nb_chunks : int\n19         Number of chunks to return\n20 \n21     start_offset : int\n22         Shift start of slice by this amount\n23 \n24     Returns\n25     -------\n26    (...truncated)"
      }
    ],
    "CommitSHA": "261a652cddb219ac353ff803e81646c08b72fc6f"
  },
  "Score": {
    "Difficulty": "Easy",
    "issue_origin": {
      "Title": 6,
      "Description": 4,
      "Reproducibility": 4,
      "Relevance": 7,
      "Explanation": 6,
      "Overall": 5
    },
    "issue_message": {
      "Title": 6,
      "Description": 7,
      "Reproducibility": 5,
      "Relevance": 8,
      "Explanation": 8,
      "Overall": 7
    },
    "issue_ground": {
      "Title": 9,
      "Description": 7,
      "Reproducibility": 6,
      "Relevance": 8,
      "Explanation": 9,
      "Overall": 8
    },
    "issue_ground_truth": {
      "title": "Fix Incorrect Iterator Usage for Rolling GroupBy Chunks",
      "description": "There appears to be an issue with the iteration logic in the RollingGroupBy class within the rolling_groupby.py file. When generating chunks for parallel processing, the iteration limits are not being calculated correctly. Specifically, for every chunk the current code pulls `chunk_.stop` items from the shared group iterator, which does not account for the starting index of the chunk: the number of items in a chunk is `chunk_.stop - chunk_.start`, not `chunk_.stop`.\n\nIn its current state, this leads to incorrect partitioning of data, causing errors or inefficiencies in parallel processing tasks. For example, with 9 groups split into the chunks `slice(0, 5)` and `slice(5, 9)`, the second chunk asks for 9 groups when only 4 remain, so `next(iterator)` raises `StopIteration` inside the generator and `parallel_apply` on a rolling groupby fails with `RuntimeError: generator raised StopIteration`. Users may also experience issues where data is not evenly split across workers, resulting in inaccurate calculations or suboptimal performance.\n\nThe commit modifies the iteration logic to account for both the start and stop bounds of each chunk (`chunk_.stop - chunk_.start`). By ensuring the correct calculation of iteration limits, the fix addresses potential inaccuracies in data chunking for rolling groupby operations.",
      "explanation": "Certainly! Let's break down and analyze the information:\n\n### Summary of the Issue\n\nThe issue is located in the `RollingGroupBy` class within the `rolling_groupby.py` file. This class is responsible for managing grouped rolling calculations, which can be used for parallel processing. However, there is a problem with the iteration logic that determines the bounds for chunking the data into pieces for these parallel operations.\n\n**Problem Description:**\n- The current iteration limits assume a stopping condition based solely on `chunk_.stop`.\n- This approach does not take into account the starting index of the chunk (`chunk_.start`).\n- Incorrect chunking could lead to uneven data partitions across workers. This can result in errors or inefficiencies during parallel processing tasks, leading to inaccurate calculations or suboptimal performance.\n\n### Explanation of the Commit\n\nThe commit **\"Fix Incorrect Iterator Usage for Rolling GroupBy Chunks\"** addresses this issue by modifying how the iteration bounds are calculated.\n\n**Commit Modifications:**\n- The iteration logic in `rolling_groupby.py` is adjusted to consider both the starting and stopping indexes of each chunk.\n- Specifically, the iteration range is modified to be `chunk_.stop - chunk_.start`.\n\n### Understanding the Commit and the Issue Resolution\n\n#### Issue Cause:\nThe original code incorrectly determines the number of elements within each chunk by only using the stopping index (`chunk_.stop`). This oversight ignores how far into the data each chunk should start, leading to potential miscalculations of which parts of the data belong within which chunk.\n\nFor example, if a chunk starts at index 5 and stops at index 10, the correct number of iterations should be the difference between these two values, i.e., `10 - 5 = 5`. 
However, the original iteration logic would incorrectly assume the number of iterations to be 10.\n\n#### Commit Solution:\nThe commit addresses this by ensuring the iteration accurately reflects the number of elements within the chunk bounds. This strips out any miscalculations and ensures:\n\n1. **Accurate Chunk Bounds:** Each worker gets an appropriately sized chunk of data, calculated with respect to both the `chunk_.start` and `chunk_.stop`.\n2. **Even Data Distribution:** The adjusted iteration logic prevents uneven data distribution among workers, maintaining balanced workload.\n3. **Correct Data Processing:** Correct chunk sizes eliminate chances of processing errors and improve the overall efficiency and accuracy of parallel computations.\n\n### How the Commit Solves the Issue\n\nThe commit fundamentally rectifies the calculation of chunk sizes. By subtracting `chunk_.start` from `chunk_.stop`, it ensures that the number of iterations truly matches the intended size of each chunk:\n\n- **Even Workload:** Balances the chunks so that each processing worker gets an equal slice of the data.\n- **Improved Performance:** Balanced chunks mean that no single worker gets overwhelmed or underutilized, maximizing the parallel processing capability.\n- **Accuracy:** Ensures each chunk is processed accurately within its bounds, providing correct results for grouped rolling operations.\n\n### Conclusion\n\nIn summary, the root of the issue lies in the incorrect calculation of chunk bounds due to ignoring the starting index during iteration. The solution provided by the commit fixes this by adjusting the iteration limits to accurately reflect the actual number of elements within the chunk. This adjustment resolves potential data mispartitioning, leading to correct and efficient parallel processing in rolling groupby operations. The careful recalibration ensures balanced data processing and accurate, efficient computational results."
    }
  }
}