Pandas Group By Performance

Are you having problems with pandas groupby performance? There are ways to improve it. One of them is to use NumPy.

First, let us look at the pandas groupby version:

def pandas_groupby(df: pd.DataFrame, interpolate_at: float = 0.3) -> pd.DataFrame:
    """Interpolate ``y`` at one x-coordinate for every ``(category, year)`` group.

    Args:
        df: Frame with ``category``, ``year``, ``x`` and ``y`` columns.
            Within each group the ``x`` values must already be increasing,
            as ``np.interp`` requires; they are not sorted here.
        interpolate_at: The x-coordinate at which ``y`` is evaluated.
            Defaults to 0.3, the value that used to be hard-coded.

    Returns:
        A frame with one row per group and the columns ``category``,
        ``year`` and ``y`` (the interpolated value).
    """

    def _interpolate_group(group: pd.DataFrame) -> float:
        # Named helper instead of a lambda whose parameter shadowed ``df``.
        return np.interp(interpolate_at, group["x"], group["y"])

    return (
        # Select only the columns the helper reads so that ``apply`` never
        # operates on the grouping columns (deprecated since pandas 2.2).
        df.groupby(["category", "year"])[["x", "y"]]
        .apply(_interpolate_group)
        .rename("y")
        .reset_index()
    )

Let us now look at how NumPy helps improve the query performance:

def _interpolate_wrapper(fp: np.ndarray, xp: np.ndarray, x: float) -> float:
    """Evaluate the piecewise-linear curve ``(xp, fp)`` at ``x``.

    The argument order puts ``fp`` first so that a caller which supplies the
    y-values positionally (and ``xp`` / ``x`` by keyword) can use this function
    directly.

    Args:
        fp: y-coordinates of the data points.
        xp: x-coordinates of the data points.
        x: The x-coordinate at which to interpolate.

    Returns:
        The interpolated value as a built-in ``float``.
    """
    interpolated = np.interp(x, xp, fp)
    return float(interpolated)

def numpy_groupby(df: pd.DataFrame) -> pd.DataFrame:
      # NumPy-based counterpart of the pandas groupby version.
      # NOTE(review): the lines that build ``y_values``, ``x_unique_values`` and
      # ``num_x_unique_values`` are elided in this excerpt, as is the code that
      # builds the returned frame — confirm against the full implementation.
      ....
      ....
      # Reshape to 2-D with ``num_x_unique_values`` columns; the row count is
      # inferred (-1). Presumably each row then holds one group's y-values —
      # this relies on the elided ordering of ``y_values``; TODO confirm.
      y_values = y_values.reshape([-1, num_x_unique_values])
      # ``axis=1`` hands each row to ``_interpolate_wrapper`` as its first
      # positional argument (``fp``); ``x`` and ``xp`` are forwarded unchanged
      # as keyword arguments, so every row is interpolated at the same point
      # against the same x-coordinates.
      interpolated_y_values = np.apply_along_axis(
          _interpolate_wrapper,
          axis=1,
          arr=y_values,
          x=_INTERPOLATE_AT,
          xp=x_unique_values,
      )

Leave a comment