Are you having problems with pandas groupby performance? There are ways to improve it. One of them is to use NumPy.
First, let us look at the pandas groupby version:
def pandas_groupby(df: pd.DataFrame) -> pd.DataFrame:
    """Interpolate ``y`` at ``x = 0.3`` for every ``(category, year)`` group.

    Args:
        df: Frame with ``category``, ``year``, ``x`` and ``y`` columns.
            Within each group, ``x`` is passed to ``np.interp`` as ``xp``
            and therefore must be increasing; ``np.interp`` does not
            check this itself.

    Returns:
        A frame with one row per group and the columns ``category``,
        ``year`` and ``y`` (the interpolated value).
    """
    return (
        # Select only the columns the lambda reads, so ``apply`` does not
        # operate on the grouping columns (deprecated in recent pandas).
        df.groupby(["category", "year"])[["x", "y"]]
        .apply(lambda group: np.interp(0.3, group["x"], group["y"]))
        .rename("y")
        .reset_index()
    )
Let us now look at how NumPy helps improve the query performance:
def _interpolate_wrapper(fp: np.ndarray, xp: np.ndarray, x: float) -> float:
    """Evaluate ``np.interp`` with ``fp`` as the leading argument.

    ``np.apply_along_axis`` passes each 1-D slice as the first positional
    argument, so the y-values (``fp``) come first here, while ``xp`` and
    ``x`` can be supplied as keyword arguments.

    Args:
        fp: y-coordinates of the data points.
        xp: x-coordinates of the data points; must be increasing.
        x: x-coordinate at which to interpolate.

    Returns:
        The interpolated value as a built-in ``float``.
    """
    return float(np.interp(x=x, xp=xp, fp=fp))
# NumPy-based counterpart to pandas_groupby. Parts of the body are elided
# ("....") in this excerpt, so only the visible steps are commented.
def numpy_groupby(df: pd.DataFrame) -> pd.DataFrame:
....
....
# Reshape y_values to 2-D with num_x_unique_values columns; the row count
# (-1) is inferred from the array size.
# NOTE(review): presumably one row per (category, year) group, which would
# require every group to share the same x values in the same order - confirm
# against the elided setup.
y_values = y_values.reshape([-1, num_x_unique_values])
# Call _interpolate_wrapper once per row (axis=1): each row is passed as the
# first positional argument (fp), while x and xp are forwarded as keywords.
interpolated_y_values = np.apply_along_axis(
_interpolate_wrapper,
axis=1,
arr=y_values,
x=_INTERPOLATE_AT,
xp=x_unique_values,
)