mit neuen venv und exe-Files
This commit is contained in:
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,435 @@
|
||||
"""
|
||||
test cython .agg behavior
|
||||
"""
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from pandas.core.dtypes.common import (
|
||||
is_float_dtype,
|
||||
is_integer_dtype,
|
||||
)
|
||||
|
||||
import pandas as pd
|
||||
from pandas import (
|
||||
DataFrame,
|
||||
Index,
|
||||
NaT,
|
||||
Series,
|
||||
Timedelta,
|
||||
Timestamp,
|
||||
bdate_range,
|
||||
)
|
||||
import pandas._testing as tm
|
||||
import pandas.core.common as com
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "op_name",
    [
        "count",
        "sum",
        "std",
        "var",
        "sem",
        "mean",
        pytest.param(
            "median",
            # ignore mean of empty slice
            # and all-NaN
            marks=[pytest.mark.filterwarnings("ignore::RuntimeWarning")],
        ),
        "prod",
        "min",
        "max",
    ],
)
def test_cythonized_aggers(op_name):
    """Compare a groupby reduction with the same reduction applied per group.

    The reduction named by ``op_name`` is called once on the groupby object
    and once on column "C" of every individual group; the two results are
    compared.
    """
    frame = DataFrame(
        {
            "A": [0, 0, 0, 0, 1, 1, 1, 1, 1, 1.0, np.nan, np.nan],
            "B": ["A", "B"] * 6,
            "C": np.random.default_rng(2).standard_normal(12),
        }
    )
    frame.loc[2:10:2, "C"] = np.nan

    def reduce(obj):
        # Call the method named ``op_name`` with no arguments.
        return getattr(obj, op_name)()

    # single column
    by_a = frame.drop(["B"], axis=1).groupby("A")
    per_group = {}
    for key, chunk in by_a:
        per_group[key] = reduce(chunk["C"])
    expected = DataFrame({"C": per_group})
    expected.index.name = "A"
    tm.assert_frame_equal(reduce(by_a), expected)

    # multiple columns
    by_a_b = frame.groupby(["A", "B"])
    nested = {}
    for (outer, inner), chunk in by_a_b:
        nested.setdefault(outer, {})[inner] = reduce(chunk["C"])
    expected = DataFrame(nested).T.stack(future_stack=True)
    expected.index.names = ["A", "B"]
    expected.name = "C"

    result = reduce(by_a_b)["C"]
    # Only "sum" and "prod" are compared for the two-key grouping.
    if op_name in ["sum", "prod"]:
        tm.assert_series_equal(result, expected)
|
||||
|
||||
|
||||
def test_cython_agg_boolean():
|
||||
frame = DataFrame(
|
||||
{
|
||||
"a": np.random.default_rng(2).integers(0, 5, 50),
|
||||
"b": np.random.default_rng(2).integers(0, 2, 50).astype("bool"),
|
||||
}
|
||||
)
|
||||
result = frame.groupby("a")["b"].mean()
|
||||
msg = "using SeriesGroupBy.mean"
|
||||
with tm.assert_produces_warning(FutureWarning, match=msg):
|
||||
# GH#53425
|
||||
expected = frame.groupby("a")["b"].agg(np.mean)
|
||||
|
||||
tm.assert_series_equal(result, expected)
|
||||
|
||||
|
||||
def test_cython_agg_nothing_to_agg():
    """``mean(numeric_only=True)`` when the only value column holds strings.

    The SeriesGroupBy call must raise ``TypeError``; the DataFrameGroupBy call
    is compared against a frame with no columns.
    """

    def build_frame():
        # Identical content on every call: the generator is re-seeded each time.
        return DataFrame(
            {
                "a": np.random.default_rng(2).integers(0, 5, 50),
                "b": ["foo", "bar"] * 25,
            }
        )

    frame = build_frame()
    with pytest.raises(
        TypeError,
        match="Cannot use numeric_only=True with SeriesGroupBy.mean and non-numeric dtypes",
    ):
        frame.groupby("a")["b"].mean(numeric_only=True)

    frame = build_frame()
    result = frame[["b"]].groupby(frame["a"]).mean(numeric_only=True)
    group_labels = frame["a"].sort_values().drop_duplicates()
    expected = DataFrame([], index=group_labels, columns=[])
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_cython_agg_nothing_to_agg_with_dates():
    """``mean(numeric_only=True)`` on a datetime column must raise ``TypeError``."""
    columns = {
        "a": np.random.default_rng(2).integers(0, 5, 50),
        "b": ["foo", "bar"] * 25,
        "dates": pd.date_range("now", periods=50, freq="min"),
    }
    frame = DataFrame(columns)

    expected_message = (
        "Cannot use numeric_only=True with SeriesGroupBy.mean and non-numeric dtypes"
    )
    with pytest.raises(TypeError, match=expected_message):
        frame.groupby("b").dates.mean(numeric_only=True)
|
||||
|
||||
|
||||
def test_cython_agg_frame_columns():
|
||||
# #2113
|
||||
df = DataFrame({"x": [1, 2, 3], "y": [3, 4, 5]})
|
||||
|
||||
msg = "DataFrame.groupby with axis=1 is deprecated"
|
||||
with tm.assert_produces_warning(FutureWarning, match=msg):
|
||||
df.groupby(level=0, axis="columns").mean()
|
||||
with tm.assert_produces_warning(FutureWarning, match=msg):
|
||||
df.groupby(level=0, axis="columns").mean()
|
||||
with tm.assert_produces_warning(FutureWarning, match=msg):
|
||||
df.groupby(level=0, axis="columns").mean()
|
||||
with tm.assert_produces_warning(FutureWarning, match=msg):
|
||||
df.groupby(level=0, axis="columns").mean()
|
||||
|
||||
|
||||
def test_cython_agg_return_dict():
    """An aggregation function may return a dict for each group."""
    # GH 16741
    columns = {
        "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"],
        "B": ["one", "one", "two", "three", "two", "two", "one", "three"],
        "C": np.random.default_rng(2).standard_normal(8),
        "D": np.random.default_rng(2).standard_normal(8),
    }
    df = DataFrame(columns)

    def counts_as_dict(values):
        return values.value_counts().to_dict()

    result = df.groupby("A")["B"].agg(counts_as_dict)

    bar_counts = {"two": 1, "one": 1, "three": 1}
    foo_counts = {"two": 2, "one": 2, "three": 1}
    expected = Series(
        [bar_counts, foo_counts],
        index=Index(["bar", "foo"], name="A"),
        name="B",
    )
    tm.assert_series_equal(result, expected)
|
||||
|
||||
|
||||
def test_cython_fail_agg():
|
||||
dr = bdate_range("1/1/2000", periods=50)
|
||||
ts = Series(["A", "B", "C", "D", "E"] * 10, index=dr)
|
||||
|
||||
grouped = ts.groupby(lambda x: x.month)
|
||||
summed = grouped.sum()
|
||||
msg = "using SeriesGroupBy.sum"
|
||||
with tm.assert_produces_warning(FutureWarning, match=msg):
|
||||
# GH#53425
|
||||
expected = grouped.agg(np.sum)
|
||||
tm.assert_series_equal(summed, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "op, targop",
    [
        ("mean", np.mean),
        ("median", np.median),
        ("var", np.var),
        ("sum", np.sum),
        ("prod", np.prod),
        ("min", np.min),
        ("max", np.max),
        ("first", lambda x: x.iloc[0]),
        ("last", lambda x: x.iloc[-1]),
    ],
)
def test__cython_agg_general(op, targop):
    """Compare ``_cython_agg_general(op)`` with ``agg(targop)``."""
    values = DataFrame(np.random.default_rng(2).standard_normal(1000))
    keys = np.random.default_rng(2).integers(0, 50, size=1000).astype(float)

    result = values.groupby(keys)._cython_agg_general(
        op, alt=None, numeric_only=True
    )

    # GH#53425: a FutureWarning is expected only for callables that are
    # present in com._cython_table.
    if targop in com._cython_table:
        expected_warning = FutureWarning
    else:
        expected_warning = None
    message = f"using DataFrameGroupBy.{op}"
    with tm.assert_produces_warning(expected_warning, match=message):
        expected = values.groupby(keys).agg(targop)
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "op, targop",
    [
        ("mean", np.mean),
        ("median", lambda x: np.median(x) if len(x) > 0 else np.nan),
        ("var", lambda x: np.var(x, ddof=1)),
        ("min", np.min),
        ("max", np.max),
    ],
)
def test_cython_agg_empty_buckets(op, targop, observed):
    """Compare ``_cython_agg_general`` with ``agg`` when some bins are empty."""
    df = DataFrame([11, 12, 13])
    bin_edges = range(0, 55, 5)

    # calling _cython_agg_general directly, instead of via the user API
    # which sets different values for min_count, so do that here.
    cython_groups = df.groupby(pd.cut(df[0], bin_edges), observed=observed)
    result = cython_groups._cython_agg_general(op, alt=None, numeric_only=True)

    reference_groups = df.groupby(pd.cut(df[0], bin_edges), observed=observed)
    expected = reference_groups.agg(lambda x: targop(x))
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_cython_agg_empty_buckets_nanops(observed):
    """Check "sum" and "prod" through ``_cython_agg_general`` with empty bins,
    against hard-coded expected frames."""
    # GH-18869 can't call nanops on empty groups, so hardcode expected
    # for these
    df = DataFrame([11, 12, 13], columns=["a"])
    bin_edges = np.arange(0, 25, 5, dtype=int)
    intervals = pd.interval_range(0, 20, freq=5)

    def check(how, values, empty_value):
        # ``values`` lists the expected result per interval; rows equal to
        # ``empty_value`` are dropped from the expectation when observed=True.
        grouped = df.groupby(pd.cut(df["a"], bin_edges), observed=observed)
        result = grouped._cython_agg_general(how, alt=None, numeric_only=True)
        expected = DataFrame(
            {"a": values},
            index=pd.CategoricalIndex(intervals, name="a", ordered=True),
        )
        if observed:
            expected = expected[expected.a != empty_value]
        tm.assert_frame_equal(result, expected)

    # add / sum
    check("sum", [0, 0, 36, 0], 0)
    # prod
    check("prod", [1, 1, 1716, 1], 1)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("op", ["first", "last", "max", "min"])
@pytest.mark.parametrize(
    "data", [Timestamp("2016-10-14 21:00:44.557"), Timedelta("17088 days 21:00:44.557")]
)
def test_cython_with_timestamp_and_nat(op, data):
    """Aggregating groups that each hold one value (``data`` or NaT) returns
    those same values."""
    # https://github.com/pandas-dev/pandas/issues/19526
    values = [data, NaT]
    df = DataFrame({"a": [0, 1], "b": values})

    # We will group by a and test the cython aggregations
    expected = DataFrame({"b": values}, index=Index([0, 1], name="a"))

    result = df.groupby("a").aggregate(op)
    tm.assert_frame_equal(expected, result)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "agg",
    [
        "min",
        "max",
        "count",
        "sum",
        "prod",
        "var",
        "mean",
        "median",
        "ohlc",
        "cumprod",
        "cumsum",
        "shift",
        "any",
        "all",
        "quantile",
        "first",
        "last",
        "rank",
        "cummin",
        "cummax",
    ],
)
def test_read_only_buffer_source_agg(agg):
    """Aggregating a frame whose first backing array is read-only gives the
    same result as aggregating a copy of that frame."""
    # https://github.com/pandas-dev/pandas/issues/36014
    columns = {
        "sepal_length": [5.1, 4.9, 4.7, 4.6, 5.0],
        "species": ["setosa", "setosa", "setosa", "setosa", "setosa"],
    }
    df = DataFrame(columns)
    # Mark the first internal array as non-writeable.
    df._mgr.arrays[0].flags.writeable = False

    spec = {"sepal_length": agg}
    result = df.groupby(["species"]).agg(spec)
    expected = df.copy().groupby(["species"]).agg(spec)

    tm.assert_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "op_name",
    [
        "count",
        "sum",
        "std",
        "var",
        "sem",
        "mean",
        "median",
        "prod",
        "min",
        "max",
    ],
)
def test_cython_agg_nullable_int(op_name):
    """Reductions on an "Int64" column match the float64 result after
    ``convert_dtypes``."""
    # ensure that the cython-based aggregations don't fail for nullable dtype
    # (eg https://github.com/pandas-dev/pandas/issues/37415)
    nullable = DataFrame(
        {
            "A": ["A", "B"] * 5,
            "B": pd.array([1, 2, 3, 4, 5, 6, 7, 8, 9, pd.NA], dtype="Int64"),
        }
    )
    result = getattr(nullable.groupby("A")["B"], op_name)()

    as_float = nullable.assign(B=nullable["B"].astype("float64"))
    expected = getattr(as_float.groupby("A")["B"], op_name)()
    # For "mean" and "median" the expectation is not converted to an
    # integer dtype.
    keep_float = op_name in ("mean", "median")
    expected = expected.convert_dtypes(convert_integer=not keep_float)
    tm.assert_series_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("dtype", ["Int64", "Float64", "boolean"])
def test_count_masked_returns_masked_dtype(dtype):
    """``count`` on masked-dtype columns returns "Int64" columns."""
    with_missing = pd.array([1, pd.NA], dtype=dtype)
    complete = pd.array([1, 1], dtype=dtype)
    df = DataFrame({"A": [1, 1], "B": with_missing, "C": complete})

    result = df.groupby("A").count()

    expected = DataFrame(
        [[1, 2]],
        index=Index([1], name="A"),
        columns=["B", "C"],
        dtype="Int64",
    )
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("with_na", [True, False])
@pytest.mark.parametrize(
    "op_name, action",
    [
        # ("count", "always_int"),
        ("sum", "large_int"),
        # ("std", "always_float"),
        ("var", "always_float"),
        # ("sem", "always_float"),
        ("mean", "always_float"),
        ("median", "always_float"),
        ("prod", "large_int"),
        ("min", "preserve"),
        ("max", "preserve"),
        ("first", "preserve"),
        ("last", "preserve"),
    ],
)
@pytest.mark.parametrize(
    "data",
    [
        pd.array([1, 2, 3, 4], dtype="Int64"),
        pd.array([1, 2, 3, 4], dtype="Int8"),
        pd.array([0.1, 0.2, 0.3, 0.4], dtype="Float32"),
        pd.array([0.1, 0.2, 0.3, 0.4], dtype="Float64"),
        pd.array([True, True, False, False], dtype="boolean"),
    ],
)
def test_cython_agg_EA_known_dtypes(data, op_name, action, with_na):
    """Check the result dtype of groupby reductions on masked arrays.

    ``action`` names the rule that derives the expected dtype from the input
    dtype. The same expectation is checked through four call paths: the
    method and ``aggregate(op_name)``, each on the frame groupby and on the
    selected "col" column.
    """
    # pytest passes the very same array object to every parametrized call.
    # Work on a private copy so that inserting NA for ``with_na=True`` does
    # not leak into the ``with_na=False`` runs that reuse the array.
    data = data.copy()
    if with_na:
        data[3] = pd.NA

    df = DataFrame({"key": ["a", "a", "b", "b"], "col": data})
    grouped = df.groupby("key")

    if action == "always_int":
        # always Int64
        expected_dtype = pd.Int64Dtype()
    elif action == "large_int":
        # float and integer inputs keep their dtype, anything else -> Int64
        if is_float_dtype(data.dtype):
            expected_dtype = data.dtype
        elif is_integer_dtype(data.dtype):
            # match the numpy dtype we'd get with the non-nullable analogue
            expected_dtype = data.dtype
        else:
            expected_dtype = pd.Int64Dtype()
    elif action == "always_float":
        # for any int/bool use Float64, for float preserve dtype
        if is_float_dtype(data.dtype):
            expected_dtype = data.dtype
        else:
            expected_dtype = pd.Float64Dtype()
    elif action == "preserve":
        expected_dtype = data.dtype

    result = getattr(grouped, op_name)()
    assert result["col"].dtype == expected_dtype

    result = grouped.aggregate(op_name)
    assert result["col"].dtype == expected_dtype

    result = getattr(grouped["col"], op_name)()
    assert result.dtype == expected_dtype

    result = grouped["col"].aggregate(op_name)
    assert result.dtype == expected_dtype
|
||||
@@ -0,0 +1,392 @@
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from pandas.errors import NumbaUtilError
|
||||
|
||||
from pandas import (
|
||||
DataFrame,
|
||||
Index,
|
||||
NamedAgg,
|
||||
Series,
|
||||
option_context,
|
||||
)
|
||||
import pandas._testing as tm
|
||||
|
||||
pytestmark = pytest.mark.single_cpu
|
||||
|
||||
|
||||
def test_correct_function_signature():
    """A one-argument UDF passed with ``engine="numba"`` raises
    ``NumbaUtilError`` for both frame and series groupby."""
    pytest.importorskip("numba")

    def incorrect_function(x):
        return sum(x) * 2.7

    data = DataFrame(
        {"key": ["a", "a", "b", "b", "a"], "data": [1.0, 2.0, 3.0, 4.0, 5.0]},
        columns=["key", "data"],
    )
    frame_groups = data.groupby("key")
    series_groups = data.groupby("key")["data"]
    for grouped in (frame_groups, series_groups):
        with pytest.raises(NumbaUtilError, match="The first 2"):
            grouped.agg(incorrect_function, engine="numba")
|
||||
|
||||
|
||||
def test_check_nopython_kwargs():
    """Passing an extra keyword argument together with ``engine="numba"``
    raises ``NumbaUtilError`` for both frame and series groupby."""
    pytest.importorskip("numba")

    def incorrect_function(values, index):
        return sum(values) * 2.7

    data = DataFrame(
        {"key": ["a", "a", "b", "b", "a"], "data": [1.0, 2.0, 3.0, 4.0, 5.0]},
        columns=["key", "data"],
    )
    frame_groups = data.groupby("key")
    series_groups = data.groupby("key")["data"]
    for grouped in (frame_groups, series_groups):
        with pytest.raises(NumbaUtilError, match="numba does not support"):
            grouped.agg(incorrect_function, engine="numba", a=1)
|
||||
|
||||
|
||||
@pytest.mark.filterwarnings("ignore")
# Filter warnings when parallel=True and the function can't be parallelized by Numba
@pytest.mark.parametrize("jit", [True, False])
@pytest.mark.parametrize("pandas_obj", ["Series", "DataFrame"])
@pytest.mark.parametrize("as_index", [True, False])
def test_numba_vs_cython(jit, pandas_obj, nogil, parallel, nopython, as_index):
    """A UDF run through the numba engine matches the equivalent lambda run
    through the cython engine."""
    pytest.importorskip("numba")

    def func_numba(values, index):
        return np.mean(values) * 2.7

    udf = func_numba
    if jit:
        # Test accepted jitted functions
        import numba

        udf = numba.jit(func_numba)

    data = DataFrame(
        {0: ["a", "a", "b", "b", "a"], 1: [1.0, 2.0, 3.0, 4.0, 5.0]}, columns=[0, 1]
    )
    grouped = data.groupby(0, as_index=as_index)
    if pandas_obj == "Series":
        grouped = grouped[1]

    result = grouped.agg(
        udf,
        engine="numba",
        engine_kwargs={"nogil": nogil, "parallel": parallel, "nopython": nopython},
    )
    expected = grouped.agg(lambda x: np.mean(x) * 2.7, engine="cython")

    tm.assert_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.filterwarnings("ignore")
# Filter warnings when parallel=True and the function can't be parallelized by Numba
@pytest.mark.parametrize("jit", [True, False])
@pytest.mark.parametrize("pandas_obj", ["Series", "DataFrame"])
def test_cache(jit, pandas_obj, nogil, parallel, nopython):
    """Alternating between two UDFs keeps giving each UDF's own result."""
    # Test that the functions are cached correctly if we switch functions
    pytest.importorskip("numba")

    def func_1(values, index):
        return np.mean(values) - 3.4

    def func_2(values, index):
        return np.mean(values) * 2.7

    if jit:
        import numba

        func_1 = numba.jit(func_1)
        func_2 = numba.jit(func_2)

    data = DataFrame(
        {0: ["a", "a", "b", "b", "a"], 1: [1.0, 2.0, 3.0, 4.0, 5.0]}, columns=[0, 1]
    )
    engine_kwargs = {"nogil": nogil, "parallel": parallel, "nopython": nopython}
    grouped = data.groupby(0)
    if pandas_obj == "Series":
        grouped = grouped[1]

    rounds = [
        (func_1, lambda x: np.mean(x) - 3.4),
        # Add func_2 to the cache
        (func_2, lambda x: np.mean(x) * 2.7),
        # Retest func_1 which should use the cache
        (func_1, lambda x: np.mean(x) - 3.4),
    ]
    for numba_udf, reference in rounds:
        result = grouped.agg(numba_udf, engine="numba", engine_kwargs=engine_kwargs)
        expected = grouped.agg(reference, engine="cython")
        tm.assert_equal(result, expected)
|
||||
|
||||
|
||||
def test_use_global_config():
    """With option "compute.use_numba" enabled, ``engine=None`` gives the same
    result as ``engine="numba"``."""
    pytest.importorskip("numba")

    def func_1(values, index):
        return np.mean(values) - 3.4

    data = DataFrame(
        {0: ["a", "a", "b", "b", "a"], 1: [1.0, 2.0, 3.0, 4.0, 5.0]}, columns=[0, 1]
    )
    by_label = data.groupby(0)

    explicit = by_label.agg(func_1, engine="numba")
    with option_context("compute.use_numba", True):
        via_option = by_label.agg(func_1, engine=None)
    tm.assert_frame_equal(explicit, via_option)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "agg_kwargs",
    [
        {"func": ["min", "max"]},
        {"func": "min"},
        {"func": {1: ["min", "max"], 2: "sum"}},
        {"bmin": NamedAgg(column=1, aggfunc="min")},
    ],
)
def test_multifunc_numba_vs_cython_frame(agg_kwargs):
    """List, dict and named aggregations on a frame give equal results under
    the numba and cython engines."""
    pytest.importorskip("numba")
    columns = {
        0: ["a", "a", "b", "b", "a"],
        1: [1.0, 2.0, 3.0, 4.0, 5.0],
        2: [1, 2, 3, 4, 5],
    }
    data = DataFrame(columns, columns=[0, 1, 2])
    by_label = data.groupby(0)

    via_numba = by_label.agg(**agg_kwargs, engine="numba")
    via_cython = by_label.agg(**agg_kwargs, engine="cython")
    tm.assert_frame_equal(via_numba, via_cython)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "agg_kwargs,expected_func",
    [
        ({"func": lambda values, index: values.sum()}, "sum"),
        # FIXME
        pytest.param(
            {
                "func": [
                    lambda values, index: values.sum(),
                    lambda values, index: values.min(),
                ]
            },
            ["sum", "min"],
            marks=pytest.mark.xfail(
                reason="This doesn't work yet! Fails in nopython pipeline!"
            ),
        ),
    ],
)
def test_multifunc_numba_udf_frame(agg_kwargs, expected_func):
    """UDFs run through the numba engine match the named reductions run
    through the cython engine (dtype is not compared)."""
    pytest.importorskip("numba")
    columns = {
        0: ["a", "a", "b", "b", "a"],
        1: [1.0, 2.0, 3.0, 4.0, 5.0],
        2: [1, 2, 3, 4, 5],
    }
    data = DataFrame(columns, columns=[0, 1, 2])
    by_label = data.groupby(0)

    via_numba = by_label.agg(**agg_kwargs, engine="numba")
    via_cython = by_label.agg(expected_func, engine="cython")
    # check_dtype can be removed if GH 44952 is addressed
    # Currently, UDFs still always return float64 while reductions can preserve dtype
    tm.assert_frame_equal(via_numba, via_cython, check_dtype=False)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "agg_kwargs",
    [{"func": ["min", "max"]}, {"func": "min"}, {"min_val": "min", "max_val": "max"}],
)
def test_multifunc_numba_vs_cython_series(agg_kwargs):
    """Series aggregations give equal results under the numba and cython
    engines.

    ``agg_kwargs`` is forwarded as keyword arguments to ``agg``. The result is
    compared as a frame or as a series, depending on the type the cython
    engine returns.
    """
    pytest.importorskip("numba")
    labels = ["a", "a", "b", "b", "a"]
    data = Series([1.0, 2.0, 3.0, 4.0, 5.0])
    grouped = data.groupby(labels)

    # Pass ``engine`` as a separate keyword instead of writing it into
    # ``agg_kwargs``: the dict is the shared parametrize value and must not
    # be modified by the test.
    result = grouped.agg(**agg_kwargs, engine="numba")
    expected = grouped.agg(**agg_kwargs, engine="cython")

    if isinstance(expected, DataFrame):
        tm.assert_frame_equal(result, expected)
    else:
        tm.assert_series_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.single_cpu
@pytest.mark.parametrize(
    "data,agg_kwargs",
    [
        (Series([1.0, 2.0, 3.0, 4.0, 5.0]), {"func": ["min", "max"]}),
        (Series([1.0, 2.0, 3.0, 4.0, 5.0]), {"func": "min"}),
        (
            DataFrame(
                {1: [1.0, 2.0, 3.0, 4.0, 5.0], 2: [1, 2, 3, 4, 5]}, columns=[1, 2]
            ),
            {"func": ["min", "max"]},
        ),
        (
            DataFrame(
                {1: [1.0, 2.0, 3.0, 4.0, 5.0], 2: [1, 2, 3, 4, 5]}, columns=[1, 2]
            ),
            {"func": "min"},
        ),
        (
            DataFrame(
                {1: [1.0, 2.0, 3.0, 4.0, 5.0], 2: [1, 2, 3, 4, 5]}, columns=[1, 2]
            ),
            {"func": {1: ["min", "max"], 2: "sum"}},
        ),
        (
            DataFrame(
                {1: [1.0, 2.0, 3.0, 4.0, 5.0], 2: [1, 2, 3, 4, 5]}, columns=[1, 2]
            ),
            {"min_col": NamedAgg(column=1, aggfunc="min")},
        ),
    ],
)
def test_multifunc_numba_kwarg_propagation(data, agg_kwargs):
    """Numba aggregation with ``engine_kwargs={"parallel": True}`` equals the
    numba aggregation without ``engine_kwargs``."""
    pytest.importorskip("numba")
    by_label = data.groupby(["a", "a", "b", "b", "a"])

    with_kwargs = by_label.agg(
        **agg_kwargs, engine="numba", engine_kwargs={"parallel": True}
    )
    without_kwargs = by_label.agg(**agg_kwargs, engine="numba")

    # Pick the comparison helper from the type of the reference result.
    if isinstance(without_kwargs, DataFrame):
        compare = tm.assert_frame_equal
    else:
        compare = tm.assert_series_equal
    compare(with_kwargs, without_kwargs)
|
||||
|
||||
|
||||
def test_args_not_cached():
    """Calling the same numba UDF with a different positional argument gives
    the result for the new argument."""
    # GH 41647
    pytest.importorskip("numba")

    def sum_last(values, index, n):
        return values[-n:].sum()

    df = DataFrame({"id": [0, 0, 1, 1], "x": [1, 1, 1, 1]})
    grouped_x = df.groupby("id")["x"]

    # Every group holds two ones, so the sum of the last ``n`` values is ``n``.
    for n in (1, 2):
        result = grouped_x.agg(sum_last, n, engine="numba")
        expected = Series([float(n)] * 2, name="x", index=Index([0, 1], name="id"))
        tm.assert_series_equal(result, expected)
|
||||
|
||||
|
||||
def test_index_data_correctly_passed():
    """The ``index`` argument received by a numba UDF holds the group's index
    values: the UDF returns their mean."""
    # GH 43133
    pytest.importorskip("numba")

    def f(values, index):
        return np.mean(index)

    df = DataFrame({"group": ["A", "A", "B"], "v": [4, 5, 6]}, index=[-1, -2, -3])
    result = df.groupby("group").aggregate(f, engine="numba")

    group_index = Index(["A", "B"], name="group")
    expected = DataFrame([-1.5, -3.0], columns=["v"], index=group_index)
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_engine_kwargs_not_cached():
    """Changing ``engine_kwargs`` between calls gives a result that reflects
    the new values."""
    # If the user passes a different set of engine_kwargs don't return the same
    # jitted function
    pytest.importorskip("numba")
    nogil = True
    parallel = False
    nopython = True

    def func_kwargs(values, index):
        # Reads the three flags from the enclosing scope.
        return nogil + parallel + nopython

    df = DataFrame({"value": [0, 0, 0]})

    result = df.groupby(level=0).aggregate(
        func_kwargs,
        engine="numba",
        engine_kwargs={"nopython": nopython, "nogil": nogil, "parallel": parallel},
    )
    tm.assert_frame_equal(result, DataFrame({"value": [2.0, 2.0, 2.0]}))

    # Flip one flag and aggregate again with freshly built engine_kwargs.
    nogil = False
    result = df.groupby(level=0).aggregate(
        func_kwargs,
        engine="numba",
        engine_kwargs={"nopython": nopython, "nogil": nogil, "parallel": parallel},
    )
    tm.assert_frame_equal(result, DataFrame({"value": [1.0, 1.0, 1.0]}))
|
||||
|
||||
|
||||
@pytest.mark.filterwarnings("ignore")
def test_multiindex_one_key(nogil, parallel, nopython):
    """Numba aggregation on a MultiIndex frame grouped by a single level."""
    pytest.importorskip("numba")

    def numba_func(values, index):
        return 1

    indexed = DataFrame([{"A": 1, "B": 2, "C": 3}]).set_index(["A", "B"])
    result = indexed.groupby("A").agg(
        numba_func,
        engine="numba",
        engine_kwargs={"nopython": nopython, "nogil": nogil, "parallel": parallel},
    )
    expected = DataFrame([1.0], index=Index([1], name="A"), columns=["C"])
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_multiindex_multi_key_not_supported(nogil, parallel, nopython):
    """Numba aggregation grouped by two keys raises ``NotImplementedError``."""
    pytest.importorskip("numba")

    def numba_func(values, index):
        return 1

    indexed = DataFrame([{"A": 1, "B": 2, "C": 3}]).set_index(["A", "B"])
    options = {"nopython": nopython, "nogil": nogil, "parallel": parallel}
    with pytest.raises(NotImplementedError, match="more than 1 grouping labels"):
        indexed.groupby(["A", "B"]).agg(
            numba_func, engine="numba", engine_kwargs=options
        )
|
||||
|
||||
|
||||
def test_multilabel_numba_vs_cython(numba_supported_reductions):
    """A reduction grouped by two keys gives equal results under the numba and
    cython engines, via ``agg`` and via the direct method call."""
    pytest.importorskip("numba")
    reduction, kwargs = numba_supported_reductions
    columns = {
        "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"],
        "B": ["one", "one", "two", "three", "two", "two", "one", "three"],
        "C": np.random.default_rng(2).standard_normal(8),
        "D": np.random.default_rng(2).standard_normal(8),
    }
    gb = DataFrame(columns).groupby(["A", "B"])

    tm.assert_frame_equal(
        gb.agg(reduction, engine="numba", **kwargs),
        gb.agg(reduction, engine="cython", **kwargs),
    )

    # Test that calling the aggregation directly also works
    method = getattr(gb, reduction)
    direct_res = method(engine="numba", **kwargs)
    direct_expected = getattr(gb, reduction)(engine="cython", **kwargs)
    tm.assert_frame_equal(direct_res, direct_expected)
|
||||
|
||||
|
||||
def test_multilabel_udf_numba_vs_cython():
    """A ``min`` UDF grouped by two keys gives equal results under the numba
    and cython engines."""
    pytest.importorskip("numba")
    columns = {
        "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"],
        "B": ["one", "one", "two", "three", "two", "two", "one", "three"],
        "C": np.random.default_rng(2).standard_normal(8),
        "D": np.random.default_rng(2).standard_normal(8),
    }
    gb = DataFrame(columns).groupby(["A", "B"])

    via_numba = gb.agg(lambda values, index: values.min(), engine="numba")
    via_cython = gb.agg(lambda x: x.min(), engine="cython")
    tm.assert_frame_equal(via_numba, via_cython)
|
||||
@@ -0,0 +1,675 @@
|
||||
"""
|
||||
test all other .agg behavior
|
||||
"""
|
||||
|
||||
import datetime as dt
|
||||
from functools import partial
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from pandas.errors import SpecificationError
|
||||
|
||||
import pandas as pd
|
||||
from pandas import (
|
||||
DataFrame,
|
||||
Index,
|
||||
MultiIndex,
|
||||
PeriodIndex,
|
||||
Series,
|
||||
date_range,
|
||||
period_range,
|
||||
)
|
||||
import pandas._testing as tm
|
||||
|
||||
from pandas.io.formats.printing import pprint_thing
|
||||
|
||||
|
||||
def test_agg_partial_failure_raises():
    """A UDF that fails with ``TypeError`` on some column makes ``agg`` raise,
    both when passed in a list and when passed directly."""
    # GH#43741
    columns = {
        "data1": np.random.default_rng(2).standard_normal(5),
        "data2": np.random.default_rng(2).standard_normal(5),
        "key1": ["a", "a", "b", "b", "a"],
        "key2": ["one", "two", "one", "two", "one"],
    }
    grouped = DataFrame(columns).groupby("key1")

    def peak_to_peak(arr):
        return arr.max() - arr.min()

    for spec in ([peak_to_peak], peak_to_peak):
        with pytest.raises(TypeError, match="unsupported operand type"):
            grouped.agg(spec)
|
||||
|
||||
|
||||
def test_agg_datetimes_mixed():
    """Grouping by a date column gives the same number of groups whether the
    dates are strings or ``datetime.date`` objects (one entry is None)."""
    rows = [[1, "2012-01-01", 1.0], [2, "2012-01-02", 2.0], [3, None, 3.0]]

    def build(records):
        # One column per position of the [key, date, value] records.
        return DataFrame(
            {
                "key": [rec[0] for rec in records],
                "date": [rec[1] for rec in records],
                "value": [rec[2] for rec in records],
            }
        )

    df1 = build(rows)

    parsed = []
    for key, stamp, value in rows:
        day = dt.datetime.strptime(stamp, "%Y-%m-%d").date() if stamp else None
        parsed.append([key, day, value])
    df2 = build(parsed)

    df1["weights"] = df1["value"] / df1["value"].sum()
    by_string = df1.groupby("date").aggregate("sum")

    # The weights for df2 are computed from df1's "value" column.
    df2["weights"] = df1["value"] / df1["value"].sum()
    by_date = df2.groupby("date").aggregate("sum")

    assert len(by_string) == len(by_date)
|
||||
|
||||
|
||||
def test_agg_period_index():
    """Grouping on a PeriodIndex level keeps a PeriodIndex in the result, and
    grouping by the index's month can be iterated."""
    periods = period_range("2012-1-1", freq="M", periods=3)
    values = np.random.default_rng(2).standard_normal((3, 2))
    summed = DataFrame(values, index=periods).groupby(level=0).sum()
    assert isinstance(summed.index, PeriodIndex)

    # GH 3579
    index = period_range(start="1999-01", periods=5, freq="M")
    columns = {
        "s1": Series(np.random.default_rng(2).random(len(index)), index=index),
        "s2": Series(np.random.default_rng(2).random(len(index)), index=index),
    }
    df = DataFrame.from_dict(columns)
    by_month = df.groupby(df.index.month)
    # Exhaust the iterator; the result itself is not inspected.
    list(by_month)
|
||||
|
||||
|
||||
def test_agg_dict_parameter_cast_result_dtypes():
    """``first``/``last`` on a datetime column with missing values agree
    across the method, string and dict spellings; size and count are checked
    against hard-coded values."""
    # GH 12821
    df = DataFrame(
        {
            "class": ["A", "A", "B", "B", "C", "C", "D", "D"],
            "time": date_range("1/1/2011", periods=8, freq="h"),
        }
    )
    df.loc[[0, 1, 2, 5], "time"] = None

    # test for `first` and `last` functions: the listed rows are the
    # expected pick for each class
    for how, rows in (("first", [0, 3, 4, 6]), ("last", [0, 3, 4, 7])):
        exp = df.loc[rows].set_index("class")
        grouped = df.groupby("class")
        tm.assert_frame_equal(getattr(grouped, how)(), exp)
        tm.assert_frame_equal(grouped.agg(how), exp)
        tm.assert_frame_equal(grouped.agg({"time": how}), exp)
        tm.assert_series_equal(getattr(grouped.time, how)(), exp["time"])
        tm.assert_series_equal(grouped.time.agg(how), exp["time"])

    # count
    class_index = Index(list("ABCD"), name="class")
    sizes = Series([2, 2, 2, 2], index=class_index, name="time")
    tm.assert_series_equal(grouped.time.agg(len), sizes)
    tm.assert_series_equal(grouped.time.size(), sizes)

    non_missing = Series(
        [0, 1, 1, 2], index=Index(list("ABCD"), name="class"), name="time"
    )
    tm.assert_series_equal(grouped.time.count(), non_missing)
|
||||
|
||||
|
||||
def test_agg_cast_results_dtypes():
    """``agg(len)`` on a datetime column must equal ``count()`` for the same groups."""
    # similar to GH12821
    # xref #11444
    u = [dt.datetime(2015, x + 1, 1) for x in range(12)]
    v = list("aaabbbbbbccd")
    df = DataFrame({"X": v, "Y": u})

    result = df.groupby("X")["Y"].agg(len)
    expected = df.groupby("X")["Y"].count()
    tm.assert_series_equal(result, expected)
|
||||
|
||||
|
||||
def test_aggregate_float64_no_int64():
    """Group means of integer columns are compared against float expectations.

    Checked for a single selected column and for two selected columns.
    """
    # see gh-11199
    df = DataFrame({"a": [1, 2, 3, 4, 5], "b": [1, 2, 2, 4, 5], "c": [1, 2, 3, 4, 5]})

    expected = DataFrame({"a": [1, 2.5, 4, 5]}, index=[1, 2, 4, 5])
    expected.index.name = "b"

    result = df.groupby("b")[["a"]].mean()
    tm.assert_frame_equal(result, expected)

    expected = DataFrame({"a": [1, 2.5, 4, 5], "c": [1, 2.5, 4, 5]}, index=[1, 2, 4, 5])
    expected.index.name = "b"

    result = df.groupby("b")[["a", "c"]].mean()
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_aggregate_api_consistency():
    """List and dict aggregation specs must agree with the individual reductions.

    The per-column ``mean``/``sum`` results are computed once and then
    concatenated to build the expected frame for each spec; comparisons use
    ``check_like=True`` so row/column order is not significant.
    """
    # GH 9052
    # make sure that the aggregates via dict
    # are consistent
    df = DataFrame(
        {
            "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"],
            "B": ["one", "one", "two", "two", "two", "two", "one", "two"],
            "C": np.random.default_rng(2).standard_normal(8) + 1.0,
            "D": np.arange(8),
        }
    )

    grouped = df.groupby(["A", "B"])
    c_mean = grouped["C"].mean()
    c_sum = grouped["C"].sum()
    d_mean = grouped["D"].mean()
    d_sum = grouped["D"].sum()

    result = grouped["D"].agg(["sum", "mean"])
    expected = pd.concat([d_sum, d_mean], axis=1)
    expected.columns = ["sum", "mean"]
    tm.assert_frame_equal(result, expected, check_like=True)

    result = grouped.agg(["sum", "mean"])
    expected = pd.concat([c_sum, c_mean, d_sum, d_mean], axis=1)
    expected.columns = MultiIndex.from_product([["C", "D"], ["sum", "mean"]])
    tm.assert_frame_equal(result, expected, check_like=True)

    result = grouped[["D", "C"]].agg(["sum", "mean"])
    expected = pd.concat([d_sum, d_mean, c_sum, c_mean], axis=1)
    expected.columns = MultiIndex.from_product([["D", "C"], ["sum", "mean"]])
    tm.assert_frame_equal(result, expected, check_like=True)

    result = grouped.agg({"C": "mean", "D": "sum"})
    expected = pd.concat([d_sum, c_mean], axis=1)
    tm.assert_frame_equal(result, expected, check_like=True)

    result = grouped.agg({"C": ["mean", "sum"], "D": ["mean", "sum"]})
    expected = pd.concat([c_mean, c_sum, d_mean, d_sum], axis=1)
    expected.columns = MultiIndex.from_product([["C", "D"], ["mean", "sum"]])
    # This pair was previously built but never compared.
    tm.assert_frame_equal(result, expected, check_like=True)

    msg = r"Column\(s\) \['r', 'r2'\] do not exist"
    with pytest.raises(KeyError, match=msg):
        grouped[["D", "C"]].agg({"r": "sum", "r2": "mean"})
|
||||
|
||||
|
||||
def test_agg_dict_renaming_deprecation():
    """Nested-renamer dict specs raise, and unknown column keys raise ``KeyError``."""
    # 15931
    df = DataFrame({"A": [1, 1, 1, 2, 2], "B": range(5), "C": range(5)})

    msg = r"nested renamer is not supported"
    with pytest.raises(SpecificationError, match=msg):
        df.groupby("A").agg(
            {"B": {"foo": ["sum", "max"]}, "C": {"bar": ["count", "min"]}}
        )

    msg = r"Column\(s\) \['ma'\] do not exist"
    with pytest.raises(KeyError, match=msg):
        df.groupby("A")[["B", "C"]].agg({"ma": "max"})

    msg = r"nested renamer is not supported"
    with pytest.raises(SpecificationError, match=msg):
        df.groupby("A").B.agg({"foo": "count"})
|
||||
|
||||
|
||||
def test_agg_compat():
    """Dict specs passed to a SeriesGroupBy raise ``SpecificationError``."""
    # GH 12334
    df = DataFrame(
        {
            "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"],
            "B": ["one", "one", "two", "two", "two", "two", "one", "two"],
            "C": np.random.default_rng(2).standard_normal(8) + 1.0,
            "D": np.arange(8),
        }
    )

    g = df.groupby(["A", "B"])

    msg = r"nested renamer is not supported"
    with pytest.raises(SpecificationError, match=msg):
        g["D"].agg({"C": ["sum", "std"]})

    with pytest.raises(SpecificationError, match=msg):
        g["D"].agg({"C": "sum", "D": "std"})
|
||||
|
||||
|
||||
def test_agg_nested_dicts():
    """Nested dict specs raise ``SpecificationError`` on frame and series groupbys."""
    # API change for disallowing these types of nested dicts
    df = DataFrame(
        {
            "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"],
            "B": ["one", "one", "two", "two", "two", "two", "one", "two"],
            "C": np.random.default_rng(2).standard_normal(8) + 1.0,
            "D": np.arange(8),
        }
    )

    g = df.groupby(["A", "B"])

    msg = r"nested renamer is not supported"
    with pytest.raises(SpecificationError, match=msg):
        g.aggregate({"r1": {"C": ["mean", "sum"]}, "r2": {"D": ["mean", "sum"]}})

    with pytest.raises(SpecificationError, match=msg):
        g.agg({"C": {"ra": ["mean", "std"]}, "D": {"rb": ["mean", "std"]}})

    # same name as the original column
    # GH9052
    with pytest.raises(SpecificationError, match=msg):
        g["D"].agg({"result1": np.sum, "result2": np.mean})

    with pytest.raises(SpecificationError, match=msg):
        g["D"].agg({"D": np.sum, "result2": np.mean})
|
||||
|
||||
|
||||
def test_agg_item_by_item_raise_typeerror():
    """A ``TypeError`` raised inside the aggregation function reaches the caller."""
    df = DataFrame(np.random.default_rng(2).integers(10, size=(20, 10)))

    def raiseException(df):
        # Render the received object before raising.
        pprint_thing("----------------------------------------")
        pprint_thing(df.to_string())
        raise TypeError("test")

    with pytest.raises(TypeError, match="test"):
        df.groupby(0).agg(raiseException)
|
||||
|
||||
|
||||
def test_series_agg_multikey():
    """``agg("sum")`` with two callable keys must equal ``sum()`` on the same groupby."""
    ts = Series(
        np.arange(10, dtype=np.float64), index=date_range("2020-01-01", periods=10)
    )
    grouped = ts.groupby([lambda x: x.year, lambda x: x.month])

    result = grouped.agg("sum")
    expected = grouped.sum()
    tm.assert_series_equal(result, expected)
|
||||
|
||||
|
||||
def test_series_agg_multi_pure_python():
    """A Python function that inspects its input matches a trivial lambda.

    ``bad`` checks the backing buffer of each group before returning the same
    constant that the reference ``lambda`` returns.
    """
    data = DataFrame(
        {
            "A": [
                "foo",
                "foo",
                "foo",
                "foo",
                "bar",
                "bar",
                "bar",
                "bar",
                "foo",
                "foo",
                "foo",
            ],
            "B": [
                "one",
                "one",
                "one",
                "two",
                "one",
                "one",
                "one",
                "two",
                "two",
                "two",
                "one",
            ],
            "C": [
                "dull",
                "dull",
                "shiny",
                "dull",
                "dull",
                "shiny",
                "shiny",
                "dull",
                "shiny",
                "shiny",
                "shiny",
            ],
            "D": np.random.default_rng(2).standard_normal(11),
            "E": np.random.default_rng(2).standard_normal(11),
            "F": np.random.default_rng(2).standard_normal(11),
        }
    )

    def bad(x):
        # `.base` is only inspected for ndarray-backed values; an
        # extension-array column has no such attribute to check.
        if isinstance(x.values, np.ndarray):
            assert len(x.values.base) > 0
        return "foo"

    result = data.groupby(["A", "B"]).agg(bad)
    expected = data.groupby(["A", "B"]).agg(lambda x: "foo")
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_agg_consistency():
    """``agg(P1)`` and ``agg([P1])`` must give the same values.

    The list form yields two-level columns, so its first level is taken as the
    expected column labels before comparing.
    """
    # agg with ([]) and () not consistent
    # GH 6715
    def P1(a):
        return np.percentile(a.dropna(), q=1)

    df = DataFrame(
        {
            "col1": [1, 2, 3, 4],
            "col2": [10, 25, 26, 31],
            "date": [
                dt.date(2013, 2, 10),
                dt.date(2013, 2, 10),
                dt.date(2013, 2, 11),
                dt.date(2013, 2, 11),
            ],
        }
    )

    g = df.groupby("date")

    expected = g.agg([P1])
    expected.columns = expected.columns.levels[0]

    result = g.agg(P1)
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_agg_callables():
|
||||
# GH 7929
|
||||
df = DataFrame({"foo": [1, 2], "bar": [3, 4]}).astype(np.int64)
|
||||
|
||||
class fn_class:
|
||||
def __call__(self, x):
|
||||
return sum(x)
|
||||
|
||||
equiv_callables = [
|
||||
sum,
|
||||
np.sum,
|
||||
lambda x: sum(x),
|
||||
lambda x: x.sum(),
|
||||
partial(sum),
|
||||
fn_class(),
|
||||
]
|
||||
|
||||
expected = df.groupby("foo").agg("sum")
|
||||
for ecall in equiv_callables:
|
||||
warn = FutureWarning if ecall is sum or ecall is np.sum else None
|
||||
msg = "using DataFrameGroupBy.sum"
|
||||
with tm.assert_produces_warning(warn, match=msg):
|
||||
result = df.groupby("foo").agg(ecall)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_agg_over_numpy_arrays():
    """Summing a column whose cells are NumPy arrays adds them element-wise per group."""
    # GH 3788
    df = DataFrame(
        [
            [1, np.array([10, 20, 30])],
            [1, np.array([40, 50, 60])],
            [2, np.array([20, 30, 40])],
        ],
        columns=["category", "arraydata"],
    )
    gb = df.groupby("category")

    expected_data = [[np.array([50, 70, 90])], [np.array([20, 30, 40])]]
    expected_index = Index([1, 2], name="category")
    expected_column = ["arraydata"]
    expected = DataFrame(expected_data, index=expected_index, columns=expected_column)

    alt = gb.sum(numeric_only=False)
    tm.assert_frame_equal(alt, expected)

    result = gb.agg("sum", numeric_only=False)
    tm.assert_frame_equal(result, expected)

    # FIXME: the original version of this test called `gb.agg(sum)`
    #  and that raises TypeError if `numeric_only=False` is passed
|
||||
|
||||
|
||||
@pytest.mark.parametrize("as_period", [True, False])
def test_agg_tzaware_non_datetime_result(as_period):
    """Aggregate tz-aware (or period) values with dtype-changing functions.

    Parameters
    ----------
    as_period : bool
        When True, the tz-aware index is localized to naive and converted to a
        daily period index before it becomes column ``b``.
    """
    # discussed in GH#29589, fixed in GH#29641, operating on tzaware values
    # with function that is not dtype-preserving
    dti = date_range("2012-01-01", periods=4, tz="UTC")
    if as_period:
        dti = dti.tz_localize(None).to_period("D")

    df = DataFrame({"a": [0, 0, 1, 1], "b": dti})
    gb = df.groupby("a")

    # Case that _does_ preserve the dtype
    result = gb["b"].agg(lambda x: x.iloc[0])
    expected = Series(dti[::2], name="b")
    expected.index.name = "a"
    tm.assert_series_equal(result, expected)

    # Cases that do _not_ preserve the dtype
    result = gb["b"].agg(lambda x: x.iloc[0].year)
    expected = Series([2012, 2012], name="b")
    expected.index.name = "a"
    tm.assert_series_equal(result, expected)

    result = gb["b"].agg(lambda x: x.iloc[-1] - x.iloc[0])
    # The period case expects Day offsets, the datetime case Timedeltas.
    one_day = pd.offsets.Day(1) if as_period else pd.Timedelta(days=1)
    expected = Series([one_day, one_day], name="b")
    expected.index.name = "a"
    tm.assert_series_equal(result, expected)
|
||||
|
||||
|
||||
def test_agg_timezone_round_trip():
|
||||
# GH 15426
|
||||
ts = pd.Timestamp("2016-01-01 12:00:00", tz="US/Pacific")
|
||||
df = DataFrame({"a": 1, "b": [ts + dt.timedelta(minutes=nn) for nn in range(10)]})
|
||||
|
||||
result1 = df.groupby("a")["b"].agg("min").iloc[0]
|
||||
result2 = df.groupby("a")["b"].agg(lambda x: np.min(x)).iloc[0]
|
||||
result3 = df.groupby("a")["b"].min().iloc[0]
|
||||
|
||||
assert result1 == ts
|
||||
assert result2 == ts
|
||||
assert result3 == ts
|
||||
|
||||
dates = [
|
||||
pd.Timestamp(f"2016-01-0{i:d} 12:00:00", tz="US/Pacific") for i in range(1, 5)
|
||||
]
|
||||
df = DataFrame({"A": ["a", "b"] * 2, "B": dates})
|
||||
grouped = df.groupby("A")
|
||||
|
||||
ts = df["B"].iloc[0]
|
||||
assert ts == grouped.nth(0)["B"].iloc[0]
|
||||
assert ts == grouped.head(1)["B"].iloc[0]
|
||||
assert ts == grouped.first()["B"].iloc[0]
|
||||
|
||||
# GH#27110 applying iloc should return a DataFrame
|
||||
msg = "DataFrameGroupBy.apply operated on the grouping columns"
|
||||
with tm.assert_produces_warning(DeprecationWarning, match=msg):
|
||||
assert ts == grouped.apply(lambda x: x.iloc[0]).iloc[0, 1]
|
||||
|
||||
ts = df["B"].iloc[2]
|
||||
assert ts == grouped.last()["B"].iloc[0]
|
||||
|
||||
# GH#27110 applying iloc should return a DataFrame
|
||||
msg = "DataFrameGroupBy.apply operated on the grouping columns"
|
||||
with tm.assert_produces_warning(DeprecationWarning, match=msg):
|
||||
assert ts == grouped.apply(lambda x: x.iloc[-1]).iloc[0, 1]
|
||||
|
||||
|
||||
def test_sum_uint64_overflow():
    """Sum object-dtype integers above the int64 maximum without overflow.

    Also checks that ``numeric_only=True`` drops the object column, leaving a
    frame with no columns.
    """
    # see gh-14758
    # Convert to uint64 and don't overflow
    df = DataFrame([[1, 2], [3, 4], [5, 6]], dtype=object)
    df = df + 9223372036854775807

    index = Index(
        [9223372036854775808, 9223372036854775810, 9223372036854775812], dtype=np.uint64
    )
    expected = DataFrame(
        {1: [9223372036854775809, 9223372036854775811, 9223372036854775813]},
        index=index,
        dtype=object,
    )

    expected.index.name = 0
    result = df.groupby(0).sum(numeric_only=False)
    tm.assert_frame_equal(result, expected)

    # out column is non-numeric, so with numeric_only=True it is dropped
    result2 = df.groupby(0).sum(numeric_only=True)
    expected2 = expected[[]]
    tm.assert_frame_equal(result2, expected2)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "structure, expected",
    [
        (tuple, DataFrame({"C": {(1, 1): (1, 1, 1), (3, 4): (3, 4, 4)}})),
        (list, DataFrame({"C": {(1, 1): [1, 1, 1], (3, 4): [3, 4, 4]}})),
        (
            lambda x: tuple(x),
            DataFrame({"C": {(1, 1): (1, 1, 1), (3, 4): (3, 4, 4)}}),
        ),
        (
            lambda x: list(x),
            DataFrame({"C": {(1, 1): [1, 1, 1], (3, 4): [3, 4, 4]}}),
        ),
    ],
)
def test_agg_structs_dataframe(structure, expected):
    """Aggregate each group of a frame into a tuple or list.

    Parameters
    ----------
    structure : callable
        ``tuple``, ``list`` or a lambda wrapping one of them.
    expected : DataFrame
        Expected result; its index levels are named ``A`` and ``B`` here.
    """
    df = DataFrame(
        {"A": [1, 1, 1, 3, 3, 3], "B": [1, 1, 1, 4, 4, 4], "C": [1, 1, 1, 3, 4, 4]}
    )

    result = df.groupby(["A", "B"]).aggregate(structure)
    # rename_axis returns a new object, so the parametrized frame created at
    # collection time is not modified in place.
    expected = expected.rename_axis(["A", "B"])
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "structure, expected",
    [
        (tuple, Series([(1, 1, 1), (3, 4, 4)], index=[1, 3], name="C")),
        (list, Series([[1, 1, 1], [3, 4, 4]], index=[1, 3], name="C")),
        (lambda x: tuple(x), Series([(1, 1, 1), (3, 4, 4)], index=[1, 3], name="C")),
        (lambda x: list(x), Series([[1, 1, 1], [3, 4, 4]], index=[1, 3], name="C")),
    ],
)
def test_agg_structs_series(structure, expected):
    """Aggregate each group of a single column into a tuple or list.

    Parameters
    ----------
    structure : callable
        ``tuple``, ``list`` or a lambda wrapping one of them.
    expected : Series
        Expected result; its index is named ``A`` here.
    """
    # Issue #18079
    df = DataFrame(
        {"A": [1, 1, 1, 3, 3, 3], "B": [1, 1, 1, 4, 4, 4], "C": [1, 1, 1, 3, 4, 4]}
    )

    result = df.groupby("A")["C"].aggregate(structure)
    # rename_axis returns a new object, so the parametrized series created at
    # collection time is not modified in place.
    expected = expected.rename_axis("A")
    tm.assert_series_equal(result, expected)
|
||||
|
||||
|
||||
def test_agg_category_nansum(observed):
|
||||
categories = ["a", "b", "c"]
|
||||
df = DataFrame(
|
||||
{"A": pd.Categorical(["a", "a", "b"], categories=categories), "B": [1, 2, 3]}
|
||||
)
|
||||
msg = "using SeriesGroupBy.sum"
|
||||
with tm.assert_produces_warning(FutureWarning, match=msg):
|
||||
result = df.groupby("A", observed=observed).B.agg(np.nansum)
|
||||
expected = Series(
|
||||
[3, 3, 0],
|
||||
index=pd.CategoricalIndex(["a", "b", "c"], categories=categories, name="A"),
|
||||
name="B",
|
||||
)
|
||||
if observed:
|
||||
expected = expected[expected != 0]
|
||||
tm.assert_series_equal(result, expected)
|
||||
|
||||
|
||||
def test_agg_list_like_func():
    """A dict spec whose function returns a list yields one list per group."""
    # GH 18473
    df = DataFrame({"A": [str(x) for x in range(3)], "B": [str(x) for x in range(3)]})
    grouped = df.groupby("A", as_index=False, sort=False)
    result = grouped.agg({"B": lambda x: list(x)})
    expected = DataFrame(
        {"A": [str(x) for x in range(3)], "B": [[str(x)] for x in range(3)]}
    )
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_agg_lambda_with_timezone():
    """A lambda returning ``head(1)`` of a UTC column yields the first timestamp."""
    # GH 23683
    df = DataFrame(
        {
            "tag": [1, 1],
            "date": [
                pd.Timestamp("2018-01-01", tz="UTC"),
                pd.Timestamp("2018-01-02", tz="UTC"),
            ],
        }
    )
    result = df.groupby("tag").agg({"date": lambda e: e.head(1)})
    expected = DataFrame(
        [pd.Timestamp("2018-01-01", tz="UTC")],
        index=Index([1], name="tag"),
        columns=["date"],
    )
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "err_cls",
    [
        NotImplementedError,
        RuntimeError,
        KeyError,
        IndexError,
        OSError,
        ValueError,
        ArithmeticError,
        AttributeError,
    ],
)
def test_groupby_agg_err_catching(err_cls):
    """Aggregate a decimal extension column with a function that may raise.

    Parameters
    ----------
    err_cls : type
        Exception class that ``weird_func`` raises when handed empty input.
    """
    # make sure we suppress anything other than TypeError or AssertionError
    #  in _python_agg_general

    # Use a non-standard EA to make sure we don't go down ndarray paths
    from pandas.tests.extension.decimal.array import (
        DecimalArray,
        make_data,
        to_decimal,
    )

    data = make_data()[:5]
    df = DataFrame(
        {"id1": [0, 0, 0, 1, 1], "id2": [0, 1, 0, 1, 1], "decimals": DecimalArray(data)}
    )

    # First element of each `id1` group: positions 0 and 3.
    expected = Series(to_decimal([data[0], data[3]]))

    def weird_func(x):
        # weird function that raises something other than TypeError or
        #  AssertionError in _python_agg_general
        if len(x) == 0:
            raise err_cls
        return x.iloc[0]

    result = df["decimals"].groupby(df["id1"]).agg(weird_func)
    tm.assert_series_equal(result, expected, check_names=False)
|
||||
Reference in New Issue
Block a user