Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update compute_dict_like to get all columns #58452

Open
wants to merge 11 commits into
base: main
Choose a base branch
from
1 change: 1 addition & 0 deletions doc/source/whatsnew/v3.0.0.rst
Expand Up @@ -448,6 +448,7 @@ Groupby/resample/rolling
- Bug in :meth:`.DataFrameGroupBy.quantile` when ``interpolation="nearest"`` is inconsistent with :meth:`DataFrame.quantile` (:issue:`47942`)
- Bug in :meth:`.Resampler.interpolate` on a :class:`DataFrame` with non-uniform sampling and/or indices not aligning with the resulting resampled index would result in wrong interpolation (:issue:`21351`)
- Bug in :meth:`DataFrame.ewm` and :meth:`Series.ewm` when passed ``times`` and aggregation functions other than mean (:issue:`51695`)
- Bug in :meth:`.DataFrameGroupBy.agg` that raised ``AttributeError`` when there was dictionary input and duplicated columns, instead of returning a DataFrame with the aggregation of all duplicate columns. (:issue:`55041`)
- Bug in :meth:`DataFrameGroupBy.apply` that was returning a completely empty DataFrame when all return values of ``func`` were ``None`` instead of returning an empty DataFrame with the original columns and dtypes. (:issue:`57775`)
- Bug in :meth:`DataFrameGroupBy.apply` with ``as_index=False`` that was returning :class:`MultiIndex` instead of returning :class:`Index`. (:issue:`58291`)

Expand Down
35 changes: 30 additions & 5 deletions pandas/core/apply.py
Expand Up @@ -471,8 +471,30 @@ def compute_dict_like(

keys += [key] * len(key_data)
results += key_data
else:
elif is_groupby:
# key used for column selection and output

df = selected_obj
results, keys = [], []
for key, how in func.items():
cols = df[key]

if cols.ndim == 1:
series_list = [obj._gotitem(key, ndim=1, subset=cols)]
else:
series_list = []
for index in range(cols.shape[1]):
col = cols.iloc[:, index]

series = obj._gotitem(key, ndim=1, subset=col)
series_list.append(series)

for series in series_list:
result = getattr(series, op_name)(how, **kwargs)
results.append(result)
keys.append(key)

else:
results = [
getattr(obj._gotitem(key, ndim=1), op_name)(how, **kwargs)
for key, how in func.items()
Expand All @@ -496,11 +518,14 @@ def wrap_results_dict_like(
is_ndframe = [isinstance(r, ABCNDFrame) for r in result_data]

if all(is_ndframe):
results = dict(zip(result_index, result_data))
results = [result for result in result_data if not result.empty]
rhshadrach marked this conversation as resolved.
Show resolved Hide resolved
keys_to_use: Iterable[Hashable]
keys_to_use = [k for k in result_index if not results[k].empty]
keys_to_use = [k for k, v in zip(result_index, result_data) if not v.empty]
# Have to check, if at least one DataFrame is not empty.
keys_to_use = keys_to_use if keys_to_use != [] else result_index
if keys_to_use == []:
keys_to_use = result_index
results = result_data

if selected_obj.ndim == 2:
# keys are columns, so we can preserve names
ktu = Index(keys_to_use)
Expand All @@ -509,7 +534,7 @@ def wrap_results_dict_like(

axis: AxisInt = 0 if isinstance(obj, ABCSeries) else 1
result = concat(
{k: results[k] for k in keys_to_use},
results,
axis=axis,
keys=keys_to_use,
)
Expand Down
118 changes: 118 additions & 0 deletions pandas/tests/groupby/aggregate/test_aggregate.py
Expand Up @@ -1663,3 +1663,121 @@ def func(x):
msg = "length must not be 0"
with pytest.raises(ValueError, match=msg):
df.groupby("A", observed=False).agg(func)


def test_groupby_aggregation_duplicate_columns_single_dict_value():
    # GH#55041: a single aggregation keyed by a duplicated column label
    # should aggregate every column carrying that label, not raise.
    data = [[1, 2, 3, 4], [1, 3, 4, 5], [2, 4, 5, 6]]
    frame = DataFrame(data, columns=["a", "b", "c", "c"])

    result = frame.groupby("a").agg({"c": "sum"})

    expected_index = Index([1, 2], name="a")
    expected = DataFrame(
        [[7, 9], [5, 6]], columns=["c", "c"], index=expected_index
    )
    tm.assert_frame_equal(result, expected)


def test_groupby_aggregation_duplicate_columns_multiple_dict_values():
    # GH#55041: a list of aggregations under a duplicated column label
    # applies each function to every matching column.
    data = [[1, 2, 3, 4], [1, 3, 4, 5], [2, 4, 5, 6]]
    frame = DataFrame(data, columns=["a", "b", "c", "c"])

    result = frame.groupby("a").agg({"c": ["sum", "min", "max", "min"]})

    expected_columns = MultiIndex(
        levels=[["c"], ["sum", "min", "max"]],
        codes=[[0] * 8, [0, 1, 2, 1, 0, 1, 2, 1]],
    )
    expected = DataFrame(
        [[7, 3, 4, 3, 9, 4, 5, 4], [5, 5, 5, 5, 6, 6, 6, 6]],
        columns=expected_columns,
        index=Index([1, 2], name="a"),
    )
    tm.assert_frame_equal(result, expected)


def test_groupby_aggregation_duplicate_columns_some_empty_result():
    # GH#55041: a key mapped to an empty aggregation list is dropped from
    # the output, while the remaining duplicated columns are aggregated.
    frame = DataFrame(
        [
            [1, 9843, 43, 54, 7867],
            [2, 940, 9, -34, 44],
            [1, -34, -546, -549358, 0],
            [2, 244, -33, -100, 44],
        ],
        columns=["a", "b", "b", "c", "c"],
    )

    result = frame.groupby("a").agg({"b": [], "c": ["var"]})

    expected_columns = MultiIndex(
        levels=[["c"], ["var"]], codes=[[0, 0], [0, 0]]
    )
    expected = DataFrame(
        [[1.509268e11, 30944844.5], [2.178000e03, 0.0]],
        columns=expected_columns,
        index=Index([1, 2], name="a"),
    )
    tm.assert_frame_equal(result, expected)


def test_groupby_aggregation_multi_index_duplicate_columns():
    # GH#55041: a dict key that is a full MultiIndex tuple matching more
    # than one column aggregates all of those columns.
    mi_columns = MultiIndex(
        levels=[["level1.1", "level1.2"], ["level2.1", "level2.2"]],
        codes=[[0, 0, 0, 1, 1], [0, 1, 1, 0, 1]],
    )
    mi_index = MultiIndex(
        levels=[["level1.1", "level1.2"], ["level2.1", "level2.2"]],
        codes=[[0, 0, 0, 1], [0, 1, 1, 0]],
    )
    frame = DataFrame(
        [
            [1, -9843, 43, 54, 7867],
            [2, 940, 9, -34, 44],
            [1, -34, 546, -549358, 0],
            [2, 244, -33, -100, 44],
        ],
        columns=mi_columns,
        index=mi_index,
    )

    result = frame.groupby(level=0).agg({("level1.1", "level2.2"): "min"})

    expected_columns = MultiIndex(
        levels=[["level1.1"], ["level2.2"]], codes=[[0, 0], [0, 0]]
    )
    expected = DataFrame(
        [[-9843, 9], [244, -33]],
        columns=expected_columns,
        index=Index(["level1.1", "level1.2"]),
    )
    tm.assert_frame_equal(result, expected)


def test_groupby_aggregation_func_list_multi_index_duplicate_columns():
    # GH#55041: a list of aggregations under a duplicated MultiIndex tuple
    # key applies each function to every matching column.
    mi_columns = MultiIndex(
        levels=[["level1.1", "level1.2"], ["level2.1", "level2.2"]],
        codes=[[0, 0, 0, 1, 1], [0, 1, 1, 0, 1]],
    )
    mi_index = MultiIndex(
        levels=[["level1.1", "level1.2"], ["level2.1", "level2.2"]],
        codes=[[0, 0, 0, 1], [0, 1, 1, 0]],
    )
    frame = DataFrame(
        [
            [1, -9843, 43, 54, 7867],
            [2, 940, 9, -34, 44],
            [1, -34, 546, -549358, 0],
            [2, 244, -33, -100, 44],
        ],
        columns=mi_columns,
        index=mi_index,
    )

    result = frame.groupby(level=0).agg(
        {("level1.1", "level2.2"): ["min", "max"]}
    )

    expected_columns = MultiIndex(
        levels=[["level1.1"], ["level2.2"], ["min", "max"]],
        codes=[[0, 0, 0, 0], [0, 0, 0, 0], [0, 1, 0, 1]],
    )
    expected = DataFrame(
        [[-9843, 940, 9, 546], [244, 244, -33, -33]],
        columns=expected_columns,
        index=Index(["level1.1", "level1.2"]),
    )
    tm.assert_frame_equal(result, expected)