Implemented GroupBy.median() (#1957)

This PR proposes `GroupBy.median()`.

Note: the result can be slightly different from pandas since we use an approximated median based upon approximate percentile computation because computing median across a large dataset is extremely expensive.

```python
>>> kdf = ks.DataFrame({'a': [1., 1., 1., 1., 2., 2., 2., 3., 3., 3.],
...                     'b': [2., 3., 1., 4., 6., 9., 8., 10., 7., 5.],
...                     'c': [3., 5., 2., 5., 1., 2., 6., 4., 3., 6.]},
...                    columns=['a', 'b', 'c'],
...                    index=[7, 2, 4, 1, 3, 4, 9, 10, 5, 6])
>>> kdf
      a     b    c
7   1.0   2.0  3.0
2   1.0   3.0  5.0
4   1.0   1.0  2.0
1   1.0   4.0  5.0
3   2.0   6.0  1.0
4   2.0   9.0  2.0
9   2.0   8.0  6.0
10  3.0  10.0  4.0
5   3.0   7.0  3.0
6   3.0   5.0  6.0

>>> kdf.groupby('a').median().sort_index()  # doctest: +NORMALIZE_WHITESPACE
       b    c
a
1.0  2.0  3.0
2.0  8.0  2.0
3.0  7.0  4.0

>>> kdf.groupby('a')['b'].median().sort_index()
a
1.0    2.0
2.0    8.0
3.0    7.0
Name: b, dtype: float64
```

ref #1929
databricks · Dec 11, 2020 · 78b1004 · 78b1004
1 parent bb31489
commit 78b1004
Show file tree

Hide file tree

Showing 4 changed files with 93 additions and 2 deletions.
diff --git a/databricks/koalas/groupby.py b/databricks/koalas/groupby.py
@@ -71,6 +71,7 @@
 from databricks.koalas.spark.utils import as_nullable_spark_type, force_decimal_precision_scale
 from databricks.koalas.window import RollingGroupby, ExpandingGroupby
 from databricks.koalas.exceptions import DataError
+from databricks.koalas.spark import functions as SF
 
 # to keep it the same as pandas
 NamedAgg = namedtuple("NamedAgg", ["column", "aggfunc"])
@@ -2343,6 +2344,73 @@ def get_group(self, name) -> Union[DataFrame, Series]:
 
         return DataFrame(internal)
 
+    def median(self, numeric_only=True, accuracy=10000) -> Union[DataFrame, Series]:
+        """
+        Compute median of groups, excluding missing values.
+
+        For multiple groupings, the result index will be a MultiIndex.
+
+        .. note:: Unlike pandas, the median in Koalas is an approximated median based upon
+            approximate percentile computation because computing median across a large dataset
+            is extremely expensive.
+
+        Parameters
+        ----------
+        numeric_only : bool, default True
+            Include only float, int, boolean columns. False is not supported. This parameter
+            is mainly for pandas compatibility.
+        accuracy : int, default 10000
+            Accuracy argument forwarded unchanged to the approximate percentile computation
+            (``SF.percentile_approx``). Must be an integer. Presumably this maps onto Spark's
+            ``percentile_approx``, where a larger value gives a better approximation at the
+            cost of memory and ``1.0 / accuracy`` is the relative error -- confirm against
+            the Spark documentation.
+
+        Returns
+        -------
+        Series or DataFrame
+            Median of values within each group.
+
+        Raises
+        ------
+        ValueError
+            If `accuracy` is not an integer.
+
+        Examples
+        --------
+        >>> kdf = ks.DataFrame({'a': [1., 1., 1., 1., 2., 2., 2., 3., 3., 3.],
+        ...                     'b': [2., 3., 1., 4., 6., 9., 8., 10., 7., 5.],
+        ...                     'c': [3., 5., 2., 5., 1., 2., 6., 4., 3., 6.]},
+        ...                    columns=['a', 'b', 'c'],
+        ...                    index=[7, 2, 4, 1, 3, 4, 9, 10, 5, 6])
+        >>> kdf
+              a     b    c
+        7   1.0   2.0  3.0
+        2   1.0   3.0  5.0
+        4   1.0   1.0  2.0
+        1   1.0   4.0  5.0
+        3   2.0   6.0  1.0
+        4   2.0   9.0  2.0
+        9   2.0   8.0  6.0
+        10  3.0  10.0  4.0
+        5   3.0   7.0  3.0
+        6   3.0   5.0  6.0
+
+        DataFrameGroupBy
+
+        >>> kdf.groupby('a').median().sort_index()  # doctest: +NORMALIZE_WHITESPACE
+               b    c
+        a
+        1.0  2.0  3.0
+        2.0  8.0  2.0
+        3.0  7.0  4.0
+
+        SeriesGroupBy
+
+        >>> kdf.groupby('a')['b'].median().sort_index()
+        a
+        1.0    2.0
+        2.0    8.0
+        3.0    7.0
+        Name: b, dtype: float64
+        """
+        # Validate up front so a bad `accuracy` fails with a clear message before the
+        # aggregation is built.
+        if not isinstance(accuracy, int):
+            raise ValueError(
+                "accuracy must be an integer; however, got [%s]" % type(accuracy).__name__
+            )
+
+        # The median is the 50th percentile, hence the fixed 0.5 percentage.
+        stat_function = lambda col: SF.percentile_approx(col, 0.5, accuracy)
+        return self._reduce_for_stat_function(stat_function, only_numeric=numeric_only)
+
     def _reduce_for_stat_function(self, sfun, only_numeric):
         agg_columns = self._agg_columns
         agg_columns_scols = self._agg_columns_scols

diff --git a/databricks/koalas/missing/groupby.py b/databricks/koalas/missing/groupby.py
@@ -57,7 +57,6 @@ class MissingPandasLikeDataFrameGroupBy(object):
 
     # Functions
     boxplot = _unsupported_function("boxplot")
-    median = _unsupported_function("median")
     ngroup = _unsupported_function("ngroup")
     nth = _unsupported_function("nth")
     ohlc = _unsupported_function("ohlc")
@@ -93,7 +92,6 @@ class MissingPandasLikeSeriesGroupBy(object):
     agg = _unsupported_function("agg")
     aggregate = _unsupported_function("aggregate")
     describe = _unsupported_function("describe")
-    median = _unsupported_function("median")
     ngroup = _unsupported_function("ngroup")
     nth = _unsupported_function("nth")
     ohlc = _unsupported_function("ohlc")

diff --git a/databricks/koalas/tests/test_groupby.py b/databricks/koalas/tests/test_groupby.py
@@ -2609,6 +2609,30 @@ def test_get_group(self):
             ValueError, lambda: kdf.groupby([("B", "class"), ("A", "name")]).get_group("mammal")
         )
 
+    def test_median(self):
+        """Check GroupBy.median() on a DataFrame and a Series, plus accuracy validation."""
+        kdf = ks.DataFrame(
+            {
+                "a": [1.0, 1.0, 1.0, 1.0, 2.0, 2.0, 2.0, 3.0, 3.0, 3.0],
+                "b": [2.0, 3.0, 1.0, 4.0, 6.0, 9.0, 8.0, 10.0, 7.0, 5.0],
+                "c": [3.0, 5.0, 2.0, 5.0, 1.0, 2.0, 6.0, 4.0, 3.0, 6.0],
+            },
+            columns=["a", "b", "c"],
+            index=[7, 2, 4, 1, 3, 4, 9, 10, 5, 6],
+        )
+        # DataFrame
+        # Expected values are hard-coded instead of being computed with pandas
+        # (presumably because the Koalas median is approximate and may differ from pandas).
+        expected_result = ks.DataFrame(
+            {"b": [2.0, 8.0, 7.0], "c": [3.0, 2.0, 4.0]}, index=pd.Index([1.0, 2.0, 3.0], name="a")
+        )
+        # sort_index() makes the comparison independent of the order groups come back in.
+        self.assert_eq(expected_result, kdf.groupby("a").median().sort_index())
+        # Series
+        expected_result = ks.Series(
+            [2.0, 8.0, 7.0], name="b", index=pd.Index([1.0, 2.0, 3.0], name="a")
+        )
+        self.assert_eq(expected_result, kdf.groupby("a")["b"].median().sort_index())
+
+        # A non-integer accuracy must be rejected with a ValueError.
+        with self.assertRaisesRegex(ValueError, "accuracy must be an integer; however"):
+            kdf.groupby("a").median(accuracy="a")
+
     def test_tail(self):
         pdf = pd.DataFrame(
             {

diff --git a/docs/source/reference/groupby.rst b/docs/source/reference/groupby.rst
@@ -51,6 +51,7 @@ Computations / Descriptive Stats
    GroupBy.last
    GroupBy.max
    GroupBy.mean
+   GroupBy.median
    GroupBy.min
    GroupBy.rank
    GroupBy.std