Implement DataFrame.first and Series.first functionality (#2128)

This change implements DataFrame.first and Series.first, mirroring the functionality available in pandas. The requirement was raised in issue #1929.

```python
>>> index = pd.date_range('2018-04-09', periods=4, freq='2D')
>>> ks_series = ks.Series([1, 2, 3, 4], index=index)
>>> ks_series
2018-04-09    1
2018-04-11    2
2018-04-13    3
2018-04-15    4
dtype: int64

>>> ks_series.first('3D')
2018-04-09    1
2018-04-11    2
dtype: int64
```
databricks · Mar 31, 2021 · 0565e14 · 0565e14
1 parent 07c4e36
commit 0565e14
Show file tree

Hide file tree

Showing 8 changed files with 133 additions and 18 deletions.
diff --git a/databricks/koalas/frame.py b/databricks/koalas/frame.py
@@ -3077,14 +3077,12 @@ def between_time(
         2018-04-09 00:00:00  1
         2018-04-12 01:00:00  4
         """
-        from databricks.koalas.indexes import DatetimeIndex
-
         axis = validate_axis(axis)
 
         if axis != 0:
             raise NotImplementedError("between_time currently only works for axis=0")
 
-        if not isinstance(self.index, DatetimeIndex):
+        if not isinstance(self.index, ks.DatetimeIndex):
             raise TypeError("Index must be DatetimeIndex")
 
         kdf = self.copy()
@@ -3150,8 +3148,6 @@ def at_time(
         2018-04-09 12:00:00  2
         2018-04-10 12:00:00  4
         """
-        from databricks.koalas.indexes import DatetimeIndex
-
         if asof:
             raise NotImplementedError("'asof' argument is not supported")
 
@@ -3160,7 +3156,7 @@ def at_time(
         if axis != 0:
             raise NotImplementedError("at_time currently only works for axis=0")
 
-        if not isinstance(self.index, DatetimeIndex):
+        if not isinstance(self.index, ks.DatetimeIndex):
             raise TypeError("Index must be DatetimeIndex")
 
         kdf = self.copy()
@@ -5801,16 +5797,69 @@ def last(self, offset: Union[str, DateOffset]) -> "DataFrame":
         not returned.
         """
         # Check index type should be format DateTime
-        from databricks.koalas.indexes import DatetimeIndex
-
-        if not isinstance(self.index, DatetimeIndex):
+        if not isinstance(self.index, ks.DatetimeIndex):
             raise TypeError("'last' only supports a DatetimeIndex")
 
         offset = to_offset(offset)
         from_date = self.index.max() - offset
 
         return cast(DataFrame, self.loc[from_date:])
 
+    def first(self, offset: Union[str, DateOffset]) -> "DataFrame":
+        """
+        Select first periods of time series data based on a date offset.
+
+        When having a DataFrame with dates as index, this function can
+        select the first few rows based on a date offset.
+
+        Parameters
+        ----------
+        offset : str or DateOffset
+            The offset length of the data that will be selected. For instance,
+            '3D' will display all the rows having their index within the first 3 days.
+
+        Returns
+        -------
+        DataFrame
+            A subset of the caller.
+
+        Raises
+        ------
+        TypeError
+            If the index is not a :class:`DatetimeIndex`
+
+        Examples
+        --------
+
+        >>> index = pd.date_range('2018-04-09', periods=4, freq='2D')
+        >>> kdf = ks.DataFrame({'A': [1, 2, 3, 4]}, index=index)
+        >>> kdf
+                    A
+        2018-04-09  1
+        2018-04-11  2
+        2018-04-13  3
+        2018-04-15  4
+
+        Get the rows for the first 3 days:
+
+        >>> kdf.first('3D')
+                    A
+        2018-04-09  1
+        2018-04-11  2
+
+        Notice that the data for the first 3 calendar days were returned, not
+        the first 3 observed days in the dataset, and therefore data for
+        2018-04-13 was not returned.
+        """
+        # 'first' is only defined for a DatetimeIndex; reject any other index type.
+        if not isinstance(self.index, ks.DatetimeIndex):
+            raise TypeError("'first' only supports a DatetimeIndex")
+
+        offset = to_offset(offset)
+        to_date = self.index.min() + offset
+
+        return cast(DataFrame, self.loc[:to_date])
+
     def pivot_table(
         self, values=None, index=None, columns=None, aggfunc="mean", fill_value=None
     ) -> "DataFrame":

diff --git a/databricks/koalas/missing/frame.py b/databricks/koalas/missing/frame.py
@@ -45,7 +45,6 @@ class _MissingPandasLikeDataFrame(object):
     corrwith = _unsupported_function("corrwith")
     cov = _unsupported_function("cov")
     ewm = _unsupported_function("ewm")
-    first = _unsupported_function("first")
     infer_objects = _unsupported_function("infer_objects")
     interpolate = _unsupported_function("interpolate")
     lookup = _unsupported_function("lookup")

diff --git a/databricks/koalas/missing/series.py b/databricks/koalas/missing/series.py
@@ -42,7 +42,6 @@ class MissingPandasLikeSeries(object):
     convert_dtypes = _unsupported_function("convert_dtypes")
     cov = _unsupported_function("cov")
     ewm = _unsupported_function("ewm")
-    first = _unsupported_function("first")
     infer_objects = _unsupported_function("infer_objects")
     interpolate = _unsupported_function("interpolate")
     reorder_levels = _unsupported_function("reorder_levels")

diff --git a/databricks/koalas/series.py b/databricks/koalas/series.py
@@ -2246,8 +2246,8 @@ def last(self, offset: Union[str, DateOffset]) -> "Series":
         Examples
         --------
         >>> index = pd.date_range('2018-04-09', periods=4, freq='2D')
-        >>> ks_series = ks.Series([1, 2, 3, 4], index=index)
-        >>> ks_series
+        >>> kser = ks.Series([1, 2, 3, 4], index=index)
+        >>> kser
         2018-04-09    1
         2018-04-11    2
         2018-04-13    3
@@ -2256,7 +2256,7 @@ def last(self, offset: Union[str, DateOffset]) -> "Series":
 
         Get the rows for the last 3 days:
 
-        >>> ks_series.last('3D')
+        >>> kser.last('3D')
         2018-04-13    3
         2018-04-15    4
         dtype: int64
@@ -2267,6 +2267,53 @@ def last(self, offset: Union[str, DateOffset]) -> "Series":
         """
         return first_series(self.to_frame().last(offset)).rename(self.name)
 
+    def first(self, offset: Union[str, DateOffset]) -> "Series":
+        """
+        Select first periods of time series data based on a date offset.
+
+        When having a Series with dates as index, this function can
+        select the first few elements based on a date offset.
+
+        Parameters
+        ----------
+        offset : str or DateOffset
+            The offset length of the data that will be selected. For instance,
+            '3D' will display all the rows having their index within the first 3 days.
+
+        Returns
+        -------
+        Series
+            A subset of the caller.
+
+        Raises
+        ------
+        TypeError
+            If the index is not a :class:`DatetimeIndex`
+
+        Examples
+        --------
+        >>> index = pd.date_range('2018-04-09', periods=4, freq='2D')
+        >>> kser = ks.Series([1, 2, 3, 4], index=index)
+        >>> kser
+        2018-04-09    1
+        2018-04-11    2
+        2018-04-13    3
+        2018-04-15    4
+        dtype: int64
+
+        Get the rows for the first 3 days:
+
+        >>> kser.first('3D')
+        2018-04-09    1
+        2018-04-11    2
+        dtype: int64
+
+        Notice that the data for the first 3 calendar days were returned, not
+        the first 3 observed days in the dataset, and therefore data for
+        2018-04-13 was not returned.
+        """
+        return first_series(self.to_frame().first(offset)).rename(self.name)
+
     # TODO: Categorical type isn't supported (due to PySpark's limitation) and
     # some doctests related with timestamps were not added.
     def unique(self) -> "Series":

diff --git a/databricks/koalas/tests/test_dataframe.py b/databricks/koalas/tests/test_dataframe.py
@@ -5212,6 +5212,15 @@ def test_last(self):
         with self.assertRaisesRegex(TypeError, "'last' only supports a DatetimeIndex"):
             ks.DataFrame([1, 2, 3, 4]).last("1D")
 
+    def test_first(self):
+        index = pd.date_range("2018-04-09", periods=4, freq="2D")
+        pdf = pd.DataFrame([1, 2, 3, 4], index=index)
+        kdf = ks.from_pandas(pdf)
+        self.assert_eq(pdf.first("1D"), kdf.first("1D"))
+        self.assert_eq(pdf.first(DateOffset(days=1)), kdf.first(DateOffset(days=1)))
+        with self.assertRaisesRegex(TypeError, "'first' only supports a DatetimeIndex"):
+            ks.DataFrame([1, 2, 3, 4]).first("1D")
+
     def test_first_valid_index(self):
         pdf = pd.DataFrame(
             {"a": [None, 2, 3, 2], "b": [None, 2.0, 3.0, 1.0], "c": [None, 200, 400, 200]},

diff --git a/databricks/koalas/tests/test_series.py b/databricks/koalas/tests/test_series.py
@@ -181,12 +181,22 @@ def test_head(self):
         self.assert_eq(kser.head(-10), pser.head(-10))
 
     def test_last(self):
-        index = pd.date_range("2018-04-09", periods=4, freq="2D")
-        pd_input = pd.Series([1, 2, 3, 4], index=index)
-        ks_input = ks.Series([1, 2, 3, 4], index=index)
         with self.assertRaises(TypeError):
             self.kser.last("1D")
-        self.assert_eq(ks_input.last("1D"), pd_input.last("1D"))
+
+        index = pd.date_range("2018-04-09", periods=4, freq="2D")
+        pser = pd.Series([1, 2, 3, 4], index=index)
+        kser = ks.from_pandas(pser)
+        self.assert_eq(kser.last("1D"), pser.last("1D"))
+
+    def test_first(self):
+        with self.assertRaises(TypeError):
+            self.kser.first("1D")
+
+        index = pd.date_range("2018-04-09", periods=4, freq="2D")
+        pser = pd.Series([1, 2, 3, 4], index=index)
+        kser = ks.from_pandas(pser)
+        self.assert_eq(kser.first("1D"), pser.first("1D"))
 
     def test_rename(self):
         pser = pd.Series([1, 2, 3, 4, 5, 6, 7], name="x")

diff --git a/docs/source/reference/frame.rst b/docs/source/reference/frame.rst
@@ -172,6 +172,7 @@ Reindexing / Selection / Label manipulation
    DataFrame.duplicated
    DataFrame.equals
    DataFrame.filter
+   DataFrame.first
    DataFrame.head
    DataFrame.last
    DataFrame.rename

diff --git a/docs/source/reference/series.rst b/docs/source/reference/series.rst
@@ -167,6 +167,7 @@ Reindexing / Selection / Label manipulation
    Series.equals
    Series.add_prefix
    Series.add_suffix
+   Series.first
    Series.head
    Series.idxmax
    Series.idxmin