Implement DataFrame.itertuples (#1960)

ref #1929

```
>>> df = ks.DataFrame({'num_legs': [4, 2], 'num_wings': [0, 2]},
...                   index=['dog', 'hawk'])
>>> df
      num_legs  num_wings
dog          4          0
hawk         2          2

>>> for row in df.itertuples():
...     print(row)
...
Koalas(Index='dog', num_legs=4, num_wings=0)
Koalas(Index='hawk', num_legs=2, num_wings=2)
```
databricks · Dec 10, 2020 · 02133a8 · 02133a8
1 parent 2c23b2a
commit 02133a8
Show file tree

Hide file tree

Showing 4 changed files with 137 additions and 2 deletions.
diff --git a/databricks/koalas/frame.py b/databricks/koalas/frame.py
@@ -17,7 +17,7 @@
 """
 A wrapper class for Spark DataFrame to behave similar to pandas DataFrame.
 """
-from collections import OrderedDict, defaultdict
+from collections import OrderedDict, defaultdict, namedtuple
 from collections.abc import Mapping
 from distutils.version import LooseVersion
 import re
@@ -1436,6 +1436,103 @@ def extract_kv_from_spark_row(row):
             s = pd.Series(v, index=columns, name=k)
             yield k, s
 
+    def itertuples(self, index: bool = True, name: Optional[str] = "Koalas") -> Iterator:
+        """
+        Yield each row of the DataFrame as a namedtuple.
+
+        Parameters
+        ----------
+        index : bool, default True
+            Whether the row's index value is placed first in each tuple.
+        name : str or None, default "Koalas"
+            Class name given to the generated namedtuples. Pass None to get
+            plain tuples instead.
+
+        Returns
+        -------
+        iterator
+            An iterator producing one tuple per row: the index value first
+            (only when `index` is True), followed by the column values.
+            With a multi-level index the index value is itself a tuple.
+
+        See Also
+        --------
+        DataFrame.iterrows : Iterate over DataFrame rows as (index, Series)
+            pairs.
+        DataFrame.items : Iterate over (column name, Series) pairs.
+
+        Notes
+        -----
+        Field names are taken from the column names. A column name that is
+        not a valid Python identifier, is repeated, or starts with an
+        underscore is replaced by a positional name.
+        On python versions < 3.7 plain tuples are returned when a row has
+        255 or more fields (the columns plus the index, if included).
+
+        Examples
+        --------
+        >>> df = ks.DataFrame({'num_legs': [4, 2], 'num_wings': [0, 2]},
+        ...                   index=['dog', 'hawk'])
+        >>> df
+              num_legs  num_wings
+        dog          4          0
+        hawk         2          2
+
+        >>> for row in df.itertuples():
+        ...     print(row)
+        ...
+        Koalas(Index='dog', num_legs=4, num_wings=0)
+        Koalas(Index='hawk', num_legs=2, num_wings=2)
+
+        Passing ``index=False`` leaves the index out of each tuple:
+
+        >>> for row in df.itertuples(index=False):
+        ...     print(row)
+        ...
+        Koalas(num_legs=4, num_wings=0)
+        Koalas(num_legs=2, num_wings=2)
+
+        The `name` parameter picks the class name of the yielded
+        namedtuples:
+
+        >>> for row in df.itertuples(name='Animal'):
+        ...     print(row)
+        ...
+        Animal(Index='dog', num_legs=4, num_wings=0)
+        Animal(Index='hawk', num_legs=2, num_wings=2)
+        """
+        # Field names: the optional "Index" slot followed by every column label.
+        field_names = ["Index"] if index else []
+        field_names.extend(self.columns)
+
+        index_column_names = self._internal.index_spark_column_names
+        data_column_names = self._internal.data_spark_column_names
+        has_single_index = len(index_column_names) == 1
+
+        def row_to_values(row):
+            # A single-level index yields a scalar key, a multi-level one a tuple.
+            if has_single_index:
+                key = row[index_column_names[0]]
+            else:
+                key = tuple(row[column] for column in index_column_names)
+            values = [row[column] for column in data_column_names]
+            return [key] + values if index else values
+
+        # Before Python 3.7 a namedtuple could not hold 255 or more fields.
+        can_return_named_tuples = sys.version_info >= (3, 7) or len(self.columns) + index < 255
+
+        # Pick the row constructor once so a single loop serves both output kinds.
+        # The namedtuple class is created before any row is requested.
+        if name is None or not can_return_named_tuples:
+            build_row = tuple
+        else:
+            build_row = namedtuple(name, field_names, rename=True)._make  # type: ignore
+
+        for spark_row in self._internal.resolved_copy.spark_frame.toLocalIterator():
+            yield build_row(row_to_values(spark_row))
+
+
     def items(self) -> Iterator:
         """
         This is an alias of ``iteritems``.
 
         Returns
         -------
         iterator
             The iterator returned by ``iteritems``, passed through unchanged.
         """
         return self.iteritems()

diff --git a/databricks/koalas/missing/frame.py b/databricks/koalas/missing/frame.py
@@ -52,7 +52,6 @@ class _MissingPandasLikeDataFrame(object):
     infer_objects = _unsupported_function("infer_objects")
     insert = _unsupported_function("insert")
     interpolate = _unsupported_function("interpolate")
-    itertuples = _unsupported_function("itertuples")
     last = _unsupported_function("last")
     lookup = _unsupported_function("lookup")
     mode = _unsupported_function("mode")

diff --git a/databricks/koalas/tests/test_dataframe.py b/databricks/koalas/tests/test_dataframe.py
@@ -186,6 +186,44 @@ def test_dataframe_multiindex_names_level(self):
         self.assert_eq(kdf[("X", "A")].to_pandas().columns.names, pdf[("X", "A")].columns.names)
         self.assert_eq(kdf[("X", "A", "Z")], pdf[("X", "A", "Z")])
 
+    def test_itertuples(self):
+        """Compare ``DataFrame.itertuples`` output between pandas and Koalas."""
+
+        def check_rows(ptuples, ktuples):
+            # zip() stops at the shorter iterator, so a Koalas result with missing
+            # or extra rows would pass silently. Materialize both sides and
+            # compare the row counts before comparing row by row.
+            ptuples, ktuples = list(ptuples), list(ktuples)
+            self.assertEqual(len(ptuples), len(ktuples))
+            for ptuple, ktuple in zip(ptuples, ktuples):
+                self.assert_eq(ptuple, ktuple)
+
+        pdf = pd.DataFrame({"num_legs": [4, 2], "num_wings": [0, 2]}, index=["dog", "hawk"])
+        kdf = ks.from_pandas(pdf)
+
+        # Custom name without the index, then plain tuples (name=None).
+        check_rows(
+            pdf.itertuples(index=False, name="Animal"), kdf.itertuples(index=False, name="Animal")
+        )
+        check_rows(pdf.itertuples(name=None), kdf.itertuples(name=None))
+
+        # Multi-level index: the index value of each row is a tuple.
+        pdf.index = pd.MultiIndex.from_arrays(
+            [[1, 2], ["black", "brown"]], names=("count", "color")
+        )
+        kdf = ks.from_pandas(pdf)
+        check_rows(pdf.itertuples(name="Animal"), kdf.itertuples(name="Animal"))
+
+        # Multi-level columns: labels are tuples, not valid identifiers.
+        pdf.columns = pd.MultiIndex.from_arrays(
+            [["CA", "WA"], ["age", "children"]], names=("origin", "info")
+        )
+        kdf = ks.from_pandas(pdf)
+        check_rows(pdf.itertuples(name="Animal"), kdf.itertuples(name="Animal"))
+
+        # Integer column labels, iterated on the result of an arithmetic expression.
+        pdf = pd.DataFrame([1, 2, 3])
+        kdf = ks.from_pandas(pdf)
+        check_rows((pdf + 1).itertuples(name="num"), (kdf + 1).itertuples(name="num"))
+
+        # DataFrames with a large number of columns (>254)
+        pdf = pd.DataFrame(np.random.random((1, 255)))
+        kdf = ks.from_pandas(pdf)
+        check_rows(pdf.itertuples(name="num"), kdf.itertuples(name="num"))
+
+
     def test_iterrows(self):
         pdf = pd.DataFrame(
             {

diff --git a/docs/source/reference/frame.rst b/docs/source/reference/frame.rst
@@ -62,6 +62,7 @@ Indexing, iteration
    DataFrame.items
    DataFrame.iteritems
    DataFrame.iterrows
+   DataFrame.itertuples
    DataFrame.keys
    DataFrame.pop
    DataFrame.tail