Test with PySpark 3.2 (#2203)
Adds a test matrix entry to test against PySpark `3.2`.
Also upgrades the PySpark `3.1` test entry to `3.1.2`.
ueshin committed Oct 18, 2021
1 parent f44c050 commit a7e7bc7
Showing 4 changed files with 18 additions and 13 deletions.
7 changes: 6 additions & 1 deletion .github/workflows/master.yml
@@ -28,10 +28,15 @@ jobs:
             numpy-version: 1.19.5
             default-index-type: 'distributed-sequence'
           - python-version: 3.9
-            spark-version: 3.1.1
+            spark-version: 3.1.2
             pandas-version: 1.2.5
             pyarrow-version: 3.0.0
             numpy-version: 1.20.3
+          - python-version: 3.9
+            spark-version: 3.2.0
+            pandas-version: 1.2.5
+            pyarrow-version: 4.0.1
+            numpy-version: 1.21.2
             default-index-type: 'distributed-sequence'
     env:
       PYTHON_VERSION: ${{ matrix.python-version }}
14 changes: 7 additions & 7 deletions databricks/koalas/groupby.py
@@ -1073,7 +1073,7 @@ def apply(self, func, *args, **kwargs) -> Union[DataFrame, Series]:
>>> def plus_max(x) -> ks.Series[np.int]:
... return x + x.max()
- >>> df.B.groupby(df.A).apply(plus_max).sort_index()
+ >>> df.B.groupby(df.A).apply(plus_max).sort_index() # doctest: +SKIP
0 6
1 3
2 4
@@ -1091,7 +1091,7 @@ def apply(self, func, *args, **kwargs) -> Union[DataFrame, Series]:
>>> def plus_length(x) -> np.int:
... return len(x)
- >>> df.B.groupby(df.A).apply(plus_length).sort_index()
+ >>> df.B.groupby(df.A).apply(plus_length).sort_index() # doctest: +SKIP
0 1
1 2
Name: B, dtype: int64
@@ -1100,7 +1100,7 @@ def apply(self, func, *args, **kwargs) -> Union[DataFrame, Series]:
>>> def calculation(x, y, z) -> np.int:
... return len(x) + y * z
- >>> df.B.groupby(df.A).apply(calculation, 5, z=10).sort_index()
+ >>> df.B.groupby(df.A).apply(calculation, 5, z=10).sort_index() # doctest: +SKIP
0 51
1 52
Name: B, dtype: int64
@@ -1903,12 +1903,12 @@ def tail(self, n=5) -> Union[DataFrame, Series]:
... 'b': [2, 3, 1, 4, 6, 9, 8, 10, 7, 5],
... 'c': [3, 5, 2, 5, 1, 2, 6, 4, 3, 6]},
... columns=['a', 'b', 'c'],
- ... index=[7, 2, 4, 1, 3, 4, 9, 10, 5, 6])
+ ... index=[7, 2, 3, 1, 3, 4, 9, 10, 5, 6])
>>> df
a b c
7 1 2 3
2 1 3 5
- 4 1 1 2
+ 3 1 1 2
1 1 4 5
3 2 6 1
4 2 9 2
>>> df.groupby('a').tail(2).sort_index()
a b c
1 1 4 5
+ 3 1 1 2
4 2 9 2
- 4 1 1 2
5 3 7 3
6 3 5 6
9 2 8 6
>>> df.groupby('a')['b'].tail(2).sort_index()
1 4
+ 3 1
4 9
- 4 1
5 7
6 5
9 8
6 changes: 3 additions & 3 deletions databricks/koalas/indexes/base.py
@@ -1381,12 +1381,12 @@ def symmetric_difference(self, other, result_name=None, sort=None) -> "Index":
>>> s1 = ks.Series([1, 2, 3, 4], index=[1, 2, 3, 4])
>>> s2 = ks.Series([1, 2, 3, 4], index=[2, 3, 4, 5])
- >>> s1.index.symmetric_difference(s2.index)
+ >>> s1.index.symmetric_difference(s2.index) # doctest: +SKIP
Int64Index([5, 1], dtype='int64')
You can set name of result Index.
- >>> s1.index.symmetric_difference(s2.index, result_name='koalas')
+ >>> s1.index.symmetric_difference(s2.index, result_name='koalas') # doctest: +SKIP
Int64Index([5, 1], dtype='int64', name='koalas')
You can set sort to `True`, if you want to sort the resulting index.
@@ -1396,7 +1396,7 @@ def symmetric_difference(self, other, result_name=None, sort=None) -> "Index":
You can also use the ``^`` operator:
- >>> s1.index ^ s2.index
+ >>> s1.index ^ s2.index # doctest: +SKIP
Int64Index([5, 1], dtype='int64')
"""
if type(self) != type(other):
4 changes: 2 additions & 2 deletions dev/pytest
@@ -41,8 +41,8 @@ fi

# Runs both doctests and unit tests by default, otherwise hands arguments over to pytest.
if [ "$#" = 0 ]; then
if [[ "$SPARK_VERSION" == 2.3* ]] || [[ "$SPARK_VERSION" == 2.4.1* ]] || [[ "$SPARK_VERSION" == 2.4.2* ]]; then
# Delta requires Spark 2.4.2+. We skip the related doctests.
if [[ "$SPARK_VERSION" == 2.3* ]] || [[ "$SPARK_VERSION" == 2.4.1* ]] || [[ "$SPARK_VERSION" == 2.4.2* ]] || [[ "$SPARK_VERSION" == 3.2.* ]]; then
# Delta requires Spark 2.4.2+, and doesn't support Spark 3.2+ yet. We skip the related doctests.
if [[ "$SPARK_VERSION" == 2.3* ]]; then
$PYTHON_EXECUTABLE -m pytest --cov=databricks --cov-report xml:"$FWDIR/coverage.xml" -k "not (melt or to_delta or read_delta or to_clipboard)" --verbose --showlocals --color=yes --doctest-modules databricks "${logopts[@]}"
else
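For context, a rough sketch of exercising the new PySpark 3.2 combination locally with the dev/pytest script above, assuming the package versions from the new workflow matrix entry (which pairs them with Python 3.9); how CI installs these packages and derives SPARK_VERSION is not shown in this diff.

    # Hypothetical local setup mirroring the new PySpark 3.2 matrix entry; versions taken from master.yml above.
    pip install 'pyspark==3.2.0' 'pandas==1.2.5' 'pyarrow==4.0.1' 'numpy==1.21.2'
    # With no arguments the script runs both doctests and unit tests; any arguments are handed over to pytest.
    ./dev/pytest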
