Skip to content

Commit

Permalink
Merge branch 'main' into pandas-devGH-15354-phased
Browse files Browse the repository at this point in the history
  • Loading branch information
jreback committed Feb 28, 2022
2 parents 3a9c089 + 367f8a1 commit 675810f
Show file tree
Hide file tree
Showing 201 changed files with 3,400 additions and 2,481 deletions.
54 changes: 0 additions & 54 deletions .github/workflows/datamanger.yml

This file was deleted.

21 changes: 20 additions & 1 deletion .github/workflows/posix.yml
Original file line number Diff line number Diff line change
Expand Up @@ -26,37 +26,52 @@ jobs:
matrix:
env_file: [actions-38.yaml, actions-39.yaml, actions-310.yaml]
pattern: ["not single_cpu", "single_cpu"]
# Don't test pyarrow v2/3: Causes timeouts in read_csv engine
# even if tests are skipped/xfailed
pyarrow_version: ["5", "7"]
include:
- env_file: actions-38-downstream_compat.yaml
pattern: "not slow and not network and not single_cpu"
pytest_target: "pandas/tests/test_downstream.py"
name: "Downstream Compat"
- env_file: actions-38-minimum_versions.yaml
pattern: "not slow and not network and not single_cpu"
name: "Minimum Versions"
- env_file: actions-38.yaml
pattern: "not slow and not network and not single_cpu"
extra_apt: "language-pack-it"
lang: "it_IT.utf8"
lc_all: "it_IT.utf8"
name: "Locale: it_IT.utf8"
- env_file: actions-38.yaml
pattern: "not slow and not network and not single_cpu"
extra_apt: "language-pack-zh-hans"
lang: "zh_CN.utf8"
lc_all: "zh_CN.utf8"
name: "Locale: zh_CN.utf8"
- env_file: actions-38.yaml
pattern: "not slow and not network and not single_cpu"
pandas_data_manager: "array"
name: "Data Manager"
- env_file: actions-pypy-38.yaml
pattern: "not slow and not network and not single_cpu"
test_args: "--max-worker-restart 0"
name: "Pypy"
- env_file: actions-310-numpydev.yaml
pattern: "not slow and not network and not single_cpu"
pandas_testing_mode: "deprecate"
test_args: "-W error"
name: "Numpy Dev"
fail-fast: false
name: ${{ matrix.name || format('{0} pyarrow={1} {2}', matrix.env_file, matrix.pyarrow_version, matrix.pattern) }}
env:
ENV_FILE: ci/deps/${{ matrix.env_file }}
PATTERN: ${{ matrix.pattern }}
EXTRA_APT: ${{ matrix.extra_apt || '' }}
LANG: ${{ matrix.lang || '' }}
LC_ALL: ${{ matrix.lc_all || '' }}
PANDAS_TESTING_MODE: ${{ matrix.pandas_testing_mode || '' }}
PANDAS_DATA_MANAGER: ${{ matrix.pandas_data_manager || 'block' }}
TEST_ARGS: ${{ matrix.test_args || '' }}
PYTEST_WORKERS: ${{ contains(matrix.pattern, 'not single_cpu') && 'auto' || '1' }}
PYTEST_TARGET: ${{ matrix.pytest_target || 'pandas' }}
Expand All @@ -65,7 +80,7 @@ jobs:
COVERAGE: ${{ !contains(matrix.env_file, 'pypy') }}
concurrency:
# https://github.community/t/concurrecy-not-work-for-push/183068/7
group: ${{ github.event_name == 'push' && github.run_number || github.ref }}-${{ matrix.env_file }}-${{ matrix.pattern }}-${{ matrix.extra_apt || '' }}
group: ${{ github.event_name == 'push' && github.run_number || github.ref }}-${{ matrix.env_file }}-${{ matrix.pattern }}-${{ matrix.pyarrow_version || '' }}-${{ matrix.extra_apt || '' }}-${{ matrix.pandas_data_manager || '' }}
cancel-in-progress: true

services:
Expand Down Expand Up @@ -133,6 +148,10 @@ jobs:
use-only-tar-bz2: true
if: ${{ env.IS_PYPY == 'false' }} # No pypy3.8 support

- name: Upgrade Arrow version
run: conda install -n pandas-dev -c conda-forge --no-update-deps pyarrow=${{ matrix.pyarrow_version }}
if: ${{ matrix.pyarrow_version }}

- name: Setup PyPy
uses: actions/setup-python@v2
with:
Expand Down
2 changes: 2 additions & 0 deletions .github/workflows/sdist.yml
Original file line number Diff line number Diff line change
Expand Up @@ -9,11 +9,13 @@ on:
branches:
- main
- 1.4.x
types: [labeled, opened, synchronize, reopened]
paths-ignore:
- "doc/**"

jobs:
build:
if: ${{ github.event.label.name == 'Build' || contains(github.event.pull_request.labels.*.name, 'Build') || github.event_name == 'push'}}
runs-on: ubuntu-latest
timeout-minutes: 60
defaults:
Expand Down
6 changes: 4 additions & 2 deletions asv_bench/benchmarks/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@

method_blocklist = {
"object": {
"diff",
"median",
"prod",
"sem",
Expand Down Expand Up @@ -405,7 +406,7 @@ class GroupByMethods:

param_names = ["dtype", "method", "application", "ncols"]
params = [
["int", "float", "object", "datetime", "uint"],
["int", "int16", "float", "object", "datetime", "uint"],
[
"all",
"any",
Expand All @@ -417,6 +418,7 @@ class GroupByMethods:
"cumprod",
"cumsum",
"describe",
"diff",
"ffill",
"first",
"head",
Expand Down Expand Up @@ -478,7 +480,7 @@ def setup(self, dtype, method, application, ncols):
values = rng.take(taker, axis=0)
if dtype == "int":
key = np.random.randint(0, size, size=size)
elif dtype == "uint":
elif dtype in ("int16", "uint"):
key = np.random.randint(0, size, size=size, dtype=dtype)
elif dtype == "float":
key = np.concatenate(
Expand Down
82 changes: 61 additions & 21 deletions asv_bench/benchmarks/indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@
CategoricalIndex,
DataFrame,
Float64Index,
IndexSlice,
Int64Index,
IntervalIndex,
MultiIndex,
Expand Down Expand Up @@ -200,28 +199,69 @@ def time_take(self, index):


class MultiIndexing:
def setup(self):
mi = MultiIndex.from_product([range(1000), range(1000)])
self.s = Series(np.random.randn(1000000), index=mi)
self.df = DataFrame(self.s)

n = 100000
with warnings.catch_warnings(record=True):
self.mdt = DataFrame(
{
"A": np.random.choice(range(10000, 45000, 1000), n),
"B": np.random.choice(range(10, 400), n),
"C": np.random.choice(range(1, 150), n),
"D": np.random.choice(range(10000, 45000), n),
"x": np.random.choice(range(400), n),
"y": np.random.choice(range(25), n),
}
)
self.idx = IndexSlice[20000:30000, 20:30, 35:45, 30000:40000]
self.mdt = self.mdt.set_index(["A", "B", "C", "D"]).sort_index()
params = [True, False]
param_names = ["unique_levels"]

def setup(self, unique_levels):
self.ndim = 2
if unique_levels:
mi = MultiIndex.from_arrays([range(1000000)] * self.ndim)
else:
mi = MultiIndex.from_product([range(1000)] * self.ndim)
self.df = DataFrame(np.random.randn(len(mi)), index=mi)

self.tgt_slice = slice(200, 800)
self.tgt_null_slice = slice(None)
self.tgt_list = list(range(0, 1000, 10))
self.tgt_scalar = 500

bool_indexer = np.zeros(len(mi), dtype=np.bool_)
bool_indexer[slice(0, len(mi), 100)] = True
self.tgt_bool_indexer = bool_indexer

def time_loc_partial_key_slice(self, unique_levels):
self.df.loc[self.tgt_slice, :]

def time_loc_partial_key_null_slice(self, unique_levels):
self.df.loc[self.tgt_null_slice, :]

def time_loc_partial_key_list(self, unique_levels):
self.df.loc[self.tgt_list, :]

def time_loc_partial_key_scalar(self, unique_levels):
self.df.loc[self.tgt_scalar, :]

def time_loc_partial_bool_indexer(self, unique_levels):
self.df.loc[self.tgt_bool_indexer, :]

def time_loc_all_slices(self, unique_levels):
target = tuple([self.tgt_slice] * self.ndim)
self.df.loc[target, :]

def time_loc_all_null_slices(self, unique_levels):
target = tuple([self.tgt_null_slice] * self.ndim)
self.df.loc[target, :]

def time_loc_all_lists(self, unique_levels):
target = tuple([self.tgt_list] * self.ndim)
self.df.loc[target, :]

def time_loc_all_scalars(self, unique_levels):
target = tuple([self.tgt_scalar] * self.ndim)
self.df.loc[target, :]

def time_loc_all_bool_indexers(self, unique_levels):
target = tuple([self.tgt_bool_indexer] * self.ndim)
self.df.loc[target, :]

def time_loc_slice_plus_null_slice(self, unique_levels):
target = (self.tgt_slice, self.tgt_null_slice)
self.df.loc[target, :]

def time_index_slice(self):
self.mdt.loc[self.idx, :]
def time_loc_null_slice_plus_slice(self, unique_levels):
target = (self.tgt_null_slice, self.tgt_slice)
self.df.loc[target, :]


class IntervalIndexing:
Expand Down
13 changes: 13 additions & 0 deletions asv_bench/benchmarks/join_merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -158,6 +158,19 @@ def time_left_outer_join_index(self):
self.left.join(self.right, on="jim")


class JoinEmpty:
    """Benchmark inner joins where one side of the join is an empty frame."""

    def setup(self):
        # One populated frame and one column-only (zero-row) frame; the
        # benchmarks join them in both directions on the index.
        size = 100_000
        self.df = DataFrame({"A": np.arange(size)})
        self.df_empty = DataFrame(columns=["B", "C"], dtype="int64")

    def time_inner_join_left_empty(self):
        # Empty frame on the left of the join.
        self.df_empty.join(self.df, how="inner")

    def time_inner_join_right_empty(self):
        # Empty frame on the right of the join.
        self.df.join(self.df_empty, how="inner")


class JoinNonUnique:
# outer join of non-unique
# GH 6329
Expand Down
1 change: 0 additions & 1 deletion azure-pipelines.yml
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,6 @@ variables:
PANDAS_CI: 1

jobs:
# Mac and Linux use the same template
- template: ci/azure/posix.yml
parameters:
name: macOS
Expand Down
17 changes: 11 additions & 6 deletions ci/azure/posix.yml
Original file line number Diff line number Diff line change
Expand Up @@ -10,30 +10,35 @@ jobs:
strategy:
matrix:
py38:
ENV_FILE: ci/deps/azure-macos-38.yaml
ENV_FILE: ci/deps/actions-38.yaml
CONDA_PY: "38"

py39:
ENV_FILE: ci/deps/azure-macos-39.yaml
ENV_FILE: ci/deps/actions-39.yaml
CONDA_PY: "39"

py310:
ENV_FILE: ci/deps/azure-macos-310.yaml
ENV_FILE: ci/deps/actions-310.yaml
CONDA_PY: "310"

steps:
- script: echo '##vso[task.prependpath]$(HOME)/miniconda3/bin'
displayName: 'Set conda path'

- script: rm /usr/local/miniconda/pkgs/cache/*.json
displayName: 'Workaround for mamba-org/mamba#488'

- script: ci/setup_env.sh
displayName: 'Setup environment and build pandas'

- script: |
source activate pandas-dev
ci/run_tests.sh
conda run -n pandas-dev --no-capture-output ci/run_tests.sh
displayName: 'Test'
- script: source activate pandas-dev && pushd /tmp && python -c "import pandas; pandas.show_versions();" && popd
- script: |
pushd /tmp
conda run -n pandas-dev python -c "import pandas; pandas.show_versions()"
popd
displayName: 'Build versions'
- task: PublishTestResults@2
Expand Down
14 changes: 10 additions & 4 deletions ci/azure/windows.yml
Original file line number Diff line number Diff line change
Expand Up @@ -26,16 +26,22 @@ jobs:
Write-Host "##vso[task.prependpath]$env:CONDA\Scripts"
Write-Host "##vso[task.prependpath]$HOME/miniconda3/bin"
displayName: 'Add conda to PATH'
- script: conda update -q -n base conda
displayName: 'Update conda'
- bash: conda install -yv -c conda-forge -n base 'mamba>=0.21.2'
displayName: 'Install mamba'

- bash: |
conda env create -q --file ci\\deps\\actions-$(CONDA_PY).yaml
# See https://github.com/mamba-org/mamba/issues/1370
# See https://github.com/mamba-org/mamba/issues/633
C:\\Miniconda\\condabin\\mamba.bat create -n pandas-dev
C:\\Miniconda\\condabin\\mamba.bat env update -n pandas-dev --file ci\\deps\\actions-$(CONDA_PY).yaml
# TODO: GH#44980 https://github.com/pypa/setuptools/issues/2941
C:\\Miniconda\\condabin\\mamba.bat install -n pandas-dev 'setuptools<60'
C:\\Miniconda\\condabin\\mamba.bat list -n pandas-dev
displayName: 'Create anaconda environment'
- bash: |
source activate pandas-dev
conda list
python setup.py build_ext -q -j 4
python setup.py build_ext -q -j 2
python -m pip install --no-build-isolation -e .
displayName: 'Build'
- bash: |
Expand Down
3 changes: 2 additions & 1 deletion ci/deps/actions-38-minimum_versions.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ dependencies:
- numpy=1.18.5
- pytz=2020.1

# optional dependencies
# optional dependencies, markupsafe for jinja2
- beautifulsoup4=4.8.2
- blosc=1.20.1
- bottleneck=1.3.1
Expand All @@ -29,6 +29,7 @@ dependencies:
- gcsfs=0.6.0
- jinja2=2.11
- lxml=4.5.0
- markupsafe=2.0.1
- matplotlib=3.3.2
- numba=0.50.1
- numexpr=2.7.1
Expand Down

0 comments on commit 675810f

Please sign in to comment.