Skip to content

Commit

Permalink
Merge branch 'main' into pandas-devGH-15354-phased
Browse files Browse the repository at this point in the history
  • Loading branch information
jreback committed Feb 28, 2022
2 parents 3a9c089 + 367f8a1 commit 675810f
Show file tree
Hide file tree
Showing 201 changed files with 3,400 additions and 2,481 deletions.
54 changes: 0 additions & 54 deletions .github/workflows/datamanger.yml

This file was deleted.

21 changes: 20 additions & 1 deletion .github/workflows/posix.yml
Original file line number Diff line number Diff line change
Expand Up @@ -26,37 +26,52 @@ jobs:
matrix:
env_file: [actions-38.yaml, actions-39.yaml, actions-310.yaml]
pattern: ["not single_cpu", "single_cpu"]
# Don't test pyarrow v2/3: Causes timeouts in read_csv engine
# even if tests are skipped/xfailed
pyarrow_version: ["5", "7"]
include:
- env_file: actions-38-downstream_compat.yaml
pattern: "not slow and not network and not single_cpu"
pytest_target: "pandas/tests/test_downstream.py"
name: "Downstream Compat"
- env_file: actions-38-minimum_versions.yaml
pattern: "not slow and not network and not single_cpu"
name: "Minimum Versions"
- env_file: actions-38.yaml
pattern: "not slow and not network and not single_cpu"
extra_apt: "language-pack-it"
lang: "it_IT.utf8"
lc_all: "it_IT.utf8"
name: "Locale: it_IT.utf8"
- env_file: actions-38.yaml
pattern: "not slow and not network and not single_cpu"
extra_apt: "language-pack-zh-hans"
lang: "zh_CN.utf8"
lc_all: "zh_CN.utf8"
name: "Locale: zh_CN.utf8"
- env_file: actions-38.yaml
pattern: "not slow and not network and not single_cpu"
pandas_data_manager: "array"
name: "Data Manager"
- env_file: actions-pypy-38.yaml
pattern: "not slow and not network and not single_cpu"
test_args: "--max-worker-restart 0"
name: "Pypy"
- env_file: actions-310-numpydev.yaml
pattern: "not slow and not network and not single_cpu"
pandas_testing_mode: "deprecate"
test_args: "-W error"
name: "Numpy Dev"
fail-fast: false
name: ${{ matrix.name || format('{0} pyarrow={1} {2}', matrix.env_file, matrix.pyarrow_version, matrix.pattern) }}
env:
ENV_FILE: ci/deps/${{ matrix.env_file }}
PATTERN: ${{ matrix.pattern }}
EXTRA_APT: ${{ matrix.extra_apt || '' }}
LANG: ${{ matrix.lang || '' }}
LC_ALL: ${{ matrix.lc_all || '' }}
PANDAS_TESTING_MODE: ${{ matrix.pandas_testing_mode || '' }}
PANDAS_DATA_MANAGER: ${{ matrix.pandas_data_manager || 'block' }}
TEST_ARGS: ${{ matrix.test_args || '' }}
PYTEST_WORKERS: ${{ contains(matrix.pattern, 'not single_cpu') && 'auto' || '1' }}
PYTEST_TARGET: ${{ matrix.pytest_target || 'pandas' }}
Expand All @@ -65,7 +80,7 @@ jobs:
COVERAGE: ${{ !contains(matrix.env_file, 'pypy') }}
concurrency:
# https://github.community/t/concurrecy-not-work-for-push/183068/7
group: ${{ github.event_name == 'push' && github.run_number || github.ref }}-${{ matrix.env_file }}-${{ matrix.pattern }}-${{ matrix.extra_apt || '' }}
group: ${{ github.event_name == 'push' && github.run_number || github.ref }}-${{ matrix.env_file }}-${{ matrix.pattern }}-${{ matrix.pyarrow_version || '' }}-${{ matrix.extra_apt || '' }}-${{ matrix.pandas_data_manager || '' }}
cancel-in-progress: true

services:
Expand Down Expand Up @@ -133,6 +148,10 @@ jobs:
use-only-tar-bz2: true
if: ${{ env.IS_PYPY == 'false' }} # No pypy3.8 support

- name: Upgrade Arrow version
run: conda install -n pandas-dev -c conda-forge --no-update-deps pyarrow=${{ matrix.pyarrow_version }}
if: ${{ matrix.pyarrow_version }}

- name: Setup PyPy
uses: actions/setup-python@v2
with:
Expand Down
2 changes: 2 additions & 0 deletions .github/workflows/sdist.yml
Original file line number Diff line number Diff line change
Expand Up @@ -9,11 +9,13 @@ on:
branches:
- main
- 1.4.x
types: [labeled, opened, synchronize, reopened]
paths-ignore:
- "doc/**"

jobs:
build:
if: ${{ github.event.label.name == 'Build' || contains(github.event.pull_request.labels.*.name, 'Build') || github.event_name == 'push'}}
runs-on: ubuntu-latest
timeout-minutes: 60
defaults:
Expand Down
6 changes: 4 additions & 2 deletions asv_bench/benchmarks/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@

method_blocklist = {
"object": {
"diff",
"median",
"prod",
"sem",
Expand Down Expand Up @@ -405,7 +406,7 @@ class GroupByMethods:

param_names = ["dtype", "method", "application", "ncols"]
params = [
["int", "float", "object", "datetime", "uint"],
["int", "int16", "float", "object", "datetime", "uint"],
[
"all",
"any",
Expand All @@ -417,6 +418,7 @@ class GroupByMethods:
"cumprod",
"cumsum",
"describe",
"diff",
"ffill",
"first",
"head",
Expand Down Expand Up @@ -478,7 +480,7 @@ def setup(self, dtype, method, application, ncols):
values = rng.take(taker, axis=0)
if dtype == "int":
key = np.random.randint(0, size, size=size)
elif dtype == "uint":
elif dtype in ("int16", "uint"):
key = np.random.randint(0, size, size=size, dtype=dtype)
elif dtype == "float":
key = np.concatenate(
Expand Down
82 changes: 61 additions & 21 deletions asv_bench/benchmarks/indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@
CategoricalIndex,
DataFrame,
Float64Index,
IndexSlice,
Int64Index,
IntervalIndex,
MultiIndex,
Expand Down Expand Up @@ -200,28 +199,69 @@ def time_take(self, index):


class MultiIndexing:
def setup(self):
mi = MultiIndex.from_product([range(1000), range(1000)])
self.s = Series(np.random.randn(1000000), index=mi)
self.df = DataFrame(self.s)

n = 100000
with warnings.catch_warnings(record=True):
self.mdt = DataFrame(
{
"A": np.random.choice(range(10000, 45000, 1000), n),
"B": np.random.choice(range(10, 400), n),
"C": np.random.choice(range(1, 150), n),
"D": np.random.choice(range(10000, 45000), n),
"x": np.random.choice(range(400), n),
"y": np.random.choice(range(25), n),
}
)
self.idx = IndexSlice[20000:30000, 20:30, 35:45, 30000:40000]
self.mdt = self.mdt.set_index(["A", "B", "C", "D"]).sort_index()
params = [True, False]
param_names = ["unique_levels"]

def setup(self, unique_levels):
self.ndim = 2
if unique_levels:
mi = MultiIndex.from_arrays([range(1000000)] * self.ndim)
else:
mi = MultiIndex.from_product([range(1000)] * self.ndim)
self.df = DataFrame(np.random.randn(len(mi)), index=mi)

self.tgt_slice = slice(200, 800)
self.tgt_null_slice = slice(None)
self.tgt_list = list(range(0, 1000, 10))
self.tgt_scalar = 500

bool_indexer = np.zeros(len(mi), dtype=np.bool_)
bool_indexer[slice(0, len(mi), 100)] = True
self.tgt_bool_indexer = bool_indexer

def time_loc_partial_key_slice(self, unique_levels):
self.df.loc[self.tgt_slice, :]

def time_loc_partial_key_null_slice(self, unique_levels):
self.df.loc[self.tgt_null_slice, :]

def time_loc_partial_key_list(self, unique_levels):
self.df.loc[self.tgt_list, :]

def time_loc_partial_key_scalar(self, unique_levels):
self.df.loc[self.tgt_scalar, :]

def time_loc_partial_bool_indexer(self, unique_levels):
self.df.loc[self.tgt_bool_indexer, :]

def time_loc_all_slices(self, unique_levels):
target = tuple([self.tgt_slice] * self.ndim)
self.df.loc[target, :]

def time_loc_all_null_slices(self, unique_levels):
target = tuple([self.tgt_null_slice] * self.ndim)
self.df.loc[target, :]

def time_loc_all_lists(self, unique_levels):
target = tuple([self.tgt_list] * self.ndim)
self.df.loc[target, :]

def time_loc_all_scalars(self, unique_levels):
target = tuple([self.tgt_scalar] * self.ndim)
self.df.loc[target, :]

def time_loc_all_bool_indexers(self, unique_levels):
target = tuple([self.tgt_bool_indexer] * self.ndim)
self.df.loc[target, :]

def time_loc_slice_plus_null_slice(self, unique_levels):
target = (self.tgt_slice, self.tgt_null_slice)
self.df.loc[target, :]

def time_index_slice(self):
self.mdt.loc[self.idx, :]
def time_loc_null_slice_plus_slice(self, unique_levels):
target = (self.tgt_null_slice, self.tgt_slice)
self.df.loc[target, :]


class IntervalIndexing:
Expand Down
13 changes: 13 additions & 0 deletions asv_bench/benchmarks/join_merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -158,6 +158,19 @@ def time_left_outer_join_index(self):
self.left.join(self.right, on="jim")


class JoinEmpty:
    """Benchmark inner joins where one side of the join is an empty frame."""

    def setup(self):
        # One populated frame and one column-only (zero-row) frame; the
        # benchmarks join them in both directions on the index.
        size = 100_000
        self.df = DataFrame({"A": np.arange(size)})
        self.df_empty = DataFrame(columns=["B", "C"], dtype="int64")

    def time_inner_join_left_empty(self):
        # Empty frame on the left of the join.
        self.df_empty.join(self.df, how="inner")

    def time_inner_join_right_empty(self):
        # Empty frame on the right of the join.
        self.df.join(self.df_empty, how="inner")


class JoinNonUnique:
# outer join of non-unique
# GH 6329
Expand Down
1 change: 0 additions & 1 deletion azure-pipelines.yml
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,6 @@ variables:
PANDAS_CI: 1

jobs:
# Mac and Linux use the same template
- template: ci/azure/posix.yml
parameters:
name: macOS
Expand Down
17 changes: 11 additions & 6 deletions ci/azure/posix.yml
Original file line number Diff line number Diff line change
Expand Up @@ -10,30 +10,35 @@ jobs:
strategy:
matrix:
py38:
ENV_FILE: ci/deps/azure-macos-38.yaml
ENV_FILE: ci/deps/actions-38.yaml
CONDA_PY: "38"

py39:
ENV_FILE: ci/deps/azure-macos-39.yaml
ENV_FILE: ci/deps/actions-39.yaml
CONDA_PY: "39"

py310:
ENV_FILE: ci/deps/azure-macos-310.yaml
ENV_FILE: ci/deps/actions-310.yaml
CONDA_PY: "310"

steps:
- script: echo '##vso[task.prependpath]$(HOME)/miniconda3/bin'
displayName: 'Set conda path'

- script: rm /usr/local/miniconda/pkgs/cache/*.json
displayName: 'Workaround for mamba-org/mamba#488'

- script: ci/setup_env.sh
displayName: 'Setup environment and build pandas'

- script: |
source activate pandas-dev
ci/run_tests.sh
conda run -n pandas-dev --no-capture-output ci/run_tests.sh
displayName: 'Test'
- script: source activate pandas-dev && pushd /tmp && python -c "import pandas; pandas.show_versions();" && popd
- script: |
pushd /tmp
conda run -n pandas-dev python -c "import pandas; pandas.show_versions()"
popd
displayName: 'Build versions'
- task: PublishTestResults@2
Expand Down
14 changes: 10 additions & 4 deletions ci/azure/windows.yml
Original file line number Diff line number Diff line change
Expand Up @@ -26,16 +26,22 @@ jobs:
Write-Host "##vso[task.prependpath]$env:CONDA\Scripts"
Write-Host "##vso[task.prependpath]$HOME/miniconda3/bin"
displayName: 'Add conda to PATH'
- script: conda update -q -n base conda
displayName: 'Update conda'
- bash: conda install -yv -c conda-forge -n base 'mamba>=0.21.2'
displayName: 'Install mamba'

- bash: |
conda env create -q --file ci\\deps\\actions-$(CONDA_PY).yaml
# See https://github.com/mamba-org/mamba/issues/1370
# See https://github.com/mamba-org/mamba/issues/633
C:\\Miniconda\\condabin\\mamba.bat create -n pandas-dev
C:\\Miniconda\\condabin\\mamba.bat env update -n pandas-dev --file ci\\deps\\actions-$(CONDA_PY).yaml
# TODO: GH#44980 https://github.com/pypa/setuptools/issues/2941
C:\\Miniconda\\condabin\\mamba.bat install -n pandas-dev 'setuptools<60'
C:\\Miniconda\\condabin\\mamba.bat list -n pandas-dev
displayName: 'Create anaconda environment'
- bash: |
source activate pandas-dev
conda list
python setup.py build_ext -q -j 4
python setup.py build_ext -q -j 2
python -m pip install --no-build-isolation -e .
displayName: 'Build'
- bash: |
Expand Down
3 changes: 2 additions & 1 deletion ci/deps/actions-38-minimum_versions.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ dependencies:
- numpy=1.18.5
- pytz=2020.1

# optional dependencies
# optional dependencies, markupsafe for jinja2
- beautifulsoup4=4.8.2
- blosc=1.20.1
- bottleneck=1.3.1
Expand All @@ -29,6 +29,7 @@ dependencies:
- gcsfs=0.6.0
- jinja2=2.11
- lxml=4.5.0
- markupsafe=2.0.1
- matplotlib=3.3.2
- numba=0.50.1
- numexpr=2.7.1
Expand Down

0 comments on commit 675810f

Please sign in to comment.