diff --git a/README.rst b/README.rst
index 1b3dc36f..fb3a8fb1 100644
--- a/README.rst
+++ b/README.rst
@@ -148,6 +148,15 @@ By default, ``arraysize`` is set to ``5000``. ``arraysize`` is used to set the b
 
     engine = create_engine('bigquery://project', arraysize=1000)
 
+Page size for dataset.list_tables
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+By default, ``list_tables_page_size`` is set to ``1000``. ``list_tables_page_size`` is used to set the ``max_results`` for the `dataset.list_tables`_ operation. To change it, pass ``list_tables_page_size`` to ``create_engine()``:
+
+.. _`dataset.list_tables`: https://cloud.google.com/bigquery/docs/reference/rest/v2/tables/list
+.. code-block:: python
+
+    engine = create_engine('bigquery://project', list_tables_page_size=100)
 
 Adding a Default Dataset
 ^^^^^^^^^^^^^^^^^^^^^^^^
@@ -180,7 +189,7 @@ Connection String Parameters
 
 There are many situations where you can't call ``create_engine`` directly, such as when using tools like `Flask SQLAlchemy `_. For situations like these, or for situations where you want the ``Client`` to have a `default_query_job_config `_, you can pass many arguments in the query of the connection string.
 
-The ``credentials_path``, ``credentials_info``, ``location``, and ``arraysize`` parameters are used by this library, and the rest are used to create a `QueryJobConfig `_
+The ``credentials_path``, ``credentials_info``, ``location``, ``arraysize``, and ``list_tables_page_size`` parameters are used by this library, and the rest are used to create a `QueryJobConfig `_
 
 Note that if you want to use query strings, it will be more reliable if you use three slashes, so ``'bigquery:///?a=b'`` will work reliably, but ``'bigquery://?a=b'`` might be interpreted as having a "database" of ``?a=b``, depending on the system being used to parse the connection string.
 
@@ -193,6 +202,7 @@ Here are examples of all the supported arguments. Any not present are either for
         'credentials_path=/some/path/to.json' '&'
         'location=some-location' '&'
         'arraysize=1000' '&'
+        'list_tables_page_size=100' '&'
         'clustering_fields=a,b,c' '&'
         'create_disposition=CREATE_IF_NEEDED' '&'
         'destination=different-project.different-dataset.table' '&'
diff --git a/pybigquery/parse_url.py b/pybigquery/parse_url.py
index 13dda364..370a46c7 100644
--- a/pybigquery/parse_url.py
+++ b/pybigquery/parse_url.py
@@ -68,6 +68,7 @@ def parse_url(url):  # noqa: C901
     dataset_id = url.database or None
     arraysize = None
     credentials_path = None
+    list_tables_page_size = None
 
     # location
     if "location" in query:
@@ -85,6 +86,16 @@ def parse_url(url):  # noqa: C901
         except ValueError:
             raise ValueError("invalid int in url query arraysize: " + str_arraysize)
 
+    if "list_tables_page_size" in query:
+        str_list_tables_page_size = query.pop("list_tables_page_size")
+        try:
+            list_tables_page_size = int(str_list_tables_page_size)
+        except ValueError:
+            raise ValueError(
+                "invalid int in url query list_tables_page_size: "
+                + str_list_tables_page_size
+            )
+
     # if only these "non-config" values were present, the dict will now be empty
     if not query:
         # if a dataset_id exists, we need to return a job_config that isn't None
@@ -97,9 +108,18 @@ def parse_url(url):  # noqa: C901
                 arraysize,
                 credentials_path,
                 QueryJobConfig(),
+                list_tables_page_size,
             )
         else:
-            return project_id, location, dataset_id, arraysize, credentials_path, None
+            return (
+                project_id,
+                location,
+                dataset_id,
+                arraysize,
+                credentials_path,
+                None,
+                list_tables_page_size,
+            )
 
     job_config = QueryJobConfig()
 
@@ -239,4 +259,12 @@ def parse_url(url):  # noqa: C901
             "invalid write_disposition in url query: " + query["write_disposition"]
         )
 
-    return project_id, location, dataset_id, arraysize, credentials_path, job_config
+    return (
+        project_id,
+        location,
+        dataset_id,
+        arraysize,
+        credentials_path,
+        job_config,
+        list_tables_page_size,
+    )
diff --git a/pybigquery/sqlalchemy_bigquery.py b/pybigquery/sqlalchemy_bigquery.py
index 60b8aab0..795f7c33 100644
--- a/pybigquery/sqlalchemy_bigquery.py
+++ b/pybigquery/sqlalchemy_bigquery.py
@@ -657,6 +657,7 @@ def __init__(
         credentials_path=None,
         location=None,
         credentials_info=None,
+        list_tables_page_size=1000,
         *args,
         **kwargs,
     ):
@@ -666,6 +667,7 @@ def __init__(
         self.credentials_info = credentials_info
         self.location = location
         self.dataset_id = None
+        self.list_tables_page_size = list_tables_page_size
 
     @classmethod
     def dbapi(cls):
@@ -694,9 +696,11 @@ def create_connect_args(self, url):
             arraysize,
             credentials_path,
             default_query_job_config,
+            list_tables_page_size,
         ) = parse_url(url)
 
-        self.arraysize = self.arraysize or arraysize
+        self.arraysize = arraysize or self.arraysize
+        self.list_tables_page_size = list_tables_page_size or self.list_tables_page_size
         self.location = location or self.location
         self.credentials_path = credentials_path or self.credentials_path
         self.dataset_id = dataset_id
@@ -737,7 +741,9 @@ def _get_table_or_view_names(self, connection, table_type, schema=None):
                 continue
 
             try:
-                tables = client.list_tables(dataset.reference)
+                tables = client.list_tables(
+                    dataset.reference, page_size=self.list_tables_page_size
+                )
                 for table in tables:
                     if table_type == table.table_type:
                         result.append(get_table_name(table))
diff --git a/setup.py b/setup.py
index 65f121ce..3e16f7d8 100644
--- a/setup.py
+++ b/setup.py
@@ -65,9 +65,9 @@ def readme():
     ],
     platforms="Posix; MacOS X; Windows",
     install_requires=[
-        "google-api-core>=1.23.0",  # Work-around bug in cloud core deps.
- "google-auth>=1.24.0,<2.0dev", # Work around pip wack. - "google-cloud-bigquery>=2.17.0", + "google-api-core>=1.30.0", # Work-around bug in cloud core deps. + "google-auth>=1.25.0,<2.0dev", # Work around pip wack. + "google-cloud-bigquery>=2.19.0", "sqlalchemy>=1.2.0,<1.5.0dev", "future", ], diff --git a/testing/constraints-3.6.txt b/testing/constraints-3.6.txt index 03281e21..1785edd0 100644 --- a/testing/constraints-3.6.txt +++ b/testing/constraints-3.6.txt @@ -5,6 +5,6 @@ # # e.g., if setup.py has "foo >= 1.14.0, < 2.0.0dev", sqlalchemy==1.2.0 -google-auth==1.24.0 -google-cloud-bigquery==2.17.0 -google-api-core==1.23.0 +google-auth==1.25.0 +google-cloud-bigquery==2.19.0 +google-api-core==1.30.0 diff --git a/tests/unit/fauxdbi.py b/tests/unit/fauxdbi.py index de835753..23d832f1 100644 --- a/tests/unit/fauxdbi.py +++ b/tests/unit/fauxdbi.py @@ -462,7 +462,7 @@ def list_datasets(self): google.cloud.bigquery.Dataset("myproject.yourdataset"), ] - def list_tables(self, dataset): + def list_tables(self, dataset, page_size): with contextlib.closing(self.connection.connection.cursor()) as cursor: cursor.execute("select * from sqlite_master") return [ diff --git a/tests/unit/test_engine.py b/tests/unit/test_engine.py index ad34ca08..8d8d75b9 100644 --- a/tests/unit/test_engine.py +++ b/tests/unit/test_engine.py @@ -52,3 +52,15 @@ def test_set_arraysize(faux_conn, metadata): # Because we gave a false array size, the array size wasn't set on the cursor: assert conn.connection.test_data["arraysize"] == 42 + + +def test_arraysize_querystring_takes_precedence_over_default(faux_conn, metadata): + arraysize = 42 + engine = sqlalchemy.create_engine( + f"bigquery://myproject/mydataset?arraysize={arraysize}" + ) + sqlalchemy.Table("t", metadata, sqlalchemy.Column("c", sqlalchemy.Integer)) + conn = engine.connect() + metadata.create_all(engine) + + assert conn.connection.test_data["arraysize"] == arraysize diff --git a/tests/unit/test_parse_url.py b/tests/unit/test_parse_url.py index 3da0546d..a3b5c2fb 100644 --- a/tests/unit/test_parse_url.py +++ b/tests/unit/test_parse_url.py @@ -50,6 +50,7 @@ def url_with_everything(): "?credentials_path=/some/path/to.json" "&location=some-location" "&arraysize=1000" + "&list_tables_page_size=5000" "&clustering_fields=a,b,c" "&create_disposition=CREATE_IF_NEEDED" "&destination=different-project.different-dataset.table" @@ -72,12 +73,14 @@ def test_basic(url_with_everything): arraysize, credentials_path, job_config, + list_tables_page_size, ) = parse_url(url_with_everything) assert project_id == "some-project" assert location == "some-location" assert dataset_id == "some-dataset" assert arraysize == 1000 + assert list_tables_page_size == 5000 assert credentials_path == "/some/path/to.json" assert isinstance(job_config, QueryJobConfig) @@ -136,6 +139,7 @@ def test_all_values(url_with_everything, param, value, default): "param, value", [ ("arraysize", "not-int"), + ("list_tables_page_size", "not-int"), ("create_disposition", "not-attribute"), ("destination", "not.fully-qualified"), ("dry_run", "not-bool"), @@ -167,7 +171,15 @@ def test_empty_with_non_config(): "bigquery:///?location=some-location&arraysize=1000&credentials_path=/some/path/to.json" ) ) - project_id, location, dataset_id, arraysize, credentials_path, job_config = url + ( + project_id, + location, + dataset_id, + arraysize, + credentials_path, + job_config, + list_tables_page_size, + ) = url assert project_id is None assert location == "some-location" @@ -175,17 +187,27 @@ def test_empty_with_non_config(): 
     assert arraysize == 1000
     assert credentials_path == "/some/path/to.json"
     assert job_config is None
+    assert list_tables_page_size is None
 
 
 def test_only_dataset():
     url = parse_url(make_url("bigquery:///some-dataset"))
-    project_id, location, dataset_id, arraysize, credentials_path, job_config = url
+    (
+        project_id,
+        location,
+        dataset_id,
+        arraysize,
+        credentials_path,
+        job_config,
+        list_tables_page_size,
+    ) = url
 
     assert project_id is None
     assert location is None
     assert dataset_id == "some-dataset"
     assert arraysize is None
     assert credentials_path is None
+    assert list_tables_page_size is None
     assert isinstance(job_config, QueryJobConfig)
     # we can't actually test that the dataset is on the job_config,
     # since we take care of that afterwards, when we have a client to fill in the project
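
For reviewers who want to try the change end to end, here is a minimal sketch, not part of the diff itself, showing both ways the new option can be supplied and where it takes effect. The project and dataset names are placeholders, and the ``inspect()`` reflection call is standard SQLAlchemy that exercises ``_get_table_or_view_names()`` as modified above:

.. code-block:: python

    import sqlalchemy

    # Keyword form: overrides the dialect default of 1000 at engine creation.
    engine = sqlalchemy.create_engine(
        "bigquery://some-project/some-dataset", list_tables_page_size=100
    )

    # Connection-string form: parse_url() pops list_tables_page_size from the
    # query, and the parsed value now takes precedence over the default.
    engine_from_url = sqlalchemy.create_engine(
        "bigquery:///?list_tables_page_size=100"
    )

    # Reflection runs through _get_table_or_view_names(), which now passes
    # page_size=self.list_tables_page_size to client.list_tables().
    print(sqlalchemy.inspect(engine).get_table_names())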