add list_tables_page_size int querystring ability with defaults

Fix the arraysize default always taking priority over the query string; add and fix tests; reformat; update README.
googleapis · May 12, 2021 · 085df0f · 085df0f
1 parent f5adfc0
commit 085df0f
Show file tree

Hide file tree

Showing 5 changed files with 74 additions and 8 deletions.
diff --git a/README.rst b/README.rst
@@ -148,6 +148,15 @@ By default, ``arraysize`` is set to ``5000``. ``arraysize`` is used to set the b
 
     engine = create_engine('bigquery://project', arraysize=1000)
 
+Page size for dataset.list_tables
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+By default, ``list_tables_page_size`` is set to ``1000``. ``list_tables_page_size`` is used to set the ``max_results`` for the `dataset.list_tables`_ operation. To change it, pass ``list_tables_page_size`` to ``create_engine()``:
+
+.. _`dataset.list_tables`: https://cloud.google.com/bigquery/docs/reference/rest/v2/tables/list
+.. code-block:: python
+
+    engine = create_engine('bigquery://project', list_tables_page_size=100)
 
 Adding a Default Dataset
 ^^^^^^^^^^^^^^^^^^^^^^^^
@@ -180,7 +189,7 @@ Connection String Parameters
 
 There are many situations where you can't call ``create_engine`` directly, such as when using tools like `Flask SQLAlchemy <http://flask-sqlalchemy.pocoo.org/2.3/>`_. For situations like these, or for situations where you want the ``Client`` to have a `default_query_job_config <https://googlecloudplatform.github.io/google-cloud-python/latest/bigquery/generated/google.cloud.bigquery.client.Client.html#google.cloud.bigquery.client.Client>`_, you can pass many arguments in the query of the connection string.
 
-The ``credentials_path``, ``credentials_info``, ``location``, and ``arraysize`` parameters are used by this library, and the rest are used to create a `QueryJobConfig <https://googlecloudplatform.github.io/google-cloud-python/latest/bigquery/generated/google.cloud.bigquery.job.QueryJobConfig.html#google.cloud.bigquery.job.QueryJobConfig>`_
+The ``credentials_path``, ``credentials_info``, ``location``, ``arraysize``, and ``list_tables_page_size`` parameters are used by this library, and the rest are used to create a `QueryJobConfig <https://googlecloudplatform.github.io/google-cloud-python/latest/bigquery/generated/google.cloud.bigquery.job.QueryJobConfig.html#google.cloud.bigquery.job.QueryJobConfig>`_
 
 Note that if you want to use query strings, it will be more reliable if you use three slashes, so ``'bigquery:///?a=b'`` will work reliably, but ``'bigquery://?a=b'`` might be interpreted as having a "database" of ``?a=b``, depending on the system being used to parse the connection string.
 
@@ -193,6 +202,7 @@ Here are examples of all the supported arguments. Any not present are either for
         'credentials_path=/some/path/to.json' '&'
         'location=some-location' '&'
         'arraysize=1000' '&'
+        'list_tables_page_size=100' '&'
         'clustering_fields=a,b,c' '&'
         'create_disposition=CREATE_IF_NEEDED' '&'
         'destination=different-project.different-dataset.table' '&'

diff --git a/pybigquery/parse_url.py b/pybigquery/parse_url.py
@@ -68,6 +68,7 @@ def parse_url(url):  # noqa: C901
     dataset_id = url.database or None
     arraysize = None
     credentials_path = None
+    list_tables_page_size = None
 
     # location
     if "location" in query:
@@ -85,6 +86,16 @@ def parse_url(url):  # noqa: C901
         except ValueError:
             raise ValueError("invalid int in url query arraysize: " + str_arraysize)
 
+    if "list_tables_page_size" in query:
+        str_list_tables_page_size = query.pop("list_tables_page_size")
+        try:
+            list_tables_page_size = int(str_list_tables_page_size)
+        except ValueError:
+            raise ValueError(
+                "invalid int in url query list_tables_page_size: "
+                + str_list_tables_page_size
+            )
+
     # if only these "non-config" values were present, the dict will now be empty
     if not query:
         # if a dataset_id exists, we need to return a job_config that isn't None
@@ -97,9 +108,18 @@ def parse_url(url):  # noqa: C901
                 arraysize,
                 credentials_path,
                 QueryJobConfig(),
+                list_tables_page_size,
             )
         else:
-            return project_id, location, dataset_id, arraysize, credentials_path, None
+            return (
+                project_id,
+                location,
+                dataset_id,
+                arraysize,
+                credentials_path,
+                None,
+                list_tables_page_size,
+            )
 
     job_config = QueryJobConfig()
 
@@ -239,4 +259,12 @@ def parse_url(url):  # noqa: C901
                 "invalid write_disposition in url query: " + query["write_disposition"]
             )
 
-    return project_id, location, dataset_id, arraysize, credentials_path, job_config
+    return (
+        project_id,
+        location,
+        dataset_id,
+        arraysize,
+        credentials_path,
+        job_config,
+        list_tables_page_size,
+    )
diff --git a/pybigquery/sqlalchemy_bigquery.py b/pybigquery/sqlalchemy_bigquery.py
@@ -564,6 +564,7 @@ def __init__(
         credentials_path=None,
         location=None,
         credentials_info=None,
+        list_tables_page_size=1000,
         *args,
         **kwargs,
     ):
@@ -573,6 +574,7 @@ def __init__(
         self.credentials_info = credentials_info
         self.location = location
         self.dataset_id = None
+        self.list_tables_page_size = list_tables_page_size
 
     @classmethod
     def dbapi(cls):
@@ -601,9 +603,11 @@ def create_connect_args(self, url):
             arraysize,
             credentials_path,
             default_query_job_config,
+            list_tables_page_size,
         ) = parse_url(url)
 
-        self.arraysize = self.arraysize or arraysize
+        self.arraysize = arraysize or self.arraysize
+        self.list_tables_page_size = list_tables_page_size or self.list_tables_page_size
         self.location = location or self.location
         self.credentials_path = credentials_path or self.credentials_path
         self.dataset_id = dataset_id
@@ -644,7 +648,9 @@ def _get_table_or_view_names(self, connection, table_type, schema=None):
                 continue
 
             try:
-                tables = client.list_tables(dataset.reference)
+                tables = client.list_tables(
+                    dataset.reference, max_results=self.list_tables_page_size
+                )
                 for table in tables:
                     if table_type == table.table_type:
                         result.append(get_table_name(table))

diff --git a/tests/unit/fauxdbi.py b/tests/unit/fauxdbi.py
@@ -415,7 +415,7 @@ def list_datasets(self):
             google.cloud.bigquery.Dataset("myproject.yourdataset"),
         ]
 
-    def list_tables(self, dataset):
+    def list_tables(self, dataset, max_results):
         with contextlib.closing(self.connection.connection.cursor()) as cursor:
             cursor.execute("select * from sqlite_master")
             return [

diff --git a/tests/unit/test_parse_url.py b/tests/unit/test_parse_url.py
@@ -50,6 +50,7 @@ def url_with_everything():
         "?credentials_path=/some/path/to.json"
         "&location=some-location"
         "&arraysize=1000"
+        "&list_tables_page_size=5000"
         "&clustering_fields=a,b,c"
         "&create_disposition=CREATE_IF_NEEDED"
         "&destination=different-project.different-dataset.table"
@@ -72,12 +73,14 @@ def test_basic(url_with_everything):
         arraysize,
         credentials_path,
         job_config,
+        list_tables_page_size,
     ) = parse_url(url_with_everything)
 
     assert project_id == "some-project"
     assert location == "some-location"
     assert dataset_id == "some-dataset"
     assert arraysize == 1000
+    assert list_tables_page_size == 5000
     assert credentials_path == "/some/path/to.json"
     assert isinstance(job_config, QueryJobConfig)
 
@@ -134,6 +137,7 @@ def test_all_values(url_with_everything, param, value, default):
     "param, value",
     [
         ("arraysize", "not-int"),
+        ("list_tables_page_size", "not-int"),
         ("create_disposition", "not-attribute"),
         ("destination", "not.fully-qualified"),
         ("dry_run", "not-bool"),
@@ -165,25 +169,43 @@ def test_empty_with_non_config():
             "bigquery:///?location=some-location&arraysize=1000&credentials_path=/some/path/to.json"
         )
     )
-    project_id, location, dataset_id, arraysize, credentials_path, job_config = url
+    (
+        project_id,
+        location,
+        dataset_id,
+        arraysize,
+        credentials_path,
+        job_config,
+        list_tables_page_size,
+    ) = url
 
     assert project_id is None
     assert location == "some-location"
     assert dataset_id is None
     assert arraysize == 1000
     assert credentials_path == "/some/path/to.json"
     assert job_config is None
+    assert list_tables_page_size is None
 
 
 def test_only_dataset():
     url = parse_url(make_url("bigquery:///some-dataset"))
-    project_id, location, dataset_id, arraysize, credentials_path, job_config = url
+    (
+        project_id,
+        location,
+        dataset_id,
+        arraysize,
+        credentials_path,
+        job_config,
+        list_tables_page_size,
+    ) = url
 
     assert project_id is None
     assert location is None
     assert dataset_id == "some-dataset"
     assert arraysize is None
     assert credentials_path is None
+    assert list_tables_page_size is None
     assert isinstance(job_config, QueryJobConfig)
     # we can't actually test that the dataset is on the job_config,
     # since we take care of that afterwards, when we have a client to fill in the project