Skip to content

Commit

Permalink
add list_tables_page_size int querystring ability with defaults
Browse files Browse the repository at this point in the history
fix arraysize default always takes priority over querystring

add and fix tests, reformatting

update readme
  • Loading branch information
OmriBromberg committed May 12, 2021
1 parent f5adfc0 commit 085df0f
Show file tree
Hide file tree
Showing 5 changed files with 74 additions and 8 deletions.
12 changes: 11 additions & 1 deletion README.rst
Expand Up @@ -148,6 +148,15 @@ By default, ``arraysize`` is set to ``5000``. ``arraysize`` is used to set the b
engine = create_engine('bigquery://project', arraysize=1000)
Page size for dataset.list_tables
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

By default, ``list_tables_page_size`` is set to ``1000``. ``list_tables_page_size`` is used to set the ``max_results`` value for the `dataset.list_tables`_ operation. To change it, pass ``list_tables_page_size`` to ``create_engine()``:

.. _`dataset.list_tables`: https://cloud.google.com/bigquery/docs/reference/rest/v2/tables/list

.. code-block:: python

    engine = create_engine('bigquery://project', list_tables_page_size=100)

Adding a Default Dataset
^^^^^^^^^^^^^^^^^^^^^^^^
Expand Down Expand Up @@ -180,7 +189,7 @@ Connection String Parameters

There are many situations where you can't call ``create_engine`` directly, such as when using tools like `Flask SQLAlchemy <http://flask-sqlalchemy.pocoo.org/2.3/>`_. For situations like these, or for situations where you want the ``Client`` to have a `default_query_job_config <https://googlecloudplatform.github.io/google-cloud-python/latest/bigquery/generated/google.cloud.bigquery.client.Client.html#google.cloud.bigquery.client.Client>`_, you can pass many arguments in the query of the connection string.

The ``credentials_path``, ``credentials_info``, ``location``, and ``arraysize`` parameters are used by this library, and the rest are used to create a `QueryJobConfig <https://googlecloudplatform.github.io/google-cloud-python/latest/bigquery/generated/google.cloud.bigquery.job.QueryJobConfig.html#google.cloud.bigquery.job.QueryJobConfig>`_
The ``credentials_path``, ``credentials_info``, ``location``, ``arraysize``, and ``list_tables_page_size`` parameters are used by this library, and the rest are used to create a `QueryJobConfig <https://googlecloudplatform.github.io/google-cloud-python/latest/bigquery/generated/google.cloud.bigquery.job.QueryJobConfig.html#google.cloud.bigquery.job.QueryJobConfig>`_.

Note that if you want to use query strings, it will be more reliable if you use three slashes, so ``'bigquery:///?a=b'`` will work reliably, but ``'bigquery://?a=b'`` might be interpreted as having a "database" of ``?a=b``, depending on the system being used to parse the connection string.

Expand All @@ -193,6 +202,7 @@ Here are examples of all the supported arguments. Any not present are either for
'credentials_path=/some/path/to.json' '&'
'location=some-location' '&'
'arraysize=1000' '&'
'list_tables_page_size=100' '&'
'clustering_fields=a,b,c' '&'
'create_disposition=CREATE_IF_NEEDED' '&'
'destination=different-project.different-dataset.table' '&'
Expand Down
32 changes: 30 additions & 2 deletions pybigquery/parse_url.py
Expand Up @@ -68,6 +68,7 @@ def parse_url(url): # noqa: C901
dataset_id = url.database or None
arraysize = None
credentials_path = None
list_tables_page_size = None

# location
if "location" in query:
Expand All @@ -85,6 +86,16 @@ def parse_url(url): # noqa: C901
except ValueError:
raise ValueError("invalid int in url query arraysize: " + str_arraysize)

if "list_tables_page_size" in query:
str_list_tables_page_size = query.pop("list_tables_page_size")
try:
list_tables_page_size = int(str_list_tables_page_size)
except ValueError:
raise ValueError(
"invalid int in url query list_tables_page_size: "
+ str_list_tables_page_size
)

# if only these "non-config" values were present, the dict will now be empty
if not query:
# if a dataset_id exists, we need to return a job_config that isn't None
Expand All @@ -97,9 +108,18 @@ def parse_url(url): # noqa: C901
arraysize,
credentials_path,
QueryJobConfig(),
list_tables_page_size,
)
else:
return project_id, location, dataset_id, arraysize, credentials_path, None
return (
project_id,
location,
dataset_id,
arraysize,
credentials_path,
None,
list_tables_page_size,
)

job_config = QueryJobConfig()

Expand Down Expand Up @@ -239,4 +259,12 @@ def parse_url(url): # noqa: C901
"invalid write_disposition in url query: " + query["write_disposition"]
)

return project_id, location, dataset_id, arraysize, credentials_path, job_config
return (
project_id,
location,
dataset_id,
arraysize,
credentials_path,
job_config,
list_tables_page_size,
)
10 changes: 8 additions & 2 deletions pybigquery/sqlalchemy_bigquery.py
Expand Up @@ -564,6 +564,7 @@ def __init__(
credentials_path=None,
location=None,
credentials_info=None,
list_tables_page_size=1000,
*args,
**kwargs,
):
Expand All @@ -573,6 +574,7 @@ def __init__(
self.credentials_info = credentials_info
self.location = location
self.dataset_id = None
self.list_tables_page_size = list_tables_page_size

@classmethod
def dbapi(cls):
Expand Down Expand Up @@ -601,9 +603,11 @@ def create_connect_args(self, url):
arraysize,
credentials_path,
default_query_job_config,
list_tables_page_size,
) = parse_url(url)

self.arraysize = self.arraysize or arraysize
self.arraysize = arraysize or self.arraysize
self.list_tables_page_size = list_tables_page_size or self.list_tables_page_size
self.location = location or self.location
self.credentials_path = credentials_path or self.credentials_path
self.dataset_id = dataset_id
Expand Down Expand Up @@ -644,7 +648,9 @@ def _get_table_or_view_names(self, connection, table_type, schema=None):
continue

try:
tables = client.list_tables(dataset.reference)
tables = client.list_tables(
dataset.reference, max_results=self.list_tables_page_size
)
for table in tables:
if table_type == table.table_type:
result.append(get_table_name(table))
Expand Down
2 changes: 1 addition & 1 deletion tests/unit/fauxdbi.py
Expand Up @@ -415,7 +415,7 @@ def list_datasets(self):
google.cloud.bigquery.Dataset("myproject.yourdataset"),
]

def list_tables(self, dataset):
def list_tables(self, dataset, max_results):
with contextlib.closing(self.connection.connection.cursor()) as cursor:
cursor.execute("select * from sqlite_master")
return [
Expand Down
26 changes: 24 additions & 2 deletions tests/unit/test_parse_url.py
Expand Up @@ -50,6 +50,7 @@ def url_with_everything():
"?credentials_path=/some/path/to.json"
"&location=some-location"
"&arraysize=1000"
"&list_tables_page_size=5000"
"&clustering_fields=a,b,c"
"&create_disposition=CREATE_IF_NEEDED"
"&destination=different-project.different-dataset.table"
Expand All @@ -72,12 +73,14 @@ def test_basic(url_with_everything):
arraysize,
credentials_path,
job_config,
list_tables_page_size,
) = parse_url(url_with_everything)

assert project_id == "some-project"
assert location == "some-location"
assert dataset_id == "some-dataset"
assert arraysize == 1000
assert list_tables_page_size == 5000
assert credentials_path == "/some/path/to.json"
assert isinstance(job_config, QueryJobConfig)

Expand Down Expand Up @@ -134,6 +137,7 @@ def test_all_values(url_with_everything, param, value, default):
"param, value",
[
("arraysize", "not-int"),
("list_tables_page_size", "not-int"),
("create_disposition", "not-attribute"),
("destination", "not.fully-qualified"),
("dry_run", "not-bool"),
Expand Down Expand Up @@ -165,25 +169,43 @@ def test_empty_with_non_config():
"bigquery:///?location=some-location&arraysize=1000&credentials_path=/some/path/to.json"
)
)
project_id, location, dataset_id, arraysize, credentials_path, job_config = url
(
project_id,
location,
dataset_id,
arraysize,
credentials_path,
job_config,
list_tables_page_size,
) = url

assert project_id is None
assert location == "some-location"
assert dataset_id is None
assert arraysize == 1000
assert credentials_path == "/some/path/to.json"
assert job_config is None
assert list_tables_page_size is None


def test_only_dataset():
url = parse_url(make_url("bigquery:///some-dataset"))
project_id, location, dataset_id, arraysize, credentials_path, job_config = url
(
project_id,
location,
dataset_id,
arraysize,
credentials_path,
job_config,
list_tables_page_size,
) = url

assert project_id is None
assert location is None
assert dataset_id == "some-dataset"
assert arraysize is None
assert credentials_path is None
assert list_tables_page_size is None
assert isinstance(job_config, QueryJobConfig)
# we can't actually test that the dataset is on the job_config,
# since we take care of that afterwards, when we have a client to fill in the project
Expand Down

0 comments on commit 085df0f

Please sign in to comment.