feat: Feature/list tables page size #174

Merged
12 changes: 11 additions & 1 deletion README.rst
@@ -148,6 +148,15 @@ By default, ``arraysize`` is set to ``5000``. ``arraysize`` is used to set the b

engine = create_engine('bigquery://project', arraysize=1000)

Page size for dataset.list_tables
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

By default, ``list_tables_page_size`` is set to ``1000``. ``list_tables_page_size`` sets the page size used by the `dataset.list_tables`_ operation. To change it, pass ``list_tables_page_size`` to ``create_engine()``:

.. _`dataset.list_tables`: https://cloud.google.com/bigquery/docs/reference/rest/v2/tables/list
.. code-block:: python

engine = create_engine('bigquery://project', list_tables_page_size=100)
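
The same setting can also be supplied as a connection string parameter (see the Connection String Parameters section below); a minimal sketch:

.. code-block:: python

engine = create_engine('bigquery://project?list_tables_page_size=100')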

Adding a Default Dataset
^^^^^^^^^^^^^^^^^^^^^^^^
@@ -180,7 +189,7 @@ Connection String Parameters

There are many situations where you can't call ``create_engine`` directly, such as when using tools like `Flask SQLAlchemy <http://flask-sqlalchemy.pocoo.org/2.3/>`_. For situations like these, or for situations where you want the ``Client`` to have a `default_query_job_config <https://googlecloudplatform.github.io/google-cloud-python/latest/bigquery/generated/google.cloud.bigquery.client.Client.html#google.cloud.bigquery.client.Client>`_, you can pass many arguments in the query of the connection string.

The ``credentials_path``, ``credentials_info``, ``location``, and ``arraysize`` parameters are used by this library, and the rest are used to create a `QueryJobConfig <https://googlecloudplatform.github.io/google-cloud-python/latest/bigquery/generated/google.cloud.bigquery.job.QueryJobConfig.html#google.cloud.bigquery.job.QueryJobConfig>`_
The ``credentials_path``, ``credentials_info``, ``location``, ``arraysize``, and ``list_tables_page_size`` parameters are used by this library, and the rest are used to create a `QueryJobConfig <https://googlecloudplatform.github.io/google-cloud-python/latest/bigquery/generated/google.cloud.bigquery.job.QueryJobConfig.html#google.cloud.bigquery.job.QueryJobConfig>`_.

Note that if you want to use query strings, it will be more reliable if you use three slashes, so ``'bigquery:///?a=b'`` will work reliably, but ``'bigquery://?a=b'`` might be interpreted as having a "database" of ``?a=b``, depending on the system being used to parse the connection string.
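
A quick way to check how a given SQLAlchemy version parses each form (a sketch; the exact ``database`` value can vary by version):

.. code-block:: python

from sqlalchemy.engine import make_url

url = make_url('bigquery:///?a=b')   # three slashes
print(url.query)      # {'a': 'b'} -- parsed as engine arguments
print(url.database)   # empty: no "database" component was captured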

@@ -193,6 +202,7 @@ Here are examples of all the supported arguments. Any not present are either for
'credentials_path=/some/path/to.json' '&'
'location=some-location' '&'
'arraysize=1000' '&'
'list_tables_page_size=100' '&'
'clustering_fields=a,b,c' '&'
'create_disposition=CREATE_IF_NEEDED' '&'
'destination=different-project.different-dataset.table' '&'
32 changes: 30 additions & 2 deletions pybigquery/parse_url.py
@@ -68,6 +68,7 @@ def parse_url(url): # noqa: C901
dataset_id = url.database or None
arraysize = None
credentials_path = None
list_tables_page_size = None

# location
if "location" in query:
@@ -85,6 +86,16 @@
except ValueError:
raise ValueError("invalid int in url query arraysize: " + str_arraysize)

if "list_tables_page_size" in query:
str_list_tables_page_size = query.pop("list_tables_page_size")
try:
list_tables_page_size = int(str_list_tables_page_size)
except ValueError:
raise ValueError(
"invalid int in url query list_tables_page_size: "
+ str_list_tables_page_size
)

# if only these "non-config" values were present, the dict will now be empty
if not query:
# if a dataset_id exists, we need to return a job_config that isn't None
@@ -97,9 +108,18 @@
arraysize,
credentials_path,
QueryJobConfig(),
list_tables_page_size,
)
else:
return project_id, location, dataset_id, arraysize, credentials_path, None
return (
project_id,
location,
dataset_id,
arraysize,
credentials_path,
None,
list_tables_page_size,
)

job_config = QueryJobConfig()

@@ -239,4 +259,12 @@ def parse_url(url): # noqa: C901
"invalid write_disposition in url query: " + query["write_disposition"]
)

return project_id, location, dataset_id, arraysize, credentials_path, job_config
return (
project_id,
location,
dataset_id,
arraysize,
credentials_path,
job_config,
list_tables_page_size,
)
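
Since every caller must now unpack a seventh element, here is a minimal sketch of consuming the extended tuple (assumes the ``parse_url`` from this PR):

.. code-block:: python

from sqlalchemy.engine import make_url
from pybigquery.parse_url import parse_url

url = make_url('bigquery://some-project/some-dataset?list_tables_page_size=250')
(project_id, location, dataset_id, arraysize,
 credentials_path, job_config, list_tables_page_size) = parse_url(url)
assert list_tables_page_size == 250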
10 changes: 8 additions & 2 deletions pybigquery/sqlalchemy_bigquery.py
@@ -657,6 +657,7 @@ def __init__(
credentials_path=None,
location=None,
credentials_info=None,
list_tables_page_size=1000,
*args,
**kwargs,
):
@@ -666,6 +667,7 @@
self.credentials_info = credentials_info
self.location = location
self.dataset_id = None
self.list_tables_page_size = list_tables_page_size

@classmethod
def dbapi(cls):
@@ -694,9 +696,11 @@ def create_connect_args(self, url):
arraysize,
credentials_path,
default_query_job_config,
list_tables_page_size,
) = parse_url(url)

self.arraysize = self.arraysize or arraysize
self.arraysize = arraysize or self.arraysize
self.list_tables_page_size = list_tables_page_size or self.list_tables_page_size
self.location = location or self.location
self.credentials_path = credentials_path or self.credentials_path
self.dataset_id = dataset_id
@@ -737,7 +741,9 @@ def _get_table_or_view_names(self, connection, table_type, schema=None):
continue

try:
tables = client.list_tables(dataset.reference)
tables = client.list_tables(
dataset.reference, page_size=self.list_tables_page_size
)
for table in tables:
if table_type == table.table_type:
result.append(get_table_name(table))
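
Taken together, the page size can come from either the engine argument or the URL, with a URL value taking precedence. A minimal usage sketch (project name is hypothetical):

.. code-block:: python

from sqlalchemy import create_engine

# Dialect default via keyword argument...
engine = create_engine('bigquery://some-project', list_tables_page_size=100)

# ...overridden by a querystring value when one is present:
engine = create_engine('bigquery://some-project?list_tables_page_size=250')

# Table reflection then pages through client.list_tables(...,
# page_size=...) with the configured value.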
6 changes: 3 additions & 3 deletions setup.py
@@ -65,9 +65,9 @@ def readme():
],
platforms="Posix; MacOS X; Windows",
install_requires=[
"google-api-core>=1.23.0", # Work-around bug in cloud core deps.
"google-auth>=1.24.0,<2.0dev", # Work around pip wack.
"google-cloud-bigquery>=2.17.0",
"google-api-core>=1.30.0", # Work-around bug in cloud core deps.
"google-auth>=1.25.0,<2.0dev", # Work around pip wack.
"google-cloud-bigquery>=2.19.0",
"sqlalchemy>=1.2.0,<1.5.0dev",
"future",
],
6 changes: 3 additions & 3 deletions testing/constraints-3.6.txt
@@ -5,6 +5,6 @@
#
# e.g., if setup.py has "foo >= 1.14.0, < 2.0.0dev",
sqlalchemy==1.2.0
google-auth==1.24.0
google-cloud-bigquery==2.17.0
google-api-core==1.23.0
google-auth==1.25.0
google-cloud-bigquery==2.19.0
google-api-core==1.30.0
2 changes: 1 addition & 1 deletion tests/unit/fauxdbi.py
@@ -462,7 +462,7 @@ def list_datasets(self):
google.cloud.bigquery.Dataset("myproject.yourdataset"),
]

def list_tables(self, dataset):
def list_tables(self, dataset, page_size):
with contextlib.closing(self.connection.connection.cursor()) as cursor:
cursor.execute("select * from sqlite_master")
return [
12 changes: 12 additions & 0 deletions tests/unit/test_engine.py
@@ -52,3 +52,15 @@ def test_set_arraysize(faux_conn, metadata):

# Because we gave a false array size, the array size wasn't set on the cursor:
assert conn.connection.test_data["arraysize"] == 42


def test_arraysize_querystring_takes_precedence_over_default(faux_conn, metadata):
arraysize = 42
engine = sqlalchemy.create_engine(
f"bigquery://myproject/mydataset?arraysize={arraysize}"
)
sqlalchemy.Table("t", metadata, sqlalchemy.Column("c", sqlalchemy.Integer))
conn = engine.connect()
metadata.create_all(engine)

assert conn.connection.test_data["arraysize"] == arraysize
26 changes: 24 additions & 2 deletions tests/unit/test_parse_url.py
@@ -50,6 +50,7 @@ def url_with_everything():
"?credentials_path=/some/path/to.json"
"&location=some-location"
"&arraysize=1000"
"&list_tables_page_size=5000"
"&clustering_fields=a,b,c"
"&create_disposition=CREATE_IF_NEEDED"
"&destination=different-project.different-dataset.table"
@@ -72,12 +73,14 @@ def test_basic(url_with_everything):
arraysize,
credentials_path,
job_config,
list_tables_page_size,
) = parse_url(url_with_everything)

assert project_id == "some-project"
assert location == "some-location"
assert dataset_id == "some-dataset"
assert arraysize == 1000
assert list_tables_page_size == 5000
assert credentials_path == "/some/path/to.json"
assert isinstance(job_config, QueryJobConfig)

@@ -136,6 +139,7 @@ def test_all_values(url_with_everything, param, value, default):
"param, value",
[
("arraysize", "not-int"),
("list_tables_page_size", "not-int"),
("create_disposition", "not-attribute"),
("destination", "not.fully-qualified"),
("dry_run", "not-bool"),
@@ -167,25 +171,43 @@ def test_empty_with_non_config():
"bigquery:///?location=some-location&arraysize=1000&credentials_path=/some/path/to.json"
)
)
project_id, location, dataset_id, arraysize, credentials_path, job_config = url
(
project_id,
location,
dataset_id,
arraysize,
credentials_path,
job_config,
list_tables_page_size,
) = url

assert project_id is None
assert location == "some-location"
assert dataset_id is None
assert arraysize == 1000
assert credentials_path == "/some/path/to.json"
assert job_config is None
assert list_tables_page_size is None


def test_only_dataset():
url = parse_url(make_url("bigquery:///some-dataset"))
project_id, location, dataset_id, arraysize, credentials_path, job_config = url
(
project_id,
location,
dataset_id,
arraysize,
credentials_path,
job_config,
list_tables_page_size,
) = url

assert project_id is None
assert location is None
assert dataset_id == "some-dataset"
assert arraysize is None
assert credentials_path is None
assert list_tables_page_size is None
assert isinstance(job_config, QueryJobConfig)
# we can't actually test that the dataset is on the job_config,
# since we take care of that afterwards, when we have a client to fill in the project