googleapis · gcf-merge-on-green · Aug 11, 2021 · Jul 28, 2021 · Jul 28, 2021 · Jul 28, 2021
@@ -3206,16 +3206,14 @@ def query(
                 class.
         """
         job_id_given = job_id is not None
-        job_id = _make_job_id(job_id, job_id_prefix)
+        job_id_save = job_id
 
         if project is None:
             project = self.project
 
         if location is None:
             location = self.location
 
-        job_config = copy.deepcopy(job_config)
-
         if self._default_query_job_config:
             if job_config:
                 _verify_job_config_type(
@@ -3225,6 +3223,8 @@ def query(
                 # that is in the default,
                 # should be filled in with the default
                 # the incoming therefore has precedence
+                #
+                # Note that _fill_from_default doesn't mutate the receiver
                 job_config = job_config._fill_from_default(
                     self._default_query_job_config
                 )
@@ -3233,34 +3233,53 @@ def query(
                     self._default_query_job_config,
                     google.cloud.bigquery.job.QueryJobConfig,
                 )
-                job_config = copy.deepcopy(self._default_query_job_config)
+                job_config = self._default_query_job_config
 
-        job_ref = job._JobReference(job_id, project=project, location=location)
-        query_job = job.QueryJob(job_ref, query, client=self, job_config=job_config)
+        # Note that we haven't modified the original job_config (or
+        # _default_query_job_config) up to this point.
+        job_config_save = job_config
 
-        try:
-            query_job._begin(retry=retry, timeout=timeout)
-        except core_exceptions.Conflict as create_exc:
-            # The thought is if someone is providing their own job IDs and they get
-            # their job ID generation wrong, this could end up returning results for
-            # the wrong query. We thus only try to recover if job ID was not given.
-            if job_id_given:
-                raise create_exc
+        def do_query():
+            # Make a copy now, so that original doesn't get changed by the process
+            # below and to facilitate retry
+            job_config = copy.deepcopy(job_config_save)
+
+            job_id = _make_job_id(job_id_save, job_id_prefix)
+            job_ref = job._JobReference(job_id, project=project, location=location)
+            query_job = job.QueryJob(job_ref, query, client=self, job_config=job_config)
 
             try:
-                query_job = self.get_job(
-                    job_id,
-                    project=project,
-                    location=location,
-                    retry=retry,
-                    timeout=timeout,
-                )
-            except core_exceptions.GoogleAPIError:  # (includes RetryError)
-                raise create_exc
+                query_job._begin(retry=retry, timeout=timeout)
+            except core_exceptions.Conflict as create_exc:
+                # The thought is if someone is providing their own job IDs and they get
+                # their job ID generation wrong, this could end up returning results for
+                # the wrong query. We thus only try to recover if job ID was not given.
+                if job_id_given:
+                    raise create_exc
+
+                try:
+                    query_job = self.get_job(
+                        job_id,
+                        project=project,
+                        location=location,
+                        retry=retry,
+                        timeout=timeout,
+                    )
+                except core_exceptions.GoogleAPIError:  # (includes RetryError)
+                    raise create_exc
+                else:
+                    return query_job
             else:
                 return query_job
-        else:
-            return query_job
+
+        future = do_query()
+        # The future might be in a failed state now, but if it's
+        # unrecoverable, we'll find out when we ask for its result, at which
+        # point, we may retry.
+        if not job_id_given:
+            future.retry_do_query = do_query  # in case we have to retry later
+
+        return future
 
     def insert_rows(
         self,

@@ -1300,12 +1300,47 @@ def result(
                 If the job did not complete in the given timeout.
         """
         try:
-            super(QueryJob, self).result(retry=retry, timeout=timeout)
+            retry_do_query = getattr(self, "retry_do_query", None)
+            first = True
+            sub_retry = retry if retry_do_query is None else None
+
+            def do_get_result():
+                nonlocal first
+
+                if first:
+                    first = False
+                else:
+                    # Note that we won't get here if retry_do_query is
+                    # None, because we won't use a retry.
+
+                    # The original job failed. Create a new one.
+                    job = retry_do_query()
+
+                    # If it's already failed, we might as well stop:
+                    if job.done() and job.exception() is not None:
+                        raise job.exception()
+
+                    # Become the new job:
+                    self.__dict__.clear()
+                    self.__dict__.update(job.__dict__)
+
+                    # This shouldn't be necessary, because once we have a good
+                    # job, it should stay good, and we shouldn't have to retry.
+                    # But let's be paranoid. :)
+                    self.retry_do_query = retry_do_query
+
+                super(QueryJob, self).result(retry=sub_retry, timeout=timeout)
+
+                # Since the job could already be "done" (e.g. got a finished job
+                # via client.get_job), the superclass call to done() might not
+                # set the self._query_results cache.
+                self._reload_query_results(retry=sub_retry, timeout=timeout)
+
+            if retry is not None and retry_do_query is not None:
+                do_get_result = retry(do_get_result)
+
+            do_get_result()
 
-            # Since the job could already be "done" (e.g. got a finished job
-            # via client.get_job), the superclass call to done() might not
-            # set the self._query_results cache.
-            self._reload_query_results(retry=retry, timeout=timeout)
         except exceptions.GoogleAPICallError as exc:
             exc.message += self._format_for_exception(self.query, self.job_id)
             exc.query_job = self

diff --git a/tests/system/test_query_retry.py b/tests/system/test_query_retry.py
@@ -0,0 +1,46 @@
+import contextlib
+import threading
+import time
+
+import google.cloud.bigquery
+
+
+def thread(func):
+    thread = threading.Thread(target=func, daemon=True)
+    thread.start()
+    return thread
+
+
+def test_query_retry_539(bigquery_client, dataset_id):
+    """
+    Test semantic retry
+
+    See: https://github.com/googleapis/python-bigquery/issues/539
+    """
+    from google.api_core import exceptions
+    from google.api_core.retry import if_exception_type, Retry
+
+    table_name = f"{dataset_id}.t539"
+    job = bigquery_client.query(f"select count(*) from {table_name}")
+    job_id = job.job_id
+
+    # We could already know that the job failed, but we're not supposed
+    # to find out until we call result(), which is where the retry happens.
+    assert job.done()
+    assert job.exception() is not None
+
+    @thread
+    def create_table():
+        time.sleep(1)
+        with contextlib.closing(google.cloud.bigquery.Client()) as client:
+            client.query(f"create table {table_name} (id int64)")
+
+    retry_policy = Retry(predicate=if_exception_type(exceptions.NotFound))
+    [[count]] = list(job.result(retry=retry_policy))
+    assert count == 0
+
+    # The job was retried, and thus got a new job id
+    assert job.job_id != job_id
+
+    # Make sure we don't leave a thread behind:
+    create_table.join()
diff --git a/tests/unit/test_query_retry.py b/tests/unit/test_query_retry.py
@@ -0,0 +1,134 @@
+import datetime
+
+import mock
+import pytest
+
+import google.api_core.exceptions
+
+from .helpers import make_connection
+
+
+@mock.patch("time.sleep")
+def test_retry_failed_jobs(sleep, client):
+    """
+    Test retry of job failures, as opposed to API-invocation failures.
+    """
+    err = dict(reason="rateLimitExceeded")
+    responses = [
+        dict(status=dict(state="DONE", errors=[err], errorResult=err)),
+        dict(status=dict(state="DONE", errors=[err], errorResult=err)),
+        dict(status=dict(state="DONE", errors=[err], errorResult=err)),
+        dict(status=dict(state="DONE")),
+        dict(rows=[{"f": [{"v": "1"}]}], totalRows="1"),
+    ]
+
+    def api_request(method, path, query_params=None, data=None, **kw):
+        response = responses.pop(0)
+        if data:
+            response["jobReference"] = data["jobReference"]
+        else:
+            response["jobReference"] = dict(
+                jobId=path.split("/")[-1], projectId="PROJECT"
+            )
+        return response
+
+    conn = client._connection = make_connection()
+    conn.api_request.side_effect = api_request
+
+    job = client.query("select 1")
+
+    orig_job_id = job.job_id
+    result = job.result()
+    assert result.total_rows == 1
+    assert not responses  # We made all the calls we expected to.
+
+    # The job adjusts its job id based on the id of the last attempt.
+    assert job.job_id != orig_job_id
+    assert job.job_id == conn.mock_calls[3][2]["data"]["jobReference"]["jobId"]
+
+    # We had to sleep three times
+    assert len(sleep.mock_calls) == 3
+
+    # Sleeps are random, however they're more than 0
+    assert min(c[1][0] for c in sleep.mock_calls) > 0
+
+    # They're at most 2 * (multiplier**(number of sleeps - 1)) * initial
+    # The default multiplier is 2
+    assert max(c[1][0] for c in sleep.mock_calls) <= 8
+
+    # We can ask for the result again:
+    responses = [
+        dict(rows=[{"f": [{"v": "1"}]}], totalRows="1"),
+    ]
+    orig_job_id = job.job_id
+    result = job.result()
+    assert result.total_rows == 1
+    assert not responses  # We made all the calls we expected to.
+
+    # We wouldn't (and didn't) fail, because we're dealing with a successful job.
+    # So the job id hasn't changed.
+    assert job.job_id == orig_job_id
+
+
+@mock.patch("google.api_core.retry.datetime_helpers")
+@mock.patch("time.sleep")
+def test_retry_failed_jobs_after_retry_failed(sleep, datetime_helpers, client):
+    """
+    If at first you don't succeed, maybe you will later. :)
+    """
+    conn = client._connection = make_connection()
+
+    datetime_helpers.utcnow.return_value = datetime.datetime(2021, 7, 29, 10, 43, 2)
+
+    err = dict(reason="rateLimitExceeded")
+
+    def api_request(method, path, query_params=None, data=None, **kw):
+        calls = sleep.mock_calls
+        if calls:
+            datetime_helpers.utcnow.return_value += datetime.timedelta(
+                seconds=calls[-1][1][0]
+            )
+        response = dict(status=dict(state="DONE", errors=[err], errorResult=err))
+        response["jobReference"] = data["jobReference"]
+        return response
+
+    conn.api_request.side_effect = api_request
+
+    job = client.query("select 1")
+    orig_job_id = job.job_id
+
+    with pytest.raises(google.api_core.exceptions.RetryError):
+        job.result()
+
+    # We never got a successful job, so the job id never changed:
+    assert job.job_id == orig_job_id
+
+    # We failed because we couldn't succeed after 120 seconds.
+    # But we can try again:
+    responses = [
+        dict(status=dict(state="DONE", errors=[err], errorResult=err)),
+        dict(status=dict(state="DONE", errors=[err], errorResult=err)),
+        dict(status=dict(state="DONE", errors=[err], errorResult=err)),
+        dict(status=dict(state="DONE")),
+        dict(rows=[{"f": [{"v": "1"}]}], totalRows="1"),
+    ]
+
+    def api_request(method, path, query_params=None, data=None, **kw):
+        calls = sleep.mock_calls
+        datetime_helpers.utcnow.return_value += datetime.timedelta(
+            seconds=calls[-1][1][0]
+        )
+        response = responses.pop(0)
+        if data:
+            response["jobReference"] = data["jobReference"]
+        else:
+            response["jobReference"] = dict(
+                jobId=path.split("/")[-1], projectId="PROJECT"
+            )
+        return response
+
+    conn.api_request.side_effect = api_request
+    result = job.result()
+    assert result.total_rows == 1
+    assert not responses  # We made all the calls we expected to.
+    assert job.job_id != orig_job_id