From 6d9c3b884357ddc4d314ebdfac5fc6dda2de3b49 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Knut=20Olav=20L=C3=B8ite?= Date: Wed, 24 Mar 2021 08:55:25 +0100 Subject: [PATCH] fix: transaction retries should not timeout (#1009) Transactions that are retried because of an aborted transaction use the retry settings of the Rollback RPC. This ensures reasonable backoff values. It however also meant that transactions that are retried multiple times could exceed the total timeout of the retry settings, and that again would cause the Aborted error to propagate. This change sets the total timeout for transaction retries to 24 hours and disables any max attempts in the retry settings to prevent retries to fail because the deadline is exceeded. Transactions can still fail with timeout errors if individual RPC invocations exceed the configured timeout of that RPC. This change only prevents timeouts from occurring because of repeated retries of an entire transaction. Fixes #1008 --- .../cloud/spanner/SpannerRetryHelper.java | 25 ++++++-- .../cloud/spanner/SpannerRetryHelperTest.java | 62 +++++++++++++++++++ 2 files changed, 81 insertions(+), 6 deletions(-) diff --git a/google-cloud-spanner/src/main/java/com/google/cloud/spanner/SpannerRetryHelper.java b/google-cloud-spanner/src/main/java/com/google/cloud/spanner/SpannerRetryHelper.java index e22a0f7485..9c49efe2f1 100644 --- a/google-cloud-spanner/src/main/java/com/google/cloud/spanner/SpannerRetryHelper.java +++ b/google-cloud-spanner/src/main/java/com/google/cloud/spanner/SpannerRetryHelper.java @@ -16,6 +16,7 @@ package com.google.cloud.spanner; +import com.google.api.core.ApiClock; import com.google.api.core.NanoClock; import com.google.api.gax.retrying.ResultRetryAlgorithm; import com.google.api.gax.retrying.RetrySettings; @@ -24,6 +25,7 @@ import com.google.cloud.RetryHelper.RetryHelperException; import com.google.cloud.spanner.v1.stub.SpannerStub; import com.google.cloud.spanner.v1.stub.SpannerStubSettings; +import com.google.common.annotations.VisibleForTesting; import com.google.common.base.Throwables; import com.google.spanner.v1.RollbackRequest; import io.grpc.Context; @@ -45,25 +47,36 @@ class SpannerRetryHelper { * retrying aborted transactions will also automatically be updated if the default retry settings * are updated. * + *

A read/write transaction should not timeout while retrying. The total timeout of the retry + * settings is therefore set to 24 hours and there is no max attempts value. + * *

These default {@link RetrySettings} are only used if no retry information is returned by the * {@link AbortedException}. */ - private static final RetrySettings txRetrySettings = - SpannerStubSettings.newBuilder().rollbackSettings().getRetrySettings(); + @VisibleForTesting + static final RetrySettings txRetrySettings = + SpannerStubSettings.newBuilder() + .rollbackSettings() + .getRetrySettings() + .toBuilder() + .setTotalTimeout(Duration.ofHours(24L)) + .setMaxAttempts(0) + .build(); /** Executes the {@link Callable} and retries if it fails with an {@link AbortedException}. */ static T runTxWithRetriesOnAborted(Callable callable) { - return runTxWithRetriesOnAborted(callable, txRetrySettings); + return runTxWithRetriesOnAborted(callable, txRetrySettings, NanoClock.getDefaultClock()); } /** * Executes the {@link Callable} and retries if it fails with an {@link AbortedException} using * the specific {@link RetrySettings}. */ - static T runTxWithRetriesOnAborted(Callable callable, RetrySettings retrySettings) { + @VisibleForTesting + static T runTxWithRetriesOnAborted( + Callable callable, RetrySettings retrySettings, ApiClock clock) { try { - return RetryHelper.runWithRetries( - callable, retrySettings, new TxRetryAlgorithm<>(), NanoClock.getDefaultClock()); + return RetryHelper.runWithRetries(callable, retrySettings, new TxRetryAlgorithm<>(), clock); } catch (RetryHelperException e) { if (e.getCause() != null) { Throwables.throwIfUnchecked(e.getCause()); diff --git a/google-cloud-spanner/src/test/java/com/google/cloud/spanner/SpannerRetryHelperTest.java b/google-cloud-spanner/src/test/java/com/google/cloud/spanner/SpannerRetryHelperTest.java index 415145120f..679cc610ef 100644 --- a/google-cloud-spanner/src/test/java/com/google/cloud/spanner/SpannerRetryHelperTest.java +++ b/google-cloud-spanner/src/test/java/com/google/cloud/spanner/SpannerRetryHelperTest.java @@ -17,8 +17,10 @@ package com.google.cloud.spanner; import static com.google.common.truth.Truth.assertThat; +import static org.junit.Assert.assertEquals; import static org.junit.Assert.fail; +import com.google.api.core.ApiClock; import com.google.common.base.Stopwatch; import com.google.common.util.concurrent.ThreadFactoryBuilder; import com.google.protobuf.Duration; @@ -42,6 +44,66 @@ @RunWith(JUnit4.class) public class SpannerRetryHelperTest { + private static class FakeClock implements ApiClock { + private long currentTime; + + @Override + public long nanoTime() { + return TimeUnit.NANOSECONDS.convert(currentTime, TimeUnit.MILLISECONDS); + } + + @Override + public long millisTime() { + return currentTime; + } + } + + @Test + public void testRetryDoesNotTimeoutAfterTenMinutes() { + final FakeClock clock = new FakeClock(); + final AtomicInteger attempts = new AtomicInteger(); + Callable callable = + new Callable() { + @Override + public Integer call() { + if (attempts.getAndIncrement() == 0) { + clock.currentTime += TimeUnit.MILLISECONDS.convert(10L, TimeUnit.MINUTES); + throw SpannerExceptionFactory.newSpannerException(ErrorCode.ABORTED, "test"); + } + return 1 + 1; + } + }; + assertEquals( + 2, + SpannerRetryHelper.runTxWithRetriesOnAborted( + callable, SpannerRetryHelper.txRetrySettings, clock) + .intValue()); + } + + @Test + public void testRetryDoesFailAfterMoreThanOneDay() { + final FakeClock clock = new FakeClock(); + final AtomicInteger attempts = new AtomicInteger(); + Callable callable = + new Callable() { + @Override + public Integer call() { + if (attempts.getAndIncrement() == 0) { + clock.currentTime += TimeUnit.MILLISECONDS.convert(25L, TimeUnit.HOURS); + throw SpannerExceptionFactory.newSpannerException(ErrorCode.ABORTED, "test"); + } + return 1 + 1; + } + }; + try { + SpannerRetryHelper.runTxWithRetriesOnAborted( + callable, SpannerRetryHelper.txRetrySettings, clock); + fail("missing expected exception"); + } catch (SpannerException e) { + assertEquals(ErrorCode.ABORTED, e.getErrorCode()); + assertEquals(1, attempts.get()); + } + } @Test public void testCancelledContext() {