Skip to content

Commit

Permalink
fix: transaction retries should not timeout (#1009)
Browse files Browse the repository at this point in the history
Transactions that are retried because of an aborted transaction use the
retry settings of the Rollback RPC. This ensures reasonable backoff values.
It however also meant that transactions that are retried multiple times
could exceed the total timeout of the retry settings, and that again would
cause the Aborted error to propagate. This change sets the total timeout
for transaction retries to 24 hours and disables any max attempts in the
retry settings to prevent retries to fail because the deadline is exceeded.

Transactions can still fail with timeout errors if individual RPC invocations
exceed the configured timeout of that RPC. This change only prevents timeouts
from occurring because of repeated retries of an entire transaction.

Fixes #1008
  • Loading branch information
olavloite committed Mar 24, 2021
1 parent 865bf01 commit 6d9c3b8
Show file tree
Hide file tree
Showing 2 changed files with 81 additions and 6 deletions.
Expand Up @@ -16,6 +16,7 @@

package com.google.cloud.spanner;

import com.google.api.core.ApiClock;
import com.google.api.core.NanoClock;
import com.google.api.gax.retrying.ResultRetryAlgorithm;
import com.google.api.gax.retrying.RetrySettings;
Expand All @@ -24,6 +25,7 @@
import com.google.cloud.RetryHelper.RetryHelperException;
import com.google.cloud.spanner.v1.stub.SpannerStub;
import com.google.cloud.spanner.v1.stub.SpannerStubSettings;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Throwables;
import com.google.spanner.v1.RollbackRequest;
import io.grpc.Context;
Expand All @@ -45,25 +47,36 @@ class SpannerRetryHelper {
* retrying aborted transactions will also automatically be updated if the default retry settings
* are updated.
*
* <p>A read/write transaction should not timeout while retrying. The total timeout of the retry
* settings is therefore set to 24 hours and there is no max attempts value.
*
* <p>These default {@link RetrySettings} are only used if no retry information is returned by the
* {@link AbortedException}.
*/
private static final RetrySettings txRetrySettings =
SpannerStubSettings.newBuilder().rollbackSettings().getRetrySettings();
@VisibleForTesting
static final RetrySettings txRetrySettings =
SpannerStubSettings.newBuilder()
.rollbackSettings()
.getRetrySettings()
.toBuilder()
.setTotalTimeout(Duration.ofHours(24L))
.setMaxAttempts(0)
.build();

/** Executes the {@link Callable} and retries if it fails with an {@link AbortedException}. */
static <T> T runTxWithRetriesOnAborted(Callable<T> callable) {
return runTxWithRetriesOnAborted(callable, txRetrySettings);
return runTxWithRetriesOnAborted(callable, txRetrySettings, NanoClock.getDefaultClock());
}

/**
* Executes the {@link Callable} and retries if it fails with an {@link AbortedException} using
* the specific {@link RetrySettings}.
*/
static <T> T runTxWithRetriesOnAborted(Callable<T> callable, RetrySettings retrySettings) {
@VisibleForTesting
static <T> T runTxWithRetriesOnAborted(
Callable<T> callable, RetrySettings retrySettings, ApiClock clock) {
try {
return RetryHelper.runWithRetries(
callable, retrySettings, new TxRetryAlgorithm<>(), NanoClock.getDefaultClock());
return RetryHelper.runWithRetries(callable, retrySettings, new TxRetryAlgorithm<>(), clock);
} catch (RetryHelperException e) {
if (e.getCause() != null) {
Throwables.throwIfUnchecked(e.getCause());
Expand Down
Expand Up @@ -17,8 +17,10 @@
package com.google.cloud.spanner;

import static com.google.common.truth.Truth.assertThat;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.fail;

import com.google.api.core.ApiClock;
import com.google.common.base.Stopwatch;
import com.google.common.util.concurrent.ThreadFactoryBuilder;
import com.google.protobuf.Duration;
Expand All @@ -42,6 +44,66 @@

@RunWith(JUnit4.class)
public class SpannerRetryHelperTest {
private static class FakeClock implements ApiClock {
private long currentTime;

@Override
public long nanoTime() {
return TimeUnit.NANOSECONDS.convert(currentTime, TimeUnit.MILLISECONDS);
}

@Override
public long millisTime() {
return currentTime;
}
}

@Test
public void testRetryDoesNotTimeoutAfterTenMinutes() {
final FakeClock clock = new FakeClock();
final AtomicInteger attempts = new AtomicInteger();
Callable<Integer> callable =
new Callable<Integer>() {
@Override
public Integer call() {
if (attempts.getAndIncrement() == 0) {
clock.currentTime += TimeUnit.MILLISECONDS.convert(10L, TimeUnit.MINUTES);
throw SpannerExceptionFactory.newSpannerException(ErrorCode.ABORTED, "test");
}
return 1 + 1;
}
};
assertEquals(
2,
SpannerRetryHelper.runTxWithRetriesOnAborted(
callable, SpannerRetryHelper.txRetrySettings, clock)
.intValue());
}

@Test
public void testRetryDoesFailAfterMoreThanOneDay() {
final FakeClock clock = new FakeClock();
final AtomicInteger attempts = new AtomicInteger();
Callable<Integer> callable =
new Callable<Integer>() {
@Override
public Integer call() {
if (attempts.getAndIncrement() == 0) {
clock.currentTime += TimeUnit.MILLISECONDS.convert(25L, TimeUnit.HOURS);
throw SpannerExceptionFactory.newSpannerException(ErrorCode.ABORTED, "test");
}
return 1 + 1;
}
};
try {
SpannerRetryHelper.runTxWithRetriesOnAborted(
callable, SpannerRetryHelper.txRetrySettings, clock);
fail("missing expected exception");
} catch (SpannerException e) {
assertEquals(ErrorCode.ABORTED, e.getErrorCode());
assertEquals(1, attempts.get());
}
}

@Test
public void testCancelledContext() {
Expand Down

0 comments on commit 6d9c3b8

Please sign in to comment.