Skip to content

Commit

Permalink
[PLAT-12830][PLAT-13409] Universe left locked, and restore left "In P…
Browse files Browse the repository at this point in the history
…rogress" if YBA dies during restore. Abort backup/restore operations if YBA restarts during xcluster setup

Summary:
There are 2 changes made as part of this PR:

  # Today, the universe is left locked when a restore operation fails and enabling the load balancer (LB) on the cluster then fails as well. Unlock the universe in all cases.
  # Abort backup and restore operations on YBC if YBA shuts down while an xcluster operation is in progress.

Test Plan: Manually tested the changes to verify that backup and restore operations are aborted when YBA shuts down while an xcluster backup or restore operation is in progress.

Reviewers: vkumar

Reviewed By: vkumar

Subscribers: yugaware

Differential Revision: https://phorge.dev.yugabyte.com/D34207
  • Loading branch information
vpatibandla-yb committed May 6, 2024
1 parent 749d700 commit 628b398
Show file tree
Hide file tree
Showing 4 changed files with 54 additions and 5 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -86,11 +86,9 @@ public void run() {

// Run all the tasks.
getRunnableTask().runSubTasks();
unlockUniverseForUpdate();
if (restore != null) {
restore.update(getTaskUUID(), Restore.State.Completed);
}

} catch (CancellationException ce) {
unlockUniverseForUpdate(false);
isAbort = true;
Expand All @@ -106,9 +104,10 @@ public void run() {
log.error("Error executing task {} with error='{}'.", getName(), t.getMessage(), t);
handleFailedBackupAndRestore(
null, Arrays.asList(restore), isAbort, taskParams().alterLoadBalancer);
unlockUniverseForUpdate();
kubernetesStatus.updateRestoreJobStatus("Failed Restore task", getUserTaskUUID());
throw t;
} finally {
unlockUniverseForUpdate();
}
kubernetesStatus.updateRestoreJobStatus("Finished Restore", getUserTaskUUID());
log.info("Finished {} task.", getName());
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
import com.yugabyte.yw.forms.BackupTableParams;
import com.yugabyte.yw.models.Backup;
import com.yugabyte.yw.models.Universe;
import com.yugabyte.yw.models.helpers.TaskType;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
Expand Down Expand Up @@ -210,7 +211,8 @@ public void run() {
Throwables.propagate(e);
}
} catch (CancellationException ce) {
if (!taskExecutor.isShutdown()) {
if (!taskExecutor.isShutdown()
|| !getRunnableTask().getTaskInfo().getTaskType().equals(TaskType.CreateBackup)) {
if (ce.getMessage().contains("Task aborted on YB-Controller")) {
// Remove task on YB-Controller server.
ybcManager.deleteYbcBackupTask(taskParams().taskID, ybcClient);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -229,11 +229,14 @@ public void run() {
restoreKeyspace.update(getTaskUUID(), RestoreKeyspace.State.Completed);
}
} catch (CancellationException ce) {
if (!taskExecutor.isShutdown()) {
if (!taskExecutor.isShutdown()
|| !getRunnableTask().getTaskInfo().getTaskType().equals(TaskType.RestoreBackup)) {
// update aborted/failed - not showing aborted from here.
if (restoreKeyspace != null) {
restoreKeyspace.update(getTaskUUID(), RestoreKeyspace.State.Aborted);
}
ybcManager.abortRestoreTask(
taskParams().customerUUID, taskParams().prefixUUID, taskId, ybcClient);
ybcManager.deleteYbcBackupTask(taskId, ybcClient);
}
Throwables.propagate(ce);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@

package com.yugabyte.yw.common.backuprestore.ybc;

import static play.mvc.Http.Status.BAD_REQUEST;

import com.google.common.annotations.VisibleForTesting;
import com.google.common.collect.ImmutableSet;
import com.google.inject.Inject;
Expand All @@ -15,6 +17,7 @@
import com.yugabyte.yw.common.KubernetesManagerFactory;
import com.yugabyte.yw.common.NFSUtil;
import com.yugabyte.yw.common.NodeManager;
import com.yugabyte.yw.common.PlatformServiceException;
import com.yugabyte.yw.common.ReleaseManager;
import com.yugabyte.yw.common.StorageUtilFactory;
import com.yugabyte.yw.common.Util;
Expand All @@ -38,6 +41,7 @@
import com.yugabyte.yw.models.InstanceType;
import com.yugabyte.yw.models.Provider;
import com.yugabyte.yw.models.Region;
import com.yugabyte.yw.models.Restore;
import com.yugabyte.yw.models.Universe;
import com.yugabyte.yw.models.Universe.UniverseUpdater;
import com.yugabyte.yw.models.configs.CustomerConfig;
Expand Down Expand Up @@ -496,6 +500,47 @@ public void abortBackupTask(
}
}

/**
 * Requests an abort of a restore task on YB-Controller and, once the controller reports the task
 * as aborted, deletes the task from the controller.
 *
 * <p>Any failure while talking to YB-Controller is logged and not rethrown: a non-OK abort
 * response, a task status other than ABORT after the abort request, or an exception raised by the
 * client calls all cause the method to return without deleting the task.
 *
 * @param customerUUID UUID of the customer; not read by this method.
 * @param restoreUUID UUID used to look up the {@code Restore} record (used for log messages).
 * @param taskID ID of the task on YB-Controller to abort.
 * @param ybcClient client used to issue the abort and task-result requests.
 * @throws PlatformServiceException with {@code BAD_REQUEST} if no restore exists for {@code
 *     restoreUUID}.
 */
public void abortRestoreTask(
    UUID customerUUID, UUID restoreUUID, String taskID, YbcClient ybcClient) {
  Restore restore =
      Restore.fetchRestore(restoreUUID)
          .orElseThrow(
              () ->
                  new PlatformServiceException(
                      BAD_REQUEST, String.format("Invalid restore UUID: %s", restoreUUID)));
  try {
    // Step 1: ask YB-Controller to abort the task.
    BackupServiceTaskAbortRequest abortTaskRequest =
        BackupServiceTaskAbortRequest.newBuilder().setTaskId(taskID).build();
    BackupServiceTaskAbortResponse abortTaskResponse =
        ybcClient.backupServiceTaskAbort(abortTaskRequest);
    if (!abortTaskResponse.getStatus().getCode().equals(ControllerStatus.OK)) {
      LOG.error(
          "Aborting restore {} task errored out with {}.",
          restore.getRestoreUUID(),
          abortTaskResponse.getStatus().getErrorMessage());
      return;
    }

    // Step 2: confirm the task actually reached the ABORT state before cleaning it up.
    BackupServiceTaskResultRequest taskResultRequest =
        BackupServiceTaskResultRequest.newBuilder().setTaskId(taskID).build();
    BackupServiceTaskResultResponse taskResultResponse =
        ybcClient.backupServiceTaskResult(taskResultRequest);
    if (!taskResultResponse.getTaskStatus().equals(ControllerStatus.ABORT)) {
      LOG.error(
          "Aborting restore {} task errored out and is in {} state.",
          restore.getRestoreUUID(),
          taskResultResponse.getTaskStatus());
      return;
    }

    LOG.info(
        "Restore {} task is successfully aborted on Yb-controller.", restore.getRestoreUUID());
    deleteYbcBackupTask(taskID, ybcClient);
  } catch (Exception e) {
    // Best-effort abort: do not propagate, but pass the throwable as the trailing argument so
    // the stack trace is logged rather than only the message.
    LOG.error(
        "Restore {} task abort failed with error: {}.",
        restore.getRestoreUUID(),
        e.getMessage(),
        e);
  }
}

public void deleteYbcBackupTask(String taskID, YbcClient ybcClient) {
try {
BackupServiceTaskResultRequest taskResultRequest =
Expand Down

0 comments on commit 628b398

Please sign in to comment.