[controller] Do not delete parent VT for target colo batch push #964

Merged: 9 commits, merged on May 1, 2024
Changes from 7 commits
@@ -952,6 +952,7 @@ void postValidationConsumption(Set<String> candidateRegions) {
} else {
// perform data recovery for BATCH stores
for (String region: candidateRegions) {
+ LOGGER.info("Pushing from {} to {}", pushJobSetting.kafkaSourceRegion, region);
ControllerResponse response = controllerClient.dataRecovery(
pushJobSetting.kafkaSourceRegion,
region,
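For context, the loop above fans a batch push out to each remaining region via data recovery, and the added line logs the source and destination of each hop. Below is a standalone sketch of the same pattern; RegionClient is a hypothetical stand-in for controllerClient.dataRecovery (whose call this diff truncates mid-argument-list), not Venice's actual API.

import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

// Standalone sketch of the fan-out-with-logging pattern shown above.
// RegionClient is a hypothetical stand-in for controllerClient.dataRecovery().
public final class DataRecoveryFanOut {
  private static final Logger LOGGER = LoggerFactory.getLogger(DataRecoveryFanOut.class);

  interface RegionClient {
    boolean recover(String sourceRegion, String destRegion);
  }

  static Map<String, Boolean> fanOut(RegionClient client, String sourceRegion, List<String> candidateRegions) {
    Map<String, Boolean> results = new LinkedHashMap<>();
    for (String region: candidateRegions) {
      // Log the hop before issuing the controller call, mirroring the new log line.
      LOGGER.info("Pushing from {} to {}", sourceRegion, region);
      results.put(region, client.recover(sourceRegion, region));
    }
    return results;
  }
}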
@@ -25,7 +25,6 @@
import static com.linkedin.venice.hadoop.VenicePushJobConstants.SEND_CONTROL_MESSAGES_DIRECTLY;
import static com.linkedin.venice.hadoop.VenicePushJobConstants.SOURCE_KAFKA;
import static com.linkedin.venice.hadoop.VenicePushJobConstants.TARGETED_REGION_PUSH_ENABLED;
- import static com.linkedin.venice.hadoop.VenicePushJobConstants.TARGETED_REGION_PUSH_LIST;
import static com.linkedin.venice.integration.utils.VeniceControllerWrapper.D2_SERVICE_NAME;
import static com.linkedin.venice.integration.utils.VeniceControllerWrapper.PARENT_D2_SERVICE_NAME;
import static com.linkedin.venice.meta.PersistenceType.ROCKS_DB;
@@ -923,70 +922,6 @@ public void testControllerBlocksConcurrentBatchPush() {
* not receive any data.
* @throws IOException
*/
- @Test(timeOut = TEST_TIMEOUT)
- public void testTargetedRegionPushJob() throws Exception {
-   motherOfAllTests(
-       "testTargetedRegionPushJob",
-       updateStoreQueryParams -> updateStoreQueryParams.setPartitionCount(1),
-       100,
-       (parentControllerClient, clusterName, storeName, props, inputDir) -> {
-         // start a targeted region push which should only increase the version to 1 in dc-0
-         props.put(TARGETED_REGION_PUSH_ENABLED, true);
-         props.put(POST_VALIDATION_CONSUMPTION_ENABLED, false);
-         try (VenicePushJob job = new VenicePushJob("Test push job 1", props)) {
-           job.run(); // the job should succeed
-
-           TestUtils.waitForNonDeterministicAssertion(45, TimeUnit.SECONDS, () -> {
-             Map<String, Integer> coloVersions =
-                 parentControllerClient.getStore(storeName).getStore().getColoToCurrentVersions();
-
-             coloVersions.forEach((colo, version) -> {
-               if (colo.equals(DEFAULT_NATIVE_REPLICATION_SOURCE)) {
-                 Assert.assertEquals((int) version, 1);
-               } else {
-                 Assert.assertEquals((int) version, 0);
-               }
-             });
-           });
-         }
-
-         // specify two regions, so both dc-0 and dc-1 are updated to version 2
-         props.setProperty(TARGETED_REGION_PUSH_LIST, "dc-0, dc-1");
-         try (VenicePushJob job = new VenicePushJob("Test push job 2", props)) {
-           job.run(); // the job should succeed
-
-           TestUtils.waitForNonDeterministicAssertion(30, TimeUnit.SECONDS, () -> {
-             // Current version should become 2 in both data centers
-             for (int version: parentControllerClient.getStore(storeName)
-                 .getStore()
-                 .getColoToCurrentVersions()
-                 .values()) {
-               Assert.assertEquals(version, 2);
-             }
-           });
-         }
-
-         // emergency source is dc-0, so dc-1 isn't selected as the source fabric, but the push should still complete
-         props.setProperty(TARGETED_REGION_PUSH_LIST, "dc-1");
-         try (VenicePushJob job = new VenicePushJob("Test push job 3", props)) {
-           job.run(); // the job should succeed
-
-           TestUtils.waitForNonDeterministicAssertion(30, TimeUnit.SECONDS, () -> {
-             Map<String, Integer> coloVersions =
-                 parentControllerClient.getStore(storeName).getStore().getColoToCurrentVersions();
-
-             coloVersions.forEach((colo, version) -> {
-               if (colo.equals("dc-1")) {
-                 Assert.assertEquals((int) version, 3);
-               } else {
-                 Assert.assertEquals((int) version, 2);
-               }
-             });
-           });
-         }
-       });
- }
-
@Test(timeOut = TEST_TIMEOUT * 2)
public void testTargetedRegionPushJobFullConsumptionForBatchStore() throws Exception {
// make sure the participant store is up and running in dest region otherwise the test will be flaky
@@ -1010,7 +945,7 @@ public void testTargetedRegionPushJobFullConsumptionForBatchStore() throws Exception {
100,
(parentControllerClient, clusterName, storeName, props, inputDir) -> {
props.put(TARGETED_REGION_PUSH_ENABLED, false);
- props.put(POST_VALIDATION_CONSUMPTION_ENABLED, false);
+ // props.put(POST_VALIDATION_CONSUMPTION_ENABLED, false);
try (VenicePushJob job = new VenicePushJob("Test push job 1", props)) {
job.run(); // the job should succeed

@@ -1025,7 +960,7 @@ public void testTargetedRegionPushJobFullConsumptionForBatchStore() throws Exception {
});
}
props.put(TARGETED_REGION_PUSH_ENABLED, true);
- props.put(POST_VALIDATION_CONSUMPTION_ENABLED, true);
+ // props.put(POST_VALIDATION_CONSUMPTION_ENABLED, true);
TestWriteUtils.writeSimpleAvroFileWithStringToStringSchema(inputDir, 20);
try (VenicePushJob job = new VenicePushJob("Test push job 2", props)) {
job.run(); // the job should succeed
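Both variants of this test poll with TestUtils.waitForNonDeterministicAssertion until the per-colo versions converge. As a rough standalone equivalent of that helper (a hypothetical re-implementation for illustration, not Venice's TestUtils):

import java.util.concurrent.TimeUnit;

// Hypothetical stand-in for TestUtils.waitForNonDeterministicAssertion: retry
// the assertion until it passes or the timeout elapses, then rethrow the last
// failure.
public final class PollingAssert {
  public static void waitForAssertion(long timeout, TimeUnit unit, Runnable assertion)
      throws InterruptedException {
    long deadline = System.nanoTime() + unit.toNanos(timeout);
    while (true) {
      try {
        assertion.run();
        return; // assertion finally passed
      } catch (AssertionError e) {
        if (System.nanoTime() >= deadline) {
          throw e; // timed out: surface the last failure
        }
        TimeUnit.MILLISECONDS.sleep(100); // back off briefly, then retry
      }
    }
  }
}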
@@ -3526,13 +3526,37 @@ private OfflinePushStatusInfo getOffLineJobStatus(
String storeName = Version.parseStoreFromKafkaTopicName(kafkaTopic);
int versionNum = Version.parseVersionFromKafkaTopicName(kafkaTopic);
boolean deferredSwap = isDeferredSwap(clusterName, storeName, versionNum);
+ HelixVeniceClusterResources resources = getVeniceHelixAdmin().getHelixVeniceClusterResources(clusterName);

if (!deferredSwap) {
-   truncateTopicsOptionally(
-       clusterName,
-       kafkaTopic,
-       incrementalPushVersion,
-       currentReturnStatus,
-       currentReturnStatusDetails);
+   try (AutoCloseableLock ignore = resources.getClusterLockManager().createStoreWriteLock(storeName)) {
+     ReadWriteStoreRepository repository = resources.getStoreMetadataRepository();
+     Store parentStore = repository.getStore(storeName);
+     boolean isTargetRegionPush = !StringUtils.isEmpty(targetedRegions);
+     boolean isVersionPushed =
+         parentStore.getVersion(versionNum).map(v -> v.getStatus().equals(PUSHED)).orElse(false);
+     // Truncate the topic after the push reaches a terminal state if:
+     // 1. it's a hybrid store or a regular push, or
+     // 2. target-region push is enabled and the version was already pushed by a previous target-colo push (status == PUSHED)
+     if (!isTargetRegionPush || isVersionPushed
+         || parentStore.getVersion(versionNum).get().getHybridStoreConfig() != null) {
+       LOGGER
+           .info("Truncating parent VT {} after push status {}", kafkaTopic, currentReturnStatus.getRootStatus());
+       truncateTopicsOptionally(
+           clusterName,
+           kafkaTopic,
+           incrementalPushVersion,
+           currentReturnStatus,
+           currentReturnStatusDetails);
+     }
+     if (isTargetRegionPush && !isVersionPushed) {
+       parentStore.updateVersionStatus(versionNum, PUSHED);
+       repository.updateStore(parentStore);
+     } else {
+       parentStore.updateVersionStatus(versionNum, ONLINE);
+       repository.updateStore(parentStore);
+     }
+   }
}
}
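In short, the new block makes parent version topic (VT) truncation conditional. A distilled, standalone sketch of that decision follows; the class name, enum subset, and helper methods are hypothetical stand-ins for illustration, not the controller's actual API.

// Illustrative only: hypothetical names, and a subset of the real VersionStatus values.
enum VersionStatus { CREATED, PUSHED, ONLINE }

final class ParentTopicTruncationPolicy {
  /**
   * Whether the parent version topic (VT) may be truncated once the push
   * reaches a terminal state.
   */
  static boolean shouldTruncateParentTopic(
      boolean isTargetRegionPush, VersionStatus status, boolean isHybridStore) {
    boolean alreadyPushed = status == VersionStatus.PUSHED;
    // Regular pushes and hybrid stores truncate as before; a target-colo push
    // keeps the parent VT alive until the follow-up push to the remaining
    // colos (which observes status == PUSHED) completes.
    return !isTargetRegionPush || alreadyPushed || isHybridStore;
  }

  /** The version-status transition that pairs with the decision above. */
  static VersionStatus nextStatus(boolean isTargetRegionPush, VersionStatus status) {
    boolean alreadyPushed = status == VersionStatus.PUSHED;
    return (isTargetRegionPush && !alreadyPushed) ? VersionStatus.PUSHED : VersionStatus.ONLINE;
  }
}

The pairing is the point: the first target-colo pass keeps the parent VT and records PUSHED, so the follow-up push to the remaining colos can still consume from it, and only that second pass truncates the topic and marks the version ONLINE.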

@@ -59,12 +59,14 @@
import com.linkedin.venice.meta.OfflinePushStrategy;
import com.linkedin.venice.meta.PersistenceType;
import com.linkedin.venice.meta.ReadStrategy;
+ import com.linkedin.venice.meta.ReadWriteStoreRepository;
import com.linkedin.venice.meta.RegionPushDetails;
import com.linkedin.venice.meta.RoutingStrategy;
import com.linkedin.venice.meta.Store;
import com.linkedin.venice.meta.StoreInfo;
import com.linkedin.venice.meta.Version;
import com.linkedin.venice.meta.VersionImpl;
+ import com.linkedin.venice.meta.VersionStatus;
import com.linkedin.venice.meta.ViewConfigImpl;
import com.linkedin.venice.meta.ZKStore;
import com.linkedin.venice.offsets.OffsetRecord;
@@ -87,6 +89,7 @@
import com.linkedin.venice.utils.Time;
import com.linkedin.venice.utils.Utils;
import com.linkedin.venice.utils.concurrent.VeniceConcurrentHashMap;
+ import com.linkedin.venice.utils.locks.ClusterLockManager;
import com.linkedin.venice.views.ChangeCaptureView;
import com.linkedin.venice.writer.VeniceWriter;
import java.nio.ByteBuffer;
@@ -1560,7 +1563,16 @@ public void testGetExecutionStatus() {
doReturn(false).when(store).isIncrementalPushEnabled();
doReturn(Optional.empty()).when(store).getVersion(anyInt());
doReturn(store).when(internalAdmin).getStore(anyString(), anyString());
+ // doReturn(false).when(parentAdmin).isDeferredSwap(anyString(), anyString(), anyInt());
+ HelixVeniceClusterResources resources = mock(HelixVeniceClusterResources.class);
+ doReturn(mock(ClusterLockManager.class)).when(resources).getClusterLockManager();
+ doReturn(resources).when(internalAdmin).getHelixVeniceClusterResources(anyString());
+ ReadWriteStoreRepository repository = mock(ReadWriteStoreRepository.class);
+ doReturn(repository).when(resources).getStoreMetadataRepository();
+ doReturn(store).when(repository).getStore(anyString());
+ Version version = mock(Version.class);
+ doReturn(Optional.of(version)).when(store).getVersion(anyInt());
+ doReturn(VersionStatus.CREATED).when(version).getStatus();
+ doReturn(Version.PushType.BATCH).when(version).getPushType();
Admin.OfflinePushStatusInfo offlineJobStatus = parentAdmin.getOffLineJobStatus("IGNORED", "topic1_v1", completeMap);
Map<String, String> extraInfo = offlineJobStatus.getExtraInfo();
Assert.assertEquals(offlineJobStatus.getExecutionStatus(), ExecutionStatus.COMPLETED);
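To make the intended transitions concrete, here is a quick TestNG check against the hypothetical ParentTopicTruncationPolicy sketch above (it depends on that sketch, and exercises it rather than the production VeniceParentHelixAdmin):

import org.testng.Assert;
import org.testng.annotations.Test;

public class ParentTopicTruncationPolicyTest {
  @Test
  public void testTargetColoPushDefersTruncation() {
    // First pass of a target-colo push: keep the parent VT, mark PUSHED.
    Assert.assertFalse(
        ParentTopicTruncationPolicy.shouldTruncateParentTopic(true, VersionStatus.CREATED, false));
    Assert.assertEquals(
        ParentTopicTruncationPolicy.nextStatus(true, VersionStatus.CREATED), VersionStatus.PUSHED);

    // Second pass (status already PUSHED): truncate and go ONLINE.
    Assert.assertTrue(
        ParentTopicTruncationPolicy.shouldTruncateParentTopic(true, VersionStatus.PUSHED, false));
    Assert.assertEquals(
        ParentTopicTruncationPolicy.nextStatus(true, VersionStatus.PUSHED), VersionStatus.ONLINE);

    // Regular pushes and hybrid stores truncate immediately, as before.
    Assert.assertTrue(
        ParentTopicTruncationPolicy.shouldTruncateParentTopic(false, VersionStatus.CREATED, false));
    Assert.assertTrue(
        ParentTopicTruncationPolicy.shouldTruncateParentTopic(true, VersionStatus.CREATED, true));
  }
}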