-
Notifications
You must be signed in to change notification settings - Fork 296
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
bypass cassandra streaming #837
base: 3.x
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -22,6 +22,7 @@ | |
import com.netflix.priam.backup.BackupService; | ||
import com.netflix.priam.backupv2.BackupV2Service; | ||
import com.netflix.priam.cluster.management.ClusterManagementService; | ||
import com.netflix.priam.config.IBackupRestoreConfig; | ||
import com.netflix.priam.config.IConfiguration; | ||
import com.netflix.priam.config.PriamConfigurationPersister; | ||
import com.netflix.priam.defaultimpl.ICassandraProcess; | ||
|
@@ -42,6 +43,7 @@ | |
public class PriamServer implements IService { | ||
private final PriamScheduler scheduler; | ||
private final IConfiguration config; | ||
private final IBackupRestoreConfig backupRestoreConfig; | ||
private final InstanceIdentity instanceIdentity; | ||
private final Sleeper sleeper; | ||
private final ICassandraProcess cassProcess; | ||
|
@@ -56,6 +58,7 @@ public class PriamServer implements IService { | |
@Inject | ||
public PriamServer( | ||
IConfiguration config, | ||
IBackupRestoreConfig backupRestoreConfig, | ||
PriamScheduler scheduler, | ||
InstanceIdentity id, | ||
Sleeper sleeper, | ||
|
@@ -66,6 +69,7 @@ public PriamServer( | |
CassandraTunerService cassandraTunerService, | ||
ClusterManagementService clusterManagementService) { | ||
this.config = config; | ||
this.backupRestoreConfig = backupRestoreConfig; | ||
this.scheduler = scheduler; | ||
this.instanceIdentity = id; | ||
this.sleeper = sleeper; | ||
|
@@ -111,25 +115,48 @@ public void scheduleService() throws Exception { | |
UpdateSecuritySettings.getTimer(instanceIdentity)); | ||
} | ||
|
||
// Set up cassandra tuning. | ||
cassandraTunerService.scheduleService(); | ||
// Set up the background configuration dumping thread | ||
scheduleTask( | ||
scheduler, | ||
PriamConfigurationPersister.class, | ||
PriamConfigurationPersister.getTimer(config)); | ||
|
||
boolean shouldStartCassandra = false; | ||
|
||
// Determine if we need to restore from backup else start cassandra. | ||
if (restoreContext.isRestoreEnabled()) { | ||
// Determine if we need to restore from backup. | ||
if (restoreContext.isRestoreEnabled(config, instanceIdentity.getInstanceInfo())) { | ||
restoreContext.restore(); | ||
} else { // no restores needed | ||
logger.info("No restore needed, task not scheduled"); | ||
if (!config.doesCassandraStartManually()) cassProcess.start(true); // Start cassandra. | ||
else | ||
logger.info( | ||
"config.doesCassandraStartManually() is set to True, hence Cassandra needs to be started manually ..."); | ||
// Start cassandra only if restore is successful. | ||
shouldStartCassandra = true; | ||
} else { | ||
if (instanceIdentity.isReplace() | ||
&& backupRestoreConfig.enableBypassCassandraStreaming()) { | ||
logger.info("Trying to download data instead of streaming from Cassandra."); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. nit: Add "from backup", as in "Trying to download data from backup instead of streaming from Cassandra" |
||
try { | ||
restoreContext.restore(); | ||
instanceIdentity.setReplacedIp(""); | ||
} catch (Exception e) { | ||
logger.error( | ||
"Error while trying to rebuild the node from backup. Maybe backup not available or disk full? Trying normal path of cassandra streaming"); | ||
// Clean the data folder. | ||
SystemUtils.cleanupDir(config.getDataFileLocation(), null); | ||
} finally { | ||
shouldStartCassandra = true; | ||
} | ||
} else { | ||
// no restores needed | ||
logger.info("No restore needed, task not scheduled"); | ||
shouldStartCassandra = true; | ||
} | ||
} | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. How can we exit this block with shouldStartCassandra being false? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. If the original requested […] But yes, with the recent refactoring of restore we will throw an exception there, and thus we don't need that variable. Good catch. |
||
|
||
/* | ||
* Run the delayed task (after 10 seconds) to Monitor Cassandra | ||
* If Restore option is chosen, then Running Cassandra instance is stopped | ||
* Hence waiting for Cassandra to stop | ||
*/ | ||
// Tune Cassandra. | ||
cassandraTunerService.scheduleService(); | ||
|
||
// Start Cassandra. | ||
if (shouldStartCassandra) startCassandra(); | ||
|
||
// Run the delayed task (after 10 seconds) to Monitor Cassandra | ||
scheduler.addTaskWithDelay( | ||
CassandraMonitor.JOBNAME, | ||
CassandraMonitor.class, | ||
|
@@ -139,19 +166,20 @@ public void scheduleService() throws Exception { | |
// Set up management services like flush, compactions etc. | ||
clusterManagementService.scheduleService(); | ||
|
||
// Set up the background configuration dumping thread | ||
scheduleTask( | ||
scheduler, | ||
PriamConfigurationPersister.class, | ||
PriamConfigurationPersister.getTimer(config)); | ||
|
||
// Set up V1 Snapshot Service | ||
backupService.scheduleService(); | ||
|
||
// Set up V2 Snapshot Service | ||
backupV2Service.scheduleService(); | ||
} | ||
|
||
private void startCassandra() throws IOException { | ||
if (!config.doesCassandraStartManually()) cassProcess.start(true); // Start cassandra. | ||
else | ||
logger.info( | ||
"config.doesCassandraStartManually() is set to True, hence Cassandra needs to be started manually ..."); | ||
} | ||
|
||
@Override | ||
public void updateServicePre() throws Exception {} | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -97,4 +97,18 @@ default int getBackupVerificationSLOInHours() { | |
default boolean enableV2Restore() { | ||
return false; | ||
} | ||
|
||
/** | ||
* Build the instance from backups by using the restore process in case of an instance replacement. | ||
* Note that we prefer this when data size is HUGE. C* streaming is super slow and for instances | ||
* with big data size can lead to C* streaming for multiple days. Note that this is a little bit | ||
* dangerous as you "will" lose some of the writes accepted by the old instance but not uploaded to | ||
arunagrawal84 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
* backup file system. Also we do not plan to run local repair on the replaced instance, so data | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I agree that not running repair is acceptable for a first iteration. Hypothetically though, how would we do it? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Ideally, we should be deferring that task to the repair service. Where that repair service sits, how it gets executed is a different conversation though. |
||
* will be stale. We hope that repair will take care of the inconsistency. | ||
* | ||
* @return use restore for replacements (bypassing cassandra streaming), if backup is available. | ||
*/ | ||
default boolean enableBypassCassandraStreaming() { | ||
return true; | ||
} | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
How does Priam determine whether Cassandra hasn't successfully bootstrapped? I looked for an existing check, but I didn't see one.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
TokenRetrieverUtils.inferTokenOwnerFromGossip is used to fetch the instance identity. That method should tell correctly if Cassandra had already bootstrapped successfully.