Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

CASSANDRA-19472 - CEP-15 (C*) Integrate accord with repair #3174

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
4 changes: 2 additions & 2 deletions .gitmodules
@@ -1,4 +1,4 @@
[submodule "modules/accord"]
path = modules/accord
url = https://github.com/apache/cassandra-accord.git
branch = trunk
url = ../cassandra-accord.git
branch = accord-IR
Expand Up @@ -35,6 +35,7 @@
/** A class that extracts system properties for the cassandra node it runs within. */
public enum CassandraRelevantProperties
{
ACCORD_AGENT_CLASS("cassandra.test.accord.agent"),
ACCORD_REPAIR_RANGE_STEP_UPDATE_INTERVAL("cassandra.accord.repair.range_step_update_interval", "100"),
ACQUIRE_RETRY_SECONDS("cassandra.acquire_retry_seconds", "60"),
ACQUIRE_SLEEP_MS("cassandra.acquire_sleep_ms", "1000"),
Expand Down
Expand Up @@ -246,7 +246,7 @@ public void finished()
checkNotNull(minVersion, "Unable to determine minimum cluster version");
IAccordService accordService = AccordService.instance();
if (session.streamOperation().requiresBarrierTransaction()
&& cfs.metadata().isAccordEnabled()
&& cfs.metadata().requiresAccordSupport()
aweisberg marked this conversation as resolved.
Show resolved Hide resolved
&& CassandraVersion.CASSANDRA_5_0.compareTo(minVersion) >= 0)
accordService.postStreamReceivingBarrier(cfs, ranges);

Expand Down
12 changes: 6 additions & 6 deletions src/java/org/apache/cassandra/metrics/TableMetrics.java
Expand Up @@ -186,9 +186,9 @@ public class TableMetrics
/** Latency for locally run key migrations **/
public final LatencyMetrics keyMigration;
/** Latency for range migrations run by locally coordinated Accord repairs **/
public final LatencyMetrics rangeMigration;
public final TableMeter rangeMigrationUnexpectedFailures;
public final TableMeter rangeMigrationDependencyLimitFailures;
public final LatencyMetrics accordRepair;
public final TableMeter accordRepairUnexpectedFailures;
public final TableMeter accordRepairDependencyLimitFailures;
/** percent of the data that is repaired */
public final Gauge<Double> percentRepaired;
/** Reports the size of sstables in repaired, unrepaired, and any ongoing repair buckets */
Expand Down Expand Up @@ -813,9 +813,9 @@ public Long getValue()
casPropose = createLatencyMetrics("CasPropose", cfs.keyspace.metric.casPropose);
casCommit = createLatencyMetrics("CasCommit", cfs.keyspace.metric.casCommit);
keyMigration = createLatencyMetrics("KeyMigration", cfs.keyspace.metric.keyMigration, GLOBAL_KEY_MIGRATION_LATENCY);
rangeMigration = createLatencyMetrics("RangeMigration", cfs.keyspace.metric.rangeMigration, GLOBAL_RANGE_MIGRATION_LATENCY);
rangeMigrationUnexpectedFailures = createTableMeter("RangeMigrationUnexpectedFailures", cfs.keyspace.metric.rangeMigrationUnexpectedFailures);
rangeMigrationDependencyLimitFailures = createTableMeter("RangeMigrationDependencyLimitFaiures", cfs.keyspace.metric.rangeMigrationDependencyLimitFailures);
accordRepair = createLatencyMetrics("AccordRepair", cfs.keyspace.metric.rangeMigration, GLOBAL_RANGE_MIGRATION_LATENCY);
accordRepairUnexpectedFailures = createTableMeter("AccordRepairUnexpectedFailures", cfs.keyspace.metric.rangeMigrationUnexpectedFailures);
accordRepairDependencyLimitFailures = createTableMeter("AccordRepairDependencyLimitFaiures", cfs.keyspace.metric.rangeMigrationDependencyLimitFailures);

repairsStarted = createTableCounter("RepairJobsStarted");
repairsCompleted = createTableCounter("RepairJobsCompleted");
Expand Down
66 changes: 0 additions & 66 deletions src/java/org/apache/cassandra/repair/AbstractRepairJob.java

This file was deleted.

3 changes: 2 additions & 1 deletion src/java/org/apache/cassandra/repair/AbstractRepairTask.java
Expand Up @@ -79,7 +79,8 @@ private List<RepairSession> submitRepairSessions(TimeUUID parentSession,
options.optimiseStreams(),
options.repairPaxos(),
options.paxosOnly(),
options.accordRepair(),
options.accordOnly(),
options.isConsensusMigration(),
executor,
validationScheduler,
cfnames);
Expand Down
18 changes: 18 additions & 0 deletions src/java/org/apache/cassandra/repair/RepairCoordinator.java
Expand Up @@ -66,6 +66,7 @@
import org.apache.cassandra.repair.state.CoordinatorState;
import org.apache.cassandra.schema.SchemaConstants;
import org.apache.cassandra.schema.SystemDistributedKeyspace;
import org.apache.cassandra.schema.TableMetadata;
import org.apache.cassandra.service.ActiveRepairService;
import org.apache.cassandra.service.ActiveRepairService.ParentRepairStatus;
import org.apache.cassandra.service.ClientState;
Expand Down Expand Up @@ -282,12 +283,29 @@ public void run()
}
}

private void validate(RepairOption options)
{
if (options.paxosOnly() && options.accordOnly())
throw new IllegalArgumentException("Cannot specify a repair as both paxos only and accord only");

for (ColumnFamilyStore cfs : columnFamilies)
{
TableMetadata metadata = cfs.metadata();
if (options.paxosOnly() && !metadata.supportsPaxosOperations())
throw new IllegalArgumentException(String.format("Cannot run paxos only repair on %s.%s, which isn't configured for paxos operations", cfs.keyspace.getName(), cfs.name));

if (options.accordOnly() && !metadata.requiresAccordSupport())
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

As an operator this might be painful because it requires you to know which kind you should invoke when in reality you just want to run the active consensus system's repair and you don't care which. I think we should split the ranges and then run the appropriate repair for each.

If we can make existing external tooling continue to "just work" we definitely should.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thinking a little harder about this being the X only case which is unusual, it still seems like we shouldn't expose the need to determine which ranges belong to which system outside of C*.

If you want to make this a TODO and file a JIRA we can kick the can down the road on this one.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

throw new IllegalArgumentException(String.format("Cannot run accord only repair on %s.%s, which isn't configured for accord operations", cfs.keyspace.getName(), cfs.name));
}
}

private void runMayThrow() throws Throwable
{
state.phase.setup();
ctx.repair().recordRepairStatus(state.cmd, ParentRepairStatus.IN_PROGRESS, ImmutableList.of());

populateColumnFamilies();
validate(state.options);

this.traceState = maybeCreateTraceState(columnFamilies);
notifyStarting();
Expand Down