Skip to content

Commit

Permalink
[#22138] docdb: Remove master-failover retry code from CloneStateManager
Browse files Browse the repository at this point in the history
Summary:
The clone state manager currently persists enough data to retry a clone after a master failover. Since we have decided that this retry behavior is overly complicated for now, much of the persisted data (e.g. the source / target tablet ids for all cloned tablets) can instead be kept in memory in the CloneStateInfo structure.

Fixes #22138.

**Upgrade/Rollback safety:**
Only touches an entity that is not meant for production use yet (guarded by `FLAGS_enable_db_clone`).

Jira: DB-11063

Test Plan: existing tests

Reviewers: mhaddad

Reviewed By: mhaddad

Subscribers: ybase

Differential Revision: https://phorge.dev.yugabyte.com/D34499
  • Loading branch information
SrivastavaAnubhav committed May 4, 2024
1 parent e24c0f8 commit 994579d
Show file tree
Hide file tree
Showing 7 changed files with 260 additions and 217 deletions.
32 changes: 12 additions & 20 deletions src/yb/master/catalog_entity_info.proto
Original file line number Diff line number Diff line change
Expand Up @@ -582,36 +582,28 @@ message SysSnapshotEntryPB {

message SysCloneStatePB {
// State for clone operation.
// Initial state:
// - For YSQL: CLONE_SCHEMA_STARTED: async task to create PG schema is started.
// - For YCQL: CREATING: CLONE_OP for every tablet is issued.
// Transitions:
// CLONE_SCHEMA_STARTED --> CREATING: After successful response from create pg schema async task.
// CLONE_SCHEMA_STARTED: Initial state.
// CLONE_SCHEMA_STARTED --> CREATING: Once DB objects have been created by ysql_dump (YSQL) or
// ImportSnapshot (YCQL). Clone tablet RPCs are scheduled for all tablets.
// CREATING --> RESTORING: All tablets in clone namespace are created and in a running
// stated. Restore snapshot is issued.
// RESTORING --> RESTORED: Restore snapshot completed
// state. Restore snapshot is issued.
// RESTORING --> RESTORED: Restore snapshot completed.
// Aggregate progress of a clone operation; values advance in declaration order.
enum State {
// Initial state.
CLONE_SCHEMA_STARTED = 1;
// DB objects have been created and clone tablet RPCs are scheduled for all tablets.
CREATING = 2;
// All tablets in the clone namespace are running; restore snapshot has been issued.
RESTORING = 3;
// Restore snapshot completed.
RESTORED = 4;
}

message TabletData {
optional string source_tablet_id = 1;
optional string target_tablet_id = 2;
}
optional State aggregate_state = 1;

repeated TabletData tablet_data = 1;
optional bytes source_snapshot_id = 2;
optional bytes target_snapshot_id = 3;
optional string source_namespace_id = 4;
optional uint32 clone_request_seq_no = 5;
optional fixed64 restore_time = 6;
// Needed to load the clone state into the CloneStateManager's map.
optional string source_namespace_id = 2;
optional uint32 clone_request_seq_no = 3;

// Aggregate state of the clone operation. Valid transitions:
// CLONE_SCHEMA_STARTED --> CREATING --> RESTORING --> RESTORED
optional State aggregate_state = 7;
// Used for debugging.
optional string target_namespace_name = 4;
optional fixed64 restore_time = 5;
}

message SchemaVersionMappingPB {
Expand Down
40 changes: 40 additions & 0 deletions src/yb/master/clone/clone_state_entity.cc
Original file line number Diff line number Diff line change
Expand Up @@ -28,4 +28,44 @@ void CloneStateInfo::Load(const SysCloneStatePB& metadata) {
// Builds a clone state entry keyed by the given clone request id; the id is moved into
// the immutable clone_request_id_ member.
CloneStateInfo::CloneStateInfo(std::string id)
    : clone_request_id_(std::move(id)) {}

std::vector<CloneStateInfo::TabletData> CloneStateInfo::GetTabletData() {
std::lock_guard l(mutex_);
return tablet_data_;
}

// Appends one tablet data entry to the recorded list while holding mutex_. The argument
// is taken by value and moved into the vector.
void CloneStateInfo::AddTabletData(TabletData tablet_data) {
  std::lock_guard<std::mutex> guard(mutex_);
  tablet_data_.emplace_back(std::move(tablet_data));
}

// Returns a reference to the stored source snapshot id, acquiring mutex_ for the
// duration of the call.
// NOTE(review): the returned reference is read by the caller after mutex_ is released.
// Presumably safe because the id is written once before readers run; confirm, or
// consider returning by value.
const TxnSnapshotId& CloneStateInfo::SourceSnapshotId() {
  std::lock_guard<std::mutex> guard(mutex_);
  return source_snapshot_id_;
}

// Overwrites the stored source snapshot id while holding mutex_.
void CloneStateInfo::SetSourceSnapshotId(const TxnSnapshotId& source_snapshot_id) {
  std::lock_guard<std::mutex> guard(mutex_);
  source_snapshot_id_ = source_snapshot_id;
}

// Returns a reference to the stored target snapshot id, acquiring mutex_ for the
// duration of the call.
// NOTE(review): the returned reference is read by the caller after mutex_ is released.
// Presumably safe because the id is written once before readers run; confirm, or
// consider returning by value.
const TxnSnapshotId& CloneStateInfo::TargetSnapshotId() {
  std::lock_guard<std::mutex> guard(mutex_);
  return target_snapshot_id_;
}

// Overwrites the stored target snapshot id while holding mutex_.
void CloneStateInfo::SetTargetSnapshotId(const TxnSnapshotId& target_snapshot_id) {
  std::lock_guard<std::mutex> guard(mutex_);
  target_snapshot_id_ = target_snapshot_id;
}

// Returns a reference to the stored restoration id, acquiring mutex_ for the duration
// of the call.
// NOTE(review): the returned reference is read by the caller after mutex_ is released.
// Presumably safe because the id is written once before readers run; confirm, or
// consider returning by value.
const TxnSnapshotRestorationId& CloneStateInfo::RestorationId() {
  std::lock_guard<std::mutex> guard(mutex_);
  return restoration_id_;
}

// Overwrites the stored restoration id while holding mutex_.
void CloneStateInfo::SetRestorationId(const TxnSnapshotRestorationId& restoration_id) {
  std::lock_guard<std::mutex> guard(mutex_);
  restoration_id_ = restoration_id;
}

} // namespace yb::master
28 changes: 28 additions & 0 deletions src/yb/master/clone/clone_state_entity.h
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
#pragma once

#include "yb/common/entity_ids_types.h"
#include "yb/common/snapshot.h"
#include "yb/master/catalog_entity_base.h"
#include "yb/master/catalog_entity_info.pb.h"
#include "yb/master/sys_catalog.h"
Expand All @@ -26,19 +27,46 @@ struct PersistentCloneStateInfo :
// Catalog entity describing one clone request. It wraps a PersistentCloneStateInfo via
// MetadataCowWrapper and additionally carries tablet and snapshot bookkeeping in plain
// members guarded by mutex_.
class CloneStateInfo : public RefCountedThreadSafe<CloneStateInfo>,
                       public MetadataCowWrapper<PersistentCloneStateInfo> {
 public:
  // A source tablet id paired with a target tablet id.
  struct TabletData {
    TabletId source_tablet_id;
    TabletId target_tablet_id;
  };

  // `id` becomes the clone request id returned by id().
  explicit CloneStateInfo(std::string id);

  // Returns the clone request id supplied at construction.
  const std::string& id() const override { return clone_request_id_; }

  void Load(const SysCloneStatePB& metadata) override;

  // Returns a copy of the recorded tablet data (taken under mutex_).
  std::vector<TabletData> GetTabletData();
  // Appends one entry to the recorded tablet data (under mutex_).
  void AddTabletData(CloneStateInfo::TabletData tablet_data);

  // Accessors for the ids below. Each call locks mutex_ internally.
  // NOTE(review): the getters return references to guarded members, so the caller reads
  // the value after the lock has been released. Presumably fine because each id is set
  // once before it is read (see the member comments); confirm with callers.
  const TxnSnapshotId& SourceSnapshotId();
  void SetSourceSnapshotId(const TxnSnapshotId& source_snapshot_id);

  const TxnSnapshotId& TargetSnapshotId();
  void SetTargetSnapshotId(const TxnSnapshotId& target_snapshot_id);

  const TxnSnapshotRestorationId& RestorationId();
  void SetRestorationId(const TxnSnapshotRestorationId& restoration_id);

 private:
  // The destructor is private; RefCountedThreadSafe is befriended so it can destroy the
  // object.
  friend class RefCountedThreadSafe<CloneStateInfo>;
  ~CloneStateInfo() = default;

  // The ID field is used in the sys_catalog table.
  const std::string clone_request_id_;

  // These fields are set before the clone state is set to CREATING.
  std::vector<TabletData> tablet_data_ GUARDED_BY(mutex_);
  TxnSnapshotId source_snapshot_id_ GUARDED_BY(mutex_) = TxnSnapshotId::Nil();
  TxnSnapshotId target_snapshot_id_ GUARDED_BY(mutex_) = TxnSnapshotId::Nil();

  // This is set before the clone state is set to RESTORING.
  TxnSnapshotRestorationId restoration_id_ GUARDED_BY(mutex_) = TxnSnapshotRestorationId::Nil();

  // Protects every member annotated GUARDED_BY(mutex_) above.
  std::mutex mutex_;

  DISALLOW_COPY_AND_ASSIGN(CloneStateInfo);
};

Expand Down

0 comments on commit 994579d

Please sign in to comment.