diff --git a/src/yb/fs/fs_manager.cc b/src/yb/fs/fs_manager.cc index 77c168d547f..ce20b7a9928 100644 --- a/src/yb/fs/fs_manager.cc +++ b/src/yb/fs/fs_manager.cc @@ -339,12 +339,23 @@ Status FsManager::SetUniverseUuidOnTserverInstanceMetadata( const UniverseUuid& universe_uuid) { std::lock_guard lock(metadata_mutex_); SCHECK_NOTNULL(metadata_); + LOG(INFO) << "Setting the universe_uuid to " << universe_uuid; metadata_->mutable_tserver_instance_metadata()->set_universe_uuid(universe_uuid.ToString()); auto instance_metadata_path = VERIFY_RESULT(GetExistingInstanceMetadataPath()); return pb_util::WritePBContainerToPath( env_, instance_metadata_path, *metadata_.get(), pb_util::OVERWRITE, pb_util::SYNC); } +Status FsManager::ClearUniverseUuidOnTserverInstanceMetadata() { + std::lock_guard lock(metadata_mutex_); + SCHECK_NOTNULL(metadata_); + LOG(INFO) << "Clearing the universe_uuid from Instance Metadata"; + metadata_->mutable_tserver_instance_metadata()->clear_universe_uuid(); + auto instance_metadata_path = VERIFY_RESULT(GetExistingInstanceMetadataPath()); + return pb_util::WritePBContainerToPath( + env_, instance_metadata_path, *metadata_.get(), pb_util::OVERWRITE, pb_util::SYNC); +} + Status FsManager::CheckAndOpenFileSystemRoots() { RETURN_NOT_OK(Init()); diff --git a/src/yb/fs/fs_manager.h b/src/yb/fs/fs_manager.h index b0e7b5624ce..bdd77ae7826 100644 --- a/src/yb/fs/fs_manager.h +++ b/src/yb/fs/fs_manager.h @@ -203,6 +203,8 @@ class FsManager { Status SetUniverseUuidOnTserverInstanceMetadata(const UniverseUuid& universe_uuid); + Status ClearUniverseUuidOnTserverInstanceMetadata(); + // Return the path where InstanceMetadataPB is stored. std::string GetInstanceMetadataPath(const std::string& root) const; diff --git a/src/yb/integration-tests/master_heartbeat-itest.cc b/src/yb/integration-tests/master_heartbeat-itest.cc index bbe73c6ab3e..1f6177af20d 100644 --- a/src/yb/integration-tests/master_heartbeat-itest.cc +++ b/src/yb/integration-tests/master_heartbeat-itest.cc @@ -52,6 +52,7 @@ DECLARE_bool(master_enable_universe_uuid_heartbeat_check); namespace yb { +using master::GetMasterClusterConfigResponsePB; namespace integration_tests { class MasterHeartbeatITest : public YBTableTestBase { @@ -147,6 +148,89 @@ TEST_F(MasterHeartbeatITest, IgnorePeerNotInConfig) { }, FLAGS_heartbeat_interval_ms * 5ms, "Wait for proper replica locations.")); } +class MasterHeartbeatITestWithUpgrade : public YBTableTestBase { + public: + void SetUp() override { + // Start the cluster without the universe_uuid generation FLAG to test upgrade. + ANNOTATE_UNPROTECTED_WRITE(FLAGS_master_enable_universe_uuid_heartbeat_check) = false; + YBTableTestBase::SetUp(); + proxy_cache_ = std::make_unique(client_->messenger()); + } + + Status GetClusterConfig(GetMasterClusterConfigResponsePB *config_resp) { + const auto* master = VERIFY_RESULT(mini_cluster_->GetLeaderMiniMaster()); + return master->catalog_manager().GetClusterConfig(config_resp); + } + + Status ClearUniverseUuid() { + for (auto& ts : mini_cluster_->mini_tablet_servers()) { + RETURN_NOT_OK(ts->server()->ClearUniverseUuid()); + } + return Status::OK(); + } + + protected: + std::unique_ptr proxy_cache_; +}; + +TEST_F(MasterHeartbeatITestWithUpgrade, ClearUniverseUuidToRecoverUniverse) { + GetMasterClusterConfigResponsePB resp; + ASSERT_OK(GetClusterConfig(&resp)); + auto cluster_config_version = resp.cluster_config().version(); + LOG(INFO) << "Cluster Config version : " << cluster_config_version; + + // Attempt to clear universe uuid. Should fail when it is not set. + ASSERT_NOK(ClearUniverseUuid()); + + // Enable the flag and wait for heartbeat to propagate the universe_uuid. + ANNOTATE_UNPROTECTED_WRITE(FLAGS_master_enable_universe_uuid_heartbeat_check) = true; + + // Wait for ClusterConfig version to increase. + ASSERT_OK(LoggedWaitFor([&]() { + if (!GetClusterConfig(&resp).ok()) { + return false; + } + + if (!resp.cluster_config().has_universe_uuid()) { + return false; + } + + return true; + }, 60s, "Waiting for new universe uuid to be generated")); + + ASSERT_GE(resp.cluster_config().version(), cluster_config_version); + LOG(INFO) << "Updated cluster config version:" << resp.cluster_config().version(); + LOG(INFO) << "Universe UUID:" << resp.cluster_config().universe_uuid(); + + // Wait for propagation of universe_uuid. + ASSERT_OK(LoggedWaitFor([&]() { + for (auto& ts : mini_cluster_->mini_tablet_servers()) { + auto uuid_str = ts->server()->fs_manager()->GetUniverseUuidFromTserverInstanceMetadata(); + if (!uuid_str.ok() || uuid_str.get() != resp.cluster_config().universe_uuid()) { + return false; + } + } + + return true; + }, 60s, "Waiting for tservers to pick up new universe uuid")); + + // Verify servers are heartbeating correctly. + master::TSDescriptorVector ts_descs; + ASSERT_OK(mini_cluster_->WaitForTabletServerCount(3, &ts_descs, true /* live_only */)); + + // Artificially generate a fake universe uuid and propagate that by clearing the universe_uuid. + ANNOTATE_UNPROTECTED_WRITE(FLAGS_TEST_master_universe_uuid) = Uuid::Generate().ToString(); + ANNOTATE_UNPROTECTED_WRITE(FLAGS_tserver_unresponsive_timeout_ms) = 10 * 1000; + + // Heartbeats should first fail due to universe_uuid mismatch. + ASSERT_OK(mini_cluster_->WaitForTabletServerCount(0, &ts_descs, true /* live_only */)); + + // Once t-server instance metadata is cleared, heartbeats should succeed again. + ASSERT_OK(ClearUniverseUuid()); + ASSERT_OK(mini_cluster_->WaitForTabletServerCount(3, &ts_descs, true /* live_only */)); +} + + class MasterHeartbeatITestWithExternal : public MasterHeartbeatITest { public: bool use_external_mini_cluster() { return true; } diff --git a/src/yb/master/catalog_manager.cc b/src/yb/master/catalog_manager.cc index 5b57a0c9357..79c600a9fb9 100644 --- a/src/yb/master/catalog_manager.cc +++ b/src/yb/master/catalog_manager.cc @@ -1640,6 +1640,7 @@ Status CatalogManager::SetUniverseUuidIfNeeded() { universe_uuid); l.mutable_data()->pb.set_universe_uuid(universe_uuid); + l.mutable_data()->pb.set_version(l.mutable_data()->pb.version() + 1); RETURN_NOT_OK(sys_catalog_->Upsert(leader_ready_term(), cluster_config_.get())); l.Commit(); return Status::OK(); @@ -12390,6 +12391,11 @@ Status CatalogManager::SetClusterConfig( return SetupError(resp->mutable_error(), MasterErrorPB::INVALID_CLUSTER_CONFIG, s); } + if (config.universe_uuid() != l->pb.universe_uuid()) { + Status s = STATUS(InvalidArgument, "Config Universe UUID cannot be updated"); + return SetupError(resp->mutable_error(), MasterErrorPB::INVALID_CLUSTER_CONFIG, s); + } + // TODO(bogdan): should this live here? const ReplicationInfoPB& replication_info = config.replication_info(); for (auto& read_replica : replication_info.read_replicas()) { diff --git a/src/yb/master/sys_catalog-test.cc b/src/yb/master/sys_catalog-test.cc index 9d9dac59bd6..3bb1726192c 100644 --- a/src/yb/master/sys_catalog-test.cc +++ b/src/yb/master/sys_catalog-test.cc @@ -331,10 +331,15 @@ TEST_F(SysCatalogTest, TestSysCatalogPlacementOperations) { req.mutable_cluster_config()->set_cluster_uuid("some-cluster-uuid"); auto status = master_->catalog_manager()->SetClusterConfig(&req, &resp); ASSERT_TRUE(status.IsInvalidArgument()); - - // Setting the cluster uuid should make the request succeed. req.mutable_cluster_config()->set_cluster_uuid(config.cluster_uuid()); + // Verify that we receive an error when trying to change the universe uuid. + req.mutable_cluster_config()->set_universe_uuid("some-universe-uuid"); + status = master_->catalog_manager()->SetClusterConfig(&req, &resp); + ASSERT_TRUE(status.IsInvalidArgument()); + req.mutable_cluster_config()->set_universe_uuid(config.universe_uuid()); + + // Setting the cluster and universe uuid correctly should make the request succeed. ASSERT_OK(master_->catalog_manager()->SetClusterConfig(&req, &resp)); l.Commit(); } diff --git a/src/yb/tools/ts-cli.cc b/src/yb/tools/ts-cli.cc index 807681d43e9..799e10599fc 100644 --- a/src/yb/tools/ts-cli.cc +++ b/src/yb/tools/ts-cli.cc @@ -73,10 +73,12 @@ using yb::consensus::RaftConfigPB; using yb::rpc::Messenger; using yb::rpc::MessengerBuilder; using yb::rpc::RpcController; -using yb::server::ServerStatusPB; using yb::server::ReloadCertificatesRequestPB; using yb::server::ReloadCertificatesResponsePB; +using yb::server::ServerStatusPB; using yb::tablet::TabletStatusPB; +using yb::tserver::ClearUniverseUuidRequestPB; +using yb::tserver::ClearUniverseUuidResponsePB; using yb::tserver::CountIntentsRequestPB; using yb::tserver::CountIntentsResponsePB; using yb::tserver::DeleteTabletRequestPB; @@ -114,6 +116,7 @@ const char* const kCompactAllTabletsOp = "compact_all_tablets"; const char* const kReloadCertificatesOp = "reload_certificates"; const char* const kRemoteBootstrapOp = "remote_bootstrap"; const char* const kListMasterServersOp = "list_master_servers"; +const char* const kClearUniverseUuidOp = "clear_universe_uuid"; DEFINE_UNKNOWN_string(server_address, "localhost", @@ -248,6 +251,9 @@ class TsAdminClient { // List information for all master servers. Status ListMasterServers(); + // Clear Universe Uuid. + Status ClearUniverseUuid(); + private: std::string addr_; MonoDelta timeout_; @@ -681,6 +687,21 @@ Status TsAdminClient::ListMasterServers() { return Status::OK(); } +Status TsAdminClient::ClearUniverseUuid() { + CHECK(initted_); + ClearUniverseUuidRequestPB req; + ClearUniverseUuidResponsePB resp; + RpcController rpc; + + rpc.set_timeout(timeout_); + RETURN_NOT_OK(ts_proxy_->ClearUniverseUuid(req, &resp, &rpc)); + if (resp.has_error()) { + return StatusFromPB(resp.error().status()); + } + + std::cout << "Universe UUID cleared from Instance Metadata" << std::endl; + return Status::OK(); +} namespace { @@ -709,7 +730,8 @@ void SetUsage(const char* argv0) { << " \n" << " " << kReloadCertificatesOp << "\n" << " " << kRemoteBootstrapOp << " \n" - << " " << kListMasterServersOp << "\n"; + << " " << kListMasterServersOp << "\n" + << " " << kClearUniverseUuidOp << "\n"; google::SetUsageMessage(str.str()); } @@ -924,6 +946,11 @@ static int TsCliMain(int argc, char** argv) { RETURN_NOT_OK_PREPEND_FROM_MAIN(client.ListMasterServers(), "Unable to list master servers on " + addr); + } else if (op == kClearUniverseUuidOp) { + CHECK_ARGC_OR_RETURN_WITH_USAGE(op, 2); + + RETURN_NOT_OK_PREPEND_FROM_MAIN(client.ClearUniverseUuid(), + "Unable to clear universe uuid on " + addr); } else { std::cerr << "Invalid operation: " << op << std::endl; google::ShowUsageWithFlagsRestrict(argv[0], __FILE__); diff --git a/src/yb/tserver/tablet_server.cc b/src/yb/tserver/tablet_server.cc index aee3de27af7..d35da322a10 100644 --- a/src/yb/tserver/tablet_server.cc +++ b/src/yb/tserver/tablet_server.cc @@ -1133,6 +1133,15 @@ Status TabletServer::SetConfigVersionAndConsumerRegistry( return Status::OK(); } +Status TabletServer::ClearUniverseUuid() { + auto instance_universe_uuid_str = VERIFY_RESULT( + fs_manager_->GetUniverseUuidFromTserverInstanceMetadata()); + auto instance_universe_uuid = VERIFY_RESULT(UniverseUuid::FromString(instance_universe_uuid_str)); + SCHECK_EQ(false, instance_universe_uuid.IsNil(), IllegalState, + "universe_uuid is not set in instance metadata"); + return fs_manager_->ClearUniverseUuidOnTserverInstanceMetadata(); +} + Status TabletServer::ValidateAndMaybeSetUniverseUuid(const UniverseUuid& universe_uuid) { auto instance_universe_uuid_str = VERIFY_RESULT( fs_manager_->GetUniverseUuidFromTserverInstanceMetadata()); @@ -1144,6 +1153,7 @@ Status TabletServer::ValidateAndMaybeSetUniverseUuid(const UniverseUuid& univers "uuid is $1", universe_uuid.ToString(), instance_universe_uuid.ToString())); return Status::OK(); } + return fs_manager_->SetUniverseUuidOnTserverInstanceMetadata(universe_uuid); } diff --git a/src/yb/tserver/tablet_server.h b/src/yb/tserver/tablet_server.h index b01b71e2873..7feb93dc8de 100644 --- a/src/yb/tserver/tablet_server.h +++ b/src/yb/tserver/tablet_server.h @@ -284,6 +284,8 @@ class TabletServer : public DbServerBase, public TabletServerIf { Status ValidateAndMaybeSetUniverseUuid(const UniverseUuid& universe_uuid); + Status ClearUniverseUuid(); + XClusterConsumer* GetXClusterConsumer() const; // Mark the CDC service as enabled via heartbeat. diff --git a/src/yb/tserver/tablet_service.cc b/src/yb/tserver/tablet_service.cc index 9e992ed18a3..d8f4bb24817 100644 --- a/src/yb/tserver/tablet_service.cc +++ b/src/yb/tserver/tablet_service.cc @@ -2613,6 +2613,17 @@ void TabletServiceImpl::ListMasterServers(const ListMasterServersRequestPB* req, context.RespondSuccess(); } +void TabletServiceImpl::ClearUniverseUuid(const ClearUniverseUuidRequestPB* req, + ClearUniverseUuidResponsePB* resp, + rpc::RpcContext context) { + const Status s = server_->tablet_manager()->server()->ClearUniverseUuid(); + if (!s.ok()) { + SetupErrorAndRespond(resp->mutable_error(), s, &context); + return; + } + context.RespondSuccess(); +} + void TabletServiceImpl::GetLockStatus(const GetLockStatusRequestPB* req, GetLockStatusResponsePB* resp, rpc::RpcContext context) { diff --git a/src/yb/tserver/tablet_service.h b/src/yb/tserver/tablet_service.h index 8e506b20343..6300070e72f 100644 --- a/src/yb/tserver/tablet_service.h +++ b/src/yb/tserver/tablet_service.h @@ -178,6 +178,10 @@ class TabletServiceImpl : public TabletServerServiceIf, public ReadTabletProvide ListMasterServersResponsePB* resp, rpc::RpcContext context) override; + void ClearUniverseUuid(const ClearUniverseUuidRequestPB* req, + ClearUniverseUuidResponsePB* resp, + rpc::RpcContext context) override; + void GetLockStatus(const GetLockStatusRequestPB* req, GetLockStatusResponsePB* resp, rpc::RpcContext context) override; diff --git a/src/yb/tserver/tserver.proto b/src/yb/tserver/tserver.proto index 7b17d79d57d..a39747f9cfb 100644 --- a/src/yb/tserver/tserver.proto +++ b/src/yb/tserver/tserver.proto @@ -330,3 +330,10 @@ message GetCompatibleSchemaVersionResponsePB { optional TabletServerErrorPB error = 1; optional uint32 compatible_schema_version = 2; } + +message ClearUniverseUuidRequestPB { +} + +message ClearUniverseUuidResponsePB { + optional TabletServerErrorPB error = 1; +} diff --git a/src/yb/tserver/tserver_service.proto b/src/yb/tserver/tserver_service.proto index 34e94b49f76..125e7feb0b0 100644 --- a/src/yb/tserver/tserver_service.proto +++ b/src/yb/tserver/tserver_service.proto @@ -104,7 +104,9 @@ service TabletServerService { // Returns the schema version on the XCluster consumer side that is // compatible with the schema provided in the request. rpc GetCompatibleSchemaVersion(GetCompatibleSchemaVersionRequestPB) - returns (GetCompatibleSchemaVersionResponsePB); + returns (GetCompatibleSchemaVersionResponsePB); + + rpc ClearUniverseUuid(ClearUniverseUuidRequestPB) returns (ClearUniverseUuidResponsePB); } message GetLockStatusRequestPB {