From c295c1069eedd9cc473b10dd8e606f38ecc49e16 Mon Sep 17 00:00:00 2001 From: Anubhav Srivastava Date: Mon, 8 Apr 2024 12:49:35 -0400 Subject: [PATCH] [#21572] docdb: Clone hidden tables from active rocksdb Summary: When cloning to time `t`, some of the tablets which were running as of `t` might have been deleted before the snapshot covering time `t` was taken. Since we do not snapshot hidden tables, the SST files for these tablets must be hardlinked from the tablet's active rocksdb (since the tablets are hidden, there is no difference between the data in the active rocks db and the data as of `t`). Note that it is not sufficient to clone from active rocksdb for all tablets that are hidden when the clone operation starts because if a tablet **was** included in the covering snapshot, then active rocksdb is only guaranteed to contain information after the covering snapshot time. This diff adds the above functionality by keeping track of which tablets were not part of the covering snapshot, and setting the new field `clone_from_active_rocksdb` in the clone requests for those tablets. **Upgrade/Rollback safety:** Change is to an API (CloneTablet) which should not be used in any production system yet. Fixes #21572. Jira: DB-10456 Test Plan: `./yb_build.sh --cxx_test snapshot-schedule-test --gtest_filter=CloneFromScheduleTest.CloneAfterDrop` Reviewers: mhaddad Reviewed By: mhaddad Subscribers: ybase, bogdan Differential Revision: https://phorge.dev.yugabyte.com/D33922 --- src/yb/client/snapshot-schedule-test.cc | 27 +++ .../minicluster-snapshot-test.cc | 4 +- src/yb/master/catalog_manager.h | 22 +- src/yb/master/catalog_manager_ext.cc | 197 +++++++++++------- src/yb/master/catalog_manager_if.h | 3 +- .../master/clone/clone_state_manager-test.cc | 25 ++- src/yb/master/clone/clone_state_manager.cc | 34 ++- src/yb/master/clone/clone_state_manager.h | 4 +- src/yb/master/clone/external_functions.h | 7 +- src/yb/tablet/operations.proto | 5 + src/yb/tserver/ts_tablet_manager.cc | 25 ++- 11 files changed, 240 insertions(+), 113 deletions(-) diff --git a/src/yb/client/snapshot-schedule-test.cc b/src/yb/client/snapshot-schedule-test.cc index 8a0b4955b99..9af3a66940f 100644 --- a/src/yb/client/snapshot-schedule-test.cc +++ b/src/yb/client/snapshot-schedule-test.cc @@ -447,6 +447,33 @@ TEST_F(CloneFromScheduleTest, CloneWithNoSchedule) { ASSERT_STR_CONTAINS(status.message().ToBuffer(), "Could not find snapshot schedule"); } +TEST_F(CloneFromScheduleTest, CloneAfterDrop) { + auto schedule_id = ASSERT_RESULT( + snapshot_util_->CreateSchedule(table_, kTableName.namespace_type(), + kTableName.namespace_name())); + ASSERT_OK(snapshot_util_->WaitScheduleSnapshot(schedule_id)); + + ASSERT_NO_FATALS(WriteData(WriteOpType::INSERT, 0 /* transaction */)); + auto row_count = CountTableRows(table_); + auto ht = cluster_->mini_master()->master()->clock()->Now(); + + ASSERT_OK(client_->DeleteTable(kTableName)); + + master::CloneNamespaceRequestPB req; + master::NamespaceIdentifierPB source_namespace; + source_namespace.set_name(kTableName.namespace_name()); + source_namespace.set_database_type(YQLDatabase::YQL_DATABASE_CQL); + *req.mutable_source_namespace() = source_namespace; + req.set_restore_ht(ht.ToUint64()); + req.set_target_namespace_name("clone" /* target_namespace_name */); + ASSERT_OK(CloneAndWait(req)); + + YBTableName clone(YQL_DATABASE_CQL, "clone", kTableName.table_name()); + TableHandle clone_handle; + ASSERT_OK(clone_handle.Open(clone, client_.get())); + ASSERT_EQ(CountTableRows(clone_handle), row_count); +} + TEST_F(SnapshotScheduleTest, RemoveNewTablets) { const auto kInterval = 5s * kTimeMultiplier; const auto kRetention = kInterval * 2; diff --git a/src/yb/integration-tests/minicluster-snapshot-test.cc b/src/yb/integration-tests/minicluster-snapshot-test.cc index dce7b5e6ef0..02070e6acc0 100644 --- a/src/yb/integration-tests/minicluster-snapshot-test.cc +++ b/src/yb/integration-tests/minicluster-snapshot-test.cc @@ -469,7 +469,7 @@ TEST_P(MasterExportSnapshotTest, ExportSnapshotAsOfTime) { LOG(INFO) << Format( "Exporting snapshot from snapshot schedule: $0, Hybrid time = $1", schedule_id, time); auto deadline = CoarseMonoClock::Now() + timeout; - master::SnapshotInfoPB snapshot_info_as_of_time = ASSERT_RESULT( + auto [snapshot_info_as_of_time, not_snapshotted_tablets] = ASSERT_RESULT( mini_cluster()->mini_master()->catalog_manager_impl().GenerateSnapshotInfoFromSchedule( schedule_id, HybridTime::FromMicros(static_cast(time.ToInt64())), deadline)); // 6. @@ -508,7 +508,7 @@ TEST_P(MasterExportSnapshotTest, ExportSnapshotAsOfTimeWithHiddenTables) { LOG(INFO) << Format( "Exporting snapshot from snapshot schedule: $0, Hybrid time = $1", schedule_id, time); auto deadline = CoarseMonoClock::Now() + timeout; - master::SnapshotInfoPB snapshot_info_as_of_time = ASSERT_RESULT( + auto [snapshot_info_as_of_time, not_snapshotted_tablets] = ASSERT_RESULT( mini_cluster()->mini_master()->catalog_manager_impl().GenerateSnapshotInfoFromSchedule( schedule_id, HybridTime::FromMicros(static_cast(time.ToInt64())), deadline)); // 6. Assert the output of 5 and 3 are the same. diff --git a/src/yb/master/catalog_manager.h b/src/yb/master/catalog_manager.h index 46749f7ca9d..9c2cd842638 100644 --- a/src/yb/master/catalog_manager.h +++ b/src/yb/master/catalog_manager.h @@ -1213,16 +1213,19 @@ class CatalogManager : public tserver::TabletPeerLookupIf, const ListSnapshotRestorationsRequestPB* req, ListSnapshotRestorationsResponsePB* resp) override; - // Generate the snapshot info as of export_time from the provided snapshot schedule. - Result GenerateSnapshotInfoFromSchedule( - const SnapshotScheduleId& snapshot_schedule_id, HybridTime export_time, - CoarseTimePoint deadline) override; + Result GetSnapshotInfoForBackup(const TxnSnapshotId& snapshot_id); + // Generate the SnapshotInfoPB as of read_time from the provided snapshot schedule, and return + // the set of tablets that were RUNNING as of read_time but were HIDDEN before the actual snapshot + // was taken). // The SnapshotInfoPB generated by export snapshot as of time should be identical to the - // SnapshotInfoPB generated by the normal export_snapshot (even ordering of tables/tablets) - Result GenerateSnapshotInfoPbAsOfTime( - const TxnSnapshotId& snapshot_id, HybridTime read_time, const docdb::DocDB& doc_db, - std::reference_wrapper db_pending_op); + // SnapshotInfoPB generated by the normal export_snapshot (even ordering of tables/tablets). + Result>> GenerateSnapshotInfoFromSchedule( + const SnapshotScheduleId& snapshot_schedule_id, HybridTime read_time, + CoarseTimePoint deadline) override; + + Result> GetBackupEntriesAsOfTime( + const TxnSnapshotId& snapshot_id, const NamespaceId& source_ns_id, HybridTime read_time); Status RestoreSnapshot( const RestoreSnapshotRequestPB* req, RestoreSnapshotResponsePB* resp, rpc::RpcContext* rpc, @@ -2764,6 +2767,9 @@ class CatalogManager : public tserver::TabletPeerLookupIf, void ScheduleTabletSnapshotOp(const AsyncTabletSnapshotOpPtr& operation) override; + Result> RestoreSnapshotToTmpRocksDb( + tablet::Tablet* tablet, const TxnSnapshotId& snapshot_id, HybridTime restore_at); + Status RestoreSysCatalogCommon( SnapshotScheduleRestoration* restoration, tablet::Tablet* tablet, std::reference_wrapper pending_op, diff --git a/src/yb/master/catalog_manager_ext.cc b/src/yb/master/catalog_manager_ext.cc index 5008c0829c4..3671888f595 100644 --- a/src/yb/master/catalog_manager_ext.cc +++ b/src/yb/master/catalog_manager_ext.cc @@ -15,6 +15,7 @@ #include #include #include +#include #include #include "yb/common/colocated_util.h" @@ -165,6 +166,8 @@ DEFINE_RUNTIME_uint32(default_snapshot_retention_hours, 24, namespace yb { +using google::protobuf::RepeatedPtrField; + using rpc::RpcContext; using pb_util::ParseFromSlice; using client::internal::RemoteTabletServer; @@ -186,9 +189,10 @@ struct TableWithTabletsEntries { } TableWithTabletsEntries() {} - // Add the table with table_id and its tablets entries to snapshot_info - void AddToSnapshotInfo(const TableId& table_id, SnapshotInfoPB* snapshot_info) { - BackupRowEntryPB* table_backup_entry = snapshot_info->add_backup_entries(); + // Add the table with table_id and its tablets entries to a list of backup entries. + void AddToBackupEntries( + const TableId& table_id, RepeatedPtrField* backup_entries) { + BackupRowEntryPB* table_backup_entry = backup_entries->Add(); std::string output; table_entry.AppendToString(&output); *table_backup_entry->mutable_entry() = @@ -196,11 +200,11 @@ struct TableWithTabletsEntries { if (table_entry.schema().has_pgschema_name() && table_entry.schema().pgschema_name() != "") { table_backup_entry->set_pg_schema_name(table_entry.schema().pgschema_name()); } - for (const auto& sysTabletEntry : tablets_entries) { + for (const auto& tablet_entry : tablets_entries) { std::string output; - sysTabletEntry.second.AppendToString(&output); - *snapshot_info->add_backup_entries()->mutable_entry() = - ToSysRowEntry(sysTabletEntry.first, SysRowEntryType::TABLET, std::move(output)); + tablet_entry.second.AppendToString(&output); + *backup_entries->Add()->mutable_entry() = + ToSysRowEntry(tablet_entry.first, SysRowEntryType::TABLET, std::move(output)); } } @@ -1348,41 +1352,7 @@ Status CatalogManager::ImportSnapshotMeta(const ImportSnapshotMetaRequestPB* req return Status::OK(); } -Result CatalogManager::GenerateSnapshotInfoFromSchedule( - const SnapshotScheduleId& snapshot_schedule_id, HybridTime export_time, - CoarseTimePoint deadline) { - LOG(INFO) << Format( - "Servicing GenerateSnapshotInfoFromSchedule for snapshot_schedule_id: $0 and export " - "time: $1", - snapshot_schedule_id, export_time); - auto suitable_snapshot = VERIFY_RESULT(snapshot_coordinator_.GetSuitableSnapshot( - snapshot_schedule_id, export_time, LeaderTerm(), deadline)); - auto snapshot_id = VERIFY_RESULT(FullyDecodeTxnSnapshotId(suitable_snapshot.id())); - LOG(INFO) << Format("Found suitable snapshot: $0", snapshot_id); - auto tablet = VERIFY_RESULT(tablet_peer()->shared_tablet_safe()); - LOG(INFO) << Format( - "Opening Temporary SysCatalog DocDB for : $0 , export_time: $1", snapshot_id, export_time); - // Open a temporary on the side DocDb for the sys.catalog using the data files of snapshot_id and - // reading sys.catalog data as of read_time. - auto dir = VERIFY_RESULT(tablet->snapshots().RestoreToTemporary(snapshot_id, export_time)); - rocksdb::Options rocksdb_options; - tablet->InitRocksDBOptions(&rocksdb_options, /*log_prefix*/ " [TMP]: "); - auto db = VERIFY_RESULT(rocksdb::DB::Open(rocksdb_options, dir)); - // db can't be closed concurrently, so it is ok to use dummy ScopedRWOperation. - auto db_pending_op = ScopedRWOperation(); - auto doc_db = docdb::DocDB::FromRegularUnbounded(db.get()); - - SnapshotInfoPB snapshot_info = VERIFY_RESULT( - GenerateSnapshotInfoPbAsOfTime(snapshot_id, export_time, doc_db, db_pending_op)); - LOG(INFO) << Format("snapshot_info returned: $0", snapshot_info.ShortDebugString()); - return snapshot_info; -} - -Result CatalogManager::GenerateSnapshotInfoPbAsOfTime( - const TxnSnapshotId& snapshot_id, HybridTime read_time, const docdb::DocDB& doc_db, - std::reference_wrapper db_pending_op) { - // Get the SnapshotInfoPB of snapshot_id to set some fields in snapshot_info (all fields except - // backup_entries which will be populated from iterating over DocDB). +Result CatalogManager::GetSnapshotInfoForBackup(const TxnSnapshotId& snapshot_id) { ListSnapshotsRequestPB req; ListSnapshotsResponsePB resp; req.set_snapshot_id(snapshot_id.data(), snapshot_id.size()); @@ -1393,39 +1363,111 @@ Result CatalogManager::GenerateSnapshotInfoPbAsOfTime( return STATUS_FORMAT(InvalidArgument, "Unknown snapshot: $0", snapshot_id); } SnapshotInfoPB snapshot_info = resp.snapshots()[0]; - // Get the namespace_id that this snapshot covers. - auto namespace_it = std::find_if( - snapshot_info.backup_entries().begin(), snapshot_info.backup_entries().end(), - [](const BackupRowEntryPB& backup_entry) { - return backup_entry.entry().type() == SysRowEntryType::NAMESPACE; - }); - if (namespace_it == snapshot_info.backup_entries().end()) { + RSTATUS_DCHECK( + snapshot_info.entry().tablet_snapshots().empty(), IllegalState, + "Expected tablet_snapshots field to be cleared by ListSnapshots"); + RSTATUS_DCHECK( + snapshot_info.entry().entries().empty(), IllegalState, + "Expected entries field to be cleared by ListSnapshots"); + + return snapshot_info; +} + +Result>> + CatalogManager::GenerateSnapshotInfoFromSchedule( + const SnapshotScheduleId& snapshot_schedule_id, HybridTime read_time, + CoarseTimePoint deadline) { + LOG(INFO) << Format( + "Servicing GenerateSnapshotInfoFromSchedule for snapshot_schedule_id: $0 and read_time: $1", + snapshot_schedule_id, read_time); + + // Find or create a snapshot that covers read_time. + auto suitable_snapshot = VERIFY_RESULT(snapshot_coordinator_.GetSuitableSnapshot( + snapshot_schedule_id, read_time, LeaderTerm(), deadline)); + auto snapshot_id = VERIFY_RESULT(FullyDecodeTxnSnapshotId(suitable_snapshot.id())); + LOG(INFO) << Format("Found suitable snapshot: $0", snapshot_id); + + ListSnapshotSchedulesResponsePB resp; + RETURN_NOT_OK(snapshot_coordinator_.ListSnapshotSchedules(snapshot_schedule_id, &resp)); + const auto& filter = resp.schedules(0).options().filter().tables().tables(); + if (filter.empty() || filter.begin()->namespace_().id().empty()) { return STATUS_FORMAT( - NotFound, Format("No namespace entry found in snapshot: $0", snapshot_info.id())); + IllegalState, "No namespace found in filter for schedule id $0", snapshot_schedule_id); + } + const NamespaceId& source_ns_id = filter.begin()->namespace_().id(); + + // Get the SnapshotInfoPB, save the set of tablets it contained, and clear backup_entries. + // backup_entries will be repopulated with the set of tablets that were running at read_time + // later when reading from DocDB as of read_time. + auto snapshot_info = VERIFY_RESULT(GetSnapshotInfoForBackup(snapshot_id)); + std::unordered_set snapshotted_tablets; + for (auto& backup_entry : snapshot_info.backup_entries()) { + if (backup_entry.entry().type() == SysRowEntryType::TABLET) { + snapshotted_tablets.insert(backup_entry.entry().id()); + } } - NamespaceId namespace_id = namespace_it->entry().id(); - // Remove backup_entries from original snapshot as they will be generated as of - // read_time. snapshot_info.clear_backup_entries(); + + // Set backup_entries based on what entries were running in the sys catalog as of read_time. + *snapshot_info.mutable_backup_entries() = VERIFY_RESULT( + GetBackupEntriesAsOfTime(snapshot_id, source_ns_id, read_time)); + VLOG_WITH_FUNC(1) << Format("snapshot_info returned: $0", snapshot_info.ShortDebugString()); + + // Compute the set of tablets that were running as of read_time but were not snapshotted because + // they were hidden before the snapshot was taken. + std::unordered_set not_snapshotted_tablets; + for (const auto& backup_entry : snapshot_info.backup_entries()) { + if (backup_entry.entry().type() == SysRowEntryType::TABLET && + !snapshotted_tablets.contains(backup_entry.entry().id())) { + not_snapshotted_tablets.insert(backup_entry.entry().id()); + } + } + return std::make_pair(std::move(snapshot_info), std::move(not_snapshotted_tablets)); +} + +Result> CatalogManager::GetBackupEntriesAsOfTime( + const TxnSnapshotId& snapshot_id, const NamespaceId& source_ns_id, HybridTime read_time) { + // Open a temporary on-the-side DocDB for the sys.catalog using the data files of snapshot_id and + // read sys.catalog data as of export_time to get the list of tablets that were running at that + // time. + RepeatedPtrField backup_entries; + auto tablet = VERIFY_RESULT(tablet_peer()->shared_tablet_safe()); + LOG(INFO) << Format("Opening temporary SysCatalog DocDB for snapshot $0 at read_time $1", + snapshot_id, read_time); + auto db = VERIFY_RESULT(RestoreSnapshotToTmpRocksDb(tablet.get(), snapshot_id, read_time)); + auto doc_db = docdb::DocDB::FromRegularUnbounded(db.get()); + const docdb::DocReadContext& doc_read_cntxt = doc_read_context(); dockv::ReaderProjection projection(doc_read_cntxt.schema()); + + // db can't be closed concurrently, so it is ok to use dummy ScopedRWOperation. + auto db_pending_op = ScopedRWOperation(); + // Pass 1: Get the SysNamespaceEntryPB of the selected database. docdb::DocRowwiseIterator namespace_iter = docdb::DocRowwiseIterator( projection, doc_read_cntxt, TransactionOperationContext(), doc_db, docdb::ReadOperationData::FromSingleReadTime(read_time), db_pending_op); + bool found_ns = false; RETURN_NOT_OK(EnumerateSysCatalog( &namespace_iter, doc_read_cntxt.schema(), SysRowEntryType::NAMESPACE, - [namespace_id, &snapshot_info](const Slice& id, const Slice& data) -> Status { - if (id.ToBuffer() == namespace_id) { + [&source_ns_id, &backup_entries, &found_ns](const Slice& id, const Slice& data) -> Status { + if (id.ToBuffer() == source_ns_id) { + if (found_ns) { + LOG(WARNING) << "Found duplicate backup entry for namespace " << source_ns_id; + } auto pb = VERIFY_RESULT(pb_util::ParseFromSlice(data)); VLOG_WITH_FUNC(1) << "Found SysNamespaceEntryPB: " << pb.ShortDebugString(); - SysRowEntry* ns_entry = snapshot_info.add_backup_entries()->mutable_entry(); + SysRowEntry* ns_entry = backup_entries.Add()->mutable_entry(); ns_entry->set_id(id.ToBuffer()); ns_entry->set_type(SysRowEntryType::NAMESPACE); ns_entry->set_data(data.ToBuffer()); + found_ns = true; } return Status::OK(); })); + RSTATUS_DCHECK(found_ns, IllegalState, + Format("Did not find backup entry for namespace $0", source_ns_id)); + // Pass 2: Get all the SysTablesEntry of the database that are in running state and not Hidden as // of read_time. // Stores SysTablesEntry and its SysTabletsEntries to order the tablets of each table by @@ -1437,10 +1479,10 @@ Result CatalogManager::GenerateSnapshotInfoPbAsOfTime( docdb::ReadOperationData::FromSingleReadTime(read_time), db_pending_op); RETURN_NOT_OK(EnumerateSysCatalog( &tables_iter, doc_read_cntxt.schema(), SysRowEntryType::TABLE, - [namespace_id, &tables_to_tablets, &colocation_parent_table_id]( + [&source_ns_id, &tables_to_tablets, &colocation_parent_table_id]( const Slice& id, const Slice& data) -> Status { auto pb = VERIFY_RESULT(pb_util::ParseFromSlice(data)); - if (pb.namespace_id() == namespace_id && pb.state() == SysTablesEntryPB::RUNNING && + if (pb.namespace_id() == source_ns_id && pb.state() == SysTablesEntryPB::RUNNING && pb.hide_state() == SysTablesEntryPB_HideState_VISIBLE && !pb.schema().table_properties().is_ysql_catalog_table()) { VLOG_WITH_FUNC(1) << "Found SysTablesEntryPB: " << pb.ShortDebugString(); @@ -1454,6 +1496,7 @@ Result CatalogManager::GenerateSnapshotInfoPbAsOfTime( } return Status::OK(); })); + // Pass 3: Get all the SysTabletsEntry that are in a running state as of read_time and belongs to // the running tables from pass 2. docdb::DocRowwiseIterator tablets_iter = docdb::DocRowwiseIterator( @@ -1461,10 +1504,10 @@ Result CatalogManager::GenerateSnapshotInfoPbAsOfTime( docdb::ReadOperationData::FromSingleReadTime(read_time), db_pending_op); RETURN_NOT_OK(EnumerateSysCatalog( &tablets_iter, doc_read_cntxt.schema(), SysRowEntryType::TABLET, - [namespace_id, &tables_to_tablets](const Slice& id, const Slice& data) -> Status { + [&tables_to_tablets](const Slice& id, const Slice& data) -> Status { auto pb = VERIFY_RESULT(pb_util::ParseFromSlice(data)); // TODO(Yamen): handle tablet splitting cases by either keeping the parent or the children - // according to their state. + // according to their state. if (tables_to_tablets.contains(pb.table_id()) && pb.state() == SysTabletsEntryPB::RUNNING && pb.hide_hybrid_time() == 0) { VLOG_WITH_FUNC(1) << "Found SysTabletsEntryPB: " << pb.ShortDebugString(); @@ -1481,14 +1524,14 @@ Result CatalogManager::GenerateSnapshotInfoPbAsOfTime( // Populate the backup_entries with SysTablesEntry and SysTabletsEntry // Start with the colocation_parent_table_id if the database is colocated. if (colocation_parent_table_id) { - tables_to_tablets[colocation_parent_table_id.value()].AddToSnapshotInfo( - colocation_parent_table_id.value(), &snapshot_info); + tables_to_tablets[colocation_parent_table_id.value()].AddToBackupEntries( + colocation_parent_table_id.value(), &backup_entries); tables_to_tablets.erase(colocation_parent_table_id.value()); } for (auto& sys_table_entry : tables_to_tablets) { - sys_table_entry.second.AddToSnapshotInfo(sys_table_entry.first, &snapshot_info); + sys_table_entry.second.AddToBackupEntries(sys_table_entry.first, &backup_entries); } - return snapshot_info; + return backup_entries; } Status CatalogManager::GetFullUniverseKeyRegistry(const GetFullUniverseKeyRegistryRequestPB* req, @@ -2652,24 +2695,32 @@ void CatalogManager::ScheduleTabletSnapshotOp(const AsyncTabletSnapshotOpPtr& ta WARN_NOT_OK(ScheduleTask(task), "Failed to send create snapshot request"); } +Result> CatalogManager::RestoreSnapshotToTmpRocksDb( + tablet::Tablet* tablet, const TxnSnapshotId& snapshot_id, HybridTime restore_at) { + std::string log_prefix = LogPrefix(); + // Remove ": " to patch suffix. + log_prefix.erase(log_prefix.size() - 2); + + // Restore master snapshot and load it to RocksDB. + auto dir = VERIFY_RESULT(tablet->snapshots().RestoreToTemporary(snapshot_id, restore_at)); + rocksdb::Options rocksdb_options; + tablet->InitRocksDBOptions(&rocksdb_options, log_prefix + " [TMP]: "); + + return rocksdb::DB::Open(rocksdb_options, dir); +} + Status CatalogManager::RestoreSysCatalogCommon( SnapshotScheduleRestoration* restoration, tablet::Tablet* tablet, std::reference_wrapper tablet_pending_op, RestoreSysCatalogState* state, docdb::DocWriteBatch* write_batch, docdb::KeyValuePairPB* restore_kv) { // Restore master snapshot and load it to RocksDB. - auto dir = VERIFY_RESULT(tablet->snapshots().RestoreToTemporary( - restoration->snapshot_id, restoration->restore_at)); - rocksdb::Options rocksdb_options; - std::string log_prefix = LogPrefix(); - // Remove ": " to patch suffix. - log_prefix.erase(log_prefix.size() - 2); - tablet->InitRocksDBOptions(&rocksdb_options, log_prefix + " [TMP]: "); + auto db = VERIFY_RESULT(RestoreSnapshotToTmpRocksDb( + tablet, restoration->snapshot_id, restoration->restore_at)); + auto doc_db = docdb::DocDB::FromRegularUnbounded(db.get()); - auto db = VERIFY_RESULT(rocksdb::DB::Open(rocksdb_options, dir)); // db can't be closed concurrently, so it is ok to use dummy ScopedRWOperation. auto db_pending_op = ScopedRWOperation(); - auto doc_db = docdb::DocDB::FromRegularUnbounded(db.get()); // Load objects to restore and determine obsolete objects. auto schema_packing_provider = &tablet->GetSchemaPackingProvider(); diff --git a/src/yb/master/catalog_manager_if.h b/src/yb/master/catalog_manager_if.h index 744909cc3eb..6d4571f7e06 100644 --- a/src/yb/master/catalog_manager_if.h +++ b/src/yb/master/catalog_manager_if.h @@ -220,7 +220,8 @@ class CatalogManagerIf { virtual Status ListSnapshotRestorations( const ListSnapshotRestorationsRequestPB* req, ListSnapshotRestorationsResponsePB* resp) = 0; - virtual Result GenerateSnapshotInfoFromSchedule( + virtual Result>> + GenerateSnapshotInfoFromSchedule( const SnapshotScheduleId& snapshot_schedule_id, HybridTime export_time, CoarseTimePoint deadline) = 0; diff --git a/src/yb/master/clone/clone_state_manager-test.cc b/src/yb/master/clone/clone_state_manager-test.cc index 100ba316445..fba82638b83 100644 --- a/src/yb/master/clone/clone_state_manager-test.cc +++ b/src/yb/master/clone/clone_state_manager-test.cc @@ -23,8 +23,8 @@ #include "yb/common/snapshot.h" #include "yb/gutil/map-util.h" - #include "yb/gutil/ref_counted.h" + #include "yb/master/catalog_entity_info.h" #include "yb/master/catalog_entity_info.pb.h" #include "yb/master/clone/clone_state_entity.h" @@ -61,11 +61,17 @@ MATCHER_P(CloneTabletRequestPBMatcher, expected, "CloneTabletRequestPBs did not return pb_util::ArePBsEqual(arg, expected, nullptr /* diff_str */); } -std::ostream& operator<<(std::ostream& os, const Result& res) { +// This is needed for the mock of GenerateSnapshotInfoFromSchedule. +std::ostream& operator<<( + std::ostream& os, const Result>>& res) { if (!res.ok()) { os << res.status().ToString(); } else { - os << res->ShortDebugString(); + os << res->first.ShortDebugString(); + os << "Not snapshotted tablets: "; + for (const auto& tablet_id : res->second) { + os << tablet_id << ", "; + } } return os; } @@ -113,9 +119,10 @@ class CloneStateManagerTest : public YBTest { CoarseTimePoint deadline, const LeaderEpoch& epoch), (override)); MOCK_METHOD( - Result, GenerateSnapshotInfoFromSchedule, + (Result>>), + GenerateSnapshotInfoFromSchedule, (const SnapshotScheduleId& snapshot_schedule_id, HybridTime export_time, - CoarseTimePoint deadline), (override)); + CoarseTimePoint deadline), (override)); MOCK_METHOD( Status, DoImportSnapshotMeta, @@ -238,8 +245,10 @@ class CloneStateManagerTest : public YBTest { return clone_state_manager_->GetCloneStateFromSourceNamespace(namespace_id); } - Status ScheduleCloneOps(const CloneStateInfoPtr& clone_state, const LeaderEpoch& epoch) { - return clone_state_manager_->ScheduleCloneOps(clone_state, epoch); + Status ScheduleCloneOps( + const CloneStateInfoPtr& clone_state, const LeaderEpoch& epoch, + const std::unordered_set& not_snapshotted_tablets) { + return clone_state_manager_->ScheduleCloneOps(clone_state, epoch, not_snapshotted_tablets); } MockExternalFunctions& MockFuncs() { @@ -351,7 +360,7 @@ TEST_F(CloneStateManagerTest, ScheduleCloneOps) { EXPECT_CALL(MockFuncs(), ScheduleCloneTabletCall( source_tablets_[i], epoch, CloneTabletRequestPBMatcher(expected_req))); } - ASSERT_OK(ScheduleCloneOps(clone_state, epoch)); + ASSERT_OK(ScheduleCloneOps(clone_state, epoch, {} /* not_snapshotted_tablets */)); } TEST_F(CloneStateManagerTest, HandleCreatingStateAllTabletsCreating) { diff --git a/src/yb/master/clone/clone_state_manager.cc b/src/yb/master/clone/clone_state_manager.cc index 2d6f34dc8ba..527825833c7 100644 --- a/src/yb/master/clone/clone_state_manager.cc +++ b/src/yb/master/clone/clone_state_manager.cc @@ -16,6 +16,7 @@ #include #include "yb/common/common_types.pb.h" +#include "yb/common/entity_ids_types.h" #include "yb/common/snapshot.h" #include "yb/common/wire_protocol.h" #include "yb/gutil/macros.h" @@ -115,7 +116,7 @@ class CloneStateManagerExternalFunctions : public CloneStateManagerExternalFunct return catalog_manager_->DoCreateSnapshot(req, resp, deadline, epoch); } - Result GenerateSnapshotInfoFromSchedule( + Result>> GenerateSnapshotInfoFromSchedule( const SnapshotScheduleId& snapshot_schedule_id, HybridTime export_time, CoarseTimePoint deadline) override { return catalog_manager_->GenerateSnapshotInfoFromSchedule( @@ -215,14 +216,16 @@ Result> CloneStateManager::CloneNamespace( InvalidArgument, "Expected source namespace identifier to have database type and name. " "Got: $0", source_namespace_identifier.ShortDebugString()); } + auto source_namespace = VERIFY_RESULT( + external_funcs_->FindNamespace(source_namespace_identifier)); + const auto& source_namespace_id = source_namespace->id(); + ListSnapshotSchedulesResponsePB resp; RETURN_NOT_OK(external_funcs_->ListSnapshotSchedules(&resp)); auto snapshot_schedule_id = SnapshotScheduleId::Nil(); for (const auto& schedule : resp.schedules()) { auto& tables = schedule.options().filter().tables().tables(); - if (!tables.empty() && - tables[0].namespace_().name() == source_namespace_identifier.name() && - tables[0].namespace_().database_type() == source_namespace_identifier.database_type()) { + if (!tables.empty() && tables[0].namespace_().id() == source_namespace_id) { snapshot_schedule_id = VERIFY_RESULT(FullyDecodeSnapshotScheduleId(schedule.id())); break; } @@ -233,10 +236,7 @@ Result> CloneStateManager::CloneNamespace( source_namespace_identifier.name()); } - auto source_namespace = VERIFY_RESULT( - external_funcs_->FindNamespace(source_namespace_identifier)); auto seq_no = source_namespace->FetchAndIncrementCloneSeqNo(); - const auto source_namespace_id = source_namespace->id(); // Set up persisted clone state. auto clone_state = VERIFY_RESULT(CreateCloneState(seq_no, source_namespace, restore_time)); @@ -277,8 +277,9 @@ Status CloneStateManager::StartTabletsCloning( const HybridTime& restore_time, const std::string& target_namespace_name, CoarseTimePoint deadline, const LeaderEpoch& epoch) { // Export snapshot info. - auto snapshot_info = VERIFY_RESULT(external_funcs_->GenerateSnapshotInfoFromSchedule( - snapshot_schedule_id, restore_time, deadline)); + auto [snapshot_info, not_snapshotted_tablets] = VERIFY_RESULT( + external_funcs_->GenerateSnapshotInfoFromSchedule( + snapshot_schedule_id, restore_time, deadline)); auto source_snapshot_id = VERIFY_RESULT(FullyDecodeTxnSnapshotId(snapshot_info.id())); // Import snapshot info. @@ -322,7 +323,7 @@ Status CloneStateManager::StartTabletsCloning( RETURN_NOT_OK(UpdateCloneStateWithSnapshotInfo( clone_state, source_snapshot_id, target_snapshot_id, tables_data)); - RETURN_NOT_OK(ScheduleCloneOps(clone_state, epoch)); + RETURN_NOT_OK(ScheduleCloneOps(clone_state, epoch, not_snapshotted_tablets)); return Status::OK(); } @@ -444,7 +445,8 @@ Status CloneStateManager::UpdateCloneStateWithSnapshotInfo( } Status CloneStateManager::ScheduleCloneOps( - const CloneStateInfoPtr& clone_state, const LeaderEpoch& epoch) { + const CloneStateInfoPtr& clone_state, const LeaderEpoch& epoch, + const std::unordered_set& not_snapshotted_tablets) { auto lock = clone_state->LockForRead(); auto& pb = lock->pb; for (auto& tablet_data : pb.tablet_data()) { @@ -452,10 +454,20 @@ Status CloneStateManager::ScheduleCloneOps( external_funcs_->GetTabletInfo(tablet_data.source_tablet_id())); auto target_tablet = VERIFY_RESULT( external_funcs_->GetTabletInfo(tablet_data.target_tablet_id())); + auto source_table = source_tablet->table(); auto target_table = target_tablet->table(); + + // Don't need to worry about ordering here because these are both read locks. + auto source_table_lock = source_table->LockForRead(); auto target_table_lock = target_table->LockForRead(); tablet::CloneTabletRequestPB req; + if (not_snapshotted_tablets.contains(tablet_data.source_tablet_id())) { + RSTATUS_DCHECK(source_tablet->LockForRead()->pb.hide_hybrid_time() != 0, IllegalState, + Format("Expected not snapshotted tablet to be in HIDDEN state. Actual: $0", + source_table_lock->state_name())); + req.set_clone_from_active_rocksdb(true); + } req.set_tablet_id(tablet_data.source_tablet_id()); req.set_target_tablet_id(tablet_data.target_tablet_id()); req.set_source_snapshot_id(pb.source_snapshot_id().data(), pb.source_snapshot_id().size()); diff --git a/src/yb/master/clone/clone_state_manager.h b/src/yb/master/clone/clone_state_manager.h index d93125dbd4f..0a0a4643e84 100644 --- a/src/yb/master/clone/clone_state_manager.h +++ b/src/yb/master/clone/clone_state_manager.h @@ -96,7 +96,9 @@ class CloneStateManager { Status LoadCloneState(const std::string& id, const SysCloneStatePB& metadata); - Status ScheduleCloneOps(const CloneStateInfoPtr& clone_state, const LeaderEpoch& epoch); + Status ScheduleCloneOps( + const CloneStateInfoPtr& clone_state, const LeaderEpoch& epoch, + const std::unordered_set& not_snapshotted_tablets); Result GetCloneStateFromSourceNamespace(const NamespaceId& namespace_id); diff --git a/src/yb/master/clone/external_functions.h b/src/yb/master/clone/external_functions.h index 1088538c05a..2634be85f54 100644 --- a/src/yb/master/clone/external_functions.h +++ b/src/yb/master/clone/external_functions.h @@ -59,9 +59,10 @@ class CloneStateManagerExternalFunctionsBase { const CreateSnapshotRequestPB* req, CreateSnapshotResponsePB* resp, CoarseTimePoint deadline, const LeaderEpoch& epoch) = 0; - virtual Result GenerateSnapshotInfoFromSchedule( - const SnapshotScheduleId& snapshot_schedule_id, HybridTime export_time, - CoarseTimePoint deadline) = 0; + virtual Result>> + GenerateSnapshotInfoFromSchedule( + const SnapshotScheduleId& snapshot_schedule_id, HybridTime export_time, + CoarseTimePoint deadline) = 0; virtual Status DoImportSnapshotMeta( const SnapshotInfoPB& snapshot_pb, const LeaderEpoch& epoch, diff --git a/src/yb/tablet/operations.proto b/src/yb/tablet/operations.proto index 5420fa3fd5d..63949b6a341 100644 --- a/src/yb/tablet/operations.proto +++ b/src/yb/tablet/operations.proto @@ -180,4 +180,9 @@ message CloneTabletRequestPB { optional SchemaPB target_schema = 12; optional PartitionSchemaPB target_partition_schema = 13; optional bool target_skip_table_tombstone_check = 14; + + // If set, checkout the active rocksdb folder to the target snapshot. + // This is set for tablets that were hidden before the snapshot that covers the clone time was + // taken (we do not take snapshots after hiding a tablet). + optional bool clone_from_active_rocksdb = 15; } diff --git a/src/yb/tserver/ts_tablet_manager.cc b/src/yb/tserver/ts_tablet_manager.cc index 91bc324ae5f..9511578d229 100644 --- a/src/yb/tserver/ts_tablet_manager.cc +++ b/src/yb/tserver/ts_tablet_manager.cc @@ -98,6 +98,7 @@ #include "yb/tablet/tablet_metadata.h" #include "yb/tablet/tablet_options.h" #include "yb/tablet/tablet_peer.h" +#include "yb/tablet/tablet_snapshots.h" #include "yb/tablet/tablet_types.pb.h" #include "yb/tools/yb-admin_util.h" @@ -1361,8 +1362,6 @@ Status TSTabletManager::ApplyCloneTablet( auto target_meta = VERIFY_RESULT(RaftGroupMetadata::CreateNew( target_meta_data, data_root_dir, wal_root_dir)); - auto source_snapshot_dir = JoinPathSegments( - VERIFY_RESULT(source_tablet->metadata()->TopSnapshotsDir()), source_snapshot_id.ToString()); // Create target parent table directory if required. auto target_parent_table_dir = DirName(target_meta->snapshots_dir()); RETURN_NOT_OK_PREPEND( @@ -1370,10 +1369,24 @@ Status TSTabletManager::ApplyCloneTablet( Format("Failed to create RocksDB table directory $0", target_parent_table_dir)); auto target_snapshot_dir = JoinPathSegments( VERIFY_RESULT(target_meta->TopSnapshotsDir()), target_snapshot_id.ToString()); - LOG(INFO) << Format("Hard-linking from $0 to $1", source_snapshot_dir, target_snapshot_dir); - RETURN_NOT_OK(CopyDirectory( - fs_manager_->env(), source_snapshot_dir, target_snapshot_dir, UseHardLinks::kTrue, - CreateIfMissing::kTrue)); + + // Copy or create the snapshot into the target snapshot directory. + if (request->clone_from_active_rocksdb()) { + RETURN_NOT_OK_PREPEND(source_tablet->snapshots().Create(tablet::CreateSnapshotData { + .snapshot_hybrid_time = operation->hybrid_time(), + .hybrid_time = operation->hybrid_time(), + .op_id = operation->op_id(), + .snapshot_dir = target_snapshot_dir, + .schedule_id = SnapshotScheduleId::Nil(), + }), Format("Could not create from active rocksdb of tablet $0", source_tablet_id)); + } else { + auto source_snapshot_dir = JoinPathSegments(VERIFY_RESULT( + source_tablet->metadata()->TopSnapshotsDir()), source_snapshot_id.ToString()); + LOG(INFO) << Format("Hard-linking from $0 to $1", source_snapshot_dir, target_snapshot_dir); + RETURN_NOT_OK(CopyDirectory( + fs_manager_->env(), source_snapshot_dir, target_snapshot_dir, UseHardLinks::kTrue, + CreateIfMissing::kTrue)); + } if (PREDICT_FALSE(FLAGS_TEST_crash_before_clone_target_marked_ready)) { LOG(FATAL) << "Crashing due to FLAGS_TEST_crash_before_clone_target_marked_ready";