diff --git a/src/yb/client/snapshot-schedule-test.cc b/src/yb/client/snapshot-schedule-test.cc index 8a0b4955b99..9af3a66940f 100644 --- a/src/yb/client/snapshot-schedule-test.cc +++ b/src/yb/client/snapshot-schedule-test.cc @@ -447,6 +447,33 @@ TEST_F(CloneFromScheduleTest, CloneWithNoSchedule) { ASSERT_STR_CONTAINS(status.message().ToBuffer(), "Could not find snapshot schedule"); } +TEST_F(CloneFromScheduleTest, CloneAfterDrop) { + auto schedule_id = ASSERT_RESULT( + snapshot_util_->CreateSchedule(table_, kTableName.namespace_type(), + kTableName.namespace_name())); + ASSERT_OK(snapshot_util_->WaitScheduleSnapshot(schedule_id)); + + ASSERT_NO_FATALS(WriteData(WriteOpType::INSERT, 0 /* transaction */)); + auto row_count = CountTableRows(table_); + auto ht = cluster_->mini_master()->master()->clock()->Now(); + + ASSERT_OK(client_->DeleteTable(kTableName)); + + master::CloneNamespaceRequestPB req; + master::NamespaceIdentifierPB source_namespace; + source_namespace.set_name(kTableName.namespace_name()); + source_namespace.set_database_type(YQLDatabase::YQL_DATABASE_CQL); + *req.mutable_source_namespace() = source_namespace; + req.set_restore_ht(ht.ToUint64()); + req.set_target_namespace_name("clone" /* target_namespace_name */); + ASSERT_OK(CloneAndWait(req)); + + YBTableName clone(YQL_DATABASE_CQL, "clone", kTableName.table_name()); + TableHandle clone_handle; + ASSERT_OK(clone_handle.Open(clone, client_.get())); + ASSERT_EQ(CountTableRows(clone_handle), row_count); +} + TEST_F(SnapshotScheduleTest, RemoveNewTablets) { const auto kInterval = 5s * kTimeMultiplier; const auto kRetention = kInterval * 2; diff --git a/src/yb/integration-tests/minicluster-snapshot-test.cc b/src/yb/integration-tests/minicluster-snapshot-test.cc index dce7b5e6ef0..02070e6acc0 100644 --- a/src/yb/integration-tests/minicluster-snapshot-test.cc +++ b/src/yb/integration-tests/minicluster-snapshot-test.cc @@ -469,7 +469,7 @@ TEST_P(MasterExportSnapshotTest, ExportSnapshotAsOfTime) { LOG(INFO) << Format( "Exporting snapshot from snapshot schedule: $0, Hybrid time = $1", schedule_id, time); auto deadline = CoarseMonoClock::Now() + timeout; - master::SnapshotInfoPB snapshot_info_as_of_time = ASSERT_RESULT( + auto [snapshot_info_as_of_time, not_snapshotted_tablets] = ASSERT_RESULT( mini_cluster()->mini_master()->catalog_manager_impl().GenerateSnapshotInfoFromSchedule( schedule_id, HybridTime::FromMicros(static_cast(time.ToInt64())), deadline)); // 6. @@ -508,7 +508,7 @@ TEST_P(MasterExportSnapshotTest, ExportSnapshotAsOfTimeWithHiddenTables) { LOG(INFO) << Format( "Exporting snapshot from snapshot schedule: $0, Hybrid time = $1", schedule_id, time); auto deadline = CoarseMonoClock::Now() + timeout; - master::SnapshotInfoPB snapshot_info_as_of_time = ASSERT_RESULT( + auto [snapshot_info_as_of_time, not_snapshotted_tablets] = ASSERT_RESULT( mini_cluster()->mini_master()->catalog_manager_impl().GenerateSnapshotInfoFromSchedule( schedule_id, HybridTime::FromMicros(static_cast(time.ToInt64())), deadline)); // 6. Assert the output of 5 and 3 are the same. diff --git a/src/yb/master/catalog_manager.h b/src/yb/master/catalog_manager.h index 46749f7ca9d..9c2cd842638 100644 --- a/src/yb/master/catalog_manager.h +++ b/src/yb/master/catalog_manager.h @@ -1213,16 +1213,19 @@ class CatalogManager : public tserver::TabletPeerLookupIf, const ListSnapshotRestorationsRequestPB* req, ListSnapshotRestorationsResponsePB* resp) override; - // Generate the snapshot info as of export_time from the provided snapshot schedule. - Result GenerateSnapshotInfoFromSchedule( - const SnapshotScheduleId& snapshot_schedule_id, HybridTime export_time, - CoarseTimePoint deadline) override; + Result GetSnapshotInfoForBackup(const TxnSnapshotId& snapshot_id); + // Generate the SnapshotInfoPB as of read_time from the provided snapshot schedule, and return + // the set of tablets that were RUNNING as of read_time but were HIDDEN before the actual snapshot + // was taken). // The SnapshotInfoPB generated by export snapshot as of time should be identical to the - // SnapshotInfoPB generated by the normal export_snapshot (even ordering of tables/tablets) - Result GenerateSnapshotInfoPbAsOfTime( - const TxnSnapshotId& snapshot_id, HybridTime read_time, const docdb::DocDB& doc_db, - std::reference_wrapper db_pending_op); + // SnapshotInfoPB generated by the normal export_snapshot (even ordering of tables/tablets). + Result>> GenerateSnapshotInfoFromSchedule( + const SnapshotScheduleId& snapshot_schedule_id, HybridTime read_time, + CoarseTimePoint deadline) override; + + Result> GetBackupEntriesAsOfTime( + const TxnSnapshotId& snapshot_id, const NamespaceId& source_ns_id, HybridTime read_time); Status RestoreSnapshot( const RestoreSnapshotRequestPB* req, RestoreSnapshotResponsePB* resp, rpc::RpcContext* rpc, @@ -2764,6 +2767,9 @@ class CatalogManager : public tserver::TabletPeerLookupIf, void ScheduleTabletSnapshotOp(const AsyncTabletSnapshotOpPtr& operation) override; + Result> RestoreSnapshotToTmpRocksDb( + tablet::Tablet* tablet, const TxnSnapshotId& snapshot_id, HybridTime restore_at); + Status RestoreSysCatalogCommon( SnapshotScheduleRestoration* restoration, tablet::Tablet* tablet, std::reference_wrapper pending_op, diff --git a/src/yb/master/catalog_manager_ext.cc b/src/yb/master/catalog_manager_ext.cc index 5008c0829c4..3671888f595 100644 --- a/src/yb/master/catalog_manager_ext.cc +++ b/src/yb/master/catalog_manager_ext.cc @@ -15,6 +15,7 @@ #include #include #include +#include #include #include "yb/common/colocated_util.h" @@ -165,6 +166,8 @@ DEFINE_RUNTIME_uint32(default_snapshot_retention_hours, 24, namespace yb { +using google::protobuf::RepeatedPtrField; + using rpc::RpcContext; using pb_util::ParseFromSlice; using client::internal::RemoteTabletServer; @@ -186,9 +189,10 @@ struct TableWithTabletsEntries { } TableWithTabletsEntries() {} - // Add the table with table_id and its tablets entries to snapshot_info - void AddToSnapshotInfo(const TableId& table_id, SnapshotInfoPB* snapshot_info) { - BackupRowEntryPB* table_backup_entry = snapshot_info->add_backup_entries(); + // Add the table with table_id and its tablets entries to a list of backup entries. + void AddToBackupEntries( + const TableId& table_id, RepeatedPtrField* backup_entries) { + BackupRowEntryPB* table_backup_entry = backup_entries->Add(); std::string output; table_entry.AppendToString(&output); *table_backup_entry->mutable_entry() = @@ -196,11 +200,11 @@ struct TableWithTabletsEntries { if (table_entry.schema().has_pgschema_name() && table_entry.schema().pgschema_name() != "") { table_backup_entry->set_pg_schema_name(table_entry.schema().pgschema_name()); } - for (const auto& sysTabletEntry : tablets_entries) { + for (const auto& tablet_entry : tablets_entries) { std::string output; - sysTabletEntry.second.AppendToString(&output); - *snapshot_info->add_backup_entries()->mutable_entry() = - ToSysRowEntry(sysTabletEntry.first, SysRowEntryType::TABLET, std::move(output)); + tablet_entry.second.AppendToString(&output); + *backup_entries->Add()->mutable_entry() = + ToSysRowEntry(tablet_entry.first, SysRowEntryType::TABLET, std::move(output)); } } @@ -1348,41 +1352,7 @@ Status CatalogManager::ImportSnapshotMeta(const ImportSnapshotMetaRequestPB* req return Status::OK(); } -Result CatalogManager::GenerateSnapshotInfoFromSchedule( - const SnapshotScheduleId& snapshot_schedule_id, HybridTime export_time, - CoarseTimePoint deadline) { - LOG(INFO) << Format( - "Servicing GenerateSnapshotInfoFromSchedule for snapshot_schedule_id: $0 and export " - "time: $1", - snapshot_schedule_id, export_time); - auto suitable_snapshot = VERIFY_RESULT(snapshot_coordinator_.GetSuitableSnapshot( - snapshot_schedule_id, export_time, LeaderTerm(), deadline)); - auto snapshot_id = VERIFY_RESULT(FullyDecodeTxnSnapshotId(suitable_snapshot.id())); - LOG(INFO) << Format("Found suitable snapshot: $0", snapshot_id); - auto tablet = VERIFY_RESULT(tablet_peer()->shared_tablet_safe()); - LOG(INFO) << Format( - "Opening Temporary SysCatalog DocDB for : $0 , export_time: $1", snapshot_id, export_time); - // Open a temporary on the side DocDb for the sys.catalog using the data files of snapshot_id and - // reading sys.catalog data as of read_time. - auto dir = VERIFY_RESULT(tablet->snapshots().RestoreToTemporary(snapshot_id, export_time)); - rocksdb::Options rocksdb_options; - tablet->InitRocksDBOptions(&rocksdb_options, /*log_prefix*/ " [TMP]: "); - auto db = VERIFY_RESULT(rocksdb::DB::Open(rocksdb_options, dir)); - // db can't be closed concurrently, so it is ok to use dummy ScopedRWOperation. - auto db_pending_op = ScopedRWOperation(); - auto doc_db = docdb::DocDB::FromRegularUnbounded(db.get()); - - SnapshotInfoPB snapshot_info = VERIFY_RESULT( - GenerateSnapshotInfoPbAsOfTime(snapshot_id, export_time, doc_db, db_pending_op)); - LOG(INFO) << Format("snapshot_info returned: $0", snapshot_info.ShortDebugString()); - return snapshot_info; -} - -Result CatalogManager::GenerateSnapshotInfoPbAsOfTime( - const TxnSnapshotId& snapshot_id, HybridTime read_time, const docdb::DocDB& doc_db, - std::reference_wrapper db_pending_op) { - // Get the SnapshotInfoPB of snapshot_id to set some fields in snapshot_info (all fields except - // backup_entries which will be populated from iterating over DocDB). +Result CatalogManager::GetSnapshotInfoForBackup(const TxnSnapshotId& snapshot_id) { ListSnapshotsRequestPB req; ListSnapshotsResponsePB resp; req.set_snapshot_id(snapshot_id.data(), snapshot_id.size()); @@ -1393,39 +1363,111 @@ Result CatalogManager::GenerateSnapshotInfoPbAsOfTime( return STATUS_FORMAT(InvalidArgument, "Unknown snapshot: $0", snapshot_id); } SnapshotInfoPB snapshot_info = resp.snapshots()[0]; - // Get the namespace_id that this snapshot covers. - auto namespace_it = std::find_if( - snapshot_info.backup_entries().begin(), snapshot_info.backup_entries().end(), - [](const BackupRowEntryPB& backup_entry) { - return backup_entry.entry().type() == SysRowEntryType::NAMESPACE; - }); - if (namespace_it == snapshot_info.backup_entries().end()) { + RSTATUS_DCHECK( + snapshot_info.entry().tablet_snapshots().empty(), IllegalState, + "Expected tablet_snapshots field to be cleared by ListSnapshots"); + RSTATUS_DCHECK( + snapshot_info.entry().entries().empty(), IllegalState, + "Expected entries field to be cleared by ListSnapshots"); + + return snapshot_info; +} + +Result>> + CatalogManager::GenerateSnapshotInfoFromSchedule( + const SnapshotScheduleId& snapshot_schedule_id, HybridTime read_time, + CoarseTimePoint deadline) { + LOG(INFO) << Format( + "Servicing GenerateSnapshotInfoFromSchedule for snapshot_schedule_id: $0 and read_time: $1", + snapshot_schedule_id, read_time); + + // Find or create a snapshot that covers read_time. + auto suitable_snapshot = VERIFY_RESULT(snapshot_coordinator_.GetSuitableSnapshot( + snapshot_schedule_id, read_time, LeaderTerm(), deadline)); + auto snapshot_id = VERIFY_RESULT(FullyDecodeTxnSnapshotId(suitable_snapshot.id())); + LOG(INFO) << Format("Found suitable snapshot: $0", snapshot_id); + + ListSnapshotSchedulesResponsePB resp; + RETURN_NOT_OK(snapshot_coordinator_.ListSnapshotSchedules(snapshot_schedule_id, &resp)); + const auto& filter = resp.schedules(0).options().filter().tables().tables(); + if (filter.empty() || filter.begin()->namespace_().id().empty()) { return STATUS_FORMAT( - NotFound, Format("No namespace entry found in snapshot: $0", snapshot_info.id())); + IllegalState, "No namespace found in filter for schedule id $0", snapshot_schedule_id); + } + const NamespaceId& source_ns_id = filter.begin()->namespace_().id(); + + // Get the SnapshotInfoPB, save the set of tablets it contained, and clear backup_entries. + // backup_entries will be repopulated with the set of tablets that were running at read_time + // later when reading from DocDB as of read_time. + auto snapshot_info = VERIFY_RESULT(GetSnapshotInfoForBackup(snapshot_id)); + std::unordered_set snapshotted_tablets; + for (auto& backup_entry : snapshot_info.backup_entries()) { + if (backup_entry.entry().type() == SysRowEntryType::TABLET) { + snapshotted_tablets.insert(backup_entry.entry().id()); + } } - NamespaceId namespace_id = namespace_it->entry().id(); - // Remove backup_entries from original snapshot as they will be generated as of - // read_time. snapshot_info.clear_backup_entries(); + + // Set backup_entries based on what entries were running in the sys catalog as of read_time. + *snapshot_info.mutable_backup_entries() = VERIFY_RESULT( + GetBackupEntriesAsOfTime(snapshot_id, source_ns_id, read_time)); + VLOG_WITH_FUNC(1) << Format("snapshot_info returned: $0", snapshot_info.ShortDebugString()); + + // Compute the set of tablets that were running as of read_time but were not snapshotted because + // they were hidden before the snapshot was taken. + std::unordered_set not_snapshotted_tablets; + for (const auto& backup_entry : snapshot_info.backup_entries()) { + if (backup_entry.entry().type() == SysRowEntryType::TABLET && + !snapshotted_tablets.contains(backup_entry.entry().id())) { + not_snapshotted_tablets.insert(backup_entry.entry().id()); + } + } + return std::make_pair(std::move(snapshot_info), std::move(not_snapshotted_tablets)); +} + +Result> CatalogManager::GetBackupEntriesAsOfTime( + const TxnSnapshotId& snapshot_id, const NamespaceId& source_ns_id, HybridTime read_time) { + // Open a temporary on-the-side DocDB for the sys.catalog using the data files of snapshot_id and + // read sys.catalog data as of export_time to get the list of tablets that were running at that + // time. + RepeatedPtrField backup_entries; + auto tablet = VERIFY_RESULT(tablet_peer()->shared_tablet_safe()); + LOG(INFO) << Format("Opening temporary SysCatalog DocDB for snapshot $0 at read_time $1", + snapshot_id, read_time); + auto db = VERIFY_RESULT(RestoreSnapshotToTmpRocksDb(tablet.get(), snapshot_id, read_time)); + auto doc_db = docdb::DocDB::FromRegularUnbounded(db.get()); + const docdb::DocReadContext& doc_read_cntxt = doc_read_context(); dockv::ReaderProjection projection(doc_read_cntxt.schema()); + + // db can't be closed concurrently, so it is ok to use dummy ScopedRWOperation. + auto db_pending_op = ScopedRWOperation(); + // Pass 1: Get the SysNamespaceEntryPB of the selected database. docdb::DocRowwiseIterator namespace_iter = docdb::DocRowwiseIterator( projection, doc_read_cntxt, TransactionOperationContext(), doc_db, docdb::ReadOperationData::FromSingleReadTime(read_time), db_pending_op); + bool found_ns = false; RETURN_NOT_OK(EnumerateSysCatalog( &namespace_iter, doc_read_cntxt.schema(), SysRowEntryType::NAMESPACE, - [namespace_id, &snapshot_info](const Slice& id, const Slice& data) -> Status { - if (id.ToBuffer() == namespace_id) { + [&source_ns_id, &backup_entries, &found_ns](const Slice& id, const Slice& data) -> Status { + if (id.ToBuffer() == source_ns_id) { + if (found_ns) { + LOG(WARNING) << "Found duplicate backup entry for namespace " << source_ns_id; + } auto pb = VERIFY_RESULT(pb_util::ParseFromSlice(data)); VLOG_WITH_FUNC(1) << "Found SysNamespaceEntryPB: " << pb.ShortDebugString(); - SysRowEntry* ns_entry = snapshot_info.add_backup_entries()->mutable_entry(); + SysRowEntry* ns_entry = backup_entries.Add()->mutable_entry(); ns_entry->set_id(id.ToBuffer()); ns_entry->set_type(SysRowEntryType::NAMESPACE); ns_entry->set_data(data.ToBuffer()); + found_ns = true; } return Status::OK(); })); + RSTATUS_DCHECK(found_ns, IllegalState, + Format("Did not find backup entry for namespace $0", source_ns_id)); + // Pass 2: Get all the SysTablesEntry of the database that are in running state and not Hidden as // of read_time. // Stores SysTablesEntry and its SysTabletsEntries to order the tablets of each table by @@ -1437,10 +1479,10 @@ Result CatalogManager::GenerateSnapshotInfoPbAsOfTime( docdb::ReadOperationData::FromSingleReadTime(read_time), db_pending_op); RETURN_NOT_OK(EnumerateSysCatalog( &tables_iter, doc_read_cntxt.schema(), SysRowEntryType::TABLE, - [namespace_id, &tables_to_tablets, &colocation_parent_table_id]( + [&source_ns_id, &tables_to_tablets, &colocation_parent_table_id]( const Slice& id, const Slice& data) -> Status { auto pb = VERIFY_RESULT(pb_util::ParseFromSlice(data)); - if (pb.namespace_id() == namespace_id && pb.state() == SysTablesEntryPB::RUNNING && + if (pb.namespace_id() == source_ns_id && pb.state() == SysTablesEntryPB::RUNNING && pb.hide_state() == SysTablesEntryPB_HideState_VISIBLE && !pb.schema().table_properties().is_ysql_catalog_table()) { VLOG_WITH_FUNC(1) << "Found SysTablesEntryPB: " << pb.ShortDebugString(); @@ -1454,6 +1496,7 @@ Result CatalogManager::GenerateSnapshotInfoPbAsOfTime( } return Status::OK(); })); + // Pass 3: Get all the SysTabletsEntry that are in a running state as of read_time and belongs to // the running tables from pass 2. docdb::DocRowwiseIterator tablets_iter = docdb::DocRowwiseIterator( @@ -1461,10 +1504,10 @@ Result CatalogManager::GenerateSnapshotInfoPbAsOfTime( docdb::ReadOperationData::FromSingleReadTime(read_time), db_pending_op); RETURN_NOT_OK(EnumerateSysCatalog( &tablets_iter, doc_read_cntxt.schema(), SysRowEntryType::TABLET, - [namespace_id, &tables_to_tablets](const Slice& id, const Slice& data) -> Status { + [&tables_to_tablets](const Slice& id, const Slice& data) -> Status { auto pb = VERIFY_RESULT(pb_util::ParseFromSlice(data)); // TODO(Yamen): handle tablet splitting cases by either keeping the parent or the children - // according to their state. + // according to their state. if (tables_to_tablets.contains(pb.table_id()) && pb.state() == SysTabletsEntryPB::RUNNING && pb.hide_hybrid_time() == 0) { VLOG_WITH_FUNC(1) << "Found SysTabletsEntryPB: " << pb.ShortDebugString(); @@ -1481,14 +1524,14 @@ Result CatalogManager::GenerateSnapshotInfoPbAsOfTime( // Populate the backup_entries with SysTablesEntry and SysTabletsEntry // Start with the colocation_parent_table_id if the database is colocated. if (colocation_parent_table_id) { - tables_to_tablets[colocation_parent_table_id.value()].AddToSnapshotInfo( - colocation_parent_table_id.value(), &snapshot_info); + tables_to_tablets[colocation_parent_table_id.value()].AddToBackupEntries( + colocation_parent_table_id.value(), &backup_entries); tables_to_tablets.erase(colocation_parent_table_id.value()); } for (auto& sys_table_entry : tables_to_tablets) { - sys_table_entry.second.AddToSnapshotInfo(sys_table_entry.first, &snapshot_info); + sys_table_entry.second.AddToBackupEntries(sys_table_entry.first, &backup_entries); } - return snapshot_info; + return backup_entries; } Status CatalogManager::GetFullUniverseKeyRegistry(const GetFullUniverseKeyRegistryRequestPB* req, @@ -2652,24 +2695,32 @@ void CatalogManager::ScheduleTabletSnapshotOp(const AsyncTabletSnapshotOpPtr& ta WARN_NOT_OK(ScheduleTask(task), "Failed to send create snapshot request"); } +Result> CatalogManager::RestoreSnapshotToTmpRocksDb( + tablet::Tablet* tablet, const TxnSnapshotId& snapshot_id, HybridTime restore_at) { + std::string log_prefix = LogPrefix(); + // Remove ": " to patch suffix. + log_prefix.erase(log_prefix.size() - 2); + + // Restore master snapshot and load it to RocksDB. + auto dir = VERIFY_RESULT(tablet->snapshots().RestoreToTemporary(snapshot_id, restore_at)); + rocksdb::Options rocksdb_options; + tablet->InitRocksDBOptions(&rocksdb_options, log_prefix + " [TMP]: "); + + return rocksdb::DB::Open(rocksdb_options, dir); +} + Status CatalogManager::RestoreSysCatalogCommon( SnapshotScheduleRestoration* restoration, tablet::Tablet* tablet, std::reference_wrapper tablet_pending_op, RestoreSysCatalogState* state, docdb::DocWriteBatch* write_batch, docdb::KeyValuePairPB* restore_kv) { // Restore master snapshot and load it to RocksDB. - auto dir = VERIFY_RESULT(tablet->snapshots().RestoreToTemporary( - restoration->snapshot_id, restoration->restore_at)); - rocksdb::Options rocksdb_options; - std::string log_prefix = LogPrefix(); - // Remove ": " to patch suffix. - log_prefix.erase(log_prefix.size() - 2); - tablet->InitRocksDBOptions(&rocksdb_options, log_prefix + " [TMP]: "); + auto db = VERIFY_RESULT(RestoreSnapshotToTmpRocksDb( + tablet, restoration->snapshot_id, restoration->restore_at)); + auto doc_db = docdb::DocDB::FromRegularUnbounded(db.get()); - auto db = VERIFY_RESULT(rocksdb::DB::Open(rocksdb_options, dir)); // db can't be closed concurrently, so it is ok to use dummy ScopedRWOperation. auto db_pending_op = ScopedRWOperation(); - auto doc_db = docdb::DocDB::FromRegularUnbounded(db.get()); // Load objects to restore and determine obsolete objects. auto schema_packing_provider = &tablet->GetSchemaPackingProvider(); diff --git a/src/yb/master/catalog_manager_if.h b/src/yb/master/catalog_manager_if.h index 744909cc3eb..6d4571f7e06 100644 --- a/src/yb/master/catalog_manager_if.h +++ b/src/yb/master/catalog_manager_if.h @@ -220,7 +220,8 @@ class CatalogManagerIf { virtual Status ListSnapshotRestorations( const ListSnapshotRestorationsRequestPB* req, ListSnapshotRestorationsResponsePB* resp) = 0; - virtual Result GenerateSnapshotInfoFromSchedule( + virtual Result>> + GenerateSnapshotInfoFromSchedule( const SnapshotScheduleId& snapshot_schedule_id, HybridTime export_time, CoarseTimePoint deadline) = 0; diff --git a/src/yb/master/clone/clone_state_manager-test.cc b/src/yb/master/clone/clone_state_manager-test.cc index 100ba316445..fba82638b83 100644 --- a/src/yb/master/clone/clone_state_manager-test.cc +++ b/src/yb/master/clone/clone_state_manager-test.cc @@ -23,8 +23,8 @@ #include "yb/common/snapshot.h" #include "yb/gutil/map-util.h" - #include "yb/gutil/ref_counted.h" + #include "yb/master/catalog_entity_info.h" #include "yb/master/catalog_entity_info.pb.h" #include "yb/master/clone/clone_state_entity.h" @@ -61,11 +61,17 @@ MATCHER_P(CloneTabletRequestPBMatcher, expected, "CloneTabletRequestPBs did not return pb_util::ArePBsEqual(arg, expected, nullptr /* diff_str */); } -std::ostream& operator<<(std::ostream& os, const Result& res) { +// This is needed for the mock of GenerateSnapshotInfoFromSchedule. +std::ostream& operator<<( + std::ostream& os, const Result>>& res) { if (!res.ok()) { os << res.status().ToString(); } else { - os << res->ShortDebugString(); + os << res->first.ShortDebugString(); + os << "Not snapshotted tablets: "; + for (const auto& tablet_id : res->second) { + os << tablet_id << ", "; + } } return os; } @@ -113,9 +119,10 @@ class CloneStateManagerTest : public YBTest { CoarseTimePoint deadline, const LeaderEpoch& epoch), (override)); MOCK_METHOD( - Result, GenerateSnapshotInfoFromSchedule, + (Result>>), + GenerateSnapshotInfoFromSchedule, (const SnapshotScheduleId& snapshot_schedule_id, HybridTime export_time, - CoarseTimePoint deadline), (override)); + CoarseTimePoint deadline), (override)); MOCK_METHOD( Status, DoImportSnapshotMeta, @@ -238,8 +245,10 @@ class CloneStateManagerTest : public YBTest { return clone_state_manager_->GetCloneStateFromSourceNamespace(namespace_id); } - Status ScheduleCloneOps(const CloneStateInfoPtr& clone_state, const LeaderEpoch& epoch) { - return clone_state_manager_->ScheduleCloneOps(clone_state, epoch); + Status ScheduleCloneOps( + const CloneStateInfoPtr& clone_state, const LeaderEpoch& epoch, + const std::unordered_set& not_snapshotted_tablets) { + return clone_state_manager_->ScheduleCloneOps(clone_state, epoch, not_snapshotted_tablets); } MockExternalFunctions& MockFuncs() { @@ -351,7 +360,7 @@ TEST_F(CloneStateManagerTest, ScheduleCloneOps) { EXPECT_CALL(MockFuncs(), ScheduleCloneTabletCall( source_tablets_[i], epoch, CloneTabletRequestPBMatcher(expected_req))); } - ASSERT_OK(ScheduleCloneOps(clone_state, epoch)); + ASSERT_OK(ScheduleCloneOps(clone_state, epoch, {} /* not_snapshotted_tablets */)); } TEST_F(CloneStateManagerTest, HandleCreatingStateAllTabletsCreating) { diff --git a/src/yb/master/clone/clone_state_manager.cc b/src/yb/master/clone/clone_state_manager.cc index 2d6f34dc8ba..527825833c7 100644 --- a/src/yb/master/clone/clone_state_manager.cc +++ b/src/yb/master/clone/clone_state_manager.cc @@ -16,6 +16,7 @@ #include #include "yb/common/common_types.pb.h" +#include "yb/common/entity_ids_types.h" #include "yb/common/snapshot.h" #include "yb/common/wire_protocol.h" #include "yb/gutil/macros.h" @@ -115,7 +116,7 @@ class CloneStateManagerExternalFunctions : public CloneStateManagerExternalFunct return catalog_manager_->DoCreateSnapshot(req, resp, deadline, epoch); } - Result GenerateSnapshotInfoFromSchedule( + Result>> GenerateSnapshotInfoFromSchedule( const SnapshotScheduleId& snapshot_schedule_id, HybridTime export_time, CoarseTimePoint deadline) override { return catalog_manager_->GenerateSnapshotInfoFromSchedule( @@ -215,14 +216,16 @@ Result> CloneStateManager::CloneNamespace( InvalidArgument, "Expected source namespace identifier to have database type and name. " "Got: $0", source_namespace_identifier.ShortDebugString()); } + auto source_namespace = VERIFY_RESULT( + external_funcs_->FindNamespace(source_namespace_identifier)); + const auto& source_namespace_id = source_namespace->id(); + ListSnapshotSchedulesResponsePB resp; RETURN_NOT_OK(external_funcs_->ListSnapshotSchedules(&resp)); auto snapshot_schedule_id = SnapshotScheduleId::Nil(); for (const auto& schedule : resp.schedules()) { auto& tables = schedule.options().filter().tables().tables(); - if (!tables.empty() && - tables[0].namespace_().name() == source_namespace_identifier.name() && - tables[0].namespace_().database_type() == source_namespace_identifier.database_type()) { + if (!tables.empty() && tables[0].namespace_().id() == source_namespace_id) { snapshot_schedule_id = VERIFY_RESULT(FullyDecodeSnapshotScheduleId(schedule.id())); break; } @@ -233,10 +236,7 @@ Result> CloneStateManager::CloneNamespace( source_namespace_identifier.name()); } - auto source_namespace = VERIFY_RESULT( - external_funcs_->FindNamespace(source_namespace_identifier)); auto seq_no = source_namespace->FetchAndIncrementCloneSeqNo(); - const auto source_namespace_id = source_namespace->id(); // Set up persisted clone state. auto clone_state = VERIFY_RESULT(CreateCloneState(seq_no, source_namespace, restore_time)); @@ -277,8 +277,9 @@ Status CloneStateManager::StartTabletsCloning( const HybridTime& restore_time, const std::string& target_namespace_name, CoarseTimePoint deadline, const LeaderEpoch& epoch) { // Export snapshot info. - auto snapshot_info = VERIFY_RESULT(external_funcs_->GenerateSnapshotInfoFromSchedule( - snapshot_schedule_id, restore_time, deadline)); + auto [snapshot_info, not_snapshotted_tablets] = VERIFY_RESULT( + external_funcs_->GenerateSnapshotInfoFromSchedule( + snapshot_schedule_id, restore_time, deadline)); auto source_snapshot_id = VERIFY_RESULT(FullyDecodeTxnSnapshotId(snapshot_info.id())); // Import snapshot info. @@ -322,7 +323,7 @@ Status CloneStateManager::StartTabletsCloning( RETURN_NOT_OK(UpdateCloneStateWithSnapshotInfo( clone_state, source_snapshot_id, target_snapshot_id, tables_data)); - RETURN_NOT_OK(ScheduleCloneOps(clone_state, epoch)); + RETURN_NOT_OK(ScheduleCloneOps(clone_state, epoch, not_snapshotted_tablets)); return Status::OK(); } @@ -444,7 +445,8 @@ Status CloneStateManager::UpdateCloneStateWithSnapshotInfo( } Status CloneStateManager::ScheduleCloneOps( - const CloneStateInfoPtr& clone_state, const LeaderEpoch& epoch) { + const CloneStateInfoPtr& clone_state, const LeaderEpoch& epoch, + const std::unordered_set& not_snapshotted_tablets) { auto lock = clone_state->LockForRead(); auto& pb = lock->pb; for (auto& tablet_data : pb.tablet_data()) { @@ -452,10 +454,20 @@ Status CloneStateManager::ScheduleCloneOps( external_funcs_->GetTabletInfo(tablet_data.source_tablet_id())); auto target_tablet = VERIFY_RESULT( external_funcs_->GetTabletInfo(tablet_data.target_tablet_id())); + auto source_table = source_tablet->table(); auto target_table = target_tablet->table(); + + // Don't need to worry about ordering here because these are both read locks. + auto source_table_lock = source_table->LockForRead(); auto target_table_lock = target_table->LockForRead(); tablet::CloneTabletRequestPB req; + if (not_snapshotted_tablets.contains(tablet_data.source_tablet_id())) { + RSTATUS_DCHECK(source_tablet->LockForRead()->pb.hide_hybrid_time() != 0, IllegalState, + Format("Expected not snapshotted tablet to be in HIDDEN state. Actual: $0", + source_table_lock->state_name())); + req.set_clone_from_active_rocksdb(true); + } req.set_tablet_id(tablet_data.source_tablet_id()); req.set_target_tablet_id(tablet_data.target_tablet_id()); req.set_source_snapshot_id(pb.source_snapshot_id().data(), pb.source_snapshot_id().size()); diff --git a/src/yb/master/clone/clone_state_manager.h b/src/yb/master/clone/clone_state_manager.h index d93125dbd4f..0a0a4643e84 100644 --- a/src/yb/master/clone/clone_state_manager.h +++ b/src/yb/master/clone/clone_state_manager.h @@ -96,7 +96,9 @@ class CloneStateManager { Status LoadCloneState(const std::string& id, const SysCloneStatePB& metadata); - Status ScheduleCloneOps(const CloneStateInfoPtr& clone_state, const LeaderEpoch& epoch); + Status ScheduleCloneOps( + const CloneStateInfoPtr& clone_state, const LeaderEpoch& epoch, + const std::unordered_set& not_snapshotted_tablets); Result GetCloneStateFromSourceNamespace(const NamespaceId& namespace_id); diff --git a/src/yb/master/clone/external_functions.h b/src/yb/master/clone/external_functions.h index 1088538c05a..2634be85f54 100644 --- a/src/yb/master/clone/external_functions.h +++ b/src/yb/master/clone/external_functions.h @@ -59,9 +59,10 @@ class CloneStateManagerExternalFunctionsBase { const CreateSnapshotRequestPB* req, CreateSnapshotResponsePB* resp, CoarseTimePoint deadline, const LeaderEpoch& epoch) = 0; - virtual Result GenerateSnapshotInfoFromSchedule( - const SnapshotScheduleId& snapshot_schedule_id, HybridTime export_time, - CoarseTimePoint deadline) = 0; + virtual Result>> + GenerateSnapshotInfoFromSchedule( + const SnapshotScheduleId& snapshot_schedule_id, HybridTime export_time, + CoarseTimePoint deadline) = 0; virtual Status DoImportSnapshotMeta( const SnapshotInfoPB& snapshot_pb, const LeaderEpoch& epoch, diff --git a/src/yb/tablet/operations.proto b/src/yb/tablet/operations.proto index 5420fa3fd5d..63949b6a341 100644 --- a/src/yb/tablet/operations.proto +++ b/src/yb/tablet/operations.proto @@ -180,4 +180,9 @@ message CloneTabletRequestPB { optional SchemaPB target_schema = 12; optional PartitionSchemaPB target_partition_schema = 13; optional bool target_skip_table_tombstone_check = 14; + + // If set, checkout the active rocksdb folder to the target snapshot. + // This is set for tablets that were hidden before the snapshot that covers the clone time was + // taken (we do not take snapshots after hiding a tablet). + optional bool clone_from_active_rocksdb = 15; } diff --git a/src/yb/tserver/ts_tablet_manager.cc b/src/yb/tserver/ts_tablet_manager.cc index 91bc324ae5f..9511578d229 100644 --- a/src/yb/tserver/ts_tablet_manager.cc +++ b/src/yb/tserver/ts_tablet_manager.cc @@ -98,6 +98,7 @@ #include "yb/tablet/tablet_metadata.h" #include "yb/tablet/tablet_options.h" #include "yb/tablet/tablet_peer.h" +#include "yb/tablet/tablet_snapshots.h" #include "yb/tablet/tablet_types.pb.h" #include "yb/tools/yb-admin_util.h" @@ -1361,8 +1362,6 @@ Status TSTabletManager::ApplyCloneTablet( auto target_meta = VERIFY_RESULT(RaftGroupMetadata::CreateNew( target_meta_data, data_root_dir, wal_root_dir)); - auto source_snapshot_dir = JoinPathSegments( - VERIFY_RESULT(source_tablet->metadata()->TopSnapshotsDir()), source_snapshot_id.ToString()); // Create target parent table directory if required. auto target_parent_table_dir = DirName(target_meta->snapshots_dir()); RETURN_NOT_OK_PREPEND( @@ -1370,10 +1369,24 @@ Status TSTabletManager::ApplyCloneTablet( Format("Failed to create RocksDB table directory $0", target_parent_table_dir)); auto target_snapshot_dir = JoinPathSegments( VERIFY_RESULT(target_meta->TopSnapshotsDir()), target_snapshot_id.ToString()); - LOG(INFO) << Format("Hard-linking from $0 to $1", source_snapshot_dir, target_snapshot_dir); - RETURN_NOT_OK(CopyDirectory( - fs_manager_->env(), source_snapshot_dir, target_snapshot_dir, UseHardLinks::kTrue, - CreateIfMissing::kTrue)); + + // Copy or create the snapshot into the target snapshot directory. + if (request->clone_from_active_rocksdb()) { + RETURN_NOT_OK_PREPEND(source_tablet->snapshots().Create(tablet::CreateSnapshotData { + .snapshot_hybrid_time = operation->hybrid_time(), + .hybrid_time = operation->hybrid_time(), + .op_id = operation->op_id(), + .snapshot_dir = target_snapshot_dir, + .schedule_id = SnapshotScheduleId::Nil(), + }), Format("Could not create from active rocksdb of tablet $0", source_tablet_id)); + } else { + auto source_snapshot_dir = JoinPathSegments(VERIFY_RESULT( + source_tablet->metadata()->TopSnapshotsDir()), source_snapshot_id.ToString()); + LOG(INFO) << Format("Hard-linking from $0 to $1", source_snapshot_dir, target_snapshot_dir); + RETURN_NOT_OK(CopyDirectory( + fs_manager_->env(), source_snapshot_dir, target_snapshot_dir, UseHardLinks::kTrue, + CreateIfMissing::kTrue)); + } if (PREDICT_FALSE(FLAGS_TEST_crash_before_clone_target_marked_ready)) { LOG(FATAL) << "Crashing due to FLAGS_TEST_crash_before_clone_target_marked_ready";