Skip to content

Commit

Permalink
[#22118] docdb: add tablet replica limit information to master ui tab…
Browse files Browse the repository at this point in the history
…let servers page

Summary:
Add an overview table to the tablet server status page. This table is broken down by placement cluster and lists tablet peer counts. It has a new column for hidden tablet peers which is not displayed anywhere else in the UI.

Blacklisted and hidden peer:
{F173985}

Dead peer:
{F173986}
Jira: DB-11045

Test Plan:
manual inspection of some scripted scenarios, unfortunately.

```
#!/bin/zsh
# Manual test scenario: starts a primary cluster of $live_nodes nodes and a read-replica
# cluster of $read_nodes nodes with yugabyted, creates a snapshot schedule, then creates a
# few tables and drops one of them.

source ~/.config/zsh/yb.zsh
live_nodes=3
read_nodes=2
prefix=$HOME/yugabyted_clusters

# First primary-cluster node; every later node joins through 127.0.0.1.
yugabyted start --advertise_address=127.0.0.1 --base_dir=${prefix}/live_cluster/node-1

for i in $(seq 2 $live_nodes)
do
    # Echo the directory that is actually passed to yugabyted below.
    base_dir=${prefix}/live_cluster/node-$i
    echo $base_dir
    yugabyted start --join 127.0.0.1 --advertise_address=127.0.0.$i --base_dir=$base_dir
done

for i in $(seq $read_nodes)
do
    # Read-replica nodes take the loopback addresses right after the primary cluster's.
    advertise_addr_suffix=$(($live_nodes + $i))
    echo $advertise_addr_suffix
    advertise_address=127.0.0.$advertise_addr_suffix
    echo $advertise_address
    yugabyted start --join 127.0.0.1 --advertise_address=$advertise_address --base_dir=${prefix}/read_cluster/node-$i --read_replica
done

# Use ${prefix} rather than a literal "~": zsh does not expand a tilde that follows "=" inside
# an ordinary argument by default, and this keeps the path consistent with the nodes above.
yugabyted configure_read_replica new --rf=$read_nodes --base_dir=${prefix}/read_cluster/node-1

# NOTE(review): presumably the schedule keeps the tablets of the table dropped below around
# as hidden tablets -- confirm against the ybactl helper sourced from yb.zsh.
ybactl create_snapshot_schedule 1 5 ysql.yugabyte

sleep 3

ysqlsh <<EOF
create table foo1 (k int primary key, v int);
create table foo2 (k int primary key, v int);
create table foo3 (k int primary key, v int);
create table foo4 (k int primary key, v int);
insert into foo2 values (1, 1);
drop table foo2;
EOF
```

Reviewers: mlillibridge, slingam

Reviewed By: mlillibridge

Subscribers: esheng, ybase, slingam

Differential Revision: https://phorge.dev.yugabyte.com/D34501
  • Loading branch information
druzac committed Apr 27, 2024
1 parent a91117f commit a5a05d0
Show file tree
Hide file tree
Showing 3 changed files with 142 additions and 7 deletions.
113 changes: 106 additions & 7 deletions src/yb/master/master-path-handlers.cc
Expand Up @@ -166,6 +166,14 @@ class AutoFieldsetScope {

std::stringstream& output_;
};

// Converts a signed value into an optional unsigned one. The sentinel
// std::numeric_limits<int64_t>::max() becomes std::nullopt; any other input is returned
// converted to uint64_t.
std::optional<uint64_t> ToUnsignedOrNullopt(int64_t val) {
  constexpr int64_t kNoValueSentinel = std::numeric_limits<int64_t>::max();
  if (val != kNoValueSentinel) {
    return static_cast<uint64_t>(val);
  }
  return std::nullopt;
}
} // namespace

using consensus::RaftPeerPB;
Expand All @@ -191,6 +199,7 @@ void MasterPathHandlers::TabletCounts::operator+=(const TabletCounts& other) {
user_tablet_followers += other.user_tablet_followers;
system_tablet_leaders += other.system_tablet_leaders;
system_tablet_followers += other.system_tablet_followers;
hidden_tablet_peers += other.hidden_tablet_peers;
}

MasterPathHandlers::ZoneTabletCounts::ZoneTabletCounts(
Expand Down Expand Up @@ -400,6 +409,40 @@ bool TabletServerComparator(

} // anonymous namespace

// Builds per-placement-cluster totals from the given tserver descriptors.
//
// For every descriptor that ShouldHideTserverNodeFromDisplay does not filter out, the tserver's
// entry in tablet_count_map (if any) is added to its placement cluster's counts, the node is
// tallied as blacklisted, live or dead (blacklisted is checked first), and num_live_replicas()
// is added to the cluster's active tablet peer count. Afterwards each cluster seen gets its
// tablet replica limit computed from the aggregated cluster info and the per-resource limits.
MasterPathHandlers::UniverseTabletCounts MasterPathHandlers::CalculateUniverseTabletCounts(
    const TabletCountMap& tablet_count_map, const std::vector<std::shared_ptr<TSDescriptor>>& descs,
    const BlacklistSet& blacklist_set,
    int hide_dead_node_threshold_mins) {
  UniverseTabletCounts universe;
  auto& per_cluster = universe.per_placement_cluster_counts;

  for (const auto& ts_desc : descs) {
    const bool hidden =
        ShouldHideTserverNodeFromDisplay(ts_desc.get(), hide_dead_node_threshold_mins);
    if (!hidden) {
      // operator[] creates the cluster entry on first sight of its placement uuid.
      PlacementClusterTabletCounts& cluster = per_cluster[ts_desc->placement_uuid()];

      const auto* ts_tablet_counts = FindOrNull(tablet_count_map, ts_desc->permanent_uuid());
      if (ts_tablet_counts != nullptr) {
        cluster.counts += *ts_tablet_counts;
      }

      if (ts_desc->IsBlacklisted(blacklist_set)) {
        ++cluster.blacklisted_node_count;
      } else if (ts_desc->IsLive()) {
        ++cluster.live_node_count;
      } else {
        ++cluster.dead_node_count;
      }

      cluster.active_tablet_peer_count += ts_desc->num_live_replicas();
    }
  }

  auto per_resource_limits = tserver::GetTabletReplicaPerResourceLimits();
  for (auto& cluster_entry : per_cluster) {
    auto aggregated_info = ComputeAggregatedClusterInfo(descs, cluster_entry.first);
    const auto raw_limit = ComputeTabletReplicaLimit(aggregated_info, per_resource_limits);
    cluster_entry.second.tablet_replica_limit = ToUnsignedOrNullopt(raw_limit);
  }
  return universe;
}

void MasterPathHandlers::TServerDisplay(const std::string& current_uuid,
std::vector<std::shared_ptr<TSDescriptor>>* descs,
TabletCountMap* tablet_map,
Expand All @@ -425,7 +468,7 @@ void MasterPathHandlers::TServerDisplay(const std::string& current_uuid,
// Comparator orders by cloud, region, zone and uuid fields.
std::sort(local_descs.begin(), local_descs.end(), &TabletServerComparator);

for (auto desc : local_descs) {
for (const auto& desc : local_descs) {
if (desc->placement_uuid() == current_uuid) {
if (ShouldHideTserverNodeFromDisplay(desc.get(), hide_dead_node_threshold_mins)) {
continue;
Expand Down Expand Up @@ -522,6 +565,57 @@ void MasterPathHandlers::TServerDisplay(const std::string& current_uuid,
*output << "</table>\n";
}

// Writes the "Universe Summary" HTML table to *output: one row per placement cluster with its
// live / blacklisted / dead tserver counts, user, system, hidden and active tablet peer totals,
// and the tablet peer limit ("N/A" when the cluster has no limit).
//
// tablet_map                     per-tserver tablet counts, keyed by tserver uuid.
// all_descs                      tserver descriptors to aggregate over.
// live_id                        placement uuid labelled "Primary Cluster"; every other
//                                placement uuid is labelled "Read Replica".
// hide_dead_node_threshold_mins  forwarded to CalculateUniverseTabletCounts.
// output                         stream the HTML is appended to.
void MasterPathHandlers::DisplayUniverseSummary(
    const TabletCountMap& tablet_map, const std::vector<std::shared_ptr<TSDescriptor>>& all_descs,
    const std::string& live_id,
    int hide_dead_node_threshold_mins,
    std::stringstream* output) {
  // If the blacklist cannot be read, fall back to an empty one instead of failing the page.
  auto blacklist_result = master_->catalog_manager()->BlacklistSetFromPB();
  BlacklistSet blacklist = blacklist_result.ok() ? *blacklist_result : BlacklistSet();
  auto universe_counts = CalculateUniverseTabletCounts(
      tablet_map, all_descs, blacklist, hide_dead_node_threshold_mins);

  *output << "<h2>Universe Summary</h2>\n"
          << "<table class='table table-striped'>\n"
          << " <tr>\n"
          << " <th>Cluster UUID</th>\n"
          << " <th>Total Live TServers</th>\n"
          << " <th>Total Blacklisted TServers</th>\n"
          << " <th>Total Dead TServers</th>\n"
          << " <th>User Tablet-Peers</th>\n"
          << " <th>System Tablet-Peers</th>\n"
          << " <th>Hidden Tablet-Peers</th>\n"
          << " <th>Active Tablet-Peers</th>\n"
          << " <th>Tablet Peer Limit</th>\n"
          << " </tr>\n";
  for (const auto& [placement_uuid, cluster_counts] :
       universe_counts.per_placement_cluster_counts) {
    auto placement_uuid_entry = Format(
        "$0 $1", placement_uuid == live_id ? "Primary Cluster" : "Read Replica", placement_uuid);
    auto limit_entry = cluster_counts.tablet_replica_limit.has_value()
                           ? Format("$0", *cluster_counts.tablet_replica_limit)
                           : "N/A";
    // The table shows leaders and followers combined.
    auto user_total =
        cluster_counts.counts.user_tablet_followers + cluster_counts.counts.user_tablet_leaders;
    auto system_total =
        cluster_counts.counts.system_tablet_followers + cluster_counts.counts.system_tablet_leaders;
    *output << "<tr>\n"
            << " <td>" << placement_uuid_entry << "</td>\n"
            << " <td>" << cluster_counts.live_node_count << "</td>\n"
            << " <td>" << cluster_counts.blacklisted_node_count << "</td>\n"
            << " <td>" << cluster_counts.dead_node_count << "</td>\n"
            << " <td>" << user_total << "</td>\n"
            << " <td>" << system_total << "</td>\n"
            << " <td>" << cluster_counts.counts.hidden_tablet_peers << "</td>\n"
            << " <td>" << cluster_counts.active_tablet_peer_count << "</td>\n"
            << " <td>" << limit_entry << "</td>\n"
            // Close each data row explicitly, matching the header row above.
            << "</tr>\n";
  }
  *output << "</table>\n";
}

void MasterPathHandlers::DisplayTabletZonesTable(
const ZoneTabletCounts::CloudTree& cloud_tree,
std::stringstream* output
Expand Down Expand Up @@ -680,6 +774,9 @@ void MasterPathHandlers::HandleTabletServers(const Webserver::WebRequest& req,
}

*output << std::setprecision(output_precision_);
if (viewType == TServersViewType::kTServersDefaultView) {
DisplayUniverseSummary(tablet_map, descs, live_id, hide_dead_node_threshold_override, output);
}
*output << "<h2>Tablet Servers</h2>\n";

if (!live_id.empty()) {
Expand Down Expand Up @@ -3372,22 +3469,24 @@ void MasterPathHandlers::CalculateTabletMap(TabletCountMap* tablet_map) {

TabletInfos tablets = table->GetTablets(IncludeInactive::kTrue);
bool is_user_table = master_->catalog_manager()->IsUserCreatedTable(*table);

for (const auto& tablet : tablets) {
auto replication_locations = tablet->GetReplicaLocations();

for (const auto& replica : *replication_locations) {
auto& counts = (*tablet_map)[replica.first];
if (tablet->LockForRead()->is_hidden()) {
counts.hidden_tablet_peers++;
}
if (is_user_table || table->IsColocationParentTable()) {
if (replica.second.role == PeerRole::LEADER) {
(*tablet_map)[replica.first].user_tablet_leaders++;
counts.user_tablet_leaders++;
} else {
(*tablet_map)[replica.first].user_tablet_followers++;
counts.user_tablet_followers++;
}
} else {
if (replica.second.role == PeerRole::LEADER) {
(*tablet_map)[replica.first].system_tablet_leaders++;
counts.system_tablet_leaders++;
} else {
(*tablet_map)[replica.first].system_tablet_followers++;
counts.system_tablet_followers++;
}
}
}
Expand Down
31 changes: 31 additions & 0 deletions src/yb/master/master-path-handlers.h
Expand Up @@ -127,6 +127,9 @@ class MasterPathHandlers {
uint32_t user_tablet_followers = 0;
uint32_t system_tablet_leaders = 0;
uint32_t system_tablet_followers = 0;
// Hidden tablets are not broken down by leader vs. follower or user vs. system. This field just
// counts the number of tablet peers which are hidden.
uint32_t hidden_tablet_peers = 0;

void operator+=(const TabletCounts& other);
};
Expand All @@ -148,9 +151,31 @@ class MasterPathHandlers {
typedef std::map<std::string, ZoneTree> RegionTree;
typedef std::map<std::string, RegionTree> CloudTree;
};

// Node and tablet-peer totals for a single placement cluster, as filled in by
// CalculateUniverseTabletCounts.
struct PlacementClusterTabletCounts {
  // Sum of the per-tserver TabletCounts of the cluster's displayed tservers.
  TabletCounts counts;
  // A tserver is counted in exactly one of the three node buckets below.
  uint32_t live_node_count{0};
  uint32_t blacklisted_node_count{0};
  uint32_t dead_node_count{0};
  // Sum of num_live_replicas() over the cluster's displayed tservers.
  uint32_t active_tablet_peer_count{0};
  // Tablet replica limits are derived from flag values. When those flags are unset the
  // universe has no limit, which is represented by std::nullopt.
  // NOTE(review): the default here is an engaged optional holding 0, not std::nullopt --
  // confirm that is intended for an instance whose limit has not been computed yet.
  std::optional<uint64_t> tablet_replica_limit{0};
};

// Result of CalculateUniverseTabletCounts: tablet and node totals for the whole universe,
// grouped by placement cluster.
struct UniverseTabletCounts {
// Keys are placement_uuids.
std::unordered_map<std::string, PlacementClusterTabletCounts> per_placement_cluster_counts;
};

// Map of tserver UUID -> TabletCounts
typedef std::unordered_map<std::string, TabletCounts> TabletCountMap;

// Aggregates tablet_count_map and the node state of each descriptor in descs into totals per
// placement cluster. Descriptors for which ShouldHideTserverNodeFromDisplay returns true
// (given hide_dead_node_threshold_mins) are skipped. blacklist_set decides which tservers are
// counted as blacklisted.
UniverseTabletCounts CalculateUniverseTabletCounts(
const TabletCountMap& tablet_count_map,
const std::vector<std::shared_ptr<TSDescriptor>>& descs, const BlacklistSet& blacklist_set,
int hide_dead_node_threshold_mins);

struct ReplicaInfo {
PeerRole role;
TabletId tablet_id;
Expand Down Expand Up @@ -183,6 +208,12 @@ class MasterPathHandlers {
const int hide_dead_node_threshold_override,
TServersViewType viewType);

// Writes a "Universe Summary" html table to output with one row per placement cluster: tserver
// counts by state, tablet peer totals and the tablet peer limit. The cluster whose placement
// uuid equals live_id is labelled as the primary cluster; all others as read replicas.
void DisplayUniverseSummary(
const TabletCountMap& tablet_map, const std::vector<std::shared_ptr<TSDescriptor>>& all_descs,
const std::string& live_id,
int hide_dead_node_threshold_mins,
std::stringstream* output);

// Outputs a ZoneTabletCounts::CloudTree as an html table with a heading.
static void DisplayTabletZonesTable(
const ZoneTabletCounts::CloudTree& counts,
Expand Down
5 changes: 5 additions & 0 deletions src/yb/master/tablet_creation_limits.cc
Expand Up @@ -74,6 +74,11 @@ AggregatedClusterInfo ComputeAggregatedClusterInfo(
};
}

// TODO(zdrudi): This function is passed a filtered version of TSDescriptorVector - blacklisted and
// non-live tservers are removed. But tablet replicas hosted on blacklisted tservers aren't going
// to be deleted so they should be counted towards the total number of live tablet replicas. Alter
// this function to take the complete, unfiltered TSDescriptorVector and put logic directly into
// ComputeAggregatedClusterInfo to do the right thing with blacklisted and non-live tservers.
Status CanCreateTabletReplicas(
int num_tablets, const ReplicationInfoPB& replication_info,
const TSDescriptorVector& ts_descs) {
Expand Down

0 comments on commit a5a05d0

Please sign in to comment.