Skip to content

Commit

Permalink
[#21888] yugabyted: Support for YugabyteDB version upgrades using yug…
Browse files Browse the repository at this point in the history
…abyted CLI.

Summary:
This diff introduces a new top-level command `finalize_upgrade` in yugabyted. This enhancement enables users to run post-upgrade tasks, specifically to **promote auto flags** and **upgrade the YSQL catalog**.

The following flag has been added to the `finalize_upgrade` command:

`--upgrade_ysql_timeout`: Custom timeout value, in milliseconds, for upgrading the YSQL system catalog.

**User journey to upgrade a YugabyteDB cluster through yugabyted**-

  # Stop the yugabyted nodes one at a time.

  # Restart each yugabyted node with the new YB release.

  # Once all nodes are running the new YB release, run `yugabyted finalize_upgrade`; it verifies that every node has been upgraded to the new YB release and then runs the post-upgrade tasks.
Jira: DB-10790

Test Plan: Manual Testing

Reviewers: nikhil

Reviewed By: nikhil

Subscribers: yugabyted-dev, shikhar.sahay

Differential Revision: https://phorge.dev.yugabyte.com/D33976
  • Loading branch information
ShikharSahay committed Apr 17, 2024
1 parent bfd9358 commit 5cfe9eb
Showing 1 changed file with 153 additions and 0 deletions.
153 changes: 153 additions & 0 deletions bin/yugabyted
Original file line number Diff line number Diff line change
Expand Up @@ -167,6 +167,7 @@ PREFIX = {
'demo destroy' : "",
'cert' : "",
'cert generate_server_certs' : "",
'finalize_upgrade' : "",
}

USAGE = {
Expand Down Expand Up @@ -196,6 +197,7 @@ USAGE = {
'demo destroy' : "yugabyted demo destroy [flags]",
'cert' : "yugabyted cert [command] [flags]",
'cert generate_server_certs' : "yugabyted cert generate_server_certs [flag]",
'finalize_upgrade': "yugabyted finalize_upgrade [flags]",
}

EXAMPLE = {
Expand Down Expand Up @@ -278,6 +280,11 @@ EXAMPLE = {
"# Display point-in-time recovery status for a YugabyteDB cluster:\n" +
"yugabyted configure point_in_time_recovery " + \
"--status \n\n",
'finalize_upgrade': "# Finalize the cluster upgrade process:\n" +
"yugabyted finalize_upgrade\n\n" +
"# Finalize the cluster upgrade by specifying a timeout value for" +
" the YSQL catalog upgrade:\n" +
"yugabyted finalize_upgrade --upgrade_ysql_timeout <time_limit_ms>\n\n",
}

EPILOG_COMMON = "Run '{} [command] -h' for help with specific commands.".format(SCRIPT_NAME)
Expand All @@ -294,6 +301,7 @@ EPILOG_SPECIFIC = {
'cert' : "",
"backup" : "",
"restore" : "",
"finalize_upgrade" : "",
}

# YugabyteDB configs.
Expand All @@ -310,6 +318,7 @@ DEFAULT_YSQL_METRIC_PORT = 13000
DEFAULT_YCQL_METRIC_PORT = 12000
DEFAULT_WEBSERVER_PORT = 7200
DEFAULT_YUGABYTED_UI_PORT = 15433
# Default timeout, in milliseconds, for the YSQL catalog upgrade (--upgrade_ysql_timeout).
DEFAULT_UPGRADE_YSQL_TIMEOUT = 60000
DEFAULT_CALLHOME = True
DEFAULT_YSQL_USER = "yugabyte"
DEFAULT_YSQL_PASSWORD = "yugabyte"
Expand Down Expand Up @@ -1044,6 +1053,68 @@ class ControlScript(object):
output += "-" * 70 + "\n"
Output.print_out(output)

# Finalize YugabyteDB cluster upgrade
def finalize_upgrade(self):
    """Finalize a YugabyteDB cluster upgrade by running the post-upgrade tasks.

    Steps, in order:
      1. Report an error if no node is running in the configured data_dir.
      2. Read this release's version number from version_metadata.json.
      3. Compare that version with the version reported by every tserver and
         report the mismatched hosts if any node differs.
      4. Promote auto flags, then upgrade the YSQL catalog using the
         "upgrade_ysql_timeout" value stored in temp_data.
      5. Print a status summary containing the final status and the version.

    Every failure is reported through Output.log_error_and_exit.
    """
    if not self.script.is_running():
        Output.log_error_and_exit(Output.make_red("ERROR") + ": No YugabyteDB node " +
            "is running in the data_dir {}".format(self.configs.saved_data.get("data_dir")))

    # Find current YBDB version
    version_metadata_path = find_version_metadata_location("version_metadata.json")
    with open(version_metadata_path) as metadata:
        version_metadata = json.load(metadata)
    executable_version = version_metadata.get("version_number")

    tserver_addresses = self.get_tserver_addresses()

    # Verify if all nodes were upgraded successfully
    Output.init_animation("Verifying version compatibility across all nodes...")
    mismatched_nodes_version = self.verify_all_nodes_version(
        tserver_addresses, executable_version)

    if mismatched_nodes_version:
        mismatched_nodes = ', '.join(mismatched_nodes_version)
        Output.update_animation("Version compatibility verification failed.",
                                status=Output.ANIMATION_FAIL)
        Output.log_error_and_exit(
            Output.make_red("Error") + ": Version mismatch detected in the " +
            "following nodes: {}. Please upgrade these".format(mismatched_nodes) +
            " nodes to version {} and then rerun the".format(executable_version) +
            Output.make_yellow(" yugabyted finalize_upgrade") + " command."
        )

    # All nodes match the correct version
    # Run post-upgrade tasks
    Output.update_animation("Verified version compatibility across all nodes.")
    master_addresses = self.configs.saved_data.get("current_masters")
    upgrade_ysql_timeout = self.configs.temp_data.get("upgrade_ysql_timeout")

    Output.init_animation("Promoting Auto Flags...")
    if not YBAdminProxy.promote_auto_flags(master_addresses):
        # Mark the animation as failed first, then exit with an explicit
        # message, mirroring the version-mismatch path above. Passing the
        # result of update_animation() as the error message would not log
        # the failure text.
        # NOTE(review): update_animation's return value is not visible
        # here -- confirm it does not return the message.
        Output.update_animation("Failed to Promote Auto Flags.",
                                status=Output.ANIMATION_FAIL)
        Output.log_error_and_exit(
            Output.make_red("ERROR") + ": Failed to Promote Auto Flags.")
    Output.log("Promote auto flags step was successful.")
    Output.update_animation("Successfully Promoted Auto Flags.")

    Output.init_animation("Upgrading YSQL...")
    if not YBAdminProxy.upgrade_ysql(master_addresses, upgrade_ysql_timeout):
        # Same reporting order as the auto-flags failure above.
        Output.update_animation("Failed to Upgrade YSQL.",
                                status=Output.ANIMATION_FAIL)
        Output.log_error_and_exit(
            Output.make_red("ERROR") + ": Failed to Upgrade YSQL.")
    Output.log("Upgrade YSQL step was successful.")
    Output.update_animation("Successfully Upgraded YSQL.")

    # Summarize the outcome; the final status line is rendered in green.
    final_status = "Post-upgrade tasks completed successfully."
    status_details = [
        (Output.make_yellow("Status"), final_status),
        (Output.make_yellow("Version"), executable_version),
    ]
    status_display_info = {final_status: Output.make_green}
    Output.print_out(self.get_status_string_common(status_details, status_display_info))

# Starts an interactive YSQL shell.
def connect_ysql(self):
if self.get_failed_node_processes():
Expand Down Expand Up @@ -3472,6 +3543,46 @@ class ControlScript(object):
Output.log_error_and_exit(Output.make_red("ERROR:") + " Master node " +
"present at {} is not reachable.".format(master_hostport))

# Get tserver addresses
def get_tserver_addresses(self):
    """Return a flat list of the tserver entries known to the leader master.

    The leader master's host is combined with the saved
    "master_webserver_port" to form the HTTP endpoint passed to
    get_all_tserver_info(). The result of that call is iterated two levels
    deep: for every outer key, each entry of the associated value is
    collected in iteration order.
    """
    leader_host = self.get_leader_master().split(':')[0]
    webserver_port = self.configs.saved_data.get("master_webserver_port")
    endpoint = "{}:{}".format(leader_host, webserver_port)
    tserver_info_by_uuid = self.get_all_tserver_info(endpoint)

    # Flatten the per-uuid groups into a single list of addresses.
    return [
        address
        for group_uuid in tserver_info_by_uuid
        for address in tserver_info_by_uuid[group_uuid]
    ]

# Get node version
def get_node_version(self, tserver_hostport, timeout=10):
    """Fetch the version number reported by a tserver's HTTP endpoint.

    Sends a GET request to http://<tserver_hostport>/api/v1/version and
    returns the "version_number" field of the JSON response (None when the
    field is absent).

    Args:
        tserver_hostport: "host:port" of the tserver web endpoint.
        timeout: Seconds to wait for the HTTP request before giving up, so
            an unresponsive node cannot block the command indefinitely.

    Any HTTP or other error (including a timeout) is reported through
    Output.log_error_and_exit.
    """
    # Local import keeps this block self-contained; closing() guarantees the
    # HTTP response is released even if JSON parsing fails.
    from contextlib import closing

    version_url = "http://{}/api/v1/version".format(tserver_hostport)
    try:
        with closing(urlopen(Request(version_url), timeout=timeout)) as response:
            json_response = json.load(response)
        return json_response.get("version_number")

    except HTTPError as http_err:
        Output.log_error_and_exit("HTTP error occurred while fetching version from " +
            "tserver {}: {}".format(tserver_hostport, http_err))
    except Exception as err:
        Output.log_error_and_exit("Other error occurred while fetching version from " +
            "tserver {}: {}".format(tserver_hostport, err))

# Verify nodes version
def verify_all_nodes_version(self, tserver_addresses, executable_version):
    """Return the hosts whose reported version differs from executable_version.

    Each address in tserver_addresses is queried through get_node_version().
    For every address whose version does not equal executable_version, the
    part of the address before the first ':' is included in the result, in
    input order. An empty list means every node matched.
    """
    return [
        address.split(':')[0]
        for address in tserver_addresses
        if self.get_node_version(address) != executable_version
    ]

# Get all masters placement locations
def get_all_nodes_locations(self, all_tserver_info, placement_uuid):
dictOfAllNodes = all_tserver_info.get(placement_uuid)
Expand Down Expand Up @@ -5683,6 +5794,22 @@ class ControlScript(object):
self.configs.temp_data[
"admin_operation_master_addresses"] = args.master_addresses

if args.parser == "finalize_upgrade":
if args.upgrade_ysql_timeout is not None:
# Check if timeout is a digit and its integer value is greater than 0
if not args.upgrade_ysql_timeout.isdigit() or \
int(args.upgrade_ysql_timeout) <= 0:
has_errors = True
Output.print_and_log(Output.make_red("Error") + ": " +
"--upgrade_ysql_timeout value must be" +
" a positive integer greater than 0 in milliseconds." +
" Please specify a valid positive integer to set the" +
" YSQL upgrade timeout value.")
else:
# Valid timeout value
self.configs.temp_data["upgrade_ysql_timeout"] = \
int(args.upgrade_ysql_timeout)

if args.parser == "data_placement":
if args.fault_tolerance is not None:
if args.fault_tolerance.lower() in FAULT_TOLERANCE_CHOICES:
Expand Down Expand Up @@ -6214,6 +6341,7 @@ class ControlScript(object):
("restore", "Restore a database."),
("status", "Print status of YugabyteDB cluster."),
("version", "Release version of YugabyteDB cluster."),
("finalize_upgrade", "Finalize the upgrade process for the YugabyteDB cluster."),
("collect_logs", "Collect and package logs for troubleshooting.")):
example = ""
if EXAMPLE.get(cmd):
Expand Down Expand Up @@ -6505,6 +6633,15 @@ class ControlScript(object):
metavar=""
)

# Top level command: finalize_upgrade
for cmd in ("finalize_upgrade",):
cur_parser = all_parsers[cmd]
cur_parser.add_argument(
"--upgrade_ysql_timeout",
help="Custom timeout for the YSQL upgrade in milliseconds.",
metavar=""
)

# Commands that can alter configuration file.
for cmd in ("start",):
cur_parser = all_parsers[cmd]
Expand Down Expand Up @@ -6776,6 +6913,7 @@ class Configs(object):
"ybc_cloud_storage_bucket": "",
"ybc_cloud_storage_dir": "",
"ybc_status": False,
"upgrade_ysql_timeout": DEFAULT_UPGRADE_YSQL_TIMEOUT,
}
self.config_file = config_file

Expand Down Expand Up @@ -7617,6 +7755,21 @@ class YBAdminProxy(object):
out, err, ret_code = run_process(cmd, timeout=timeout, log_cmd=True)
return ret_code, err

# Promote Auto Flags
@staticmethod
def promote_auto_flags(master_addrs, timeout=10):
    """Run the yb-admin "promote_auto_flags" command against the given masters.

    Args:
        master_addrs: Value passed to the -master_addresses option.
        timeout: Timeout forwarded to run_process().

    Returns:
        True when the command exits with return code 0, False otherwise.
    """
    admin_args = ["-master_addresses", master_addrs, "promote_auto_flags"]
    _stdout, _stderr, exit_code = run_process(
        YBAdminProxy.cmd_args + admin_args, timeout=timeout, log_cmd=True)
    return exit_code == 0

# Upgrade YSQL catalog
@staticmethod
def upgrade_ysql(master_addrs, timeout):
    """Run the yb-admin "upgrade_ysql" command against the given masters.

    Args:
        master_addrs: Value passed to the -master_addresses option.
        timeout: Value passed (as a string) to the -timeout_ms option; it is
            not applied to run_process() itself.

    Returns:
        True when the command exits with return code 0, False otherwise.
    """
    admin_args = [
        "-master_addresses", master_addrs,
        "-timeout_ms", str(timeout),
        "upgrade_ysql",
    ]
    _stdout, _stderr, exit_code = run_process(
        YBAdminProxy.cmd_args + admin_args, log_cmd=True)
    return exit_code == 0

# Passthrough method for all the yb-admin commands
# @staticmethod
# def call_yb_admin_command(master_addrss, command, timeout=10):
Expand Down

0 comments on commit 5cfe9eb

Please sign in to comment.