Removing nuke #1915

Merged: 5 commits, May 17, 2024
9 changes: 4 additions & 5 deletions docs/COMPONENTS.rst
@@ -52,11 +52,10 @@ nodes if they are available, and invokes ``teuthology-dispatcher`` in
``supervisor`` mode. ``supervisor`` reimages the target machines and invokes
``teuthology`` (the command). ``teuthology`` proceeds to execute the job
(execute every task in the YAML job description). After the execution is
-completed (ie ``teuthology`` process exits), ``supervisor`` unlocks or nukes
-the target machines depending on the status of the job. If the requested
-machines are not available, the ``dispatcher`` waits for the machines to be
-available before running anymore jobs. Results from the job are stored in the
-archive directory of the worker for forensic analysis.
+completed (ie ``teuthology`` process exits), ``supervisor`` unlocks the
+target machines. If the requested machines are not available, the ``dispatcher``
+waits for the machines to be available before running anymore jobs. Results from
+the job are stored in the archive directory of the worker for forensic analysis.

Since `QA suites <https://github.com/ceph/ceph-qa-suite>`__ usually
specify ``install`` and ``ceph`` tasks, we briefly describe what they do. When
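The unlock step described above reduces to a single call into teuthology's lock ops. A minimal sketch, assuming only what the call sites added in this PR show (``unlock_safe`` takes the node names plus the owning run's owner, run name, and job id); the helper name is illustrative:

    from teuthology.lock import ops as lock_ops

    def release_targets(job_config):
        """Release a finished job's nodes; with this change they are unlocked, not nuked."""
        # job_config['targets'] maps hostname -> ssh host key, so its keys are
        # the node names to release.
        names = list(job_config["targets"])
        lock_ops.unlock_safe(
            names,
            job_config["owner"],
            job_config["name"],
            job_config["job_id"],
        )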
1 change: 0 additions & 1 deletion docs/README.rst
@@ -34,7 +34,6 @@ Provided Utilities
* :ref:`teuthology-lock` - Lock, unlock, and update status of machines
* :ref:`teuthology-ls` - List job results by examining an archive directory
* :ref:`teuthology-openstack` - Use OpenStack backend (wrapper around ``teuthology-suite``)
-* :ref:`teuthology-nuke` - Attempt to return a machine to a pristine state
* :ref:`teuthology-queue` - List, or delete, jobs in the queue
* :ref:`teuthology-report` - Submit test results to a web service (we use `paddles <https://github.com/ceph/paddles/>`__)
* :ref:`teuthology-results` - Examing a finished run and email results
4 changes: 0 additions & 4 deletions docs/commands/teuthology-nuke.rst

This file was deleted.

6 changes: 0 additions & 6 deletions docs/detailed_test_config.rst
@@ -222,12 +222,6 @@ new tasks in this directory.
Many of these tasks are used to run python scripts that are defined in the
ceph/ceph-qa-suite.

-If machines were locked as part of the run (with the --lock switch),
-teuthology normally leaves them locked when there is any task failure
-for investigation of the machine state. When developing new teuthology
-tasks, sometimes this behavior is not useful. The ``unlock_on_failure``
-global option can be set to true to make the unlocking happen unconditionally.
-
Troubleshooting
===============

1 change: 0 additions & 1 deletion docs/docker-compose/testnode/testnode_stop.sh
@@ -7,4 +7,3 @@ for i in $(seq 1 5); do
curl -s -f -X PUT -d "$payload" http://paddles:8080/nodes/$hostname/ && break
sleep 1
done
-pkill sshd
4 changes: 0 additions & 4 deletions roles/3-simple.yaml

This file was deleted.

10 changes: 0 additions & 10 deletions roles/overrides.yaml

This file was deleted.

48 changes: 48 additions & 0 deletions scripts/node_cleanup.py
@@ -0,0 +1,48 @@
+import argparse
+import logging
+import sys
+
+import teuthology
+from teuthology.lock import query, ops
+
+def main():
+    args = parse_args(sys.argv[1:])
+    if args.verbose:
+        teuthology.log.setLevel(logging.DEBUG)
+    log = logging.getLogger(__name__)
+    stale = query.find_stale_locks(args.owner)
+    if not stale:
+        return
+    if args.dry_run:
+        log.info("Would attempt to unlock:")
+        for node in stale:
+            log.info(f"{node['name']}\t{node['description']}")
+    else:
+        names = [node["name"] for node in stale]
+        ops.unlock_safe(names, args.owner)
+
+def parse_args(argv):
+    parser = argparse.ArgumentParser(
+        description="Find and unlock nodes that are still locked by jobs that are no "
+        "longer active",
+    )
+    parser.add_argument(
+        '-v', '--verbose',
+        action='store_true',
+        default=False,
+        help='Be more verbose',
+    )
+    parser.add_argument(
+        '--dry-run',
+        action='store_true',
+        default=False,
+        help="List nodes that would be unlocked if the flag were omitted",
+    )
+    parser.add_argument(
+        '--owner',
+        help='Optionally, find nodes locked by a specific user',
+    )
+    return parser.parse_args(argv)
+
+if __name__ == "__main__":
+    main()
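A quick way to exercise the new script without touching any locks is its ``--dry-run`` path. A small usage sketch (the owner string is hypothetical):

    from scripts.node_cleanup import parse_args

    # Parse the same flags the installed command would receive; with --dry-run,
    # main() only logs the stale nodes instead of calling ops.unlock_safe().
    args = parse_args(["--dry-run", "--owner", "scheduled_smithi@example"])
    assert args.dry_run and not args.verbose
    print(args.owner)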
47 changes: 0 additions & 47 deletions scripts/nuke.py

This file was deleted.

5 changes: 0 additions & 5 deletions scripts/test/test_nuke.py

This file was deleted.

2 changes: 1 addition & 1 deletion setup.cfg
@@ -67,7 +67,6 @@ scripts =
console_scripts =
teuthology = scripts.run:main
teuthology-openstack = scripts.openstack:main
-teuthology-nuke = scripts.nuke:main
teuthology-suite = scripts.suite:main
teuthology-ls = scripts.ls:main
teuthology-worker = scripts.worker:main
@@ -85,6 +84,7 @@ console_scripts =
teuthology-dispatcher = scripts.dispatcher:main
teuthology-wait = scripts.wait:main
teuthology-exporter = scripts.exporter:main
+teuthology-node-cleanup = scripts.node_cleanup:main

[options.extras_require]
manhole =
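At install time setuptools turns this ``console_scripts`` entry into a ``teuthology-node-cleanup`` wrapper. A small sketch resolving it by hand (Python 3.10+ ``importlib.metadata``; not part of this PR):

    from importlib.metadata import entry_points

    # Look up the entry declared in setup.cfg and load the function it points
    # at, i.e. scripts.node_cleanup:main.
    (ep,) = entry_points(group="console_scripts", name="teuthology-node-cleanup")
    main = ep.load()
    main()  # same as running `teuthology-node-cleanup` with the current argv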
9 changes: 7 additions & 2 deletions teuthology/dispatcher/__init__.py
@@ -15,7 +15,6 @@
# modules
beanstalk,
exporter,
-nuke,
report,
repo_utils,
worker,
@@ -191,7 +190,13 @@ def main(args):
error_message = "Saw error while trying to spawn supervisor."
log.exception(error_message)
if 'targets' in job_config:
-nuke.nuke(supervisor.create_fake_context(job_config), True)
+node_names = job_config["targets"].keys()
+lock_ops.unlock_safe(
+node_names,
+job_config["owner"],
+job_config["name"],
+job_config["job_id"]
+)
report.try_push_job_info(job_config, dict(
status='fail',
failure_reason=error_message))
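``unlock_safe`` itself is not shown in this diff; presumably the "safe" part means it only releases nodes whose lock actually belongs to the given owner and run. A rough sketch of that idea, under that assumption, built from the same primitives the old inline loop used (``query.get_statuses`` and ``lock_ops.unlock_one``):

    from teuthology.lock import query, ops as lock_ops

    def unlock_if_owned(ctx, names, owner, archive_path=None):
        """Unlock only the nodes that are still locked by `owner`."""
        for status in query.get_statuses(names):
            if not status.get("locked") or status.get("locked_by") != owner:
                # Free, or someone else's lock: leave it alone.
                continue
            lock_ops.unlock_one(ctx, status["name"], owner, archive_path)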
22 changes: 7 additions & 15 deletions teuthology/dispatcher/supervisor.py
@@ -8,13 +8,12 @@
from urllib.parse import urljoin
from datetime import datetime

-from teuthology import exporter, kill, nuke, report, safepath
+from teuthology import exporter, kill, report, safepath
from teuthology.config import config as teuth_config
from teuthology.exceptions import SkipJob, MaxWhileTries
from teuthology import setup_log_file, install_except_hook
from teuthology.misc import get_user, archive_logs, compress_logs
from teuthology.config import FakeNamespace
-from teuthology.job_status import get_status
from teuthology.lock import ops as lock_ops
from teuthology.task import internal
from teuthology.misc import decanonicalize_hostname as shortname
@@ -242,7 +241,6 @@ def reimage(job_config):
ctx.summary = {
'sentry_event': sentry.report_error(job_config, e, task_name="reimage")
}
-nuke.nuke(ctx, True)
# Machine that fails to reimage after 10 times will be marked down
check_for_reimage_failures_and_mark_down(targets)
raise
@@ -255,7 +253,7 @@ def unlock_targets(job_config):
serializer = report.ResultsSerializer(teuth_config.archive_base)
job_info = serializer.job_info(job_config['name'], job_config['job_id'])
machine_statuses = query.get_statuses(job_info['targets'].keys())
-# only unlock/nuke targets if locked and description matches
+# only unlock targets if locked and description matches
locked = []
for status in machine_statuses:
name = shortname(status['name'])
@@ -271,16 +269,9 @@
locked.append(name)
if not locked:
return
-job_status = get_status(job_info)
-if job_status == 'pass' or job_config.get('unlock_on_failure', False):
+if job_config.get("unlock_on_failure", True):
log.info('Unlocking machines...')
-fake_ctx = create_fake_context(job_config)
-for machine in locked:
-lock_ops.unlock_one(
-fake_ctx,
-machine, job_info['owner'],
-job_info['archive_path']
-)
+lock_ops.unlock_safe(locked, job_info["owner"], job_info["name"], job_info["job_id"])


def run_with_watchdog(process, job_config):
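The policy change in ``unlock_targets`` is worth spelling out: the job's pass/fail status is no longer consulted, and targets are unlocked unless the job config explicitly opts out. A tiny illustration of the new default (the config fragment is hypothetical; in real runs it comes from the job YAML):

    job_config = {"unlock_on_failure": False}

    # Mirrors the new check: unlock unless explicitly disabled.
    if job_config.get("unlock_on_failure", True):
        print("targets would be unlocked")
    else:
        print("targets stay locked, e.g. for investigating the machine state")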
@@ -305,12 +296,12 @@ def run_with_watchdog(process, job_config):
log.warning("Job ran longer than {max}s. Killing...".format(
max=teuth_config.max_job_time))
try:
-# kill processes but do not nuke yet so we can save
+# kill processes but do not unlock yet so we can save
# the logs, coredumps, etc.
kill.kill_job(
job_info['name'], job_info['job_id'],
teuth_config.archive_base, job_config['owner'],
-skip_nuke=True
+skip_unlock=True
)
except Exception:
log.exception('Failed to kill job')
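The watchdog's kill path deliberately keeps nodes locked (``skip_unlock=True``) so logs and coredumps can still be archived; unlocking then happens through the normal ``unlock_targets`` path. A condensed sketch of that loop, assuming ``max_job_time`` is in seconds and omitting the status reporting the real watchdog does:

    import time

    from teuthology import kill
    from teuthology.config import config as teuth_config

    def watch(process, job_info, job_config):
        start = time.time()
        while process.poll() is None:
            if time.time() - start > teuth_config.max_job_time:
                # Kill the job but leave the targets locked for log collection;
                # the supervisor unlocks them afterwards.
                kill.kill_job(
                    job_info["name"], job_info["job_id"],
                    teuth_config.archive_base, job_config["owner"],
                    skip_unlock=True,
                )
                break
            time.sleep(30)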
@@ -365,6 +356,7 @@ def create_fake_context(job_config, block=False):
'os_type': job_config.get('os_type', 'ubuntu'),
'os_version': os_version,
'name': job_config['name'],
+'job_id': job_config['job_id'],
}

return FakeNamespace(ctx_args)