Instruct runners to always pull from online queue after getting their server-side retry manifest (#81)
ayazhafiz committed Dec 5, 2023
1 parent a1942ce commit e9d4285
Showing 11 changed files with 455 additions and 82 deletions.
15 changes: 15 additions & 0 deletions CHANGELOG.md
@@ -1,3 +1,18 @@
## 1.7.0

ABQ 1.7.0 is a minor release.

This release improves ABQ's behavior when an ABQ runner is terminated before
being assigned all applicable tests in a run manifest. In previous versions of
ABQ, retrying such a runner would only retry the tests the runner was assigned
before it terminated. Starting with ABQ 1.7.0, a runner that connects for a run
ID after it was terminated will run all tests it ran on its first connection,
and then pull tests from the run queue.

ABQ continues to cancel runs when a runner is terminated with SIGTERM, SIGINT,
or SIGQUIT. The changes in 1.7.0 apply to runners terminated in other ways, for
example via SIGKILL.

## 1.6.4

ABQ 1.6.4 is a patch release.
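The 1.7.0 changelog entry above describes the new retry flow: a retried runner first replays the tests it was assigned on its original connection, then keeps pulling unassigned tests from the online queue. The following is a minimal Rust sketch of that ordering only; TestSpec, OnlineQueue, and run_retried_runner are hypothetical names for illustration, not ABQ's real types.

// Sketch only: hypothetical types illustrating the 1.7.0 behavior described
// in the changelog entry above. The essential point is the ordering: replay
// the server-side retry manifest first, then continue pulling from the queue.

#[derive(Debug, Clone)]
struct TestSpec {
    id: String,
}

struct OnlineQueue {
    // Tests that were never assigned before the original runner was killed.
    pending: Vec<TestSpec>,
}

impl OnlineQueue {
    fn next_test(&mut self) -> Option<TestSpec> {
        self.pending.pop()
    }
}

fn run_test(test: &TestSpec) {
    println!("running {}", test.id);
}

fn run_retried_runner(retry_manifest: Vec<TestSpec>, queue: &mut OnlineQueue) {
    // 1. Re-run everything this runner ran on its first connection.
    for test in &retry_manifest {
        run_test(test);
    }
    // 2. New in 1.7.0: keep draining the online queue instead of stopping here.
    while let Some(test) = queue.next_test() {
        run_test(&test);
    }
}

fn main() {
    let retry_manifest = vec![TestSpec { id: "test0".into() }];
    let mut queue = OnlineQueue {
        pending: vec![
            TestSpec { id: "test3".into() },
            TestSpec { id: "test2".into() },
            TestSpec { id: "test1".into() },
        ],
    };
    run_retried_runner(retry_manifest, &mut queue);
}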
2 changes: 1 addition & 1 deletion Cargo.lock

Some generated files are not rendered by default.

2 changes: 1 addition & 1 deletion crates/abq_cli/Cargo.toml
@@ -1,6 +1,6 @@
[package]
name = "abq"
version = "1.6.4"
version = "1.7.0"
edition = "2021"

[dependencies]
2 changes: 1 addition & 1 deletion crates/abq_cli/src/report.rs
@@ -293,7 +293,7 @@ async fn wait_for_results_help(
}
RunInProgress { active_runners } => {
if active_runners.is_empty() {
bail!("this ABQ run has not assigned all tests in your test suite, but there are no active runners to assign them to. Please either add more runners, or launch a new run.")
bail!("this ABQ run has not assigned all tests in your test suite, but there are no active runners to assign them to. Please retry a runner, add more runners, or launch a new run.")
}

let active_runners = active_runners
162 changes: 158 additions & 4 deletions crates/abq_cli/tests/cli.rs
@@ -417,12 +417,15 @@ fn wait_for_live_worker(worker_stderr: &mut ChildStderr) {

// Waits for the debug line "starting execution of all tests" in the worker.
fn wait_for_worker_executing(worker_stderr: &mut ChildStderr) {
- let mut worker_reader = BufReader::new(worker_stderr).lines();
- // Spin until we know the worker0 is UP
+ wait_for_line(worker_stderr, "starting execution of all tests");
+ }
+
+ fn wait_for_line<R: std::io::Read>(reader: R, output: &str) {
+ let mut reader = BufReader::new(reader).lines();
loop {
- if let Some(line) = worker_reader.next() {
+ if let Some(line) = reader.next() {
let line = line.expect("line is not a string");
- if line.contains("starting execution of all tests") {
+ if line.contains(output) {
break;
}
}
@@ -5017,3 +5020,154 @@ fn write_partial_rwx_v1_json_results_on_early_runner_termination() {
}
"###);
}

#[test]
#[with_protocol_version]
#[serial]
fn retry_continued_manifest_read_on_worker_death() {
let name = "retry_continued_manifest_read_on_worker_death";
let conf = CSConfigOptions {
use_auth_token: true,
tls: true,
};

let (_queue_proc, queue_addr) = setup_queue!(name, conf);

let proto = AbqProtocolVersion::V0_2.get_supported_witness().unwrap();

let manifest = (0..4)
.map(|i| {
TestOrGroup::test(Test::new(
proto,
format!("test{}", i),
[],
Default::default(),
))
})
.collect::<Vec<_>>();

let manifest = ManifestMessage::new(Manifest::new(manifest, Default::default()));

let make_simulation = |i| {
[
Connect,
//
// Write spawn message
OpaqueWrite(pack(legal_spawned_message(proto))),
//
// Write the manifest if we need to.
// Otherwise handle the one test.
IfGenerateManifest {
then_do: vec![OpaqueWrite(pack(&manifest))],
else_do: {
let mut actions = vec![
//
// Read init context message + write ACK
OpaqueRead,
OpaqueWrite(pack(InitSuccessMessage::new(proto))),
// Read first test, write okay
IfAliveReadAndWriteFake(Status::Success),
Stdout("finished running first test\n".into()),
];
if i == 1 {
// First run: sleep forever, we will kill the worker.
actions.push(Sleep(Duration::from_secs(600)));
} else {
for _ in 0..3 {
actions.push(IfAliveReadAndWriteFake(Status::Success));
}
}
actions
},
},
//
// Finish
Exit(0),
]
};

let simulation1 = make_simulation(1);
let simulation2 = make_simulation(2);

let packed = pack_msgs_to_disk(simulation1);

let run_id = "test-run-id";

let test_args = {
let simulator = native_runner_simulation_bin();
let simfile_path = packed.path.display().to_string();
let args = vec![
format!("test"),
format!("--worker=0"),
format!("--queue-addr={queue_addr}"),
format!("--run-id={run_id}"),
format!("--batch-size=1"),
];
let mut args = conf.extend_args_for_client(args);
args.extend([s!("--"), simulator, simfile_path]);
args
};

let report_args = {
let args = vec![
format!("report"),
format!("--reporter=dot"),
format!("--queue-addr={queue_addr}"),
format!("--run-id={run_id}"),
format!("--color=never"),
];
conf.extend_args_for_client(args)
};

let mut worker0_attempt1 = Abq::new(format!("{name}_worker0_attempt1"))
.args(&test_args)
.spawn();

let attempt1_stdout = worker0_attempt1.stdout.as_mut().unwrap();
wait_for_line(attempt1_stdout, "finished running first test");

// Kill the worker.
worker0_attempt1.kill().unwrap();

{
let CmdOutput { exit_status, .. } = Abq::new(name.to_string() + "_report1")
.args(&report_args)
.run();
assert!(!exit_status.success());
}

std::fs::write(packed.path, pack_msgs(simulation2)).unwrap();

{
let CmdOutput {
stdout,
stderr,
exit_status,
} = Abq::new(format!("{name}_worker0_attempt1"))
.args(test_args)
.run();
assert!(
exit_status.success(),
"STDOUT:\n{stdout}\nSTDERR:\n{stderr}"
);
assert!(
stdout.contains("4 tests, 0 failures"),
"STDOUT:\n{stdout}\nSTDERR:\n{stderr}"
);
}

{
let CmdOutput {
stdout,
stderr,
exit_status,
} = Abq::new(name.to_string() + "_report2")
.args(report_args)
.run();
assert!(exit_status.success());
assert!(
stdout.contains("4 tests, 0 failures"),
"STDOUT:\n{stdout}\nSTDERR:\n{stderr}"
);
}
}
91 changes: 73 additions & 18 deletions crates/abq_queue/src/queue.rs
@@ -422,7 +422,7 @@ impl AllRuns {
runner_test_command_differs,
})
}
- Some((old_entity, old_finished_state)) => {
+ Some((old_entity, old_finished_time)) => {
if old_entity.id == entity.id {
// The same worker entity is connecting twice - this implies that the
// same worker process is asking to find a run more than once, which
@@ -453,13 +453,13 @@
// itself, and if this is done we could hand out the manifest right here,
// rather than asking the runner to reconnect to retrieve the manifest.
tracing::info!(
- ?old_finished_state,
+ ?old_finished_time,
?entity,
"worker reconnecting for out-of-process retry manifest during active run"
);

AssignedRunStatus::Run(AssignedRun {
- kind: AssignedRunKind::Retry,
+ kind: AssignedRunKind::RetryAndContinue,
runner_test_command_differs,
})
}
@@ -488,7 +488,7 @@
}

AssignedRunStatus::Run(AssignedRun {
- kind: AssignedRunKind::Retry,
+ kind: AssignedRunKind::RetryAndContinue,
runner_test_command_differs,
})
} else {
@@ -1305,21 +1305,11 @@ impl AllRuns {
// legal cancellation states
}
RunState::InitialManifestDone { seen_workers, .. } => {
// Since we already have issued the full manifest out, don't mark this run as
// cancelled; this might be a stragling worker or a worker that cancelled an
// out-of-process retry.
tracing::info!(
?run_id,
"refusing to cancel run whose manifest has already been exhausted"
);
// Mark the worker as now-inactive.
- let old_tag = seen_workers.write().insert_by_tag(entity, false);
- log_assert!(
- old_tag.is_some(),
- ?entity,
- ?run_id,
- "entity was not seen before it marked cancellation"
- );
+ seen_workers.write().insert_by_tag(entity, false);
return;
}
}
@@ -3995,7 +3985,7 @@ mod test {
assert_eq!(
assigned,
AssignedRunStatus::Run(AssignedRun {
- kind: AssignedRunKind::Retry,
+ kind: AssignedRunKind::RetryAndContinue,
runner_test_command_differs: false
})
);
@@ -4331,7 +4321,7 @@ mod persistence_on_end_of_manifest {

#[tokio::test]
#[with_protocol_version]
- async fn worker_told_to_pull_retry_manifest() {
+ async fn worker_told_to_pull_retry_manifest_and_continue() {
let queues = SharedRuns::default();
let remote = remote::NoopPersister::new().into();

@@ -4388,10 +4378,75 @@
.build(),
)
.await;

assert_eq!(
assigned,
AssignedRunStatus::Run(AssignedRun {
kind: AssignedRunKind::RetryAndContinue,
runner_test_command_differs: false
})
);
}

#[tokio::test]
#[with_protocol_version]
async fn worker_told_to_pull_retry_manifest_no_continue() {
let queues = SharedRuns::default();
let remote = remote::NoopPersister::new().into();

let run_id = RunId::unique();

let worker0 = Entity::runner(1, 1);
let worker0_shadow = Entity::runner(1, 1);
assert_ne!(worker0.id, worker0_shadow.id);
assert_eq!(worker0.tag, worker0_shadow.tag);

let test1 = fake_test_spec(proto);
let test2 = fake_test_spec(proto);
let test3 = fake_test_spec(proto);

let test_command_hash = TestCommandHash::random();

// Create run, add manifest by worker0
{
let run_params = RunParamsBuilder::new(&run_id, &remote)
.entity(worker0)
.runner_test_command_hash(test_command_hash)
.build();
let manifest = vec![
(test1.clone(), GroupId::new()),
(test2, GroupId::new()),
(test3, GroupId::new()),
];
let _ = queues.find_or_create_run(run_params).await;
let _ = queues.add_manifest(&run_id, manifest, Default::default());
}

// worker0 pulls tests
{
let NextWorkResult { bundle, .. } = queues.next_work(worker0, &run_id);
assert_eq!(
bundle.work,
vec![WorkerTest::new(test1.clone(), INIT_RUN_NUMBER)]
);
}

queues.mark_worker_complete(&run_id, worker0, std::time::Instant::now());

// Suppose worker0 re-runs.
let assigned = queues
.find_or_create_run(
RunParamsBuilder::new(&run_id, &remote)
.entity(worker0_shadow)
.runner_test_command_hash(test_command_hash)
.build(),
)
.await;

assert_eq!(
assigned,
AssignedRunStatus::Run(AssignedRun {
- kind: AssignedRunKind::Retry,
+ kind: AssignedRunKind::RetryAndContinue,
runner_test_command_differs: false
})
);
4 changes: 2 additions & 2 deletions crates/abq_workers/src/assigned_run.rs
@@ -7,8 +7,8 @@ use serde_derive::{Deserialize, Serialize};
pub enum AssignedRunKind {
/// This worker is connecting for a fresh run, and should fetch tests online.
Fresh { should_generate_manifest: bool },
- /// This worker is connecting for a retry, and should fetch its manifest from the queue once.
- Retry,
+ /// This worker should pull its retry manifest, and then continue to fetch tests online.
+ RetryAndContinue,
}

#[derive(Serialize, Deserialize, Debug, PartialEq, Eq)]
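For orientation, here is a hedged sketch of how a consumer of AssignedRunKind might turn the two variants into a plan, mirroring the negotiate.rs change below. The RunPlan struct and the fetch_retry_manifest_first flag are illustrative assumptions; only should_generate_manifest corresponds to a real field in the diff.

// Illustrative stand-in enum and plan; not ABQ's actual negotiator code.
#[derive(Debug)]
enum AssignedRunKind {
    Fresh { should_generate_manifest: bool },
    RetryAndContinue,
}

#[derive(Debug)]
struct RunPlan {
    generate_manifest: bool,
    // Hypothetical flag: replay the server-side retry manifest before
    // fetching further tests online.
    fetch_retry_manifest_first: bool,
}

fn plan_for(kind: &AssignedRunKind) -> RunPlan {
    match kind {
        AssignedRunKind::Fresh {
            should_generate_manifest,
        } => RunPlan {
            generate_manifest: *should_generate_manifest,
            fetch_retry_manifest_first: false,
        },
        // RetryAndContinue never regenerates the manifest; it replays the
        // retry manifest and then keeps pulling from the online queue.
        AssignedRunKind::RetryAndContinue => RunPlan {
            generate_manifest: false,
            fetch_retry_manifest_first: true,
        },
    }
}

fn main() {
    let retry_plan = plan_for(&AssignedRunKind::RetryAndContinue);
    assert!(!retry_plan.generate_manifest);
    assert!(retry_plan.fetch_retry_manifest_first);

    let fresh_plan = plan_for(&AssignedRunKind::Fresh {
        should_generate_manifest: true,
    });
    assert!(fresh_plan.generate_manifest);
    println!("{retry_plan:?} {fresh_plan:?}");
}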
2 changes: 1 addition & 1 deletion crates/abq_workers/src/negotiate.rs
@@ -248,7 +248,7 @@ impl WorkersNegotiator {
AssignedRunKind::Fresh {
should_generate_manifest,
} => should_generate_manifest,
- AssignedRunKind::Retry => false,
+ AssignedRunKind::RetryAndContinue => false,
};

let runner_strategy_generator = RunnerStrategyGenerator::new(
