Skip to content

Commit

Permalink
Make sure --failOnFailedSeed results in an exit code of 1
Browse files Browse the repository at this point in the history
When pages were unreachable because their hostname did not resolve in DNS,
the crawler was failing with exit code 17 instead of 1, which conflicts with
the expected and documented behavior of --failOnFailedSeed.
  • Loading branch information
tw4l committed May 8, 2024
1 parent ddc3e10 commit 46a66e0
Show file tree
Hide file tree
Showing 3 changed files with 24 additions and 13 deletions.
26 changes: 18 additions & 8 deletions src/crawler.ts
Expand Up @@ -1734,10 +1734,15 @@ self.__bx_behaviors.selectMainBehavior();

if (failed) {
if (failCrawlOnError) {
logger.fatal("Seed Page Load Error, failing crawl", {
status,
...logDetails,
});
logger.fatal(
"Seed Page Load Error, failing crawl",
{
status,
...logDetails,
},
"general",
1,
);
} else {
logger.error(
isChromeError ? "Page Crashed on Load" : "Page Invalid Status",
Expand Down Expand Up @@ -1775,10 +1780,15 @@ self.__bx_behaviors.selectMainBehavior();
data.skipBehaviors = true;
} else if (failCrawlOnError) {
// if fail on error, immediately fail here
logger.fatal("Page Load Timeout, failing crawl", {
msg,
...logDetails,
});
logger.fatal(
"Page Load Timeout, failing crawl",
{
msg,
...logDetails,
},
"general",
1,
);
} else {
// log if not already log and rethrow
if (msg !== "logged") {
Expand Down
5 changes: 3 additions & 2 deletions src/util/argParser.ts
Expand Up @@ -498,7 +498,8 @@ class ArgParser {

failOnFailedSeed: {
describe:
"If set, crawler will fail with exit code 1 if any seed fails",
"If set, crawler will fail with exit code 1 if any seed fails. Crawl will fail if any seeds have a" +
"4xx or 5xx response regardless of whether failOnInvalidStatus is set",
type: "boolean",
default: false,
},
Expand All @@ -512,7 +513,7 @@ class ArgParser {

failOnInvalidStatus: {
describe:
"If set, will treat pages with non-200 response as failures. When combined with --failOnFailedLimit or --failOnFailedSeed" +
"If set, will treat pages with non-200 response as failures. When combined with --failOnFailedLimit" +
"may result in crawl failing due to non-200 responses",
type: "boolean",
default: false,
Expand Down
6 changes: 3 additions & 3 deletions tests/seeds.test.js
Expand Up @@ -7,7 +7,7 @@ test("ensure one invalid seed doesn't end crawl if failOnFailedSeed is not set",
let passed = true;
try {
await exec(
"docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://www.iana.org/ --url example.com/invalid-seed --generateWACZ --limit 1 --collection invalidseed",
"docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://www.iana.org/ --url https://example.invalid --generateWACZ --limit 1 --collection invalidseed",
);
} catch (error) {
console.log(error);
Expand All @@ -20,7 +20,7 @@ test("ensure one invalid seed fails crawl if failOnFailedSeed is set", async ()
let passed = true;
try {
await exec(
"docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://www.iana.org/ --url example.com/invalid-seed --generateWACZ --limit 1 --failOnFailedSeed --collection failseed",
"docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://www.iana.org/ --url https://example.invalid --generateWACZ --limit 1 --failOnFailedSeed --collection failseed",
);
} catch (error) {
passed = false;
Expand All @@ -32,7 +32,7 @@ test("ensure crawl fails if no valid seeds are passed", async () => {
let passed = true;
try {
await exec(
"docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url iana.org/ --url example.com/invalid-seed --generateWACZ --limit 1 --collection allinvalidseeds",
"docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url iana.org/ --url example.invalid --generateWACZ --limit 1 --collection allinvalidseeds",
);
} catch (error) {
passed = false;
Expand Down

0 comments on commit 46a66e0

Please sign in to comment.