From d8c492239d4b6c6d514c018028eae380ed65cfe2 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Sun, 3 Mar 2024 12:56:33 -0800 Subject: [PATCH] add --failOnInvalidStatus option to treat non-200 responses as failures, especially when combined with --failOnFailedSeed or --failOnFailedLimit requeue: ensure requeued urls are requeued with same depth/priority, not 0 --- src/crawler.ts | 22 ++++++++++++++++------ src/util/argParser.ts | 8 ++++++++ src/util/state.ts | 5 ++++- 3 files changed, 28 insertions(+), 7 deletions(-) diff --git a/src/crawler.ts b/src/crawler.ts index 7152f66b5..e21352c1d 100644 --- a/src/crawler.ts +++ b/src/crawler.ts @@ -1586,20 +1586,30 @@ self.__bx_behaviors.selectMainBehavior(); }); } - // Handle 4xx or 5xx response as a page load error const status = resp.status(); data.status = status; - if (isChromeError) { + + let failed = isChromeError; + + if (this.params.failOnInvalidStatus && status >= 400) { + // Handle 4xx or 5xx response as a page load error + failed = true; + } + + if (failed) { if (failCrawlOnError) { logger.fatal("Seed Page Load Error, failing crawl", { status, ...logDetails, }); } else { - logger.error("Page Crashed on Load", { - status, - ...logDetails, - }); + logger.error( + isChromeError ? "Page Crashed on Load" : "Page Invalid Status", + { + status, + ...logDetails, + }, + ); throw new Error("logged"); } } diff --git a/src/util/argParser.ts b/src/util/argParser.ts index dafa929d8..d5f942ed3 100644 --- a/src/util/argParser.ts +++ b/src/util/argParser.ts @@ -484,6 +484,14 @@ class ArgParser { default: 0, }, + failOnInvalidStatus: { + describe: + "If set, will treat pages with non-200 response as failures. 
When combined with --failOnFailedLimit or --failOnFailedSeed " + "may result in crawl failing due to non-200 responses", type: "boolean", default: false, }, + customBehaviors: { describe: "injects a custom behavior file or set of behavior files in a directory", diff --git a/src/util/state.ts b/src/util/state.ts index cfbdc573e..e084992f3 100644 --- a/src/util/state.ts +++ b/src/util/state.ts @@ -120,6 +120,7 @@ declare module "ioredis" { pkeyUrl: string, url: string, maxRetryPending: number, + maxRegularDepth: number, ): Result; } } @@ -253,7 +254,8 @@ if not res then redis.call('hdel', KEYS[1], ARGV[1]); if tonumber(data['retry']) <= tonumber(ARGV[2]) then json = cjson.encode(data); - redis.call('zadd', KEYS[2], 0, json); + local score = (data['depth'] or 0) + ((data['extraHops'] or 0) * ARGV[3]); + redis.call('zadd', KEYS[2], score, json); return 1; else return 2; @@ -661,6 +663,7 @@ return 0; this.pkey + ":" + url, url, this.maxRetryPending, + MAX_DEPTH, ); switch (res) { case 1: