From e82daa7053ade05f40b708cb9deb8847595ccb58 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Fri, 22 Mar 2024 13:04:52 -0700 Subject: [PATCH 1/2] service worker capture fix: - due to issues with capturing top-level pages, add option to always disable service workers - add --serviceWorker option which can be disabled, disabled-if-profile (previous default) or enabled - ensure page ts is set for direct fetch - warn if page ts is missing, then set to now before serializing - bump to 1.0.2 --- package.json | 2 +- src/crawler.ts | 13 +++++++++---- src/util/argParser.ts | 9 +++++++++ src/util/browser.ts | 25 ++++++++++++++++++++++++- src/util/constants.ts | 8 ++++++++ 5 files changed, 51 insertions(+), 6 deletions(-) diff --git a/package.json b/package.json index 6dbe6518f..6f6cc5771 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "browsertrix-crawler", - "version": "1.0.1", + "version": "1.0.2", "main": "browsertrix-crawler", "type": "module", "repository": "https://github.com/webrecorder/browsertrix-crawler", diff --git a/src/crawler.ts b/src/crawler.ts index d56a31929..e0289c078 100644 --- a/src/crawler.ts +++ b/src/crawler.ts @@ -758,6 +758,7 @@ self.__bx_behaviors.selectMainBehavior(); data.mime = mime; } data.status = 200; + data.ts = new Date(); logger.info( "Direct fetch successful", { url, ...logDetails }, @@ -1256,6 +1257,7 @@ self.__bx_behaviors.selectMainBehavior(); profileUrl: this.params.profile, headless: this.params.headless, emulateDevice: this.emulateDevice, + swOpt: this.params.serviceWorker, chromeOptions: { proxy: false, userAgent: this.emulateDevice.userAgent, @@ -1980,10 +1982,13 @@ self.__bx_behaviors.selectMainBehavior(); }: PageState) { const row: PageEntry = { id: pageid!, url, title, loadState }; - if (ts) { - row.ts = ts.toISOString(); + if (!ts) { + ts = new Date(); + logger.warn("Page date missing, setting to now", { url, ts }); } + row.ts = ts.toISOString(); + if (mime) { row.mime = mime; } @@ -2000,11 +2005,11 @@
self.__bx_behaviors.selectMainBehavior(); row.seed = true; } - if (text !== null) { + if (text) { row.text = text; } - if (favicon !== null) { + if (favicon) { row.favIconUrl = favicon; } diff --git a/src/util/argParser.ts b/src/util/argParser.ts index 8504db2dd..e64a77666 100644 --- a/src/util/argParser.ts +++ b/src/util/argParser.ts @@ -11,6 +11,7 @@ import { BEHAVIOR_LOG_FUNC, WAIT_UNTIL_OPTS, EXTRACT_TEXT_TYPES, + SERVICE_WORKER_OPTS, } from "./constants.js"; import { ScopedSeed } from "./seeds.js"; import { interpolateFilename } from "./storage.js"; @@ -527,6 +528,14 @@ class ArgParser { "prefix for WARC files generated, including WARCs added to WACZ", type: "string", }, + + serviceWorker: { + alias: "sw", + describe: + "service worker handling: disabled, enabled, or disabled with custom profile", + choices: SERVICE_WORKER_OPTS, + default: "disabled", + }, }; } diff --git a/src/util/browser.ts b/src/util/browser.ts index bdb3d698c..3912ea81d 100644 --- a/src/util/browser.ts +++ b/src/util/browser.ts @@ -9,6 +9,8 @@ import path from "path"; import { LogContext, logger } from "./logger.js"; import { initStorage } from "./storage.js"; +import type { ServiceWorkerOpt } from "./constants.js"; + import puppeteer, { Frame, HTTPRequest, @@ -31,6 +33,8 @@ type LaunchOpts = { // TODO: Fix this the next time the file is edited. 
// eslint-disable-next-line @typescript-eslint/no-explicit-any ondisconnect?: ((err: any) => NonNullable) | null; + + swOpt?: ServiceWorkerOpt; }; // ================================================================== @@ -48,6 +52,8 @@ export class Browser { // eslint-disable-next-line @typescript-eslint/no-explicit-any recorders: any[] = []; + swOpt?: ServiceWorkerOpt = "disabled"; + constructor() { this.profileDir = fs.mkdtempSync(path.join(os.tmpdir(), "profile-")); } @@ -58,6 +64,7 @@ export class Browser { signals = false, headless = false, emulateDevice = {}, + swOpt = "disabled", ondisconnect = null, }: LaunchOpts) { if (this.isLaunched()) { @@ -68,6 +75,8 @@ export class Browser { this.customProfile = await this.loadProfile(profileUrl); } + this.swOpt = swOpt; + this.emulateDevice = emulateDevice; const args = this.chromeArgs(chromeOptions); @@ -107,8 +116,22 @@ export class Browser { if (this.customProfile) { logger.info("Disabling Service Workers for profile", {}, "browser"); + } + + switch (this.swOpt) { + case "disabled": + await page.setBypassServiceWorker(true); + break; + + case "disabled-if-profile": + if (this.customProfile) { + await page.setBypassServiceWorker(true); + } + break; - await page.setBypassServiceWorker(true); + case "enabled": + // do nothing + break; } } diff --git a/src/util/constants.ts b/src/util/constants.ts index 069948ce0..9f7bdd20b 100644 --- a/src/util/constants.ts +++ b/src/util/constants.ts @@ -10,6 +10,14 @@ export const WAIT_UNTIL_OPTS = [ "networkidle2", ]; +export const SERVICE_WORKER_OPTS = [ + "disabled", + "disabled-if-profile", + "enabled", +] as const; + +export type ServiceWorkerOpt = (typeof SERVICE_WORKER_OPTS)[number]; + export const DETECT_SITEMAP = ""; export const EXTRACT_TEXT_TYPES = ["to-pages", "to-warc", "final-to-warc"]; From 3b617020ede30601a7d54d8afa6114f8cd51f74f Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Fri, 22 Mar 2024 13:29:17 -0700 Subject: [PATCH 2/2] better logging --- 
src/util/browser.ts | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/src/util/browser.ts b/src/util/browser.ts index 3912ea81d..046bbc59e 100644 --- a/src/util/browser.ts +++ b/src/util/browser.ts @@ -114,23 +114,25 @@ export class Browser { 'Object.defineProperty(navigator, "webdriver", {value: false});', ); - if (this.customProfile) { - logger.info("Disabling Service Workers for profile", {}, "browser"); - } - switch (this.swOpt) { case "disabled": + logger.info("Service Workers: always disabled", {}, "browser"); await page.setBypassServiceWorker(true); break; case "disabled-if-profile": if (this.customProfile) { + logger.info( + "Service Workers: disabled since using profile", + {}, + "browser", + ); await page.setBypassServiceWorker(true); } break; case "enabled": - // do nothing + logger.info("Service Workers: always enabled", {}, "browser"); break; } }