Skip to content

Commit

Permalink
service worker capture fix: disable by default for now (#506)
Browse files Browse the repository at this point in the history
Due to issues with capturing top-level pages, make bypassing service
workers the default for now. Previously, service workers were only bypassed
when using a custom profile. (This is also consistent with ArchiveWeb.page behavior.)
Includes:
- add --serviceWorker option which can be `disabled`,
`disabled-if-profile` (previous default) or `enabled`
- ensure page timestamp is set for direct fetch
- warn if page timestamp is missing on serialization, then set to now
before serializing

bump version to 1.0.2
  • Loading branch information
ikreymer committed Mar 22, 2024
1 parent 93c3894 commit 22a7351
Show file tree
Hide file tree
Showing 5 changed files with 55 additions and 8 deletions.
2 changes: 1 addition & 1 deletion package.json
@@ -1,6 +1,6 @@
{
"name": "browsertrix-crawler",
"version": "1.0.1",
"version": "1.0.2",
"main": "browsertrix-crawler",
"type": "module",
"repository": "https://github.com/webrecorder/browsertrix-crawler",
Expand Down
13 changes: 9 additions & 4 deletions src/crawler.ts
Expand Up @@ -758,6 +758,7 @@ self.__bx_behaviors.selectMainBehavior();
data.mime = mime;
}
data.status = 200;
data.ts = new Date();
logger.info(
"Direct fetch successful",
{ url, ...logDetails },
Expand Down Expand Up @@ -1256,6 +1257,7 @@ self.__bx_behaviors.selectMainBehavior();
profileUrl: this.params.profile,
headless: this.params.headless,
emulateDevice: this.emulateDevice,
swOpt: this.params.serviceWorker,
chromeOptions: {
proxy: false,
userAgent: this.emulateDevice.userAgent,
Expand Down Expand Up @@ -1980,10 +1982,13 @@ self.__bx_behaviors.selectMainBehavior();
}: PageState) {
const row: PageEntry = { id: pageid!, url, title, loadState };

if (ts) {
row.ts = ts.toISOString();
if (!ts) {
ts = new Date();
logger.warn("Page date missing, setting to now", { url, ts });
}

row.ts = ts.toISOString();

if (mime) {
row.mime = mime;
}
Expand All @@ -2000,11 +2005,11 @@ self.__bx_behaviors.selectMainBehavior();
row.seed = true;
}

if (text !== null) {
if (text) {
row.text = text;
}

if (favicon !== null) {
if (favicon) {
row.favIconUrl = favicon;
}

Expand Down
9 changes: 9 additions & 0 deletions src/util/argParser.ts
Expand Up @@ -11,6 +11,7 @@ import {
BEHAVIOR_LOG_FUNC,
WAIT_UNTIL_OPTS,
EXTRACT_TEXT_TYPES,
SERVICE_WORKER_OPTS,
} from "./constants.js";
import { ScopedSeed } from "./seeds.js";
import { interpolateFilename } from "./storage.js";
Expand Down Expand Up @@ -527,6 +528,14 @@ class ArgParser {
"prefix for WARC files generated, including WARCs added to WACZ",
type: "string",
},

// Service worker handling option (alias "sw"). The accepted values are the
// entries of SERVICE_WORKER_OPTS; when the option is omitted it defaults to
// "disabled".
serviceWorker: {
alias: "sw",
describe:
"service worker handling: disabled, enabled, or disabled with custom profile",
choices: SERVICE_WORKER_OPTS,
default: "disabled",
},
};
}

Expand Down
31 changes: 28 additions & 3 deletions src/util/browser.ts
Expand Up @@ -9,6 +9,8 @@ import path from "path";
import { LogContext, logger } from "./logger.js";
import { initStorage } from "./storage.js";

import type { ServiceWorkerOpt } from "./constants.js";

import puppeteer, {
Frame,
HTTPRequest,
Expand All @@ -31,6 +33,8 @@ type LaunchOpts = {
// TODO: Fix this the next time the file is edited.
// eslint-disable-next-line @typescript-eslint/no-explicit-any
ondisconnect?: ((err: any) => NonNullable<unknown>) | null;

swOpt?: ServiceWorkerOpt;
};

// ==================================================================
Expand All @@ -48,6 +52,8 @@ export class Browser {
// eslint-disable-next-line @typescript-eslint/no-explicit-any
recorders: any[] = [];

swOpt?: ServiceWorkerOpt = "disabled";

/**
 * Creates a new, uniquely named directory (prefix "profile-") under the
 * OS temp directory and stores its path in `profileDir`.
 */
constructor() {
this.profileDir = fs.mkdtempSync(path.join(os.tmpdir(), "profile-"));
}
Expand All @@ -58,6 +64,7 @@ export class Browser {
signals = false,
headless = false,
emulateDevice = {},
swOpt = "disabled",
ondisconnect = null,
}: LaunchOpts) {
if (this.isLaunched()) {
Expand All @@ -68,6 +75,8 @@ export class Browser {
this.customProfile = await this.loadProfile(profileUrl);
}

this.swOpt = swOpt;

this.emulateDevice = emulateDevice;

const args = this.chromeArgs(chromeOptions);
Expand Down Expand Up @@ -105,10 +114,26 @@ export class Browser {
'Object.defineProperty(navigator, "webdriver", {value: false});',
);

if (this.customProfile) {
logger.info("Disabling Service Workers for profile", {}, "browser");
switch (this.swOpt) {
case "disabled":
logger.info("Service Workers: always disabled", {}, "browser");
await page.setBypassServiceWorker(true);
break;

case "disabled-if-profile":
if (this.customProfile) {
logger.info(
"Service Workers: disabled since using profile",
{},
"browser",
);
await page.setBypassServiceWorker(true);
}
break;

await page.setBypassServiceWorker(true);
case "enabled":
logger.info("Service Workers: always enabled", {}, "browser");
break;
}
}

Expand Down
8 changes: 8 additions & 0 deletions src/util/constants.ts
Expand Up @@ -10,6 +10,14 @@ export const WAIT_UNTIL_OPTS = [
"networkidle2",
];

/**
 * Allowed values for the service worker handling option.
 *
 * Declared `as const` so the entries keep their string-literal types and
 * can be used to derive the `ServiceWorkerOpt` union below.
 */
export const SERVICE_WORKER_OPTS = [
"disabled",
"disabled-if-profile",
"enabled",
] as const;

/** Union of the string literals listed in `SERVICE_WORKER_OPTS`. */
export type ServiceWorkerOpt = (typeof SERVICE_WORKER_OPTS)[number];

export const DETECT_SITEMAP = "<detect>";

export const EXTRACT_TEXT_TYPES = ["to-pages", "to-warc", "final-to-warc"];
Expand Down

0 comments on commit 22a7351

Please sign in to comment.