Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

service worker capture fix: disable by default for now #506

Merged
merged 2 commits into from Mar 22, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
2 changes: 1 addition & 1 deletion package.json
@@ -1,6 +1,6 @@
{
"name": "browsertrix-crawler",
"version": "1.0.1",
"version": "1.0.2",
"main": "browsertrix-crawler",
"type": "module",
"repository": "https://github.com/webrecorder/browsertrix-crawler",
Expand Down
13 changes: 9 additions & 4 deletions src/crawler.ts
Expand Up @@ -758,6 +758,7 @@ self.__bx_behaviors.selectMainBehavior();
data.mime = mime;
}
data.status = 200;
data.ts = new Date();
logger.info(
"Direct fetch successful",
{ url, ...logDetails },
Expand Down Expand Up @@ -1256,6 +1257,7 @@ self.__bx_behaviors.selectMainBehavior();
profileUrl: this.params.profile,
headless: this.params.headless,
emulateDevice: this.emulateDevice,
swOpt: this.params.serviceWorker,
chromeOptions: {
proxy: false,
userAgent: this.emulateDevice.userAgent,
Expand Down Expand Up @@ -1980,10 +1982,13 @@ self.__bx_behaviors.selectMainBehavior();
}: PageState) {
const row: PageEntry = { id: pageid!, url, title, loadState };

if (ts) {
row.ts = ts.toISOString();
if (!ts) {
ts = new Date();
logger.warn("Page date missing, setting to now", { url, ts });
}

row.ts = ts.toISOString();

if (mime) {
row.mime = mime;
}
Expand All @@ -2000,11 +2005,11 @@ self.__bx_behaviors.selectMainBehavior();
row.seed = true;
}

if (text !== null) {
if (text) {
row.text = text;
}

if (favicon !== null) {
if (favicon) {
row.favIconUrl = favicon;
}

Expand Down
9 changes: 9 additions & 0 deletions src/util/argParser.ts
Expand Up @@ -11,6 +11,7 @@ import {
BEHAVIOR_LOG_FUNC,
WAIT_UNTIL_OPTS,
EXTRACT_TEXT_TYPES,
SERVICE_WORKER_OPTS,
} from "./constants.js";
import { ScopedSeed } from "./seeds.js";
import { interpolateFilename } from "./storage.js";
Expand Down Expand Up @@ -527,6 +528,14 @@ class ArgParser {
"prefix for WARC files generated, including WARCs added to WACZ",
type: "string",
},

serviceWorker: {
alias: "sw",
describe:
"service worker handling: disabled, enabled, or disabled with custom profile",
choices: SERVICE_WORKER_OPTS,
default: "disabled",
},
};
}

Expand Down
31 changes: 28 additions & 3 deletions src/util/browser.ts
Expand Up @@ -9,6 +9,8 @@ import path from "path";
import { LogContext, logger } from "./logger.js";
import { initStorage } from "./storage.js";

import type { ServiceWorkerOpt } from "./constants.js";

import puppeteer, {
Frame,
HTTPRequest,
Expand All @@ -31,6 +33,8 @@ type LaunchOpts = {
// TODO: Fix this the next time the file is edited.
// eslint-disable-next-line @typescript-eslint/no-explicit-any
ondisconnect?: ((err: any) => NonNullable<unknown>) | null;

swOpt?: ServiceWorkerOpt;
};

// ==================================================================
Expand All @@ -48,6 +52,8 @@ export class Browser {
// eslint-disable-next-line @typescript-eslint/no-explicit-any
recorders: any[] = [];

swOpt?: ServiceWorkerOpt = "disabled";

constructor() {
this.profileDir = fs.mkdtempSync(path.join(os.tmpdir(), "profile-"));
}
Expand All @@ -58,6 +64,7 @@ export class Browser {
signals = false,
headless = false,
emulateDevice = {},
swOpt = "disabled",
ondisconnect = null,
}: LaunchOpts) {
if (this.isLaunched()) {
Expand All @@ -68,6 +75,8 @@ export class Browser {
this.customProfile = await this.loadProfile(profileUrl);
}

this.swOpt = swOpt;

this.emulateDevice = emulateDevice;

const args = this.chromeArgs(chromeOptions);
Expand Down Expand Up @@ -105,10 +114,26 @@ export class Browser {
'Object.defineProperty(navigator, "webdriver", {value: false});',
);

if (this.customProfile) {
logger.info("Disabling Service Workers for profile", {}, "browser");
switch (this.swOpt) {
case "disabled":
logger.info("Service Workers: always disabled", {}, "browser");
await page.setBypassServiceWorker(true);
break;

case "disabled-if-profile":
if (this.customProfile) {
logger.info(
"Service Workers: disabled since using profile",
{},
"browser",
);
await page.setBypassServiceWorker(true);
}
break;

await page.setBypassServiceWorker(true);
case "enabled":
logger.info("Service Workers: always enabled", {}, "browser");
break;
}
}

Expand Down
8 changes: 8 additions & 0 deletions src/util/constants.ts
Expand Up @@ -10,6 +10,14 @@ export const WAIT_UNTIL_OPTS = [
"networkidle2",
];

/**
 * Allowed values for the service worker handling setting.
 *
 * Declared `as const` so the array is a readonly tuple of string literals;
 * this lets `ServiceWorkerOpt` be derived from the list instead of being
 * maintained as a separate hand-written union.
 */
export const SERVICE_WORKER_OPTS = [
// service workers are always bypassed
"disabled",
// service workers are bypassed only when a custom browser profile is in use
"disabled-if-profile",
// service workers are never bypassed
"enabled",
] as const;

/**
 * Union of the literals listed in `SERVICE_WORKER_OPTS`:
 * `"disabled" | "disabled-if-profile" | "enabled"`.
 */
export type ServiceWorkerOpt = (typeof SERVICE_WORKER_OPTS)[number];

// NOTE(review): looks like a sentinel placeholder string rather than a real
// URL, presumably requesting sitemap auto-detection — confirm against usages.
export const DETECT_SITEMAP = "<detect>";

// Not declared `as const` (unlike SERVICE_WORKER_OPTS), so this is typed as a
// plain string[] rather than a tuple of literals.
// NOTE(review): presumably the accepted values of a text-extraction option —
// confirm where it is consumed.
export const EXTRACT_TEXT_TYPES = ["to-pages", "to-warc", "final-to-warc"];
Expand Down