Skip to content

Commit

Permalink
upgrade puppeteer-core to 22.6.1 (#516)
Browse files Browse the repository at this point in the history
Upgrade to the latest puppeteer-core to keep up with the latest browsers;
the changes are mostly minor syntax updates.

Because puppeteer now hides the executionContextId, we need to create
a frameId -> executionContextId mapping and track it ourselves to support
the custom evaluateWithCLI() function.
  • Loading branch information
ikreymer committed Mar 27, 2024
1 parent 0ad10a8 commit 0d973d6
Show file tree
Hide file tree
Showing 6 changed files with 219 additions and 162 deletions.
2 changes: 1 addition & 1 deletion package.json
Expand Up @@ -30,7 +30,7 @@
"p-queue": "^7.3.4",
"pixelmatch": "^5.3.0",
"pngjs": "^7.0.0",
"puppeteer-core": "^20.8.2",
"puppeteer-core": "^22.6.1",
"sax": "^1.3.0",
"sharp": "^0.32.6",
"tsc": "^2.0.4",
Expand Down
64 changes: 51 additions & 13 deletions src/crawler.ts
Expand Up @@ -10,7 +10,6 @@ import {
QueueState,
PageState,
WorkerId,
PageCallbacks,
} from "./util/state.js";

import { parseArgs } from "./util/argParser.js";
Expand Down Expand Up @@ -57,7 +56,7 @@ import { OriginOverride } from "./util/originoverride.js";
// to ignore HTTPS error for HEAD check
import { Agent as HTTPAgent } from "http";
import { Agent as HTTPSAgent } from "https";
import { CDPSession, Frame, HTTPRequest, Page } from "puppeteer-core";
import { CDPSession, Frame, HTTPRequest, Page, Protocol } from "puppeteer-core";
import { Recorder } from "./util/recorder.js";
import { SitemapReader } from "./util/sitemapper.js";
import { ScopedSeed } from "./util/seeds.js";
Expand Down Expand Up @@ -624,14 +623,12 @@ export class Crawler {
cdp,
workerid,
callbacks,
}: {
page: Page;
cdp: CDPSession;
workerid: WorkerId;
callbacks: PageCallbacks;
}) {
frameIdToExecId,
}: WorkerOpts) {
await this.browser.setupPage({ page, cdp });

await this.setupExecContextEvents(cdp, frameIdToExecId);

if (
(this.adBlockRules && this.params.blockAds) ||
this.blockRules ||
Expand Down Expand Up @@ -704,6 +701,40 @@ self.__bx_behaviors.selectMainBehavior();
}
}

/**
 * Keeps `frameIdToExecId` in sync with the default execution context of each
 * frame, using CDP `Runtime` domain events received on `cdp`.
 *
 * - `Runtime.executionContextCreated`: records `frameId -> context id` for
 *   contexts whose `auxData` marks them as the frame's default context.
 * - `Runtime.executionContextDestroyed`: removes the entry whose context id
 *   matches the destroyed one.
 * - `Runtime.executionContextsCleared`: empties the whole map.
 *
 * @param cdp CDP session on which the Runtime domain is enabled and observed.
 * @param frameIdToExecId Map that is mutated in place by the event handlers.
 */
async setupExecContextEvents(
  cdp: CDPSession,
  frameIdToExecId: Map<string, number>,
) {
  // Listeners are attached *before* enabling the Runtime domain: per the CDP
  // documentation, Runtime.enable immediately reports an
  // executionContextCreated event for every already-existing context, and
  // those events would be dropped if no handler were registered yet.
  // `cdp.on()` is synchronous (it returns the emitter), so it is not awaited.
  cdp.on(
    "Runtime.executionContextCreated",
    (params: Protocol.Runtime.ExecutionContextCreatedEvent) => {
      const { id, auxData } = params.context;
      // Only the default (main world) context of a frame is tracked.
      if (auxData && auxData.isDefault && auxData.frameId) {
        frameIdToExecId.set(auxData.frameId, id);
      }
    },
  );

  cdp.on(
    "Runtime.executionContextDestroyed",
    (params: Protocol.Runtime.ExecutionContextDestroyedEvent) => {
      const { executionContextId } = params;
      // The map is keyed by frame id, so look up the entry by value.
      for (const [frameId, execId] of frameIdToExecId.entries()) {
        if (execId === executionContextId) {
          frameIdToExecId.delete(frameId);
          break;
        }
      }
    },
  );

  cdp.on("Runtime.executionContextsCleared", () => {
    frameIdToExecId.clear();
  });

  await cdp.send("Runtime.enable");
}

loadCustomBehaviors(filename: string) {
let str = "";

Expand Down Expand Up @@ -875,7 +906,13 @@ self.__bx_behaviors.selectMainBehavior();
logger.info("Skipping behaviors for slow page", logDetails, "behavior");
} else {
const res = await timedRun(
this.runBehaviors(page, cdp, data.filteredFrames, logDetails),
this.runBehaviors(
page,
cdp,
data.filteredFrames,
opts.frameIdToExecId,
logDetails,
),
this.params.behaviorTimeout,
"Behaviors timed out",
logDetails,
Expand Down Expand Up @@ -954,6 +991,7 @@ self.__bx_behaviors.selectMainBehavior();
page: Page,
cdp: CDPSession,
frames: Frame[],
frameIdToExecId: Map<string, number>,
logDetails: LogDetails,
) {
try {
Expand All @@ -972,9 +1010,9 @@ self.__bx_behaviors.selectMainBehavior();
const results = await Promise.allSettled(
frames.map((frame) =>
this.browser.evaluateWithCLI(
page,
frame,
cdp,
frame,
frameIdToExecId,
`
if (!self.__bx_behaviors) {
console.error("__bx_behaviors missing, can't run behaviors");
Expand All @@ -988,11 +1026,11 @@ self.__bx_behaviors.selectMainBehavior();
);

for (const res of results) {
const { status, reason }: { status: string; reason?: string } = res;
const { status, reason }: { status: string; reason?: unknown } = res;
if (status === "rejected") {
logger.warn(
"Behavior run partially failed",
{ reason, ...logDetails },
{ reason: formatErr(reason), ...logDetails },
"behavior",
);
}
Expand Down
50 changes: 20 additions & 30 deletions src/util/browser.ts
Expand Up @@ -95,7 +95,7 @@ export class Browser {

const launchOpts: PuppeteerLaunchOptions = {
args,
headless: headless ? "new" : false,
headless,
executablePath: this.getBrowserExe(),
ignoreDefaultArgs: ["--enable-automation", "--hide-scrollbars"],
ignoreHTTPSErrors: true,
Expand Down Expand Up @@ -264,10 +264,10 @@ export class Browser {
}
}

async evaluateWithCLI_(
async evaluateWithCLI(
cdp: CDPSession,
frame: Frame,
cdpContextId: number,
frameIdToExecId: Map<string, number>,
funcString: string,
logData: Record<string, string>,
contextName: LogContext,
Expand All @@ -277,7 +277,7 @@ export class Browser {
// eslint-disable-next-line @typescript-eslint/no-explicit-any
let details: Record<string, any> = { frameUrl, ...logData };

if (!frameUrl || frame.isDetached()) {
if (!frameUrl || frame.detached) {
logger.info(
"Run Script Skipped, frame no longer attached or has no URL",
details,
Expand All @@ -286,6 +286,20 @@ export class Browser {
return false;
}

// eslint-disable-next-line @typescript-eslint/no-explicit-any
const frameId = (frame as any)._id;

const contextId = frameIdToExecId.get(frameId);

if (!contextId) {
logger.warn(
"Not running behavior, missing CDP context id for frame id",
{ frameId },
"browser",
);
return;
}

logger.info("Run Script Started", details, contextName);

// from puppeteer _evaluateInternal() but with includeCommandLineAPI: true
Expand All @@ -294,7 +308,7 @@ export class Browser {

const { exceptionDetails, result } = await cdp.send("Runtime.evaluate", {
expression,
contextId: cdpContextId,
contextId,
returnByValue: true,
awaitPromise: true,
userGesture: true,
Expand Down Expand Up @@ -385,7 +399,7 @@ export class Browser {
if (target.url() === startPage) {
resolve(target);
if (this.browser) {
this.browser.removeListener("targetcreated", listener);
this.browser.off("targetcreated", listener);
}
}
};
Expand Down Expand Up @@ -508,30 +522,6 @@ export class Browser {
});
}

/**
 * Wrapper around `evaluateWithCLI_()` that resolves the CDP session and
 * execution context id from the frame itself before delegating.
 *
 * The first parameter is ignored, and the `cdp` argument passed by the caller
 * is overwritten with the client taken from the frame's execution context.
 *
 * NOTE(review): relies on `frame.executionContext()` and on the private
 * `_client` / `_contextId` fields of its result, accessed through an `any`
 * cast — presumably puppeteer internals; confirm they exist in the
 * puppeteer-core version in use.
 */
async evaluateWithCLI(
_: unknown,
frame: Frame,
cdp: CDPSession,
funcString: string,
// eslint-disable-next-line @typescript-eslint/no-explicit-any
logData: Record<string, any>,
contextName: LogContext,
) {
// TODO: Fix this the next time the file is edited.
// eslint-disable-next-line @typescript-eslint/no-explicit-any
const context = await (frame as any).executionContext();
// Replace the caller-supplied session with the one owned by the context.
cdp = context._client;
const cdpContextId = context._contextId;
return await this.evaluateWithCLI_(
cdp,
frame,
cdpContextId,
funcString,
logData,
contextName,
);
}

/**
 * Registers `callback` as a listener for the "request" event of `page`.
 *
 * @param page Page whose "request" events are observed.
 * @param callback Invoked with the `HTTPRequest` carried by each event.
 */
interceptRequest(page: Page, callback: (event: HTTPRequest) => void) {
page.on("request", callback);
}
Expand Down
12 changes: 6 additions & 6 deletions src/util/recorder.ts
Expand Up @@ -79,7 +79,7 @@ export class Recorder {
skipIds!: Set<string>;
pageInfo!: PageInfoRecord;

swSessionId?: string | null;
swTargetId?: string | null;
swFrameIds = new Set<string>();
swUrls = new Set<string>();

Expand Down Expand Up @@ -169,19 +169,19 @@ export class Recorder {

// Target
cdp.on("Target.attachedToTarget", async (params) => {
const { url, type, sessionId } = params.targetInfo;
const { url, type, targetId } = params.targetInfo;
if (type === "service_worker") {
this.swSessionId = sessionId;
this.swTargetId = targetId;
this.swUrls.add(url);
}
});

cdp.on("Target.detachedFromTarget", async (params) => {
const { sessionId } = params;
if (this.swSessionId && sessionId === this.swSessionId) {
const { targetId } = params;
if (this.swTargetId && targetId === this.swTargetId) {
this.swUrls.clear();
this.swFrameIds.clear();
this.swSessionId = null;
this.swTargetId = null;
}
});

Expand Down
2 changes: 2 additions & 0 deletions src/util/worker.ts
Expand Up @@ -23,6 +23,7 @@ export type WorkerOpts = {
directFetchCapture?:
| ((url: string) => Promise<{ fetched: boolean; mime: string }>)
| null;
frameIdToExecId: Map<string, number>;
};

// ===========================================================================
Expand Down Expand Up @@ -178,6 +179,7 @@ export class PageWorker {
workerid,
callbacks: this.callbacks,
directFetchCapture,
frameIdToExecId: new Map<string, number>(),
};

if (this.recorder) {
Expand Down

0 comments on commit 0d973d6

Please sign in to comment.