Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

upgrade puppeteer-core to 22.6.1 #516

Merged
merged 4 commits into from Mar 27, 2024
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
2 changes: 1 addition & 1 deletion package.json
Expand Up @@ -30,7 +30,7 @@
"p-queue": "^7.3.4",
"pixelmatch": "^5.3.0",
"pngjs": "^7.0.0",
"puppeteer-core": "^20.8.2",
"puppeteer-core": "^22.6.1",
"sax": "^1.3.0",
"sharp": "^0.32.6",
"tsc": "^2.0.4",
Expand Down
65 changes: 52 additions & 13 deletions src/crawler.ts
Expand Up @@ -10,7 +10,6 @@ import {
QueueState,
PageState,
WorkerId,
PageCallbacks,
} from "./util/state.js";

import { parseArgs } from "./util/argParser.js";
Expand Down Expand Up @@ -57,7 +56,7 @@ import { OriginOverride } from "./util/originoverride.js";
// to ignore HTTPS error for HEAD check
import { Agent as HTTPAgent } from "http";
import { Agent as HTTPSAgent } from "https";
import { CDPSession, Frame, HTTPRequest, Page } from "puppeteer-core";
import { CDPSession, Frame, HTTPRequest, Page, Protocol } from "puppeteer-core";
import { Recorder } from "./util/recorder.js";
import { SitemapReader } from "./util/sitemapper.js";
import { ScopedSeed } from "./util/seeds.js";
Expand Down Expand Up @@ -624,14 +623,12 @@ export class Crawler {
cdp,
workerid,
callbacks,
}: {
page: Page;
cdp: CDPSession;
workerid: WorkerId;
callbacks: PageCallbacks;
}) {
frameIdToExecId,
}: WorkerOpts) {
await this.browser.setupPage({ page, cdp });

await this.setupExecContextEvents(cdp, frameIdToExecId);

if (
(this.adBlockRules && this.params.blockAds) ||
this.blockRules ||
Expand Down Expand Up @@ -704,6 +701,41 @@ self.__bx_behaviors.selectMainBehavior();
}
}

/**
 * Keep `frameIdToExecId` in sync with the default execution context of each
 * frame reported on the given CDP session.
 *
 * - `Runtime.executionContextCreated`: records `frameId -> context id` when
 *   the context's auxData marks it as the default context of a frame.
 * - `Runtime.executionContextDestroyed`: drops the entry whose value is the
 *   destroyed context id.
 * - `Runtime.executionContextsCleared`: empties the map.
 *
 * @param cdp - CDP session on which the Runtime domain is enabled.
 * @param frameIdToExecId - map mutated in place by the event handlers.
 */
async setupExecContextEvents(
  cdp: CDPSession,
  frameIdToExecId: Map<string, number>,
) {
  // Listener registration is synchronous, so there is nothing to await.
  // Listeners are attached BEFORE enabling the Runtime domain: enabling
  // reports already-existing contexts right away, and those events would be
  // lost if the handlers were only attached after the enable call resolved.
  cdp.on(
    "Runtime.executionContextCreated",
    (params: Protocol.Runtime.ExecutionContextCreatedEvent) => {
      const { id, auxData } = params.context;
      if (auxData && auxData.isDefault && auxData.frameId) {
        frameIdToExecId.set(auxData.frameId, id);
      }
    },
  );

  cdp.on(
    "Runtime.executionContextDestroyed",
    (params: Protocol.Runtime.ExecutionContextDestroyedEvent) => {
      const { executionContextId } = params;
      // Reverse lookup by value; stop at the first matching frame.
      for (const [frameId, execId] of frameIdToExecId.entries()) {
        if (execId === executionContextId) {
          frameIdToExecId.delete(frameId);
          break;
        }
      }
    },
  );

  cdp.on("Runtime.executionContextsCleared", () => {
    frameIdToExecId.clear();
  });

  await cdp.send("Runtime.enable");
}

loadCustomBehaviors(filename: string) {
let str = "";

Expand Down Expand Up @@ -875,7 +907,13 @@ self.__bx_behaviors.selectMainBehavior();
logger.info("Skipping behaviors for slow page", logDetails, "behavior");
} else {
const res = await timedRun(
this.runBehaviors(page, cdp, data.filteredFrames, logDetails),
this.runBehaviors(
page,
cdp,
data.filteredFrames,
opts.frameIdToExecId,
logDetails,
),
this.params.behaviorTimeout,
"Behaviors timed out",
logDetails,
Expand Down Expand Up @@ -954,6 +992,7 @@ self.__bx_behaviors.selectMainBehavior();
page: Page,
cdp: CDPSession,
frames: Frame[],
frameIdToExecId: Map<string, number>,
logDetails: LogDetails,
) {
try {
Expand All @@ -972,9 +1011,9 @@ self.__bx_behaviors.selectMainBehavior();
const results = await Promise.allSettled(
frames.map((frame) =>
this.browser.evaluateWithCLI(
page,
frame,
cdp,
frame,
frameIdToExecId,
`
if (!self.__bx_behaviors) {
console.error("__bx_behaviors missing, can't run behaviors");
Expand All @@ -988,11 +1027,11 @@ self.__bx_behaviors.selectMainBehavior();
);

for (const res of results) {
const { status, reason }: { status: string; reason?: string } = res;
const { status, reason }: { status: string; reason?: unknown } = res;
if (status === "rejected") {
logger.warn(
"Behavior run partially failed",
{ reason, ...logDetails },
{ reason: formatErr(reason), ...logDetails },
"behavior",
);
}
Expand Down
50 changes: 20 additions & 30 deletions src/util/browser.ts
Expand Up @@ -95,7 +95,7 @@ export class Browser {

const launchOpts: PuppeteerLaunchOptions = {
args,
headless: headless ? "new" : false,
headless,
executablePath: this.getBrowserExe(),
ignoreDefaultArgs: ["--enable-automation", "--hide-scrollbars"],
ignoreHTTPSErrors: true,
Expand Down Expand Up @@ -264,10 +264,10 @@ export class Browser {
}
}

async evaluateWithCLI_(
async evaluateWithCLI(
cdp: CDPSession,
frame: Frame,
cdpContextId: number,
frameIdToExecId: Map<string, number>,
funcString: string,
logData: Record<string, string>,
contextName: LogContext,
Expand All @@ -277,7 +277,7 @@ export class Browser {
// eslint-disable-next-line @typescript-eslint/no-explicit-any
let details: Record<string, any> = { frameUrl, ...logData };

if (!frameUrl || frame.isDetached()) {
if (!frameUrl || frame.detached) {
logger.info(
"Run Script Skipped, frame no longer attached or has no URL",
details,
Expand All @@ -286,6 +286,20 @@ export class Browser {
return false;
}

// eslint-disable-next-line @typescript-eslint/no-explicit-any
const frameId = (frame as any)._id;

const contextId = frameIdToExecId.get(frameId);

if (!contextId) {
logger.warn(
"Not running behavior, missing CDP context id for frame id",
{ frameId },
"browser",
);
return;
}

logger.info("Run Script Started", details, contextName);

// from puppeteer _evaluateInternal() but with includeCommandLineAPI: true
Expand All @@ -294,7 +308,7 @@ export class Browser {

const { exceptionDetails, result } = await cdp.send("Runtime.evaluate", {
expression,
contextId: cdpContextId,
contextId,
returnByValue: true,
awaitPromise: true,
userGesture: true,
Expand Down Expand Up @@ -385,7 +399,7 @@ export class Browser {
if (target.url() === startPage) {
resolve(target);
if (this.browser) {
this.browser.removeListener("targetcreated", listener);
this.browser.off("targetcreated", listener);
}
}
};
Expand Down Expand Up @@ -508,30 +522,6 @@ export class Browser {
});
}

/**
 * Wrapper around `evaluateWithCLI_` that looks up the frame's execution
 * context itself.
 *
 * The first argument is ignored. The `cdp` argument is overwritten with the
 * session read from the execution context, and the context id is read from
 * the same object; both are underscore-prefixed fields reached through an
 * `any` cast.
 *
 * TODO: Fix the `any` usage the next time the file is edited.
 */
async evaluateWithCLI(
  _: unknown,
  frame: Frame,
  cdp: CDPSession,
  funcString: string,
  // eslint-disable-next-line @typescript-eslint/no-explicit-any
  logData: Record<string, any>,
  contextName: LogContext,
) {
  // eslint-disable-next-line @typescript-eslint/no-explicit-any
  const execContext = await (frame as any).executionContext();

  // Use the session that owns this execution context, not the caller's.
  cdp = execContext._client;

  return await this.evaluateWithCLI_(
    cdp,
    frame,
    execContext._contextId,
    funcString,
    logData,
    contextName,
  );
}

/**
 * Subscribe `callback` to the "request" event of `page`.
 *
 * @param page - page whose requests should be observed.
 * @param callback - invoked with each HTTPRequest the page emits.
 */
interceptRequest(page: Page, callback: (event: HTTPRequest) => void) {
  const requestEvent = "request";
  page.on(requestEvent, callback);
}
Expand Down
12 changes: 6 additions & 6 deletions src/util/recorder.ts
Expand Up @@ -79,7 +79,7 @@ export class Recorder {
skipIds!: Set<string>;
pageInfo!: PageInfoRecord;

swSessionId?: string | null;
swTargetId?: string | null;
swFrameIds = new Set<string>();
swUrls = new Set<string>();

Expand Down Expand Up @@ -169,19 +169,19 @@ export class Recorder {

// Target
cdp.on("Target.attachedToTarget", async (params) => {
const { url, type, sessionId } = params.targetInfo;
const { url, type, targetId } = params.targetInfo;
if (type === "service_worker") {
this.swSessionId = sessionId;
this.swTargetId = targetId;
this.swUrls.add(url);
}
});

cdp.on("Target.detachedFromTarget", async (params) => {
const { sessionId } = params;
if (this.swSessionId && sessionId === this.swSessionId) {
const { targetId } = params;
if (this.swTargetId && targetId === this.swTargetId) {
this.swUrls.clear();
this.swFrameIds.clear();
this.swSessionId = null;
this.swTargetId = null;
}
});

Expand Down
2 changes: 2 additions & 0 deletions src/util/worker.ts
Expand Up @@ -23,6 +23,7 @@ export type WorkerOpts = {
directFetchCapture?:
| ((url: string) => Promise<{ fetched: boolean; mime: string }>)
| null;
frameIdToExecId: Map<string, number>;
};

// ===========================================================================
Expand Down Expand Up @@ -178,6 +179,7 @@ export class PageWorker {
workerid,
callbacks: this.callbacks,
directFetchCapture,
frameIdToExecId: new Map<string, number>(),
};

if (this.recorder) {
Expand Down