From ce31d1d0048eb409261e92915c6dd0201f589417 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Wed, 6 Mar 2024 20:59:06 -0800 Subject: [PATCH 1/5] logging: exclude log contexts - add --logExcludeContext for log contexts that should be excluded - enable 'recorderNetwork' logging - create default log context exclude list (containing: screencast, recorderNetwork, jsErrors), customizable via --logExcludeContext recorder: track failed requests - cleanup cdp handler methods - intercept requestWillBeSent to track requests that started, but may not complete - fix shouldSkip() still working if no url is provided (eg. check only headers) - set status to 0 for async fetch failures - remove responseServedFromCache interception, as response data generally not available then, and responseReceive is still called - pageinfo: include page requests that failed with status code 0 - ensure page is closed on failure - ensure pageinfo still written even if nothing else is crawled for a page tests: add pageinfo test for crawling invalid URL, which should still result in pageinfo record with status code 0 bump to 1.0.0-beta.7 --- package.json | 2 +- src/crawler.ts | 3 +- src/util/argParser.ts | 14 ++- src/util/logger.ts | 24 +++- src/util/recorder.ts | 209 +++++++++++++++++++-------------- src/util/reqresp.ts | 34 +++--- src/util/worker.ts | 16 ++- tests/pageinfo-records.test.js | 31 ++++- 8 files changed, 224 insertions(+), 109 deletions(-) diff --git a/package.json b/package.json index 3035a9a21..6d20d0e6f 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "browsertrix-crawler", - "version": "1.0.0-beta.6", + "version": "1.0.0-beta.7", "main": "browsertrix-crawler", "type": "module", "repository": "https://github.com/webrecorder/browsertrix-crawler", diff --git a/src/crawler.ts b/src/crawler.ts index e21352c1d..91a7d80c7 100644 --- a/src/crawler.ts +++ b/src/crawler.ts @@ -189,7 +189,8 @@ export class Crawler { const debugLogging = this.params.logging.includes("debug"); logger.setDebugLogging(debugLogging); logger.setLogLevel(this.params.logLevel); - logger.setContext(this.params.context); + logger.setContext(this.params.logContext); + logger.setExcludeContext(this.params.logExcludeContext); // if automatically restarts on error exit code, // exit with 0 from fatal by default, to avoid unnecessary restart diff --git a/src/util/argParser.ts b/src/util/argParser.ts index d5f942ed3..5242b9e4c 100644 --- a/src/util/argParser.ts +++ b/src/util/argParser.ts @@ -15,7 +15,11 @@ import { import { ScopedSeed } from "./seeds.js"; import { interpolateFilename } from "./storage.js"; import { screenshotTypes } from "./screenshots.js"; -import { LOG_CONTEXT_TYPES, logger } from "./logger.js"; +import { + DEFAULT_EXCLUDE_LOG_CONTEXTS, + LOG_CONTEXT_TYPES, + logger, +} from "./logger.js"; // ============================================================================ class ArgParser { @@ -225,6 +229,14 @@ class ArgParser { coerce, }, + logExcludeContext: { + describe: "Comma-separated list of contexts to NOT include in logs", + type: "array", + default: DEFAULT_EXCLUDE_LOG_CONTEXTS, + choices: LOG_CONTEXT_TYPES, + coerce, + }, + text: { describe: "Extract initial (default) or final text to pages.jsonl or WARC resource record(s)", diff --git a/src/util/logger.ts b/src/util/logger.ts index c2786df05..dccb75abf 100644 --- a/src/util/logger.ts +++ b/src/util/logger.ts @@ -26,6 +26,7 @@ export const LOG_CONTEXT_TYPES = [ "general", "worker", "recorder", + "recorderNetwork", "writer", "state", "redis", @@ -51,13 +52,20 @@ export const LOG_CONTEXT_TYPES = [ export type LogContext = (typeof LOG_CONTEXT_TYPES)[number]; +export const DEFAULT_EXCLUDE_LOG_CONTEXTS: LogContext[] = [ + "recorderNetwork", + "jsError", + "screencast", +]; + // =========================================================================== class Logger { logStream: Writable | null = null; debugLogging = false; logErrorsToRedis = false; logLevels: string[] = []; - contexts: string[] = []; + contexts: LogContext[] = []; + excludeContexts: LogContext[] = []; crawlState?: RedisCrawlState | null = null; fatalExitCode = 17; @@ -81,10 +89,14 @@ class Logger { this.logLevels = logLevels; } - setContext(contexts: string[]) { + setContext(contexts: LogContext[]) { this.contexts = contexts; } + setExcludeContext(contexts: LogContext[]) { + this.excludeContexts = contexts; + } + setCrawlState(crawlState: RedisCrawlState) { this.crawlState = crawlState; } @@ -92,7 +104,7 @@ class Logger { logAsJSON( message: string, dataUnknown: unknown, - context: string, + context: LogContext, logLevel = "info", ) { // eslint-disable-next-line @typescript-eslint/no-explicit-any @@ -110,6 +122,12 @@ class Logger { } } + if (this.excludeContexts.length) { + if (this.excludeContexts.indexOf(context) >= 0) { + return; + } + } + const dataToLog = { timestamp: new Date().toISOString(), logLevel: logLevel, diff --git a/src/util/recorder.ts b/src/util/recorder.ts index 242ecbd91..07836e493 100644 --- a/src/util/recorder.ts +++ b/src/util/recorder.ts @@ -42,10 +42,9 @@ const MIME_EVENT_STREAM = "text/event-stream"; const encoder = new TextEncoder(); // ================================================================= -// TODO: Fix this the next time the file is edited. -// eslint-disable-next-line @typescript-eslint/no-explicit-any, @typescript-eslint/no-unused-vars +// eslint-disable-next-line @typescript-eslint/no-explicit-any function logNetwork(msg: string, data: any) { - // logger.debug(msg, data, "recorderNetwork"); + logger.debug(msg, data, "recorderNetwork"); } // ================================================================= @@ -150,7 +149,6 @@ export class Recorder { async onCreatePage({ cdp }: { cdp: CDPSession }) { // Fetch - cdp.on("Fetch.requestPaused", async (params) => { this.handleRequestPaused(params, cdp); }); @@ -159,85 +157,36 @@ export class Recorder { patterns: [{ urlPattern: "*", requestStage: "Response" }], }); - await cdp.send("Console.enable"); - // Response - cdp.on("Network.responseReceived", (params) => { - // handling to fill in security details - logNetwork("Network.responseReceived", { - requestId: params.requestId, - ...this.logDetails, - }); - this.handleResponseReceived(params); - }); + cdp.on("Network.responseReceived", (params) => + this.handleResponseReceived(params), + ); - cdp.on("Network.responseReceivedExtraInfo", (params) => { - logNetwork("Network.responseReceivedExtraInfo", { - requestId: params.requestId, - ...this.logDetails, - }); - const reqresp = this.pendingReqResp(params.requestId, true); - if (reqresp) { - reqresp.fillResponseReceivedExtraInfo(params); - } - }); + cdp.on("Network.responseReceivedExtraInfo", (params) => + this.handleResponseReceivedExtraInfo(params), + ); // Request + cdp.on("Network.requestWillBeSent", (params) => + this.handleRequestWillBeSent(params), + ); - cdp.on("Network.requestWillBeSent", (params) => { - // only handling redirect here, committing last response in redirect chain - // request data stored from requestPaused - if (params.redirectResponse) { - logNetwork("Network.requestWillBeSent after redirect", { - requestId: params.requestId, - ...this.logDetails, - }); - this.handleRedirectResponse(params); - } - }); - - cdp.on("Network.requestServedFromCache", (params) => { - logNetwork("Network.requestServedFromCache", { - requestId: params.requestId, - ...this.logDetails, - }); - const reqresp = this.pendingReqResp(params.requestId, true); - if (reqresp) { - this.addPageRecord(reqresp); - - this.removeReqResp(params.requestId); - } - }); - - cdp.on("Network.requestWillBeSentExtraInfo", (params) => { - logNetwork("Network.requestWillBeSentExtraInfo", { - requestId: params.requestId, - ...this.logDetails, - }); - this.handleRequestExtraInfo(params); - }); + cdp.on("Network.requestWillBeSentExtraInfo", (params) => + this.handleRequestExtraInfo(params), + ); // Loading - cdp.on("Network.loadingFinished", (params) => { - logNetwork("Network.loadingFinished", { - requestId: params.requestId, - ...this.logDetails, - }); - this.handleLoadingFinished(params); - }); + cdp.on("Network.loadingFinished", (params) => + this.handleLoadingFinished(params), + ); - cdp.on("Network.loadingFailed", (params) => { - logNetwork("Network.loadingFailed", { - requestId: params.requestId, - ...this.logDetails, - }); - this.handleLoadingFailed(params); - }); + cdp.on("Network.loadingFailed", (params) => + this.handleLoadingFailed(params), + ); await cdp.send("Network.enable"); // Target - cdp.on("Target.attachedToTarget", async (params) => { const { url, type, sessionId } = params.targetInfo; if (type === "service_worker") { @@ -255,6 +204,13 @@ export class Recorder { } }); + await cdp.send("Target.setAutoAttach", { + autoAttach: true, + waitForDebuggerOnStart: false, + flatten: true, + }); + + // Console cdp.on("Console.messageAdded", (params) => { const { message } = params; const { source, level } = message; @@ -263,16 +219,16 @@ export class Recorder { } }); - await cdp.send("Target.setAutoAttach", { - autoAttach: true, - waitForDebuggerOnStart: false, - flatten: true, - }); + await cdp.send("Console.enable"); } handleResponseReceived(params: Protocol.Network.ResponseReceivedEvent) { const { requestId, response, type } = params; + logNetwork("Network.responseReceived", { requestId, ...this.logDetails }); + + // handling to fill in security details + const { mimeType } = response; if (mimeType === MIME_EVENT_STREAM) { @@ -289,11 +245,74 @@ export class Recorder { this.addPageRecord(reqresp); } + handleResponseReceivedExtraInfo( + params: Protocol.Network.ResponseReceivedExtraInfoEvent, + ) { + const { requestId } = params; + + logNetwork("Network.responseReceivedExtraInfo", { + requestId, + ...this.logDetails, + }); + + const reqresp = this.pendingReqResp(requestId, true); + if (reqresp) { + reqresp.fillResponseReceivedExtraInfo(params); + } + } + + handleRequestWillBeSent(params: Protocol.Network.RequestWillBeSentEvent) { + // only handling redirect here, committing last response in redirect chain + // request data stored from requestPaused + const { redirectResponse, requestId, request, type } = params; + + const { headers, method, url } = request; + + logNetwork("Network.requestWillBeSent", { + requestId, + redirectResponse, + ...this.logDetails, + }); + + if (redirectResponse) { + this.handleRedirectResponse(params); + } else { + if (!this.shouldSkip(headers, url, method, type)) { + const reqresp = this.pendingReqResp(requestId); + if (reqresp) { + reqresp.fillRequest(request); + } + } + } + } + + // handleRequestServedFromCache( + // params: Protocol.Network.RequestServedFromCacheEvent, + // ) { + // const { requestId } = params; + + // logNetwork("Network.requestServedFromCache", { + // requestId, + // ...this.logDetails, + // }); + // const reqresp = this.pendingReqResp(requestId, true); + // if (reqresp) { + // reqresp.fromCache = true; + // } + // } + handleRequestExtraInfo( params: Protocol.Network.RequestWillBeSentExtraInfoEvent, ) { - if (!this.shouldSkip(params.headers)) { - const reqresp = this.pendingReqResp(params.requestId, true); + const { requestId, headers } = params; + + logNetwork("Network.requestWillBeSentExtraInfo", { + requestId, + ...this.logDetails, + }); + + if (!this.shouldSkip(headers)) { + const reqresp = this.pendingReqResp(requestId, true); if (reqresp) { reqresp.fillRequestExtraInfo(params); } @@ -327,6 +346,11 @@ export class Recorder { handleLoadingFailed(params: Protocol.Network.LoadingFailedEvent) { const { errorText, type, requestId } = params; + logNetwork("Network.loadingFailed", { + requestId, + ...this.logDetails, + }); + const reqresp = this.pendingReqResp(requestId, true); if (!reqresp) { return; @@ -377,17 +401,26 @@ export class Recorder { "recorder", ); } + reqresp.status = 0; + this.addPageRecord(reqresp); this.removeReqResp(requestId); } handleLoadingFinished(params: Protocol.Network.LoadingFinishedEvent) { - const reqresp = this.pendingReqResp(params.requestId, true); + const { requestId } = params; + + logNetwork("Network.loadingFinished", { + requestId: params.requestId, + ...this.logDetails, + }); + + const reqresp = this.pendingReqResp(requestId, true); if (!reqresp || reqresp.asyncLoading) { return; } - this.removeReqResp(params.requestId); + this.removeReqResp(requestId); if (!this.isValidUrl(reqresp.url)) { return; @@ -684,15 +717,19 @@ export class Recorder { ); this.warcQ.add(() => this.writer.writeSingleRecord(resourceRecord)); + + return this.pageInfo.ts; } - async finishPage() { + async awaitPageResources() { for (const [requestId, reqresp] of this.pendingRequests.entries()) { if (reqresp.payload) { this.removeReqResp(requestId); await this.serializeToWARC(reqresp); // no url, likely invalid } else if (!reqresp.url) { + reqresp.status = 0; + this.addPageRecord(reqresp); this.removeReqResp(requestId); } } @@ -726,10 +763,6 @@ export class Recorder { await sleep(5.0); numPending = this.pendingRequests.size; } - - await this.writePageInfoRecord(); - - return this.pageInfo.ts; } async onClosePage() { @@ -768,7 +801,8 @@ export class Recorder { method = headers[":method"]; } - if (!this.isValidUrl(url)) { + // only check if url is provided, since it is optional + if (url && !this.isValidUrl(url)) { return true; } @@ -1204,6 +1238,9 @@ class AsyncFetcher { { url, networkId, filename, ...formatErr(e), ...logDetails }, "recorder", ); + // indicate response is ultimately not valid + reqresp.status = 0; + recorder.addPageRecord(reqresp); } finally { recorder.removeReqResp(networkId); } diff --git a/src/util/reqresp.ts b/src/util/reqresp.ts index fbcd9a4ab..dabe790e4 100644 --- a/src/util/reqresp.ts +++ b/src/util/reqresp.ts @@ -44,11 +44,11 @@ export class RequestResponseInfo { payload?: Uint8Array; // misc - fromServiceWorker: boolean = false; + fromServiceWorker = false; frameId?: string; - fetch: boolean = false; + fetch = false; resourceType?: string; @@ -71,13 +71,7 @@ export class RequestResponseInfo { } fillFetchRequestPaused(params: Protocol.Fetch.RequestPausedEvent) { - this.url = params.request.url; - this.method = params.request.method; - if (!this.requestHeaders) { - this.requestHeaders = params.request.headers; - } - this.postData = params.request.postData; - this.hasPostData = params.request.hasPostData || false; + this.fillRequest(params.request, params.resourceType); this.status = params.responseStatusCode || 0; this.statusText = params.responseStatusText || getStatusText(this.status); @@ -86,14 +80,24 @@ export class RequestResponseInfo { this.fetch = true; - if (params.resourceType) { - this.resourceType = params.resourceType.toLowerCase(); + this.frameId = params.frameId; + } + + fillRequest(request: Protocol.Network.Request, resourceType?: string) { + this.url = request.url; + this.method = request.method; + if (!this.requestHeaders) { + this.requestHeaders = request.headers; } + this.postData = request.postData; + this.hasPostData = request.hasPostData || false; - this.frameId = params.frameId; + if (resourceType) { + this.resourceType = resourceType.toLowerCase(); + } } - fillResponse(response: Protocol.Network.Response, type?: string) { + fillResponse(response: Protocol.Network.Response, resourceType?: string) { // if initial fetch was a 200, but now replacing with 304, don't! if ( response.status == 304 && @@ -111,8 +115,8 @@ export class RequestResponseInfo { this.protocol = response.protocol; - if (type) { - this.resourceType = type.toLowerCase(); + if (resourceType) { + this.resourceType = resourceType.toLowerCase(); } if (response.requestHeaders) { diff --git a/src/util/worker.ts b/src/util/worker.ts index b7ceb6986..e05dad308 100644 --- a/src/util/worker.ts +++ b/src/util/worker.ts @@ -298,7 +298,7 @@ export class PageWorker { async crawlPage(opts: WorkerState) { const res = await this.crawler.crawlPage(opts); if (this.recorder) { - opts.data.ts = await this.recorder.finishPage(); + await this.recorder.awaitPageResources(); } return res; } @@ -339,7 +339,21 @@ export class PageWorker { "worker", ); } + + await this.closePage(); } finally { + try { + if (this.recorder) { + opts.data.ts = await this.recorder.writePageInfoRecord(); + } + } catch (e) { + logger.error( + "Error writing pageinfo recorder", + { ...formatErr(e), ...this.logDetails }, + "recorder", + ); + } + await timedRun( this.crawler.pageFinished(data), FINISHED_TIMEOUT, diff --git a/tests/pageinfo-records.test.js b/tests/pageinfo-records.test.js index 8642190c7..6b67b18db 100644 --- a/tests/pageinfo-records.test.js +++ b/tests/pageinfo-records.test.js @@ -5,7 +5,7 @@ import { WARCParser } from "warcio"; test("run warc and ensure pageinfo records contain the correct resources", async () => { child_process.execSync( - "docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://webrecorder.net/ --url https://webrecorder.net/about --scopeType page --collection page-info-test --combineWARC", + "docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://webrecorder.net/ --url https://webrecorder.net/about --url https://invalid.invalid/ --scopeType page --collection page-info-test --combineWARC", ); const filename = path.join( @@ -21,6 +21,7 @@ test("run warc and ensure pageinfo records contain the correct resources", async let foundIndex = false; let foundAbout = false; + let foundInvalid = false; for await (const record of parser) { if ( @@ -40,10 +41,20 @@ test("run warc and ensure pageinfo records contain the correct resources", async const text = await record.contentText(); validateResourcesAbout(JSON.parse(text)); } + + if ( + !foundInvalid && + record.warcTargetURI === "urn:pageinfo:https://invalid.invalid/" + ) { + foundInvalid = true; + const text = await record.contentText(); + validateResourcesInvalid(JSON.parse(text)); + } } expect(foundIndex).toBe(true); expect(foundAbout).toBe(true); + expect(foundInvalid).toBe(true); }); function validateResourcesIndex(json) { @@ -161,5 +172,23 @@ function validateResourcesAbout(json) { { status: 200, mime: "font/woff2", type: "font" }, "https://fonts.gstatic.com/s/sourcesanspro/v22/6xKydSBYKcSV-LCoeQqfX1RYOo3ig4vwlxdu.woff2": { status: 200, mime: "font/woff2", type: "font" }, + "https://stats.browsertrix.com/api/event?__wb_method=POST&n=pageview&u=https%3A%2F%2Fwebrecorder.net%2Fabout&d=webrecorder.net": + { + status: 0, + type: "xhr", + }, + }); +} + +function validateResourcesInvalid(json) { + expect(json).toHaveProperty("pageid"); + expect(json).toHaveProperty("url"); + expect(json).toHaveProperty("urls"); + expect(json.counts).toEqual({ jsErrors: 0 }); + expect(json.urls).toEqual({ + "https://invalid.invalid/": { + status: 0, + type: "document", + }, }); } From e85c78a159f0c8de86fe400faf48c31b05ae25f0 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Wed, 6 Mar 2024 21:09:38 -0800 Subject: [PATCH 2/5] remove commented out --- src/util/recorder.ts | 15 --------------- 1 file changed, 15 deletions(-) diff --git a/src/util/recorder.ts b/src/util/recorder.ts index 07836e493..f41df5cc2 100644 --- a/src/util/recorder.ts +++ b/src/util/recorder.ts @@ -286,21 +286,6 @@ export class Recorder { } } - // handleRequestServedFromCache( - // params: Protocol.Network.RequestServedFromCacheEvent, - // ) { - // const { requestId } = params; - - // logNetwork("Network.requestServedFromCache", { - // requestId, - // ...this.logDetails, - // }); - // const reqresp = this.pendingReqResp(requestId, true); - // if (reqresp) { - // reqresp.fromCache = true; - // } - // } - handleRequestExtraInfo( params: Protocol.Network.RequestWillBeSentExtraInfoEvent, ) { From 811dafb1a414d4961b414c0685a55fff859ae8db Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Wed, 6 Mar 2024 22:18:32 -0800 Subject: [PATCH 3/5] ensure added to pageinfo even if dupe via async fetcher --- src/util/recorder.ts | 49 +++++++++++++++++++++++++++----------------- 1 file changed, 30 insertions(+), 19 deletions(-) diff --git a/src/util/recorder.ts b/src/util/recorder.ts index f41df5cc2..a8381ff16 100644 --- a/src/util/recorder.ts +++ b/src/util/recorder.ts @@ -225,11 +225,13 @@ export class Recorder { handleResponseReceived(params: Protocol.Network.ResponseReceivedEvent) { const { requestId, response, type } = params; - logNetwork("Network.responseReceived", { requestId, ...this.logDetails }); + const { mimeType, url } = response; - // handling to fill in security details - - const { mimeType } = response; + logNetwork("Network.responseReceived", { + requestId, + url, + ...this.logDetails, + }); if (mimeType === MIME_EVENT_STREAM) { return; @@ -241,8 +243,6 @@ export class Recorder { } reqresp.fillResponse(response, type); - - this.addPageRecord(reqresp); } handleResponseReceivedExtraInfo( @@ -331,17 +331,20 @@ export class Recorder { handleLoadingFailed(params: Protocol.Network.LoadingFailedEvent) { const { errorText, type, requestId } = params; + const reqresp = this.pendingReqResp(requestId, true); + + const url = reqresp?.url; + logNetwork("Network.loadingFailed", { requestId, + url, ...this.logDetails, }); - const reqresp = this.pendingReqResp(requestId, true); if (!reqresp) { return; } - const { url } = reqresp; if (type) { reqresp.resourceType = type.toLowerCase(); } @@ -394,13 +397,16 @@ export class Recorder { handleLoadingFinished(params: Protocol.Network.LoadingFinishedEvent) { const { requestId } = params; + const reqresp = this.pendingReqResp(requestId, true); + + const url = reqresp?.url; + logNetwork("Network.loadingFinished", { - requestId: params.requestId, + requestId, + url, ...this.logDetails, }); - const reqresp = this.pendingReqResp(requestId, true); - if (!reqresp || reqresp.asyncLoading) { return; } @@ -968,6 +974,10 @@ export class Recorder { removeReqResp(requestId: string, allowReuse = false) { const reqresp = this.pendingRequests.get(requestId); + if (reqresp) { + const { url, requestId } = reqresp; + logNetwork("Removing reqresp", { requestId, url }); + } this.pendingRequests.delete(requestId); if (!allowReuse) { this.skipIds.add(requestId); @@ -976,13 +986,18 @@ export class Recorder { } async serializeToWARC(reqresp: RequestResponseInfo) { + // always include in pageinfo record if going to serialize to WARC + // even if serialization does not happen + this.addPageRecord(reqresp); + if (reqresp.shouldSkipSave()) { - const { url, method, status, payload } = reqresp; - logNetwork("Skipping request/response", { + const { url, method, status, payload, requestId } = reqresp; + logNetwork("Skipping writing request/response", { + requestId, url, method, status, - payloadLength: payload && payload.length, + payloadLength: (payload && payload.length) || 0, }); return; } @@ -996,8 +1011,6 @@ export class Recorder { return; } - this.addPageRecord(reqresp); - const responseRecord = createResponse(reqresp, this.pageid); const requestRecord = createRequest(reqresp, responseRecord, this.pageid); @@ -1203,8 +1216,6 @@ class AsyncFetcher { ); } - recorder.addPageRecord(reqresp); - recorder.warcQ.add(() => recorder.writer.writeRecordPair( responseRecord, @@ -1225,8 +1236,8 @@ class AsyncFetcher { ); // indicate response is ultimately not valid reqresp.status = 0; - recorder.addPageRecord(reqresp); } finally { + recorder.addPageRecord(reqresp); recorder.removeReqResp(networkId); } From 2e5d76681d7d48302bf86f2ece4f4f8f5d496a79 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Wed, 6 Mar 2024 23:37:29 -0800 Subject: [PATCH 4/5] pageinfo: also include 'error' field when status is 0 cache status: track if a resource is loaded from browser cache, include in debug logging (though not yet in pageinfo, to revisit later) --- src/util/recorder.ts | 64 ++++++++++++++++++++++++++++------ src/util/reqresp.ts | 10 +++++- tests/pageinfo-records.test.js | 2 ++ 3 files changed, 65 insertions(+), 11 deletions(-) diff --git a/src/util/recorder.ts b/src/util/recorder.ts index a8381ff16..840d5bb5d 100644 --- a/src/util/recorder.ts +++ b/src/util/recorder.ts @@ -52,6 +52,8 @@ export type PageInfoValue = { status: number; mime?: string; type?: string; + error?: string; + fromBrowserCache?: boolean; }; // ================================================================= @@ -166,6 +168,11 @@ export class Recorder { this.handleResponseReceivedExtraInfo(params), ); + // Cache + cdp.on("Network.requestServedFromCache", (params) => + this.handleRequestServedFromCache(params), + ); + // Request cdp.on("Network.requestWillBeSent", (params) => this.handleRequestWillBeSent(params), @@ -261,6 +268,26 @@ export class Recorder { } } + handleRequestServedFromCache( + params: Protocol.Network.RequestServedFromCacheEvent, + ) { + const { requestId } = params; + + const reqresp = this.pendingReqResp(requestId, true); + + const url = reqresp?.url; + + logNetwork("Network.requestServedFromCache", { + requestId, + url, + ...this.logDetails, + }); + + if (reqresp) { + reqresp.fromCache = true; + } + } + handleRequestWillBeSent(params: Protocol.Network.RequestWillBeSentEvent) { // only handling redirect here, committing last response in redirect chain // request data stored from requestPaused @@ -390,6 +417,8 @@ export class Recorder { ); } reqresp.status = 0; + reqresp.errorText = errorText; + this.addPageRecord(reqresp); this.removeReqResp(requestId); } @@ -413,7 +442,7 @@ export class Recorder { this.removeReqResp(requestId); - if (!this.isValidUrl(reqresp.url)) { + if (!this.isValidUrl(url)) { return; } @@ -692,7 +721,15 @@ export class Recorder { if (this.isValidUrl(reqresp.url)) { const { status, resourceType: type } = reqresp; const mime = reqresp.getMimeType(); - this.pageInfo.urls[reqresp.getCanonURL()] = { status, mime, type }; + const info: PageInfoValue = { status, mime, type }; + if (reqresp.errorText) { + info.error = reqresp.errorText; + } + //TODO: revisit if we want to record this later + // if (reqresp.fromCache) { + // info.fromBrowserCache = true; + // } + this.pageInfo.urls[reqresp.getCanonURL()] = info; } } @@ -719,8 +756,6 @@ export class Recorder { await this.serializeToWARC(reqresp); // no url, likely invalid } else if (!reqresp.url) { - reqresp.status = 0; - this.addPageRecord(reqresp); this.removeReqResp(requestId); } } @@ -990,8 +1025,17 @@ export class Recorder { // even if serialization does not happen this.addPageRecord(reqresp); - if (reqresp.shouldSkipSave()) { - const { url, method, status, payload, requestId } = reqresp; + const { url, method, status, payload, requestId } = reqresp; + + // Specifically log skipping cached resources + if (reqresp.isCached()) { + logger.debug( + "Skipping cached resource, should be already recorded", + { url, status }, + "recorder", + ); + return; + } else if (reqresp.shouldSkipSave()) { logNetwork("Skipping writing request/response", { requestId, url, @@ -1003,11 +1047,11 @@ export class Recorder { } if ( - reqresp.url && - reqresp.method === "GET" && - !(await this.crawlState.addIfNoDupe(WRITE_DUPE_KEY, reqresp.url)) + url && + method === "GET" && + !(await this.crawlState.addIfNoDupe(WRITE_DUPE_KEY, url)) ) { - logNetwork("Skipping dupe", { url: reqresp.url }); + logNetwork("Skipping dupe", { url }); return; } diff --git a/src/util/reqresp.ts b/src/util/reqresp.ts index dabe790e4..32490c375 100644 --- a/src/util/reqresp.ts +++ b/src/util/reqresp.ts @@ -37,6 +37,8 @@ export class RequestResponseInfo { status: number = 0; statusText?: string; + errorText?: string; + responseHeaders?: Record; responseHeadersList?: { name: string; value: string }[]; responseHeadersText?: string; @@ -45,6 +47,7 @@ export class RequestResponseInfo { // misc fromServiceWorker = false; + fromCache = false; frameId?: string; @@ -296,9 +299,14 @@ export class RequestResponseInfo { return true; } + isCached() { + return this.fromCache && !this.payload; + } + shouldSkipSave() { - // skip OPTIONS/HEAD responses, and 304 or 206 responses + // skip cached, OPTIONS/HEAD responses, and 304 or 206 responses if ( + this.fromCache || !this.payload || (this.method && ["OPTIONS", "HEAD"].includes(this.method)) || [206, 304].includes(this.status) diff --git a/tests/pageinfo-records.test.js b/tests/pageinfo-records.test.js index 6b67b18db..d16f52d31 100644 --- a/tests/pageinfo-records.test.js +++ b/tests/pageinfo-records.test.js @@ -176,6 +176,7 @@ function validateResourcesAbout(json) { { status: 0, type: "xhr", + error: "net::ERR_BLOCKED_BY_CLIENT", }, }); } @@ -189,6 +190,7 @@ function validateResourcesInvalid(json) { "https://invalid.invalid/": { status: 0, type: "document", + error: "net::ERR_NAME_NOT_RESOLVED", }, }); } From 30b6b63f9d010b1f0ffe9c963cb3830c12878383 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Thu, 7 Mar 2024 00:03:46 -0800 Subject: [PATCH 5/5] minor tweaks, ignore direct fetch reqresp --- src/util/recorder.ts | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/src/util/recorder.ts b/src/util/recorder.ts index 840d5bb5d..d2c967b2f 100644 --- a/src/util/recorder.ts +++ b/src/util/recorder.ts @@ -1092,8 +1092,7 @@ export class Recorder { const res = await fetcher.load(); const mime = - (reqresp && - reqresp.responseHeaders && + (reqresp.responseHeaders && reqresp.responseHeaders["content-type"] && reqresp.responseHeaders["content-type"].split(";")[0]) || ""; @@ -1280,9 +1279,13 @@ class AsyncFetcher { ); // indicate response is ultimately not valid reqresp.status = 0; + reqresp.errorText = e.message; } finally { - recorder.addPageRecord(reqresp); - recorder.removeReqResp(networkId); + // exclude direct fetch request with fake id + if (networkId !== "0") { + recorder.addPageRecord(reqresp); + recorder.removeReqResp(networkId); + } } return fetched;