diff --git a/package.json b/package.json index 3035a9a21..6d20d0e6f 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "browsertrix-crawler", - "version": "1.0.0-beta.6", + "version": "1.0.0-beta.7", "main": "browsertrix-crawler", "type": "module", "repository": "https://github.com/webrecorder/browsertrix-crawler", diff --git a/src/crawler.ts b/src/crawler.ts index e21352c1d..91a7d80c7 100644 --- a/src/crawler.ts +++ b/src/crawler.ts @@ -189,7 +189,8 @@ export class Crawler { const debugLogging = this.params.logging.includes("debug"); logger.setDebugLogging(debugLogging); logger.setLogLevel(this.params.logLevel); - logger.setContext(this.params.context); + logger.setContext(this.params.logContext); + logger.setExcludeContext(this.params.logExcludeContext); // if automatically restarts on error exit code, // exit with 0 from fatal by default, to avoid unnecessary restart diff --git a/src/util/argParser.ts b/src/util/argParser.ts index d5f942ed3..5242b9e4c 100644 --- a/src/util/argParser.ts +++ b/src/util/argParser.ts @@ -15,7 +15,11 @@ import { import { ScopedSeed } from "./seeds.js"; import { interpolateFilename } from "./storage.js"; import { screenshotTypes } from "./screenshots.js"; -import { LOG_CONTEXT_TYPES, logger } from "./logger.js"; +import { + DEFAULT_EXCLUDE_LOG_CONTEXTS, + LOG_CONTEXT_TYPES, + logger, +} from "./logger.js"; // ============================================================================ class ArgParser { @@ -225,6 +229,14 @@ class ArgParser { coerce, }, + logExcludeContext: { + describe: "Comma-separated list of contexts to NOT include in logs", + type: "array", + default: DEFAULT_EXCLUDE_LOG_CONTEXTS, + choices: LOG_CONTEXT_TYPES, + coerce, + }, + text: { describe: "Extract initial (default) or final text to pages.jsonl or WARC resource record(s)", diff --git a/src/util/logger.ts b/src/util/logger.ts index c2786df05..dccb75abf 100644 --- a/src/util/logger.ts +++ b/src/util/logger.ts @@ -26,6 +26,7 @@ export const LOG_CONTEXT_TYPES = [ "general", "worker", "recorder", + "recorderNetwork", "writer", "state", "redis", @@ -51,13 +52,20 @@ export const LOG_CONTEXT_TYPES = [ export type LogContext = (typeof LOG_CONTEXT_TYPES)[number]; +export const DEFAULT_EXCLUDE_LOG_CONTEXTS: LogContext[] = [ + "recorderNetwork", + "jsError", + "screencast", +]; + // =========================================================================== class Logger { logStream: Writable | null = null; debugLogging = false; logErrorsToRedis = false; logLevels: string[] = []; - contexts: string[] = []; + contexts: LogContext[] = []; + excludeContexts: LogContext[] = []; crawlState?: RedisCrawlState | null = null; fatalExitCode = 17; @@ -81,10 +89,14 @@ class Logger { this.logLevels = logLevels; } - setContext(contexts: string[]) { + setContext(contexts: LogContext[]) { this.contexts = contexts; } + setExcludeContext(contexts: LogContext[]) { + this.excludeContexts = contexts; + } + setCrawlState(crawlState: RedisCrawlState) { this.crawlState = crawlState; } @@ -92,7 +104,7 @@ class Logger { logAsJSON( message: string, dataUnknown: unknown, - context: string, + context: LogContext, logLevel = "info", ) { // eslint-disable-next-line @typescript-eslint/no-explicit-any @@ -110,6 +122,12 @@ class Logger { } } + if (this.excludeContexts.length) { + if (this.excludeContexts.indexOf(context) >= 0) { + return; + } + } + const dataToLog = { timestamp: new Date().toISOString(), logLevel: logLevel, diff --git a/src/util/recorder.ts b/src/util/recorder.ts index 242ecbd91..d2c967b2f 100644 --- a/src/util/recorder.ts +++ b/src/util/recorder.ts @@ -42,10 +42,9 @@ const MIME_EVENT_STREAM = "text/event-stream"; const encoder = new TextEncoder(); // ================================================================= -// TODO: Fix this the next time the file is edited. -// eslint-disable-next-line @typescript-eslint/no-explicit-any, @typescript-eslint/no-unused-vars +// eslint-disable-next-line @typescript-eslint/no-explicit-any function logNetwork(msg: string, data: any) { - // logger.debug(msg, data, "recorderNetwork"); + logger.debug(msg, data, "recorderNetwork"); } // ================================================================= @@ -53,6 +52,8 @@ export type PageInfoValue = { status: number; mime?: string; type?: string; + error?: string; + fromBrowserCache?: boolean; }; // ================================================================= @@ -150,7 +151,6 @@ export class Recorder { async onCreatePage({ cdp }: { cdp: CDPSession }) { // Fetch - cdp.on("Fetch.requestPaused", async (params) => { this.handleRequestPaused(params, cdp); }); @@ -159,85 +159,41 @@ export class Recorder { patterns: [{ urlPattern: "*", requestStage: "Response" }], }); - await cdp.send("Console.enable"); - // Response - cdp.on("Network.responseReceived", (params) => { - // handling to fill in security details - logNetwork("Network.responseReceived", { - requestId: params.requestId, - ...this.logDetails, - }); - this.handleResponseReceived(params); - }); - - cdp.on("Network.responseReceivedExtraInfo", (params) => { - logNetwork("Network.responseReceivedExtraInfo", { - requestId: params.requestId, - ...this.logDetails, - }); - const reqresp = this.pendingReqResp(params.requestId, true); - if (reqresp) { - reqresp.fillResponseReceivedExtraInfo(params); - } - }); - - // Request + cdp.on("Network.responseReceived", (params) => + this.handleResponseReceived(params), + ); - cdp.on("Network.requestWillBeSent", (params) => { - // only handling redirect here, committing last response in redirect chain - // request data stored from requestPaused - if (params.redirectResponse) { - logNetwork("Network.requestWillBeSent after redirect", { - requestId: params.requestId, - ...this.logDetails, - }); - this.handleRedirectResponse(params); - } - }); + cdp.on("Network.responseReceivedExtraInfo", (params) => + this.handleResponseReceivedExtraInfo(params), + ); - cdp.on("Network.requestServedFromCache", (params) => { - logNetwork("Network.requestServedFromCache", { - requestId: params.requestId, - ...this.logDetails, - }); - const reqresp = this.pendingReqResp(params.requestId, true); - if (reqresp) { - this.addPageRecord(reqresp); + // Cache + cdp.on("Network.requestServedFromCache", (params) => + this.handleRequestServedFromCache(params), + ); - this.removeReqResp(params.requestId); - } - }); + // Request + cdp.on("Network.requestWillBeSent", (params) => + this.handleRequestWillBeSent(params), + ); - cdp.on("Network.requestWillBeSentExtraInfo", (params) => { - logNetwork("Network.requestWillBeSentExtraInfo", { - requestId: params.requestId, - ...this.logDetails, - }); - this.handleRequestExtraInfo(params); - }); + cdp.on("Network.requestWillBeSentExtraInfo", (params) => + this.handleRequestExtraInfo(params), + ); // Loading - cdp.on("Network.loadingFinished", (params) => { - logNetwork("Network.loadingFinished", { - requestId: params.requestId, - ...this.logDetails, - }); - this.handleLoadingFinished(params); - }); + cdp.on("Network.loadingFinished", (params) => + this.handleLoadingFinished(params), + ); - cdp.on("Network.loadingFailed", (params) => { - logNetwork("Network.loadingFailed", { - requestId: params.requestId, - ...this.logDetails, - }); - this.handleLoadingFailed(params); - }); + cdp.on("Network.loadingFailed", (params) => + this.handleLoadingFailed(params), + ); await cdp.send("Network.enable"); // Target - cdp.on("Target.attachedToTarget", async (params) => { const { url, type, sessionId } = params.targetInfo; if (type === "service_worker") { @@ -255,6 +211,13 @@ export class Recorder { } }); + await cdp.send("Target.setAutoAttach", { + autoAttach: true, + waitForDebuggerOnStart: false, + flatten: true, + }); + + // Console cdp.on("Console.messageAdded", (params) => { const { message } = params; const { source, level } = message; @@ -263,17 +226,19 @@ export class Recorder { } }); - await cdp.send("Target.setAutoAttach", { - autoAttach: true, - waitForDebuggerOnStart: false, - flatten: true, - }); + await cdp.send("Console.enable"); } handleResponseReceived(params: Protocol.Network.ResponseReceivedEvent) { const { requestId, response, type } = params; - const { mimeType } = response; + const { mimeType, url } = response; + + logNetwork("Network.responseReceived", { + requestId, + url, + ...this.logDetails, + }); if (mimeType === MIME_EVENT_STREAM) { return; @@ -285,15 +250,81 @@ export class Recorder { } reqresp.fillResponse(response, type); + } - this.addPageRecord(reqresp); + handleResponseReceivedExtraInfo( + params: Protocol.Network.ResponseReceivedExtraInfoEvent, + ) { + const { requestId } = params; + + logNetwork("Network.responseReceivedExtraInfo", { + requestId, + ...this.logDetails, + }); + + const reqresp = this.pendingReqResp(requestId, true); + if (reqresp) { + reqresp.fillResponseReceivedExtraInfo(params); + } + } + + handleRequestServedFromCache( + params: Protocol.Network.RequestServedFromCacheEvent, + ) { + const { requestId } = params; + + const reqresp = this.pendingReqResp(requestId, true); + + const url = reqresp?.url; + + logNetwork("Network.requestServedFromCache", { + requestId, + url, + ...this.logDetails, + }); + + if (reqresp) { + reqresp.fromCache = true; + } + } + + handleRequestWillBeSent(params: Protocol.Network.RequestWillBeSentEvent) { + // only handling redirect here, committing last response in redirect chain + // request data stored from requestPaused + const { redirectResponse, requestId, request, type } = params; + + const { headers, method, url } = request; + + logNetwork("Network.requestWillBeSent", { + requestId, + redirectResponse, + ...this.logDetails, + }); + + if (redirectResponse) { + this.handleRedirectResponse(params); + } else { + if (!this.shouldSkip(headers, url, method, type)) { + const reqresp = this.pendingReqResp(requestId); + if (reqresp) { + reqresp.fillRequest(request); + } + } + } } handleRequestExtraInfo( params: Protocol.Network.RequestWillBeSentExtraInfoEvent, ) { - if (!this.shouldSkip(params.headers)) { - const reqresp = this.pendingReqResp(params.requestId, true); + const { requestId, headers } = params; + + logNetwork("Network.requestWillBeSentExtraInfo", { + requestId, + ...this.logDetails, + }); + + if (!this.shouldSkip(headers)) { + const reqresp = this.pendingReqResp(requestId, true); if (reqresp) { reqresp.fillRequestExtraInfo(params); } @@ -328,11 +359,19 @@ export class Recorder { const { errorText, type, requestId } = params; const reqresp = this.pendingReqResp(requestId, true); + + const url = reqresp?.url; + + logNetwork("Network.loadingFailed", { + requestId, + url, + ...this.logDetails, + }); + if (!reqresp) { return; } - const { url } = reqresp; if (type) { reqresp.resourceType = type.toLowerCase(); } @@ -377,19 +416,33 @@ export class Recorder { "recorder", ); } + reqresp.status = 0; + reqresp.errorText = errorText; + + this.addPageRecord(reqresp); this.removeReqResp(requestId); } handleLoadingFinished(params: Protocol.Network.LoadingFinishedEvent) { - const reqresp = this.pendingReqResp(params.requestId, true); + const { requestId } = params; + + const reqresp = this.pendingReqResp(requestId, true); + + const url = reqresp?.url; + + logNetwork("Network.loadingFinished", { + requestId, + url, + ...this.logDetails, + }); if (!reqresp || reqresp.asyncLoading) { return; } - this.removeReqResp(params.requestId); + this.removeReqResp(requestId); - if (!this.isValidUrl(reqresp.url)) { + if (!this.isValidUrl(url)) { return; } @@ -668,7 +721,15 @@ export class Recorder { if (this.isValidUrl(reqresp.url)) { const { status, resourceType: type } = reqresp; const mime = reqresp.getMimeType(); - this.pageInfo.urls[reqresp.getCanonURL()] = { status, mime, type }; + const info: PageInfoValue = { status, mime, type }; + if (reqresp.errorText) { + info.error = reqresp.errorText; + } + //TODO: revisit if we want to record this later + // if (reqresp.fromCache) { + // info.fromBrowserCache = true; + // } + this.pageInfo.urls[reqresp.getCanonURL()] = info; } } @@ -684,9 +745,11 @@ export class Recorder { ); this.warcQ.add(() => this.writer.writeSingleRecord(resourceRecord)); + + return this.pageInfo.ts; } - async finishPage() { + async awaitPageResources() { for (const [requestId, reqresp] of this.pendingRequests.entries()) { if (reqresp.payload) { this.removeReqResp(requestId); @@ -726,10 +789,6 @@ export class Recorder { await sleep(5.0); numPending = this.pendingRequests.size; } - - await this.writePageInfoRecord(); - - return this.pageInfo.ts; } async onClosePage() { @@ -768,7 +827,8 @@ export class Recorder { method = headers[":method"]; } - if (!this.isValidUrl(url)) { + // only check if url is provided, since it is optional + if (url && !this.isValidUrl(url)) { return true; } @@ -949,6 +1009,10 @@ export class Recorder { removeReqResp(requestId: string, allowReuse = false) { const reqresp = this.pendingRequests.get(requestId); + if (reqresp) { + const { url, requestId } = reqresp; + logNetwork("Removing reqresp", { requestId, url }); + } this.pendingRequests.delete(requestId); if (!allowReuse) { this.skipIds.add(requestId); @@ -957,28 +1021,40 @@ export class Recorder { } async serializeToWARC(reqresp: RequestResponseInfo) { - if (reqresp.shouldSkipSave()) { - const { url, method, status, payload } = reqresp; - logNetwork("Skipping request/response", { + // always include in pageinfo record if going to serialize to WARC + // even if serialization does not happen + this.addPageRecord(reqresp); + + const { url, method, status, payload, requestId } = reqresp; + + // Specifically log skipping cached resources + if (reqresp.isCached()) { + logger.debug( + "Skipping cached resource, should be already recorded", + { url, status }, + "recorder", + ); + return; + } else if (reqresp.shouldSkipSave()) { + logNetwork("Skipping writing request/response", { + requestId, url, method, status, - payloadLength: payload && payload.length, + payloadLength: (payload && payload.length) || 0, }); return; } if ( - reqresp.url && - reqresp.method === "GET" && - !(await this.crawlState.addIfNoDupe(WRITE_DUPE_KEY, reqresp.url)) + url && + method === "GET" && + !(await this.crawlState.addIfNoDupe(WRITE_DUPE_KEY, url)) ) { - logNetwork("Skipping dupe", { url: reqresp.url }); + logNetwork("Skipping dupe", { url }); return; } - this.addPageRecord(reqresp); - const responseRecord = createResponse(reqresp, this.pageid); const requestRecord = createRequest(reqresp, responseRecord, this.pageid); @@ -1016,8 +1092,7 @@ export class Recorder { const res = await fetcher.load(); const mime = - (reqresp && - reqresp.responseHeaders && + (reqresp.responseHeaders && reqresp.responseHeaders["content-type"] && reqresp.responseHeaders["content-type"].split(";")[0]) || ""; @@ -1184,8 +1259,6 @@ class AsyncFetcher { ); } - recorder.addPageRecord(reqresp); - recorder.warcQ.add(() => recorder.writer.writeRecordPair( responseRecord, @@ -1204,8 +1277,15 @@ class AsyncFetcher { { url, networkId, filename, ...formatErr(e), ...logDetails }, "recorder", ); + // indicate response is ultimately not valid + reqresp.status = 0; + reqresp.errorText = e.message; } finally { - recorder.removeReqResp(networkId); + // exclude direct fetch request with fake id + if (networkId !== "0") { + recorder.addPageRecord(reqresp); + recorder.removeReqResp(networkId); + } } return fetched; diff --git a/src/util/reqresp.ts b/src/util/reqresp.ts index fbcd9a4ab..32490c375 100644 --- a/src/util/reqresp.ts +++ b/src/util/reqresp.ts @@ -37,6 +37,8 @@ export class RequestResponseInfo { status: number = 0; statusText?: string; + errorText?: string; + responseHeaders?: Record; responseHeadersList?: { name: string; value: string }[]; responseHeadersText?: string; @@ -44,11 +46,12 @@ export class RequestResponseInfo { payload?: Uint8Array; // misc - fromServiceWorker: boolean = false; + fromServiceWorker = false; + fromCache = false; frameId?: string; - fetch: boolean = false; + fetch = false; resourceType?: string; @@ -71,13 +74,7 @@ export class RequestResponseInfo { } fillFetchRequestPaused(params: Protocol.Fetch.RequestPausedEvent) { - this.url = params.request.url; - this.method = params.request.method; - if (!this.requestHeaders) { - this.requestHeaders = params.request.headers; - } - this.postData = params.request.postData; - this.hasPostData = params.request.hasPostData || false; + this.fillRequest(params.request, params.resourceType); this.status = params.responseStatusCode || 0; this.statusText = params.responseStatusText || getStatusText(this.status); @@ -86,14 +83,24 @@ export class RequestResponseInfo { this.fetch = true; - if (params.resourceType) { - this.resourceType = params.resourceType.toLowerCase(); + this.frameId = params.frameId; + } + + fillRequest(request: Protocol.Network.Request, resourceType?: string) { + this.url = request.url; + this.method = request.method; + if (!this.requestHeaders) { + this.requestHeaders = request.headers; } + this.postData = request.postData; + this.hasPostData = request.hasPostData || false; - this.frameId = params.frameId; + if (resourceType) { + this.resourceType = resourceType.toLowerCase(); + } } - fillResponse(response: Protocol.Network.Response, type?: string) { + fillResponse(response: Protocol.Network.Response, resourceType?: string) { // if initial fetch was a 200, but now replacing with 304, don't! if ( response.status == 304 && @@ -111,8 +118,8 @@ export class RequestResponseInfo { this.protocol = response.protocol; - if (type) { - this.resourceType = type.toLowerCase(); + if (resourceType) { + this.resourceType = resourceType.toLowerCase(); } if (response.requestHeaders) { @@ -292,9 +299,14 @@ export class RequestResponseInfo { return true; } + isCached() { + return this.fromCache && !this.payload; + } + shouldSkipSave() { - // skip OPTIONS/HEAD responses, and 304 or 206 responses + // skip cached, OPTIONS/HEAD responses, and 304 or 206 responses if ( + this.fromCache || !this.payload || (this.method && ["OPTIONS", "HEAD"].includes(this.method)) || [206, 304].includes(this.status) diff --git a/src/util/worker.ts b/src/util/worker.ts index b7ceb6986..e05dad308 100644 --- a/src/util/worker.ts +++ b/src/util/worker.ts @@ -298,7 +298,7 @@ export class PageWorker { async crawlPage(opts: WorkerState) { const res = await this.crawler.crawlPage(opts); if (this.recorder) { - opts.data.ts = await this.recorder.finishPage(); + await this.recorder.awaitPageResources(); } return res; } @@ -339,7 +339,21 @@ export class PageWorker { "worker", ); } + + await this.closePage(); } finally { + try { + if (this.recorder) { + opts.data.ts = await this.recorder.writePageInfoRecord(); + } + } catch (e) { + logger.error( + "Error writing pageinfo recorder", + { ...formatErr(e), ...this.logDetails }, + "recorder", + ); + } + await timedRun( this.crawler.pageFinished(data), FINISHED_TIMEOUT, diff --git a/tests/pageinfo-records.test.js b/tests/pageinfo-records.test.js index 8642190c7..d16f52d31 100644 --- a/tests/pageinfo-records.test.js +++ b/tests/pageinfo-records.test.js @@ -5,7 +5,7 @@ import { WARCParser } from "warcio"; test("run warc and ensure pageinfo records contain the correct resources", async () => { child_process.execSync( - "docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://webrecorder.net/ --url https://webrecorder.net/about --scopeType page --collection page-info-test --combineWARC", + "docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://webrecorder.net/ --url https://webrecorder.net/about --url https://invalid.invalid/ --scopeType page --collection page-info-test --combineWARC", ); const filename = path.join( @@ -21,6 +21,7 @@ test("run warc and ensure pageinfo records contain the correct resources", async let foundIndex = false; let foundAbout = false; + let foundInvalid = false; for await (const record of parser) { if ( @@ -40,10 +41,20 @@ test("run warc and ensure pageinfo records contain the correct resources", async const text = await record.contentText(); validateResourcesAbout(JSON.parse(text)); } + + if ( + !foundInvalid && + record.warcTargetURI === "urn:pageinfo:https://invalid.invalid/" + ) { + foundInvalid = true; + const text = await record.contentText(); + validateResourcesInvalid(JSON.parse(text)); + } } expect(foundIndex).toBe(true); expect(foundAbout).toBe(true); + expect(foundInvalid).toBe(true); }); function validateResourcesIndex(json) { @@ -161,5 +172,25 @@ function validateResourcesAbout(json) { { status: 200, mime: "font/woff2", type: "font" }, "https://fonts.gstatic.com/s/sourcesanspro/v22/6xKydSBYKcSV-LCoeQqfX1RYOo3ig4vwlxdu.woff2": { status: 200, mime: "font/woff2", type: "font" }, + "https://stats.browsertrix.com/api/event?__wb_method=POST&n=pageview&u=https%3A%2F%2Fwebrecorder.net%2Fabout&d=webrecorder.net": + { + status: 0, + type: "xhr", + error: "net::ERR_BLOCKED_BY_CLIENT", + }, + }); +} + +function validateResourcesInvalid(json) { + expect(json).toHaveProperty("pageid"); + expect(json).toHaveProperty("url"); + expect(json).toHaveProperty("urls"); + expect(json.counts).toEqual({ jsErrors: 0 }); + expect(json.urls).toEqual({ + "https://invalid.invalid/": { + status: 0, + type: "document", + error: "net::ERR_NAME_NOT_RESOLVED", + }, }); }