From 5060e6b0b10dd98c8ff27e21fe9b234b3e2606b6 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Mon, 18 Mar 2024 14:24:48 -0700 Subject: [PATCH] profiles: handle terminate signals directly (#500) - add our own signal handling to create-login-profile to ensure fast exit in k8s - print crawler version info string on startup --- src/crawler.ts | 20 ++------------------ src/create-login-profile.ts | 14 +++++++++++++- src/util/file_reader.ts | 17 +++++++++++++++++ 3 files changed, 32 insertions(+), 19 deletions(-) diff --git a/src/crawler.ts b/src/crawler.ts index 51178cd7c..df31bcf13 100644 --- a/src/crawler.ts +++ b/src/crawler.ts @@ -35,7 +35,7 @@ import { initRedis } from "./util/redis.js"; import { logger, formatErr } from "./util/logger.js"; import { WorkerOpts, WorkerState, runWorkers } from "./util/worker.js"; import { sleep, timedRun, secondsElapsed } from "./util/timing.js"; -import { collectAllFileSources } from "./util/file_reader.js"; +import { collectAllFileSources, getInfoString } from "./util/file_reader.js"; import { Browser } from "./util/browser.js"; @@ -428,7 +428,7 @@ export class Crawler { this.logFH = fs.createWriteStream(this.logFilename); logger.setExternalLogStream(this.logFH); - this.infoString = await this.getInfoString(); + this.infoString = await getInfoString(); logger.info(this.infoString); logger.info("Seeds", this.params.scopedSeeds); @@ -1008,22 +1008,6 @@ self.__bx_behaviors.selectMainBehavior(); return res ? frame : null; } - async getInfoString() { - const packageFileJSON = JSON.parse( - await fsp.readFile(new URL("../package.json", import.meta.url), { - encoding: "utf-8", - }), - ); - const warcioPackageJSON = JSON.parse( - await fsp.readFile( - new URL("../node_modules/warcio/package.json", import.meta.url), - { encoding: "utf-8" }, - ), - ); - - return `Browsertrix-Crawler ${packageFileJSON.version} (with warcio.js ${warcioPackageJSON.version})`; - } - async createWARCInfo(filename: string) { const warcVersion = "WARC/1.0"; const type = "warcinfo"; diff --git a/src/create-login-profile.ts b/src/create-login-profile.ts index 40d6afe62..923281ff3 100755 --- a/src/create-login-profile.ts +++ b/src/create-login-profile.ts @@ -14,6 +14,7 @@ import { logger } from "./util/logger.js"; import { Browser } from "./util/browser.js"; import { initStorage } from "./util/storage.js"; import { CDPSession, Page, PuppeteerLifeCycleEvent } from "puppeteer-core"; +import { getInfoString } from "./util/file_reader.js"; const profileHTML = fs.readFileSync( new URL("../html/createProfile.html", import.meta.url), @@ -118,6 +119,11 @@ function getDefaultWindowSize() { return `${x},${y}`; } +function handleTerminate(signame: string) { + logger.info(`Got signal ${signame}, exiting`); + process.exit(1); +} + async function main() { // eslint-disable-next-line @typescript-eslint/no-explicit-any const params: any = yargs(process.argv) @@ -126,6 +132,12 @@ async function main() { logger.setDebugLogging(true); + logger.info(await getInfoString()); + + process.on("SIGINT", () => handleTerminate("SIGINT")); + + process.on("SIGTERM", () => handleTerminate("SIGTERM")); + if (!params.headless) { logger.debug("Launching XVFB"); child_process.spawn("Xvfb", [ @@ -164,7 +176,7 @@ async function main() { await browser.launch({ profileUrl: params.profile, headless: params.headless, - signals: true, + signals: false, chromeOptions: { proxy: false, extraArgs: [ diff --git a/src/util/file_reader.ts b/src/util/file_reader.ts index 83b8d33c0..45baf9b68 100644 --- a/src/util/file_reader.ts +++ b/src/util/file_reader.ts @@ -1,4 +1,5 @@ import fs from "fs"; +import fsp from "fs/promises"; import path from "path"; const MAX_DEPTH = 2; @@ -48,3 +49,19 @@ export function collectAllFileSources( return []; } + +export async function getInfoString() { + const packageFileJSON = JSON.parse( + await fsp.readFile(new URL("../../package.json", import.meta.url), { + encoding: "utf-8", + }), + ); + const warcioPackageJSON = JSON.parse( + await fsp.readFile( + new URL("../../node_modules/warcio/package.json", import.meta.url), + { encoding: "utf-8" }, + ), + ); + + return `Browsertrix-Crawler ${packageFileJSON.version} (with warcio.js ${warcioPackageJSON.version})`; +}