Dependencies update + minor bugfix in ingestion pipeline

Bugfix: Skip records that are missing either their WARC record headers or their HTTP headers.
harvard-lil · Feb 17, 2024 · 80d390d · 80d390d
1 parent 9dda123
commit 80d390d
Show file tree

Hide file tree

Showing 3 changed files with 80 additions and 25 deletions.
diff --git a/poetry.lock b/poetry.lock
diff --git a/pyproject.toml b/pyproject.toml
@@ -7,16 +7,15 @@ readme = "README.md"
 version = "0.1.0"
 
 [tool.poetry.dependencies]
-PyPDF2 = "^3.0.1"
 beautifulsoup4 = "^4.12.2"
 brotlipy = "^0.7.0"
 chromadb = "^0.4.14"
 click = "^8.1.7"
 flask = "^3.0.0"
-langchain = "^0.0.327"
 litellm = "^1.10.4"
 pandas = "^2.1.3"
 plotly = "^5.18.0"
+pypdf = "^4.0.1"
 python = "^3.11"
 python-dateutil = "^2.8.2"
 python-dotenv = "^1.0.0"
@@ -26,6 +25,7 @@ scikit-learn = "^1.3.2"
 sentence-transformers = "^2.2.2"
 torch = "^2.1.1"
 warcio = "^1.7.4"
+langchain = "^0.1.7"
 
 [tool.poetry.group.dev.dependencies]
 black = "^23.10.1"

diff --git a/warc_gpt/commands/ingest.py b/warc_gpt/commands/ingest.py
@@ -1,6 +1,7 @@
 """
 `commands.ingest` module: Controller for the `ingest` CLI command.
 """
+
 import os
 import glob
 import traceback
@@ -11,7 +12,7 @@
 import chromadb
 from bs4 import BeautifulSoup
 from bs4 import Comment as HTMLComment
-from PyPDF2 import PdfReader
+from pypdf import PdfReader
 from sentence_transformers import SentenceTransformer
 from warcio.archiveiterator import ArchiveIterator
 from langchain.text_splitter import SentenceTransformersTokenTextSplitter
@@ -97,6 +98,10 @@ def ingest() -> None:
                 # Extract metadata
                 rec_headers = record.rec_headers
                 http_headers = record.http_headers
+
+                if not rec_headers or not http_headers:
+                    continue
+
                 record_data["warc_filename"] = os.path.basename(warc_file)
                 record_data["warc_record_id"] = rec_headers.get_header("WARC-Record-ID")
                 record_data["warc_record_date"] = rec_headers.get_header("WARC-Date")