Skip to content

Commit

Permalink
Dependencies update + minor bugfix in ingestion pipeline
Browse files Browse the repository at this point in the history
Bugfix: Skips records that don't have record headers or HTTP headers.
  • Loading branch information
matteocargnelutti committed Feb 17, 2024
1 parent 9dda123 commit 80d390d
Show file tree
Hide file tree
Showing 3 changed files with 80 additions and 25 deletions.
94 changes: 72 additions & 22 deletions poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 2 additions & 2 deletions pyproject.toml
Expand Up @@ -7,16 +7,15 @@ readme = "README.md"
version = "0.1.0"

[tool.poetry.dependencies]
-PyPDF2 = "^3.0.1"
beautifulsoup4 = "^4.12.2"
brotlipy = "^0.7.0"
chromadb = "^0.4.14"
click = "^8.1.7"
flask = "^3.0.0"
-langchain = "^0.0.327"
litellm = "^1.10.4"
pandas = "^2.1.3"
plotly = "^5.18.0"
+pypdf = "^4.0.1"
python = "^3.11"
python-dateutil = "^2.8.2"
python-dotenv = "^1.0.0"
Expand All @@ -26,6 +25,7 @@ scikit-learn = "^1.3.2"
sentence-transformers = "^2.2.2"
torch = "^2.1.1"
warcio = "^1.7.4"
+langchain = "^0.1.7"

[tool.poetry.group.dev.dependencies]
black = "^23.10.1"
Expand Down
7 changes: 6 additions & 1 deletion warc_gpt/commands/ingest.py
@@ -1,6 +1,7 @@
"""
`commands.ingest` module: Controller for the `ingest` CLI command.
"""

import os
import glob
import traceback
Expand All @@ -11,7 +12,7 @@
import chromadb
from bs4 import BeautifulSoup
from bs4 import Comment as HTMLComment
-from PyPDF2 import PdfReader
+from pypdf import PdfReader
from sentence_transformers import SentenceTransformer
from warcio.archiveiterator import ArchiveIterator
from langchain.text_splitter import SentenceTransformersTokenTextSplitter
Expand Down Expand Up @@ -97,6 +98,10 @@ def ingest() -> None:
# Extract metadata
rec_headers = record.rec_headers
http_headers = record.http_headers

+if not rec_headers or not http_headers:
+    continue

record_data["warc_filename"] = os.path.basename(warc_file)
record_data["warc_record_id"] = rec_headers.get_header("WARC-Record-ID")
record_data["warc_record_date"] = rec_headers.get_header("WARC-Date")
Expand Down

0 comments on commit 80d390d

Please sign in to comment.