Archive bot
- warc3
- requests
- dnspython
- lxml
# require the policy URL for robots User-Agent
import os
policy = "http://example.com/your/crawl/policy"
os.environ["OWLBOT_POLICY"] = policy
import io
import gzip
import shutil
from owlbot.archive import Archive
# create WARCFile
filename = "example.warc.gz"
fp = io.BytesIO()
arc = Archvie(filename, fileobj=fp)
# crawl & archive web page
resp = arc.get("http://example.com/")
if resp.code == 200:
for link in resp.links():
arc.get(link)
# compress data
fp.seek(0)
with gzip.open(filename, "wb") as wfp:
shutil.copyfileobj(fp, wfp)