fix(extract_media): search all js files in data directory for URLs
wzyboy committed Jul 30, 2023
1 parent 6a05f52 commit fca8643
Showing 1 changed file with 16 additions and 14 deletions.
30 changes: 16 additions & 14 deletions contrib/extract_media/main.py
@@ -13,6 +13,7 @@

 from typing import Optional
 from collections.abc import Iterator
+from collections.abc import Iterable

 import scrapy
 from scrapy.crawler import CrawlerProcess
@@ -22,14 +23,14 @@ class TwimgExtractor(scrapy.Spider):
     name = 'TwimgExtractor'

     def __init__(self, archive_dir: Path, output_dir: Path, **kwargs):
-        self.tweets_js = archive_dir / 'data/tweets.js'
-        self.tweets_media = archive_dir / 'data/tweets_media'
+        self.js_files = archive_dir.glob('data/*.js')
+        self.media_dir = archive_dir / 'data/tweets_media'
         self.output_dir = output_dir
         super().__init__(**kwargs)

     def start_requests(self) -> Iterator[scrapy.Request]:
-        media_cache = TweetsMediaCache(self.tweets_media)
-        for url in self.find_urls(self.tweets_js):
+        media_cache = TweetsMediaCache(self.media_dir)
+        for url in self.find_urls(self.js_files):
             if cached := media_cache.get(url):
                 output = self.url_to_fs_path(url, self.output_dir)
                 output.parent.mkdir(parents=True, exist_ok=True)
@@ -53,16 +54,17 @@ def write_to_disk(self, response, output: Path):
         output.write_bytes(response.body)

     @staticmethod
-    def find_urls(tweet_js: Path) -> Iterator[str]:
+    def find_urls(js_files: Iterable[Path]) -> Iterator[str]:
         twimg_url_re = re.compile(r'(?<=")https://(pbs|video).twimg.com/.*?(?=")')
         seen = set()
-        with open(tweet_js, 'r') as f:
-            for line in f:
-                if matched := twimg_url_re.search(line):
-                    url = matched.group(0)
-                    if url not in seen:
-                        yield url
-                        seen.add(url)
+        for js_file in js_files:
+            with open(js_file, 'r') as f:
+                for line in f:
+                    if matched := twimg_url_re.search(line):
+                        url = matched.group(0)
+                        if url not in seen:
+                            yield url
+                            seen.add(url)

     @staticmethod
     def url_to_fs_path(url: str, parent: Path) -> Path:
@@ -71,9 +73,9 @@ def url_to_fs_path(url: str, parent: Path) -> Path:


 class TweetsMediaCache:
-    def __init__(self, tweets_media: Path) -> None:
+    def __init__(self, media_dir: Path) -> None:
         self._dict = dict()
-        for file in tweets_media.glob('*'):
+        for file in media_dir.glob('*'):
             key = file.name.split('-', 1)[1]
             self._dict[key] = file

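For context, the changed methods belong to a scrapy spider, and main.py already imports CrawlerProcess to drive it. The snippet below is a minimal usage sketch, not part of this commit: the archive_dir and output_dir paths are placeholders, and the actual command-line entry point of main.py is not shown in this diff. It assumes the code lives in the same module as TwimgExtractor.

# Minimal usage sketch (illustrative; paths are placeholders, not from the commit).
from pathlib import Path
from scrapy.crawler import CrawlerProcess

archive_dir = Path('twitter-archive')   # unpacked archive containing data/*.js and data/tweets_media/
output_dir = Path('extracted_media')    # where extracted media files will be written

process = CrawlerProcess()
# Keyword arguments are forwarded by scrapy to TwimgExtractor.__init__.
process.crawl(TwimgExtractor, archive_dir=archive_dir, output_dir=output_dir)
process.start()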
