fix(extract_media): search all js files in data directory for URLs
wzyboy committed Jul 30, 2023
1 parent 6a05f52 commit fca8643
Showing 1 changed file with 16 additions and 14 deletions.
30 changes: 16 additions & 14 deletions contrib/extract_media/main.py
@@ -13,6 +13,7 @@

 from typing import Optional
 from collections.abc import Iterator
+from collections.abc import Iterable

 import scrapy
 from scrapy.crawler import CrawlerProcess
@@ -22,14 +23,14 @@ class TwimgExtractor(scrapy.Spider):
     name = 'TwimgExtractor'

     def __init__(self, archive_dir: Path, output_dir: Path, **kwargs):
-        self.tweets_js = archive_dir / 'data/tweets.js'
-        self.tweets_media = archive_dir / 'data/tweets_media'
+        self.js_files = archive_dir.glob('data/*.js')
+        self.media_dir = archive_dir / 'data/tweets_media'
         self.output_dir = output_dir
         super().__init__(**kwargs)

     def start_requests(self) -> Iterator[scrapy.Request]:
-        media_cache = TweetsMediaCache(self.tweets_media)
-        for url in self.find_urls(self.tweets_js):
+        media_cache = TweetsMediaCache(self.media_dir)
+        for url in self.find_urls(self.js_files):
             if cached := media_cache.get(url):
                 output = self.url_to_fs_path(url, self.output_dir)
                 output.parent.mkdir(parents=True, exist_ok=True)
@@ -53,16 +54,17 @@ def write_to_disk(self, response, output: Path):
         output.write_bytes(response.body)

     @staticmethod
-    def find_urls(tweet_js: Path) -> Iterator[str]:
+    def find_urls(js_files: Iterable[Path]) -> Iterator[str]:
         twimg_url_re = re.compile(r'(?<=")https://(pbs|video).twimg.com/.*?(?=")')
         seen = set()
-        with open(tweet_js, 'r') as f:
-            for line in f:
-                if matched := twimg_url_re.search(line):
-                    url = matched.group(0)
-                    if url not in seen:
-                        yield url
-                        seen.add(url)
+        for js_file in js_files:
+            with open(js_file, 'r') as f:
+                for line in f:
+                    if matched := twimg_url_re.search(line):
+                        url = matched.group(0)
+                        if url not in seen:
+                            yield url
+                            seen.add(url)

     @staticmethod
     def url_to_fs_path(url: str, parent: Path) -> Path:
@@ -71,9 +73,9 @@ def url_to_fs_path(url: str, parent: Path) -> Path:


 class TweetsMediaCache:
-    def __init__(self, tweets_media: Path) -> None:
+    def __init__(self, media_dir: Path) -> None:
         self._dict = dict()
-        for file in tweets_media.glob('*'):
+        for file in media_dir.glob('*'):
             key = file.name.split('-', 1)[1]
             self._dict[key] = file

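For context, the changed methods belong to a scrapy spider, and main.py already imports CrawlerProcess to drive it. The snippet below is a minimal usage sketch, not part of this commit: the archive_dir and output_dir paths are placeholders, and the actual command-line entry point of main.py is not shown in this diff. It assumes the code lives in the same module as TwimgExtractor.

# Minimal usage sketch (illustrative; paths are placeholders, not from the commit).
from pathlib import Path
from scrapy.crawler import CrawlerProcess

archive_dir = Path('twitter-archive')   # unpacked archive containing data/*.js and data/tweets_media/
output_dir = Path('extracted_media')    # where extracted media files will be written

process = CrawlerProcess()
# Keyword arguments are forwarded by scrapy to TwimgExtractor.__init__.
process.crawl(TwimgExtractor, archive_dir=archive_dir, output_dir=output_dir)
process.start()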
