Skip to content

Commit

Permalink
Merge branch 'ytdl-org:master' into fix-npo-support
Browse files Browse the repository at this point in the history
  • Loading branch information
bartbroere committed Mar 12, 2024
2 parents 58d7a00 + a96a45b commit d4250c8
Show file tree
Hide file tree
Showing 8 changed files with 342 additions and 199 deletions.
16 changes: 16 additions & 0 deletions test/test_utils.py
Expand Up @@ -81,6 +81,7 @@
sanitize_filename,
sanitize_path,
sanitize_url,
sanitized_Request,
shell_quote,
smuggle_url,
str_or_none,
Expand Down Expand Up @@ -255,6 +256,18 @@ def test_sanitize_url(self):
self.assertEqual(sanitize_url('https://foo.bar'), 'https://foo.bar')
self.assertEqual(sanitize_url('foo bar'), 'foo bar')

def test_sanitized_Request(self):
    """sanitized_Request must map URL userinfo to an HTTP Basic Authorization header."""
    # no userinfo in the URL -> no Authorization header at all
    for url_without_auth in ('http://foo.bar', 'http://:foo.bar'):
        self.assertFalse(
            sanitized_Request(url_without_auth).has_header('Authorization'))
    # userinfo present -> header carries base64('user:pass'), with either
    # component allowed to be empty
    expectations = (
        ('http://@foo.bar', 'Basic Og=='),
        ('http://:pass@foo.bar', 'Basic OnBhc3M='),
        ('http://user:@foo.bar', 'Basic dXNlcjo='),
        ('http://user:pass@foo.bar', 'Basic dXNlcjpwYXNz'),
    )
    for url_with_auth, expected_header in expectations:
        self.assertEqual(
            sanitized_Request(url_with_auth).get_header('Authorization'),
            expected_header)

def test_expand_path(self):
def env(var):
return '%{0}%'.format(var) if sys.platform == 'win32' else '${0}'.format(var)
Expand Down Expand Up @@ -512,11 +525,14 @@ def test_float_or_none(self):
self.assertEqual(float_or_none(set()), None)

def test_int_or_none(self):
    """int_or_none: int()-like conversion that yields None for unconvertible input."""
    self.assertEqual(int_or_none(42), 42)
    self.assertEqual(int_or_none('42'), 42)
    self.assertEqual(int_or_none(''), None)
    self.assertEqual(int_or_none(None), None)
    self.assertEqual(int_or_none([]), None)
    self.assertEqual(int_or_none(set()), None)
    self.assertEqual(int_or_none('42', base=8), 34)
    # assertRaises needs the callable and its arguments passed separately;
    # writing int_or_none(42, base=8) inline would raise the TypeError
    # before assertRaises could catch it, erroring the test instead of
    # passing it
    self.assertRaises(TypeError, int_or_none, 42, base=8)

def test_str_to_int(self):
self.assertEqual(str_to_int('123,456'), 123456)
Expand Down
110 changes: 77 additions & 33 deletions youtube_dl/extractor/common.py
Expand Up @@ -25,6 +25,7 @@
compat_getpass,
compat_integer_types,
compat_http_client,
compat_kwargs,
compat_map as map,
compat_open as open,
compat_os_name,
Expand Down Expand Up @@ -1102,6 +1103,60 @@ def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, f
self._downloader.report_warning('unable to extract %s' % _name + bug_reports_message())
return None

def _search_json(self, start_pattern, string, name, video_id, **kwargs):
    r"""Searches string for the JSON object specified by start_pattern.

    Keyword-only options (popped from **kwargs for Py2 compatibility):
        end_pattern      -- regex expected after the JSON text (default '')
        contains_pattern -- regex the JSON text must match
                            (default r'{[\s\S]+}')
        fatal            -- raise ExtractorError on failure (default True)
        default          -- value returned on failure; supplying any
                            default implies fatal=False
    Remaining kwargs (e.g. transform_source) are forwarded to _parse_json().
    Returns the parsed object, or `default` ({} when none was given) on a
    non-fatal failure.
    """

    # keyword-only signature being emulated:
    # self, start_pattern, string, name, video_id, *, end_pattern='',
    # contains_pattern=r'{(?s:.+)}', fatal=True, default=NO_DEFAULT
    # NB: end_pattern is only used to reduce the size of the initial match
    end_pattern = kwargs.pop('end_pattern', '')
    # (?:[\s\S]) simulates (?(s):.) (eg)
    contains_pattern = kwargs.pop('contains_pattern', r'{[\s\S]+}')
    fatal = kwargs.pop('fatal', True)
    default = kwargs.pop('default', NO_DEFAULT)

    # an explicit default switches off fatal; with no default the
    # fallback result is an empty dict
    if default is NO_DEFAULT:
        default, has_default = {}, False
    else:
        fatal, has_default = False, True

    # grab the candidate JSON text between start_pattern and end_pattern
    json_string = self._search_regex(
        r'(?:{0})\s*(?P<json>{1})\s*(?:{2})'.format(
            start_pattern, contains_pattern, end_pattern),
        string, name, group='json', fatal=fatal, default=None if has_default else NO_DEFAULT)
    if not json_string:
        return default

    # yt-dlp has a special JSON parser that allows trailing text.
    # Until that arrives here, the diagnostic from the exception
    # raised by json.loads() is used to extract the wanted text.
    # Either way, it's a problem if a transform_source() can't
    # handle the trailing text.

    # force an exception
    kwargs['fatal'] = True

    # self._downloader._format_err(name, self._downloader.Styles.EMPHASIS)
    # at most two passes: parse the match as-is, then once more after
    # truncating at the '(char N)' offset reported by the JSON error,
    # which drops any trailing non-JSON text
    for _ in range(2):
        try:
            # return self._parse_json(json_string, video_id, ignore_extra=True, **kwargs)
            transform_source = kwargs.pop('transform_source', None)
            if transform_source:
                json_string = transform_source(json_string)
            return self._parse_json(json_string, video_id, **compat_kwargs(kwargs))
        except ExtractorError as e:
            # pull the failing character offset out of the error message
            end = int_or_none(self._search_regex(r'\(char\s+(\d+)', error_to_compat_str(e), 'end', default=None))
            if end is not None:
                json_string = json_string[:end]
                continue
            msg = 'Unable to extract {0} - Failed to parse JSON'.format(name)
            if fatal:
                raise ExtractorError(msg, cause=e.cause, video_id=video_id)
            elif not has_default:
                self.report_warning(
                    '{0}: {1}'.format(msg, error_to_compat_str(e)), video_id=video_id)
            return default

def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
"""
Like _search_regex, but strips HTML tags and unescapes entities.
Expand Down Expand Up @@ -2966,25 +3021,22 @@ def manifest_url(manifest):
return formats

def _find_jwplayer_data(self, webpage, video_id=None, transform_source=js_to_json):
    """Locate a jwplayer(...).setup(...) call in webpage and return its
    parsed options, or None when no usable JWPlayer data is found.
    """
    # NOTE(review): the two sections below look like both sides of a merged
    # diff (the old re.search()-based lookup followed by the new
    # _search_json()-based one) rendered into one body -- confirm against
    # upstream; only one of them should remain.
    mobj = re.search(
        r'''(?s)jwplayer\s*\(\s*(?P<q>'|")(?!(?P=q)).+(?P=q)\s*\)(?!</script>).*?\.\s*setup\s*\(\s*(?P<options>(?:\([^)]*\)|[^)])+)\s*\)''',
        webpage)
    if mobj:
        try:
            jwplayer_data = self._parse_json(mobj.group('options'),
                                             video_id=video_id,
                                             transform_source=transform_source)
        except ExtractorError:
            # a parse failure falls through to the _search_json() path below
            pass
        else:
            if isinstance(jwplayer_data, dict):
                return jwplayer_data
    # newer lookup: also matches .load([...]) calls via the (?P<load>...)
    # group, which the conditional patterns below key on
    return self._search_json(
        r'''(?<!-)\bjwplayer\s*\(\s*(?P<q>'|")(?!(?P=q)).+(?P=q)\s*\)(?:(?!</script>).)*?\.\s*(?:setup\s*\(|(?P<load>load)\s*\(\s*\[)''',
        webpage, 'JWPlayer data', video_id,
        # must be a {...} or sequence, ending
        contains_pattern=r'\{[\s\S]*}(?(load)(?:\s*,\s*\{[\s\S]*})*)', end_pattern=r'(?(load)\]|\))',
        transform_source=transform_source, default=None)

def _extract_jwplayer_data(self, webpage, video_id, *args, **kwargs):
    """Find JWPlayer data in webpage and parse it into info-dict results."""
    jwplayer_data = self._find_jwplayer_data(
        webpage, video_id, transform_source=js_to_json)
    return self._parse_jwplayer_data(
        jwplayer_data, video_id, *args, **kwargs)

    # NOTE(review): everything below the return above is unreachable; it
    # appears to be the replacement implementation from a merged diff
    # rendered alongside the removed one -- confirm against upstream.
    # allow passing `transform_source` through to _find_jwplayer_data()
    transform_source = kwargs.pop('transform_source', None)
    kwfind = compat_kwargs({'transform_source': transform_source}) if transform_source else {}

    jwplayer_data = self._find_jwplayer_data(webpage, video_id, **kwfind)

    return self._parse_jwplayer_data(jwplayer_data, video_id, *args, **kwargs)

def _parse_jwplayer_data(self, jwplayer_data, video_id=None, require_title=True,
m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
Expand Down Expand Up @@ -3018,22 +3070,14 @@ def _parse_jwplayer_data(self, jwplayer_data, video_id=None, require_title=True,
mpd_id=mpd_id, rtmp_params=rtmp_params, base_url=base_url)

subtitles = {}
tracks = video_data.get('tracks')
if tracks and isinstance(tracks, list):
for track in tracks:
if not isinstance(track, dict):
continue
track_kind = track.get('kind')
if not track_kind or not isinstance(track_kind, compat_str):
continue
if track_kind.lower() not in ('captions', 'subtitles'):
continue
track_url = urljoin(base_url, track.get('file'))
if not track_url:
continue
subtitles.setdefault(track.get('label') or 'en', []).append({
'url': self._proto_relative_url(track_url)
})
for track in traverse_obj(video_data, (
'tracks', lambda _, t: t.get('kind').lower() in ('captions', 'subtitles'))):
track_url = urljoin(base_url, track.get('file'))
if not track_url:
continue
subtitles.setdefault(track.get('label') or 'en', []).append({
'url': self._proto_relative_url(track_url)
})

entry = {
'id': this_video_id,
Expand Down
1 change: 0 additions & 1 deletion youtube_dl/extractor/extractors.py
Expand Up @@ -383,7 +383,6 @@
FC2EmbedIE,
)
from .fczenit import FczenitIE
from .filemoon import FileMoonIE
from .fifa import FifaIE
from .filmon import (
FilmOnIE,
Expand Down
43 changes: 0 additions & 43 deletions youtube_dl/extractor/filemoon.py

This file was deleted.

44 changes: 30 additions & 14 deletions youtube_dl/extractor/videa.py
Expand Up @@ -6,22 +6,31 @@
import string

from .common import InfoExtractor
from ..compat import (
compat_b64decode,
compat_ord,
compat_struct_pack,
)
from ..utils import (
ExtractorError,
int_or_none,
mimetype2ext,
parse_codecs,
parse_qs,
update_url_query,
urljoin,
xpath_element,
xpath_text,
)
from ..compat import (
compat_b64decode,
compat_ord,
compat_struct_pack,
compat_urlparse,
)


def compat_random_choices(population, *args, **kwargs):
    """Limited stand-in for random.choices() on Pythons that lack it.

    Emulates the signature random.choices(population, weights=None, *,
    cum_weights=None, k=1) but only supports unweighted selection, and
    returns the k picks joined into a single string.
    """
    # weights may arrive positionally or by keyword; cum_weights only by
    # keyword -- neither is actually implemented here
    if args:
        weights = args[0]
    else:
        weights = kwargs.get('weights')
    cum_weights = kwargs.get('cum_weights')
    assert weights is None and cum_weights is None
    draw_count = kwargs.get('k', 1)
    picks = []
    for _ in range(draw_count):
        picks.append(random.choice(population))
    return ''.join(picks)


class VideaIE(InfoExtractor):
Expand All @@ -35,6 +44,7 @@ class VideaIE(InfoExtractor):
)
(?P<id>[^?#&]+)
'''
_EMBED_REGEX = [r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//videa\.hu/player\?.*?\bv=.+?)\1']
_TESTS = [{
'url': 'http://videa.hu/videok/allatok/az-orult-kigyasz-285-kigyot-kigyo-8YfIAjxwWGwT8HVQ',
'md5': '97a7af41faeaffd9f1fc864a7c7e7603',
Expand All @@ -44,6 +54,7 @@ class VideaIE(InfoExtractor):
'title': 'Az őrült kígyász 285 kígyót enged szabadon',
'thumbnail': r're:^https?://.*',
'duration': 21,
'age_limit': 0,
},
}, {
'url': 'http://videa.hu/videok/origo/jarmuvek/supercars-elozes-jAHDWfWSJH5XuFhH',
Expand All @@ -54,6 +65,7 @@ class VideaIE(InfoExtractor):
'title': 'Supercars előzés',
'thumbnail': r're:^https?://.*',
'duration': 64,
'age_limit': 0,
},
}, {
'url': 'http://videa.hu/player?v=8YfIAjxwWGwT8HVQ',
Expand All @@ -64,6 +76,7 @@ class VideaIE(InfoExtractor):
'title': 'Az őrült kígyász 285 kígyót enged szabadon',
'thumbnail': r're:^https?://.*',
'duration': 21,
'age_limit': 0,
},
}, {
'url': 'http://videa.hu/player/v/8YfIAjxwWGwT8HVQ?autoplay=1',
Expand All @@ -80,11 +93,14 @@ class VideaIE(InfoExtractor):
}]
_STATIC_SECRET = 'xHb0ZvME5q8CBcoQi6AngerDu3FGO9fkUlwPmLVY_RTzj2hJIS4NasXWKy1td7p'

@staticmethod
def _extract_urls(webpage):
return [url for _, url in re.findall(
r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//videa\.hu/player\?.*?\bv=.+?)\1',
webpage)]
@classmethod
def _extract_urls(cls, webpage):
    """Return every embedded player URL found in webpage by the patterns
    in cls._EMBED_REGEX (each pattern must define a named 'url' group).
    """
    return [
        match.group('url')
        for pattern in cls._EMBED_REGEX
        for match in re.finditer(pattern, webpage)
    ]

@staticmethod
def rc4(cipher_text, key):
Expand Down Expand Up @@ -130,13 +146,13 @@ def _real_extract(self, url):
for i in range(0, 32):
result += s[i - (self._STATIC_SECRET.index(l[i]) - 31)]

query = compat_urlparse.parse_qs(compat_urlparse.urlparse(player_url).query)
random_seed = ''.join(random.choice(string.ascii_letters + string.digits) for _ in range(8))
query = parse_qs(player_url)
random_seed = ''.join(compat_random_choices(string.ascii_letters + string.digits, k=8))
query['_s'] = random_seed
query['_t'] = result[:16]

b64_info, handle = self._download_webpage_handle(
'http://videa.hu/videaplayer_get_xml.php', video_id, query=query)
'http://videa.hu/player/xml', video_id, query=query)
if b64_info.startswith('<?xml'):
info = self._parse_xml(b64_info, video_id)
else:
Expand Down
4 changes: 2 additions & 2 deletions youtube_dl/extractor/vimeo.py
Expand Up @@ -673,8 +673,8 @@ def _real_extract(self, url):
raise

if '//player.vimeo.com/video/' in url:
config = self._parse_json(self._search_regex(
r'(?s)\b(?:playerC|c)onfig\s*=\s*({.+?})\s*[;\n]', webpage, 'info section'), video_id)
config = self._search_json(
r'\b(?:playerC|c)onfig\s*=', webpage, 'info section', video_id)
if config.get('view') == 4:
config = self._verify_player_video_password(
redirect_url, video_id, headers)
Expand Down

0 comments on commit d4250c8

Please sign in to comment.