Skip to content

Commit

Permalink
Merge branch 'ytdl-org:master' into fix-npo-support
Browse files Browse the repository at this point in the history
  • Loading branch information
bartbroere committed Mar 12, 2024
2 parents 58d7a00 + a96a45b commit d4250c8
Show file tree
Hide file tree
Showing 8 changed files with 342 additions and 199 deletions.
16 changes: 16 additions & 0 deletions test/test_utils.py
Expand Up @@ -81,6 +81,7 @@
sanitize_filename,
sanitize_path,
sanitize_url,
sanitized_Request,
shell_quote,
smuggle_url,
str_or_none,
Expand Down Expand Up @@ -255,6 +256,18 @@ def test_sanitize_url(self):
self.assertEqual(sanitize_url('https://foo.bar'), 'https://foo.bar')
self.assertEqual(sanitize_url('foo bar'), 'foo bar')

def test_sanitized_Request(self):
    """sanitized_Request must map URL userinfo to an HTTP Basic Authorization header."""
    # no userinfo in the URL -> no Authorization header at all
    for url_without_auth in ('http://foo.bar', 'http://:foo.bar'):
        self.assertFalse(
            sanitized_Request(url_without_auth).has_header('Authorization'))
    # userinfo present -> header carries base64('user:pass'), with either
    # component allowed to be empty
    expectations = (
        ('http://@foo.bar', 'Basic Og=='),
        ('http://:pass@foo.bar', 'Basic OnBhc3M='),
        ('http://user:@foo.bar', 'Basic dXNlcjo='),
        ('http://user:pass@foo.bar', 'Basic dXNlcjpwYXNz'),
    )
    for url_with_auth, expected_header in expectations:
        self.assertEqual(
            sanitized_Request(url_with_auth).get_header('Authorization'),
            expected_header)

def test_expand_path(self):
def env(var):
return '%{0}%'.format(var) if sys.platform == 'win32' else '${0}'.format(var)
Expand Down Expand Up @@ -512,11 +525,14 @@ def test_float_or_none(self):
self.assertEqual(float_or_none(set()), None)

def test_int_or_none(self):
    """int_or_none: int()-like conversion that yields None for unconvertible input."""
    self.assertEqual(int_or_none(42), 42)
    self.assertEqual(int_or_none('42'), 42)
    self.assertEqual(int_or_none(''), None)
    self.assertEqual(int_or_none(None), None)
    self.assertEqual(int_or_none([]), None)
    self.assertEqual(int_or_none(set()), None)
    self.assertEqual(int_or_none('42', base=8), 34)
    # assertRaises needs the callable and its arguments passed separately;
    # writing int_or_none(42, base=8) inline would raise the TypeError
    # before assertRaises could catch it, erroring the test instead of
    # passing it
    self.assertRaises(TypeError, int_or_none, 42, base=8)

def test_str_to_int(self):
self.assertEqual(str_to_int('123,456'), 123456)
Expand Down
110 changes: 77 additions & 33 deletions youtube_dl/extractor/common.py
Expand Up @@ -25,6 +25,7 @@
compat_getpass,
compat_integer_types,
compat_http_client,
compat_kwargs,
compat_map as map,
compat_open as open,
compat_os_name,
Expand Down Expand Up @@ -1102,6 +1103,60 @@ def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, f
self._downloader.report_warning('unable to extract %s' % _name + bug_reports_message())
return None

def _search_json(self, start_pattern, string, name, video_id, **kwargs):
    r"""Searches string for the JSON object specified by start_pattern.

    Keyword-only options (popped from **kwargs for Py2 compatibility):
        end_pattern      -- regex expected after the JSON text (default '')
        contains_pattern -- regex the JSON text must match
                            (default r'{[\s\S]+}')
        fatal            -- raise ExtractorError on failure (default True)
        default          -- value returned on failure; supplying any
                            default implies fatal=False
    Remaining kwargs (e.g. transform_source) are forwarded to _parse_json().
    Returns the parsed object, or `default` ({} when none was given) on a
    non-fatal failure.
    """

    # keyword-only signature being emulated:
    # self, start_pattern, string, name, video_id, *, end_pattern='',
    # contains_pattern=r'{(?s:.+)}', fatal=True, default=NO_DEFAULT
    # NB: end_pattern is only used to reduce the size of the initial match
    end_pattern = kwargs.pop('end_pattern', '')
    # (?:[\s\S]) simulates (?(s):.) (eg)
    contains_pattern = kwargs.pop('contains_pattern', r'{[\s\S]+}')
    fatal = kwargs.pop('fatal', True)
    default = kwargs.pop('default', NO_DEFAULT)

    # an explicit default switches off fatal; with no default the
    # fallback result is an empty dict
    if default is NO_DEFAULT:
        default, has_default = {}, False
    else:
        fatal, has_default = False, True

    # grab the candidate JSON text between start_pattern and end_pattern
    json_string = self._search_regex(
        r'(?:{0})\s*(?P<json>{1})\s*(?:{2})'.format(
            start_pattern, contains_pattern, end_pattern),
        string, name, group='json', fatal=fatal, default=None if has_default else NO_DEFAULT)
    if not json_string:
        return default

    # yt-dlp has a special JSON parser that allows trailing text.
    # Until that arrives here, the diagnostic from the exception
    # raised by json.loads() is used to extract the wanted text.
    # Either way, it's a problem if a transform_source() can't
    # handle the trailing text.

    # force an exception
    kwargs['fatal'] = True

    # self._downloader._format_err(name, self._downloader.Styles.EMPHASIS)
    # at most two passes: parse the match as-is, then once more after
    # truncating at the '(char N)' offset reported by the JSON error,
    # which drops any trailing non-JSON text
    for _ in range(2):
        try:
            # return self._parse_json(json_string, video_id, ignore_extra=True, **kwargs)
            transform_source = kwargs.pop('transform_source', None)
            if transform_source:
                json_string = transform_source(json_string)
            return self._parse_json(json_string, video_id, **compat_kwargs(kwargs))
        except ExtractorError as e:
            # pull the failing character offset out of the error message
            end = int_or_none(self._search_regex(r'\(char\s+(\d+)', error_to_compat_str(e), 'end', default=None))
            if end is not None:
                json_string = json_string[:end]
                continue
            msg = 'Unable to extract {0} - Failed to parse JSON'.format(name)
            if fatal:
                raise ExtractorError(msg, cause=e.cause, video_id=video_id)
            elif not has_default:
                self.report_warning(
                    '{0}: {1}'.format(msg, error_to_compat_str(e)), video_id=video_id)
            return default

def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
"""
Like _search_regex, but strips HTML tags and unescapes entities.
Expand Down Expand Up @@ -2966,25 +3021,22 @@ def manifest_url(manifest):
return formats

def _find_jwplayer_data(self, webpage, video_id=None, transform_source=js_to_json):
    """Locate a jwplayer(...).setup(...) call in webpage and return its
    parsed options, or None when no usable JWPlayer data is found.
    """
    # NOTE(review): the two sections below look like both sides of a merged
    # diff (the old re.search()-based lookup followed by the new
    # _search_json()-based one) rendered into one body -- confirm against
    # upstream; only one of them should remain.
    mobj = re.search(
        r'''(?s)jwplayer\s*\(\s*(?P<q>'|")(?!(?P=q)).+(?P=q)\s*\)(?!</script>).*?\.\s*setup\s*\(\s*(?P<options>(?:\([^)]*\)|[^)])+)\s*\)''',
        webpage)
    if mobj:
        try:
            jwplayer_data = self._parse_json(mobj.group('options'),
                                             video_id=video_id,
                                             transform_source=transform_source)
        except ExtractorError:
            # a parse failure falls through to the _search_json() path below
            pass
        else:
            if isinstance(jwplayer_data, dict):
                return jwplayer_data
    # newer lookup: also matches .load([...]) calls via the (?P<load>...)
    # group, which the conditional patterns below key on
    return self._search_json(
        r'''(?<!-)\bjwplayer\s*\(\s*(?P<q>'|")(?!(?P=q)).+(?P=q)\s*\)(?:(?!</script>).)*?\.\s*(?:setup\s*\(|(?P<load>load)\s*\(\s*\[)''',
        webpage, 'JWPlayer data', video_id,
        # must be a {...} or sequence, ending
        contains_pattern=r'\{[\s\S]*}(?(load)(?:\s*,\s*\{[\s\S]*})*)', end_pattern=r'(?(load)\]|\))',
        transform_source=transform_source, default=None)

def _extract_jwplayer_data(self, webpage, video_id, *args, **kwargs):
    """Find JWPlayer data in webpage and parse it into info-dict results."""
    jwplayer_data = self._find_jwplayer_data(
        webpage, video_id, transform_source=js_to_json)
    return self._parse_jwplayer_data(
        jwplayer_data, video_id, *args, **kwargs)

    # NOTE(review): everything below the return above is unreachable; it
    # appears to be the replacement implementation from a merged diff
    # rendered alongside the removed one -- confirm against upstream.
    # allow passing `transform_source` through to _find_jwplayer_data()
    transform_source = kwargs.pop('transform_source', None)
    kwfind = compat_kwargs({'transform_source': transform_source}) if transform_source else {}

    jwplayer_data = self._find_jwplayer_data(webpage, video_id, **kwfind)

    return self._parse_jwplayer_data(jwplayer_data, video_id, *args, **kwargs)

def _parse_jwplayer_data(self, jwplayer_data, video_id=None, require_title=True,
m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
Expand Down Expand Up @@ -3018,22 +3070,14 @@ def _parse_jwplayer_data(self, jwplayer_data, video_id=None, require_title=True,
mpd_id=mpd_id, rtmp_params=rtmp_params, base_url=base_url)

subtitles = {}
tracks = video_data.get('tracks')
if tracks and isinstance(tracks, list):
for track in tracks:
if not isinstance(track, dict):
continue
track_kind = track.get('kind')
if not track_kind or not isinstance(track_kind, compat_str):
continue
if track_kind.lower() not in ('captions', 'subtitles'):
continue
track_url = urljoin(base_url, track.get('file'))
if not track_url:
continue
subtitles.setdefault(track.get('label') or 'en', []).append({
'url': self._proto_relative_url(track_url)
})
for track in traverse_obj(video_data, (
'tracks', lambda _, t: t.get('kind').lower() in ('captions', 'subtitles'))):
track_url = urljoin(base_url, track.get('file'))
if not track_url:
continue
subtitles.setdefault(track.get('label') or 'en', []).append({
'url': self._proto_relative_url(track_url)
})

entry = {
'id': this_video_id,
Expand Down
1 change: 0 additions & 1 deletion youtube_dl/extractor/extractors.py
Expand Up @@ -383,7 +383,6 @@
FC2EmbedIE,
)
from .fczenit import FczenitIE
from .filemoon import FileMoonIE
from .fifa import FifaIE
from .filmon import (
FilmOnIE,
Expand Down
43 changes: 0 additions & 43 deletions youtube_dl/extractor/filemoon.py

This file was deleted.

44 changes: 30 additions & 14 deletions youtube_dl/extractor/videa.py
Expand Up @@ -6,22 +6,31 @@
import string

from .common import InfoExtractor
from ..compat import (
compat_b64decode,
compat_ord,
compat_struct_pack,
)
from ..utils import (
ExtractorError,
int_or_none,
mimetype2ext,
parse_codecs,
parse_qs,
update_url_query,
urljoin,
xpath_element,
xpath_text,
)
from ..compat import (
compat_b64decode,
compat_ord,
compat_struct_pack,
compat_urlparse,
)


def compat_random_choices(population, *args, **kwargs):
    """Limited stand-in for random.choices() on Pythons that lack it.

    Emulates the signature random.choices(population, weights=None, *,
    cum_weights=None, k=1) but only supports unweighted selection, and
    returns the k picks joined into a single string.
    """
    # weights may arrive positionally or by keyword; cum_weights only by
    # keyword -- neither is actually implemented here
    if args:
        weights = args[0]
    else:
        weights = kwargs.get('weights')
    cum_weights = kwargs.get('cum_weights')
    assert weights is None and cum_weights is None
    draw_count = kwargs.get('k', 1)
    picks = []
    for _ in range(draw_count):
        picks.append(random.choice(population))
    return ''.join(picks)


class VideaIE(InfoExtractor):
Expand All @@ -35,6 +44,7 @@ class VideaIE(InfoExtractor):
)
(?P<id>[^?#&]+)
'''
_EMBED_REGEX = [r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//videa\.hu/player\?.*?\bv=.+?)\1']
_TESTS = [{
'url': 'http://videa.hu/videok/allatok/az-orult-kigyasz-285-kigyot-kigyo-8YfIAjxwWGwT8HVQ',
'md5': '97a7af41faeaffd9f1fc864a7c7e7603',
Expand All @@ -44,6 +54,7 @@ class VideaIE(InfoExtractor):
'title': 'Az őrült kígyász 285 kígyót enged szabadon',
'thumbnail': r're:^https?://.*',
'duration': 21,
'age_limit': 0,
},
}, {
'url': 'http://videa.hu/videok/origo/jarmuvek/supercars-elozes-jAHDWfWSJH5XuFhH',
Expand All @@ -54,6 +65,7 @@ class VideaIE(InfoExtractor):
'title': 'Supercars előzés',
'thumbnail': r're:^https?://.*',
'duration': 64,
'age_limit': 0,
},
}, {
'url': 'http://videa.hu/player?v=8YfIAjxwWGwT8HVQ',
Expand All @@ -64,6 +76,7 @@ class VideaIE(InfoExtractor):
'title': 'Az őrült kígyász 285 kígyót enged szabadon',
'thumbnail': r're:^https?://.*',
'duration': 21,
'age_limit': 0,
},
}, {
'url': 'http://videa.hu/player/v/8YfIAjxwWGwT8HVQ?autoplay=1',
Expand All @@ -80,11 +93,14 @@ class VideaIE(InfoExtractor):
}]
_STATIC_SECRET = 'xHb0ZvME5q8CBcoQi6AngerDu3FGO9fkUlwPmLVY_RTzj2hJIS4NasXWKy1td7p'

@staticmethod
def _extract_urls(webpage):
return [url for _, url in re.findall(
r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//videa\.hu/player\?.*?\bv=.+?)\1',
webpage)]
@classmethod
def _extract_urls(cls, webpage):
    """Return every embedded player URL found in webpage by the patterns
    in cls._EMBED_REGEX (each pattern must define a named 'url' group).
    """
    return [
        match.group('url')
        for pattern in cls._EMBED_REGEX
        for match in re.finditer(pattern, webpage)
    ]

@staticmethod
def rc4(cipher_text, key):
Expand Down Expand Up @@ -130,13 +146,13 @@ def _real_extract(self, url):
for i in range(0, 32):
result += s[i - (self._STATIC_SECRET.index(l[i]) - 31)]

query = compat_urlparse.parse_qs(compat_urlparse.urlparse(player_url).query)
random_seed = ''.join(random.choice(string.ascii_letters + string.digits) for _ in range(8))
query = parse_qs(player_url)
random_seed = ''.join(compat_random_choices(string.ascii_letters + string.digits, k=8))
query['_s'] = random_seed
query['_t'] = result[:16]

b64_info, handle = self._download_webpage_handle(
'http://videa.hu/videaplayer_get_xml.php', video_id, query=query)
'http://videa.hu/player/xml', video_id, query=query)
if b64_info.startswith('<?xml'):
info = self._parse_xml(b64_info, video_id)
else:
Expand Down
4 changes: 2 additions & 2 deletions youtube_dl/extractor/vimeo.py
Expand Up @@ -673,8 +673,8 @@ def _real_extract(self, url):
raise

if '//player.vimeo.com/video/' in url:
config = self._parse_json(self._search_regex(
r'(?s)\b(?:playerC|c)onfig\s*=\s*({.+?})\s*[;\n]', webpage, 'info section'), video_id)
config = self._search_json(
r'\b(?:playerC|c)onfig\s*=', webpage, 'info section', video_id)
if config.get('view') == 4:
config = self._verify_player_video_password(
redirect_url, video_id, headers)
Expand Down

0 comments on commit d4250c8

Please sign in to comment.