Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Bugfix/ard #32688

Open
wants to merge 3 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
230 changes: 174 additions & 56 deletions youtube_dl/extractor/ard.py
Expand Up @@ -359,7 +359,9 @@ def _real_extract(self, url):


class ARDBetaMediathekIE(ARDMediathekBaseIE):
_VALID_URL = r'https://(?:(?:beta|www)\.)?ardmediathek\.de/(?:[^/]+/)?(?:player|live|video)/(?:[^/]+/)*(?P<id>Y3JpZDovL[a-zA-Z0-9]+)'

_VALID_URL = r'https://(?:(?:beta|www)\.)?ardmediathek\.de/(((?:[^/]+/)?(?:player|live|video|serie|sendung)/(?:[^/]+/)*(?P<id>Y3JpZDovL[a-zA-Z0-9]+))|(((?P<sender>[a-zA-Z0-9\-]+)([/]))?(?P<name>[a-zA-Z0-9\-]+)))'

Comment on lines +362 to +364
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
_VALID_URL = r'https://(?:(?:beta|www)\.)?ardmediathek\.de/(((?:[^/]+/)?(?:player|live|video|serie|sendung)/(?:[^/]+/)*(?P<id>Y3JpZDovL[a-zA-Z0-9]+))|(((?P<sender>[a-zA-Z0-9\-]+)([/]))?(?P<name>[a-zA-Z0-9\-]+)))'
_VALID_URL = r'''(?x)
https://(?:(?:beta|www)\.)?ardmediathek\.de/
(?:
(?:[^/#?]+/)?(?:player|live|video|serie|sendung)/(?:[^/#?]+/)*
(?P<id>Y3JpZDovL[a-zA-Z0-9]+)|
(?P<sender>[a-zA-Z0-9-]+)/)?(?P<name>[a-zA-Z0-9-]+)
)
'''

_TESTS = [{
'url': 'https://www.ardmediathek.de/mdr/video/die-robuste-roswita/Y3JpZDovL21kci5kZS9iZWl0cmFnL2Ntcy84MWMxN2MzZC0wMjkxLTRmMzUtODk4ZS0wYzhlOWQxODE2NGI/',
'md5': 'a1dc75a39c61601b980648f7c9f9f71d',
Expand Down Expand Up @@ -395,73 +397,189 @@ class ARDBetaMediathekIE(ARDMediathekBaseIE):
}, {
'url': 'https://www.ardmediathek.de/ard/player/Y3JpZDovL3dkci5kZS9CZWl0cmFnLWQ2NDJjYWEzLTMwZWYtNGI4NS1iMTI2LTU1N2UxYTcxOGIzOQ/tatort-duo-koeln-leipzig-ihr-kinderlein-kommet',
'only_matching': True,
}, {
'url': 'https://ardmediathek.de/sendung/saartalk/saartalk-gesellschaftsgift-haltung-gegen-hass/sr-fernsehen/Y3JpZDovL3NyLW9ubGluZS5kZS9TVF84MTY4MA/',
'only_matching': True,
}, {
'url': 'https://ardmediathek.de/serie/saartalk/saartalk-gesellschaftsgift-haltung-gegen-hass/sr-fernsehen/Y3JpZDovL3NyLW9ubGluZS5kZS9TVF84MTY4MA/',
'only_matching': True,
}, {
'url': 'https://www.ardmediathek.de/br/dahoam-is-dahoam',
'only_matching': True,
}]

def _real_extract(self, url):
video_id = self._match_id(url)
video_name = self._match_id(url, group_name='name')
sender = self._match_id(url, group_name='sender')
Comment on lines 412 to +414
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
video_id = self._match_id(url)
video_name = self._match_id(url, group_name='name')
sender = self._match_id(url, group_name='sender')
video_id, video_name, sender = self._match_valid_url(url).group('id, 'name', 'sender')


if '/serie/' in url or '/sendung/' in url:
return self._real_extract_serie(video_id)
elif 'none' != video_name.lower():
return self._real_extract_named_serie(video_name, sender if 'none' != sender.lower() else "ard")
else:
return self._real_extract_video(video_id)
Comment on lines +416 to +421
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The URL pattern matches either for video_id or for video_name, so that logic can be used. A non-matched group will be None rather than 'none'.

Suggested change
if '/serie/' in url or '/sendung/' in url:
return self._real_extract_serie(video_id)
elif 'none' != video_name.lower():
return self._real_extract_named_serie(video_name, sender if 'none' != sender.lower() else "ard")
else:
return self._real_extract_video(video_id)
if video_id is not None:
return self._real_extract_serie(video_id)
return self._real_extract_named_serie(video_name, sender if sender is not None else 'ard')

If video_name.lower() == 'none' is an actual possibility, add a test for it and raise ExtractorError( ..., expected=True) for that case. Or wrap the group in `(?!none...) in the pattern so that it doesn't match.


def _real_extract_video(self, video_id):

player_page = self._download_json(
'https://api.ardmediathek.de/public-gateway',
video_id, data=json.dumps({
'query': '''{
playerPage(client: "ard", clipId: "%s") {
blockedByFsk
broadcastedOn
maturityContentRating
mediaCollection {
_duration
_geoblocked
_isLive
_mediaArray {
_mediaStreamArray {
_quality
_server
_stream
}
}
_previewImage
_subtitleUrl
_type
}
show {
title
}
synopsis
title
tracking {
atiCustomVars {
contentId
}
}
}
}''' % video_id,
}).encode(), headers={
'Content-Type': 'application/json'
})['data']['playerPage']
f'https://api.ardmediathek.de/page-gateway/pages/ard/item/{video_id}',
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For Py2 (could use .format(video_id) but this style is used elsewhere):

Suggested change
f'https://api.ardmediathek.de/page-gateway/pages/ard/item/{video_id}',
'https://api.ardmediathek.de/page-gateway/pages/ard/item/' + video_id,

video_id
)

title = player_page['title']
content_id = str_or_none(try_get(
player_page, lambda x: x['tracking']['atiCustomVars']['contentId']))
media_collection = player_page.get('mediaCollection') or {}
content_id = str_or_none(
try_get(
player_page, lambda x: x['tracking']['atiCustomVars']['contentId']
)
)
Comment on lines +431 to +435
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Revert reformat:

Suggested change
content_id = str_or_none(
try_get(
player_page, lambda x: x['tracking']['atiCustomVars']['contentId']
)
)
content_id = str_or_none(try_get(
player_page, lambda x: x['tracking']['atiCustomVars']['contentId']))

media_collection = player_page['widgets'][0].get('mediaCollection') or {}
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Does player_page always have ['widgets'] or might the ['mediaCollection'] sometimes be found there as before? Also, don't crash if the path to the dict with (possibly) ['mediaCollection'] is absent:

Suggested change
media_collection = player_page['widgets'][0].get('mediaCollection') or {}
media_collection = traverse_obj(player_page, ('widgets', 0, 'mediaCollection', T(dict)) or {}

Or if we should look in both places:

Suggested change
media_collection = player_page['widgets'][0].get('mediaCollection') or {}
media_collection = traverse_obj(player_page, ((('widgets', 0), None), 'mediaCollection', T(dict)) or {}

Import traverse_obj() and T() from ..utils.

Later, we might replace uses of try_get() with traverse_obj() to reduce the imports.

if not media_collection and content_id:
media_collection = self._download_json(
'https://www.ardmediathek.de/play/media/' + content_id,
content_id, fatal=False) or {}
'https://www.ardmediathek.de/play/media/' + content_id,
content_id, fatal=False
) or {}

info = self._parse_media_info(
media_collection, content_id or video_id,
player_page.get('blockedByFsk'))
media_collection['embedded'], content_id or video_id,
player_page['widgets'][0].get('blockedByFsk')
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
player_page['widgets'][0].get('blockedByFsk')
traverse_obj(player_page, ('widgets', 0, 'blockedByFsk'))

Or as above

Suggested change
player_page['widgets'][0].get('blockedByFsk')
traverse_obj(player_page, ((('widgets', 0), None), 'blockedByFsk'))

)

age_limit = None
description = player_page.get('synopsis')
maturity_content_rating = player_page.get('maturityContentRating')
description = player_page['widgets'][0].get('synopsis')
maturity_content_rating = player_page['widgets'][0].get('maturityContentRating')
if maturity_content_rating:
age_limit = int_or_none(maturity_content_rating.lstrip('FSK'))
if not age_limit and description:
age_limit = int_or_none(self._search_regex(
r'\(FSK\s*(\d+)\)\s*$', description, 'age limit', default=None))
info.update({
'age_limit': age_limit,
'title': title,
'description': description,
'timestamp': unified_timestamp(player_page.get('broadcastedOn')),
'series': try_get(player_page, lambda x: x['show']['title']),
})
age_limit = int_or_none(
self._search_regex(
r'\(FSK\s*(\d+)\)\s*$', description, 'age limit', default=None
)
)
Comment on lines 448 to +458
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
age_limit = None
description = player_page.get('synopsis')
maturity_content_rating = player_page.get('maturityContentRating')
description = player_page['widgets'][0].get('synopsis')
maturity_content_rating = player_page['widgets'][0].get('maturityContentRating')
if maturity_content_rating:
age_limit = int_or_none(maturity_content_rating.lstrip('FSK'))
if not age_limit and description:
age_limit = int_or_none(self._search_regex(
r'\(FSK\s*(\d+)\)\s*$', description, 'age limit', default=None))
info.update({
'age_limit': age_limit,
'title': title,
'description': description,
'timestamp': unified_timestamp(player_page.get('broadcastedOn')),
'series': try_get(player_page, lambda x: x['show']['title']),
})
age_limit = int_or_none(
self._search_regex(
r'\(FSK\s*(\d+)\)\s*$', description, 'age limit', default=None
)
)
info.update(traverse_obj(player_page, {
'title': title,
'description': ('widgets', 0, 'synopsis', T(txt_or_none)),
'age_limit': ('widgets', 0, 'maturityContentRating',
T(lambda s: int(s.strip().split('FSK')[1].strip())),
}))
if info.get('age_limit') is None:
info['age_limit'] = int_or_none(self._search_regex(
r'\(FSK\s*(\d+)\)\s*$', info.get('description', ''), 'age limit',
default=None)

Replace 'widgets', 0 with (('widgets', 0), None) if appropriate.

I added title here to avoid setting it later.


session_episode_match = re.search(r"\(S(\d+)/E(\d+)\)", title)
episode_match = re.search(r"\((\d+)\)", title)
episode_name_match = re.search(r"(Folge\s\d+)", title)

if session_episode_match:
season_number = session_episode_match.group(1)
episode_number = session_episode_match.group(2)

alt_title = re.sub(r"\(S\d+/E\d+\)", "", title)
alt_title = re.sub(r"(Folge \d+(\:|\s\-))", "", alt_title)
alt_title = alt_title.replace("|", "").replace(" ", " ").replace(" .", "").strip()

info.update(
{
'season_number': int(season_number),
'episode_number': int(episode_number),
'alt_title': alt_title
}
)
elif episode_name_match:
episode_number = episode_name_match.group(1).replace("Folge ", "")

alt_title = re.sub(r"(Folge\s\d+)", "", title)
alt_title = alt_title.replace("|", "").replace(" ", " ").replace(" .", "").strip()

info.update(
{
'season_number': int(0),
'episode_number': int(episode_number),
'alt_title': alt_title
}
)

elif episode_match:
episode_number = episode_match.group(1)
info.update(
{
'season_number': int(0),
'episode_number': int(episode_number),
'alt_title':re.sub(r"\(\d+\)", "", title).replace(" ", "").strip()
}
)
else:
info.update(
{
'alt_title': title
}
)
Comment on lines +460 to +507
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Surely we can simplify this.

Eg (NB if we don't care about the season/episode extraction use default=(None, None) instead of fatal=False and skip the or ...):

Suggested change
session_episode_match = re.search(r"\(S(\d+)/E(\d+)\)", title)
episode_match = re.search(r"\((\d+)\)", title)
episode_name_match = re.search(r"(Folge\s\d+)", title)
if session_episode_match:
season_number = session_episode_match.group(1)
episode_number = session_episode_match.group(2)
alt_title = re.sub(r"\(S\d+/E\d+\)", "", title)
alt_title = re.sub(r"(Folge \d+(\:|\s\-))", "", alt_title)
alt_title = alt_title.replace("|", "").replace(" ", " ").replace(" .", "").strip()
info.update(
{
'season_number': int(season_number),
'episode_number': int(episode_number),
'alt_title': alt_title
}
)
elif episode_name_match:
episode_number = episode_name_match.group(1).replace("Folge ", "")
alt_title = re.sub(r"(Folge\s\d+)", "", title)
alt_title = alt_title.replace("|", "").replace(" ", " ").replace(" .", "").strip()
info.update(
{
'season_number': int(0),
'episode_number': int(episode_number),
'alt_title': alt_title
}
)
elif episode_match:
episode_number = episode_match.group(1)
info.update(
{
'season_number': int(0),
'episode_number': int(episode_number),
'alt_title':re.sub(r"\(\d+\)", "", title).replace(" ", "").strip()
}
)
else:
info.update(
{
'alt_title': title
}
)
season_number, episode_number = (
int_or_none(n) for n in self._search_regex(
title,
r'(?:(?P<s_e>\()|\bFolge\s+)(?:S(?P<s>\d+)/E)?(?P<e>\d+)(?(s_e)\))',
'season/episode', group=('s', 'e'), fatal=False) or (None, None)))
info.update({
'season_number': season_number or 0,
'episode_number': episode_number,
'alt_title': re.sub(
r'\(S\d+/E\d+\)|Folge\s+\d+(?::|\s+-)|\||\s+\.', '',
title).replace(" ", " ").strip(),
})

Is it right for season_number to default to 0 rather than 1 or None?


info.update(
{
'age_limit': age_limit,
'title': title,
'description': description,
'timestamp': unified_timestamp(player_page['widgets'][0].get('broadcastedOn')),
'series': try_get(player_page['widgets'][0], lambda x: x['show']['title']),
'channel': player_page['widgets'][0]['publicationService']['name'],
'channel_id': player_page['widgets'][0]['publicationService']['id'],
'channel_url': f"https://www.ardmediathek.de/{player_page['widgets'][0]['publicationService']['id']}",
}
)
Comment on lines +509 to +520
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Now this can be "simpler" too

Suggested change
info.update(
{
'age_limit': age_limit,
'title': title,
'description': description,
'timestamp': unified_timestamp(player_page['widgets'][0].get('broadcastedOn')),
'series': try_get(player_page['widgets'][0], lambda x: x['show']['title']),
'channel': player_page['widgets'][0]['publicationService']['name'],
'channel_id': player_page['widgets'][0]['publicationService']['id'],
'channel_url': f"https://www.ardmediathek.de/{player_page['widgets'][0]['publicationService']['id']}",
}
)
info.update(traverse_obj(player_page, {
'timestamp': ('widgets', 0, 'broadcastedOn', T(unified_timestamp)),
'series': ('widgets', 0, 'show', 'title', T(txt_or_none)),
'channel': ('widgets', 0, 'publicationService', 'name', T(txt_or_none)),
'channel_id': ('widgets', 0, 'publicationService', 'id', T(txt_or_none)),
})
if info.get('channel_id'):
info['channel_url'] = 'https://www.ardmediathek.de/' + info['channel_id'],

Should the ['channel_url'] use that domain even if the original URL had a different domain?

Also see other comments on 'widgets', 0.

return info

def _real_extract_serie(self, video_id):
entries = []

page_number = 0
page_size = 100

while True:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Make the loop condition explicit:

Suggested change
while True:
total = traverse_obj(widgets, ('pagination', 'totalElements', T(int))) or 0
while page_number * page_size <= total:


widgets = self._download_json(
f'https://api.ardmediathek.de/page-gateway/widgets/ard/asset/{video_id}',
video_id,
query={'pageSize':str(page_size), 'pageNumber':page_number}
Comment on lines +532 to +534
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Py2, and no need to cast the numeric dict value:

Suggested change
f'https://api.ardmediathek.de/page-gateway/widgets/ard/asset/{video_id}',
video_id,
query={'pageSize':str(page_size), 'pageNumber':page_number}
'https://api.ardmediathek.de/page-gateway/widgets/ard/asset/' + video_id,
video_id, query={'pageSize': page_size, 'pageNumber': page_number}

)

for teaser in widgets['teasers']:
if 'EPISODE' == teaser['coreAssetType']:
item = self._real_extract_video(teaser['id'])
item['webpage_url'] = f"https://www.ardmediathek.de/video/{teaser['id']}"

entries.append(item)
Comment on lines +537 to +542
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
for teaser in widgets['teasers']:
if 'EPISODE' == teaser['coreAssetType']:
item = self._real_extract_video(teaser['id'])
item['webpage_url'] = f"https://www.ardmediathek.de/video/{teaser['id']}"
entries.append(item)
def mk_teaser(t_id):
t = self._real_extract_video(t_id)
t['webpage_url'] = 'https://www.ardmediathek.de/video/' + t_id
return t
entries.extend(traverse_obj(widgets, (
'teasers', lambda _, v: 'EPISODE' == v['coreAssetType'], 'id',
T(mk_teaser))))

Actually, mk_teaser()can be used later, so make it a method (T(self._mk_teaser)):

        def _mk_teaser(self, t_id):
            t = self._real_extract_video(t_id)
            t['webpage_url'] = 'https://www.ardmediathek.de/video/' + t_id
            return t

This could raise if _real_extract_video() doesn't return a dict-like object but it's going to be used in a try: context as part of a traversal path.


total = widgets['pagination']['totalElements']
if (page_number + 1) * page_size > total:
break
Comment on lines +544 to +546
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No longer needed:

Suggested change
total = widgets['pagination']['totalElements']
if (page_number + 1) * page_size > total:
break

page_number += 1

return self.playlist_result(entries)

def _real_extract_named_serie(self, video_id, sender):

entries = []
page_size = 100

widgets = self._download_json(
f'https://api.ardmediathek.de/page-gateway/pages/{sender}/editorial/{video_id}',
video_id,
query={'pageSize': str(10), 'pageNumber': 0}
)['widgets']

for widget in widgets:
widget_id = widget['id']
Comment on lines +556 to +563
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
widgets = self._download_json(
f'https://api.ardmediathek.de/page-gateway/pages/{sender}/editorial/{video_id}',
video_id,
query={'pageSize': str(10), 'pageNumber': 0}
)['widgets']
for widget in widgets:
widget_id = widget['id']
widgets = self._download_json(
'https://api.ardmediathek.de/page-gateway/pages/{0}/editorial/{1}'.format(sender, video_id),
video_id, query={'pageSize': 10, 'pageNumber': 0}
)
for widget_id in traverse_obj(widgets, ('widgets', Ellipsis, 'id')):

page_number = 0

while True:
page_data = self._download_json(
f'https://api.ardmediathek.de/page-gateway/widgets/{sender}/editorials/{widget_id}',
video_id,
query={'pageSize': page_size, 'pageNumber': page_number}
Comment on lines +566 to +570
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Make loop condition explicit:

Suggested change
while True:
page_data = self._download_json(
f'https://api.ardmediathek.de/page-gateway/widgets/{sender}/editorials/{widget_id}',
video_id,
query={'pageSize': page_size, 'pageNumber': page_number}
total = 0
while page_number * page_size <= total:
page_data = self._download_json(
'https://api.ardmediathek.de/page-gateway/widgets/{0}/editorials/{1}'.format(sender, widget_id),
video_id, query={'pageSize': page_size, 'pageNumber': page_number}

)

for teaser in page_data['teasers']:
if 'EPISODE' == teaser.get('coreAssetType', None) and teaser['type'] not in ['poster'] and ':' not in teaser['id']:

item = self._real_extract_video(teaser['id'])
item['webpage_url'] = f"https://www.ardmediathek.de/video/{teaser['id']}"
entries.append(item)

total = page_data['pagination']['totalElements']
if (page_number + 1) * page_size > total:
break
Comment on lines +573 to +582
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

And as before:

Suggested change
for teaser in page_data['teasers']:
if 'EPISODE' == teaser.get('coreAssetType', None) and teaser['type'] not in ['poster'] and ':' not in teaser['id']:
item = self._real_extract_video(teaser['id'])
item['webpage_url'] = f"https://www.ardmediathek.de/video/{teaser['id']}"
entries.append(item)
total = page_data['pagination']['totalElements']
if (page_number + 1) * page_size > total:
break
entries.extend(traverse_obj(page_data, (
'teasers', lambda _, v: 'EPISODE' == ['coreAssetType'] and v.get('type') != 'poster' and ':' not in v['id'],
'id', T(self._mk_teaser))))
total = traverse_obj(page_data, (
'pagination', 'totalElements', T(int))) or 0

page_number += 1

return self.playlist_result(entries)
4 changes: 2 additions & 2 deletions youtube_dl/extractor/common.py
Expand Up @@ -448,10 +448,10 @@ def suitable(cls, url):
return cls.__match_valid_url(url) is not None

@classmethod
def _match_id(cls, url):
def _match_id(cls, url, group_name = 'id'):
m = cls.__match_valid_url(url)
assert m
return compat_str(m.group('id'))
return compat_str(m.group(group_name))
Comment on lines -451 to +454
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Pls revert these and use _match_valid_url() in the extractor, as suggested above.


@classmethod
def working(cls):
Expand Down