Fix npo support #31976

Open

bartbroere wants to merge 39 commits into ytdl-org:master from fix-npo-support

Changes from 12 commits

Commits (39)
3b31478  Fix support for NPO downloads (bartbroere, Mar 31, 2023)
b4776f2  Import from compat (bartbroere, Mar 31, 2023)
fb2b4e2  Add line comment (bartbroere, Mar 31, 2023)
9e1acb2  Fix flake8 (bartbroere, Mar 31, 2023)
6328978  Accept suggestions on PR; comply with conventions (bartbroere, Apr 3, 2023)
0c7261d  Update npo.py (dirkf, Apr 6, 2023)
c409a8c  Merge branch 'ytdl-org:master' into fix-npo-support (bartbroere, Feb 25, 2024)
f76d58c  Skip a test (bartbroere, Feb 26, 2024)
da3d1f4  Add notes on new npo.nl site (bartbroere, Mar 1, 2024)
5773681  Fix token URL (bartbroere, Mar 1, 2024)
29724e7  Delete all broken extractors (bartbroere, Mar 1, 2024)
21eb451  Convert the description into code (bartbroere, Mar 1, 2024)
0dc7d95  Comply with coding conventions a bit more (bartbroere, Mar 1, 2024)
fb7b717  Speculate about other ways of getting productId (bartbroere, Mar 1, 2024)
f9e59b0  Add the possibility to add 'hls' later (bartbroere, Mar 1, 2024)
8b1a7d9  Use provided util (bartbroere, Mar 1, 2024)
34b5b20  Refactor into reusable method (bartbroere, Mar 3, 2024)
4fc4238  Fix lint (bartbroere, Mar 5, 2024)
28ba01f  Add Ongehoord Nederland and test URL for BNNVARA (bartbroere, Mar 5, 2024)
eb6e396  First version of a VPRO regex (bartbroere, Mar 5, 2024)
d36d50f  Re-add Zapp (bartbroere, Mar 5, 2024)
d426a92  Encoding suggestion from PR (bartbroere, Mar 5, 2024)
3b3d73c  Use program-detail endpoint and remove a test (bartbroere, Mar 6, 2024)
4b24e5f  Re-add SchoolTV (bartbroere, Mar 6, 2024)
681b390  Fix flake8 and better error reporting (bartbroere, Mar 6, 2024)
159f825  Add scaffolding for last few extractors and change order so the PR di… (bartbroere, Mar 6, 2024)
0cbcd1a  Make diff better (bartbroere, Mar 6, 2024)
0ab79c3  Reusable code for two NTR sites (bartbroere, Mar 7, 2024)
c08f29f  Update unit tests (bartbroere, Mar 10, 2024)
28624cf  Work work (bartbroere, Mar 10, 2024)
1ca4e68  Add an MD5 (bartbroere, Mar 10, 2024)
4398f68  Fix zapp extractor (bartbroere, Mar 11, 2024)
58d7a00  Resolve some of the pull request feedback (bartbroere, Mar 11, 2024)
d4250c8  Merge branch 'ytdl-org:master' into fix-npo-support (bartbroere, Mar 12, 2024)
ad64f37  Improve regex (bartbroere, Mar 14, 2024)
bc86c5f  Make regex more specific and remove redundant .* (bartbroere, Mar 14, 2024)
4c90b2f  Adhere to code style (bartbroere, Mar 14, 2024)
007bbea  Remove afspelen and trailing slashes with one regex (bartbroere, Mar 14, 2024)
a60972e  Fix indent from suggestion (bartbroere, Mar 15, 2024)
11 changes: 10 additions & 1 deletion youtube_dl/extractor/extractors.py
@@ -847,7 +847,16 @@
     NownessSeriesIE,
 )
 from .noz import NozIE
-from .npo import NPOIE
+from .npo import (
+    AndereTijdenIE,
+    BNNVaraIE,
+    NPOIE,
+    ONIE,
+    SchoolTVIE,
+    HetKlokhuisIE,
+    VPROIE,
+    WNLIE,
+)
 from .npr import NprIE
 from .nrk import (
     NRKIE,
268 changes: 211 additions & 57 deletions youtube_dl/extractor/npo.py
@@ -1,43 +1,21 @@
# coding: utf-8
from __future__ import unicode_literals

import json
import re

from .common import InfoExtractor
from ..utils import (
ExtractorError,
)
from ..utils import ExtractorError


class NPOIE(InfoExtractor):
IE_NAME = 'npo'
IE_DESC = 'npo.nl'
_VALID_URL = r'''(?x)
(?:
npo:|
https?://
(?:www\.)?
(?:
npo\.nl/(?:[^/]+/)*
)
)
(?P<id>[^/?#]+)
'''
_VALID_URL = r'https?://(?:www\.)?npo\.nl/.*'

_TESTS = [{
'url': 'https://npo.nl/start/serie/zembla/seizoen-2015/wie-is-de-mol-2/',
# TODO fill in other test attributes
}, {
'url': 'http://www.npo.nl/de-mega-mike-mega-thomas-show/27-02-2009/VARA_101191800',
'md5': 'da50a5787dbfc1603c4ad80f31c5120b',
'info_dict': {
'id': 'VARA_101191800',
'ext': 'm4v',
'title': 'De Mega Mike & Mega Thomas show: The best of.',
'description': 'md5:3b74c97fc9d6901d5a665aac0e5400f4',
'upload_date': '20090227',
'duration': 2400,
},
'skip': 'Video gone',
}, {
'url': 'https://npo.nl/start/serie/vpro-tegenlicht/seizoen-11/zwart-geld-de-toekomst-komt-uit-afrika',
'md5': 'f8065e4e5a7824068ed3c7e783178f2c',
@@ -67,45 +45,49 @@ def _real_extract(self, url):
url = url[:-10]
url = url.rstrip('/')
slug = url.split('/')[-1]
page = self._download_webpage(url, slug, 'Finding productId using slug: %s' % slug)
# TODO find out what proper HTML parsing utilities are available in youtube-dl
next_data = page.split('<script id="__NEXT_DATA__" type="application/json">')[1].split('</script>')[0]
# TODO The data in this script tag feels like GraphQL, so there might be an easier way
# to get the product id, maybe using a GraphQL endpoint
next_data = json.loads(next_data)
product_id, title, description, thumbnail = None, None, None, None
for query in next_data['props']['pageProps']['dehydratedState']['queries']:
if isinstance(query['state']['data'], list):
for entry in query['state']['data']:
if entry['slug'] == slug:
product_id = entry.get('productId')
title = entry.get('title')
synopsis = entry.get('synopsis', {})
description = (
synopsis.get('long')
or synopsis.get('short')
or synopsis.get('brief')
)
thumbnails = entry.get('images')
for thumbnail_entry in thumbnails:
if 'url' in thumbnail_entry:
thumbnail = thumbnail_entry.get('url')

program_metadata = self._download_json('https://npo.nl/start/api/domain/program-detail',
slug,
query={'slug': slug})
product_id = program_metadata.get('productId')
images = program_metadata.get('images')
thumbnail = None
for image in images:
thumbnail = image.get('url')
break
title = program_metadata.get('title')
descriptions = program_metadata.get('description', {})
description = descriptions.get('long') or descriptions.get('short') or descriptions.get('brief')
duration = program_metadata.get('durationInSeconds')

if not product_id:
raise ExtractorError('No productId found for slug: %s' % slug)

token = self._get_token(product_id)
formats = self._download_by_product_id(product_id, slug, url)

return {
'id': slug,
'formats': formats,
'title': title or slug,
'description': description or title or slug,
'thumbnail': thumbnail,
'duration': duration,
}

def _download_by_product_id(self, product_id, slug, url=None):
token = self._get_token(product_id)
formats = []
for profile in (
'dash',
# 'hls', # TODO test what needs to change for 'hls' support
# 'hls' is available too, but implementing it doesn't add much
# As far as I know 'dash' is always available
):
stream_link = self._download_json(
'https://prod.npoplayer.nl/stream-link', video_id=slug,
data=json.dumps({
'profileName': profile,
'drmType': 'widevine',
Contributor comment: Naïvely, I wonder whether any streams returned with this request are not WV-encrypted, and what happens if other or no values are passed.

'referrerUrl': url,
'referrerUrl': url or '',
}).encode('utf8'),
headers={
'Authorization': token,
@@ -114,12 +96,184 @@ def _real_extract(self, url):
)
stream_url = stream_link.get('stream', {}).get('streamURL')
Contributor comment (suggested change):

-            stream_url = stream_link.get('stream', {}).get('streamURL')
+            stream_url = traverse_obj(stream_link, ('stream', 'streamURL'))

formats.extend(self._extract_mpd_formats(stream_url, slug, mpd_id='dash', fatal=False))
return formats


class BNNVaraIE(NPOIE):
IE_NAME = 'bnnvara'
IE_DESC = 'bnnvara.nl'
_VALID_URL = r'https?://(?:www\.)?bnnvara\.nl/videos/[0-9]*'
_TESTS = [{
'url': 'https://www.bnnvara.nl/videos/27455',
# TODO fill in other test attributes
}]

def _real_extract(self, url):
url = url.rstrip('/')
video_id = url.split('/')[-1]

media = self._download_json('https://api.bnnvara.nl/bff/graphql',
video_id,
data=json.dumps(
{
'operationName': 'getMedia',
'variables': {
'id': video_id,
'hasAdConsent': False,
'atInternetId': 70
},
'query': 'query getMedia($id: ID!, $mediaUrl: String, $hasAdConsent: Boolean!, $atInternetId: Int) {\n player(\n id: $id\n mediaUrl: $mediaUrl\n hasAdConsent: $hasAdConsent\n atInternetId: $atInternetId\n ) {\n ... on PlayerSucces {\n brand {\n name\n slug\n broadcastsEnabled\n __typename\n }\n title\n programTitle\n pomsProductId\n broadcasters {\n name\n __typename\n }\n duration\n classifications {\n title\n imageUrl\n type\n __typename\n }\n image {\n title\n url\n __typename\n }\n cta {\n title\n url\n __typename\n }\n genres {\n name\n __typename\n }\n subtitles {\n url\n language\n __typename\n }\n sources {\n name\n url\n ratio\n __typename\n }\n type\n token\n __typename\n }\n ... on PlayerError {\n error\n __typename\n }\n __typename\n }\n}'
Contributor comment: To make this less ridiculous, it could be assigned to a variable at the start of the routine using this sort of formatting:

        GQL_QUERY = (
            'query getMedia($id: ID!, $mediaUrl: String, $hasAdConsent: Boolean!, $atInternetId: Int) {\n  '
            'player(\n    id: $id\n    mediaUrl: $mediaUrl\n    hasAdConsent: $hasAdConsent\n    '
            'atInternetId: $atInternetId\n  ) {\n    ... on PlayerSucces {\n      '
            'brand {\n        name\n        slug\n        broadcastsEnabled\n        '
            '__typename\n      }\n      title\n      programTitle\n      pomsProductId\n      '
            ...
            '... on PlayerError {\n      error\n      __typename\n    }\n    __typename\n  }\n}')

}).encode('utf8'),
headers={
'Content-Type': 'application/json',
})
product_id = media.get('data', {}).get('player', {}).get('pomsProductId')

formats = self._download_by_product_id(product_id, video_id)
Contributor comment (suggested change):

-        product_id = media.get('data', {}).get('player', {}).get('pomsProductId')
-        formats = self._download_by_product_id(product_id, video_id)
+        product_id = traverse_obj(media, ('data', 'player', 'pomsProductId'))
+        formats = self._download_by_product_id(product_id, video_id) if product_id else []
+        self._sort_formats(formats)


return {
Contributor comment: This could be re-worked like the previous return, with the merge_dicts(traverse_obj(metadata, dict_construction), dict_of_known_vars) pattern.

'id': slug,
'id': product_id,
'title': media.get('data', {}).get('player', {}).get('title'),
'formats': formats,
'thumbnail': media.get('data', {}).get('player', {}).get('image').get('url'),
}
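
A minimal sketch of the merge_dicts/traverse_obj pattern the reviewer refers to (a hypothetical rewrite, not code from the PR; it assumes merge_dicts and traverse_obj are imported from ..utils and that traverse_obj supports the dict-construction form the reviewer mentions):

        # Hedged sketch of the reviewer's pattern, not part of the PR.
        # merge_dicts() keeps the first non-None value per key, so the traversed
        # metadata is combined with the values already computed above.
        return merge_dicts(traverse_obj(media, {
            'title': ('data', 'player', 'title'),
            'thumbnail': ('data', 'player', 'image', 'url'),
        }) or {}, {
            'id': product_id,
            'formats': formats,
        })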


class ONIE(NPOIE):
IE_NAME = 'on'
IE_DESC = 'ongehoordnederland.tv'
_VALID_URL = r'https?://(?:www\.)?ongehoordnederland.tv/.*'
_TESTS = [{
'url': 'https://ongehoordnederland.tv/2024/03/01/korte-clips/heeft-preppen-zin-betwijfel-dat-je-daar-echt-iets-aan-zult-hebben-bij-oorlog-lydia-daniel/',
# TODO fill in other test attributes
}]

def _real_extract(self, url):
video_id = url.rstrip('/').split('/')[-1]
page, _ = self._download_webpage_handle(url, video_id)
Contributor comment: If not using the returned urlhandle to track redirection or errors:

Suggested change:

-        page, _ = self._download_webpage_handle(url, video_id)
+        page = self._download_webpage(url, video_id)

results = re.findall("page: '(.+)'", page)
formats = []
for result in results:
formats.extend(self._download_by_product_id(result, video_id))

if not formats:
raise ExtractorError('Could not find a POMS product id in the provided URL, '
'perhaps because all stream URLs are DRM protected.')
Comment on lines +169 to +171 (dirkf, Contributor, Mar 7, 2024):

self._sort_formats(...) should be called, and will raise if there aren't any. If there is a way to identify that a stream has DRM, and given that, unlike yt-dlp, we're going to skip DRM formats, one could, e.g., return a tuple (formats, number_of_formats_seen) and then compare that total against len(formats). If not formats and the values differ, self.report_drm() can be called.
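
A minimal sketch of the pattern described above (hypothetical, not code from the PR; it assumes _download_by_product_id() is reworked to return (formats, number_of_formats_seen), and that a report_drm() helper with a yt-dlp-like signature is available, as the reviewer implies):

        # Hedged sketch of the reviewer's suggestion, not part of the PR.
        formats = []
        formats_seen = 0
        for result in results:
            fmts, seen = self._download_by_product_id(result, video_id)
            formats.extend(fmts)
            formats_seen += seen

        if not formats and formats_seen:
            # everything that was seen is DRM-protected and was skipped
            self.report_drm(video_id)
        self._sort_formats(formats)  # raises if no formats were found at all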


return {
'id': video_id,
'title': video_id,
'formats': formats,
'title': title or slug,
'description': description,
'thumbnail': thumbnail,
# TODO fill in other metadata that's available
}


class ZAPPIE(NPOIE):
IE_NAME = 'zapp'
IE_DESC = 'zapp.nl'
_VALID_URL = r'https?://(?:www\.)?zapp.nl/.*'

_TESTS = [{
'url': 'https://www.zapp.nl/programmas/zappsport/gemist/AT_300003973',
# TODO fill in other test attributes
}]

def _real_extract(self, url):
video_id = url.rstrip('/').split('/')[-1]

formats = self._download_by_product_id(url, video_id)

return {
'id': video_id,
'title': video_id,
'formats': formats,
}


class SchoolTVIE(NPOIE):
IE_NAME = 'schooltv'
IE_DESC = 'schooltv.nl'
_VALID_URL = r'https?://(?:www\.)?schooltv.nl/item/.*'

_TESTS = [{
'url': 'https://schooltv.nl/item/zapp-music-challenge-2015-zapp-music-challenge-2015',
# TODO fill in other test attributes
}]

def _real_extract(self, url):
video_id = url.rstrip('/').split('/')[-1]

# TODO Find out how we could obtain this automatically
# Otherwise this extractor might break each time SchoolTV deploys a new release
build_id = 'b7eHUzAVO7wHXCopYxQhV'
Comment on lines +224 to +226 (rvsit, Mar 6, 2024):

I think the only way is to load a random page, JSON-parse the __NEXT_DATA__ part and get the buildId prop from there. But then we might as well have that 'random page' be the actual video page and skip the /_next/data/ download part, as that object already contains the poms_mid. It is not great, but I think the only stable option is parsing the __NEXT_DATA__ part for the poms_mid, like we initially did for the NPO Start web UI. I have worked with Next.js for quite a while and they have never changed the __NEXT_DATA__ part as far as I know, so it should be relatively safe.

Contributor comment: There is the _search_nextjs_data() method if needed.

Author comment (in reply): Thanks! I'll look into that and use it if I can make it work.

Contributor comment: In fact, all the information that you might have got with the JSON API is in the Next.js hydration JSON in the page, including the build ID that is no longer of interest. I have this working but will update once other issues have been cleared.
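
A minimal sketch of the __NEXT_DATA__ approach discussed above (hypothetical, not code from the PR): fetch the item page itself, parse the Next.js hydration JSON with _search_nextjs_data(), and pull the poms_mid out of it. The exact path to poms_mid inside the hydration JSON, and the use of traverse_obj, are assumptions here.

        # Hedged sketch of the discussion above, not part of the PR.
        webpage = self._download_webpage(url, video_id)
        next_data = self._search_nextjs_data(webpage, video_id)
        # the location of poms_mid inside the hydration JSON is an assumption
        poms_mid = traverse_obj(next_data, ('props', 'pageProps', 'data', 'poms_mid'))
        if not poms_mid:
            raise ExtractorError('Could not find poms_mid in __NEXT_DATA__ for %s' % video_id)
        formats = self._download_by_product_id(poms_mid, video_id)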


metadata_url = 'https://schooltv.nl/_next/data/' \
+ build_id \
+ '/item/' \
+ video_id + '.json'

metadata = self._download_json(metadata_url,
video_id).get('pageProps', {}).get('data', {})

formats = self._download_by_product_id(metadata.get('poms_mid'), video_id)

if not formats:
raise ExtractorError('Could not find a POMS product id in the provided URL, '
'perhaps because all stream URLs are DRM protected.')

return {
'id': video_id,
'title': metadata.get('title', '') + ' - ' + metadata.get('subtitle', ''),
Contributor comment: Or use utils.join_nonempty() for this effect:

  • 'title', 'subtitle' -> 'title - subtitle'
  • 'title', '' -> 'title'
  • '', 'subtitle' -> 'subtitle'
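
A minimal sketch of that suggestion (hypothetical; it assumes join_nonempty from ..utils accepts a delim keyword, as its yt-dlp counterpart does):

            # Hedged sketch, not part of the PR: drops whichever part is empty or missing.
            'title': join_nonempty(metadata.get('title'), metadata.get('subtitle'), delim=' - '),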

'description': metadata.get('description') or metadata.get('short_description'),
'formats': formats,
}


class HetKlokhuisIE(NPOIE):
...

def _real_extract(self, url):
...


class VPROIE(NPOIE):
IE_NAME = 'vpro'
IE_DESC = 'vpro.nl'
_VALID_URL = r'https?://(?:www\.)?vpro.nl/.*'
Contributor comment: Is this a tight enough pattern?

_TESTS = [{
'url': 'https://www.vpro.nl/programmas/tegenlicht/kijk/afleveringen/2015-2016/offline-als-luxe.html',
# TODO fill in other test attributes
}]

def _real_extract(self, url):
video_id = url.rstrip('/').split('/')[-1]
page, _ = self._download_webpage_handle(url, video_id)
results = re.findall(r'data-media-id="(.+_.+)"\s', page)
formats = []
for result in results:
formats.extend(self._download_by_product_id(result, video_id))
break # TODO find a better solution, VPRO pages can have multiple videos embedded
Contributor comment: May this embedding occur in other pages (not vpro.nl)? Are the second and subsequent videos related (clips, trailers, etc.), or is the case more like a series page with various episodes? In the first case maybe skip the subsidiary videos; in the second, normally return a playlist result whose entries are either the url_result()s of episode URLs constructed for each video, or info_dicts extracted from the page.

Contributor comment: Looking at the test video page, there is apparently a content video and a teaser video. The former can be detected because it's inside (preceded by) <div class=grid>. As far as I can see, the other pages that might list multiple videos are playlist pages like https://www.vpro.nl/programmas/tegenlicht/kijk/afleveringen.html or https://www.vpro.nl/programmas/tegenlicht/categorieen/wereld.html that don't include data-media-ids but just have links to programme episodes. Counterexamples welcome.
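
A minimal sketch of the playlist option described above (hypothetical, not code from the PR): when a page carries more than one data-media-id, each one becomes its own playlist entry instead of being dropped by the break.

        # Hedged sketch of the reviewer's playlist suggestion, not part of the PR.
        if len(results) > 1:
            entries = [{
                'id': media_id,
                'title': media_id,
                'formats': self._download_by_product_id(media_id, video_id),
            } for media_id in results]
            return self.playlist_result(entries, playlist_id=video_id, playlist_title=video_id)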


if not formats:
raise ExtractorError('Could not find a POMS product id in the provided URL, '
'perhaps because all stream URLs are DRM protected.')

return {
'id': video_id,
'title': video_id,
'formats': formats,
}


class WNLIE(NPOIE):
...

def _real_extract(self, url):
...


class AndereTijdenIE(NPOIE):
...

def _real_extract(self, url):
...