[extractor/TubeTuGraz] Add extractor (#2397)

Based on https://github.com/ytdl-org/youtube-dl/pull/26778 Authored by: Ferdi265, pukkandan
2025-01-05 23:54:24 +00:00 · 2022-07-15 12:48:21 +02:00 · 2022-07-15 12:48:21 +02:00 · 49afc1d84a
parent 6edf28081f
commit 49afc1d84a
2 changed files with 235 additions and 0 deletions
--- a/yt_dlp/extractor/_extractors.py
+++ b/yt_dlp/extractor/_extractors.py
@@ -1794,6 +1794,7 @@
 from .trunews import TruNewsIE
 from .trutv import TruTVIE
 from .tube8 import Tube8IE
+from .tubetugraz import TubeTuGrazIE, TubeTuGrazSeriesIE
 from .tubitv import (
    TubiTvIE,
    TubiTvShowIE,
--- a/yt_dlp/extractor/tubetugraz.py
+++ b/yt_dlp/extractor/tubetugraz.py
@@ -0,0 +1,234 @@
+from .common import InfoExtractor
+from ..utils import (
+    float_or_none,
+    parse_resolution,
+    traverse_obj,
+    urlencode_postdata,
+    variadic,
+)
+
+
+class TubeTuGrazBaseIE(InfoExtractor):
+    _NETRC_MACHINE = 'tubetugraz'
+
+    _API_EPISODE = 'https://tube.tugraz.at/search/episode.json'
+    _FORMAT_TYPES = ('presentation', 'presenter')
+
+    def _perform_login(self, username, password):
+        urlh = self._request_webpage(
+            'https://tube.tugraz.at/Shibboleth.sso/Login?target=/paella/ui/index.html',
+            None, fatal=False, note='downloading login page', errnote='unable to fetch login page')
+        if not urlh:
+            return
+
+        urlh = self._request_webpage(
+            urlh.geturl(), None, fatal=False, headers={'referer': urlh.geturl()},
+            note='logging in', errnote='unable to log in', data=urlencode_postdata({
+                'lang': 'de',
+                '_eventId_proceed': '',
+                'j_username': username,
+                'j_password': password
+            }))
+
+        if urlh and urlh.geturl() != 'https://tube.tugraz.at/paella/ui/index.html':
+            self.report_warning('unable to login: incorrect password')
+
+    def _extract_episode(self, episode_info):
+        id = episode_info.get('id')
+        formats = list(self._extract_formats(
+            traverse_obj(episode_info, ('mediapackage', 'media', 'track')), id))
+        self._sort_formats(formats)
+
+        title = traverse_obj(episode_info, ('mediapackage', 'title'), 'dcTitle')
+        series_title = traverse_obj(episode_info, ('mediapackage', 'seriestitle'))
+        creator = ', '.join(variadic(traverse_obj(
+            episode_info, ('mediapackage', 'creators', 'creator'), 'dcCreator', default='')))
+        return {
+            'id': id,
+            'title': title,
+            'creator': creator or None,
+            'duration': traverse_obj(episode_info, ('mediapackage', 'duration'), 'dcExtent'),
+            'series': series_title,
+            'series_id': traverse_obj(episode_info, ('mediapackage', 'series'), 'dcIsPartOf'),
+            'episode': series_title and title,
+            'formats': formats
+        }
+
+    def _set_format_type(self, formats, type):
+        for f in formats:
+            f['format_note'] = type
+            if not type.startswith(self._FORMAT_TYPES[0]):
+                f['preference'] = -2
+        return formats
+
+    def _extract_formats(self, format_list, id):
+        has_hls, has_dash = False, False
+
+        for format_info in format_list or []:
+            url = traverse_obj(format_info, ('tags', 'url'), 'url')
+            if url is None:
+                continue
+
+            type = format_info.get('type') or 'unknown'
+            transport = (format_info.get('transport') or 'https').lower()
+
+            if transport == 'https':
+                formats = [{
+                    'url': url,
+                    'abr': float_or_none(traverse_obj(format_info, ('audio', 'bitrate')), 1000),
+                    'vbr': float_or_none(traverse_obj(format_info, ('video', 'bitrate')), 1000),
+                    'fps': traverse_obj(format_info, ('video', 'framerate')),
+                    **parse_resolution(traverse_obj(format_info, ('video', 'resolution'))),
+                }]
+            elif transport == 'hls':
+                has_hls, formats = True, self._extract_m3u8_formats(
+                    url, id, 'mp4', fatal=False, note=f'downloading {type} HLS manifest')
+            elif transport == 'dash':
+                has_dash, formats = True, self._extract_mpd_formats(
+                    url, id, fatal=False, note=f'downloading {type} DASH manifest')
+            else:
+                # RTMP, HDS, SMOOTH, and unknown formats
+                # - RTMP url fails on every tested entry until now
+                # - HDS url 404's on every tested entry until now
+                # - SMOOTH url 404's on every tested entry until now
+                continue
+
+            yield from self._set_format_type(formats, type)
+
+        # TODO: Add test for these
+        for type in self._FORMAT_TYPES:
+            if not has_hls:
+                hls_formats = self._extract_m3u8_formats(
+                    f'https://wowza.tugraz.at/matterhorn_engage/smil:engage-player_{id}_{type}.smil/playlist.m3u8',
+                    id, 'mp4', fatal=False, note=f'Downloading {type} HLS manifest', errnote=False) or []
+                yield from self._set_format_type(hls_formats, type)
+
+            if not has_dash:
+                dash_formats = self._extract_mpd_formats(
+                    f'https://wowza.tugraz.at/matterhorn_engage/smil:engage-player_{id}_{type}.smil/manifest_mpm4sav_mvlist.mpd',
+                    id, fatal=False, note=f'Downloading {type} DASH manifest', errnote=False)
+                yield from self._set_format_type(dash_formats, type)
+
+
+class TubeTuGrazIE(TubeTuGrazBaseIE):
+    IE_DESC = 'tube.tugraz.at'
+
+    _VALID_URL = r'''(?x)
+        https?://tube\.tugraz\.at/paella/ui/watch.html\?id=
+        (?P<id>[0-9a-fA-F]{8}-(?:[0-9a-fA-F]{4}-){3}[0-9a-fA-F]{12})
+    '''
+    _TESTS = [
+        {
+            'url': 'https://tube.tugraz.at/paella/ui/watch.html?id=f2634392-e40e-4ac7-9ddc-47764aa23d40',
+            'md5': 'a23a3d5c9aaca2b84932fdba66e17145',
+            'info_dict': {
+                'id': 'f2634392-e40e-4ac7-9ddc-47764aa23d40',
+                'ext': 'mp4',
+                'title': '#6 (23.11.2017)',
+                'episode': '#6 (23.11.2017)',
+                'series': '[INB03001UF] Einführung in die strukturierte Programmierung',
+                'creator': 'Safran C',
+                'duration': 3295818,
+                'series_id': 'b1192fff-2aa7-4bf0-a5cf-7b15c3bd3b34',
+            }
+        }, {
+            'url': 'https://tube.tugraz.at/paella/ui/watch.html?id=2df6d787-e56a-428d-8ef4-d57f07eef238',
+            'md5': 'de0d854a56bf7318d2b693fe1adb89a5',
+            'info_dict': {
+                'id': '2df6d787-e56a-428d-8ef4-d57f07eef238',
+                'title': 'TubeTuGraz video #2df6d787-e56a-428d-8ef4-d57f07eef238',
+                'ext': 'mp4',
+            },
+            'expected_warnings': ['Extractor failed to obtain "title"'],
+        }
+    ]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        episode_data = self._download_json(
+            self._API_EPISODE, video_id, query={'id': video_id, 'limit': 1}, note='Downloading episode metadata')
+
+        episode_info = traverse_obj(episode_data, ('search-results', 'result'), default={'id': video_id})
+        return self._extract_episode(episode_info)
+
+
+class TubeTuGrazSeriesIE(TubeTuGrazBaseIE):
+    _VALID_URL = r'''(?x)
+        https?://tube\.tugraz\.at/paella/ui/browse\.html\?series=
+        (?P<id>[0-9a-fA-F]{8}-(?:[0-9a-fA-F]{4}-){3}[0-9a-fA-F]{12})
+    '''
+    _TESTS = [{
+        'url': 'https://tube.tugraz.at/paella/ui/browse.html?series=0e6351b7-c372-491e-8a49-2c9b7e21c5a6',
+        'id': '0e6351b7-c372-491e-8a49-2c9b7e21c5a6',
+        'info_dict': {
+            'id': '0e6351b7-c372-491e-8a49-2c9b7e21c5a6',
+            'title': '[209351] Strassenwesen',
+        },
+        'playlist': [
+            {
+                'info_dict': {
+                    'id': 'ee17ce5d-34e2-48b7-a76a-fed148614e11',
+                    'series_id': '0e6351b7-c372-491e-8a49-2c9b7e21c5a6',
+                    'ext': 'mp4',
+                    'title': '#4 Detailprojekt',
+                    'episode': '#4 Detailprojekt',
+                    'series': '[209351] Strassenwesen',
+                    'creator': 'Neuhold R',
+                    'duration': 6127024,
+                }
+            },
+            {
+                'info_dict': {
+                    'id': '87350498-799a-44d3-863f-d1518a98b114',
+                    'series_id': '0e6351b7-c372-491e-8a49-2c9b7e21c5a6',
+                    'ext': 'mp4',
+                    'title': '#3 Generelles Projekt',
+                    'episode': '#3 Generelles Projekt',
+                    'series': '[209351] Strassenwesen',
+                    'creator': 'Neuhold R',
+                    'duration': 5374422,
+                }
+            },
+            {
+                'info_dict': {
+                    'id': '778599ea-489e-4189-9e05-3b4888e19bcd',
+                    'series_id': '0e6351b7-c372-491e-8a49-2c9b7e21c5a6',
+                    'ext': 'mp4',
+                    'title': '#2 Vorprojekt',
+                    'episode': '#2 Vorprojekt',
+                    'series': '[209351] Strassenwesen',
+                    'creator': 'Neuhold R',
+                    'duration': 5566404,
+                }
+            },
+            {
+                'info_dict': {
+                    'id': '75e4c71c-d99d-4e56-b0e6-4f2bcdf11f29',
+                    'series_id': '0e6351b7-c372-491e-8a49-2c9b7e21c5a6',
+                    'ext': 'mp4',
+                    'title': '#1 Variantenstudium',
+                    'episode': '#1 Variantenstudium',
+                    'series': '[209351] Strassenwesen',
+                    'creator': 'Neuhold R',
+                    'duration': 5420200,
+                }
+            }
+        ],
+        'min_playlist_count': 4
+    }]
+
+    def _real_extract(self, url):
+        id = self._match_id(url)
+        episodes_data = self._download_json(self._API_EPISODE, id, query={'sid': id}, note='Downloading episode list')
+        series_data = self._download_json(
+            'https://tube.tugraz.at/series/series.json', id, fatal=False,
+            note='downloading series metadata', errnote='failed to download series metadata',
+            query={
+                'seriesId': id,
+                'count': 1,
+                'sort': 'TITLE'
+            })
+
+        return self.playlist_result(
+            map(self._extract_episode, episodes_data['search-results']['result']), id,
+            traverse_obj(series_data, ('catalogs', 0, 'http://purl.org/dc/terms/', 'title', 0, 'value')))