diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py
index bf0c67542e..ec3ae0e668 100644
--- a/yt_dlp/extractor/_extractors.py
+++ b/yt_dlp/extractor/_extractors.py
@@ -1555,7 +1555,14 @@
 from .radiode import RadioDeIE
 from .radiojavan import RadioJavanIE
 from .radiobremen import RadioBremenIE
-from .radiofrance import FranceCultureIE, RadioFranceIE
+from .radiofrance import (
+    FranceCultureIE,
+    RadioFranceIE,
+    RadioFranceLiveIE,
+    RadioFrancePodcastIE,
+    RadioFranceProfileIE,
+    RadioFranceProgramScheduleIE,
+)
 from .radiozet import RadioZetPodcastIE
 from .radiokapital import (
     RadioKapitalIE,
diff --git a/yt_dlp/extractor/radiofrance.py b/yt_dlp/extractor/radiofrance.py
index 92e51b7f45..35f4b91dd2 100644
--- a/yt_dlp/extractor/radiofrance.py
+++ b/yt_dlp/extractor/radiofrance.py
@@ -1,7 +1,18 @@
+import itertools
 import re
+import urllib.parse
 
 from .common import InfoExtractor
-from ..utils import parse_duration, unified_strdate
+from ..utils import (
+    int_or_none,
+    join_nonempty,
+    js_to_json,
+    parse_duration,
+    strftime_or_none,
+    traverse_obj,
+    unified_strdate,
+    urljoin,
+)
 
 
 class RadioFranceIE(InfoExtractor):
@@ -56,8 +67,41 @@ def _real_extract(self, url):
         }
 
 
-class FranceCultureIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:www\.)?radiofrance\.fr/(?:franceculture|fip|francemusique|mouv|franceinter)/podcasts/(?:[^?#]+/)?(?P<display_id>[^?#]+)-(?P<id>\d+)($|[?#])'
+class RadioFranceBaseIE(InfoExtractor):
+    """Shared URL fragments and page-data helper for the radiofrance.fr extractors."""
+
+    _VALID_URL_BASE = r'https?://(?:www\.)?radiofrance\.fr'
+
+    # Regex alternation of the station slugs accepted in URLs
+    _STATIONS_RE = '|'.join(map(re.escape, (
+        'franceculture',
+        'franceinfo',
+        'franceinter',
+        'francemusique',
+        'fip',
+        'mouv',
+    )))
+
+    def _extract_data_from_webpage(self, webpage, display_id, key):
+        """Return the first dict found under ``data[key]`` in the page's ``const data = [...]`` array.
+
+        Returns an empty dict when no element of that array holds a dict at that path.
+        """
+        return traverse_obj(self._search_json(
+            r'\bconst\s+data\s*=', webpage, key, display_id,
+            contains_pattern=r'(\[\{.*?\}\]);', transform_source=js_to_json),
+            (..., 'data', key, {dict}), get_all=False) or {}
+
+
+class FranceCultureIE(RadioFranceBaseIE):
+    """Single podcast episode pages, identified by the numeric id that ends the URL."""
+
+    _VALID_URL = rf'''(?x)
+        {RadioFranceBaseIE._VALID_URL_BASE}
+        /(?:{RadioFranceBaseIE._STATIONS_RE})
+        /podcasts/(?:[^?#]+/)?(?P<display_id>[^?#]+)-(?P<id>\d{{6,}})(?:$|[?#])
+    '''
+
     _TESTS = [
         {
             'url': 'https://www.radiofrance.fr/franceculture/podcasts/science-en-questions/la-physique-d-einstein-aiderait-elle-a-comprendre-le-cerveau-8440487',
@@ -67,14 +111,30 @@ class FranceCultureIE(InfoExtractor):
                 'ext': 'mp3',
                 'title': 'La physique d’Einstein aiderait-elle à comprendre le cerveau ?',
                 'description': 'Existerait-il un pont conceptuel entre la physique de l’espace-temps et les neurosciences ?',
-                'thumbnail': 'https://cdn.radiofrance.fr/s3/cruiser-production/2022/05/d184e7a3-4827-4494-bf94-04ed7b120db4/1200x630_gettyimages-200171095-001.jpg',
+                'thumbnail': r're:^https?://.*\.(?:jpg|png)',
                 'upload_date': '20220514',
                 'duration': 2750,
             },
         },
+        {
+            'url': 'https://www.radiofrance.fr/franceinter/podcasts/le-7-9-30/le-7-9-30-du-vendredi-10-mars-2023-2107675',
+            'info_dict': {
+                'id': '2107675',
+                'display_id': 'le-7-9-30-du-vendredi-10-mars-2023',
+                'title': 'Inflation alimentaire : comment en sortir ? - Régis Debray et Claude Grange - Cybèle Idelot',
+                'description': 'md5:36ee74351ede77a314fdebb94026b916',
+                'thumbnail': r're:^https?://.*\.(?:jpg|png)',
+                'upload_date': '20230310',
+                'duration': 8977,
+                'ext': 'mp3',
+            },
+        },
         {
             'url': 'https://www.radiofrance.fr/franceinter/podcasts/la-rafle-du-vel-d-hiv-une-affaire-d-etat/les-racines-du-crime-episode-1-3715507',
             'only_matching': True,
+        }, {
+            'url': 'https://www.radiofrance.fr/franceinfo/podcasts/le-billet-sciences/sante-bientot-un-vaccin-contre-l-asthme-allergique-3057200',
+            'only_matching': True,
         }
     ]
 
@@ -89,7 +149,6 @@ def _real_extract(self, url):
             'id': video_id,
             'display_id': display_id,
             'url': video_data['contentUrl'],
-            'ext': video_data.get('encodingFormat'),
             'vcodec': 'none' if video_data.get('encodingFormat') == 'mp3' else None,
             'duration': parse_duration(video_data.get('duration')),
             'title': self._html_search_regex(r'(?s)<h1[^>]*itemprop="[^"]*name[^"]*"[^>]*>(.+?)</h1>',
@@ -102,3 +161,341 @@ def _real_extract(self, url):
             'upload_date': unified_strdate(self._search_regex(
                 r'"datePublished"\s*:\s*"([^"]+)', webpage, 'timestamp', fatal=False))
         }
+
+
+class RadioFranceLiveIE(RadioFranceBaseIE):
+    """Live audio of a station or of one of its ``radio-*`` web radios."""
+
+    _VALID_URL = rf'''(?x)
+        {RadioFranceBaseIE._VALID_URL_BASE}
+        /(?P<id>{RadioFranceBaseIE._STATIONS_RE})
+        /?(?P<substation_id>radio-[\w-]+)?(?:[#?]|$)
+    '''
+
+    _TESTS = [{
+        'url': 'https://www.radiofrance.fr/franceinter/',
+        'info_dict': {
+            'id': 'franceinter',
+            'title': str,
+            'live_status': 'is_live',
+            'ext': 'aac',
+        },
+        'params': {
+            'skip_download': 'Livestream',
+        },
+    }, {
+        'url': 'https://www.radiofrance.fr/franceculture',
+        'info_dict': {
+            'id': 'franceculture',
+            'title': str,
+            'live_status': 'is_live',
+            'ext': 'aac',
+        },
+        'params': {
+            'skip_download': 'Livestream',
+        },
+    }, {
+        'url': 'https://www.radiofrance.fr/mouv/radio-musique-kids-family',
+        'info_dict': {
+            'id': 'mouv-radio-musique-kids-family',
+            'title': str,
+            'live_status': 'is_live',
+            'ext': 'aac',
+        },
+        'params': {
+            'skip_download': 'Livestream',
+        },
+    }, {
+        'url': 'https://www.radiofrance.fr/mouv/radio-rnb-soul',
+        'info_dict': {
+            'id': 'mouv-radio-rnb-soul',
+            'title': str,
+            'live_status': 'is_live',
+            'ext': 'aac',
+        },
+        'params': {
+            'skip_download': 'Livestream',
+        },
+    }, {
+        'url': 'https://www.radiofrance.fr/mouv/radio-musique-mix',
+        'info_dict': {
+            'id': 'mouv-radio-musique-mix',
+            'title': str,
+            'live_status': 'is_live',
+            'ext': 'aac',
+        },
+        'params': {
+            'skip_download': 'Livestream',
+        },
+    }, {
+        'url': 'https://www.radiofrance.fr/fip/radio-rock',
+        'info_dict': {
+            'id': 'fip-radio-rock',
+            'title': str,
+            'live_status': 'is_live',
+            'ext': 'aac',
+        },
+        'params': {
+            'skip_download': 'Livestream',
+        },
+    }, {
+        'url': 'https://www.radiofrance.fr/mouv',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        station_id, substation_id = self._match_valid_url(url).group('id', 'substation_id')
+
+        # Web-radio pages embed their data in the HTML; bare station URLs use the station's api/live endpoint
+        if substation_id:
+            webpage = self._download_webpage(url, station_id)
+            api_response = self._extract_data_from_webpage(webpage, station_id, 'webRadioData')
+        else:
+            api_response = self._download_json(
+                f'https://www.radiofrance.fr/{station_id}/api/live', station_id)
+
+        formats, subtitles = [], {}
+        # Media sources may sit under 'now' or directly at the top level of the response
+        for media_source in traverse_obj(api_response, (('now', None), 'media', 'sources', lambda _, v: v['url'])):
+            if media_source.get('format') == 'hls':
+                fmts, subs = self._extract_m3u8_formats_and_subtitles(media_source['url'], station_id, fatal=False)
+                formats.extend(fmts)
+                self._merge_subtitles(subs, target=subtitles)
+            else:
+                formats.append({
+                    'url': media_source['url'],
+                    'abr': media_source.get('bitrate'),
+                })
+
+        return {
+            'id': join_nonempty(station_id, substation_id),
+            'title': traverse_obj(api_response, ('visual', 'legend')) or join_nonempty(
+                ('now', 'firstLine', 'title'), ('now', 'secondLine', 'title'), from_dict=api_response, delim=' - '),
+            'formats': formats,
+            'subtitles': subtitles,
+            'is_live': True,
+        }
+
+
+class RadioFrancePlaylistBase(RadioFranceBaseIE):
+    """Base for playlists resolved through the ``api/v2.1/path`` endpoint and paginated by cursor.
+
+    Subclasses must set _METADATA_KEY and implement _call_api.
+    """
+
+    def _call_api(self, content_id, cursor, page_num):
+        """Return the page of items that starts at ``cursor``; must be implemented by subclasses."""
+        raise NotImplementedError('This method must be implemented by subclasses')
+
+    def _generate_playlist_entries(self, content_id, content_response):
+        """Yield a url_transparent result per item, fetching further pages while a next cursor is returned."""
+        # ``content_response`` already holds the first page, so fetched pages are numbered from 2
+        for page_num in itertools.count(2):
+            for entry in content_response['items']:
+                yield self.url_result(
+                    f'https://www.radiofrance.fr/{entry["path"]}', url_transparent=True, **traverse_obj(entry, {
+                        'title': 'title',
+                        'description': 'standFirst',
+                        'timestamp': ('publishedDate', {int_or_none}),
+                        'thumbnail': ('visual', 'src'),
+                    }))
+
+            next_cursor = traverse_obj(content_response, (('pagination', None), 'next'), get_all=False)
+            if not next_cursor:
+                break
+
+            content_response = self._call_api(content_id, next_cursor, page_num)
+
+    def _real_extract(self, url):
+        display_id = self._match_id(url)
+
+        metadata = self._download_json(
+            'https://www.radiofrance.fr/api/v2.1/path', display_id,
+            query={'value': urllib.parse.urlparse(url).path})['content']
+
+        content_id = metadata['id']
+
+        # Values from the second mapping ('name', 'role') take precedence over the first when present
+        return self.playlist_result(
+            self._generate_playlist_entries(content_id, metadata[self._METADATA_KEY]), content_id,
+            display_id=display_id, **{**traverse_obj(metadata, {
+                'title': 'title',
+                'description': 'standFirst',
+                'thumbnail': ('visual', 'src'),
+            }), **traverse_obj(metadata, {
+                'title': 'name',
+                'description': 'role',
+            })})
+
+
+class RadioFrancePodcastIE(RadioFrancePlaylistBase):
+    """Playlist built from a ``/podcasts/<slug>`` page, paginated through the concepts API."""
+
+    _VALID_URL = rf'''(?x)
+        {RadioFranceBaseIE._VALID_URL_BASE}
+        /(?:{RadioFranceBaseIE._STATIONS_RE})
+        /podcasts/(?P<id>[\w-]+)/?(?:[?#]|$)
+    '''
+
+    _TESTS = [{
+        'url': 'https://www.radiofrance.fr/franceinfo/podcasts/le-billet-vert',
+        'info_dict': {
+            'id': 'eaf6ef81-a980-4f1c-a7d1-8a75ecd54b17',
+            'display_id': 'le-billet-vert',
+            'title': 'Le billet sciences',
+            'description': 'md5:eb1007b34b0c0a680daaa71525bbd4c1',
+            'thumbnail': r're:^https?://.*\.(?:jpg|png)',
+        },
+        'playlist_mincount': 11,
+    }, {
+        'url': 'https://www.radiofrance.fr/franceinter/podcasts/jean-marie-le-pen-l-obsession-nationale',
+        'info_dict': {
+            'id': '566fd524-3074-4fbc-ac69-8696f2152a54',
+            'display_id': 'jean-marie-le-pen-l-obsession-nationale',
+            'title': 'Jean-Marie Le Pen, l\'obsession nationale',
+            'description': 'md5:a07c0cfb894f6d07a62d0ad12c4b7d73',
+            'thumbnail': r're:^https?://.*\.(?:jpg|png)',
+        },
+        'playlist_count': 7,
+    }, {
+        'url': 'https://www.radiofrance.fr/franceculture/podcasts/serie-thomas-grjebine',
+        'info_dict': {
+            'id': '63c1ddc9-9f15-457a-98b2-411bac63f48d',
+            'display_id': 'serie-thomas-grjebine',
+            'title': 'Thomas Grjebine',
+        },
+        'playlist_count': 1,
+    }, {
+        'url': 'https://www.radiofrance.fr/fip/podcasts/certains-l-aiment-fip',
+        'info_dict': {
+            'id': '143dff38-e956-4a5d-8576-1c0b7242b99e',
+            'display_id': 'certains-l-aiment-fip',
+            'title': 'Certains l’aiment Fip',
+            'description': 'md5:ff974672ba00d4fd5be80fb001c5b27e',
+            'thumbnail': r're:^https?://.*\.(?:jpg|png)',
+        },
+        'playlist_mincount': 321,
+    }, {
+        'url': 'https://www.radiofrance.fr/franceinter/podcasts/le-7-9',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.radiofrance.fr/mouv/podcasts/dirty-mix',
+        'only_matching': True,
+    }]
+
+    _METADATA_KEY = 'expressions'
+
+    def _call_api(self, podcast_id, cursor, page_num):
+        return self._download_json(
+            f'https://www.radiofrance.fr/api/v2.1/concepts/{podcast_id}/expressions', podcast_id,
+            note=f'Downloading page {page_num}', query={'pageCursor': cursor})
+
+
+class RadioFranceProfileIE(RadioFrancePlaylistBase):
+    """Playlist built from a ``/personnes/<slug>`` page, paginated through the taxonomy API."""
+
+    _VALID_URL = rf'{RadioFranceBaseIE._VALID_URL_BASE}/personnes/(?P<id>[\w-]+)'
+
+    _TESTS = [{
+        'url': 'https://www.radiofrance.fr/personnes/thomas-pesquet?p=3',
+        'info_dict': {
+            'id': '86c62790-e481-11e2-9f7b-782bcb6744eb',
+            'display_id': 'thomas-pesquet',
+            'title': 'Thomas Pesquet',
+            'description': 'Astronaute à l\'agence spatiale européenne',
+        },
+        'playlist_mincount': 212,
+    }, {
+        'url': 'https://www.radiofrance.fr/personnes/eugenie-bastie',
+        'info_dict': {
+            'id': '9593050b-0183-4972-a0b5-d8f699079e02',
+            'display_id': 'eugenie-bastie',
+            'title': 'Eugénie Bastié',
+            'description': 'Journaliste et essayiste',
+            'thumbnail': r're:^https?://.*\.(?:jpg|png)',
+        },
+        'playlist_mincount': 39,
+    }, {
+        'url': 'https://www.radiofrance.fr/personnes/lea-salame',
+        'only_matching': True,
+    }]
+
+    _METADATA_KEY = 'documents'
+
+    def _call_api(self, profile_id, cursor, page_num):
+        resp = self._download_json(
+            f'https://www.radiofrance.fr/api/v2.1/taxonomy/{profile_id}/documents', profile_id,
+            note=f'Downloading page {page_num}', query={
+                'relation': 'personality',
+                'cursor': cursor,
+            })
+
+        # Copy the nested pagination cursor to the top-level 'next' key
+        resp['next'] = traverse_obj(resp, ('pagination', 'next'))
+        return resp
+
+
+class RadioFranceProgramScheduleIE(RadioFranceBaseIE):
+    """Playlist of the entries found on a station's ``grille-programmes`` (program schedule) page."""
+
+    _VALID_URL = rf'''(?x)
+        {RadioFranceBaseIE._VALID_URL_BASE}
+        /(?P<station>{RadioFranceBaseIE._STATIONS_RE})
+        /grille-programmes(?:\?date=(?P<date>[\d-]+))?
+    '''
+
+    _TESTS = [{
+        'url': 'https://www.radiofrance.fr/franceinter/grille-programmes?date=17-02-2023',
+        'info_dict': {
+            'id': 'franceinter-program-20230217',
+            'upload_date': '20230217',
+        },
+        'playlist_count': 25,
+    }, {
+        'url': 'https://www.radiofrance.fr/franceculture/grille-programmes?date=01-02-2023',
+        'info_dict': {
+            'id': 'franceculture-program-20230201',
+            'upload_date': '20230201',
+        },
+        'playlist_count': 25,
+    }, {
+        'url': 'https://www.radiofrance.fr/mouv/grille-programmes?date=19-03-2023',
+        'info_dict': {
+            'id': 'mouv-program-20230319',
+            'upload_date': '20230319',
+        },
+        'playlist_count': 3,
+    }, {
+        'url': 'https://www.radiofrance.fr/francemusique/grille-programmes?date=18-03-2023',
+        'info_dict': {
+            'id': 'francemusique-program-20230318',
+            'upload_date': '20230318',
+        },
+        'playlist_count': 15,
+    }, {
+        'url': 'https://www.radiofrance.fr/franceculture/grille-programmes',
+        'only_matching': True,
+    }]
+
+    def _generate_playlist_entries(self, webpage_url, api_response):
+        """Yield a url_transparent FranceCultureIE result for each step that has an expression path."""
+        for entry in traverse_obj(api_response, ('steps', lambda _, v: v['expression']['path'])):
+            yield self.url_result(
+                urljoin(webpage_url, f'/{entry["expression"]["path"]}'), ie=FranceCultureIE,
+                url_transparent=True, **traverse_obj(entry, {
+                    'title': ('expression', 'title'),
+                    'thumbnail': ('expression', 'visual', 'src'),
+                    'timestamp': ('startTime', {int_or_none}),
+                    'series_id': ('concept', 'id'),
+                    'series': ('concept', 'title'),
+                }))
+
+    def _real_extract(self, url):
+        station, date = self._match_valid_url(url).group('station', 'date')
+        webpage = self._download_webpage(url, station)
+        grid_data = self._extract_data_from_webpage(webpage, station, 'grid')
+        upload_date = strftime_or_none(grid_data.get('date'), '%Y%m%d')
+
+        return self.playlist_result(
+            self._generate_playlist_entries(url, grid_data),
+            join_nonempty(station, 'program', upload_date), upload_date=upload_date)