Incorporating changes for UK accessed articles

This commit is contained in:
Kyle Gonsalves 2024-04-22 17:02:19 -07:00
parent 6ef8990320
commit fba5c8f305

View file

@ -17,6 +17,7 @@
int_or_none, int_or_none,
join_nonempty, join_nonempty,
js_to_json, js_to_json,
merge_dicts,
parse_duration, parse_duration,
parse_iso8601, parse_iso8601,
parse_qs, parse_qs,
@ -43,6 +44,7 @@ class BBCCoUkIE(InfoExtractor):
iplayer(?:/[^/]+)?/(?:episode/|playlist/)| iplayer(?:/[^/]+)?/(?:episode/|playlist/)|
music/(?:clips|audiovideo/popular)[/#]| music/(?:clips|audiovideo/popular)[/#]|
radio/player/| radio/player/|
sounds/play/|
events/[^/]+/play/[^/]+/ events/[^/]+/play/[^/]+/
) )
(?P<id>%s)(?!/(?:episodes|broadcasts|clips)) (?P<id>%s)(?!/(?:episodes|broadcasts|clips))
@ -623,6 +625,7 @@ class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE
'info_dict': { 'info_dict': {
'id': '3662a707-0af9-3149-963f-47bea720b460', 'id': '3662a707-0af9-3149-963f-47bea720b460',
'title': 'BUGGER', 'title': 'BUGGER',
'description': r're:BUGGER The recent revelations by the whistleblower Edward Snowden were fascinating. .{211}\.{3}$',
}, },
'playlist_count': 18, 'playlist_count': 18,
}, { }, {
@ -631,14 +634,14 @@ class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE
'info_dict': { 'info_dict': {
'id': 'p02mprgb', 'id': 'p02mprgb',
'ext': 'mp4', 'ext': 'mp4',
'title': 'Aerial footage showed the site of the crash in the Alps - courtesy BFM TV', 'title': 'Germanwings crash site aerial video',
'description': 'md5:2868290467291b37feda7863f7a83f54', 'description': r're:(?s)Aerial video showed the site where the Germanwings flight 4U 9525, .{156} BFM TV\.$',
'duration': 47, 'duration': None, # 47,
'timestamp': 1427219242, 'timestamp': 1427219242,
'upload_date': '20150324', 'upload_date': '20150324',
'thumbnail': 'https://ichef.bbci.co.uk/news/1024/media/images/81879000/jpg/_81879090_81879089.jpg',
}, },
'params': { 'params': {
# rtmp download
'skip_download': True, 'skip_download': True,
} }
}, { }, {
@ -656,7 +659,8 @@ class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE
}, },
'params': { 'params': {
'skip_download': True, 'skip_download': True,
} },
'skip': 'now SIMORGH_DATA with no video',
}, { }, {
# single video embedded with data-playable containing XML playlists (regional section) # single video embedded with data-playable containing XML playlists (regional section)
'url': 'http://www.bbc.com/mundo/video_fotos/2015/06/150619_video_honduras_militares_hospitales_corrupcion_aw', 'url': 'http://www.bbc.com/mundo/video_fotos/2015/06/150619_video_honduras_militares_hospitales_corrupcion_aw',
@ -670,7 +674,9 @@ class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE
}, },
'params': { 'params': {
'skip_download': True, 'skip_download': True,
} },
# TODO: now in .pageData.promo.media of SIMORGH_DATA
'skip': 'video extraction failed',
}, { }, {
# single video from video playlist embedded with vxp-playlist-data JSON # single video from video playlist embedded with vxp-playlist-data JSON
'url': 'http://www.bbc.com/news/video_and_audio/must_see/33376376', 'url': 'http://www.bbc.com/news/video_and_audio/must_see/33376376',
@ -683,22 +689,22 @@ class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE
}, },
'params': { 'params': {
'skip_download': True, 'skip_download': True,
} },
'skip': '404 Not Found',
}, { }, {
# single video story with digitalData # single video story with digitalData
'url': 'http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret', 'url': 'http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret',
'info_dict': { 'info_dict': {
'id': 'p02q6gc4', 'id': 'p02q6gc4',
'ext': 'flv', 'ext': 'mp4',
'title': 'Sri Lankas spicy secret', # page title: 'Sri Lankas spicy secret',
'description': 'As a new train line to Jaffna opens up the countrys north, travellers can experience a truly distinct slice of Tamil culture.', 'title': 'Tasting the spice of life in Jaffna',
'timestamp': 1437674293, # page description: 'As a new train line to Jaffna opens up the countrys north, travellers can experience a truly distinct slice of Tamil culture.',
'upload_date': '20150723', 'description': r're:(?s)BBC Travel Shows Henry Golding explores the city of Jaffna .{149} aftertaste\.$',
'timestamp': 1437935638, # was: 1437674293,
'upload_date': '20150726',
'duration': 255,
}, },
'params': {
# rtmp download
'skip_download': True,
}
}, { }, {
# single video story without digitalData # single video story without digitalData
'url': 'http://www.bbc.com/autos/story/20130513-hyundais-rock-star', 'url': 'http://www.bbc.com/autos/story/20130513-hyundais-rock-star',
@ -710,12 +716,10 @@ class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE
'timestamp': 1415867444, 'timestamp': 1415867444,
'upload_date': '20141113', 'upload_date': '20141113',
}, },
'params': { 'skip': 'redirects to TopGear home page',
# rtmp download
'skip_download': True,
}
}, { }, {
# single video embedded with Morph # single video embedded with Morph
# TODO: replacement test page
'url': 'http://www.bbc.co.uk/sport/live/olympics/36895975', 'url': 'http://www.bbc.co.uk/sport/live/olympics/36895975',
'info_dict': { 'info_dict': {
'id': 'p041vhd0', 'id': 'p041vhd0',
@ -726,27 +730,22 @@ class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE
'uploader': 'BBC Sport', 'uploader': 'BBC Sport',
'uploader_id': 'bbc_sport', 'uploader_id': 'bbc_sport',
}, },
'params': { 'skip': 'Video no longer in page',
# m3u8 download
'skip_download': True,
},
'skip': 'Georestricted to UK',
}, { }, {
# single video with playlist.sxml URL in playlist param # single video in __INITIAL_DATA__ (was: playlist.sxml URL in playlist param)
'url': 'http://www.bbc.com/sport/0/football/33653409', 'url': 'http://www.bbc.com/sport/0/football/33653409',
'info_dict': { 'info_dict': {
'id': 'p02xycnp', 'id': 'p02xycnp',
'ext': 'mp4', 'ext': 'mp4',
'title': 'Transfers: Cristiano Ronaldo to Man Utd, Arsenal to spend?', 'title': 'Ronaldo to Man Utd, Arsenal to spend?',
'description': 'BBC Sport\'s David Ornstein has the latest transfer gossip, including rumours of a Manchester United return for Cristiano Ronaldo.', 'description': r'''re:(?s)BBC Sport's David Ornstein rounds up the latest transfer reports, .{359} here\.$''',
'duration': 140, 'timestamp': 1437750175,
'upload_date': '20150724',
'thumbnail': 'https://news.bbcimg.co.uk/media/images/69320000/png/_69320754_mmgossipcolumnextraaugust18.png',
'duration': None, # 140,
}, },
'params': {
# rtmp download
'skip_download': True,
}
}, { }, {
# article with multiple videos embedded with playlist.sxml in playlist param # article with multiple videos embedded with Morph.setPayload
'url': 'http://www.bbc.com/sport/0/football/34475836', 'url': 'http://www.bbc.com/sport/0/football/34475836',
'info_dict': { 'info_dict': {
'id': '34475836', 'id': '34475836',
@ -754,6 +753,21 @@ class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE
'description': 'Fast-paced football, wit, wisdom and a ready smile - why Liverpool fans should come to love new boss Jurgen Klopp.', 'description': 'Fast-paced football, wit, wisdom and a ready smile - why Liverpool fans should come to love new boss Jurgen Klopp.',
}, },
'playlist_count': 3, 'playlist_count': 3,
}, {
# lead item from above playlist
'url': 'http://www.bbc.com/sport/0/football/34475836',
'info_dict': {
'id': 'p034ppnv',
'ext': 'mp4',
'title': 'All you need to know about Jurgen Klopp',
'timestamp': 1444335081,
'upload_date': '20151008',
'duration': 122.0,
'thumbnail': 'https://ichef.bbci.co.uk/onesport/cps/976/cpsprodpb/7542/production/_85981003_klopp.jpg',
},
'params': {
'noplaylist': True,
},
}, { }, {
# school report article with single video # school report article with single video
'url': 'http://www.bbc.co.uk/schoolreport/35744779', 'url': 'http://www.bbc.co.uk/schoolreport/35744779',
@ -762,6 +776,7 @@ class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE
'title': 'School which breaks down barriers in Jerusalem', 'title': 'School which breaks down barriers in Jerusalem',
}, },
'playlist_count': 1, 'playlist_count': 1,
'skip': 'redirects to Young Reporter home page https://www.bbc.co.uk/news/topics/cg41ylwv43pt',
}, { }, {
# single video with playlist URL from weather section # single video with playlist URL from weather section
'url': 'http://www.bbc.com/weather/features/33601775', 'url': 'http://www.bbc.com/weather/features/33601775',
@ -783,10 +798,10 @@ class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE
# video with window.__INITIAL_DATA__ and value as JSON string # video with window.__INITIAL_DATA__ and value as JSON string
'url': 'https://www.bbc.com/news/av/world-europe-59468682', 'url': 'https://www.bbc.com/news/av/world-europe-59468682',
'info_dict': { 'info_dict': {
'id': 'p0b71qth', 'id': 'p0b779gc', # was 'p0b71qth',
'ext': 'mp4', 'ext': 'mp4',
'title': 'Why France is making this woman a national hero', 'title': 'Why France is making this woman a national hero',
'description': 'md5:7affdfab80e9c3a1f976230a1ff4d5e4', 'description': r're:(?s)France is honouring the US-born 20th Century singer and activist Josephine .{291} Casseville$',
'thumbnail': r're:https?://.+/.+\.jpg', 'thumbnail': r're:https?://.+/.+\.jpg',
'timestamp': 1638230731, 'timestamp': 1638230731,
'upload_date': '20211130', 'upload_date': '20211130',
@ -830,6 +845,7 @@ class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE
'uploader': 'Radio 3', 'uploader': 'Radio 3',
'uploader_id': 'bbc_radio_three', 'uploader_id': 'bbc_radio_three',
}, },
'skip': '404 Not Found',
}, { }, {
'url': 'http://www.bbc.co.uk/learningenglish/chinese/features/lingohack/ep-181227', 'url': 'http://www.bbc.co.uk/learningenglish/chinese/features/lingohack/ep-181227',
'info_dict': { 'info_dict': {
@ -837,6 +853,7 @@ class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE
'ext': 'mp4', 'ext': 'mp4',
'title': 'md5:2fabf12a726603193a2879a055f72514', 'title': 'md5:2fabf12a726603193a2879a055f72514',
'description': 'Learn English words and phrases from this story', 'description': 'Learn English words and phrases from this story',
'thumbnail': 'https://ichef.bbci.co.uk/images/ic/1200x675/p06pq9gk.jpg',
}, },
'add_ie': [BBCCoUkIE.ie_key()], 'add_ie': [BBCCoUkIE.ie_key()],
}, { }, {
@ -849,7 +866,7 @@ class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE
'alt_title': 'The downsides of positive thinking', 'alt_title': 'The downsides of positive thinking',
'description': 'md5:fad74b31da60d83b8265954ee42d85b4', 'description': 'md5:fad74b31da60d83b8265954ee42d85b4',
'duration': 235, 'duration': 235,
'thumbnail': r're:https?://.+/p07c9dsr.jpg', 'thumbnail': r're:https?://.+/p07c9dsr\.(?:jpg|webp|png)',
'upload_date': '20190604', 'upload_date': '20190604',
'categories': ['Psychology'], 'categories': ['Psychology'],
}, },
@ -867,6 +884,7 @@ class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE
'duration': 1800, 'duration': 1800,
'uploader_id': 'bbc_radio_three', 'uploader_id': 'bbc_radio_three',
}, },
'skip': '404 Not Found',
}, { # onion routes }, { # onion routes
'url': 'https://www.bbcnewsd73hkzno2ini43t4gblxvycyac5aw4gnv7t2rccijh7745uqd.onion/news/av/world-europe-63208576', 'url': 'https://www.bbcnewsd73hkzno2ini43t4gblxvycyac5aw4gnv7t2rccijh7745uqd.onion/news/av/world-europe-63208576',
'only_matching': True, 'only_matching': True,
@ -1082,83 +1100,141 @@ def _real_extract(self, url):
} }
# Morph based embed (e.g. http://www.bbc.co.uk/sport/live/olympics/36895975) # Morph based embed (e.g. http://www.bbc.co.uk/sport/live/olympics/36895975)
# There are several setPayload calls may be present but the video # Several setPayload calls may be present but the video(s)
# seems to be always related to the first one # should be in one that mentions leadMedia or videoData
morph_payload = self._parse_json( morph_payload = self._search_json(
self._search_regex( r'\bMorph\s*\.\s*setPayload\s*\([^,]+,', webpage, 'morph payload', playlist_id,
r'Morph\.setPayload\([^,]+,\s*({.+?})\);', contains_pattern=r'\{(?:(?!</script>)[\s\S])+?(?:"leadMedia"|\\"videoData\\")\s*:(?:(?!</script>)[\s\S])+\}',
webpage, 'morph payload', default='{}'), default={})
playlist_id, fatal=False)
if morph_payload: if morph_payload:
components = try_get(morph_payload, lambda x: x['body']['components'], list) or [] for component in traverse_obj(morph_payload, (
for component in components: 'body', 'components', lambda _, v: v['props']['leadMedia']['identifiers'])):
if not isinstance(component, dict): lead_media = component['props']['leadMedia']
continue programme_id = traverse_obj(lead_media['identifiers'], 'vpid', 'playablePid', expected_type=str)
lead_media = try_get(component, lambda x: x['props']['leadMedia'], dict)
if not lead_media:
continue
identifiers = lead_media.get('identifiers')
if not identifiers or not isinstance(identifiers, dict):
continue
programme_id = identifiers.get('vpid') or identifiers.get('playablePid')
if not programme_id: if not programme_id:
continue continue
title = lead_media.get('title') or self._og_search_title(webpage) title = lead_media.get('title') or self._og_search_title(webpage)
formats, subtitles = self._download_media_selector(programme_id) formats, subtitles = self._download_media_selector(programme_id)
description = lead_media.get('summary')
uploader = lead_media.get('masterBrand')
uploader_id = lead_media.get('mid')
duration = None
duration_d = lead_media.get('duration')
if isinstance(duration_d, dict):
duration = parse_duration(dict_get(
duration_d, ('rawDuration', 'formattedDuration', 'spokenDuration')))
return { return {
'id': programme_id, 'id': programme_id,
'title': title, 'title': title,
'description': description, **traverse_obj(lead_media, {
'duration': duration, 'description': ('summary', {str}),
'uploader': uploader, 'duration': ('duration', ('rawDuration', 'formattedDuration', 'spokenDuration'), {parse_duration}),
'uploader_id': uploader_id, 'uploader': ('masterBrand', {str}),
'uploader_id': ('mid', {str}),
}),
'formats': formats, 'formats': formats,
'subtitles': subtitles, 'subtitles': subtitles,
} }
body = traverse_obj(morph_payload, (
'body', 'content', 'article', 'body',
{lambda s: self._parse_json(s, playlist_id, fatal=False)}))
added = False
for video_data in traverse_obj(body, (Ellipsis, 'videoData', {lambda v: v.get('pid') and v})):
if video_data.get('vpid'):
video_id = video_data['vpid']
formats, subtitles = self._download_media_selector(video_id)
entry = {
'id': video_id,
'formats': formats,
'subtitles': subtitles,
}
else:
video_id = video_data['pid']
entry = self.url_result(
'https://www.bbc.co.uk/programmes/%s' % video_id, BBCCoUkIE.ie_key(),
video_id, url_transparent=True)
entry = merge_dicts(
traverse_obj(morph_payload, (
'body', 'content', 'article', {
'timestamp': ('dateTimeInfo', 'dateTime', {parse_iso8601}),
})), traverse_obj(video_data, {
'thumbnail': (('iChefImage', 'image'), {url_or_none}, any),
'title': (('title', 'caption'), {str}, any),
'duration': ('duration', {parse_duration}),
}), entry)
if video_data.get('isLead') and not self._yes_playlist(playlist_id, video_id):
return entry
entries.append(entry)
added = True
if added:
playlist_title = traverse_obj(morph_payload, (
'body', 'content', 'article', 'headline', {str})) or playlist_title
return self.playlist_result(
entries, playlist_id, playlist_title, playlist_description)
# various PRELOADED_STATE JSON
preload_state = self._search_json(
r'window\.__(?:PWA_)?PRELOADED_STATE__\s*=', webpage,
'preload state', playlist_id, transform_source=js_to_json, default={})
# PRELOADED_STATE with current programme
current_programme = traverse_obj(preload_state, (
'programmes', 'current', {dict}))
if current_programme:
programme_id = traverse_obj(current_programme, ('id', {str}))
if programme_id and current_programme.get('type') == 'playable_item':
title = traverse_obj(current_programme, ('titles', 'tertiary', {str})) or playlist_title
formats, subtitles = self._download_media_selector(programme_id)
return {
'id': programme_id,
'title': title,
'formats': formats,
**traverse_obj(current_programme, {
'description': ('synopses', ('long', 'medium', 'short'), {str}, any),
'thumbnail': ('image_url', {lambda u: url_or_none(u.replace('{recipe}', 'raw'))}),
'duration': ('duration', 'value', {int_or_none}),
'uploader': ('network', 'short_title', {str}),
'uploader_id': ('network', 'id', {str}),
}),
'subtitles': subtitles,
**traverse_obj(preload_state, {
'chapters': (
'tracklist', 'tracks', lambda _, v: float_or_none(v['offset']['start']), {
'title': ('titles', {lambda x: join_nonempty(
'primary', 'secondary', 'tertiary', delim=' - ', from_dict=x)}),
'start_time': ('offset', 'start', {float_or_none}),
'end_time': ('offset', 'end', {float_or_none}),
}
)
}),
}
preload_state = self._parse_json(self._search_regex( # PWA_PRELOADED_STATE with article video asset
r'window\.__PRELOADED_STATE__\s*=\s*({.+?});', webpage, asset_id = traverse_obj(preload_state, (
'preload state', default='{}'), playlist_id, fatal=False) 'entities', 'articles', lambda k, _: k.rsplit('/', 1)[-1] == playlist_id,
if preload_state: 'assetVideo', 0, {str}, any))
current_programme = preload_state.get('programmes', {}).get('current') or {} if asset_id:
programme_id = current_programme.get('id') video_id = traverse_obj(preload_state, ('entities', 'videos', asset_id, 'vpid', {str}))
if current_programme and programme_id and current_programme.get('type') == 'playable_item': if video_id:
title = current_programme.get('titles', {}).get('tertiary') or playlist_title article = traverse_obj(preload_state, (
formats, subtitles = self._download_media_selector(programme_id) 'entities', 'articles', lambda _, v: v['assetVideo'][0] == asset_id, any))
synopses = current_programme.get('synopses') or {}
network = current_programme.get('network') or {} def image_url(image_id):
duration = int_or_none( return traverse_obj(preload_state, (
current_programme.get('duration', {}).get('value')) 'entities', 'images', image_id, 'url',
thumbnail = None {lambda u: url_or_none(u.replace('$recipe', 'raw'))}))
image_url = current_programme.get('image_url')
if image_url: formats, subtitles = self._download_media_selector(video_id)
thumbnail = image_url.replace('{recipe}', 'raw')
return { return {
'id': programme_id, 'id': video_id,
'title': title, **traverse_obj(preload_state, ('entities', 'videos', asset_id, {
'description': dict_get(synopses, ('long', 'medium', 'short')), 'title': ('title', {str}),
'thumbnail': thumbnail, 'description': (('synopsisLong', 'synopsisMedium', 'synopsisShort'), {str}, any),
'duration': duration, 'thumbnail': (0, {image_url}),
'uploader': network.get('short_title'), 'duration': ('duration', {int_or_none}),
'uploader_id': network.get('id'), })),
'formats': formats, 'formats': formats,
'subtitles': subtitles, 'subtitles': subtitles,
'chapters': traverse_obj(preload_state, ( **traverse_obj(article, {
'tracklist', 'tracks', lambda _, v: float_or_none(v['offset']['start']), { 'timestamp': ('displayDate', {parse_iso8601}),
'title': ('titles', {lambda x: join_nonempty( }),
'primary', 'secondary', 'tertiary', delim=' - ', from_dict=x)}),
'start_time': ('offset', 'start', {float_or_none}),
'end_time': ('offset', 'end', {float_or_none}),
})) or None,
} }
else:
return self.url_result(
'https://www.bbc.co.uk/programmes/%s' % asset_id, BBCCoUkIE.ie_key(),
asset_id, playlist_title, display_id=playlist_id,
description=playlist_description)
bbc3_config = self._parse_json( bbc3_config = self._parse_json(
self._search_regex( self._search_regex(
@ -1204,6 +1280,28 @@ def _real_extract(self, url):
return self.playlist_result( return self.playlist_result(
entries, playlist_id, playlist_title, playlist_description) entries, playlist_id, playlist_title, playlist_description)
k_int_or_none = functools.partial(int_or_none, scale=1000)
def parse_model(model):
'''Extract single video from model structure

Returns an info dict for the video, or None when the model has no
string value at versions[0].versionId to use as the media id.
'''
# The first version's id serves both as the entry id and as the key
# passed to the media selector lookup below
item_id = traverse_obj(model, ('versions', 0, 'versionId', {str}))
if not item_id:
return
formats, subtitles = self._download_media_selector(item_id)
return {
'id': item_id,
'formats': formats,
'subtitles': subtitles,
# Optional metadata read from the same model
**traverse_obj(model, {
'title': ('title', {str}),
# Swap the '$recipe' placeholder for 'raw', then resolve the result
# against `url` from the enclosing scope
'thumbnail': ('imageUrl', {lambda u: urljoin(url, u.replace('$recipe', 'raw'))}),
# NOTE(review): presumably yields the first string found among the
# long/medium/short synopses - confirm against traverse_obj semantics
'description': (
'synopses', ('long', 'medium', 'short'), {str}, any),
# NOTE(review): {int} presumably keeps the value only if it is already
# an int rather than converting it - confirm
'duration': ('versions', 0, 'duration', {int}),
# k_int_or_none is int_or_none with scale=1000; presumably availableFrom
# is in milliseconds - confirm
'timestamp': ('versions', 0, 'availableFrom', {k_int_or_none}),
})
}
initial_data = self._search_regex( initial_data = self._search_regex(
r'window\.__INITIAL_DATA__\s*=\s*("{.+?}")\s*;', webpage, r'window\.__INITIAL_DATA__\s*=\s*("{.+?}")\s*;', webpage,
'quoted preload state', default=None) 'quoted preload state', default=None)
@ -1215,6 +1313,21 @@ def _real_extract(self, url):
initial_data = self._parse_json(initial_data or '"{}"', playlist_id, fatal=False) initial_data = self._parse_json(initial_data or '"{}"', playlist_id, fatal=False)
initial_data = self._parse_json(initial_data, playlist_id, fatal=False) initial_data = self._parse_json(initial_data, playlist_id, fatal=False)
if initial_data: if initial_data:
added = False
for video_data in traverse_obj(initial_data, (
'stores', 'article', 'articleBodyContent', lambda _, v: v['type'] == 'video')):
model = traverse_obj(video_data, (
'model', 'blocks', lambda _, v: v['type'] == 'aresMedia',
'model', 'blocks', lambda _, v: v['type'] == 'aresMediaMetadata',
'model', {dict}, any))
entry = parse_model(model)
if entry:
entries.append(entry)
added = True
if added:
return self.playlist_result(
entries, playlist_id, playlist_title, playlist_description)
def parse_media(media): def parse_media(media):
if not media: if not media:
return return
@ -1248,18 +1361,19 @@ def parse_media(media):
'timestamp': item_time, 'timestamp': item_time,
'description': strip_or_none(item_desc), 'description': strip_or_none(item_desc),
}) })
for resp in traverse_obj(initial_data, ('data', lambda _, v: v.get('name'))):
name = resp['name']
for resp in (initial_data.get('data') or {}).values(): for resp in (initial_data.get('data') or {}).values():
name = resp.get('name') name = resp.get('name')
if name == 'media-experience': if name == 'media-experience':
parse_media(try_get(resp, lambda x: x['data']['initialItem']['mediaItem'], dict)) parse_media(try_get(resp, lambda x: x['data']['initialItem']['mediaItem'], dict))
elif name == 'article': elif name == 'article':
for block in (try_get(resp, for block in traverse_obj(resp, ('data', (
(lambda x: x['data']['blocks'], None, ('content', 'model')), 'blocks',
lambda x: x['data']['content']['model']['blocks'],), lambda _, v: v.get('type') in {'media', 'video'},
list) or []): 'model', {dict})):
if block.get('type') not in ['media', 'video']: parse_media(block)
continue
parse_media(block.get('model'))
return self.playlist_result( return self.playlist_result(
entries, playlist_id, playlist_title, playlist_description) entries, playlist_id, playlist_title, playlist_description)
@ -1268,26 +1382,6 @@ def extract_all(pattern):
lambda s: self._parse_json(s, playlist_id, fatal=False), lambda s: self._parse_json(s, playlist_id, fatal=False),
re.findall(pattern, webpage)))) re.findall(pattern, webpage))))
def parse_model(model):
'''Extract single video from model structure'''
item_id = traverse_obj(model, ('versions', 0, 'versionId', {str}))
if not item_id:
return
formats, subtitles = self._download_media_selector(item_id)
return {
'id': item_id,
'formats': formats,
'subtitles': subtitles,
**traverse_obj(model, {
'title': ('title', {str}),
'thumbnail': ('imageUrl', {lambda u: urljoin(url, u.replace('$recipe', 'raw'))}),
'description': (
'synopses', ('long', 'medium', 'short'), {str}, any),
'duration': ('versions', 0, 'duration', {int}),
'timestamp': ('versions', 0, 'availableFrom', {lambda x: int_or_none(x, scale=1000)}),
})
}
# US accessed article with single embedded video (e.g. # US accessed article with single embedded video (e.g.
# https://www.bbc.com/news/uk-68546268) # https://www.bbc.com/news/uk-68546268)
next_data = traverse_obj(self._search_nextjs_data(webpage, playlist_id, default={}), ( next_data = traverse_obj(self._search_nextjs_data(webpage, playlist_id, default={}), (
@ -1303,7 +1397,7 @@ def parse_model(model):
if entry.get('timestamp') is None: if entry.get('timestamp') is None:
entry['timestamp'] = traverse_obj(next_data, ( entry['timestamp'] = traverse_obj(next_data, (
..., 'contents', lambda _, v: v['type'] == 'timestamp', ..., 'contents', lambda _, v: v['type'] == 'timestamp',
'model', 'timestamp', {functools.partial(int_or_none, scale=1000)}, any)) 'model', 'timestamp', {k_int_or_none}, any))
entries.append(entry) entries.append(entry)
return self.playlist_result( return self.playlist_result(
entries, playlist_id, playlist_title, playlist_description) entries, playlist_id, playlist_title, playlist_description)