[ie/theplatform] Extract more metadata (#8635)

Authored by: trainman261
This commit is contained in:
trainman261 2023-12-12 01:00:35 +01:00 committed by GitHub
parent e370f9ec36
commit 7e09c147fd
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
7 changed files with 73 additions and 11 deletions

View File

@ -121,11 +121,21 @@ class AENetworksIE(AENetworksBaseIE):
'info_dict': { 'info_dict': {
'id': '22253814', 'id': '22253814',
'ext': 'mp4', 'ext': 'mp4',
'title': 'Winter is Coming', 'title': 'Winter Is Coming',
'description': 'md5:641f424b7a19d8e24f26dea22cf59d74', 'description': 'md5:a40e370925074260b1c8a633c632c63a',
'timestamp': 1338306241, 'timestamp': 1338306241,
'upload_date': '20120529', 'upload_date': '20120529',
'uploader': 'AENE-NEW', 'uploader': 'AENE-NEW',
'duration': 2592.0,
'thumbnail': r're:^https?://.*\.jpe?g$',
'chapters': 'count:5',
'tags': 'count:14',
'categories': ['Mountain Men'],
'episode_number': 1,
'episode': 'Episode 1',
'season': 'Season 1',
'season_number': 1,
'series': 'Mountain Men',
}, },
'params': { 'params': {
# m3u8 download # m3u8 download
@ -143,6 +153,15 @@ class AENetworksIE(AENetworksBaseIE):
'timestamp': 1452634428, 'timestamp': 1452634428,
'upload_date': '20160112', 'upload_date': '20160112',
'uploader': 'AENE-NEW', 'uploader': 'AENE-NEW',
'duration': 1277.695,
'thumbnail': r're:^https?://.*\.jpe?g$',
'chapters': 'count:4',
'tags': 'count:23',
'episode': 'Episode 1',
'episode_number': 1,
'season': 'Season 9',
'season_number': 9,
'series': 'Duck Dynasty',
}, },
'params': { 'params': {
# m3u8 download # m3u8 download

View File

@ -180,6 +180,13 @@ class CBCPlayerIE(InfoExtractor):
'thumbnail': 'http://thumbnails.cbc.ca/maven_legacy/thumbnails/sonali-karnick-220.jpg', 'thumbnail': 'http://thumbnails.cbc.ca/maven_legacy/thumbnails/sonali-karnick-220.jpg',
'chapters': [], 'chapters': [],
'duration': 494.811, 'duration': 494.811,
'categories': ['AudioMobile/All in a Weekend Montreal'],
'tags': 'count:8',
'location': 'Quebec',
'series': 'All in a Weekend Montreal',
'season': 'Season 2015',
'season_number': 2015,
'media_type': 'Excerpt',
}, },
}, { }, {
'url': 'http://www.cbc.ca/player/play/2164402062', 'url': 'http://www.cbc.ca/player/play/2164402062',
@ -195,25 +202,37 @@ class CBCPlayerIE(InfoExtractor):
'thumbnail': 'https://thumbnails.cbc.ca/maven_legacy/thumbnails/277/67/cancer_852x480_2164412612.jpg', 'thumbnail': 'https://thumbnails.cbc.ca/maven_legacy/thumbnails/277/67/cancer_852x480_2164412612.jpg',
'chapters': [], 'chapters': [],
'duration': 186.867, 'duration': 186.867,
'series': 'CBC News: Windsor at 6:00',
'categories': ['News/Canada/Windsor'],
'location': 'Windsor',
'tags': ['cancer'],
'creator': 'Allison Johnson',
'media_type': 'Excerpt',
}, },
}, { }, {
# Has subtitles # Has subtitles
# These broadcasts expire after ~1 month, can find new test URL here: # These broadcasts expire after ~1 month, can find new test URL here:
# https://www.cbc.ca/player/news/TV%20Shows/The%20National/Latest%20Broadcast # https://www.cbc.ca/player/news/TV%20Shows/The%20National/Latest%20Broadcast
'url': 'http://www.cbc.ca/player/play/2249992771553', 'url': 'http://www.cbc.ca/player/play/2284799043667',
'md5': '2f2fb675dd4f0f8a5bb7588d1b13bacd', 'md5': '9b49f0839e88b6ec0b01d840cf3d42b5',
'info_dict': { 'info_dict': {
'id': '2249992771553', 'id': '2284799043667',
'ext': 'mp4', 'ext': 'mp4',
'title': 'The National | Womens soccer pay, Florida seawater, Swift quake', 'title': 'The National | Hockey coach charged, Green grants, Safer drugs',
'description': 'md5:adba28011a56cfa47a080ff198dad27a', 'description': 'md5:84ef46321c94bcf7d0159bb565d26bfa',
'timestamp': 1690596000, 'timestamp': 1700272800,
'duration': 2716.333, 'duration': 2718.833,
'subtitles': {'eng': [{'ext': 'vtt', 'protocol': 'm3u8_native'}]}, 'subtitles': {'eng': [{'ext': 'vtt', 'protocol': 'm3u8_native'}]},
'thumbnail': 'https://thumbnails.cbc.ca/maven_legacy/thumbnails/481/326/thumbnail.jpeg', 'thumbnail': 'https://thumbnails.cbc.ca/maven_legacy/thumbnails/907/171/thumbnail.jpeg',
'uploader': 'CBCC-NEW', 'uploader': 'CBCC-NEW',
'chapters': 'count:5', 'chapters': 'count:5',
'upload_date': '20230729', 'upload_date': '20231118',
'categories': 'count:4',
'series': 'The National - Full Show',
'tags': 'count:1',
'creator': 'News',
'location': 'Canada',
'media_type': 'Full Program',
}, },
}] }]

View File

@ -46,6 +46,10 @@ class CWTVIE(InfoExtractor):
'timestamp': 1444107300, 'timestamp': 1444107300,
'age_limit': 14, 'age_limit': 14,
'uploader': 'CWTV', 'uploader': 'CWTV',
'thumbnail': r're:^https?://.*\.jpe?g$',
'chapters': 'count:4',
'episode': 'Episode 20',
'season': 'Season 11',
}, },
'params': { 'params': {
# m3u8 download # m3u8 download

View File

@ -73,6 +73,7 @@ class MediasetIE(ThePlatformBaseIE):
'season_number': 5, 'season_number': 5,
'episode_number': 5, 'episode_number': 5,
'chapters': [{'start_time': 0.0, 'end_time': 3409.08}, {'start_time': 3409.08, 'end_time': 6565.008}], 'chapters': [{'start_time': 0.0, 'end_time': 3409.08}, {'start_time': 3409.08, 'end_time': 6565.008}],
'categories': ['Informazione'],
}, },
}, { }, {
# DRM # DRM
@ -149,6 +150,7 @@ class MediasetIE(ThePlatformBaseIE):
'season_number': 12, 'season_number': 12,
'episode': 'Episode 8', 'episode': 'Episode 8',
'episode_number': 8, 'episode_number': 8,
'categories': ['Intrattenimento'],
}, },
'params': { 'params': {
'skip_download': True, 'skip_download': True,

View File

@ -53,6 +53,8 @@ class NBCIE(ThePlatformIE): # XXX: Do not subclass from concrete IE
'chapters': 'count:1', 'chapters': 'count:1',
'tags': 'count:4', 'tags': 'count:4',
'thumbnail': r're:https?://.+\.jpg', 'thumbnail': r're:https?://.+\.jpg',
'categories': ['Series/The Tonight Show Starring Jimmy Fallon'],
'media_type': 'Full Episode',
}, },
'params': { 'params': {
'skip_download': 'm3u8', 'skip_download': 'm3u8',
@ -131,6 +133,8 @@ class NBCIE(ThePlatformIE): # XXX: Do not subclass from concrete IE
'tags': 'count:10', 'tags': 'count:10',
'age_limit': 0, 'age_limit': 0,
'thumbnail': r're:https?://.+\.jpg', 'thumbnail': r're:https?://.+\.jpg',
'categories': ['Series/Quantum Leap 2022'],
'media_type': 'Highlight',
}, },
'params': { 'params': {
'skip_download': 'm3u8', 'skip_download': 'm3u8',

View File

@ -114,6 +114,8 @@ class ScrippsNetworksIE(InfoExtractor):
'timestamp': 1475678834, 'timestamp': 1475678834,
'upload_date': '20161005', 'upload_date': '20161005',
'uploader': 'SCNI-SCND', 'uploader': 'SCNI-SCND',
'tags': 'count:10',
'creator': 'Cooking Channel',
'duration': 29.995, 'duration': 29.995,
'chapters': [{'start_time': 0.0, 'end_time': 29.995, 'title': '<Untitled Chapter 1>'}], 'chapters': [{'start_time': 0.0, 'end_time': 29.995, 'title': '<Untitled Chapter 1>'}],
'thumbnail': 'https://images.dds.discovery.com/up/tp/Scripps_-_Food_Category_Prod/122/987/0260338_630x355.jpg', 'thumbnail': 'https://images.dds.discovery.com/up/tp/Scripps_-_Food_Category_Prod/122/987/0260338_630x355.jpg',

View File

@ -104,6 +104,10 @@ class ThePlatformBaseIE(OnceIE):
_add_chapter(chapter.get('startTime'), chapter.get('endTime')) _add_chapter(chapter.get('startTime'), chapter.get('endTime'))
_add_chapter(tp_chapters[-1].get('startTime'), tp_chapters[-1].get('endTime') or duration) _add_chapter(tp_chapters[-1].get('startTime'), tp_chapters[-1].get('endTime') or duration)
def extract_site_specific_field(field):
# A number of sites have custom-prefixed keys, e.g. 'cbc$seasonNumber'
return traverse_obj(info, lambda k, v: v and k.endswith(f'${field}'), get_all=False)
return { return {
'title': info['title'], 'title': info['title'],
'subtitles': subtitles, 'subtitles': subtitles,
@ -113,6 +117,14 @@ class ThePlatformBaseIE(OnceIE):
'timestamp': int_or_none(info.get('pubDate'), 1000) or None, 'timestamp': int_or_none(info.get('pubDate'), 1000) or None,
'uploader': info.get('billingCode'), 'uploader': info.get('billingCode'),
'chapters': chapters, 'chapters': chapters,
'creator': traverse_obj(info, ('author', {str})) or None,
'categories': traverse_obj(info, (
'categories', lambda _, v: v.get('label') in ('category', None), 'name', {str})) or None,
'tags': traverse_obj(info, ('keywords', {lambda x: re.split(r'[;,]\s?', x) if x else None})),
'location': extract_site_specific_field('region'),
'series': extract_site_specific_field('show'),
'season_number': int_or_none(extract_site_specific_field('seasonNumber')),
'media_type': extract_site_specific_field('programmingType') or extract_site_specific_field('type'),
} }
def _extract_theplatform_metadata(self, path, video_id): def _extract_theplatform_metadata(self, path, video_id):