[extractor/common] Support root JSON-LD lists (Closes #10203)

This commit is contained in:
Sergey M․ 2016-08-05 23:14:32 +07:00
parent 3859ebeee6
commit 46933a15d6
No known key found for this signature in database
GPG key ID: 2C393E0F18A9236D

View file

@@ -828,41 +828,47 @@ def _json_ld(self, json_ld, video_id, fatal=True, expected_type=None):
if not json_ld: if not json_ld:
return {} return {}
info = {} info = {}
if json_ld.get('@context') == 'http://schema.org': if not isinstance(json_ld, (list, tuple, dict)):
item_type = json_ld.get('@type') return info
if expected_type is not None and expected_type != item_type: if isinstance(json_ld, dict):
return info json_ld = [json_ld]
if item_type == 'TVEpisode': for e in json_ld:
info.update({ if e.get('@context') == 'http://schema.org':
'episode': unescapeHTML(json_ld.get('name')), item_type = e.get('@type')
'episode_number': int_or_none(json_ld.get('episodeNumber')), if expected_type is not None and expected_type != item_type:
'description': unescapeHTML(json_ld.get('description')), return info
}) if item_type == 'TVEpisode':
part_of_season = json_ld.get('partOfSeason') info.update({
if isinstance(part_of_season, dict) and part_of_season.get('@type') == 'TVSeason': 'episode': unescapeHTML(e.get('name')),
info['season_number'] = int_or_none(part_of_season.get('seasonNumber')) 'episode_number': int_or_none(e.get('episodeNumber')),
part_of_series = json_ld.get('partOfSeries') 'description': unescapeHTML(e.get('description')),
if isinstance(part_of_series, dict) and part_of_series.get('@type') == 'TVSeries': })
info['series'] = unescapeHTML(part_of_series.get('name')) part_of_season = e.get('partOfSeason')
elif item_type == 'Article': if isinstance(part_of_season, dict) and part_of_season.get('@type') == 'TVSeason':
info.update({ info['season_number'] = int_or_none(part_of_season.get('seasonNumber'))
'timestamp': parse_iso8601(json_ld.get('datePublished')), part_of_series = e.get('partOfSeries')
'title': unescapeHTML(json_ld.get('headline')), if isinstance(part_of_series, dict) and part_of_series.get('@type') == 'TVSeries':
'description': unescapeHTML(json_ld.get('articleBody')), info['series'] = unescapeHTML(part_of_series.get('name'))
}) elif item_type == 'Article':
elif item_type == 'VideoObject': info.update({
info.update({ 'timestamp': parse_iso8601(e.get('datePublished')),
'url': json_ld.get('contentUrl'), 'title': unescapeHTML(e.get('headline')),
'title': unescapeHTML(json_ld.get('name')), 'description': unescapeHTML(e.get('articleBody')),
'description': unescapeHTML(json_ld.get('description')), })
'thumbnail': json_ld.get('thumbnailUrl'), elif item_type == 'VideoObject':
'duration': parse_duration(json_ld.get('duration')), info.update({
'timestamp': unified_timestamp(json_ld.get('uploadDate')), 'url': e.get('contentUrl'),
'filesize': float_or_none(json_ld.get('contentSize')), 'title': unescapeHTML(e.get('name')),
'tbr': int_or_none(json_ld.get('bitrate')), 'description': unescapeHTML(e.get('description')),
'width': int_or_none(json_ld.get('width')), 'thumbnail': e.get('thumbnailUrl'),
'height': int_or_none(json_ld.get('height')), 'duration': parse_duration(e.get('duration')),
}) 'timestamp': unified_timestamp(e.get('uploadDate')),
'filesize': float_or_none(e.get('contentSize')),
'tbr': int_or_none(e.get('bitrate')),
'width': int_or_none(e.get('width')),
'height': int_or_none(e.get('height')),
})
break
return dict((k, v) for k, v in info.items() if v is not None) return dict((k, v) for k, v in info.items() if v is not None)
@staticmethod @staticmethod