[extractor] Extract thumbnails from JSON-LD (#2195)

Authored by: nixxo
This commit is contained in:
nixxo 2022-01-01 20:50:27 +01:00 committed by GitHub
parent 767f999b53
commit 7592749cbe
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
2 changed files with 28 additions and 1 deletions

View file

@ -208,6 +208,32 @@ def test_search_json_ld_realworld(self):
}, },
{'expected_type': 'NewsArticle'}, {'expected_type': 'NewsArticle'},
), ),
(
# test multiple thumbnails in a list
r'''
<script type="application/ld+json">
{"@context":"https://schema.org",
"@type":"VideoObject",
"thumbnailUrl":["https://www.rainews.it/cropgd/640x360/dl/img/2021/12/30/1640886376927_GettyImages.jpg"]}
</script>''',
{
'thumbnails': [{'url': 'https://www.rainews.it/cropgd/640x360/dl/img/2021/12/30/1640886376927_GettyImages.jpg'}],
},
{},
),
(
# test single thumbnail
r'''
<script type="application/ld+json">
{"@context":"https://schema.org",
"@type":"VideoObject",
"thumbnailUrl":"https://www.rainews.it/cropgd/640x360/dl/img/2021/12/30/1640886376927_GettyImages.jpg"}
</script>''',
{
'thumbnails': [{'url': 'https://www.rainews.it/cropgd/640x360/dl/img/2021/12/30/1640886376927_GettyImages.jpg'}],
},
{},
)
] ]
for html, expected_dict, search_json_ld_kwargs in _TESTS: for html, expected_dict, search_json_ld_kwargs in _TESTS:
expect_dict( expect_dict(

View file

@ -1436,7 +1436,8 @@ def extract_video_object(e):
'url': url_or_none(e.get('contentUrl')), 'url': url_or_none(e.get('contentUrl')),
'title': unescapeHTML(e.get('name')), 'title': unescapeHTML(e.get('name')),
'description': unescapeHTML(e.get('description')), 'description': unescapeHTML(e.get('description')),
'thumbnail': url_or_none(e.get('thumbnailUrl') or e.get('thumbnailURL')), 'thumbnails': [{'url': url_or_none(url)}
for url in variadic(traverse_obj(e, 'thumbnailUrl', 'thumbnailURL'))],
'duration': parse_duration(e.get('duration')), 'duration': parse_duration(e.get('duration')),
'timestamp': unified_timestamp(e.get('uploadDate')), 'timestamp': unified_timestamp(e.get('uploadDate')),
# author can be an instance of 'Organization' or 'Person' types. # author can be an instance of 'Organization' or 'Person' types.