[extractor/twitter] Fix --no-playlist and add media view_count when using GraphQL (#6211)

Authored by: Grub4K
This commit is contained in:
Simon Sawicki 2023-02-12 14:43:26 +01:00 committed by GitHub
parent 2e269bd998
commit b6795fd310
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23

View file

@ -293,7 +293,7 @@ def _real_extract(self, url):
class TwitterIE(TwitterBaseIE): class TwitterIE(TwitterBaseIE):
IE_NAME = 'twitter' IE_NAME = 'twitter'
_VALID_URL = TwitterBaseIE._BASE_REGEX + r'(?:(?:i/web|[^/]+)/status|statuses)/(?P<id>\d+)(?:/video/(?P<index>\d+))?' _VALID_URL = TwitterBaseIE._BASE_REGEX + r'(?:(?:i/web|[^/]+)/status|statuses)/(?P<id>\d+)(?:/(?:video|photo)/(?P<index>\d+))?'
_TESTS = [{ _TESTS = [{
'url': 'https://twitter.com/freethenipple/status/643211948184596480', 'url': 'https://twitter.com/freethenipple/status/643211948184596480',
@ -336,7 +336,7 @@ class TwitterIE(TwitterBaseIE):
'id': '665052190608723968', 'id': '665052190608723968',
'display_id': '665052190608723968', 'display_id': '665052190608723968',
'ext': 'mp4', 'ext': 'mp4',
'title': 'md5:e99588f17b3dd0503814ffb560e64731', 'title': r're:Star Wars.*A new beginning is coming December 18.*',
'description': 'A new beginning is coming December 18. Watch the official 60 second #TV spot for #StarWars: #TheForceAwakens. https://t.co/OkSqT2fjWJ', 'description': 'A new beginning is coming December 18. Watch the official 60 second #TV spot for #StarWars: #TheForceAwakens. https://t.co/OkSqT2fjWJ',
'uploader_id': 'starwars', 'uploader_id': 'starwars',
'uploader': r're:Star Wars.*', 'uploader': r're:Star Wars.*',
@ -752,7 +752,7 @@ class TwitterIE(TwitterBaseIE):
'info_dict': { 'info_dict': {
'id': '1600649511827013632', 'id': '1600649511827013632',
'ext': 'mp4', 'ext': 'mp4',
'title': 'md5:be05989b0722e114103ed3851a0ffae2', 'title': 'md5:dac4f4d4c591fcc4e88a253eba472dc3',
'thumbnail': r're:^https?://.+\.jpg', 'thumbnail': r're:^https?://.+\.jpg',
'timestamp': 1670459604.0, 'timestamp': 1670459604.0,
'uploader_id': 'CTVJLaidlaw', 'uploader_id': 'CTVJLaidlaw',
@ -792,6 +792,52 @@ class TwitterIE(TwitterBaseIE):
'repost_count': int, 'repost_count': int,
'comment_count': int, 'comment_count': int,
}, },
}, {
'url': 'https://twitter.com/hlo_again/status/1599108751385972737/video/2',
'info_dict': {
'id': '1599108643743473680',
'display_id': '1599108751385972737',
'ext': 'mp4',
'title': '\u06ea - \U0001F48B',
'uploader_url': 'https://twitter.com/hlo_again',
'like_count': int,
'uploader_id': 'hlo_again',
'thumbnail': 'https://pbs.twimg.com/ext_tw_video_thumb/1599108643743473680/pu/img/UG3xjov4rgg5sbYM.jpg?name=orig',
'repost_count': int,
'duration': 9.531,
'comment_count': int,
'upload_date': '20221203',
'age_limit': 0,
'timestamp': 1670092210.0,
'tags': [],
'uploader': '\u06ea',
'description': '\U0001F48B https://t.co/bTj9Qz7vQP',
},
'params': {'noplaylist': True},
}, {
# Media view count is GraphQL only, force in test
'url': 'https://twitter.com/MunTheShinobi/status/1600009574919962625',
'info_dict': {
'id': '1600009362759733248',
'display_id': '1600009574919962625',
'ext': 'mp4',
'uploader_url': 'https://twitter.com/MunTheShinobi',
'description': 'This is a genius ad by Apple. \U0001f525\U0001f525\U0001f525\U0001f525\U0001f525 https://t.co/cNsA0MoOml',
'view_count': int,
'thumbnail': 'https://pbs.twimg.com/ext_tw_video_thumb/1600009362759733248/pu/img/XVhFQivj75H_YxxV.jpg?name=orig',
'age_limit': 0,
'uploader': 'Mün The Shinobi | BlaqBoi\'s Therapist',
'repost_count': int,
'upload_date': '20221206',
'title': 'Mün The Shinobi | BlaqBoi\'s Therapist - This is a genius ad by Apple. \U0001f525\U0001f525\U0001f525\U0001f525\U0001f525',
'comment_count': int,
'like_count': int,
'tags': [],
'uploader_id': 'MunTheShinobi',
'duration': 139.987,
'timestamp': 1670306984.0,
},
'params': {'extractor_args': {'twitter': {'force_graphql': ['']}}},
}, { }, {
# onion route # onion route
'url': 'https://twitter3e4tixl4xyajtrzo62zg5vztmjuricljdp2c5kshju4avyoid.onion/TwitterBlue/status/1484226494708662273', 'url': 'https://twitter3e4tixl4xyajtrzo62zg5vztmjuricljdp2c5kshju4avyoid.onion/TwitterBlue/status/1484226494708662273',
@ -920,13 +966,6 @@ def _real_extract(self, url):
title = f'{uploader} - {title}' title = f'{uploader} - {title}'
uploader_id = user.get('screen_name') uploader_id = user.get('screen_name')
tags = []
for hashtag in (try_get(status, lambda x: x['entities']['hashtags'], list) or []):
hashtag_text = hashtag.get('text')
if not hashtag_text:
continue
tags.append(hashtag_text)
info = { info = {
'id': twid, 'id': twid,
'title': title, 'title': title,
@ -939,7 +978,7 @@ def _real_extract(self, url):
'repost_count': int_or_none(status.get('retweet_count')), 'repost_count': int_or_none(status.get('retweet_count')),
'comment_count': int_or_none(status.get('reply_count')), 'comment_count': int_or_none(status.get('reply_count')),
'age_limit': 18 if status.get('possibly_sensitive') else 0, 'age_limit': 18 if status.get('possibly_sensitive') else 0,
'tags': tags, 'tags': traverse_obj(status, ('entities', 'hashtags', ..., 'text')),
} }
def extract_from_video_info(media): def extract_from_video_info(media):
@ -973,6 +1012,7 @@ def add_thumbnail(name, size):
'formats': formats, 'formats': formats,
'subtitles': subtitles, 'subtitles': subtitles,
'thumbnails': thumbnails, 'thumbnails': thumbnails,
'view_count': traverse_obj(media, ('mediaStats', 'viewCount', {int_or_none})),
'duration': float_or_none(video_info.get('duration_millis'), 1000), 'duration': float_or_none(video_info.get('duration_millis'), 1000),
# The codec of http formats are unknown # The codec of http formats are unknown
'_format_sort_fields': ('res', 'br', 'size', 'proto'), '_format_sort_fields': ('res', 'br', 'size', 'proto'),
@ -1052,11 +1092,31 @@ def get_binding_value(k):
'content_duration_seconds')), 'content_duration_seconds')),
} }
media_path = ((None, 'quoted_status'), 'extended_entities', 'media', lambda _, m: m['type'] != 'photo') videos = traverse_obj(status, (
videos = map(extract_from_video_info, traverse_obj(status, media_path, expected_type=dict)) (None, 'quoted_status'), 'extended_entities', 'media', lambda _, m: m['type'] != 'photo', {dict}))
cards = extract_from_card_info(status.get('card'))
entries = [{**info, **data, 'display_id': twid} for data in (*videos, *cards)]
if self._yes_playlist(twid, selected_index, video_label='URL-specified video number'):
selected_entries = (*map(extract_from_video_info, videos), *extract_from_card_info(status.get('card')))
else:
desired_obj = traverse_obj(status, ('extended_entities', 'media', int(selected_index) - 1, {dict}))
if not desired_obj:
raise ExtractorError(f'Video #{selected_index} is unavailable', expected=True)
elif desired_obj.get('type') != 'video':
raise ExtractorError(f'Media #{selected_index} is not a video', expected=True)
# Restore original archive id and video index in title
for index, entry in enumerate(videos, 1):
if entry.get('id') != desired_obj.get('id'):
continue
if index == 1:
info['_old_archive_ids'] = [make_archive_id(self, twid)]
if len(videos) != 1:
info['title'] += f' #{index}'
break
return {**info, **extract_from_video_info(desired_obj), 'display_id': twid}
entries = [{**info, **data, 'display_id': twid} for data in selected_entries]
if not entries: if not entries:
expanded_url = traverse_obj(status, ('entities', 'urls', 0, 'expanded_url'), expected_type=url_or_none) expanded_url = traverse_obj(status, ('entities', 'urls', 0, 'expanded_url'), expected_type=url_or_none)
if not expanded_url or expanded_url == url: if not expanded_url or expanded_url == url:
@ -1066,13 +1126,6 @@ def get_binding_value(k):
entries[0]['_old_archive_ids'] = [make_archive_id(self, twid)] entries[0]['_old_archive_ids'] = [make_archive_id(self, twid)]
if not self._yes_playlist(twid, selected_index, video_label='URL-specified video number'):
index = int(selected_index) - 1
if index >= len(entries):
raise ExtractorError(f'Video #{selected_index} is unavailable', expected=True)
return entries[index]
if len(entries) == 1: if len(entries) == 1:
return entries[0] return entries[0]