[youtube] Make search extraction less dependent on the JSON schema.

If an object looks like a video (it has a `videoId` key), assume that it is one.
This commit is contained in:
xarantolus 2020-06-23 08:56:21 +02:00
parent 19f671f88b
commit e03b4f3e05

View file

@ -3229,16 +3229,37 @@ class YoutubeSearchURLIE(YoutubeSearchBaseInfoExtractor):
'only_matching': True, 'only_matching': True,
}] }]
def _find_videos_in_json(self, extracted):
videos = []
def _real_find(obj):
if obj is None or isinstance(obj, str):
return
if type(obj) is list:
for elem in obj:
_real_find(elem)
if type(obj) is dict:
if "videoId" in obj:
videos.append(obj)
return
for _, o in obj.items():
_real_find(o)
_real_find(extracted)
return videos
def extract_videos_from_page_impl(self, page, ids_in_page, titles_in_page): def extract_videos_from_page_impl(self, page, ids_in_page, titles_in_page):
search_response = self._parse_json(self._search_regex(self._SEARCH_DATA, page, 'ytInitialData'), None) search_response = self._parse_json(self._search_regex(self._SEARCH_DATA, page, 'ytInitialData'), None)
result_items = try_get( result_items = self._find_videos_in_json(search_response)
search_response,
lambda x: x['contents']['twoColumnSearchResultsRenderer']['primaryContents']['sectionListRenderer']['contents'][0]['itemSectionRenderer']['contents'])
for plobj in result_items: for plobj in result_items:
video_id = try_get(plobj, lambda x: x['videoRenderer']['videoId']) video_id = try_get(plobj, lambda x: x['videoId'])
video_title = try_get(plobj, lambda x: x['videoRenderer']['title']['runs'][0]['text']) video_title = try_get(plobj, lambda x: x['title']['runs'][0]['text'])
if video_id is None or video_title is None: if video_id is None or video_title is None:
# we do not have a videoRenderer or it is empty # we do not have a videoRenderer or it is empty