From 25f14e9f93295a787e0cb436a5f6179d6174733d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 15 May 2015 21:06:59 +0600 Subject: [PATCH] [youtube] Separate feed extractor --- youtube_dl/extractor/youtube.py | 143 +++++++++----------------------- 1 file changed, 37 insertions(+), 106 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index e58184adcf..9096a29756 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -49,6 +49,11 @@ def _set_language(self): # YouTube sets the expire time to about two months expire_time=time.time() + 2 * 30 * 24 * 3600) + def _ids_to_results(self, ids): + return [ + self.url_result(vid_id, 'Youtube', video_id=vid_id) + for vid_id in ids] + def _login(self): """ Attempt to log in to YouTube. @@ -1261,11 +1266,6 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor): def _real_initialize(self): self._login() - def _ids_to_results(self, ids): - return [ - self.url_result(vid_id, 'Youtube', video_id=vid_id) - for vid_id in ids] - def _extract_mix(self, playlist_id): # The mixes are generated from a single video # the id of the playlist is just 'RD' + video_id @@ -1601,20 +1601,10 @@ def _real_extract(self, url): class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor): """ - Base class for extractors that fetch info from - http://www.youtube.com/feed_ajax + Base class for feed extractors Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties. 
""" _LOGIN_REQUIRED = True - # use action_load_personal_feed instead of action_load_system_feed - _PERSONAL_FEED = False - - @property - def _FEED_TEMPLATE(self): - action = 'action_load_system_feed' - if self._PERSONAL_FEED: - action = 'action_load_personal_feed' - return 'https://www.youtube.com/feed_ajax?%s=1&feed_name=%s&paging=%%s' % (action, self._FEED_NAME) @property def IE_NAME(self): @@ -1624,58 +1614,8 @@ def _real_initialize(self): self._login() def _real_extract(self, url): - feed_entries = [] - paging = 0 - for i in itertools.count(1): - info = self._download_json( - self._FEED_TEMPLATE % paging, - '%s feed' % self._FEED_NAME, - 'Downloading page %s' % i, - transform_source=uppercase_escape) - feed_html = info.get('feed_html') or info.get('content_html') - load_more_widget_html = info.get('load_more_widget_html') or feed_html - m_ids = re.finditer(r'"/watch\?v=(.*?)["&]', feed_html) - ids = orderedSet(m.group(1) for m in m_ids) - feed_entries.extend( - self.url_result(video_id, 'Youtube', video_id=video_id) - for video_id in ids) - mobj = re.search( - r'data-uix-load-more-href="/?[^"]+paging=(?P<paging>\d+)', - load_more_widget_html) - if mobj is None: - break - paging = mobj.group('paging') - return self.playlist_result(feed_entries, playlist_title=self._PLAYLIST_TITLE) - - -class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor): - IE_NAME = 'youtube:recommended' - IE_DESC = 'YouTube.com recommended videos, ":ytrec" for short (requires authentication)' - _VALID_URL = r'https?://www\.youtube\.com/feed/recommended|:ytrec(?:ommended)?' 
- _FEED_NAME = 'recommended' - _PLAYLIST_TITLE = 'Youtube Recommended videos' - - -class YoutubeWatchLaterIE(YoutubePlaylistIE): - IE_NAME = 'youtube:watchlater' - IE_DESC = 'Youtube watch later list, ":ytwatchlater" for short (requires authentication)' - _VALID_URL = r'https?://www\.youtube\.com/(?:feed/watch_later|playlist\?list=WL)|:ytwatchlater' - - _TESTS = [] # override PlaylistIE tests - - def _real_extract(self, url): - return self._extract_playlist('WL') - - -class YoutubeHistoryIE(YoutubePlaylistIE): - IE_NAME = 'youtube:history' - IE_DESC = 'Youtube watch history, ":ythistory" for short (requires authentication)' - _VALID_URL = 'https?://www\.youtube\.com/feed/history|:ythistory' - _TESTS = [] - - def _real_extract(self, url): - title = 'Youtube History' - page = self._download_webpage('https://www.youtube.com/feed/history', title) + page = self._download_webpage( + 'https://www.youtube.com/feed/%s' % self._FEED_NAME, self._PLAYLIST_TITLE) # The extraction process is the same as for playlists, but the regex # for the video ids doesn't contain an index @@ -1692,17 +1632,25 @@ def _real_extract(self, url): break more = self._download_json( - 'https://youtube.com/%s' % mobj.group('more'), title, + 'https://youtube.com/%s' % mobj.group('more'), self._PLAYLIST_TITLE, 'Downloading page #%s' % page_num, transform_source=uppercase_escape) content_html = more['content_html'] more_widget_html = more['load_more_widget_html'] - return { - '_type': 'playlist', - 'title': title, - 'entries': self._ids_to_results(ids), - } + return self.playlist_result( + self._ids_to_results(ids), playlist_title=self._PLAYLIST_TITLE) + + +class YoutubeWatchLaterIE(YoutubePlaylistIE): + IE_NAME = 'youtube:watchlater' + IE_DESC = 'Youtube watch later list, ":ytwatchlater" for short (requires authentication)' + _VALID_URL = r'https?://www\.youtube\.com/(?:feed/watch_later|playlist\?list=WL)|:ytwatchlater' + + _TESTS = [] # override PlaylistIE tests + + def _real_extract(self, url): + 
return self._extract_playlist('WL') class YoutubeFavouritesIE(YoutubeBaseInfoExtractor): @@ -1717,42 +1665,25 @@ def _real_extract(self, url): return self.url_result(playlist_id, 'YoutubePlaylist') -class YoutubeSubscriptionsIE(YoutubePlaylistIE): - IE_NAME = 'youtube:subscriptions' +class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor): + IE_DESC = 'YouTube.com recommended videos, ":ytrec" for short (requires authentication)' + _VALID_URL = r'https?://www\.youtube\.com/feed/recommended|:ytrec(?:ommended)?' + _FEED_NAME = 'recommended' + _PLAYLIST_TITLE = 'Youtube Recommended videos' + + +class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor): IE_DESC = 'YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication)' _VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?' - _TESTS = [] + _FEED_NAME = 'subscriptions' + _PLAYLIST_TITLE = 'Youtube Subscriptions' - def _real_extract(self, url): - title = 'Youtube Subscriptions' - page = self._download_webpage('https://www.youtube.com/feed/subscriptions', title) - # The extraction process is the same as for playlists, but the regex - # for the video ids doesn't contain an index - ids = [] - more_widget_html = content_html = page - - for page_num in itertools.count(1): - matches = re.findall(r'href="\s*/watch\?v=([0-9A-Za-z_-]{11})', content_html) - new_ids = orderedSet(matches) - ids.extend(new_ids) - - mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html) - if not mobj: - break - - more = self._download_json( - 'https://youtube.com/%s' % mobj.group('more'), title, - 'Downloading page #%s' % page_num, - transform_source=uppercase_escape) - content_html = more['content_html'] - more_widget_html = more['load_more_widget_html'] - - return { - '_type': 'playlist', - 'title': title, - 'entries': self._ids_to_results(ids), - } +class YoutubeHistoryIE(YoutubeFeedsInfoExtractor): + IE_DESC = 'Youtube watch history, ":ythistory" for short (requires authentication)' 
+ _VALID_URL = 'https?://www\.youtube\.com/feed/history|:ythistory' + _FEED_NAME = 'history' + _PLAYLIST_TITLE = 'Youtube History' class YoutubeTruncatedURLIE(InfoExtractor):