Fix YoutubeChannelIE

- URLs with query parameters now match
- Fixes the regex for identifying videos
- Fixes pagination
This commit is contained in:
ispedals 2013-04-19 18:05:35 -04:00
parent dce9027045
commit 5a8d13199c

View file

@ -1823,15 +1823,23 @@ def _real_extract(self, url):
class YoutubeChannelIE(InfoExtractor): class YoutubeChannelIE(InfoExtractor):
"""Information Extractor for YouTube channels.""" """Information Extractor for YouTube channels."""
_VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)(?:/.*)?$" _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
_TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en' _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
_MORE_PAGES_INDICATOR = u"Next \N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}" _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
_MORE_PAGES_URL = 'http://www.youtube.com/channel_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
IE_NAME = u'youtube:channel' IE_NAME = u'youtube:channel'
def report_download_page(self, channel_id, pagenum): def report_download_page(self, channel_id, pagenum):
"""Report attempt to download channel page with given number.""" """Report attempt to download channel page with given number."""
self._downloader.to_screen(u'[youtube] Channel %s: Downloading page #%s' % (channel_id, pagenum)) self._downloader.to_screen(u'[youtube] Channel %s: Downloading page #%s' % (channel_id, pagenum))
def extract_videos_from_page(self, page):
    """Return the video ids linked from *page*, in order of first appearance.

    page is the text to scan: the channel page HTML, or the 'content_html'
    fragment of a channel_ajax response. Every href="/watch?v=<id>" link
    contributes its id once; repeated links to the same video are dropped.
    """
    ids_in_page = []
    # Set mirror of ids_in_page: gives constant-time duplicate checks while
    # the list preserves first-seen order.
    seen_ids = set()
    for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
        video_id = mobj.group(1)
        if video_id not in seen_ids:
            seen_ids.add(video_id)
            ids_in_page.append(video_id)
    return ids_in_page
def _real_extract(self, url): def _real_extract(self, url):
# Extract channel id # Extract channel id
mobj = re.match(self._VALID_URL, url) mobj = re.match(self._VALID_URL, url)
@ -1839,12 +1847,11 @@ def _real_extract(self, url):
self._downloader.report_error(u'invalid url: %s' % url) self._downloader.report_error(u'invalid url: %s' % url)
return return
# Download channel pages # Download channel page
channel_id = mobj.group(1) channel_id = mobj.group(1)
video_ids = [] video_ids = []
pagenum = 1 pagenum = 1
while True:
self.report_download_page(channel_id, pagenum) self.report_download_page(channel_id, pagenum)
url = self._TEMPLATE_URL % (channel_id, pagenum) url = self._TEMPLATE_URL % (channel_id, pagenum)
request = compat_urllib_request.Request(url) request = compat_urllib_request.Request(url)
@ -1855,16 +1862,31 @@ def _real_extract(self, url):
return return
# Extract video identifiers # Extract video identifiers
ids_in_page = [] ids_in_page = self.extract_videos_from_page(page)
for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&', page):
if mobj.group(1) not in ids_in_page:
ids_in_page.append(mobj.group(1))
video_ids.extend(ids_in_page) video_ids.extend(ids_in_page)
if self._MORE_PAGES_INDICATOR not in page: # Download any subsequent channel pages using the json-based channel_ajax query
break if self._MORE_PAGES_INDICATOR in page:
while True:
pagenum = pagenum + 1 pagenum = pagenum + 1
self.report_download_page(channel_id, pagenum)
url = self._MORE_PAGES_URL % (pagenum, channel_id)
request = compat_urllib_request.Request(url)
try:
page = compat_urllib_request.urlopen(request).read().decode('utf8')
except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
return
page = json.loads(page)
ids_in_page = self.extract_videos_from_page(page['content_html'])
video_ids.extend(ids_in_page)
if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']:
break
self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids))) self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))
urls = ['http://www.youtube.com/watch?v=%s' % id for id in video_ids] urls = ['http://www.youtube.com/watch?v=%s' % id for id in video_ids]