[mixcloud] Added support for user uploads, playlists, favorites and listens.

Fixes #3750 and #5272
This commit is contained in:
Philip Huppert 2015-10-05 00:41:20 +02:00
parent 466a614537
commit c96eca426b
2 changed files with 203 additions and 3 deletions

View file

@ -411,7 +411,11 @@
from .miomio import MioMioIE from .miomio import MioMioIE
from .mit import TechTVMITIE, MITIE, OCWMITIE from .mit import TechTVMITIE, MITIE, OCWMITIE
from .mitele import MiTeleIE from .mitele import MiTeleIE
from .mixcloud import MixcloudIE from .mixcloud import (
MixcloudIE,
MixcloudUserIE,
MixcloudPlaylistIE
)
from .mlb import MLBIE from .mlb import MLBIE
from .mnet import MnetIE from .mnet import MnetIE
from .mpora import MporaIE from .mpora import MporaIE

View file

@ -3,18 +3,22 @@
import re import re
from .common import InfoExtractor from .common import InfoExtractor
from ..compat import compat_urllib_parse_unquote from ..compat import (
compat_urllib_parse_unquote,
compat_urllib_request
)
from ..utils import ( from ..utils import (
ExtractorError, ExtractorError,
HEADRequest, HEADRequest,
NO_DEFAULT, NO_DEFAULT,
parse_count, parse_count,
str_to_int, str_to_int,
clean_html
) )
class MixcloudIE(InfoExtractor): class MixcloudIE(InfoExtractor):
_VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([^/]+)/([^/]+)' _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([^/]+)/(?!stream|uploads|favorites|listens|playlists)([^/]+)'
IE_NAME = 'mixcloud' IE_NAME = 'mixcloud'
_TESTS = [{ _TESTS = [{
@ -115,3 +119,195 @@ def _real_extract(self, url):
'view_count': view_count, 'view_count': view_count,
'like_count': like_count, 'like_count': like_count,
} }
class MixcloudUserIE(InfoExtractor):
    """
    Information extractor for Mixcloud users.

    It can retrieve a list of a user's uploads, favorites or listens.
    A bare profile URL (no list type given) is treated as "uploads".
    """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/(?P<user>[^/]+)/(?P<type>uploads|favorites|listens)?/?$'
    IE_NAME = 'mixcloud:user'

    _TESTS = [{
        'url': 'http://www.mixcloud.com/dholbach/',
        'info_dict': {
            'id': 'dholbach/uploads',
            'title': 'Daniel Holbach (uploads)',
            'description': 'md5:327af72d1efeb404a8216c27240d1370',
        },
        'playlist_mincount': 11
    }, {
        'url': 'http://www.mixcloud.com/dholbach/uploads/',
        'info_dict': {
            'id': 'dholbach/uploads',
            'title': 'Daniel Holbach (uploads)',
            'description': 'md5:327af72d1efeb404a8216c27240d1370',
        },
        'playlist_mincount': 11
    }, {
        'url': 'http://www.mixcloud.com/dholbach/favorites/',
        'info_dict': {
            'id': 'dholbach/favorites',
            'title': 'Daniel Holbach (favorites)',
            'description': 'md5:327af72d1efeb404a8216c27240d1370',
        },
        'playlist_mincount': 244
    }, {
        'url': 'http://www.mixcloud.com/dholbach/listens/',
        'info_dict': {
            'id': 'dholbach/listens',
            'title': 'Daniel Holbach (listens)',
            'description': 'md5:327af72d1efeb404a8216c27240d1370',
        },
        'playlist_mincount': 846
    }]

    def _fetch_tracks(self, base_url, video_id, dl_note=None, dl_errnote=None):
        """
        Collect the absolute URLs of all tracks in a paginated list.

        The list is fetched fragment by fragment through faked AJAX
        requests against base_url until a fragment no longer carries a
        next-page marker.

        base_url   -- URL of the list page, without a query string
        video_id   -- identifier passed to the download helper
        dl_note    -- optional progress message; the page number is
                      appended to it when it is given
        dl_errnote -- optional error message passed to the download helper

        Returns a list of absolute track URLs in page order.
        """
        track_urls = []
        current_page = 1
        while True:
            # fake an AJAX request to retrieve a list fragment
            page_url = base_url + "?page=%d&list=main&_ajax=1" % current_page
            # No explicit method: a request without data is a GET anyway,
            # and Python 2's urllib2.Request rejects a `method` keyword
            # (assuming compat_urllib_request maps to urllib2 there).
            req = compat_urllib_request.Request(
                page_url, headers={"X-Requested-With": "XMLHttpRequest"})

            # dl_note is optional; only decorate it when it was supplied
            page_note = None
            if dl_note is not None:
                page_note = "%s (page %d)" % (dl_note, current_page)
            resp = self._download_webpage(
                req, video_id, note=page_note, errnote=dl_errnote)

            # extract the relative track URLs from the fragment, clean them
            # up and turn them into absolute URLs
            track_urls.extend(
                "https://www.mixcloud.com" + clean_html(path)
                for path in re.findall(
                    r'm-play-button m-url="(?P<url>[^"]+)"', resp))

            # advance to next fragment, if any
            if " m-next-page-url=" not in resp:
                break
            current_page += 1

        return track_urls

    def _handle_track_urls(self, urls):
        """Wrap each track URL in a url_result that targets the Mixcloud extractor."""
        # A real list (not a lazy map object) so the entries can be counted
        # and iterated more than once on Python 3.
        return [self.url_result(track_url, "Mixcloud") for track_url in urls]

    def _get_user_description(self, page_content):
        """
        Extract the description text from a page.

        The lookup is non-fatal and is given an empty-string default for
        the case where the description markup is not found.
        """
        return self._html_search_regex(
            r'<div class="description-text">.*?<p>(?P<description>.*?)</p></div></div></div>',
            page_content,
            "user description",
            group="description",
            fatal=False,
            default="")

    def _get_username(self, page_content):
        """Return the page's Open Graph title, used as the display name."""
        return self._og_search_title(page_content)

    def _real_extract(self, url):
        """Build a playlist result for a user's uploads, favorites or listens."""
        mobj = re.match(self._VALID_URL, url)
        user_id = mobj.group("user")
        list_type = mobj.group("type")

        # if only a profile URL was supplied, default to download all uploads
        if list_type is None:
            list_type = "uploads"

        video_id = "%s/%s" % (user_id, list_type)

        # download the user's profile to retrieve some metadata
        profile = self._download_webpage(
            "https://www.mixcloud.com/%s/" % user_id,
            video_id,
            note="Downloading user profile",
            errnote="Unable to download user profile")

        username = self._get_username(profile)
        description = self._get_user_description(profile)

        # retrieve all page fragments of uploads, favorites or listens
        track_urls = self._fetch_tracks(
            "https://www.mixcloud.com/%s/%s/" % (user_id, list_type),
            video_id,
            dl_note="Downloading list of %s" % list_type,
            dl_errnote="Unable to download list of %s" % list_type)

        # let MixcloudIE handle each track URL
        entries = self._handle_track_urls(track_urls)

        return {
            '_type': 'playlist',
            'entries': entries,
            'title': "%s (%s)" % (username, list_type),
            'id': video_id,
            "description": description
        }
class MixcloudPlaylistIE(MixcloudUserIE):
    """
    Information extractor for Mixcloud playlists.

    Reuses the track-list fetching and description helpers inherited from
    MixcloudUserIE; only the URL pattern, the title lookup and the
    extraction entry point differ.
    """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/(?P<user>[^/]+)/playlists/(?P<playlist>[^/]+)/?$'
    IE_NAME = 'mixcloud:playlist'

    _TESTS = [{
        'url': 'https://www.mixcloud.com/RedBullThre3style/playlists/tokyo-finalists-2015/',
        'info_dict': {
            'id': 'RedBullThre3style/playlists/tokyo-finalists-2015',
            'title': 'National Champions 2015',
            'description': 'md5:6ff5fb01ac76a31abc9b3939c16243a3',
        },
        'playlist_mincount': 16
    }, {
        'url': 'https://www.mixcloud.com/maxvibes/playlists/jazzcat-on-ness-radio/',
        'info_dict': {
            'id': 'maxvibes/playlists/jazzcat-on-ness-radio',
            'title': 'Jazzcat on Ness Radio',
            'description': 'md5:c2c51a1f1b8bb5442f2ca67c3dc4af27',
        },
        'playlist_mincount': 23
    }]

    def _get_playlist_title(self, page_content):
        """Extract the playlist title from the playlist page (fatal lookup)."""
        return self._html_search_regex(
            r'<span class="main-list-title list-playlist-title ">(?P<title>.*?)</span>',
            page_content,
            "playlist title",
            group="title",
            fatal=True
        )

    def _real_extract(self, url):
        """Build a playlist result for a single Mixcloud playlist URL."""
        mobj = re.match(self._VALID_URL, url)
        user_id = mobj.group("user")
        playlist_id = mobj.group("playlist")
        video_id = "%s/playlists/%s" % (user_id, playlist_id)

        # _VALID_URL accepts URLs without a scheme, so rebuild a canonical
        # absolute URL instead of requesting the raw input; the same URL
        # then serves both the metadata page and the track list.
        playlist_url = "https://www.mixcloud.com/%s/playlists/%s/" % (
            user_id, playlist_id)

        # download the playlist page to retrieve some metadata
        profile = self._download_webpage(
            playlist_url,
            video_id,
            note="Downloading playlist page",
            errnote="Unable to download playlist page")

        description = self._get_user_description(profile)
        playlist_title = self._get_playlist_title(profile)

        # retrieve all page fragments of playlist
        track_urls = self._fetch_tracks(
            playlist_url,
            video_id,
            dl_note="Downloading tracklist of %s" % playlist_title,
            dl_errnote="Unable to download tracklist of %s" % playlist_title)

        # let MixcloudIE handle each track
        entries = self._handle_track_urls(track_urls)

        return {
            '_type': 'playlist',
            'entries': entries,
            'title': playlist_title,
            'id': video_id,
            "description": description
        }