mirror of
https://github.com/yt-dlp/yt-dlp.git
synced 2024-11-27 10:31:29 +00:00
Merge pull request #176 from blackjack4494/mtv_updated_extractor_logic
[Mtv] updated extractor logic & more
This commit is contained in:
commit
cfd7f14bb3
|
@ -3,6 +3,8 @@
|
|||
from .mtv import MTVServicesInfoExtractor
|
||||
from ..utils import unified_strdate
|
||||
|
||||
# TODO Remove - Reason: Outdated Site
|
||||
|
||||
|
||||
class BetIE(MTVServicesInfoExtractor):
|
||||
_VALID_URL = r'https?://(?:www\.)?bet\.com/(?:[^/]+/)+(?P<id>.+?)\.html'
|
||||
|
|
|
@ -2,6 +2,8 @@
|
|||
|
||||
from .mtv import MTVIE
|
||||
|
||||
# TODO Remove - Reason: Outdated Site
|
||||
|
||||
|
||||
class CMTIE(MTVIE):
|
||||
IE_NAME = 'cmt.com'
|
||||
|
@ -39,7 +41,7 @@ class CMTIE(MTVIE):
|
|||
'only_matching': True,
|
||||
}]
|
||||
|
||||
def _extract_mgid(self, webpage):
|
||||
def _extract_mgid(self, webpage, url):
|
||||
mgid = self._search_regex(
|
||||
r'MTVN\.VIDEO\.contentUri\s*=\s*([\'"])(?P<mgid>.+?)\1',
|
||||
webpage, 'mgid', group='mgid', default=None)
|
||||
|
@ -50,5 +52,5 @@ def _extract_mgid(self, webpage):
|
|||
def _real_extract(self, url):
|
||||
video_id = self._match_id(url)
|
||||
webpage = self._download_webpage(url, video_id)
|
||||
mgid = self._extract_mgid(webpage)
|
||||
mgid = self._extract_mgid(webpage, url)
|
||||
return self.url_result('http://media.mtvnservices.com/embed/%s' % mgid)
|
||||
|
|
|
@ -48,7 +48,7 @@ class ComedyCentralFullEpisodesIE(MTVServicesInfoExtractor):
|
|||
def _real_extract(self, url):
|
||||
playlist_id = self._match_id(url)
|
||||
webpage = self._download_webpage(url, playlist_id)
|
||||
mgid = self._extract_triforce_mgid(webpage, data_zone='t2_lc_promo1')
|
||||
mgid = self._extract_mgid(webpage, url, data_zone='t2_lc_promo1')
|
||||
videos_info = self._get_videos_info(mgid)
|
||||
return videos_info
|
||||
|
||||
|
|
|
@ -7,6 +7,7 @@
|
|||
from ..compat import (
|
||||
compat_str,
|
||||
compat_xpath,
|
||||
compat_urlparse,
|
||||
)
|
||||
from ..utils import (
|
||||
ExtractorError,
|
||||
|
@ -22,6 +23,7 @@
|
|||
unescapeHTML,
|
||||
update_url_query,
|
||||
url_basename,
|
||||
get_domain,
|
||||
xpath_text,
|
||||
)
|
||||
|
||||
|
@ -253,7 +255,42 @@ def _extract_triforce_mgid(self, webpage, data_zone=None, video_id=None):
|
|||
|
||||
return try_get(feed, lambda x: x['result']['data']['id'], compat_str)
|
||||
|
||||
def _extract_mgid(self, webpage):
|
||||
def _extract_new_triforce_mgid(self, webpage, url='', video_id=None):
    """Derive an mgid for the page at *url* from the triforce v8 manifest feed.

    The manifest is requested from the feed endpoint on the page's own
    domain. If the first manifest is of type 'redirect', the manifest for
    its 'newLocation' is downloaded once and used instead.

    webpage is not read by this method.
    url is the page URL; when it is empty nothing is downloaded.
    video_id is passed on to _download_json.

    Returns the mgid string, or None when url is empty or no item id can
    be found in the manifest. Raises ExtractorError (expected=True) when
    no domain can be derived from url.
    """
    if not url:
        return None

    domain = get_domain(url)
    if domain is None:
        raise ExtractorError(
            '[%s] could not get domain' % self.IE_NAME,
            expected=True)

    def manifest_url(page_url):
        # The feed is queried with the http:// form of the page URL,
        # fully percent-encoded so it travels as a single query value.
        # NOTE(review): confirm compat_urlparse exposes quote() on every
        # supported Python version.
        plain_url = page_url.replace('https://', 'http://')
        return 'https://%s/feeds/triforce/manifest/v8?url=%s' % (
            domain, compat_urlparse.quote(plain_url, safe=''))

    manifest = self._download_json(manifest_url(url), video_id, fatal=False)

    # Use try_get rather than chained .get() calls so that a response
    # without a 'manifest' object (or a failed download) falls through to
    # the "no id" path below instead of raising AttributeError.
    if try_get(manifest, lambda x: x['manifest']['type']) == 'redirect':
        new_loc = try_get(
            manifest, lambda x: x['manifest']['newLocation'], compat_str)
        if new_loc:
            self.to_screen('Found a redirect. Downloading manifest from new location')
            manifest = self._download_json(
                manifest_url(new_loc), video_id, fatal=False)

    item_id = try_get(
        manifest, lambda x: x['manifest']['reporting']['itemId'], compat_str)
    if not item_id:
        self.to_screen('Found no id!')
        return None

    # The type segment ('episode') can be anything; 'content' is often
    # used as well.
    return 'mgid:arc:episode:%s:%s' % (domain, item_id)
|
||||
|
||||
def _extract_mgid(self, webpage, url, data_zone=None):
|
||||
try:
|
||||
# the url can be http://media.mtvnservices.com/fb/{mgid}.swf
|
||||
# or http://media.mtvnservices.com/{mgid}
|
||||
|
@ -276,14 +313,17 @@ def _extract_mgid(self, webpage):
|
|||
r'embed/(mgid:.+?)["\'&?/]', sm4_embed, 'mgid', default=None)
|
||||
|
||||
if not mgid:
|
||||
mgid = self._extract_triforce_mgid(webpage)
|
||||
mgid = self._extract_new_triforce_mgid(webpage, url)
|
||||
|
||||
if not mgid:
|
||||
mgid = self._extract_triforce_mgid(webpage, data_zone)
|
||||
|
||||
return mgid
|
||||
|
||||
def _real_extract(self, url):
|
||||
title = url_basename(url)
|
||||
webpage = self._download_webpage(url, title)
|
||||
mgid = self._extract_mgid(webpage)
|
||||
mgid = self._extract_mgid(webpage, url)
|
||||
videos_info = self._get_videos_info(mgid)
|
||||
return videos_info
|
||||
|
||||
|
|
|
@ -245,5 +245,5 @@ class NickRuIE(MTVServicesInfoExtractor):
|
|||
def _real_extract(self, url):
|
||||
video_id = self._match_id(url)
|
||||
webpage = self._download_webpage(url, video_id)
|
||||
mgid = self._extract_mgid(webpage)
|
||||
mgid = self._extract_mgid(webpage, url)
|
||||
return self.url_result('http://media.mtvnservices.com/embed/%s' % mgid)
|
||||
|
|
|
@ -20,8 +20,18 @@ class BellatorIE(MTVServicesInfoExtractor):
|
|||
_FEED_URL = 'http://www.bellator.com/feeds/mrss/'
|
||||
_GEO_COUNTRIES = ['US']
|
||||
|
||||
def _extract_mgid(self, webpage):
|
||||
return self._extract_triforce_mgid(webpage)
|
||||
def _extract_mgid(self, webpage, url):
    """Return the mgid for this page, or a falsy value if none is found.

    The triforce feed referenced by the webpage is tried first; the
    manifest-based lookup for url is used only when that yields nothing.
    """
    return (
        self._extract_triforce_mgid(webpage)
        or self._extract_new_triforce_mgid(webpage, url))
|
||||
|
||||
# TODO Remove - Reason: Outdated Site
|
||||
|
||||
|
||||
class ParamountNetworkIE(MTVServicesInfoExtractor):
|
||||
|
@ -43,7 +53,7 @@ class ParamountNetworkIE(MTVServicesInfoExtractor):
|
|||
_FEED_URL = 'http://www.paramountnetwork.com/feeds/mrss/'
|
||||
_GEO_COUNTRIES = ['US']
|
||||
|
||||
def _extract_mgid(self, webpage):
|
||||
def _extract_mgid(self, webpage, url):
|
||||
root_data = self._parse_json(self._search_regex(
|
||||
r'window\.__DATA__\s*=\s*({.+})',
|
||||
webpage, 'data'), None)
|
||||
|
|
|
@ -3,6 +3,8 @@
|
|||
|
||||
from .mtv import MTVServicesInfoExtractor
|
||||
|
||||
# TODO Remove - Reason: Outdated Site
|
||||
|
||||
|
||||
class VH1IE(MTVServicesInfoExtractor):
|
||||
IE_NAME = 'vh1.com'
|
||||
|
|
|
@ -1984,6 +1984,7 @@ def get_elements_by_attribute(attribute, value, html, escape_value=True):
|
|||
|
||||
class HTMLAttributeParser(compat_HTMLParser):
|
||||
"""Trivial HTML parser to gather the attributes for a single element"""
|
||||
|
||||
def __init__(self):
|
||||
self.attrs = {}
|
||||
compat_HTMLParser.__init__(self)
|
||||
|
@ -2378,6 +2379,7 @@ class GeoRestrictedError(ExtractorError):
|
|||
This exception may be thrown when a video is not available from your
|
||||
geographic location due to geographic restrictions imposed by a website.
|
||||
"""
|
||||
|
||||
def __init__(self, msg, countries=None):
|
||||
super(GeoRestrictedError, self).__init__(msg, expected=True)
|
||||
self.msg = msg
|
||||
|
@ -3558,6 +3560,11 @@ def remove_quotes(s):
|
|||
return s
|
||||
|
||||
|
||||
def get_domain(url):
    """Return the host part of url without a leading 'www.', or None.

    An optional http:// or https:// scheme and an optional 'www.' prefix
    are skipped. The host must contain at least one dot, otherwise None
    is returned. The host ends at the first '/', '?' or '#', so a query
    string or fragment that directly follows the host is not included.
    """
    mobj = re.match(
        r'(?:https?://)?(?:www\.)?(?P<domain>[^\n/?#]+\.[^\n/?#]+)', url)
    return mobj.group('domain') if mobj else None
|
||||
|
||||
|
||||
def url_basename(url):
    """Return the last segment of the path component of url.

    Leading and trailing slashes of the path are ignored; an empty path
    gives an empty string.
    """
    trimmed_path = compat_urlparse.urlparse(url).path.strip('/')
    return trimmed_path.rsplit('/', 1)[-1]
|
||||
|
|
Loading…
Reference in a new issue