Improve geo bypass mechanism

* Rename options so that their prefix matches --geo-verification-proxy
* Introduce _GEO_COUNTRIES for extractors
* Implement faking IP right away for sites with known geo restriction
This commit is contained in:
Sergey M․ 2017-02-19 03:53:23 +07:00 committed by Sergey M
parent 0a840f584c
commit 4248dad92b
13 changed files with 72 additions and 31 deletions

View file

@ -323,10 +323,15 @@ class InfoExtractor(object):
_real_extract() methods and define a _VALID_URL regexp. _real_extract() methods and define a _VALID_URL regexp.
Probably, they should also be added to the list of extractors. Probably, they should also be added to the list of extractors.
_BYPASS_GEO attribute may be set to False in order to disable _GEO_BYPASS attribute may be set to False in order to disable
geo restriction bypass mechanisms for a particular extractor. geo restriction bypass mechanisms for a particular extractor.
Though it won't disable explicit geo restriction bypass based on Though it won't disable explicit geo restriction bypass based on
country code provided with geo_bypass_country. country code provided with geo_bypass_country. (experimental)
_GEO_COUNTRIES attribute may contain a list of presumably geo unrestricted
countries for this extractor. One of these countries will be used by the
geo restriction bypass mechanism right away in order to bypass
geo restriction, provided the mechanism is not disabled. (experimental)
Finally, the _WORKING attribute should be set to False for broken IEs Finally, the _WORKING attribute should be set to False for broken IEs
in order to warn the users and skip the tests. in order to warn the users and skip the tests.
@ -335,7 +340,8 @@ class InfoExtractor(object):
_ready = False _ready = False
_downloader = None _downloader = None
_x_forwarded_for_ip = None _x_forwarded_for_ip = None
_BYPASS_GEO = True _GEO_BYPASS = True
_GEO_COUNTRIES = None
_WORKING = True _WORKING = True
def __init__(self, downloader=None): def __init__(self, downloader=None):
@ -370,14 +376,28 @@ def working(cls):
def initialize(self): def initialize(self):
"""Initializes an instance (authentication, etc).""" """Initializes an instance (authentication, etc)."""
if not self._x_forwarded_for_ip: self.__initialize_geo_bypass()
country_code = self._downloader.params.get('geo_bypass_country', None)
if country_code:
self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code)
if not self._ready: if not self._ready:
self._real_initialize() self._real_initialize()
self._ready = True self._ready = True
def __initialize_geo_bypass(self):
if not self._x_forwarded_for_ip:
country_code = self._downloader.params.get('geo_bypass_country', None)
# If there is no explicit country for geo bypass specified and
# the extractor is known to be geo restricted let's fake IP
# as X-Forwarded-For right away.
if (not country_code and
self._GEO_BYPASS and
self._downloader.params.get('geo_bypass', True) and
self._GEO_COUNTRIES):
country_code = random.choice(self._GEO_COUNTRIES)
if country_code:
self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code)
if self._downloader.params.get('verbose', False):
self._downloader.to_stdout(
'[debug] Using fake %s IP as X-Forwarded-For.' % self._x_forwarded_for_ip)
def extract(self, url): def extract(self, url):
"""Extracts URL information and returns it in list of dicts.""" """Extracts URL information and returns it in list of dicts."""
try: try:
@ -389,16 +409,8 @@ def extract(self, url):
ie_result['__x_forwarded_for_ip'] = self._x_forwarded_for_ip ie_result['__x_forwarded_for_ip'] = self._x_forwarded_for_ip
return ie_result return ie_result
except GeoRestrictedError as e: except GeoRestrictedError as e:
if (not self._downloader.params.get('geo_bypass_country', None) and if self.__maybe_fake_ip_and_retry(e.countries):
self._BYPASS_GEO and continue
self._downloader.params.get('geo_bypass', True) and
not self._x_forwarded_for_ip and
e.countries):
self._x_forwarded_for_ip = GeoUtils.random_ipv4(random.choice(e.countries))
if self._x_forwarded_for_ip:
self.report_warning(
'Video is geo restricted. Retrying extraction with fake %s IP as X-Forwarded-For.' % self._x_forwarded_for_ip)
continue
raise raise
except ExtractorError: except ExtractorError:
raise raise
@ -407,6 +419,19 @@ def extract(self, url):
except (KeyError, StopIteration) as e: except (KeyError, StopIteration) as e:
raise ExtractorError('An extractor error has occurred.', cause=e) raise ExtractorError('An extractor error has occurred.', cause=e)
def __maybe_fake_ip_and_retry(self, countries):
if (not self._downloader.params.get('geo_bypass_country', None) and
self._GEO_BYPASS and
self._downloader.params.get('geo_bypass', True) and
not self._x_forwarded_for_ip and
countries):
self._x_forwarded_for_ip = GeoUtils.random_ipv4(random.choice(countries))
if self._x_forwarded_for_ip:
self.report_warning(
'Video is geo restricted. Retrying extraction with fake %s IP as X-Forwarded-For.' % self._x_forwarded_for_ip)
return True
return False
def set_downloader(self, downloader): def set_downloader(self, downloader):
"""Sets the downloader for this IE.""" """Sets the downloader for this IE."""
self._downloader = downloader self._downloader = downloader

View file

@ -20,6 +20,7 @@
class DramaFeverBaseIE(AMPIE): class DramaFeverBaseIE(AMPIE):
_LOGIN_URL = 'https://www.dramafever.com/accounts/login/' _LOGIN_URL = 'https://www.dramafever.com/accounts/login/'
_NETRC_MACHINE = 'dramafever' _NETRC_MACHINE = 'dramafever'
_GEO_COUNTRIES = ['US', 'CA']
_CONSUMER_SECRET = 'DA59dtVXYLxajktV' _CONSUMER_SECRET = 'DA59dtVXYLxajktV'
@ -118,7 +119,7 @@ def _real_extract(self, url):
if isinstance(e.cause, compat_HTTPError): if isinstance(e.cause, compat_HTTPError):
self.raise_geo_restricted( self.raise_geo_restricted(
msg='Currently unavailable in your country', msg='Currently unavailable in your country',
countries=['US', 'CA']) countries=self._GEO_COUNTRIES)
raise raise
series_id, episode_number = video_id.split('.') series_id, episode_number = video_id.split('.')

View file

@ -37,6 +37,7 @@ class GoIE(AdobePassIE):
} }
} }
_VALID_URL = r'https?://(?:(?P<sub_domain>%s)\.)?go\.com/(?:[^/]+/)*(?:vdka(?P<id>\w+)|season-\d+/\d+-(?P<display_id>[^/?#]+))' % '|'.join(_SITE_INFO.keys()) _VALID_URL = r'https?://(?:(?P<sub_domain>%s)\.)?go\.com/(?:[^/]+/)*(?:vdka(?P<id>\w+)|season-\d+/\d+-(?P<display_id>[^/?#]+))' % '|'.join(_SITE_INFO.keys())
_GEO_COUNTRIES = ['US']
_TESTS = [{ _TESTS = [{
'url': 'http://abc.go.com/shows/castle/video/most-recent/vdka0_g86w5onx', 'url': 'http://abc.go.com/shows/castle/video/most-recent/vdka0_g86w5onx',
'info_dict': { 'info_dict': {
@ -104,7 +105,7 @@ def _real_extract(self, url):
for error in errors: for error in errors:
if error.get('code') == 1002: if error.get('code') == 1002:
self.raise_geo_restricted( self.raise_geo_restricted(
error['message'], countries=['US']) error['message'], countries=self._GEO_COUNTRIES)
error_message = ', '.join([error['message'] for error in errors]) error_message = ', '.join([error['message'] for error in errors])
raise ExtractorError('%s said: %s' % (self.IE_NAME, error_message), expected=True) raise ExtractorError('%s said: %s' % (self.IE_NAME, error_message), expected=True)
asset_url += '?' + entitlement['uplynkData']['sessionKey'] asset_url += '?' + entitlement['uplynkData']['sessionKey']

View file

@ -24,6 +24,7 @@
class ITVIE(InfoExtractor): class ITVIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?itv\.com/hub/[^/]+/(?P<id>[0-9a-zA-Z]+)' _VALID_URL = r'https?://(?:www\.)?itv\.com/hub/[^/]+/(?P<id>[0-9a-zA-Z]+)'
_GEO_COUNTRIES = ['GB']
_TEST = { _TEST = {
'url': 'http://www.itv.com/hub/mr-bean-animated-series/2a2936a0053', 'url': 'http://www.itv.com/hub/mr-bean-animated-series/2a2936a0053',
'info_dict': { 'info_dict': {
@ -101,7 +102,8 @@ def _add_sub_element(element, name):
fault_code = xpath_text(resp_env, './/faultcode') fault_code = xpath_text(resp_env, './/faultcode')
fault_string = xpath_text(resp_env, './/faultstring') fault_string = xpath_text(resp_env, './/faultstring')
if fault_code == 'InvalidGeoRegion': if fault_code == 'InvalidGeoRegion':
self.raise_geo_restricted(msg=fault_string, countries=['GB']) self.raise_geo_restricted(
msg=fault_string, countries=self._GEO_COUNTRIES)
raise ExtractorError('%s said: %s' % (self.IE_NAME, fault_string)) raise ExtractorError('%s said: %s' % (self.IE_NAME, fault_string))
title = xpath_text(playlist, 'EpisodeTitle', fatal=True) title = xpath_text(playlist, 'EpisodeTitle', fatal=True)
video_element = xpath_element(playlist, 'VideoEntries/Video', fatal=True) video_element = xpath_element(playlist, 'VideoEntries/Video', fatal=True)

View file

@ -14,6 +14,7 @@
class NRKBaseIE(InfoExtractor): class NRKBaseIE(InfoExtractor):
_GEO_COUNTRIES = ['NO']
def _real_extract(self, url): def _real_extract(self, url):
video_id = self._match_id(url) video_id = self._match_id(url)
@ -93,7 +94,8 @@ def video_id_and_title(idx):
# Can be ProgramIsGeoBlocked or ChannelIsGeoBlocked* # Can be ProgramIsGeoBlocked or ChannelIsGeoBlocked*
if 'IsGeoBlocked' in message_type: if 'IsGeoBlocked' in message_type:
self.raise_geo_restricted( self.raise_geo_restricted(
msg=MESSAGES.get('ProgramIsGeoBlocked'), countries=['NO']) msg=MESSAGES.get('ProgramIsGeoBlocked'),
countries=self._GEO_COUNTRIES)
raise ExtractorError( raise ExtractorError(
'%s said: %s' % (self.IE_NAME, MESSAGES.get( '%s said: %s' % (self.IE_NAME, MESSAGES.get(
message_type, message_type)), message_type, message_type)),

View file

@ -10,6 +10,7 @@
class OnDemandKoreaIE(InfoExtractor): class OnDemandKoreaIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?ondemandkorea\.com/(?P<id>[^/]+)\.html' _VALID_URL = r'https?://(?:www\.)?ondemandkorea\.com/(?P<id>[^/]+)\.html'
_GEO_COUNTRIES = ['US', 'CA']
_TEST = { _TEST = {
'url': 'http://www.ondemandkorea.com/ask-us-anything-e43.html', 'url': 'http://www.ondemandkorea.com/ask-us-anything-e43.html',
'info_dict': { 'info_dict': {
@ -36,7 +37,7 @@ def _real_extract(self, url):
if 'msg_block_01.png' in webpage: if 'msg_block_01.png' in webpage:
self.raise_geo_restricted( self.raise_geo_restricted(
msg='This content is not available in your region', msg='This content is not available in your region',
countries=['US', 'CA']) countries=self._GEO_COUNTRIES)
if 'This video is only available to ODK PLUS members.' in webpage: if 'This video is only available to ODK PLUS members.' in webpage:
raise ExtractorError( raise ExtractorError(

View file

@ -193,6 +193,8 @@ class PBSIE(InfoExtractor):
) )
''' % '|'.join(list(zip(*_STATIONS))[0]) ''' % '|'.join(list(zip(*_STATIONS))[0])
_GEO_COUNTRIES = ['US']
_TESTS = [ _TESTS = [
{ {
'url': 'http://www.pbs.org/tpt/constitution-usa-peter-sagal/watch/a-more-perfect-union/', 'url': 'http://www.pbs.org/tpt/constitution-usa-peter-sagal/watch/a-more-perfect-union/',
@ -492,7 +494,8 @@ def extract_redirect_urls(info):
message = self._ERRORS.get( message = self._ERRORS.get(
redirect_info['http_code'], redirect_info['message']) redirect_info['http_code'], redirect_info['message'])
if redirect_info['http_code'] == 403: if redirect_info['http_code'] == 403:
self.raise_geo_restricted(msg=message, countries=['US']) self.raise_geo_restricted(
msg=message, countries=self._GEO_COUNTRIES)
raise ExtractorError( raise ExtractorError(
'%s said: %s' % (self.IE_NAME, message), expected=True) '%s said: %s' % (self.IE_NAME, message), expected=True)

View file

@ -14,7 +14,8 @@
class SRGSSRIE(InfoExtractor): class SRGSSRIE(InfoExtractor):
_VALID_URL = r'(?:https?://tp\.srgssr\.ch/p(?:/[^/]+)+\?urn=urn|srgssr):(?P<bu>srf|rts|rsi|rtr|swi):(?:[^:]+:)?(?P<type>video|audio):(?P<id>[0-9a-f\-]{36}|\d+)' _VALID_URL = r'(?:https?://tp\.srgssr\.ch/p(?:/[^/]+)+\?urn=urn|srgssr):(?P<bu>srf|rts|rsi|rtr|swi):(?:[^:]+:)?(?P<type>video|audio):(?P<id>[0-9a-f\-]{36}|\d+)'
_BYPASS_GEO = False _GEO_BYPASS = False
_GEO_COUNTRIES = ['CH']
_ERRORS = { _ERRORS = {
'AGERATING12': 'To protect children under the age of 12, this video is only available between 8 p.m. and 6 a.m.', 'AGERATING12': 'To protect children under the age of 12, this video is only available between 8 p.m. and 6 a.m.',
@ -43,7 +44,8 @@ def get_media_data(self, bu, media_type, media_id):
if media_data.get('block') and media_data['block'] in self._ERRORS: if media_data.get('block') and media_data['block'] in self._ERRORS:
message = self._ERRORS[media_data['block']] message = self._ERRORS[media_data['block']]
if media_data['block'] == 'GEOBLOCK': if media_data['block'] == 'GEOBLOCK':
self.raise_geo_restricted(msg=message, countries=['CH']) self.raise_geo_restricted(
msg=message, countries=self._GEO_COUNTRIES)
raise ExtractorError( raise ExtractorError(
'%s said: %s' % (self.IE_NAME, message), expected=True) '%s said: %s' % (self.IE_NAME, message), expected=True)

View file

@ -13,6 +13,7 @@
class SVTBaseIE(InfoExtractor): class SVTBaseIE(InfoExtractor):
_GEO_COUNTRIES = ['SE']
def _extract_video(self, video_info, video_id): def _extract_video(self, video_info, video_id):
formats = [] formats = []
for vr in video_info['videoReferences']: for vr in video_info['videoReferences']:
@ -39,7 +40,8 @@ def _extract_video(self, video_info, video_id):
}) })
if not formats and video_info.get('rights', {}).get('geoBlockedSweden'): if not formats and video_info.get('rights', {}).get('geoBlockedSweden'):
self.raise_geo_restricted( self.raise_geo_restricted(
'This video is only available in Sweden', countries=['SE']) 'This video is only available in Sweden',
countries=self._GEO_COUNTRIES)
self._sort_formats(formats) self._sort_formats(formats)
subtitles = {} subtitles = {}

View file

@ -20,6 +20,7 @@ class Vbox7IE(InfoExtractor):
) )
(?P<id>[\da-fA-F]+) (?P<id>[\da-fA-F]+)
''' '''
_GEO_COUNTRIES = ['BG']
_TESTS = [{ _TESTS = [{
'url': 'http://vbox7.com/play:0946fff23c', 'url': 'http://vbox7.com/play:0946fff23c',
'md5': 'a60f9ab3a3a2f013ef9a967d5f7be5bf', 'md5': 'a60f9ab3a3a2f013ef9a967d5f7be5bf',
@ -78,7 +79,7 @@ def _real_extract(self, url):
video_url = video['src'] video_url = video['src']
if '/na.mp4' in video_url: if '/na.mp4' in video_url:
self.raise_geo_restricted(countries=['BG']) self.raise_geo_restricted(countries=self._GEO_COUNTRIES)
uploader = video.get('uploader') uploader = video.get('uploader')

View file

@ -14,7 +14,7 @@
class VGTVIE(XstreamIE): class VGTVIE(XstreamIE):
IE_DESC = 'VGTV, BTTV, FTV, Aftenposten and Aftonbladet' IE_DESC = 'VGTV, BTTV, FTV, Aftenposten and Aftonbladet'
_BYPASS_GEO = False _GEO_BYPASS = False
_HOST_TO_APPNAME = { _HOST_TO_APPNAME = {
'vgtv.no': 'vgtv', 'vgtv.no': 'vgtv',
@ -218,7 +218,8 @@ def _real_extract(self, url):
properties = try_get( properties = try_get(
data, lambda x: x['streamConfiguration']['properties'], list) data, lambda x: x['streamConfiguration']['properties'], list)
if properties and 'geoblocked' in properties: if properties and 'geoblocked' in properties:
raise self.raise_geo_restricted(countries=['NO']) raise self.raise_geo_restricted(
countries=[host.rpartition('.')[-1].partition('/')[0].upper()])
self._sort_formats(info['formats']) self._sort_formats(info['formats'])

View file

@ -27,7 +27,7 @@ class VikiBaseIE(InfoExtractor):
_APP_VERSION = '2.2.5.1428709186' _APP_VERSION = '2.2.5.1428709186'
_APP_SECRET = '-$iJ}@p7!G@SyU/je1bEyWg}upLu-6V6-Lg9VD(]siH,r.,m-r|ulZ,U4LC/SeR)' _APP_SECRET = '-$iJ}@p7!G@SyU/je1bEyWg}upLu-6V6-Lg9VD(]siH,r.,m-r|ulZ,U4LC/SeR)'
_BYPASS_GEO = False _GEO_BYPASS = False
_NETRC_MACHINE = 'viki' _NETRC_MACHINE = 'viki'
_token = None _token = None

View file

@ -3291,7 +3291,7 @@ def random_ipv4(cls, code):
addr_min = compat_struct_unpack('!L', socket.inet_aton(addr))[0] addr_min = compat_struct_unpack('!L', socket.inet_aton(addr))[0]
addr_max = addr_min | (0xffffffff >> int(preflen)) addr_max = addr_min | (0xffffffff >> int(preflen))
return compat_str(socket.inet_ntoa( return compat_str(socket.inet_ntoa(
compat_struct_pack('!I', random.randint(addr_min, addr_max)))) compat_struct_pack('!L', random.randint(addr_min, addr_max))))
class PerRequestProxyHandler(compat_urllib_request.ProxyHandler): class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):