yt-dlp/yt_dlp/networking/_requests.py

424 lines
15 KiB
Python
Raw Normal View History

import contextlib
import functools
import http.client
import logging
import re
import socket
import warnings
from ..dependencies import brotli, requests, urllib3
from ..utils import bug_reports_message, int_or_none, variadic
from ..utils.networking import normalize_url
# Abort the import of this module early when an optional dependency is unusable.
# The names imported from `..dependencies` are compared against None here, so
# presumably an unavailable module is exposed as None - verify in `..dependencies`.
if requests is None:
    raise ImportError('requests module is not installed')
if urllib3 is None:
    raise ImportError('urllib3 module is not installed')
# Split the dotted version string into a tuple so it can be compared
# component-wise; each part goes through int_or_none(..., default=0).
urllib3_version = tuple(int_or_none(x, default=0) for x in urllib3.__version__.split('.'))
if urllib3_version < (1, 26, 17):
    raise ImportError('Only urllib3 >= 1.26.17 is supported')
# 0x023100 is the `requests.__build__` value corresponding to release 2.31.0.
if requests.__build__ < 0x023100:
    raise ImportError('Only requests >= 2.31.0 is supported')
import requests.adapters
import requests.utils
import urllib3.connection
import urllib3.exceptions
import urllib3.util
from ._helper import (
InstanceStoreMixin,
add_accept_encoding_header,
create_connection,
create_socks_proxy_socket,
get_redirect_method,
make_socks_proxy_opts,
select_proxy,
)
from .common import (
Features,
RequestHandler,
Response,
register_preference,
register_rh,
)
from .exceptions import (
CertificateVerifyError,
HTTPError,
IncompleteRead,
ProxyError,
RequestError,
SSLError,
TransportError,
)
from ..socks import ProxyError as SocksProxyError
# Content encodings this handler supports; the list is passed to
# add_accept_encoding_header() in RequestsRH._send and mirrored (as a tuple)
# in RequestsRH._SUPPORTED_ENCODINGS.
SUPPORTED_ENCODINGS = [
    'gzip', 'deflate'
]
# 'br' is only offered when the brotli dependency is available.
if brotli is not None:
    SUPPORTED_ENCODINGS.append('br')
"""
Override urllib3's behavior to not convert lower-case percent-encoded characters
to upper-case during url normalization process.
RFC3986 defines that the lower or upper case percent-encoded hexadecimal characters are equivalent
and normalizers should convert them to uppercase for consistency [1].
However, some sites may have an incorrect implementation where they provide
a percent-encoded url that is then compared case-sensitively.[2]
While this is a very rare case, since urllib does not do this normalization step, it
is best to avoid it in requests too for compatibility reasons.
1: https://tools.ietf.org/html/rfc3986#section-2.1
2: https://github.com/streamlink/streamlink/pull/4003
"""
class Urllib3PercentREOverride:
    """Stand-in for a compiled pattern whose ``subn`` leaves the input untouched.

    The wrapped pattern is still executed so that the substitution count is
    accurate, but the original string is handed back instead of the rewritten
    one. Every other attribute is served by the wrapped pattern.
    """

    def __init__(self, r: re.Pattern):
        self.re = r

    def __getattr__(self, item):
        # Only invoked for names not found on the wrapper itself:
        # pass through all other attribute calls to the original re
        return getattr(self.re, item)

    def subn(self, repl, string, *args, **kwargs):
        """Return ``(string, count)``: the unmodified input and the number of matches replaced."""
        _, substitution_count = self.re.subn(repl, string, *args, **kwargs)
        return string, substitution_count
# urllib3 >= 1.25.8 uses subn:
# https://github.com/urllib3/urllib3/commit/a2697e7c6b275f05879b60f593c5854a816489f0
import urllib3.util.url  # noqa: E305
# Replace whichever module-level pattern this urllib3 exposes with the wrapper,
# so its subn() reports matches without altering the URL text.
if hasattr(urllib3.util.url, 'PERCENT_RE'):
    urllib3.util.url.PERCENT_RE = Urllib3PercentREOverride(urllib3.util.url.PERCENT_RE)
elif hasattr(urllib3.util.url, '_PERCENT_RE'):  # urllib3 >= 2.0.0
    urllib3.util.url._PERCENT_RE = Urllib3PercentREOverride(urllib3.util.url._PERCENT_RE)
else:
    # Neither attribute exists: nothing is patched and only a warning is emitted.
    warnings.warn('Failed to patch PERCENT_RE in urllib3 (does the attribute exist?)' + bug_reports_message())
"""
Workaround for issue in urllib3.util.ssl_: ssl_wrap_socket does not pass
server_hostname to SSLContext.wrap_socket if server_hostname is an IP,
however this is an issue because we set check_hostname to True in our SSLContext.
Monkey-patching IS_SECURETRANSPORT forces ssl_wrap_socket to pass server_hostname regardless.
This has been fixed in urllib3 2.0+.
See: https://github.com/urllib3/urllib3/issues/517
"""
if urllib3_version < (2, 0, 0):
    # Best-effort patch: any exception raised while setting the flags is
    # deliberately suppressed so that importing this module still succeeds.
    with contextlib.suppress(Exception):
        urllib3.util.IS_SECURETRANSPORT = urllib3.util.ssl_.IS_SECURETRANSPORT = True
# Requests will not automatically handle no_proxy by default
# due to buggy no_proxy handling with proxy dict [1].
# 1. https://github.com/psf/requests/issues/5000
# Rebind the `select_proxy` name inside `requests.adapters` to our own helper.
requests.adapters.select_proxy = select_proxy
class RequestsResponseAdapter(Response):
    """Present a ``requests`` response through the ``Response`` interface.

    The raw urllib3 stream (``res.raw``) is used as the file object, and the
    original ``requests`` response is kept on ``_requests_response``.
    """

    def __init__(self, res: requests.models.Response):
        super().__init__(
            fp=res.raw,
            headers=res.headers,
            url=res.url,
            status=res.status_code,
            reason=res.reason,
        )
        self._requests_response = res

    def read(self, amt: int = None):
        """Read up to ``amt`` decoded bytes from the underlying urllib3 response.

        Raises:
            SSLError: urllib3 reported an SSL error while reading.
            IncompleteRead: the failure wraps an ``http.client.IncompleteRead``.
            TransportError: any other urllib3 protocol/HTTP error.
        """
        try:
            # Interact with urllib3 response directly.
            return self.fp.read(amt, decode_content=True)

        # See urllib3.response.HTTPResponse.read() for exceptions raised on read
        except urllib3.exceptions.SSLError as e:
            raise SSLError(cause=e) from e

        except urllib3.exceptions.ProtocolError as e:
            # IncompleteRead is always contained within ProtocolError
            # See urllib3.response.HTTPResponse._error_catcher()
            candidates = (e.__context__, e.__cause__, *variadic(e.args))
            for candidate in candidates:
                if not isinstance(candidate, http.client.IncompleteRead):
                    continue
                # `urllib3.exceptions.IncompleteRead` is subclass of `http.client.IncompleteRead`
                # but uses an `int` for its `partial` property.
                received = candidate.partial
                if not isinstance(received, int):
                    received = len(received)
                raise IncompleteRead(partial=received, expected=candidate.expected) from e
            raise TransportError(cause=e) from e

        except urllib3.exceptions.HTTPError as e:
            # catch-all for any other urllib3 response exceptions
            raise TransportError(cause=e) from e
class RequestsHTTPAdapter(requests.adapters.HTTPAdapter):
    """HTTPAdapter that forwards our SSL context and source address to its pool managers."""

    def __init__(self, ssl_context=None, proxy_ssl_context=None, source_address=None, **kwargs):
        # Extra keyword arguments handed to every pool/proxy manager we create.
        # These must exist before super().__init__() runs.
        pool_manager_args = {}
        if ssl_context:
            pool_manager_args['ssl_context'] = ssl_context
        if source_address:
            pool_manager_args['source_address'] = (source_address, 0)
        self._pm_args = pool_manager_args
        # Fall back to the regular context when no dedicated proxy context is given
        self._proxy_ssl_context = proxy_ssl_context or ssl_context
        super().__init__(**kwargs)

    def init_poolmanager(self, *args, **kwargs):
        """Create the pool manager with our extra arguments appended."""
        return super().init_poolmanager(*args, **kwargs, **self._pm_args)

    def proxy_manager_for(self, proxy, **proxy_kwargs):
        """Create the proxy manager; non-SOCKS proxies also receive the proxy SSL context."""
        is_socks = proxy.lower().startswith('socks')
        if self._proxy_ssl_context and not is_socks:
            extra_kwargs = {'proxy_ssl_context': self._proxy_ssl_context}
        else:
            extra_kwargs = {}
        return super().proxy_manager_for(proxy, **proxy_kwargs, **self._pm_args, **extra_kwargs)

    # Skip `requests` internal verification; we use our own SSLContext
    # requests 2.31.0+
    def cert_verify(*args, **kwargs):
        """Intentionally do nothing."""
        return None

    # requests 2.31.0-2.32.1
    def _get_connection(self, request, *_, proxies=None, **__):
        return self.get_connection(request.url, proxies)

    # requests 2.32.2+: Reimplementation without `_urllib3_request_context`
    def get_connection_with_tls_context(self, request, verify, proxies=None, cert=None):
        """Return a connection pool for the request URL, going through a proxy manager if one is selected."""
        url = urllib3.util.parse_url(request.url).url
        proxy = select_proxy(url, proxies)
        manager = self.proxy_manager_for(proxy) if proxy else self.poolmanager
        return manager.connection_from_url(url)
class RequestsSession(requests.sessions.Session):
    """
    Ensure unified redirect method handling with our urllib redirect handler.
    """

    def rebuild_method(self, prepared_request, response):
        """Set the method (and normalized URL) to use for the redirected request."""
        original_method = prepared_request.method
        redirect_method = get_redirect_method(original_method, response.status_code)

        # HACK: requests removes headers/body on redirect unless code was a 307/308.
        method_is_kept = redirect_method == original_method
        if method_is_kept:
            # Stash the real code; rebuild_auth() puts it back afterwards
            response._real_status_code = response.status_code
            response.status_code = 308

        prepared_request.method = redirect_method

        # Requests fails to resolve dot segments on absolute redirect locations
        # See: https://github.com/yt-dlp/yt-dlp/issues/9020
        normalized_url = normalize_url(prepared_request.url)
        prepared_request.url = normalized_url

    def rebuild_auth(self, prepared_request, response):
        """Restore any status code masked by rebuild_method(), then defer to requests."""
        # HACK: undo status code change from rebuild_method, if applicable.
        # rebuild_auth runs after requests would remove headers/body based on status code
        status_was_masked = hasattr(response, '_real_status_code')
        if status_was_masked:
            response.status_code = response._real_status_code
            del response._real_status_code
        return super().rebuild_auth(prepared_request, response)
class Urllib3LoggingFilter(logging.Filter):
    """Filter out urllib3's per-request summary record, keeping everything else."""

    def filter(self, record):
        # Ignore HTTP request messages since HTTPConnection prints those
        return record.msg != '%s://%s:%s "%s %s %s" %s %s'
class Urllib3LoggingHandler(logging.Handler):
    """Redirect urllib3 logs to our logger"""

    def __init__(self, logger, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Target object; emit() calls its `error` and `stdout` methods.
        self._logger = logger

    def emit(self, record):
        """Format ``record`` and send it to ``error`` (level >= ERROR) or ``stdout`` (otherwise)."""
        try:
            formatted = self.format(record)
            is_error = record.levelno >= logging.ERROR
            sink = self._logger.error if is_error else self._logger.stdout
            sink(formatted)
        except Exception:
            # Same convention as the stdlib handlers: report via handleError()
            self.handleError(record)
@register_rh
class RequestsRH(RequestHandler, InstanceStoreMixin):

    """Requests RequestHandler
    https://github.com/psf/requests
    """
    _SUPPORTED_URL_SCHEMES = ('http', 'https')
    _SUPPORTED_ENCODINGS = tuple(SUPPORTED_ENCODINGS)
    _SUPPORTED_PROXY_SCHEMES = ('http', 'https', 'socks4', 'socks4a', 'socks5', 'socks5h')
    _SUPPORTED_FEATURES = (Features.NO_PROXY, Features.ALL_PROXY)
    RH_NAME = 'requests'

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

        # Forward urllib3 debug messages to our logger
        urllib3_logger = logging.getLogger('urllib3')
        self.__logging_handler = handler = Urllib3LoggingHandler(logger=self._logger)
        handler.setFormatter(logging.Formatter('requests: %(message)s'))
        handler.addFilter(Urllib3LoggingFilter())
        urllib3_logger.addHandler(handler)

        if self.verbose:
            # Setting this globally is not ideal, but is easier than hacking with urllib3.
            # It could technically be problematic for scripts embedding yt-dlp.
            # However, it is unlikely debug traffic is used in that context in a way this will cause problems.
            urllib3.connection.HTTPConnection.debuglevel = 1
            urllib3_logger.setLevel(logging.DEBUG)
        else:
            # TODO: Use a logger filter to suppress pool reuse warning instead
            urllib3_logger.setLevel(logging.ERROR)

        # this is expected if we are using --no-check-certificate
        urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

    def close(self):
        """Drop cached sessions and detach our handler from the urllib3 logger."""
        self._clear_instances()
        # Remove the logging handler that contains a reference to our logger
        # See: https://github.com/yt-dlp/yt-dlp/issues/8922
        urllib3_logger = logging.getLogger('urllib3')
        urllib3_logger.removeHandler(self.__logging_handler)

    def _check_extensions(self, extensions):
        super()._check_extensions(extensions)
        # Remove the extensions this handler handles itself
        for handled in ('cookiejar', 'timeout'):
            extensions.pop(handled, None)

    def _create_instance(self, cookiejar):
        """Build a session bound to ``cookiejar`` that uses our adapter for both schemes."""
        session = RequestsSession()
        adapter = RequestsHTTPAdapter(
            ssl_context=self._make_sslcontext(),
            source_address=self.source_address,
            max_retries=urllib3.util.retry.Retry(False),
        )
        # Start from a clean slate: only our adapter and our default header
        session.adapters.clear()
        session.headers = requests.models.CaseInsensitiveDict({'Connection': 'keep-alive'})
        for prefix in ('https://', 'http://'):
            session.mount(prefix, adapter)
        session.cookies = cookiejar
        session.trust_env = False  # no need, we already load proxies from env
        return session

    def _send(self, request):
        """Perform ``request`` and return a ``RequestsResponseAdapter``.

        requests/urllib3 exceptions are translated into this package's
        exception types; a status outside 200-299 raises ``HTTPError``.
        """
        merged_headers = self._merge_headers(request.headers)
        add_accept_encoding_header(merged_headers, SUPPORTED_ENCODINGS)
        session = self._get_instance(cookiejar=self._get_cookiejar(request))

        redirect_limit_hit = False
        try:
            raw_response = session.request(
                method=request.method,
                url=request.url,
                data=request.data,
                headers=merged_headers,
                timeout=self._calculate_timeout(request),
                proxies=self._get_proxies(request),
                allow_redirects=True,
                stream=True,
            )

        except requests.exceptions.TooManyRedirects as e:
            # Keep the response carried by the exception; it is reported as an HTTPError below
            redirect_limit_hit = True
            raw_response = e.response

        except requests.exceptions.SSLError as e:
            if 'CERTIFICATE_VERIFY_FAILED' in str(e):
                raise CertificateVerifyError(cause=e) from e
            raise SSLError(cause=e) from e

        except requests.exceptions.ProxyError as e:
            raise ProxyError(cause=e) from e

        except (requests.exceptions.ConnectionError, requests.exceptions.Timeout) as e:
            raise TransportError(cause=e) from e

        except urllib3.exceptions.HTTPError as e:
            # Catch any urllib3 exceptions that may leak through
            raise TransportError(cause=e) from e

        except requests.exceptions.RequestException as e:
            # Miscellaneous Requests exceptions. May not necessary be network related e.g. InvalidURL
            raise RequestError(cause=e) from e

        response = RequestsResponseAdapter(raw_response)
        is_success = 200 <= response.status < 300
        if not is_success:
            raise HTTPError(response, redirect_loop=redirect_limit_hit)
        return response
@register_preference(RequestsRH)
def requests_preference(rh, request):
    """Return the fixed preference value (100) registered for ``RequestsRH``.

    Both arguments are ignored. How this value is weighed against other
    handlers is decided by the code consuming registered preferences, which
    is not visible here.
    """
    return 100
# Use our socks proxy implementation with requests to avoid an extra dependency.
class SocksHTTPConnection(urllib3.connection.HTTPConnection):
    """HTTPConnection whose socket is opened through a SOCKS proxy."""

    def __init__(self, _socks_options, *args, **kwargs):  # must use _socks_options to pass PoolKey checks
        self._proxy_args = _socks_options
        super().__init__(*args, **kwargs)

    def _new_conn(self):
        """Connect to the proxy address and set up the tunnel to ``(self.host, self.port)``.

        Socket-level failures are re-raised as the matching urllib3 exceptions.
        """
        proxy_opts = self._proxy_args
        # Socket factory that is given the final destination and the proxy options
        open_proxied_socket = functools.partial(
            create_socks_proxy_socket, (self.host, self.port), proxy_opts)
        try:
            return create_connection(
                address=(proxy_opts['addr'], proxy_opts['port']),
                timeout=self.timeout,
                source_address=self.source_address,
                _create_socket_func=open_proxied_socket)
        except (socket.timeout, TimeoutError) as e:
            raise urllib3.exceptions.ConnectTimeoutError(
                self, f'Connection to {self.host} timed out. (connect timeout={self.timeout})') from e
        except SocksProxyError as e:
            raise urllib3.exceptions.ProxyError(str(e), e) from e
        except OSError as e:
            raise urllib3.exceptions.NewConnectionError(
                self, f'Failed to establish a new connection: {e}') from e
class SocksHTTPSConnection(SocksHTTPConnection, urllib3.connection.HTTPSConnection):
    """HTTPS counterpart of SocksHTTPConnection.

    Adds no members of its own; it only combines SocksHTTPConnection with
    urllib3's HTTPSConnection through multiple inheritance.
    """
    pass
class SocksHTTPConnectionPool(urllib3.HTTPConnectionPool):
    """HTTP connection pool whose connection class is SocksHTTPConnection."""
    ConnectionCls = SocksHTTPConnection
class SocksHTTPSConnectionPool(urllib3.HTTPSConnectionPool):
    """HTTPS connection pool whose connection class is SocksHTTPSConnection."""
    ConnectionCls = SocksHTTPSConnection
class SocksProxyManager(urllib3.PoolManager):
    """PoolManager whose HTTP and HTTPS pools use the SOCKS-aware pool classes."""

    def __init__(self, socks_proxy, username=None, password=None, num_pools=10, headers=None, **connection_pool_kw):
        # NOTE(review): `username` and `password` are accepted but never read in
        # this method - presumably credentials are derived from `socks_proxy`
        # by make_socks_proxy_opts(); confirm against that helper.
        pool_kwargs = dict(connection_pool_kw, _socks_options=make_socks_proxy_opts(socks_proxy))
        super().__init__(num_pools, headers, **pool_kwargs)
        # Route both schemes through the SOCKS-aware pool classes
        self.pool_classes_by_scheme = dict(
            http=SocksHTTPConnectionPool,
            https=SocksHTTPSConnectionPool,
        )
# Rebind the `SOCKSProxyManager` name inside `requests.adapters` to our own
# manager, so it is the class found under that name from now on.
requests.adapters.SOCKSProxyManager = SocksProxyManager