Merge remote-tracking branch 'upstream/master'

This commit is contained in:
bergoid 2023-09-21 22:58:17 +02:00
commit d8d31be98e
68 changed files with 2986 additions and 1077 deletions

View File

@ -13,13 +13,16 @@ jobs:
matrix:
os: [ubuntu-latest]
# CPython 3.11 is in quick-test
python-version: ['3.8', '3.9', '3.10', pypy-3.7, pypy-3.8]
python-version: ['3.8', '3.9', '3.10', '3.12-dev', pypy-3.7, pypy-3.8, pypy-3.10]
run-tests-ext: [sh]
include:
# At least one test for each of CPython and PyPy must run on Windows
- os: windows-latest
python-version: '3.7'
run-tests-ext: bat
- os: windows-latest
python-version: '3.12-dev'
run-tests-ext: bat
- os: windows-latest
python-version: pypy-3.9
run-tests-ext: bat

View File

@ -76,7 +76,7 @@ yt-dlp is a [youtube-dl](https://github.com/ytdl-org/youtube-dl) fork based on t
# NEW FEATURES
* Forked from [**yt-dlc@f9401f2**](https://github.com/blackjack4494/yt-dlc/commit/f9401f2a91987068139c5f757b12fc711d4c0cee) and merged with [**youtube-dl@42f2d4**](https://github.com/ytdl-org/youtube-dl/commit/07af47960f3bb262ead02490ce65c8c45c01741e) ([exceptions](https://github.com/yt-dlp/yt-dlp/issues/21))
* Forked from [**yt-dlc@f9401f2**](https://github.com/blackjack4494/yt-dlc/commit/f9401f2a91987068139c5f757b12fc711d4c0cee) and merged with [**youtube-dl@66ab08**](https://github.com/ytdl-org/youtube-dl/commit/66ab0814c4baa2dc79c2dd5287bc0ad61a37c5b9) ([exceptions](https://github.com/yt-dlp/yt-dlp/issues/21))
* **[SponsorBlock Integration](#sponsorblock-options)**: You can mark/remove sponsor sections in YouTube videos by utilizing the [SponsorBlock](https://sponsor.ajay.app) API
@ -1854,7 +1854,7 @@ The following extractors use this feature:
* `tab`: Which tab to download - one of `new`, `top`, `videos`, `podcasts`, `streams`, `stacks`
#### twitter
* `legacy_api`: Force usage of the legacy Twitter API instead of the GraphQL API for tweet extraction. Has no effect if login cookies are passed
* `api`: Select one of `graphql` (default), `legacy` or `syndication` as the API for tweet extraction. Has no effect if logged in
#### stacommu, wrestleuniverse
* `device_id`: UUID value assigned by the website and used to enforce device limits for paid livestream content. Can be found in browser local storage

View File

@ -68,6 +68,25 @@
{
"action": "change",
"when": "b03fa7834579a01cc5fba48c0e73488a16683d48",
"short": "[ie/twitter] Revert 92315c03774cfabb3a921884326beb4b981f786b"
"short": "[ie/twitter] Revert 92315c03774cfabb3a921884326beb4b981f786b",
"authors": ["pukkandan"]
},
{
"action": "change",
"when": "fcd6a76adc49d5cd8783985c7ce35384b72e545f",
"short": "[test] Add tests for socks proxies (#7908)",
"authors": ["coletdjnz"]
},
{
"action": "change",
"when": "4bf912282a34b58b6b35d8f7e6be535770c89c76",
"short": "[rh:urllib] Remove dot segments during URL normalization (#7662)",
"authors": ["coletdjnz"]
},
{
"action": "change",
"when": "59e92b1f1833440bb2190f847eb735cf0f90bc85",
"short": "[rh:urllib] Simplify gzip decoding (#7611)",
"authors": ["Grub4K"]
}
]

View File

@ -31,35 +31,27 @@ class CommitGroup(enum.Enum):
EXTRACTOR = 'Extractor'
DOWNLOADER = 'Downloader'
POSTPROCESSOR = 'Postprocessor'
NETWORKING = 'Networking'
MISC = 'Misc.'
@classmethod
@property
def ignorable_prefixes(cls):
return ('core', 'downloader', 'extractor', 'misc', 'postprocessor', 'upstream')
@classmethod
@lru_cache
def commit_lookup(cls):
def subgroup_lookup(cls):
return {
name: group
for group, names in {
cls.PRIORITY: {'priority'},
cls.CORE: {
'aes',
'cache',
'compat_utils',
'compat',
'cookies',
'core',
'dependencies',
'formats',
'jsinterp',
'networking',
'outtmpl',
'plugins',
'update',
'upstream',
'utils',
},
cls.MISC: {
@ -67,23 +59,40 @@ class CommitGroup(enum.Enum):
'cleanup',
'devscripts',
'docs',
'misc',
'test',
},
cls.EXTRACTOR: {'extractor', 'ie'},
cls.DOWNLOADER: {'downloader', 'fd'},
cls.POSTPROCESSOR: {'postprocessor', 'pp'},
cls.NETWORKING: {
'rh',
},
}.items()
for name in names
}
@classmethod
def get(cls, value):
result = cls.commit_lookup().get(value)
if result:
logger.debug(f'Mapped {value!r} => {result.name}')
@lru_cache
def group_lookup(cls):
result = {
'fd': cls.DOWNLOADER,
'ie': cls.EXTRACTOR,
'pp': cls.POSTPROCESSOR,
'upstream': cls.CORE,
}
result.update({item.name.lower(): item for item in iter(cls)})
return result
@classmethod
def get(cls, value: str) -> tuple[CommitGroup | None, str | None]:
group, _, subgroup = (group.strip().lower() for group in value.partition('/'))
result = cls.group_lookup().get(group)
if not result:
if subgroup:
return None, value
subgroup = group
result = cls.subgroup_lookup().get(subgroup)
return result, subgroup or None
@dataclass
class Commit:
@ -198,19 +207,23 @@ class Changelog:
for commit_infos in cleanup_misc_items.values():
sorted_items.append(CommitInfo(
'cleanup', ('Miscellaneous',), ', '.join(
self._format_message_link(None, info.commit.hash).strip()
self._format_message_link(None, info.commit.hash)
for info in sorted(commit_infos, key=lambda item: item.commit.hash or '')),
[], Commit(None, '', commit_infos[0].commit.authors), []))
return sorted_items
def format_single_change(self, info):
message = self._format_message_link(info.message, info.commit.hash)
def format_single_change(self, info: CommitInfo):
message, sep, rest = info.message.partition('\n')
if '[' not in message:
# If the message doesn't already contain markdown links, try to add a link to the commit
message = self._format_message_link(message, info.commit.hash)
if info.issues:
message = message.replace('\n', f' ({self._format_issues(info.issues)})\n', 1)
message = f'{message} ({self._format_issues(info.issues)})'
if info.commit.authors:
message = message.replace('\n', f' by {self._format_authors(info.commit.authors)}\n', 1)
message = f'{message} by {self._format_authors(info.commit.authors)}'
if info.fixes:
fix_message = ', '.join(f'{self._format_message_link(None, fix.hash)}' for fix in info.fixes)
@ -219,16 +232,14 @@ class Changelog:
if authors != info.commit.authors:
fix_message = f'{fix_message} by {self._format_authors(authors)}'
message = message.replace('\n', f' (With fixes in {fix_message})\n', 1)
message = f'{message} (With fixes in {fix_message})'
return message[:-1]
return message if not sep else f'{message}{sep}{rest}'
def _format_message_link(self, message, hash):
assert message or hash, 'Improperly defined commit message or override'
message = message if message else hash[:HASH_LENGTH]
if not hash:
return f'{message}\n'
return f'[{message}\n'.replace('\n', f']({self.repo_url}/commit/{hash})\n', 1)
return f'[{message}]({self.repo_url}/commit/{hash})' if hash else message
def _format_issues(self, issues):
return ', '.join(f'[#{issue}]({self.repo_url}/issues/{issue})' for issue in issues)
@ -318,7 +329,7 @@ class CommitRange:
for commitish, revert_commit in reverts.items():
reverted = commits.pop(commitish, None)
if reverted:
logger.debug(f'{commit} fully reverted {reverted}')
logger.debug(f'{commitish} fully reverted {reverted}')
else:
commits[revert_commit.hash] = revert_commit
@ -337,7 +348,7 @@ class CommitRange:
for override in overrides:
when = override.get('when')
if when and when not in self and when != self._start:
logger.debug(f'Ignored {when!r}, not in commits {self._start!r}')
logger.debug(f'Ignored {when!r} override')
continue
override_hash = override.get('hash') or when
@ -365,7 +376,7 @@ class CommitRange:
for commit in self:
upstream_re = self.UPSTREAM_MERGE_RE.search(commit.short)
if upstream_re:
commit.short = f'[core/upstream] Merged with youtube-dl {upstream_re.group(1)}'
commit.short = f'[upstream] Merged with youtube-dl {upstream_re.group(1)}'
match = self.MESSAGE_RE.fullmatch(commit.short)
if not match:
@ -410,25 +421,20 @@ class CommitRange:
if not prefix:
return CommitGroup.CORE, None, ()
prefix, _, details = prefix.partition('/')
prefix = prefix.strip()
details = details.strip()
prefix, *sub_details = prefix.split(':')
group = CommitGroup.get(prefix.lower())
if group is CommitGroup.PRIORITY:
prefix, _, details = details.partition('/')
group, details = CommitGroup.get(prefix)
if group is CommitGroup.PRIORITY and details:
details = details.partition('/')[2].strip()
if not details and prefix and prefix not in CommitGroup.ignorable_prefixes:
logger.debug(f'Replaced details with {prefix!r}')
details = prefix or None
if details and '/' in details:
logger.error(f'Prefix is overnested, using first part: {prefix}')
details = details.partition('/')[0].strip()
if details == 'common':
details = None
if details:
details, *sub_details = details.split(':')
else:
sub_details = []
elif group is CommitGroup.NETWORKING and details == 'rh':
details = 'Request Handler'
return group, details, sub_details

View File

@ -10,14 +10,14 @@ sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
import argparse
import contextlib
import sys
from datetime import datetime
from datetime import datetime, timezone
from devscripts.utils import read_version, run_process, write_file
def get_new_version(version, revision):
if not version:
version = datetime.utcnow().strftime('%Y.%m.%d')
version = datetime.now(timezone.utc).strftime('%Y.%m.%d')
if revision:
assert revision.isdigit(), 'Revision must be a number'

View File

@ -281,17 +281,13 @@ class TestSocks4Proxy:
rh, proxies={'all': f'socks4://user:@{server_address}'})
assert response['version'] == 4
@pytest.mark.parametrize('handler,ctx', [
pytest.param('Urllib', 'http', marks=pytest.mark.xfail(
reason='socks4a implementation currently broken when destination is not a domain name'))
], indirect=True)
@pytest.mark.parametrize('handler,ctx', [('Urllib', 'http')], indirect=True)
def test_socks4a_ipv4_target(self, handler, ctx):
with ctx.socks_server(Socks4ProxyHandler) as server_address:
with handler(proxies={'all': f'socks4a://{server_address}'}) as rh:
response = ctx.socks_info_request(rh, target_domain='127.0.0.1')
assert response['version'] == 4
assert response['ipv4_address'] == '127.0.0.1'
assert response['domain_address'] is None
assert (response['ipv4_address'] == '127.0.0.1') != (response['domain_address'] == '127.0.0.1')
@pytest.mark.parametrize('handler,ctx', [('Urllib', 'http')], indirect=True)
def test_socks4a_domain_target(self, handler, ctx):
@ -302,10 +298,7 @@ class TestSocks4Proxy:
assert response['ipv4_address'] is None
assert response['domain_address'] == 'localhost'
@pytest.mark.parametrize('handler,ctx', [
pytest.param('Urllib', 'http', marks=pytest.mark.xfail(
reason='source_address is not yet supported for socks4 proxies'))
], indirect=True)
@pytest.mark.parametrize('handler,ctx', [('Urllib', 'http')], indirect=True)
def test_ipv4_client_source_address(self, handler, ctx):
with ctx.socks_server(Socks4ProxyHandler) as server_address:
source_address = f'127.0.0.{random.randint(5, 255)}'
@ -327,10 +320,7 @@ class TestSocks4Proxy:
with pytest.raises(ProxyError):
ctx.socks_info_request(rh)
@pytest.mark.parametrize('handler,ctx', [
pytest.param('Urllib', 'http', marks=pytest.mark.xfail(
reason='IPv6 socks4 proxies are not yet supported'))
], indirect=True)
@pytest.mark.parametrize('handler,ctx', [('Urllib', 'http')], indirect=True)
def test_ipv6_socks4_proxy(self, handler, ctx):
with ctx.socks_server(Socks4ProxyHandler, bind_ip='::1') as server_address:
with handler(proxies={'all': f'socks4://{server_address}'}) as rh:
@ -342,7 +332,7 @@ class TestSocks4Proxy:
@pytest.mark.parametrize('handler,ctx', [('Urllib', 'http')], indirect=True)
def test_timeout(self, handler, ctx):
with ctx.socks_server(Socks4ProxyHandler, sleep=2) as server_address:
with handler(proxies={'all': f'socks4://{server_address}'}, timeout=1) as rh:
with handler(proxies={'all': f'socks4://{server_address}'}, timeout=0.5) as rh:
with pytest.raises(TransportError):
ctx.socks_info_request(rh)
@ -383,7 +373,7 @@ class TestSocks5Proxy:
with ctx.socks_server(Socks5ProxyHandler) as server_address:
with handler(proxies={'all': f'socks5://{server_address}'}) as rh:
response = ctx.socks_info_request(rh, target_domain='localhost')
assert response['ipv4_address'] == '127.0.0.1'
assert (response['ipv4_address'] == '127.0.0.1') != (response['ipv6_address'] == '::1')
assert response['version'] == 5
@pytest.mark.parametrize('handler,ctx', [('Urllib', 'http')], indirect=True)
@ -404,22 +394,15 @@ class TestSocks5Proxy:
assert response['domain_address'] is None
assert response['version'] == 5
@pytest.mark.parametrize('handler,ctx', [
pytest.param('Urllib', 'http', marks=pytest.mark.xfail(
reason='IPv6 destination addresses are not yet supported'))
], indirect=True)
@pytest.mark.parametrize('handler,ctx', [('Urllib', 'http')], indirect=True)
def test_socks5_ipv6_destination(self, handler, ctx):
with ctx.socks_server(Socks5ProxyHandler) as server_address:
with handler(proxies={'all': f'socks5://{server_address}'}) as rh:
response = ctx.socks_info_request(rh, target_domain='[::1]')
assert response['ipv6_address'] == '::1'
assert response['port'] == 80
assert response['version'] == 5
@pytest.mark.parametrize('handler,ctx', [
pytest.param('Urllib', 'http', marks=pytest.mark.xfail(
reason='IPv6 socks5 proxies are not yet supported'))
], indirect=True)
@pytest.mark.parametrize('handler,ctx', [('Urllib', 'http')], indirect=True)
def test_ipv6_socks5_proxy(self, handler, ctx):
with ctx.socks_server(Socks5ProxyHandler, bind_ip='::1') as server_address:
with handler(proxies={'all': f'socks5://{server_address}'}) as rh:
@ -430,10 +413,7 @@ class TestSocks5Proxy:
# XXX: is there any feasible way of testing IPv6 source addresses?
# Same would go for non-proxy source_address test...
@pytest.mark.parametrize('handler,ctx', [
pytest.param('Urllib', 'http', marks=pytest.mark.xfail(
reason='source_address is not yet supported for socks5 proxies'))
], indirect=True)
@pytest.mark.parametrize('handler,ctx', [('Urllib', 'http')], indirect=True)
def test_ipv4_client_source_address(self, handler, ctx):
with ctx.socks_server(Socks5ProxyHandler) as server_address:
source_address = f'127.0.0.{random.randint(5, 255)}'

View File

@ -2591,7 +2591,7 @@ class YoutubeDL:
# Working around out-of-range timestamp values (e.g. negative ones on Windows,
# see http://bugs.python.org/issue1646728)
with contextlib.suppress(ValueError, OverflowError, OSError):
upload_date = datetime.datetime.utcfromtimestamp(info_dict[ts_key])
upload_date = datetime.datetime.fromtimestamp(info_dict[ts_key], datetime.timezone.utc)
info_dict[date_key] = upload_date.strftime('%Y%m%d')
live_keys = ('is_live', 'was_live')

View File

@ -15,7 +15,7 @@ def get_package_info(module):
name=getattr(module, '_yt_dlp__identifier', module.__name__),
version=str(next(filter(None, (
getattr(module, attr, None)
for attr in ('__version__', 'version_string', 'version')
for attr in ('_yt_dlp__version', '__version__', 'version_string', 'version')
)), None)))

View File

@ -43,6 +43,8 @@ except Exception as _err:
try:
import sqlite3
# We need to get the underlying `sqlite` version, see https://github.com/yt-dlp/yt-dlp/issues/8152
sqlite3._yt_dlp__version = sqlite3.sqlite_version
except ImportError:
# although sqlite3 is part of the standard library, it is possible to compile python without
# sqlite support. See: https://github.com/yt-dlp/yt-dlp/issues/544

View File

@ -122,7 +122,6 @@ from .applepodcasts import ApplePodcastsIE
from .archiveorg import (
ArchiveOrgIE,
YoutubeWebArchiveIE,
VLiveWebArchiveIE,
)
from .arcpublishing import ArcPublishingIE
from .arkena import ArkenaIE
@ -165,6 +164,7 @@ from .awaan import (
AWAANLiveIE,
AWAANSeasonIE,
)
from .axs import AxsIE
from .azmedien import AZMedienIE
from .baidu import BaiduVideoIE
from .banbye import (
@ -223,7 +223,11 @@ from .bilibili import (
BiliBiliPlayerIE,
BilibiliSpaceVideoIE,
BilibiliSpaceAudioIE,
BilibiliSpacePlaylistIE,
BilibiliCollectionListIE,
BilibiliSeriesListIE,
BilibiliFavoritesListIE,
BilibiliWatchlaterIE,
BilibiliPlaylistIE,
BiliIntlIE,
BiliIntlSeriesIE,
BiliLiveIE,
@ -292,9 +296,11 @@ from .cammodels import CamModelsIE
from .camsoda import CamsodaIE
from .camtasia import CamtasiaEmbedIE
from .camwithher import CamWithHerIE
from .canal1 import Canal1IE
from .canalalpha import CanalAlphaIE
from .canalplus import CanalplusIE
from .canalc2 import Canalc2IE
from .caracoltv import CaracolTvPlayIE
from .carambatv import (
CarambaTVIE,
CarambaTVPageIE,
@ -561,6 +567,7 @@ from .epicon import (
EpiconIE,
EpiconSeriesIE,
)
from .eplus import EplusIbIE
from .epoch import EpochIE
from .eporner import EpornerIE
from .eroprofile import (
@ -1501,6 +1508,7 @@ from .polskieradio import (
from .popcorntimes import PopcorntimesIE
from .popcorntv import PopcornTVIE
from .porn91 import Porn91IE
from .pornbox import PornboxIE
from .porncom import PornComIE
from .pornflip import PornFlipIE
from .pornhd import PornHdIE
@ -1519,7 +1527,7 @@ from .puhutv import (
PuhuTVIE,
PuhuTVSerieIE,
)
from .pr0gramm import Pr0grammStaticIE, Pr0grammIE
from .pr0gramm import Pr0grammIE
from .prankcast import PrankCastIE
from .premiershiprugby import PremiershipRugbyIE
from .presstv import PressTVIE
@ -1555,7 +1563,14 @@ from .radiocanada import (
from .radiode import RadioDeIE
from .radiojavan import RadioJavanIE
from .radiobremen import RadioBremenIE
from .radiofrance import FranceCultureIE, RadioFranceIE
from .radiofrance import (
FranceCultureIE,
RadioFranceIE,
RadioFranceLiveIE,
RadioFrancePodcastIE,
RadioFranceProfileIE,
RadioFranceProgramScheduleIE,
)
from .radiozet import RadioZetPodcastIE
from .radiokapital import (
RadioKapitalIE,
@ -1586,6 +1601,7 @@ from .rbmaradio import RBMARadioIE
from .rbgtum import (
RbgTumIE,
RbgTumCourseIE,
RbgTumNewCourseIE,
)
from .rcs import (
RCSIE,
@ -1710,7 +1726,10 @@ from .ruv import (
RuvIE,
RuvSpilaIE
)
from .s4c import S4CIE
from .s4c import (
S4CIE,
S4CSeriesIE
)
from .safari import (
SafariIE,
SafariApiIE,
@ -1791,7 +1810,10 @@ from .slideslive import SlidesLiveIE
from .slutload import SlutloadIE
from .smotrim import SmotrimIE
from .snotr import SnotrIE
from .sohu import SohuIE
from .sohu import (
SohuIE,
SohuVIE,
)
from .sonyliv import (
SonyLIVIE,
SonyLIVSeriesIE,
@ -2354,7 +2376,8 @@ from .webofstories import (
)
from .weibo import (
WeiboIE,
WeiboMobileIE
WeiboVideoIE,
WeiboUserIE,
)
from .weiqitv import WeiqiTVIE
from .weverse import (

View File

@ -12,7 +12,7 @@ import urllib.parse
import urllib.request
import urllib.response
import uuid
from ..utils.networking import clean_proxies
from .common import InfoExtractor
from ..aes import aes_ecb_decrypt
from ..utils import (
@ -35,7 +35,10 @@ def add_opener(ydl, handler): # FIXME: Create proper API in .networking
rh = ydl._request_director.handlers['Urllib']
if 'abematv-license' in rh._SUPPORTED_URL_SCHEMES:
return
opener = rh._get_instance(cookiejar=ydl.cookiejar, proxies=ydl.proxies)
headers = ydl.params['http_headers'].copy()
proxies = ydl.proxies.copy()
clean_proxies(proxies, headers)
opener = rh._get_instance(cookiejar=ydl.cookiejar, proxies=proxies)
assert isinstance(opener, urllib.request.OpenerDirector)
opener.add_handler(handler)
rh._SUPPORTED_URL_SCHEMES = (*rh._SUPPORTED_URL_SCHEMES, 'abematv-license')

View File

@ -22,8 +22,11 @@ class AmazonMiniTVBaseIE(InfoExtractor):
resp = self._download_json(
f'https://www.amazon.in/minitv/api/web/{"graphql" if data else "prs"}',
asin, note=note, headers={'Content-Type': 'application/json'},
data=json.dumps(data).encode() if data else None,
asin, note=note, headers={
'Content-Type': 'application/json',
'currentpageurl': '/',
'currentplatform': 'dWeb'
}, data=json.dumps(data).encode() if data else None,
query=None if data else {
'deviceType': 'A1WMMUXPCUJL4N',
'contentId': asin,
@ -46,7 +49,7 @@ class AmazonMiniTVIE(AmazonMiniTVBaseIE):
'ext': 'mp4',
'title': 'May I Kiss You?',
'language': 'Hindi',
'thumbnail': r're:^https?://.*\.jpg$',
'thumbnail': r're:^https?://.*\.(?:jpg|png)$',
'description': 'md5:a549bfc747973e04feb707833474e59d',
'release_timestamp': 1644710400,
'release_date': '20220213',
@ -68,7 +71,7 @@ class AmazonMiniTVIE(AmazonMiniTVBaseIE):
'ext': 'mp4',
'title': 'Jahaan',
'language': 'Hindi',
'thumbnail': r're:^https?://.*\.jpg',
'thumbnail': r're:^https?://.*\.(?:jpg|png)',
'description': 'md5:05eb765a77bf703f322f120ec6867339',
'release_timestamp': 1647475200,
'release_date': '20220317',

View File

@ -3,7 +3,6 @@ import re
import urllib.parse
from .common import InfoExtractor
from .naver import NaverBaseIE
from .youtube import YoutubeBaseInfoExtractor, YoutubeIE
from ..compat import compat_urllib_parse_unquote
from ..networking import HEADRequest
@ -947,237 +946,3 @@ class YoutubeWebArchiveIE(InfoExtractor):
if not info.get('title'):
info['title'] = video_id
return info
class VLiveWebArchiveIE(InfoExtractor):
    # Extracts VLive videos from their web.archive.org captures.
    # NOTE(review): ExtractorError, HTTPError, traverse_obj, int_or_none and url_or_none
    # are expected to be imported at module level (not visible in this chunk) - confirm.
    IE_NAME = 'web.archive:vlive'
    IE_DESC = 'web.archive.org saved vlive videos'
    _VALID_URL = r'''(?x)
            (?:https?://)?web\.archive\.org/
            (?:web/)?(?:(?P<date>[0-9]{14})?[0-9A-Za-z_*]*/)? # /web and the version index is optional
            (?:https?(?::|%3[Aa])//)?(?:
                (?:(?:www|m)\.)?vlive\.tv(?::(?:80|443))?/(?:video|embed)/(?P<id>[0-9]+) # VLive URL
            )
        '''
    _TESTS = [{
        'url': 'https://web.archive.org/web/20221221144331/http://www.vlive.tv/video/1326',
        'md5': 'cc7314812855ce56de70a06a27314983',
        'info_dict': {
            'id': '1326',
            'ext': 'mp4',
            'title': "Girl's Day's Broadcast",
            'creator': "Girl's Day",
            'view_count': int,
            'uploader_id': 'muploader_a',
            'uploader_url': None,
            'uploader': None,
            'upload_date': '20150817',
            'thumbnail': r're:^https?://.*\.(?:jpg|png)$',
            'timestamp': 1439816449,
            'like_count': int,
            'channel': 'Girl\'s Day',
            'channel_id': 'FDF27',
            'comment_count': int,
            'release_timestamp': 1439818140,
            'release_date': '20150817',
            'duration': 1014,
        },
        'params': {
            'skip_download': True,
        },
    }, {
        'url': 'https://web.archive.org/web/20221221182103/http://www.vlive.tv/video/16937',
        'info_dict': {
            'id': '16937',
            'ext': 'mp4',
            'title': '첸백시 걍방',
            'creator': 'EXO',
            'view_count': int,
            'subtitles': 'mincount:12',
            'uploader_id': 'muploader_j',
            'uploader_url': 'http://vlive.tv',
            'uploader': None,
            'upload_date': '20161112',
            'thumbnail': r're:^https?://.*\.(?:jpg|png)$',
            'timestamp': 1478923074,
            'like_count': int,
            'channel': 'EXO',
            'channel_id': 'F94BD',
            'comment_count': int,
            'release_timestamp': 1478924280,
            'release_date': '20161112',
            'duration': 906,
        },
        'params': {
            'skip_download': True,
        },
    }, {
        'url': 'https://web.archive.org/web/20221127190050/http://www.vlive.tv/video/101870',
        'info_dict': {
            'id': '101870',
            'ext': 'mp4',
            'title': '[ⓓ xV] “레벨이들 매력에 반해? 안 반해?” 움직이는 HD 포토 (레드벨벳:Red Velvet)',
            'creator': 'Dispatch',
            'view_count': int,
            'subtitles': 'mincount:6',
            'uploader_id': 'V__FRA08071',
            'uploader_url': 'http://vlive.tv',
            'uploader': None,
            'upload_date': '20181130',
            'thumbnail': r're:^https?://.*\.(?:jpg|png)$',
            'timestamp': 1543601327,
            'like_count': int,
            'channel': 'Dispatch',
            'channel_id': 'C796F3',
            'comment_count': int,
            'release_timestamp': 1543601040,
            'release_date': '20181130',
            'duration': 279,
        },
        'params': {
            'skip_download': True,
        },
    }]

    # The wayback machine has special timestamp and "mode" values:
    # timestamp:
    #   1 = the first capture
    #   2 = the last capture
    # mode:
    #   id_ = Identity - perform no alterations of the original resource, return it as it was archived.
    _WAYBACK_BASE_URL = 'https://web.archive.org/web/2id_/'

    def _download_archived_page(self, url, video_id, *, timestamp='2', **kwargs):
        # Fetch the unmodified ("id_") capture of `url`, retrying on transient errors.
        # A 404 means the page was never archived and is reported as an expected error.
        # Callers pass the optional `date` URL group straight through, so `timestamp`
        # can be None; fall back to the last capture instead of requesting "Noneid_".
        timestamp = timestamp or '2'
        for retry in self.RetryManager():
            try:
                return self._download_webpage(
                    f'https://web.archive.org/web/{timestamp}id_/{url}', video_id, **kwargs)
            except ExtractorError as e:
                if isinstance(e.cause, HTTPError) and e.cause.status == 404:
                    raise ExtractorError('Page was not archived', expected=True)
                retry.error = e
                continue

    def _download_archived_json(self, url, video_id, **kwargs):
        # Download an archived page and parse it as JSON.
        # A falsy page (e.g. a non-fatal download failure) is reported as "not archived".
        page = self._download_archived_page(url, video_id, **kwargs)
        if not page:
            raise ExtractorError('Page was not archived', expected=True)
        return self._parse_json(page, video_id)

    def _extract_formats_from_m3u8(self, m3u8_url, params, video_id):
        # Return the formats of an archived m3u8 playlist, or None when the playlist
        # or its first segment is not available from the archive.
        m3u8_doc = self._download_archived_page(
            m3u8_url, video_id, note='Downloading m3u8', query=params, fatal=False)
        if not m3u8_doc:
            return

        # M3U8 document should be changed to archive domain
        m3u8_doc = m3u8_doc.splitlines()
        url_base = m3u8_url.rsplit('/', 1)[0]
        first_segment = None
        for i, line in enumerate(m3u8_doc):
            # Only URI lines are rewritten; blank lines and tags/comments are left untouched
            if line.strip() and not line.startswith('#'):
                m3u8_doc[i] = f'{self._WAYBACK_BASE_URL}{url_base}/{line}?{urllib.parse.urlencode(params)}'
                first_segment = first_segment or m3u8_doc[i]

        if not first_segment:
            # Playlist without any segment/variant URI: nothing to probe or download
            return

        # Segments may not have been archived. See https://web.archive.org/web/20221127190050/http://www.vlive.tv/video/101870
        urlh = self._request_webpage(
            HEADRequest(first_segment), video_id, errnote=False,
            fatal=False, note='Check first segment availablity')
        if urlh:
            formats, subtitles = self._parse_m3u8_formats_and_subtitles(
                '\n'.join(m3u8_doc), ext='mp4', video_id=video_id)
            if subtitles:
                self._report_ignoring_subs('m3u8')
            return formats

    # Closely follows the logic of the ArchiveTeam grab script
    # See: https://github.com/ArchiveTeam/vlive-grab/blob/master/vlive.lua
    def _real_extract(self, url):
        video_id, url_date = self._match_valid_url(url).group('id', 'date')

        webpage = self._download_archived_page(
            f'https://www.vlive.tv/video/{video_id}', video_id, timestamp=url_date)

        player_info = self._search_json(r'__PRELOADED_STATE__\s*=', webpage, 'player info', video_id)
        user_country = traverse_obj(player_info, ('common', 'userCountry'))

        main_script_url = self._search_regex(
            r'<script\s+src="([^"]+/js/main\.[^"]+\.js)"', webpage, 'main script url')
        main_script = self._download_archived_page(main_script_url, video_id, note='Downloading main script')
        app_id = self._search_regex(r'appId\s*=\s*"([^"]+)"', main_script, 'app id')

        inkey = self._download_archived_json(
            f'https://www.vlive.tv/globalv-web/vam-web/video/v1.0/vod/{video_id}/inkey', video_id, note='Fetching inkey', query={
                'appId': app_id,
                'platformType': 'PC',
                'gcc': user_country,
                'locale': 'en_US',
            }, fatal=False)

        vod_id = traverse_obj(player_info, ('postDetail', 'post', 'officialVideo', 'vodId'))

        vod_data = self._download_archived_json(
            f'https://apis.naver.com/rmcnmv/rmcnmv/vod/play/v2.0/{vod_id}', video_id, note='Fetching vod data', query={
                'key': inkey.get('inkey'),
                'pid': 'rmcPlayer_16692457559726800',  # partially unix time and partially random. Fixed value used by archiveteam project
                'sid': '2024',
                'ver': '2.0',
                'devt': 'html5_pc',
                'doct': 'json',
                'ptc': 'https',
                'sptc': 'https',
                'cpt': 'vtt',
                'ctls': '%7B%22visible%22%3A%7B%22fullscreen%22%3Atrue%2C%22logo%22%3Afalse%2C%22playbackRate%22%3Afalse%2C%22scrap%22%3Afalse%2C%22playCount%22%3Atrue%2C%22commentCount%22%3Atrue%2C%22title%22%3Atrue%2C%22writer%22%3Atrue%2C%22expand%22%3Afalse%2C%22subtitles%22%3Atrue%2C%22thumbnails%22%3Atrue%2C%22quality%22%3Atrue%2C%22setting%22%3Atrue%2C%22script%22%3Afalse%2C%22logoDimmed%22%3Atrue%2C%22badge%22%3Atrue%2C%22seekingTime%22%3Atrue%2C%22muted%22%3Atrue%2C%22muteButton%22%3Afalse%2C%22viewerNotice%22%3Afalse%2C%22linkCount%22%3Afalse%2C%22createTime%22%3Afalse%2C%22thumbnail%22%3Atrue%7D%2C%22clicked%22%3A%7B%22expand%22%3Afalse%2C%22subtitles%22%3Afalse%7D%7D',
                'pv': '4.26.9',
                'dr': '1920x1080',
                'cpl': 'en_US',
                'lc': 'en_US',
                'adi': '%5B%7B%22type%22%3A%22pre%22%2C%22exposure%22%3Afalse%2C%22replayExposure%22%3Afalse%7D%5D',
                'adu': '%2F',
                'videoId': vod_id,
                'cc': user_country,
            })

        formats = []

        streams = traverse_obj(vod_data, ('streams', ...))
        if len(streams) > 1:
            self.report_warning('Multiple streams found. Only the first stream will be downloaded.')
        # The vod data may carry no streams at all (only plain MP4 entries)
        stream = streams[0] if streams else {}

        max_stream = max(
            stream.get('videos') or [],
            key=lambda v: traverse_obj(v, ('bitrate', 'video'), default=0), default=None)
        m3u8_source = max_stream.get('source') if max_stream is not None else None
        if m3u8_source:
            params = {arg.get('name'): arg.get('value') for arg in stream.get('keys', []) if arg.get('type') == 'param'}
            formats = self._extract_formats_from_m3u8(m3u8_source, params, video_id) or []

        # For parts of the project MP4 files were archived
        max_video = max(
            traverse_obj(vod_data, ('videos', 'list', ...)),
            key=lambda v: traverse_obj(v, ('bitrate', 'video'), default=0), default=None)
        mp4_source = max_video.get('source') if max_video is not None else None
        if mp4_source:
            video_url = self._WAYBACK_BASE_URL + mp4_source
            urlh = self._request_webpage(
                HEADRequest(video_url), video_id, errnote=False,
                fatal=False, note='Check video availablity')
            if urlh:
                formats.append({'url': video_url})

        return {
            'id': video_id,
            'formats': formats,
            **traverse_obj(player_info, ('postDetail', 'post', {
                'title': ('officialVideo', 'title', {str}),
                'creator': ('author', 'nickname', {str}),
                'channel': ('channel', 'channelName', {str}),
                'channel_id': ('channel', 'channelCode', {str}),
                'duration': ('officialVideo', 'playTime', {int_or_none}),
                'view_count': ('officialVideo', 'playCount', {int_or_none}),
                'like_count': ('officialVideo', 'likeCount', {int_or_none}),
                'comment_count': ('officialVideo', 'commentCount', {int_or_none}),
                'timestamp': ('officialVideo', 'createdAt', {lambda x: int_or_none(x, scale=1000)}),
                'release_timestamp': ('officialVideo', 'willStartAt', {lambda x: int_or_none(x, scale=1000)}),
            })),
            **traverse_obj(vod_data, ('meta', {
                'uploader_id': ('user', 'id', {str}),
                'uploader': ('user', 'name', {str}),
                'uploader_url': ('user', 'url', {url_or_none}),
                'thumbnail': ('cover', 'source', {url_or_none}),
            }), expected_type=lambda x: x or None),
            **NaverBaseIE.process_subtitles(vod_data, lambda x: [self._WAYBACK_BASE_URL + x]),
        }

View File

@ -12,7 +12,7 @@ class AWSIE(InfoExtractor): # XXX: Conventionally, base classes should end with
def _aws_execute_api(self, aws_dict, video_id, query=None):
query = query or {}
amz_date = datetime.datetime.utcnow().strftime('%Y%m%dT%H%M%SZ')
amz_date = datetime.datetime.now(datetime.timezone.utc).strftime('%Y%m%dT%H%M%SZ')
date = amz_date[:8]
headers = {
'Accept': 'application/json',

87
yt_dlp/extractor/axs.py Normal file
View File

@ -0,0 +1,87 @@
from .common import InfoExtractor
from ..utils import (
float_or_none,
js_to_json,
parse_iso8601,
traverse_obj,
url_or_none,
)
class AxsIE(InfoExtractor):
    # Extractor for axs.tv video pages; the player metadata is served by api.myspotlight.tv
    IE_NAME = 'axs.tv'
    _VALID_URL = r'https?://(?:www\.)?axs\.tv/(?:channel/(?:[^/?#]+/)+)?video/(?P<id>[^/?#]+)'

    _TESTS = [{
        'url': 'https://www.axs.tv/video/5f4dc776b70e4f1c194f22ef/',
        'md5': '8d97736ae8e50c64df528e5e676778cf',
        'info_dict': {
            'id': '5f4dc776b70e4f1c194f22ef',
            'title': 'Small Town',
            'ext': 'mp4',
            'description': 'md5:e314d28bfaa227a4d7ec965fae19997f',
            'upload_date': '20230602',
            'timestamp': 1685729564,
            'duration': 1284.216,
            'series': 'Rock & Roll Road Trip with Sammy Hagar',
            'season': 2,
            'episode': '3',
            'thumbnail': 'https://images.dotstudiopro.com/5f4e9d330a0c3b295a7e8394',
        },
    }, {
        'url': 'https://www.axs.tv/channel/rock-star-interview/video/daryl-hall',
        'md5': '300ae795cd8f9984652c0949734ffbdc',
        'info_dict': {
            'id': '5f488148b70e4f392572977c',
            'display_id': 'daryl-hall',
            'title': 'Daryl Hall',
            'ext': 'mp4',
            'description': 'md5:e54ecaa0f4b5683fc9259e9e4b196628',
            'upload_date': '20230214',
            'timestamp': 1676403615,
            'duration': 2570.668,
            'series': 'The Big Interview with Dan Rather',
            'season': 3,
            'episode': '5',
            'thumbnail': 'https://images.dotstudiopro.com/5f4d1901f340b50d937cec32',
        },
    }]

    def _real_extract(self, url):
        display_id = self._match_id(url)
        page = self._download_webpage(url, display_id)

        # The page embeds a JS object literal ("mountObj") holding the IDs needed for the API call
        mount_obj = self._search_json(
            r'mountObj\s*=', page, 'video ID data', display_id,
            transform_source=js_to_json)
        video_id = mount_obj['video_id']
        company_id = mount_obj['company_id']

        api_response = self._download_json(
            f'https://api.myspotlight.tv/dotplayer/video/{company_id}/{video_id}',
            video_id, query={'device_type': 'desktop_web'})
        meta = api_response['video']

        formats = self._extract_m3u8_formats(
            meta['video_m3u8'], video_id, 'mp4', m3u8_id='hls')

        # Keep only caption entries whose srtPath is a valid URL; language defaults to 'en'
        captions = traverse_obj(meta, ('closeCaption', lambda _, v: url_or_none(v['srtPath'])))
        subtitles = {}
        for caption in captions:
            lang = caption.get('srtShortLang') or 'en'
            track = {'ext': caption.get('srtExt'), 'url': caption['srtPath']}
            subtitles.setdefault(lang, []).append(track)

        metadata = traverse_obj(meta, {
            'title': ('title', {str}),
            'description': ('description', {str}),
            'series': ('seriestitle', {str}),
            'season': ('season', {int}),
            'episode': ('episode', {str}),
            'duration': ('duration', {float_or_none}),
            'timestamp': ('updated_at', {parse_iso8601}),
            'thumbnail': ('thumb', {url_or_none}),
        })

        return {
            'id': video_id,
            'display_id': display_id,
            'formats': formats,
            **metadata,
            'subtitles': subtitles,
        }

View File

@ -31,7 +31,7 @@ class BanByeBaseIE(InfoExtractor):
class BanByeIE(BanByeBaseIE):
_VALID_URL = r'https?://(?:www\.)?banbye.com/(?:en/)?watch/(?P<id>\w+)'
_VALID_URL = r'https?://(?:www\.)?banbye.com/(?:en/)?watch/(?P<id>[\w-]+)'
_TESTS = [{
'url': 'https://banbye.com/watch/v_ytfmvkVYLE8T',
'md5': '2f4ea15c5ca259a73d909b2cfd558eb5',
@ -59,7 +59,27 @@ class BanByeIE(BanByeBaseIE):
'title': 'Krzysztof Karoń',
'id': 'p_Ld82N6gBw_OJ',
},
'playlist_count': 9,
'playlist_mincount': 9,
}, {
'url': 'https://banbye.com/watch/v_kb6_o1Kyq-CD',
'info_dict': {
'id': 'v_kb6_o1Kyq-CD',
'ext': 'mp4',
'title': 'Co tak naprawdę dzieje się we Francji?! Czy Warszawa a potem cała Polska będzie drugim Paryżem?!🤔🇵🇱',
'description': 'md5:82be4c0e13eae8ea1ca8b9f2e07226a8',
'uploader': 'Marcin Rola - MOIM ZDANIEM!🇵🇱',
'channel_id': 'ch_QgWnHvDG2fo5',
'channel_url': 'https://banbye.com/channel/ch_QgWnHvDG2fo5',
'duration': 597,
'timestamp': 1688642656,
'upload_date': '20230706',
'thumbnail': 'https://cdn.banbye.com/video/v_kb6_o1Kyq-CD/96.webp',
'tags': ['Paryż', 'Francja', 'Polska', 'Imigranci', 'Morawiecki', 'Tusk'],
'like_count': int,
'dislike_count': int,
'view_count': int,
'comment_count': int,
},
}]
def _real_extract(self, url):

View File

@ -15,11 +15,13 @@ from ..utils import (
float_or_none,
get_element_by_class,
int_or_none,
join_nonempty,
js_to_json,
parse_duration,
parse_iso8601,
parse_qs,
strip_or_none,
traverse_obj,
try_get,
unescapeHTML,
unified_timestamp,
@ -41,7 +43,6 @@ class BBCCoUkIE(InfoExtractor):
iplayer(?:/[^/]+)?/(?:episode/|playlist/)|
music/(?:clips|audiovideo/popular)[/#]|
radio/player/|
sounds/play/|
events/[^/]+/play/[^/]+/
)
(?P<id>%s)(?!/(?:episodes|broadcasts|clips))
@ -218,20 +219,6 @@ class BBCCoUkIE(InfoExtractor):
# rtmp download
'skip_download': True,
},
}, {
'url': 'https://www.bbc.co.uk/sounds/play/m0007jzb',
'note': 'Audio',
'info_dict': {
'id': 'm0007jz9',
'ext': 'mp4',
'title': 'BBC Proms, 2019, Prom 34: WestEastern Divan Orchestra',
'description': "Live BBC Proms. WestEastern Divan Orchestra with Daniel Barenboim and Martha Argerich.",
'duration': 9840,
},
'params': {
# rtmp download
'skip_download': True,
}
}, {
'url': 'http://www.bbc.co.uk/iplayer/playlist/p01dvks4',
'only_matching': True,
@ -844,6 +831,20 @@ class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE
'upload_date': '20190604',
'categories': ['Psychology'],
},
}, {
# BBC Sounds
'url': 'https://www.bbc.co.uk/sounds/play/m001q78b',
'info_dict': {
'id': 'm001q789',
'ext': 'mp4',
'title': 'The Night Tracks Mix - Music for the darkling hour',
'thumbnail': 'https://ichef.bbci.co.uk/images/ic/raw/p0c00hym.jpg',
'chapters': 'count:8',
'description': 'md5:815fb51cbdaa270040aab8145b3f1d67',
'uploader': 'Radio 3',
'duration': 1800,
'uploader_id': 'bbc_radio_three',
},
}, { # onion routes
'url': 'https://www.bbcnewsd73hkzno2ini43t4gblxvycyac5aw4gnv7t2rccijh7745uqd.onion/news/av/world-europe-63208576',
'only_matching': True,
@ -1128,6 +1129,13 @@ class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE
'uploader_id': network.get('id'),
'formats': formats,
'subtitles': subtitles,
'chapters': traverse_obj(preload_state, (
'tracklist', 'tracks', lambda _, v: float_or_none(v['offset']['start']), {
'title': ('titles', {lambda x: join_nonempty(
'primary', 'secondary', 'tertiary', delim=' - ', from_dict=x)}),
'start_time': ('offset', 'start', {float_or_none}),
'end_time': ('offset', 'end', {float_or_none}),
})) or None,
}
bbc3_config = self._parse_json(

View File

@ -1,6 +1,7 @@
from .common import InfoExtractor
from ..utils import (
int_or_none,
traverse_obj,
unescapeHTML,
)
@ -8,7 +9,8 @@ from ..utils import (
class BildIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?bild\.de/(?:[^/]+/)+(?P<display_id>[^/]+)-(?P<id>\d+)(?:,auto=true)?\.bild\.html'
IE_DESC = 'Bild.de'
_TEST = {
_TESTS = [{
'note': 'static MP4 only',
'url': 'http://www.bild.de/video/clip/apple-ipad-air/das-koennen-die-neuen-ipads-38184146.bild.html',
'md5': 'dd495cbd99f2413502a1713a1156ac8a',
'info_dict': {
@ -19,7 +21,19 @@ class BildIE(InfoExtractor):
'thumbnail': r're:^https?://.*\.jpg$',
'duration': 196,
}
}
}, {
'note': 'static MP4 and HLS',
'url': 'https://www.bild.de/video/clip/news-ausland/deftiger-abgang-vom-10m-turm-bademeister-sorgt-fuer-skandal-85158620.bild.html',
'md5': 'fb0ed4f09c495d4ba7ce2eee0bb90de1',
'info_dict': {
'id': '85158620',
'ext': 'mp4',
'title': 'Der Sprungturm-Skandal',
'description': 'md5:709b543c24dc31bbbffee73bccda34ad',
'thumbnail': r're:^https?://.*\.jpg$',
'duration': 69,
}
}]
def _real_extract(self, url):
video_id = self._match_id(url)
@ -27,11 +41,23 @@ class BildIE(InfoExtractor):
video_data = self._download_json(
url.split('.bild.html')[0] + ',view=json.bild.html', video_id)
formats = []
for src in traverse_obj(video_data, ('clipList', 0, 'srces', lambda _, v: v['src'])):
src_type = src.get('type')
if src_type == 'application/x-mpegURL':
formats.extend(
self._extract_m3u8_formats(
src['src'], video_id, 'mp4', m3u8_id='hls', fatal=False))
elif src_type == 'video/mp4':
formats.append({'url': src['src'], 'format_id': 'http-mp4'})
else:
self.report_warning(f'Skipping unsupported format type: "{src_type}"')
return {
'id': video_id,
'title': unescapeHTML(video_data['title']).strip(),
'description': unescapeHTML(video_data.get('description')),
'url': video_data['clipList'][0]['srces'][0]['src'],
'formats': formats,
'thumbnail': video_data.get('poster'),
'duration': int_or_none(video_data.get('durationSec')),
}

View File

@ -3,6 +3,7 @@ import functools
import hashlib
import itertools
import math
import re
import time
import urllib.parse
@ -14,6 +15,7 @@ from ..utils import (
GeoRestrictedError,
InAdvancePagedList,
OnDemandPagedList,
bool_or_none,
filter_dict,
float_or_none,
format_field,
@ -34,27 +36,31 @@ from ..utils import (
unsmuggle_url,
url_or_none,
urlencode_postdata,
variadic,
)
class BilibiliBaseIE(InfoExtractor):
_FORMAT_ID_RE = re.compile(r'-(\d+)\.m4s\?')
def extract_formats(self, play_info):
format_names = {
r['quality']: traverse_obj(r, 'new_description', 'display_desc')
for r in traverse_obj(play_info, ('support_formats', lambda _, v: v['quality']))
}
audios = traverse_obj(play_info, ('dash', 'audio', ...))
audios = traverse_obj(play_info, ('dash', (None, 'dolby'), 'audio', ..., {dict}))
flac_audio = traverse_obj(play_info, ('dash', 'flac', 'audio'))
if flac_audio:
audios.append(flac_audio)
formats = [{
'url': traverse_obj(audio, 'baseUrl', 'base_url', 'url'),
'ext': mimetype2ext(traverse_obj(audio, 'mimeType', 'mime_type')),
'acodec': audio.get('codecs'),
'acodec': traverse_obj(audio, ('codecs', {str.lower})),
'vcodec': 'none',
'tbr': float_or_none(audio.get('bandwidth'), scale=1000),
'filesize': int_or_none(audio.get('size'))
'filesize': int_or_none(audio.get('size')),
'format_id': str_or_none(audio.get('id')),
} for audio in audios]
formats.extend({
@ -65,9 +71,13 @@ class BilibiliBaseIE(InfoExtractor):
'height': int_or_none(video.get('height')),
'vcodec': video.get('codecs'),
'acodec': 'none' if audios else None,
'dynamic_range': {126: 'DV', 125: 'HDR10'}.get(int_or_none(video.get('id'))),
'tbr': float_or_none(video.get('bandwidth'), scale=1000),
'filesize': int_or_none(video.get('size')),
'quality': int_or_none(video.get('id')),
'format_id': traverse_obj(
video, (('baseUrl', 'base_url'), {self._FORMAT_ID_RE.search}, 1),
('id', {str_or_none}), get_all=False),
'format': format_names.get(video.get('id')),
} for video in traverse_obj(play_info, ('dash', 'video', ...)))
@ -149,7 +159,7 @@ class BilibiliBaseIE(InfoExtractor):
class BiliBiliIE(BilibiliBaseIE):
_VALID_URL = r'https?://www\.bilibili\.com/(?:video/|festival/\w+\?(?:[^#]*&)?bvid=)[aAbB][vV](?P<id>[^/?#&]+)'
_VALID_URL = r'https?://(?:www\.)?bilibili\.com/(?:video/|festival/\w+\?(?:[^#]*&)?bvid=)[aAbB][vV](?P<id>[^/?#&]+)'
_TESTS = [{
'url': 'https://www.bilibili.com/video/BV13x41117TL',
@ -245,7 +255,7 @@ class BiliBiliIE(BilibiliBaseIE):
'description': 'md5:afde2b7ba9025c01d9e3dde10de221e4',
'duration': 313.557,
'upload_date': '20220709',
'uploader': '小夫Tech',
'uploader': '小夫太渴',
'timestamp': 1657347907,
'uploader_id': '1326814124',
'comment_count': int,
@ -502,7 +512,7 @@ class BiliBiliBangumiIE(BilibiliBaseIE):
class BiliBiliBangumiMediaIE(BilibiliBaseIE):
_VALID_URL = r'https?://www\.bilibili\.com/bangumi/media/md(?P<id>\d+)'
_VALID_URL = r'https?://(?:www\.)?bilibili\.com/bangumi/media/md(?P<id>\d+)'
_TESTS = [{
'url': 'https://www.bilibili.com/bangumi/media/md24097891',
'info_dict': {
@ -521,7 +531,7 @@ class BiliBiliBangumiMediaIE(BilibiliBaseIE):
class BiliBiliBangumiSeasonIE(BilibiliBaseIE):
_VALID_URL = r'(?x)https?://www\.bilibili\.com/bangumi/play/ss(?P<id>\d+)'
_VALID_URL = r'(?x)https?://(?:www\.)?bilibili\.com/bangumi/play/ss(?P<id>\d+)'
_TESTS = [{
'url': 'https://www.bilibili.com/bangumi/play/ss26801',
'info_dict': {
@ -672,13 +682,35 @@ class BilibiliSpaceAudioIE(BilibiliSpaceBaseIE):
return self.playlist_result(paged_list, playlist_id)
class BilibiliSpacePlaylistIE(BilibiliSpaceBaseIE):
_VALID_URL = r'https?://space.bilibili\.com/(?P<mid>\d+)/channel/collectiondetail\?sid=(?P<sid>\d+)'
class BilibiliSpaceListBaseIE(BilibiliSpaceBaseIE):
    """Helpers shared by the Bilibili list extractors (entries, uploader lookup, metadata cleanup)."""

    def _get_entries(self, page_data, bvid_keys, ending_key='bvid'):
        """Yield a url result for every video id found in `page_data`.

        @param page_data    Parsed API response to search
        @param bvid_keys    Key or path (tuple) leading to the list of entries
        @param ending_key   Key inside each entry that holds the video id string
        """
        for bvid in traverse_obj(page_data, (*variadic(bvid_keys, (str, bytes, dict, set)), ..., ending_key, {str})):
            yield self.url_result(f'https://www.bilibili.com/video/{bvid}', BiliBiliIE, bvid)

    def _get_uploader(self, uid, playlist_id):
        """Return the uploader name scraped from the user's space page title, or None.

        Both the download and the regex search are best-effort (non-fatal).
        """
        webpage = self._download_webpage(f'https://space.bilibili.com/{uid}', playlist_id, fatal=False)
        if not webpage:
            # A failed non-fatal download yields a falsy non-string value;
            # searching it with a regex would raise TypeError
            return None
        return self._search_regex(r'(?s)<title\b[^>]*>([^<]+)的个人空间-', webpage, 'uploader', fatal=False)

    def _extract_playlist(self, fetch_page, get_metadata, get_entries):
        """Run the parent playlist extraction and strip the paging-only keys from its metadata.

        Returns the `(metadata, page_list)` pair from the parent implementation
        with `page_count` and `page_size` removed from `metadata`.
        """
        metadata, page_list = super()._extract_playlist(fetch_page, get_metadata, get_entries)
        metadata.pop('page_count', None)
        metadata.pop('page_size', None)
        return metadata, page_list
class BilibiliCollectionListIE(BilibiliSpaceListBaseIE):
_VALID_URL = r'https?://space\.bilibili\.com/(?P<mid>\d+)/channel/collectiondetail/?\?sid=(?P<sid>\d+)'
_TESTS = [{
'url': 'https://space.bilibili.com/2142762/channel/collectiondetail?sid=57445',
'info_dict': {
'id': '2142762_57445',
'title': '《底特律 变人》'
'title': '【完结】《底特律 变人》全结局流程解说',
'description': '',
'uploader': '老戴在此',
'uploader_id': '2142762',
'timestamp': int,
'upload_date': str,
'thumbnail': 'https://archive.biliimg.com/bfs/archive/e0e543ae35ad3df863ea7dea526bc32e70f4c091.jpg',
},
'playlist_mincount': 31,
}]
@ -699,22 +731,251 @@ class BilibiliSpacePlaylistIE(BilibiliSpaceBaseIE):
return {
'page_count': math.ceil(entry_count / page_size),
'page_size': page_size,
'title': traverse_obj(page_data, ('meta', 'name'))
'uploader': self._get_uploader(mid, playlist_id),
**traverse_obj(page_data, {
'title': ('meta', 'name', {str}),
'description': ('meta', 'description', {str}),
'uploader_id': ('meta', 'mid', {str_or_none}),
'timestamp': ('meta', 'ptime', {int_or_none}),
'thumbnail': ('meta', 'cover', {url_or_none}),
})
}
def get_entries(page_data):
for entry in page_data.get('archives', []):
yield self.url_result(f'https://www.bilibili.com/video/{entry["bvid"]}',
BiliBiliIE, entry['bvid'])
return self._get_entries(page_data, 'archives')
metadata, paged_list = self._extract_playlist(fetch_page, get_metadata, get_entries)
return self.playlist_result(paged_list, playlist_id, metadata['title'])
return self.playlist_result(paged_list, playlist_id, **metadata)
class BilibiliSeriesListIE(BilibiliSpaceListBaseIE):
    """Playlist extractor for `space.bilibili.com/<mid>/channel/seriesdetail?sid=<sid>` URLs."""

    _VALID_URL = r'https?://space\.bilibili\.com/(?P<mid>\d+)/channel/seriesdetail/?\?\bsid=(?P<sid>\d+)'
    _TESTS = [{
        'url': 'https://space.bilibili.com/1958703906/channel/seriesdetail?sid=547718&ctype=0',
        'info_dict': {
            'id': '1958703906_547718',
            'title': '直播回放',
            'description': '直播回放',
            'uploader': '靡烟miya',
            'uploader_id': '1958703906',
            'timestamp': 1637985853,
            'upload_date': '20211127',
            'modified_timestamp': int,
            'modified_date': str,
        },
        'playlist_mincount': 513,
    }]

    def _real_extract(self, url):
        """Build the playlist from the series metadata endpoint plus the paged archives endpoint."""
        mid, sid = self._match_valid_url(url).group('mid', 'sid')
        playlist_id = f'{mid}_{sid}'

        # Series-level metadata is optional: the request is non-fatal
        series_info = self._download_json(
            f'https://api.bilibili.com/x/series/series?series_id={sid}', playlist_id, fatal=False
        )
        playlist_meta = traverse_obj(series_info, {
            'title': ('data', 'meta', 'name', {str}),
            'description': ('data', 'meta', 'description', {str}),
            'uploader_id': ('data', 'meta', 'mid', {str_or_none}),
            'timestamp': ('data', 'meta', 'ctime', {int_or_none}),
            'modified_timestamp': ('data', 'meta', 'mtime', {int_or_none}),
        })

        def fetch_page(page_idx):
            # The API's `pn` parameter is 1-based while page_idx is 0-based
            page_query = {'mid': mid, 'series_id': sid, 'pn': page_idx + 1, 'ps': 30}
            response = self._download_json(
                'https://api.bilibili.com/x/series/archives',
                playlist_id, note=f'Downloading page {page_idx}',
                query=page_query)
            return response['data']

        def get_metadata(page_data):
            paging = page_data['page']
            page_size = paging['size']
            entry_count = paging['total']
            return {
                'page_count': math.ceil(entry_count / page_size),
                'page_size': page_size,
                'uploader': self._get_uploader(mid, playlist_id),
                **playlist_meta
            }

        def get_entries(page_data):
            return self._get_entries(page_data, 'archives')

        metadata, paged_list = self._extract_playlist(fetch_page, get_metadata, get_entries)
        return self.playlist_result(paged_list, playlist_id, **metadata)
class BilibiliFavoritesListIE(BilibiliSpaceListBaseIE):
    """Playlist extractor for favorites lists (`favlist?fid=<id>` and `medialist/detail/ml<id>`)."""

    _VALID_URL = r'https?://(?:space\.bilibili\.com/\d+/favlist/?\?fid=|(?:www\.)?bilibili\.com/medialist/detail/ml)(?P<id>\d+)'
    _TESTS = [{
        'url': 'https://space.bilibili.com/84912/favlist?fid=1103407912&ftype=create',
        'info_dict': {
            'id': '1103407912',
            'title': '【V2】',
            'description': '',
            'uploader': '晓月春日',
            'uploader_id': '84912',
            'timestamp': 1604905176,
            'upload_date': '20201109',
            'modified_timestamp': int,
            'modified_date': str,
            'thumbnail': r"re:http://i\d\.hdslb\.com/bfs/archive/14b83c62aa8871b79083df1e9ab4fbc699ad16fe\.jpg",
            'view_count': int,
            'like_count': int,
        },
        'playlist_mincount': 22,
    }, {
        'url': 'https://www.bilibili.com/medialist/detail/ml1103407912',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Fetch the list metadata first, then the full id listing used for the entries."""
        fid = self._match_id(url)

        list_info = self._download_json(
            f'https://api.bilibili.com/x/v3/fav/resource/list?media_id={fid}&pn=1&ps=20',
            fid, note='Downloading favlist metadata')
        # The API answers code -403 for lists the current session may not view
        if list_info['code'] == -403:
            self.raise_login_required(msg='This is a private favorites list. You need to log in as its owner')

        ids_response = self._download_json(
            f'https://api.bilibili.com/x/v3/fav/resource/ids?media_id={fid}',
            fid, note='Download favlist entries')
        entries = self._get_entries(ids_response, 'data')

        metadata = traverse_obj(list_info, ('data', 'info', {
            'title': ('title', {str}),
            'description': ('intro', {str}),
            'uploader': ('upper', 'name', {str}),
            'uploader_id': ('upper', 'mid', {str_or_none}),
            'timestamp': ('ctime', {int_or_none}),
            'modified_timestamp': ('mtime', {int_or_none}),
            'thumbnail': ('cover', {url_or_none}),
            'view_count': ('cnt_info', 'play', {int_or_none}),
            'like_count': ('cnt_info', 'thumb_up', {int_or_none}),
        }))
        return self.playlist_result(entries, fid, **metadata)
class BilibiliWatchlaterIE(BilibiliSpaceListBaseIE):
    """Playlist extractor for the logged-in user's watch-later list."""

    _VALID_URL = r'https?://(?:www\.)?bilibili\.com/watchlater/?(?:[?#]|$)'
    _TESTS = [{
        'url': 'https://www.bilibili.com/watchlater/#/list',
        'info_dict': {'id': 'watchlater'},
        'playlist_mincount': 0,
        'skip': 'login required',
    }]

    def _real_extract(self, url):
        """Return the watch-later playlist; requires login (API code -101 otherwise)."""
        # Use the `DedeUserID` cookie value as the playlist id when present,
        # otherwise fall back to the literal 'watchlater'
        user_cookie = self._get_cookies(url).get('DedeUserID')
        list_id = getattr(user_cookie, 'value', 'watchlater')

        watchlater_info = self._download_json(
            'https://api.bilibili.com/x/v2/history/toview/web?jsonp=jsonp', list_id)
        if watchlater_info['code'] == -101:
            self.raise_login_required(msg='You need to login to access your watchlater list')

        return self.playlist_result(
            self._get_entries(watchlater_info, ('data', 'list')),
            id=list_id, title='稍后再看')
class BilibiliPlaylistIE(BilibiliSpaceListBaseIE):
    """Playlist extractor for `bilibili.com/list/<id>` and `bilibili.com/medialist/play/<id>` pages."""

    _VALID_URL = r'https?://(?:www\.)?bilibili\.com/(?:medialist/play|list)/(?P<id>\w+)'
    _TESTS = [{
        'url': 'https://www.bilibili.com/list/1958703906?sid=547718',
        'info_dict': {
            'id': '5_547718',
            'title': '直播回放',
            'uploader': '靡烟miya',
            'uploader_id': '1958703906',
            'timestamp': 1637985853,
            'upload_date': '20211127',
        },
        'playlist_mincount': 513,
    }, {
        'url': 'https://www.bilibili.com/medialist/play/1958703906?business=space_series&business_id=547718&desc=1',
        'info_dict': {
            'id': '5_547718',
        },
        'playlist_mincount': 513,
        'skip': 'redirect url',
    }, {
        'url': 'https://www.bilibili.com/list/ml1103407912',
        'info_dict': {
            'id': '3_1103407912',
            'title': '【V2】',
            'uploader': '晓月春日',
            'uploader_id': '84912',
            'timestamp': 1604905176,
            'upload_date': '20201109',
            'thumbnail': r"re:http://i\d\.hdslb\.com/bfs/archive/14b83c62aa8871b79083df1e9ab4fbc699ad16fe\.jpg",
        },
        'playlist_mincount': 22,
    }, {
        'url': 'https://www.bilibili.com/medialist/play/ml1103407912',
        'info_dict': {
            'id': '3_1103407912',
        },
        'playlist_mincount': 22,
        'skip': 'redirect url',
    }, {
        'url': 'https://www.bilibili.com/list/watchlater',
        'info_dict': {'id': 'watchlater'},
        'playlist_mincount': 0,
        'skip': 'login required',
    }, {
        'url': 'https://www.bilibili.com/medialist/play/watchlater',
        'info_dict': {'id': 'watchlater'},
        'playlist_mincount': 0,
        'skip': 'login required',
    }]

    def _extract_medialist(self, query, list_id):
        """Yield entries page by page from the medialist API until `has_more` is falsy.

        `query` is updated in place: after each page its `oid` is set to the id
        of the last item received, which is sent with the next request.
        """
        page_num = 0
        while True:
            page_num += 1
            response = self._download_json(
                'https://api.bilibili.com/x/v2/medialist/resource/list',
                list_id, query=query, note=f'getting playlist {query["biz_id"]} page {page_num}'
            )
            page_data = response['data']
            yield from self._get_entries(page_data, 'media_list', ending_key='bv_id')
            query['oid'] = traverse_obj(page_data, ('media_list', -1, 'id'))
            if not page_data.get('has_more', False):
                return

    def _real_extract(self, url):
        """Read the page's `__INITIAL_STATE__`, surface its error state, then page through the list."""
        list_id = self._match_id(url)
        webpage = self._download_webpage(url, list_id)
        initial_state = self._search_json(r'window\.__INITIAL_STATE__\s*=', webpage, 'initial state', list_id)

        status_code = traverse_obj(initial_state, ('error', 'code', {int_or_none}))
        if status_code != 200:
            # Anything other than code 200 is treated as a failure;
            # `trueCode` selects the specific message
            error_code = traverse_obj(initial_state, ('error', 'trueCode', {int_or_none}))
            error_message = traverse_obj(initial_state, ('error', 'message', {str_or_none}))
            if error_code == -400 and list_id == 'watchlater':
                self.raise_login_required('You need to login to access your watchlater playlist')
            elif error_code == -403:
                self.raise_login_required('This is a private playlist. You need to login as its owner')
            elif error_code == 11010:
                raise ExtractorError('Playlist is no longer available', expected=True)
            raise ExtractorError(f'Could not access playlist: {error_code} {error_message}')

        query = {'ps': 20, 'with_current': False}
        query.update(traverse_obj(initial_state, {
            'type': ('playlist', 'type', {int_or_none}),
            'biz_id': ('playlist', 'id', {int_or_none}),
            'tid': ('tid', {int_or_none}),
            'sort_field': ('sortFiled', {int_or_none}),
            'desc': ('desc', {bool_or_none}, {str_or_none}, {str.lower}),
        }))

        metadata = {'id': f'{query["type"]}_{query["biz_id"]}'}
        metadata.update(traverse_obj(initial_state, ('mediaListInfo', {
            'title': ('title', {str}),
            'uploader': ('upper', 'name', {str}),
            'uploader_id': ('upper', 'mid', {str_or_none}),
            'timestamp': ('ctime', {int_or_none}),
            'thumbnail': ('cover', {url_or_none}),
        })))

        entries = self._extract_medialist(query, list_id)
        return self.playlist_result(entries, **metadata)
class BilibiliCategoryIE(InfoExtractor):
IE_NAME = 'Bilibili category extractor'
_MAX_RESULTS = 1000000
_VALID_URL = r'https?://www\.bilibili\.com/v/[a-zA-Z]+\/[a-zA-Z]+'
_VALID_URL = r'https?://(?:www\.)?bilibili\.com/v/[a-zA-Z]+\/[a-zA-Z]+'
_TESTS = [{
'url': 'https://www.bilibili.com/v/kichiku/mad',
'info_dict': {
@ -1399,7 +1660,7 @@ class BiliIntlSeriesIE(BiliIntlBaseIE):
class BiliLiveIE(InfoExtractor):
_VALID_URL = r'https?://live.bilibili.com/(?:blanc/)?(?P<id>\d+)'
_VALID_URL = r'https?://live\.bilibili\.com/(?:blanc/)?(?P<id>\d+)'
_TESTS = [{
'url': 'https://live.bilibili.com/196',

View File

@ -1,56 +1,170 @@
import functools
import re
from .common import InfoExtractor
from ..utils import (
clean_html,
extract_attributes,
get_element_text_and_html_by_tag,
get_elements_by_class,
join_nonempty,
js_to_json,
determine_ext,
mimetype2ext,
unified_strdate,
url_or_none,
urljoin,
variadic,
)
from ..utils.traversal import traverse_obj
def html_get_element(tag=None, cls=None):
    """Build a one-argument callable that returns the first match for `tag`/`cls` in an HTML string.

    When `cls` is given the lookup goes through `get_elements_by_class`
    (restricted to `tag`); otherwise `get_element_text_and_html_by_tag` is
    used. In both cases the wrapper returns element `[0]` of the (variadic)
    result. At least one of `tag` or `cls` must be truthy.
    """
    assert tag or cls, 'One of tag or class is required'

    finder = (
        functools.partial(get_elements_by_class, cls, tag=tag) if cls
        else functools.partial(get_element_text_and_html_by_tag, tag))

    def html_get_element_wrapper(html):
        results = variadic(finder(html))
        return results[0]

    return html_get_element_wrapper
class BpbIE(InfoExtractor):
IE_DESC = 'Bundeszentrale für politische Bildung'
_VALID_URL = r'https?://(?:www\.)?bpb\.de/mediathek/(?P<id>[0-9]+)/'
_VALID_URL = r'https?://(?:www\.|m\.)?bpb\.de/(?:[^/?#]+/)*(?P<id>\d+)(?:[/?#]|$)'
_TEST = {
_TESTS = [{
'url': 'http://www.bpb.de/mediathek/297/joachim-gauck-zu-1989-und-die-erinnerung-an-die-ddr',
'md5': 'c4f84c8a8044ca9ff68bb8441d300b3f',
'info_dict': {
'id': '297',
'ext': 'mp4',
'creator': 'Kooperative Berlin',
'description': 'md5:f4f75885ba009d3e2b156247a8941ce6',
'release_date': '20160115',
'series': 'Interview auf dem Geschichtsforum 1989 | 2009',
'tags': ['Friedliche Revolution', 'Erinnerungskultur', 'Vergangenheitspolitik', 'DDR 1949 - 1990', 'Freiheitsrecht', 'BStU', 'Deutschland'],
'thumbnail': 'https://www.bpb.de/cache/images/7/297_teaser_16x9_1240.jpg?8839D',
'title': 'Joachim Gauck zu 1989 und die Erinnerung an die DDR',
'description': 'Joachim Gauck, erster Beauftragter für die Stasi-Unterlagen, spricht auf dem Geschichtsforum über die friedliche Revolution 1989 und eine "gewisse Traurigkeit" im Umgang mit der DDR-Vergangenheit.'
'uploader': 'Bundeszentrale für politische Bildung',
},
}, {
'url': 'https://www.bpb.de/mediathek/video/522184/krieg-flucht-und-falschmeldungen-wirstattdesinformation-2/',
'info_dict': {
'id': '522184',
'ext': 'mp4',
'creator': 'Institute for Strategic Dialogue Germany gGmbH (ISD)',
'description': 'md5:f83c795ff8f825a69456a9e51fc15903',
'release_date': '20230621',
'tags': ['Desinformation', 'Ukraine', 'Russland', 'Geflüchtete'],
'thumbnail': 'https://www.bpb.de/cache/images/4/522184_teaser_16x9_1240.png?EABFB',
'title': 'md5:9b01ccdbf58dbf9e5c9f6e771a803b1c',
'uploader': 'Bundeszentrale für politische Bildung',
},
}, {
'url': 'https://www.bpb.de/lernen/bewegtbild-und-politische-bildung/webvideo/518789/krieg-flucht-und-falschmeldungen-wirstattdesinformation-1/',
'info_dict': {
'id': '518789',
'ext': 'mp4',
'creator': 'Institute for Strategic Dialogue Germany gGmbH (ISD)',
'description': 'md5:85228aed433e84ff0ff9bc582abd4ea8',
'release_date': '20230302',
'tags': ['Desinformation', 'Ukraine', 'Russland', 'Geflüchtete'],
'thumbnail': 'https://www.bpb.de/cache/images/9/518789_teaser_16x9_1240.jpeg?56D0D',
'title': 'md5:3e956f264bb501f6383f10495a401da4',
'uploader': 'Bundeszentrale für politische Bildung',
},
}, {
'url': 'https://www.bpb.de/mediathek/podcasts/apuz-podcast/539727/apuz-20-china/',
'only_matching': True,
}, {
'url': 'https://www.bpb.de/mediathek/audio/315813/folge-1-eine-einfuehrung/',
'info_dict': {
'id': '315813',
'ext': 'mp3',
'creator': 'Axel Schröder',
'description': 'md5:eda9d1af34e5912efef5baf54fba4427',
'release_date': '20200921',
'series': 'Auf Endlagersuche. Der deutsche Weg zu einem sicheren Atommülllager',
'tags': ['Atomenergie', 'Endlager', 'hoch-radioaktiver Abfall', 'Endlagersuche', 'Atommüll', 'Atomendlager', 'Gorleben', 'Deutschland'],
'thumbnail': 'https://www.bpb.de/cache/images/3/315813_teaser_16x9_1240.png?92A94',
'title': 'Folge 1: Eine Einführung',
'uploader': 'Bundeszentrale für politische Bildung',
},
}, {
'url': 'https://www.bpb.de/517806/die-weltanschauung-der-neuen-rechten/',
'info_dict': {
'id': '517806',
'ext': 'mp3',
'creator': 'Bundeszentrale für politische Bildung',
'description': 'md5:594689600e919912aade0b2871cc3fed',
'release_date': '20230127',
'series': 'Vorträge des Fachtags "Modernisierer. Grenzgänger. Anstifter. Sechs Jahrzehnte \'Neue Rechte\'"',
'tags': ['Rechtsextremismus', 'Konservatismus', 'Konservativismus', 'neue Rechte', 'Rechtspopulismus', 'Schnellroda', 'Deutschland'],
'thumbnail': 'https://www.bpb.de/cache/images/6/517806_teaser_16x9_1240.png?7A7A0',
'title': 'Die Weltanschauung der "Neuen Rechten"',
'uploader': 'Bundeszentrale für politische Bildung',
},
}, {
'url': 'https://www.bpb.de/mediathek/reihen/zahlen-und-fakten-soziale-situation-filme/520153/zahlen-und-fakten-die-soziale-situation-in-deutschland-migration/',
'only_matching': True,
}]
_TITLE_RE = re.compile('(?P<title>[^<]*)<[^>]+>(?P<series>[^<]*)')
def _parse_vue_attributes(self, name, string, video_id):
    """Return the attributes of the first `<name ...>` tag in `string`.

    Attributes whose key starts with ':' have their value replaced by the
    result of a non-fatal JSON parse (after `js_to_json`); all other
    attributes are returned as extracted.
    """
    opening_tag = self._search_regex(rf'(<{name}(?:"[^"]*?"|[^>])*>)', string, name)
    attributes = extract_attributes(opening_tag)

    # Collect the keys first, then overwrite their values in place
    bound_keys = [key for key in attributes if key.startswith(':')]
    for key in bound_keys:
        attributes[key] = self._parse_json(
            attributes[key], video_id, transform_source=js_to_json, fatal=False)
    return attributes
@staticmethod
def _process_source(source):
    """Convert one player source entry into a format dict, or None if its `src` is not a valid URL."""
    media_url = url_or_none(source['src'])
    if not media_url:
        return None

    mime_type = source.get('type', '')
    ext = mimetype2ext(mime_type)

    # For video sources the note is the text after the last '_' of the
    # URL with its final '.suffix' removed; other sources get no note
    if mime_type.startswith('video'):
        stem = media_url.rpartition('.')[0]
        note = stem.rpartition('_')[2]
        vcodec = None
    else:
        note = None
        vcodec = 'none'

    return {
        'url': media_url,
        'ext': ext,
        'vcodec': vcodec,
        'quality': 10 if note == 'high' else 0,
        'format_note': note,
        'format_id': join_nonempty(ext, note),
    }
}
def _real_extract(self, url):
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
title = self._html_search_regex(
r'<h2 class="white">(.*?)</h2>', webpage, 'title')
video_info_dicts = re.findall(
r"({\s*src\s*:\s*'https?://film\.bpb\.de/[^}]+})", webpage)
formats = []
for video_info in video_info_dicts:
video_info = self._parse_json(
video_info, video_id, transform_source=js_to_json, fatal=False)
if not video_info:
continue
video_url = video_info.get('src')
if not video_url:
continue
quality = 'high' if '_high' in video_url else 'low'
formats.append({
'url': video_url,
'quality': 10 if quality == 'high' else 0,
'format_note': quality,
'format_id': '%s-%s' % (quality, determine_ext(video_url)),
})
title_result = traverse_obj(webpage, ({html_get_element(cls='opening-header__title')}, {self._TITLE_RE.match}))
json_lds = list(self._yield_json_ld(webpage, video_id, fatal=False))
return {
'id': video_id,
'formats': formats,
'title': title,
'description': self._og_search_description(webpage),
'title': traverse_obj(title_result, ('title', {str.strip})) or None,
# This metadata could be interpreted otherwise, but it fits "series" the most
'series': traverse_obj(title_result, ('series', {str.strip})) or None,
'description': join_nonempty(*traverse_obj(webpage, [(
{html_get_element(cls='opening-intro')},
[{html_get_element(tag='bpb-accordion-item')}, {html_get_element(cls='text-content')}],
), {clean_html}]), delim='\n\n') or None,
'creator': self._html_search_meta('author', webpage),
'uploader': self._html_search_meta('publisher', webpage),
'release_date': unified_strdate(self._html_search_meta('date', webpage)),
'tags': traverse_obj(json_lds, (..., 'keywords', {lambda x: x.split(',')}, ...)),
**traverse_obj(self._parse_vue_attributes('bpb-player', webpage, video_id), {
'formats': (':sources', ..., {self._process_source}),
'thumbnail': ('poster', {lambda x: urljoin(url, x)}),
}),
}

View File

@ -0,0 +1,39 @@
from .common import InfoExtractor
class Canal1IE(InfoExtractor):
    """Extractor for canal1.com.co pages; delegates to the embed URL found in the page."""

    _VALID_URL = r'https?://(?:www\.|noticias\.)?canal1\.com\.co/(?:[^?#&])+/(?P<id>[\w-]+)'
    _TESTS = [{
        'url': 'https://canal1.com.co/noticias/napa-i-una-cadena-de-produccion-de-arroz-que-se-quedo-en-veremos-y-abandonada-en-el-departamento-del-choco/',
        'info_dict': {
            'id': '63b39f6b354977084b85ab54',
            'display_id': 'napa-i-una-cadena-de-produccion-de-arroz-que-se-quedo-en-veremos-y-abandonada-en-el-departamento-del-choco',
            'title': 'Ñapa I Una cadena de producción de arroz que se quedó en veremos y abandonada en el departamento del Chocó',
            'description': 'md5:bc49c6d64d20610ea1e7daf079a0d013',
            'thumbnail': r're:^https?://[^?#]+63b39f6b354977084b85ab54',
            'ext': 'mp4',
        },
    }, {
        'url': 'https://noticias.canal1.com.co/noticias/tres-i-el-triste-record-que-impuso-elon-musk-el-dueno-de-tesla-y-de-twitter/',
        'info_dict': {
            'id': '63b39e93f5fd223aa32250fb',
            'display_id': 'tres-i-el-triste-record-que-impuso-elon-musk-el-dueno-de-tesla-y-de-twitter',
            'title': 'Tres I El triste récord que impuso Elon Musk, el dueño de Tesla y de Twitter',
            'description': 'md5:d9f691f131a21ce6767ca6c05d17d791',
            'thumbnail': r're:^https?://[^?#]+63b39e93f5fd223aa32250fb',
            'ext': 'mp4',
        },
    }, {
        # Geo-restricted to Colombia
        'url': 'https://canal1.com.co/programas/guerreros-canal-1/video-inedito-guerreros-despedida-kewin-zarate/',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Locate the page's `embedUrl` and hand it off as a url_transparent result."""
        display_id = self._match_id(url)
        webpage = self._download_webpage(url, display_id)

        embed_url = self._search_regex(
            r'"embedUrl"\s*:\s*"([^"]+)', webpage, 'embed url')
        return self.url_result(
            embed_url, display_id=display_id, url_transparent=True)

View File

@ -0,0 +1,136 @@
import base64
import json
import uuid
from .common import InfoExtractor
from ..utils import (
int_or_none,
js_to_json,
traverse_obj,
urljoin,
)
class CaracolTvPlayIE(InfoExtractor):
_VALID_URL = r'https?://play\.caracoltv\.com/videoDetails/(?P<id>[^/?#]+)'
_NETRC_MACHINE = 'caracoltv-play'
_TESTS = [{
'url': 'https://play.caracoltv.com/videoDetails/OTo4NGFmNjUwOWQ2ZmM0NTg2YWRiOWU0MGNhOWViOWJkYQ==',
'info_dict': {
'id': 'OTo4NGFmNjUwOWQ2ZmM0NTg2YWRiOWU0MGNhOWViOWJkYQ==',
'title': 'La teoría del promedio',
'description': 'md5:1cdd6d2c13f19ef0d9649ab81a023ac3',
},
'playlist_count': 6,
}, {
'url': 'https://play.caracoltv.com/videoDetails/OTo3OWM4ZTliYzQxMmM0MTMxYTk4Mjk2YjdjNGQ4NGRkOQ==/ella?season=0',
'info_dict': {
'id': 'OTo3OWM4ZTliYzQxMmM0MTMxYTk4Mjk2YjdjNGQ4NGRkOQ==',
'title': 'Ella',
'description': 'md5:a639b1feb5ddcc0cff92a489b4e544b8',
},
'playlist_count': 10,
}, {
'url': 'https://play.caracoltv.com/videoDetails/OTpiYTY1YTVmOTI5MzI0ZWJhOGZiY2Y3MmRlOWZlYmJkOA==/la-vuelta-al-mundo-en-80-risas-2022?season=0',
'info_dict': {
'id': 'OTpiYTY1YTVmOTI5MzI0ZWJhOGZiY2Y3MmRlOWZlYmJkOA==',
'title': 'La vuelta al mundo en 80 risas 2022',
'description': 'md5:e97aac36106e5c37ebf947b3350106a4',
},
'playlist_count': 17,
}, {
'url': 'https://play.caracoltv.com/videoDetails/MzoxX3BwbjRmNjB1',
'only_matching': True,
}]
_USER_TOKEN = None
def _extract_app_token(self, webpage):
config_js_path = self._search_regex(
r'<script[^>]+src\s*=\s*"([^"]+coreConfig.js[^"]+)', webpage, 'config js url', fatal=False)
mediation_config = {} if not config_js_path else self._search_json(
r'mediation\s*:', self._download_webpage(
urljoin('https://play.caracoltv.com/', config_js_path), None, fatal=False, note='Extracting JS config'),
'mediation_config', None, transform_source=js_to_json, fatal=False)
key = traverse_obj(
mediation_config, ('live', 'key')) or '795cd9c089a1fc48094524a5eba85a3fca1331817c802f601735907c8bbb4f50'
secret = traverse_obj(
mediation_config, ('live', 'secret')) or '64dec00a6989ba83d087621465b5e5d38bdac22033b0613b659c442c78976fa0'
return base64.b64encode(f'{key}:{secret}'.encode()).decode()
def _perform_login(self, email, password):
webpage = self._download_webpage('https://play.caracoltv.com/', None, fatal=False)
app_token = self._extract_app_token(webpage)
bearer_token = self._download_json(
'https://eu-gateway.inmobly.com/applications/oauth', None, data=b'', note='Retrieving bearer token',
headers={'Authorization': f'Basic {app_token}'})['token']
self._USER_TOKEN = self._download_json(
'https://eu-gateway.inmobly.com/user/login', None, note='Performing login', headers={
'Content-Type': 'application/json',
'Authorization': f'Bearer {bearer_token}',
}, data=json.dumps({
'device_data': {
'device_id': str(uuid.uuid4()),
'device_token': '',
'device_type': 'web'
},
'login_data': {
'enabled': True,
'email': email,
'password': password,
}
}).encode())['user_token']
def _extract_video(self, video_data, series_id=None, season_id=None, season_number=None):
    """Convert one feed item into an info dict.

    ``video_data`` must provide ``stream_url`` and ``id``; the remaining
    fields are optional. The series/season arguments are copied into the
    result as given (``season_number`` via ``int_or_none``).
    """
    formats, subtitles = self._extract_m3u8_formats_and_subtitles(video_data['stream_url'], series_id, 'mp4')
    thumbnail_fields = {'url': 'thumb_url', 'height': 'height', 'width': 'width'}
    # entry_type 3 is what the item reports for a live entry
    live_now = video_data.get('entry_type') == 3

    info = {
        'id': video_data['id'],
        'title': video_data.get('name'),
        'description': video_data.get('description'),
        'formats': formats,
        'subtitles': subtitles,
        'thumbnails': traverse_obj(video_data, ('extra_thumbs', ..., thumbnail_fields)),
        'series_id': series_id,
        'season_id': season_id,
        'season_number': int_or_none(season_number),
        'episode_number': int_or_none(video_data.get('item_order')),
        'is_live': live_now,
    }
    return info
def _extract_series_seasons(self, seasons, series_id):
    """Lazily yield an info dict for every episode of every given season.

    One feed request is made per season, at the moment the generator
    reaches that season.
    """
    for season in seasons:
        season_feed = self._download_json(
            'https://eu-gateway.inmobly.com/feed', series_id, query={'season_id': season['id']},
            headers={'Authorization': f'Bearer {self._USER_TOKEN}'})

        season_number = season.get('order')
        yield from (
            self._extract_video(episode, series_id, season['id'], season_number)
            for episode in season_feed['items'])
def _real_extract(self, url):
    """Extract a single video, or a playlist of episodes when the item has seasons."""
    series_id = self._match_id(url)

    # No token yet: log in with the guest account
    if self._USER_TOKEN is None:
        self._perform_login('guest@inmobly.com', 'Test@gus1')

    feed = self._download_json(
        'https://eu-gateway.inmobly.com/feed', series_id, query={'include_ids': series_id},
        headers={'Authorization': f'Bearer {self._USER_TOKEN}'})
    item = feed['items'][0]

    seasons = item.get('seasons')
    if seasons:
        playlist_metadata = traverse_obj(item, {
            'title': 'name',
            'description': 'description',
        })
        return self.playlist_result(
            self._extract_series_seasons(seasons, series_id), series_id, **playlist_metadata)

    return self._extract_video(item)

View File

@ -339,12 +339,12 @@ class CBCGemIE(InfoExtractor):
data = json.dumps({'jwt': sig}).encode()
headers = {'content-type': 'application/json', 'ott-device-type': 'web'}
resp = self._download_json('https://services.radio-canada.ca/ott/cbc-api/v2/token',
None, data=data, headers=headers)
None, data=data, headers=headers, expected_status=426)
cbc_access_token = resp['accessToken']
headers = {'content-type': 'application/json', 'ott-device-type': 'web', 'ott-access-token': cbc_access_token}
resp = self._download_json('https://services.radio-canada.ca/ott/cbc-api/v2/profile',
None, headers=headers)
None, headers=headers, expected_status=426)
return resp['claimsToken']
def _get_claims_token_expiry(self):

View File

@ -90,10 +90,17 @@ class CCCPlaylistIE(InfoExtractor):
'id': '30c3',
},
'playlist_count': 135,
}, {
'url': 'https://media.ccc.de/c/DS2023',
'info_dict': {
'title': 'Datenspuren 2023',
'id': 'DS2023',
},
'playlist_count': 37
}]
def _real_extract(self, url):
playlist_id = self._match_id(url).lower()
playlist_id = self._match_id(url)
conf = self._download_json(
'https://media.ccc.de/public/conferences/' + playlist_id,

View File

@ -1,31 +1,72 @@
import time
import hashlib
import re
import urllib
import uuid
from .common import InfoExtractor
from .openload import PhantomJSwrapper
from ..utils import (
ExtractorError,
UserNotLive,
determine_ext,
int_or_none,
js_to_json,
parse_resolution,
str_or_none,
traverse_obj,
unescapeHTML,
unified_strdate,
url_or_none,
urlencode_postdata,
urljoin,
)
class DouyuTVIE(InfoExtractor):
IE_DESC = '斗鱼'
class DouyuBaseIE(InfoExtractor):
    """Helpers shared by the Douyu extractors for producing signed request parameters."""

    def _download_cryptojs_md5(self, video_id):
        """Fetch crypto-js' md5 rollup from the first mirror that answers and cache it.

        Raises ExtractorError when no mirror returns any content.
        """
        mirrors = (
            'https://cdnjs.cloudflare.com/ajax/libs/crypto-js/3.1.2/rollups/md5.js',
            'https://cdn.bootcdn.net/ajax/libs/crypto-js/3.1.2/rollups/md5.js',
        )
        for mirror in mirrors:
            script = self._download_webpage(
                mirror, video_id, note='Downloading signing dependency', fatal=False)
            if not script:
                continue
            self.cache.store('douyu', 'crypto-js-md5', script)
            return script
        raise ExtractorError('Unable to download JS dependency (crypto-js/md5)')

    def _get_cryptojs_md5(self, video_id):
        """Return the md5 library source, preferring the cached copy over a download."""
        cached_script = self.cache.load('douyu', 'crypto-js-md5')
        if cached_script:
            return cached_script
        return self._download_cryptojs_md5(video_id)

    def _calc_sign(self, sign_func, video_id, a):
        """Run the site's signing function under PhantomJS and return its output as a dict.

        The script calls ``ub98484234(a, b, c)`` with ``b`` a random hex uuid and
        ``c`` the current time rounded to whole seconds. Its printed output is
        parsed as a query string, keeping the first value of each key.
        """
        b = uuid.uuid4().hex
        c = round(time.time())
        md5_source = self._get_cryptojs_md5(video_id)
        js_script = f'{md5_source};{sign_func};console.log(ub98484234("{a}","{b}","{c}"))'

        runner = PhantomJSwrapper(self)
        output = runner.execute(js_script, video_id, note='Executing JS signing script')
        signed = urllib.parse.parse_qs(output.strip())
        return {name: values[0] for name, values in signed.items()}

    def _search_js_sign_func(self, webpage, fatal=True):
        """Return the body of the inline <script> that mentions ``ub98484234``."""
        # The optional leading group is greedy (it is not a look-behind), so it
        # swallows earlier script tags and the capture lands on the last candidate
        pattern = r'(?:<script.*)?<script[^>]*>(.*?ub98484234.*?)</script>'
        return self._search_regex(pattern, webpage, 'JS sign func', fatal=fatal)
class DouyuTVIE(DouyuBaseIE):
IE_DESC = '斗鱼直播'
_VALID_URL = r'https?://(?:www\.)?douyu(?:tv)?\.com/(topic/\w+\?rid=|(?:[^/]+/))*(?P<id>[A-Za-z0-9]+)'
_TESTS = [{
'url': 'http://www.douyutv.com/iseven',
'url': 'https://www.douyu.com/pigff',
'info_dict': {
'id': '17732',
'display_id': 'iseven',
'ext': 'flv',
'title': 're:^清晨醒脑!根本停不下来! [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',
'description': r're:.*m7show@163\.com.*',
'thumbnail': r're:^https?://.*\.png',
'uploader': '7师傅',
'id': '24422',
'display_id': 'pigff',
'ext': 'mp4',
'title': 're:^【PIGFF】.* [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',
'description': r'≥15级牌子看鱼吧置顶帖进粉丝vx群',
'thumbnail': str,
'uploader': 'pigff',
'is_live': True,
'live_status': 'is_live',
},
'params': {
'skip_download': True,
@ -85,15 +126,43 @@ class DouyuTVIE(InfoExtractor):
'only_matching': True,
}]
def _get_sign_func(self, room_id, video_id):
    """Download the signing script for ``room_id`` from the homeH5Enc endpoint."""
    response = self._download_json(
        f'https://www.douyu.com/swf_api/homeH5Enc?rids={room_id}', video_id,
        note='Getting signing script')
    room_key = f'room{room_id}'
    return response['data'][room_key]
def _extract_stream_formats(self, stream_formats):
    """Build the formats list from a list of getH5Play responses.

    Each response's ``data`` contributes one format, provided a stream URL
    can be joined from ``rtmp_url`` and ``rtmp_live``. Name and bitrate come
    from the ``multirates`` entry whose rate equals the response's ``rate``.
    """
    formats = []
    for stream_info in traverse_obj(stream_formats, (..., 'data')):
        base_url = traverse_obj(stream_info, 'rtmp_url')
        live_path = traverse_obj(stream_info, 'rtmp_live')
        stream_url = urljoin(base_url, live_path)
        if not stream_url:
            continue

        rate_id = traverse_obj(stream_info, ('rate', {int_or_none}))
        rate_info = traverse_obj(stream_info, ('multirates', lambda _, v: v['rate'] == rate_id), get_all=False)
        ext = determine_ext(stream_url)
        is_hls = ext == 'm3u8'

        if rate_id is not None:
            quality = rate_id % -10000
        else:
            quality = None

        formats.append({
            'url': stream_url,
            'format_id': str_or_none(rate_id),
            'ext': 'mp4' if is_hls else ext,
            'protocol': 'm3u8_native' if is_hls else 'https',
            'quality': quality,
            **traverse_obj(rate_info, {
                'format': ('name', {str_or_none}),
                'tbr': ('bit', {int_or_none}),
            }),
        })
    return formats
def _real_extract(self, url):
video_id = self._match_id(url)
if video_id.isdigit():
room_id = video_id
else:
page = self._download_webpage(url, video_id)
room_id = self._html_search_regex(
r'"room_id\\?"\s*:\s*(\d+),', page, 'room id')
webpage = self._download_webpage(url, video_id)
room_id = self._search_regex(r'\$ROOM\.room_id\s*=\s*(\d+)', webpage, 'room id')
if self._search_regex(r'"videoLoop"\s*:\s*(\d+)', webpage, 'loop', default='') == '1':
raise UserNotLive('The channel is auto-playing VODs', video_id=video_id)
if self._search_regex(r'\$ROOM\.show_status\s*=\s*(\d+)', webpage, 'status', default='') == '2':
raise UserNotLive(video_id=video_id)
# Grab metadata from API
params = {
@ -102,110 +171,136 @@ class DouyuTVIE(InfoExtractor):
'time': int(time.time()),
}
params['auth'] = hashlib.md5(
f'room/{video_id}?{urllib.parse.urlencode(params)}zNzMV1y4EMxOHS6I5WKm'.encode()).hexdigest()
room = self._download_json(
f'room/{room_id}?{urllib.parse.urlencode(params)}zNzMV1y4EMxOHS6I5WKm'.encode()).hexdigest()
room = traverse_obj(self._download_json(
f'http://www.douyutv.com/api/v1/room/{room_id}', video_id,
note='Downloading room info', query=params)['data']
note='Downloading room info', query=params, fatal=False), 'data')
# 1 = live, 2 = offline
if room.get('show_status') == '2':
raise ExtractorError('Live stream is offline', expected=True)
if traverse_obj(room, 'show_status') == '2':
raise UserNotLive(video_id=video_id)
video_url = urljoin('https://hls3-akm.douyucdn.cn/', self._search_regex(r'(live/.*)', room['hls_url'], 'URL'))
formats, subs = self._extract_m3u8_formats_and_subtitles(video_url, room_id)
js_sign_func = self._search_js_sign_func(webpage, fatal=False) or self._get_sign_func(room_id, video_id)
form_data = {
'rate': 0,
**self._calc_sign(js_sign_func, video_id, room_id),
}
stream_formats = [self._download_json(
f'https://www.douyu.com/lapi/live/getH5Play/{room_id}',
video_id, note="Downloading livestream format",
data=urlencode_postdata(form_data))]
title = unescapeHTML(room['room_name'])
description = room.get('show_details')
thumbnail = room.get('room_src')
uploader = room.get('nickname')
for rate_id in traverse_obj(stream_formats[0], ('data', 'multirates', ..., 'rate')):
if rate_id != traverse_obj(stream_formats[0], ('data', 'rate')):
form_data['rate'] = rate_id
stream_formats.append(self._download_json(
f'https://www.douyu.com/lapi/live/getH5Play/{room_id}',
video_id, note=f'Downloading livestream format {rate_id}',
data=urlencode_postdata(form_data)))
return {
'id': room_id,
'display_id': video_id,
'title': title,
'description': description,
'thumbnail': thumbnail,
'uploader': uploader,
'formats': self._extract_stream_formats(stream_formats),
'is_live': True,
'subtitles': subs,
'formats': formats,
**traverse_obj(room, {
'display_id': ('url', {str}, {lambda i: i[1:]}),
'title': ('room_name', {unescapeHTML}),
'description': ('show_details', {str}),
'uploader': ('nickname', {str}),
'thumbnail': ('room_src', {url_or_none}),
})
}
class DouyuShowIE(InfoExtractor):
class DouyuShowIE(DouyuBaseIE):
_VALID_URL = r'https?://v(?:mobile)?\.douyu\.com/show/(?P<id>[0-9a-zA-Z]+)'
_TESTS = [{
'url': 'https://v.douyu.com/show/rjNBdvnVXNzvE2yw',
'md5': '0c2cfd068ee2afe657801269b2d86214',
'url': 'https://v.douyu.com/show/mPyq7oVNe5Yv1gLY',
'info_dict': {
'id': 'rjNBdvnVXNzvE2yw',
'id': 'mPyq7oVNe5Yv1gLY',
'ext': 'mp4',
'title': '陈一发儿:砒霜 我有个室友系列04-01 22点场',
'duration': 7150.08,
'thumbnail': r're:^https?://.*\.jpg$',
'uploader': '陈一发儿',
'uploader_id': 'XrZwYelr5wbK',
'uploader_url': 'https://v.douyu.com/author/XrZwYelr5wbK',
'upload_date': '20170402',
'title': '四川人小时候的味道“蒜苗回锅肉”,传统菜不能丢,要常做来吃',
'duration': 633,
'thumbnail': str,
'uploader': '美食作家王刚V',
'uploader_id': 'OVAO4NVx1m7Q',
'timestamp': 1661850002,
'upload_date': '20220830',
'view_count': int,
'tags': ['美食', '美食综合'],
},
}, {
'url': 'https://vmobile.douyu.com/show/rjNBdvnVXNzvE2yw',
'only_matching': True,
}]
_FORMATS = {
'super': '原画',
'high': '超清',
'normal': '高清',
}
_QUALITIES = {
'super': -1,
'high': -2,
'normal': -3,
}
_RESOLUTIONS = {
'super': '1920x1080',
'high': '1280x720',
'normal': '852x480',
}
def _real_extract(self, url):
url = url.replace('vmobile.', 'v.')
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
room_info = self._parse_json(self._search_regex(
r'var\s+\$ROOM\s*=\s*({.+});', webpage, 'room info'), video_id)
video_info = self._search_json(
r'<script>\s*window\.\$DATA\s*=', webpage,
'video info', video_id, transform_source=js_to_json)
video_info = None
js_sign_func = self._search_js_sign_func(webpage)
form_data = {
'vid': video_id,
**self._calc_sign(js_sign_func, video_id, video_info['ROOM']['point_id']),
}
url_info = self._download_json(
'https://v.douyu.com/api/stream/getStreamUrl', video_id,
data=urlencode_postdata(form_data), note="Downloading video formats")
for trial in range(5):
# Sometimes Douyu rejects our request. Let's try it more times
try:
video_info = self._download_json(
'https://vmobile.douyu.com/video/getInfo', video_id,
query={'vid': video_id},
headers={
'Referer': url,
'x-requested-with': 'XMLHttpRequest',
})
break
except ExtractorError:
self._sleep(1, video_id)
if not video_info:
raise ExtractorError('Can\'t fetch video info')
formats = self._extract_m3u8_formats(
video_info['data']['video_url'], video_id,
entry_protocol='m3u8_native', ext='mp4')
upload_date = unified_strdate(self._html_search_regex(
r'<em>上传时间:</em><span>([^<]+)</span>', webpage,
'upload date', fatal=False))
uploader = uploader_id = uploader_url = None
mobj = re.search(
r'(?m)<a[^>]+href="/author/([0-9a-zA-Z]+)".+?<strong[^>]+title="([^"]+)"',
webpage)
if mobj:
uploader_id, uploader = mobj.groups()
uploader_url = urljoin(url, '/author/' + uploader_id)
formats = []
for name, url in traverse_obj(url_info, ('data', 'thumb_video', {dict.items}, ...)):
video_url = traverse_obj(url, ('url', {url_or_none}))
if video_url:
ext = determine_ext(video_url)
formats.append({
'format': self._FORMATS.get(name),
'format_id': name,
'url': video_url,
'quality': self._QUALITIES.get(name),
'ext': 'mp4' if ext == 'm3u8' else ext,
'protocol': 'm3u8_native' if ext == 'm3u8' else 'https',
**parse_resolution(self._RESOLUTIONS.get(name))
})
else:
self.to_screen(
f'"{self._FORMATS.get(name, name)}" format may require logging in. {self._login_hint()}')
return {
'id': video_id,
'title': room_info['name'],
'formats': formats,
'duration': room_info.get('duration'),
'thumbnail': room_info.get('pic'),
'upload_date': upload_date,
'uploader': uploader,
'uploader_id': uploader_id,
'uploader_url': uploader_url,
**traverse_obj(video_info, ('DATA', {
'title': ('content', 'title', {str}),
'uploader': ('content', 'author', {str}),
'uploader_id': ('content', 'up_id', {str_or_none}),
'duration': ('content', 'video_duration', {int_or_none}),
'thumbnail': ('content', 'video_pic', {url_or_none}),
'timestamp': ('content', 'create_time', {int_or_none}),
'view_count': ('content', 'view_num', {int_or_none}),
'tags': ('videoTag', ..., 'tagName', {str}),
}))
}

96
yt_dlp/extractor/eplus.py Normal file
View File

@ -0,0 +1,96 @@
from .common import InfoExtractor
from ..utils import (
ExtractorError,
try_call,
unified_timestamp,
)
class EplusIbIE(InfoExtractor):
    """Extractor for e+ (eplus) inbound/overseas streaming player pages."""

    IE_NAME = 'eplus:inbound'
    IE_DESC = 'e+ (イープラス) overseas'
    _VALID_URL = r'https?://live\.eplus\.jp/ex/player\?ib=(?P<id>(?:\w|%2B|%2F){86}%3D%3D)'
    _TESTS = [{
        'url': 'https://live.eplus.jp/ex/player?ib=YEFxb3Vyc2Dombnjg7blkrLlrablnJLjgrnjgq%2Fjg7zjg6vjgqLjgqTjg4njg6vlkIzlpb3kvJpgTGllbGxhIQ%3D%3D',
        'info_dict': {
            'id': '354502-0001-002',
            'title': 'LoveLive!Series Presents COUNTDOWN LoveLive! 2021→2022LIVE with a smile!【Streaming+(配信)】',
            'live_status': 'was_live',
            'release_date': '20211231',
            'release_timestamp': 1640952000,
            'description': str,
        },
        'params': {
            'skip_download': True,
            'ignore_no_formats_error': True,
        },
        'expected_warnings': [
            'Could not find the playlist URL. This event may not be accessible',
            'No video formats found!',
            'Requested format is not available',
        ],
    }]

    def _real_extract(self, url):
        """Read the page's ``app`` JSON, derive the live status and collect HLS formats.

        Raises ExtractorError when the event has stopped and has no archive.
        """
        video_id = self._match_id(url)
        webpage = self._download_webpage(url, video_id)
        data_json = self._search_json(r'<script>\s*var app\s*=', webpage, 'data json', video_id)

        delivery_status = data_json.get('delivery_status')
        archive_mode = data_json.get('archive_mode')
        # 32400 s = 9 h is subtracted from the parsed event time
        release_timestamp = try_call(lambda: unified_timestamp(data_json['event_datetime']) - 32400)
        release_timestamp_str = data_json.get('event_datetime_text')  # JST

        self.write_debug(f'delivery_status = {delivery_status}, archive_mode = {archive_mode}')

        if delivery_status == 'STOPPED' and archive_mode != 'ON':
            raise ExtractorError(
                'This event has ended and there is no archive for this event', expected=True)

        # Pairs are compared with == so an unexpected (even unhashable) value just falls through
        status_table = (
            ('PREPARING', 'is_upcoming'),
            ('STARTED', 'is_live'),
            ('STOPPED', 'post_live'),
            ('WAIT_CONFIRM_ARCHIVED', 'post_live'),
            ('CONFIRMED_ARCHIVE', 'was_live'),
        )
        live_status = next(
            (mapped for known, mapped in status_table if delivery_status == known), None)
        if live_status is None:
            self.report_warning(f'Unknown delivery_status {delivery_status}, treat it as a live')
            live_status = 'is_live'

        formats = []

        m3u8_playlist_urls = self._search_json(
            r'var listChannels\s*=', webpage, 'hls URLs', video_id, contains_pattern=r'\[.+\]', default=[])

        # Work out whether extraction has to stop short, and with which message
        upcoming = live_status == 'is_upcoming'
        if not m3u8_playlist_urls:
            if upcoming:
                no_formats_message = (
                    f'Could not find the playlist URL. This live event will begin at {release_timestamp_str} JST')
            else:
                no_formats_message = 'Could not find the playlist URL. This event may not be accessible'
        elif upcoming:
            no_formats_message = f'This live event will begin at {release_timestamp_str} JST'
        elif live_status == 'post_live':
            no_formats_message = 'This event has ended, and the archive will be available shortly'
        else:
            no_formats_message = None

        if no_formats_message is not None:
            self.raise_no_formats(no_formats_message, expected=True)
        else:
            for playlist_url in m3u8_playlist_urls:
                formats.extend(self._extract_m3u8_formats(playlist_url, video_id))
            # FIXME: HTTP request headers need to be updated to continue download
            warning = 'Due to technical limitations, the download will be interrupted after one hour'
            if live_status == 'is_live':
                self.report_warning(warning)
            elif live_status == 'was_live':
                self.report_warning(f'{warning}. You can restart to continue the download')

        return {
            'id': data_json['app_id'],
            'title': data_json.get('app_name'),
            'formats': formats,
            'live_status': live_status,
            'description': data_json.get('content'),
            'release_timestamp': release_timestamp,
        }

View File

@ -11,8 +11,8 @@ class ExpressenIE(InfoExtractor):
_VALID_URL = r'''(?x)
https?://
(?:www\.)?(?:expressen|di)\.se/
(?:(?:tvspelare/video|videoplayer/embed)/)?
tv/(?:[^/]+/)*
(?:(?:tvspelare/video|video-?player/embed)/)?
(?:tv|nyheter)/(?:[^/?#]+/)*
(?P<id>[^/?#&]+)
'''
_EMBED_REGEX = [r'<iframe[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//(?:www\.)?(?:expressen|di)\.se/(?:tvspelare/video|videoplayer/embed)/tv/.+?)\1']
@ -42,6 +42,12 @@ class ExpressenIE(InfoExtractor):
}, {
'url': 'https://www.di.se/videoplayer/embed/tv/ditv/borsmorgon/implantica-rusar-70--under-borspremiaren-hor-styrelsemedlemmen/?embed=true&external=true&autoplay=true&startVolume=0&partnerId=di',
'only_matching': True,
}, {
'url': 'https://www.expressen.se/video-player/embed/tv/nyheter/ekero-fodda-olof-gustafsson-forvaltar-knarkbaronen-pablo-escobars-namn',
'only_matching': True,
}, {
'url': 'https://www.expressen.se/nyheter/efter-egna-telefonbluffen-escobar-stammer-klarna/',
'only_matching': True,
}]
def _real_extract(self, url):

View File

@ -74,6 +74,22 @@ class FacebookIE(InfoExtractor):
_VIDEO_PAGE_TAHOE_TEMPLATE = 'https://www.facebook.com/video/tahoe/async/%s/?chain=true&isvideo=true&payloadtype=primary'
_TESTS = [{
'url': 'https://www.facebook.com/radiokicksfm/videos/3676516585958356/',
'info_dict': {
'id': '3676516585958356',
'ext': 'mp4',
'title': 'dr Adam Przygoda',
'description': 'md5:34675bda53336b1d16400265c2bb9b3b',
'uploader': 'RADIO KICKS FM',
'upload_date': '20230818',
'timestamp': 1692346159,
'thumbnail': r're:^https?://.*',
'uploader_id': '100063551323670',
'duration': 3132.184,
'view_count': int,
'concurrent_view_count': 0,
},
}, {
'url': 'https://www.facebook.com/video.php?v=637842556329505&fref=nf',
'md5': '6a40d33c0eccbb1af76cf0485a052659',
'info_dict': {
@ -97,7 +113,7 @@ class FacebookIE(InfoExtractor):
'upload_date': '20140506',
'timestamp': 1399398998,
'thumbnail': r're:^https?://.*',
'uploader_id': 'pfbid04scW44U4P9iTyLZAGy8y8W3pR3i2VugvHCimiRudUAVbN3MPp9eXBaYFcgVworZwl',
'uploader_id': 'pfbid028wxorhX2ErLFJ578N6P3crHD3PHmXTCqCvfBpsnbSLmbokwSY75p5hWBjHGkG4zxl',
'duration': 131.03,
'concurrent_view_count': int,
},
@ -179,7 +195,7 @@ class FacebookIE(InfoExtractor):
'timestamp': 1486648217,
'upload_date': '20170209',
'uploader': 'Yaroslav Korpan',
'uploader_id': 'pfbid029y8j22EwH3ikeqgH3SEP9G3CAi9kmWKgXJJG9s5geV7mo3J2bvURqHCdgucRgAyhl',
'uploader_id': 'pfbid06AScABAWcW91qpiuGrLt99Ef9tvwHoXP6t8KeFYEqkSfreMtfa9nTveh8b2ZEVSWl',
'concurrent_view_count': int,
'thumbnail': r're:^https?://.*',
'view_count': int,
@ -274,7 +290,7 @@ class FacebookIE(InfoExtractor):
'title': 'Josef',
'thumbnail': r're:^https?://.*',
'concurrent_view_count': int,
'uploader_id': 'pfbid02gXHbDwxumkaKJQaTGUf3znYfYzTuidGEWawiramNx4YamSj2afwYSRkpcjtHtMRJl',
'uploader_id': 'pfbid0cibUN6tV7DYgdbJdsUFN46wc4jKpVSPAvJQhFofGqBGmVn3V3JtAs2tfUwziw2hUl',
'timestamp': 1549275572,
'duration': 3.413,
'uploader': 'Josef Novak',
@ -401,9 +417,9 @@ class FacebookIE(InfoExtractor):
def extract_metadata(webpage):
post_data = [self._parse_json(j, video_id, fatal=False) for j in re.findall(
r'handleWithCustomApplyEach\(\s*ScheduledApplyEach\s*,\s*(\{.+?\})\s*\);', webpage)]
r'data-sjs>({.*?ScheduledServerJS.*?})</script>', webpage)]
post = traverse_obj(post_data, (
..., 'require', ..., ..., ..., '__bbox', 'result', 'data'), expected_type=dict) or []
..., 'require', ..., ..., ..., '__bbox', 'require', ..., ..., ..., '__bbox', 'result', 'data'), expected_type=dict) or []
media = traverse_obj(post, (..., 'attachments', ..., lambda k, v: (
k == 'media' and str(v['id']) == video_id and v['__typename'] == 'Video')), expected_type=dict)
title = get_first(media, ('title', 'text'))
@ -489,18 +505,17 @@ class FacebookIE(InfoExtractor):
# with non-browser User-Agent.
for f in info['formats']:
f.setdefault('http_headers', {})['User-Agent'] = 'facebookexternalhit/1.1'
info['_format_sort_fields'] = ('res', 'quality')
def extract_relay_data(_filter):
return self._parse_json(self._search_regex(
r'handleWithCustomApplyEach\([^,]+,\s*({.*?%s.*?})\);' % _filter,
r'data-sjs>({.*?%s.*?})</script>' % _filter,
webpage, 'replay data', default='{}'), video_id, fatal=False) or {}
def extract_relay_prefetched_data(_filter):
replay_data = extract_relay_data(_filter)
for require in (replay_data.get('require') or []):
if require[0] == 'RelayPrefetchedStreamCache':
return try_get(require, lambda x: x[3][1]['__bbox']['result']['data'], dict) or {}
return traverse_obj(extract_relay_data(_filter), (
'require', (None, (..., ..., ..., '__bbox', 'require')),
lambda _, v: 'RelayPrefetchedStreamCache' in v, ..., ...,
'__bbox', 'result', 'data', {dict}), get_all=False) or {}
if not video_data:
server_js_data = self._parse_json(self._search_regex([
@ -511,7 +526,7 @@ class FacebookIE(InfoExtractor):
if not video_data:
data = extract_relay_prefetched_data(
r'"(?:dash_manifest|playable_url(?:_quality_hd)?)"\s*:\s*"[^"]+"')
r'"(?:dash_manifest|playable_url(?:_quality_hd)?)')
if data:
entries = []
@ -526,7 +541,8 @@ class FacebookIE(InfoExtractor):
formats = []
q = qualities(['sd', 'hd'])
for key, format_id in (('playable_url', 'sd'), ('playable_url_quality_hd', 'hd'),
('playable_url_dash', '')):
('playable_url_dash', ''), ('browser_native_hd_url', 'hd'),
('browser_native_sd_url', 'sd')):
playable_url = video.get(key)
if not playable_url:
continue
@ -535,7 +551,8 @@ class FacebookIE(InfoExtractor):
else:
formats.append({
'format_id': format_id,
'quality': q(format_id),
# sd, hd formats w/o resolution info should be deprioritized below DASH
'quality': q(format_id) - 3,
'url': playable_url,
})
extract_dash_manifest(video, formats)
@ -702,9 +719,11 @@ class FacebookIE(InfoExtractor):
for src_type in ('src', 'src_no_ratelimit'):
src = f[0].get('%s_%s' % (quality, src_type))
if src:
preference = -10 if format_id == 'progressive' else -1
# sd, hd formats w/o resolution info should be deprioritized below DASH
# TODO: investigate if progressive or src formats still exist
preference = -10 if format_id == 'progressive' else -3
if quality == 'hd':
preference += 5
preference += 1
formats.append({
'format_id': '%s_%s_%s' % (format_id, quality, src_type),
'url': src,

View File

@ -60,6 +60,7 @@ class Funker530IE(InfoExtractor):
def _real_extract(self, url):
display_id = self._match_id(url)
webpage = self._download_webpage(url, display_id)
info = {}
rumble_url = list(RumbleEmbedIE._extract_embed_urls(url, webpage))
if rumble_url:
info = {'url': rumble_url[0], 'ie_key': RumbleEmbedIE.ie_key()}

View File

@ -2370,7 +2370,7 @@ class GenericIE(InfoExtractor):
'id': flashvars['video_id'],
'display_id': display_id,
'title': title,
'thumbnail': thumbnail,
'thumbnail': urljoin(url, thumbnail),
'formats': formats,
}

View File

@ -66,7 +66,7 @@ class GofileIE(InfoExtractor):
query_params = {
'contentId': file_id,
'token': self._TOKEN,
'websiteToken': 12345,
'websiteToken': '7fd94ds12fds4', # From https://gofile.io/dist/js/alljs.js
}
password = self.get_param('videopassword')
if password:

View File

@ -383,9 +383,9 @@ class AwsIdp:
months = [None, 'Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
days = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']
time_now = datetime.datetime.utcnow()
time_now = datetime.datetime.now(datetime.timezone.utc)
format_string = "{} {} {} %H:%M:%S UTC %Y".format(days[time_now.weekday()], months[time_now.month], time_now.day)
time_string = datetime.datetime.utcnow().strftime(format_string)
time_string = time_now.strftime(format_string)
return time_string
def __str__(self):

View File

@ -1,9 +1,9 @@
from .common import InfoExtractor
from ..compat import compat_str
from ..utils import (
int_or_none,
parse_age_limit,
parse_iso8601,
time_seconds,
update_url_query,
)
@ -11,15 +11,14 @@ from ..utils import (
class IndavideoEmbedIE(InfoExtractor):
_VALID_URL = r'https?://(?:(?:embed\.)?indavideo\.hu/player/video/|assets\.indavideo\.hu/swf/player\.swf\?.*\b(?:v(?:ID|id))=)(?P<id>[\da-f]+)'
# Some example URLs covered by generic extractor:
# http://indavideo.hu/video/Vicces_cica_1
# http://index.indavideo.hu/video/2015_0728_beregszasz
# http://auto.indavideo.hu/video/Sajat_utanfutoban_a_kis_tacsko
# http://erotika.indavideo.hu/video/Amator_tini_punci
# http://film.indavideo.hu/video/f_hrom_nagymamm_volt
# http://palyazat.indavideo.hu/video/Embertelen_dal_Dodgem_egyuttes
_EMBED_REGEX = [r'<iframe[^>]+\bsrc=["\'](?P<url>(?:https?:)?//embed\.indavideo\.hu/player/video/[\da-f]+)']
# https://indavideo.hu/video/Vicces_cica_1
# https://index.indavideo.hu/video/Hod_Nemetorszagban
# https://auto.indavideo.hu/video/Sajat_utanfutoban_a_kis_tacsko
# https://film.indavideo.hu/video/f_farkaslesen
# https://palyazat.indavideo.hu/video/Embertelen_dal_Dodgem_egyuttes
_EMBED_REGEX = [r'<iframe[^>]+\bsrc=["\'](?P<url>(?:https?:)//embed\.indavideo\.hu/player/video/[\da-f]+)']
_TESTS = [{
'url': 'http://indavideo.hu/player/video/1bdc3c6d80/',
'url': 'https://indavideo.hu/player/video/1bdc3c6d80/',
'md5': 'c8a507a1c7410685f83a06eaeeaafeab',
'info_dict': {
'id': '1837039',
@ -36,21 +35,33 @@ class IndavideoEmbedIE(InfoExtractor):
'tags': ['tánc', 'cica', 'cuki', 'cukiajanlo', 'newsroom'],
},
}, {
'url': 'http://embed.indavideo.hu/player/video/1bdc3c6d80?autostart=1&hide=1',
'only_matching': True,
}, {
'url': 'http://assets.indavideo.hu/swf/player.swf?v=fe25e500&vID=1bdc3c6d80&autostart=1&hide=1&i=1',
'url': 'https://embed.indavideo.hu/player/video/1bdc3c6d80?autostart=1&hide=1',
'only_matching': True,
}]
_WEBPAGE_TESTS = [{
'url': 'https://indavideo.hu/video/Vicces_cica_1',
'info_dict': {
'id': '1335611',
'ext': 'mp4',
'title': 'Vicces cica',
'description': 'Játszik a tablettel. :D',
'thumbnail': r're:^https?://.*\.jpg$',
'uploader': 'Jet_Pack',
'uploader_id': '491217',
'timestamp': 1390821212,
'upload_date': '20140127',
'duration': 7,
'age_limit': 0,
'tags': ['cica', 'Jet_Pack'],
},
}]
def _real_extract(self, url):
video_id = self._match_id(url)
video = self._download_json(
'https://amfphp.indavideo.hu/SYm0json.php/player.playerHandler.getVideoData/%s' % video_id,
video_id)['data']
title = video['title']
f'https://amfphp.indavideo.hu/SYm0json.php/player.playerHandler.getVideoData/{video_id}/',
video_id, query={'_': time_seconds()})['data']
video_urls = []
@ -60,33 +71,21 @@ class IndavideoEmbedIE(InfoExtractor):
elif isinstance(video_files, dict):
video_urls.extend(video_files.values())
video_file = video.get('video_file')
if video:
video_urls.append(video_file)
video_urls = list(set(video_urls))
video_prefix = video_urls[0].rsplit('/', 1)[0]
for flv_file in video.get('flv_files', []):
flv_url = '%s/%s' % (video_prefix, flv_file)
if flv_url not in video_urls:
video_urls.append(flv_url)
filesh = video.get('filesh')
filesh = video.get('filesh') or {}
formats = []
for video_url in video_urls:
height = int_or_none(self._search_regex(
r'\.(\d{3,4})\.mp4(?:\?|$)', video_url, 'height', default=None))
if filesh:
if not height:
continue
token = filesh.get(compat_str(height))
if token is None:
continue
video_url = update_url_query(video_url, {'token': token})
if not height and len(filesh) == 1:
height = int_or_none(list(filesh.keys())[0])
token = filesh.get(str(height))
if token is None:
continue
formats.append({
'url': video_url,
'url': update_url_query(video_url, {'token': token}),
'height': height,
})
@ -103,7 +102,7 @@ class IndavideoEmbedIE(InfoExtractor):
return {
'id': video.get('id') or video_id,
'title': title,
'title': video.get('title'),
'description': video.get('description'),
'thumbnails': thumbnails,
'uploader': video.get('user_name'),

View File

@ -57,8 +57,8 @@ class LecturioIE(LecturioBaseIE):
_VALID_URL = r'''(?x)
https://
(?:
app\.lecturio\.com/([^/]+/(?P<nt>[^/?#&]+)\.lecture|(?:\#/)?lecture/c/\d+/(?P<id>\d+))|
(?:www\.)?lecturio\.de/[^/]+/(?P<nt_de>[^/?#&]+)\.vortrag
app\.lecturio\.com/([^/?#]+/(?P<nt>[^/?#&]+)\.lecture|(?:\#/)?lecture/c/\d+/(?P<id>\d+))|
(?:www\.)?lecturio\.de/(?:[^/?#]+/)+(?P<nt_de>[^/?#&]+)\.vortrag
)
'''
_TESTS = [{
@ -73,6 +73,9 @@ class LecturioIE(LecturioBaseIE):
}, {
'url': 'https://www.lecturio.de/jura/oeffentliches-recht-staatsexamen.vortrag',
'only_matching': True,
}, {
'url': 'https://www.lecturio.de/jura/oeffentliches-recht-at-1-staatsexamen/oeffentliches-recht-staatsexamen.vortrag',
'only_matching': True,
}, {
'url': 'https://app.lecturio.com/#/lecture/c/6434/39634',
'only_matching': True,

View File

@ -17,11 +17,12 @@ class MassengeschmackTVIE(InfoExtractor):
_TEST = {
'url': 'https://massengeschmack.tv/play/fktv202',
'md5': 'a9e054db9c2b5a08f0a0527cc201e8d3',
'md5': '9996f314994a49fefe5f39aa1b07ae21',
'info_dict': {
'id': 'fktv202',
'ext': 'mp4',
'title': 'Fernsehkritik-TV - Folge 202',
'title': 'Fernsehkritik-TV #202',
'thumbnail': 'https://cache.massengeschmack.tv/img/mag/fktv202.jpg'
},
}
@ -29,9 +30,6 @@ class MassengeschmackTVIE(InfoExtractor):
episode = self._match_id(url)
webpage = self._download_webpage(url, episode)
title = clean_html(self._html_search_regex(
'<h3>([^<]+)</h3>', webpage, 'title'))
thumbnail = self._search_regex(r'POSTER\s*=\s*"([^"]+)', webpage, 'thumbnail', fatal=False)
sources = self._parse_json(self._search_regex(r'(?s)MEDIA\s*=\s*(\[.+?\]);', webpage, 'media'), episode, js_to_json)
formats = []
@ -67,7 +65,8 @@ class MassengeschmackTVIE(InfoExtractor):
return {
'id': episode,
'title': title,
'title': clean_html(self._html_search_regex(
r'<span[^>]+\bid=["\']clip-title["\'][^>]*>([^<]+)', webpage, 'title', fatal=False)),
'formats': formats,
'thumbnail': thumbnail,
'thumbnail': self._search_regex(r'POSTER\s*=\s*"([^"]+)', webpage, 'thumbnail', fatal=False),
}

View File

@ -1,5 +1,8 @@
from ..utils import (
unified_strdate
ExtractorError,
traverse_obj,
unified_strdate,
url_or_none,
)
from .common import InfoExtractor
from ..compat import (
@ -15,7 +18,7 @@ class MediaKlikkIE(InfoExtractor):
(?P<id>[^/#?_]+)'''
_TESTS = [{
# mediaklikk. date in html.
# (old) mediaklikk. date in html.
'url': 'https://mediaklikk.hu/video/hazajaro-delnyugat-bacska-a-duna-menten-palankatol-doroszloig/',
'info_dict': {
'id': '4754129',
@ -23,9 +26,21 @@ class MediaKlikkIE(InfoExtractor):
'ext': 'mp4',
'upload_date': '20210901',
'thumbnail': 'http://mediaklikk.hu/wp-content/uploads/sites/4/2014/02/hazajarouj_JO.jpg'
},
'skip': 'Webpage redirects to 404 page',
}, {
# mediaklikk. date in html.
'url': 'https://mediaklikk.hu/video/hazajaro-fabova-hegyseg-kishont-koronaja/',
'info_dict': {
'id': '6696133',
'title': 'Hazajáró, Fabova-hegység - Kishont koronája',
'display_id': 'hazajaro-fabova-hegyseg-kishont-koronaja',
'ext': 'mp4',
'upload_date': '20230903',
'thumbnail': 'https://mediaklikk.hu/wp-content/uploads/sites/4/2014/02/hazajarouj_JO.jpg'
}
}, {
# m4sport
# (old) m4sport
'url': 'https://m4sport.hu/video/2021/08/30/gyemant-liga-parizs/',
'info_dict': {
'id': '4754999',
@ -33,6 +48,18 @@ class MediaKlikkIE(InfoExtractor):
'ext': 'mp4',
'upload_date': '20210830',
'thumbnail': 'http://m4sport.hu/wp-content/uploads/sites/4/2021/08/vlcsnap-2021-08-30-18h21m20s10-1024x576.jpg'
},
'skip': 'Webpage redirects to 404 page',
}, {
# m4sport
'url': 'https://m4sport.hu/sportkozvetitesek/video/2023/09/08/atletika-gyemant-liga-brusszel/',
'info_dict': {
'id': '6711136',
'title': 'Atlétika Gyémánt Liga, Brüsszel',
'display_id': 'atletika-gyemant-liga-brusszel',
'ext': 'mp4',
'upload_date': '20230908',
'thumbnail': 'https://m4sport.hu/wp-content/uploads/sites/4/2023/09/vlcsnap-2023-09-08-22h43m18s691.jpg'
}
}, {
# m4sport with *video/ url and no date
@ -40,20 +67,33 @@ class MediaKlikkIE(InfoExtractor):
'info_dict': {
'id': '4492099',
'title': 'Real Madrid - Chelsea 1-1',
'display_id': 'real-madrid-chelsea-1-1',
'ext': 'mp4',
'thumbnail': 'http://m4sport.hu/wp-content/uploads/sites/4/2021/04/Sequence-01.Still001-1024x576.png'
'thumbnail': 'https://m4sport.hu/wp-content/uploads/sites/4/2021/04/Sequence-01.Still001-1024x576.png'
}
}, {
# hirado
# (old) hirado
'url': 'https://hirado.hu/videok/felteteleket-szabott-a-fovaros/',
'info_dict': {
'id': '4760120',
'title': 'Feltételeket szabott a főváros',
'ext': 'mp4',
'thumbnail': 'http://hirado.hu/wp-content/uploads/sites/4/2021/09/vlcsnap-2021-09-01-20h20m37s165.jpg'
},
'skip': 'Webpage redirects to video list page',
}, {
# hirado
'url': 'https://hirado.hu/belfold/video/2023/09/11/marad-az-eves-elszamolas-a-napelemekre-beruhazo-csaladoknal',
'info_dict': {
'id': '6716068',
'title': 'Marad az éves elszámolás a napelemekre beruházó családoknál',
'display_id': 'marad-az-eves-elszamolas-a-napelemekre-beruhazo-csaladoknal',
'ext': 'mp4',
'upload_date': '20230911',
'thumbnail': 'https://hirado.hu/wp-content/uploads/sites/4/2023/09/vlcsnap-2023-09-11-09h16m09s882.jpg'
}
}, {
# petofilive
# (old) petofilive
'url': 'https://petofilive.hu/video/2021/06/07/tha-shudras-az-akusztikban/',
'info_dict': {
'id': '4571948',
@ -61,6 +101,18 @@ class MediaKlikkIE(InfoExtractor):
'ext': 'mp4',
'upload_date': '20210607',
'thumbnail': 'http://petofilive.hu/wp-content/uploads/sites/4/2021/06/vlcsnap-2021-06-07-22h14m23s915-1024x576.jpg'
},
'skip': 'Webpage redirects to empty page',
}, {
# petofilive
'url': 'https://petofilive.hu/video/2023/09/09/futball-fesztival-a-margitszigeten/',
'info_dict': {
'id': '6713233',
'title': 'Futball Fesztivál a Margitszigeten',
'display_id': 'futball-fesztival-a-margitszigeten',
'ext': 'mp4',
'upload_date': '20230909',
'thumbnail': 'https://petofilive.hu/wp-content/uploads/sites/4/2023/09/Clipboard11-2.jpg'
}
}]
@ -84,8 +136,12 @@ class MediaKlikkIE(InfoExtractor):
player_data['video'] = player_data.pop('token')
player_page = self._download_webpage('https://player.mediaklikk.hu/playernew/player.php', video_id, query=player_data)
playlist_url = self._proto_relative_url(compat_urllib_parse_unquote(
self._html_search_regex(r'\"file\":\s*\"(\\?/\\?/.*playlist\.m3u8)\"', player_page, 'playlist_url')).replace('\\/', '/'))
player_json = self._search_json(
r'\bpl\.setup\s*\(', player_page, 'player json', video_id, end_pattern=r'\);')
playlist_url = traverse_obj(
player_json, ('playlist', lambda _, v: v['type'] == 'hls', 'file', {url_or_none}), get_all=False)
if not playlist_url:
raise ExtractorError('Unable to extract playlist url')
formats = self._extract_wowza_formats(
playlist_url, video_id, skip_protocols=['f4m', 'smil', 'dash'])

View File

@ -14,7 +14,7 @@ class MediaStreamBaseIE(InfoExtractor):
_BASE_URL_RE = r'https?://mdstrm\.com/(?:embed|live-stream)'
def _extract_mediastream_urls(self, webpage):
yield from traverse_obj(list(self._yield_json_ld(webpage, None)), (
yield from traverse_obj(list(self._yield_json_ld(webpage, None, fatal=False)), (
lambda _, v: v['@type'] == 'VideoObject', ('embedUrl', 'contentUrl'),
{lambda x: x if re.match(rf'{self._BASE_URL_RE}/\w+', x) else None}))
@ -106,8 +106,12 @@ class MediaStreamIE(MediaStreamBaseIE):
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
if 'Debido a tu ubicación no puedes ver el contenido' in webpage:
self.raise_geo_restricted()
for message in [
'Debido a tu ubicación no puedes ver el contenido',
'You are not allowed to watch this video: Geo Fencing Restriction'
]:
if message in webpage:
self.raise_geo_restricted()
player_config = self._search_json(r'window\.MDSTRM\.OPTIONS\s*=', webpage, 'metadata', video_id)

View File

@ -20,7 +20,7 @@ class MixcloudBaseIE(InfoExtractor):
def _call_api(self, object_type, object_fields, display_id, username, slug=None):
lookup_key = object_type + 'Lookup'
return self._download_json(
'https://www.mixcloud.com/graphql', display_id, query={
'https://app.mixcloud.com/graphql', display_id, query={
'query': '''{
%s(lookup: {username: "%s"%s}) {
%s
@ -46,7 +46,15 @@ class MixcloudIE(MixcloudBaseIE):
'view_count': int,
'timestamp': 1321359578,
'upload_date': '20111115',
'uploader_url': 'https://www.mixcloud.com/dholbach/',
'artist': 'Submorphics & Chino , Telekinesis, Porter Robinson, Enei, Breakage ft Jess Mills',
'duration': 3723,
'tags': [],
'comment_count': int,
'repost_count': int,
'like_count': int,
},
'params': {'skip_download': 'm3u8'},
}, {
'url': 'http://www.mixcloud.com/gillespeterson/caribou-7-inch-vinyl-mix-chat/',
'info_dict': {
@ -60,7 +68,14 @@ class MixcloudIE(MixcloudBaseIE):
'view_count': int,
'timestamp': 1422987057,
'upload_date': '20150203',
'uploader_url': 'https://www.mixcloud.com/gillespeterson/',
'duration': 2992,
'tags': [],
'comment_count': int,
'repost_count': int,
'like_count': int,
},
'params': {'skip_download': '404 playback error on site'},
}, {
'url': 'https://beta.mixcloud.com/RedLightRadio/nosedrip-15-red-light-radio-01-18-2016/',
'only_matching': True,
@ -259,9 +274,9 @@ class MixcloudPlaylistBaseIE(MixcloudBaseIE):
cloudcast_url = cloudcast.get('url')
if not cloudcast_url:
continue
slug = try_get(cloudcast, lambda x: x['slug'], compat_str)
item_slug = try_get(cloudcast, lambda x: x['slug'], compat_str)
owner_username = try_get(cloudcast, lambda x: x['owner']['username'], compat_str)
video_id = '%s_%s' % (owner_username, slug) if slug and owner_username else None
video_id = f'{owner_username}_{item_slug}' if item_slug and owner_username else None
entries.append(self.url_result(
cloudcast_url, MixcloudIE.ie_key(), video_id))
@ -284,7 +299,7 @@ class MixcloudUserIE(MixcloudPlaylistBaseIE):
'info_dict': {
'id': 'dholbach_uploads',
'title': 'Daniel Holbach (uploads)',
'description': 'md5:b60d776f0bab534c5dabe0a34e47a789',
'description': 'md5:a3f468a60ac8c3e1f8616380fc469b2b',
},
'playlist_mincount': 36,
}, {
@ -292,7 +307,7 @@ class MixcloudUserIE(MixcloudPlaylistBaseIE):
'info_dict': {
'id': 'dholbach_uploads',
'title': 'Daniel Holbach (uploads)',
'description': 'md5:b60d776f0bab534c5dabe0a34e47a789',
'description': 'md5:a3f468a60ac8c3e1f8616380fc469b2b',
},
'playlist_mincount': 36,
}, {
@ -300,7 +315,7 @@ class MixcloudUserIE(MixcloudPlaylistBaseIE):
'info_dict': {
'id': 'dholbach_favorites',
'title': 'Daniel Holbach (favorites)',
'description': 'md5:b60d776f0bab534c5dabe0a34e47a789',
'description': 'md5:a3f468a60ac8c3e1f8616380fc469b2b',
},
# 'params': {
# 'playlist_items': '1-100',
@ -323,9 +338,9 @@ class MixcloudUserIE(MixcloudPlaylistBaseIE):
'info_dict': {
'id': 'FirstEar_stream',
'title': 'First Ear (stream)',
'description': 'Curators of good music\r\n\r\nfirstearmusic.com',
'description': 'we maraud for ears',
},
'playlist_mincount': 271,
'playlist_mincount': 269,
}]
_TITLE_KEY = 'displayName'

View File

@ -151,7 +151,7 @@ class MotherlessIE(InfoExtractor):
'd': 'days',
}
kwargs = {_AGO_UNITS.get(uploaded_ago[-1]): delta}
upload_date = (datetime.datetime.utcnow() - datetime.timedelta(**kwargs)).strftime('%Y%m%d')
upload_date = (datetime.datetime.now(datetime.timezone.utc) - datetime.timedelta(**kwargs)).strftime('%Y%m%d')
comment_count = len(re.findall(r'''class\s*=\s*['"]media-comment-contents\b''', webpage))
uploader_id = self._html_search_regex(

View File

@ -33,7 +33,7 @@ class N1InfoAssetIE(InfoExtractor):
class N1InfoIIE(InfoExtractor):
IE_NAME = 'N1Info:article'
_VALID_URL = r'https?://(?:(?:(?:ba|rs|hr)\.)?n1info\.(?:com|si)|nova\.rs)/(?:[^/]+/){1,2}(?P<id>[^/]+)'
_VALID_URL = r'https?://(?:(?:\w+\.)?n1info\.\w+|nova\.rs)/(?:[^/?#]+/){1,2}(?P<id>[^/?#]+)'
_TESTS = [{
# Youtube embedded
'url': 'https://rs.n1info.com/sport-klub/tenis/kako-je-djokovic-propustio-istorijsku-priliku-video/',
@ -94,6 +94,16 @@ class N1InfoIIE(InfoExtractor):
'upload_date': '20211102',
'timestamp': 1635861677,
},
}, {
'url': 'https://n1info.rs/vesti/cuta-biti-u-kosovskoj-mitrovici-znaci-da-te-docekaju-eksplozivnim-napravama/',
'info_dict': {
'id': '1332368',
'ext': 'mp4',
'title': 'Ćuta: Biti u Kosovskoj Mitrovici znači da te dočekaju eksplozivnim napravama',
'upload_date': '20230620',
'timestamp': 1687290536,
'thumbnail': 'https://cdn.brid.tv/live/partners/26827/snapshot/1332368_th_6492013a8356f_1687290170.jpg'
},
}, {
'url': 'https://hr.n1info.com/vijesti/pravobraniteljica-o-ubojstvu-u-zagrebu-radi-se-o-doista-nezapamcenoj-situaciji/',
'only_matching': True,
@ -105,19 +115,35 @@ class N1InfoIIE(InfoExtractor):
title = self._html_search_regex(r'<h1[^>]+>(.+?)</h1>', webpage, 'title')
timestamp = unified_timestamp(self._html_search_meta('article:published_time', webpage))
videos = re.findall(r'(?m)(<video[^>]+>)', webpage)
plugin_data = self._html_search_meta('BridPlugin', webpage)
entries = []
for video in videos:
video_data = extract_attributes(video)
entries.append({
'_type': 'url_transparent',
'url': video_data.get('data-url'),
'id': video_data.get('id'),
'title': title,
'thumbnail': video_data.get('data-thumbnail'),
'timestamp': timestamp,
'ie_key': 'N1InfoAsset'})
if plugin_data:
site_id = self._html_search_regex(r'site:(\d+)', webpage, 'site id')
for video_data in re.findall(r'\$bp\("Brid_\d+", (.+)\);', webpage):
video_id = self._parse_json(video_data, title)['video']
entries.append({
'id': video_id,
'title': title,
'timestamp': timestamp,
'thumbnail': self._html_search_meta('thumbnailURL', webpage),
'formats': self._extract_m3u8_formats(
f'https://cdn-uc.brid.tv/live/partners/{site_id}/streaming/{video_id}/{video_id}.m3u8',
video_id, fatal=False),
})
else:
# Old player still present in older articles
videos = re.findall(r'(?m)(<video[^>]+>)', webpage)
for video in videos:
video_data = extract_attributes(video)
entries.append({
'_type': 'url_transparent',
'url': video_data.get('data-url'),
'id': video_data.get('id'),
'title': title,
'thumbnail': video_data.get('data-thumbnail'),
'timestamp': timestamp,
'ie_key': 'N1InfoAsset',
})
embedded_videos = re.findall(r'(<iframe[^>]+>)', webpage)
for embedded_video in embedded_videos:

View File

@ -21,7 +21,7 @@ from ..utils import (
class NaverBaseIE(InfoExtractor):
_CAPTION_EXT_RE = r'\.(?:ttml|vtt)'
@staticmethod # NB: Used in VLiveWebArchiveIE, WeverseIE
@staticmethod # NB: Used in WeverseIE
def process_subtitles(vod_data, process_url):
ret = {'subtitles': {}, 'automatic_captions': {}}
for caption in traverse_obj(vod_data, ('captions', 'list', ...)):

View File

@ -265,6 +265,26 @@ class NitterIE(InfoExtractor):
'repost_count': int,
'comment_count': int,
}
}, { # no OpenGraph title
'url': f'https://{current_instance}/LocalBateman/status/1678455464038735895#m',
'info_dict': {
'id': '1678455464038735895',
'ext': 'mp4',
'title': 'Your Typical Local Man - Local man, what did Romanians ever do to you?',
'description': 'Local man, what did Romanians ever do to you?',
'thumbnail': r're:^https?://.*\.jpg$',
'uploader': 'Your Typical Local Man',
'uploader_id': 'LocalBateman',
'uploader_url': f'https://{current_instance}/LocalBateman',
'upload_date': '20230710',
'timestamp': 1689009900,
'view_count': int,
'like_count': int,
'repost_count': int,
'comment_count': int,
},
'expected_warnings': ['Ignoring subtitle tracks found in the HLS manifest'],
'params': {'skip_download': 'm3u8'},
}
]
@ -292,7 +312,7 @@ class NitterIE(InfoExtractor):
'ext': ext
}]
title = description = self._og_search_description(full_webpage) or self._html_search_regex(
title = description = self._og_search_description(full_webpage, default=None) or self._html_search_regex(
r'<div class="tweet-content[^>]+>([^<]+)</div>', webpage, 'title', fatal=False)
uploader_id = self._html_search_regex(

View File

@ -6,7 +6,6 @@ from ..utils import (
determine_ext,
int_or_none,
js_to_json,
qualities,
traverse_obj,
unified_strdate,
url_or_none,
@ -49,77 +48,52 @@ class NovaEmbedIE(InfoExtractor):
duration = None
formats = []
player = self._parse_json(
self._search_regex(
(r'(?:(?:replacePlaceholders|processAdTagModifier).*?:\s*)?(?:replacePlaceholders|processAdTagModifier)\s*\(\s*(?P<json>{.*?})\s*\)(?:\s*\))?\s*,',
r'Player\.init\s*\([^,]+,(?P<cndn>\s*\w+\s*\?)?\s*(?P<json>{(?(cndn).+?|.+)})\s*(?(cndn):|,\s*{.+?}\s*\)\s*;)'),
webpage, 'player', default='{}', group='json'), video_id, fatal=False)
if player:
for format_id, format_list in player['tracks'].items():
if not isinstance(format_list, list):
format_list = [format_list]
for format_dict in format_list:
if not isinstance(format_dict, dict):
continue
if (not self.get_param('allow_unplayable_formats')
and traverse_obj(format_dict, ('drm', 'keySystem'))):
has_drm = True
continue
format_url = url_or_none(format_dict.get('src'))
format_type = format_dict.get('type')
ext = determine_ext(format_url)
if (format_type == 'application/x-mpegURL'
or format_id == 'HLS' or ext == 'm3u8'):
formats.extend(self._extract_m3u8_formats(
format_url, video_id, 'mp4',
entry_protocol='m3u8_native', m3u8_id='hls',
fatal=False))
elif (format_type == 'application/dash+xml'
or format_id == 'DASH' or ext == 'mpd'):
formats.extend(self._extract_mpd_formats(
format_url, video_id, mpd_id='dash', fatal=False))
else:
formats.append({
'url': format_url,
})
duration = int_or_none(player.get('duration'))
else:
# Old path, not actual as of 08.04.2020
bitrates = self._parse_json(
self._search_regex(
r'(?s)(?:src|bitrates)\s*=\s*({.+?})\s*;', webpage, 'formats'),
video_id, transform_source=js_to_json)
QUALITIES = ('lq', 'mq', 'hq', 'hd')
quality_key = qualities(QUALITIES)
for format_id, format_list in bitrates.items():
if not isinstance(format_list, list):
format_list = [format_list]
for format_url in format_list:
format_url = url_or_none(format_url)
if not format_url:
continue
if format_id == 'hls':
formats.extend(self._extract_m3u8_formats(
format_url, video_id, ext='mp4',
entry_protocol='m3u8_native', m3u8_id='hls',
fatal=False))
continue
f = {
def process_format_list(format_list, format_id=""):
nonlocal formats, has_drm
if not isinstance(format_list, list):
format_list = [format_list]
for format_dict in format_list:
if not isinstance(format_dict, dict):
continue
if (not self.get_param('allow_unplayable_formats')
and traverse_obj(format_dict, ('drm', 'keySystem'))):
has_drm = True
continue
format_url = url_or_none(format_dict.get('src'))
format_type = format_dict.get('type')
ext = determine_ext(format_url)
if (format_type == 'application/x-mpegURL'
or format_id == 'HLS' or ext == 'm3u8'):
formats.extend(self._extract_m3u8_formats(
format_url, video_id, 'mp4',
entry_protocol='m3u8_native', m3u8_id='hls',
fatal=False))
elif (format_type == 'application/dash+xml'
or format_id == 'DASH' or ext == 'mpd'):
formats.extend(self._extract_mpd_formats(
format_url, video_id, mpd_id='dash', fatal=False))
else:
formats.append({
'url': format_url,
}
f_id = format_id
for quality in QUALITIES:
if '%s.mp4' % quality in format_url:
f_id += '-%s' % quality
f.update({
'quality': quality_key(quality),
'format_note': quality.upper(),
})
break
f['format_id'] = f_id
formats.append(f)
})
player = self._search_json(
r'player:', webpage, 'player', video_id, fatal=False, end_pattern=r';\s*</script>')
if player:
for src in traverse_obj(player, ('lib', 'source', 'sources', ...)):
process_format_list(src)
duration = traverse_obj(player, ('sourceInfo', 'duration', {int_or_none}))
if not formats and not has_drm:
# older code path, in use before August 2023
player = self._parse_json(
self._search_regex(
(r'(?:(?:replacePlaceholders|processAdTagModifier).*?:\s*)?(?:replacePlaceholders|processAdTagModifier)\s*\(\s*(?P<json>{.*?})\s*\)(?:\s*\))?\s*,',
r'Player\.init\s*\([^,]+,(?P<cndn>\s*\w+\s*\?)?\s*(?P<json>{(?(cndn).+?|.+)})\s*(?(cndn):|,\s*{.+?}\s*\)\s*;)'),
webpage, 'player', group='json'), video_id)
if player:
for format_id, format_list in player['tracks'].items():
process_format_list(format_list, format_id)
duration = int_or_none(player.get('duration'))
if not formats and has_drm:
self.report_drm(video_id)

View File

@ -1,7 +1,7 @@
import calendar
import json
import functools
from datetime import datetime
from datetime import datetime, timezone
from random import random
from .common import InfoExtractor
@ -243,7 +243,7 @@ class PanoptoIE(PanoptoBaseIE):
invocation_id = delivery_info.get('InvocationId')
stream_id = traverse_obj(delivery_info, ('Delivery', 'Streams', ..., 'PublicID'), get_all=False, expected_type=str)
if invocation_id and stream_id and duration:
timestamp_str = f'/Date({calendar.timegm(datetime.utcnow().timetuple())}000)/'
timestamp_str = f'/Date({calendar.timegm(datetime.now(timezone.utc).timetuple())}000)/'
data = {
'streamRequests': [
{

113
yt_dlp/extractor/pornbox.py Normal file
View File

@ -0,0 +1,113 @@
from .common import InfoExtractor
from ..compat import functools
from ..utils import (
int_or_none,
parse_duration,
parse_iso8601,
qualities,
str_or_none,
traverse_obj,
url_or_none,
)
class PornboxIE(InfoExtractor):
    # Extractor for pornbox.com watch pages. Public scene metadata comes from
    # /contents/<id>; stream manifests come from /media/<media_id>/stream.
    _VALID_URL = r'https?://(?:www\.)?pornbox\.com/application/watch-page/(?P<id>[0-9]+)'
    _TESTS = [{
        'url': 'https://pornbox.com/application/watch-page/212108',
        'md5': '3ff6b6e206f263be4c5e987a3162ac6e',
        'info_dict': {
            'id': '212108',
            'ext': 'mp4',
            'title': 'md5:ececc5c6e6c9dd35d290c45fed05fd49',
            'uploader': 'Lily Strong',
            'timestamp': 1665871200,
            'upload_date': '20221015',
            'age_limit': 18,
            'availability': 'needs_auth',
            'duration': 1505,
            'cast': ['Lily Strong', 'John Strong'],
            'tags': 'count:11',
            'description': 'md5:589c7f33e183aa8aa939537300efb859',
            'thumbnail': r're:^https?://cdn-image\.gtflixtv\.com.*\.jpg.*$'
        }
    }, {
        'url': 'https://pornbox.com/application/watch-page/216045',
        'info_dict': {
            'id': '216045',
            'title': 'md5:3e48528e73a9a2b12f7a2772ed0b26a2',
            'description': 'md5:3e631dcaac029f15ed434e402d1b06c7',
            'uploader': 'VK Studio',
            'timestamp': 1618264800,
            'upload_date': '20210412',
            'age_limit': 18,
            'availability': 'premium_only',
            'duration': 2710,
            'cast': 'count:3',
            'tags': 'count:29',
            'thumbnail': r're:^https?://cdn-image\.gtflixtv\.com.*\.jpg.*$',
            'subtitles': 'count:6'
        },
        'params': {
            'skip_download': True,
            'ignore_no_formats_error': True
        },
        'expected_warnings': [
            'You are either not logged in or do not have access to this scene',
            'No video formats found', 'Requested format is not available']
    }]

    def _real_extract(self, url):
        video_id = self._match_id(url)
        scene = self._download_json(f'https://pornbox.com/contents/{video_id}', video_id)

        # One SRT track per subtitle code listed in the scene data
        subtitles = {}
        for lang_code in traverse_obj(scene, ('subtitles', ..., {str})):
            subtitles[lang_code] = [{
                'url': f'https://pornbox.com/contents/{video_id}/subtitles/{lang_code}',
                'ext': 'srt'
            }]

        is_free_scene = traverse_obj(
            scene, ('price', 'is_available_for_free', {bool}), default=False)

        # First of studios.release_date / publish_date that is present
        release_date = traverse_obj(scene, ('studios', 'release_date'), 'publish_date')

        metadata = {
            'id': video_id,
            **traverse_obj(scene, {
                'title': ('scene_name', {str.strip}),
                'description': ('small_description', {str.strip}),
                'uploader': 'studio',
                'duration': ('runtime', {parse_duration}),
                'cast': (('models', 'male_models'), ..., 'model_name'),
                'thumbnail': ('player_poster', {url_or_none}),
                'tags': ('niches', ..., 'niche'),
            }),
            'age_limit': 18,
            'timestamp': parse_iso8601(release_date),
            'availability': self._availability(needs_auth=True, needs_premium=not is_free_scene),
            'subtitles': subtitles,
        }

        # Streams are only requested when the scene is both purchased and free;
        # otherwise report the login requirement and return metadata alone.
        # NOTE(review): the combination of these two flags looks strict - confirm
        # against the site's API semantics before changing it.
        if not (scene.get('is_purchased') and is_free_scene):
            self.raise_login_required(
                'You are either not logged in or do not have access to this scene', metadata_available=True)
            return metadata

        media_id = traverse_obj(scene, (
            'medias', lambda _, v: v['title'] == 'Full video', 'media_id', {int}), get_all=False)
        if not media_id:
            self.raise_no_formats('Could not find stream id', video_id=video_id)

        stream_data = self._download_json(
            f'https://pornbox.com/media/{media_id}/stream', video_id=video_id, note='Getting manifest urls')

        quality_rank = qualities(['web', 'vga', 'hd', '1080p', '4k', '8k'])
        scaled_bitrate = functools.partial(int_or_none, scale=1000)

        def strip_last_char_to_int(size):
            # The 'size' value is converted after dropping its final character
            return int(size[:-1])

        metadata['formats'] = traverse_obj(stream_data, ('qualities', lambda _, v: v['src'], {
            'url': 'src',
            'vbr': ('bitrate', {scaled_bitrate}),
            'format_id': ('quality', {str_or_none}),
            'quality': ('quality', {quality_rank}),
            'width': ('size', {strip_last_char_to_int}),
        }))

        return metadata

View File

@ -1,97 +1,155 @@
import re
import json
from datetime import date
from urllib.parse import unquote
from .common import InfoExtractor
from ..utils import merge_dicts
from ..compat import functools
from ..utils import ExtractorError, make_archive_id, urljoin
from ..utils.traversal import traverse_obj
class Pr0grammStaticIE(InfoExtractor):
# Possible urls:
# https://pr0gramm.com/static/5466437
_VALID_URL = r'https?://pr0gramm\.com/static/(?P<id>[0-9]+)'
_TEST = {
'url': 'https://pr0gramm.com/static/5466437',
'md5': '52fa540d70d3edc286846f8ca85938aa',
'info_dict': {
'id': '5466437',
'ext': 'mp4',
'title': 'pr0gramm-5466437 by g11st',
'uploader': 'g11st',
'upload_date': '20221221',
}
}
def _real_extract(self, url):
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
# Fetch media sources
entries = self._parse_html5_media_entries(url, webpage, video_id)
media_info = entries[0]
# Fetch author
uploader = self._html_search_regex(r'by\W+([\w-]+)\W+', webpage, 'uploader')
# Fetch approx upload timestamp from filename
# Have None-defaults in case the extraction fails
uploadDay = None
uploadMon = None
uploadYear = None
uploadTimestr = None
# (//img.pr0gramm.com/2022/12/21/62ae8aa5e2da0ebf.mp4)
m = re.search(r'//img\.pr0gramm\.com/(?P<year>[\d]+)/(?P<mon>[\d]+)/(?P<day>[\d]+)/\w+\.\w{,4}', webpage)
if (m):
# Up to a day of accuracy should suffice...
uploadDay = m.groupdict().get('day')
uploadMon = m.groupdict().get('mon')
uploadYear = m.groupdict().get('year')
uploadTimestr = uploadYear + uploadMon + uploadDay
return merge_dicts({
'id': video_id,
'title': 'pr0gramm-%s%s' % (video_id, (' by ' + uploader) if uploader else ''),
'uploader': uploader,
'upload_date': uploadTimestr
}, media_info)
# This extractor is for the primary url (used for sharing, and appears in the
# location bar) Since this page loads the DOM via JS, yt-dl can't find any
# video information here. So let's redirect to a compatibility version of
# the site, which does contain the <video>-element by itself, without requiring
# js to be ran.
class Pr0grammIE(InfoExtractor):
# Possible urls:
# https://pr0gramm.com/new/546637
# https://pr0gramm.com/new/video/546637
# https://pr0gramm.com/top/546637
# https://pr0gramm.com/top/video/546637
# https://pr0gramm.com/user/g11st/uploads/5466437
# https://pr0gramm.com/user/froschler/dafur-ist-man-hier/5091290
# https://pr0gramm.com/user/froschler/reinziehen-1elf/5232030
# https://pr0gramm.com/user/froschler/1elf/5232030
# https://pr0gramm.com/new/5495710:comment62621020 <- this is not the id!
# https://pr0gramm.com/top/fruher war alles damals/5498175
_VALID_URL = r'https?:\/\/pr0gramm\.com\/(?!static/\d+).+?\/(?P<id>[\d]+)(:|$)'
_TEST = {
_VALID_URL = r'https?://pr0gramm\.com\/(?:[^/?#]+/)+(?P<id>[\d]+)(?:[/?#:]|$)'
_TESTS = [{
# Tags require account
'url': 'https://pr0gramm.com/new/video/5466437',
'info_dict': {
'id': '5466437',
'ext': 'mp4',
'title': 'pr0gramm-5466437 by g11st',
'tags': ['Neon Genesis Evangelion', 'Touhou Project', 'Fly me to the Moon', 'Marisad', 'Marisa Kirisame', 'video', 'sound', 'Marisa', 'Anime'],
'uploader': 'g11st',
'uploader_id': 394718,
'upload_timestamp': 1671590240,
'upload_date': '20221221',
}
}
'like_count': int,
'dislike_count': int,
'age_limit': 0,
'thumbnail': r're:^https://thumb\.pr0gramm\.com/.*\.jpg',
},
}, {
# Tags require account
'url': 'https://pr0gramm.com/new/3052805:comment28391322',
'info_dict': {
'id': '3052805',
'ext': 'mp4',
'title': 'pr0gramm-3052805 by Hansking1',
'tags': 'count:15',
'uploader': 'Hansking1',
'uploader_id': 385563,
'upload_timestamp': 1552930408,
'upload_date': '20190318',
'like_count': int,
'dislike_count': int,
'age_limit': 0,
'thumbnail': r're:^https://thumb\.pr0gramm\.com/.*\.jpg',
},
}, {
# Requires verified account
'url': 'https://pr0gramm.com/new/Gianna%20Michaels/5848332',
'info_dict': {
'id': '5848332',
'ext': 'mp4',
'title': 'pr0gramm-5848332 by erd0pfel',
'tags': 'count:18',
'uploader': 'erd0pfel',
'uploader_id': 349094,
'upload_timestamp': 1694489652,
'upload_date': '20230912',
'like_count': int,
'dislike_count': int,
'age_limit': 18,
'thumbnail': r're:^https://thumb\.pr0gramm\.com/.*\.jpg',
},
}, {
'url': 'https://pr0gramm.com/static/5466437',
'only_matching': True,
}, {
'url': 'https://pr0gramm.com/new/rowan%20atkinson%20herr%20bohne/3052805',
'only_matching': True,
}, {
'url': 'https://pr0gramm.com/user/froschler/dafur-ist-man-hier/5091290',
'only_matching': True,
}]
def _generic_title():
return "oof"
BASE_URL = 'https://pr0gramm.com'
@functools.cached_property
def _is_logged_in(self):
    # Logged-in state is inferred from the presence of a 'pp' cookie for BASE_URL
    site_cookies = self._get_cookies(self.BASE_URL)
    return 'pp' in site_cookies
@functools.cached_property
def _maximum_flags(self):
    # We need to guess the flags for the content otherwise the api will raise an error
    # We can guess the maximum allowed flags for the account from the cookies
    # Bitflags are (msbf): nsfp, nsfl, nsfw, sfw
    flags = 0b0001  # sfw is always requested
    if self._is_logged_in:
        flags |= 0b1000  # nsfp once logged in
        cookies = self._get_cookies(self.BASE_URL)
        if 'me' not in cookies:
            self._download_webpage(self.BASE_URL, None, 'Refreshing verification information')
            # Re-read the cookies so anything set by the refresh request is
            # actually visible to the check below.
            # NOTE(review): presumably _get_cookies returns a snapshot rather
            # than a live view of the jar; re-reading is harmless either way.
            cookies = self._get_cookies(self.BASE_URL)
        # The 'me' cookie value is URL-quoted JSON; 'verified' unlocks nsfw + nsfl
        if traverse_obj(cookies, ('me', {lambda x: x.value}, {unquote}, {json.loads}, 'verified')):
            flags |= 0b0110

    return flags
def _call_api(self, endpoint, video_id, query={}, note='Downloading API json'):
    # Fetch https://pr0gramm.com/api/items/<endpoint> (HTTP 403 is accepted so
    # the JSON error body can be inspected) and turn API errors into exceptions.
    response = self._download_json(
        f'https://pr0gramm.com/api/items/{endpoint}',
        video_id, note, query=query, expected_status=403)

    error = traverse_obj(response, ('error', {str}))
    if not error:
        return response

    access_errors = ('nsfwRequired', 'nsflRequired', 'nsfpRequired', 'verificationRequired')
    if error not in access_errors:
        # Any other error: prefer the API's own message when one is given
        message = traverse_obj(response, ('msg', {str})) or error
        raise ExtractorError(f'API returned error: {message}', expected=True)

    # Content-flag / verification errors: ask for login first, otherwise the
    # logged-in account lacks the needed verification
    if not self._is_logged_in:
        self.raise_login_required()
    raise ExtractorError(f'Unverified account cannot access NSFW/NSFL ({error})', expected=True)
def _real_extract(self, url):
video_id = self._match_id(url)
video_info = traverse_obj(
self._call_api('get', video_id, {'id': video_id, 'flags': self._maximum_flags}),
('items', 0, {dict}))
return self.url_result(
'https://pr0gramm.com/static/' + video_id,
video_id=video_id,
ie=Pr0grammStaticIE.ie_key())
source = urljoin('https://img.pr0gramm.com', video_info.get('image'))
if not source or not source.endswith('mp4'):
self.raise_no_formats('Could not extract a video', expected=bool(source), video_id=video_id)
tags = None
if self._is_logged_in:
metadata = self._call_api('info', video_id, {'itemId': video_id})
tags = traverse_obj(metadata, ('tags', ..., 'tag', {str}))
# Sorted by "confidence", higher confidence = earlier in list
confidences = traverse_obj(metadata, ('tags', ..., 'confidence', ({int}, {float})))
if confidences:
tags = [tag for _, tag in sorted(zip(confidences, tags), reverse=True)]
return {
'id': video_id,
'title': f'pr0gramm-{video_id} by {video_info.get("user")}',
'formats': [{
'url': source,
'ext': 'mp4',
**traverse_obj(video_info, {
'width': ('width', {int}),
'height': ('height', {int}),
}),
}],
'tags': tags,
'age_limit': 18 if traverse_obj(video_info, ('flags', {0b110.__and__})) else 0,
'_old_archive_ids': [make_archive_id('Pr0grammStatic', video_id)],
**traverse_obj(video_info, {
'uploader': ('user', {str}),
'uploader_id': ('userId', {int}),
'like_count': ('up', {int}),
'dislike_count': ('down', {int}),
'upload_timestamp': ('created', {int}),
'upload_date': ('created', {int}, {date.fromtimestamp}, {lambda x: x.strftime('%Y%m%d')}),
'thumbnail': ('thumb', {lambda x: urljoin('https://thumb.pr0gramm.com', x)})
}),
}

View File

@ -1,7 +1,18 @@
import itertools
import re
import urllib.parse
from .common import InfoExtractor
from ..utils import parse_duration, unified_strdate
from ..utils import (
int_or_none,
join_nonempty,
js_to_json,
parse_duration,
strftime_or_none,
traverse_obj,
unified_strdate,
urljoin,
)
class RadioFranceIE(InfoExtractor):
@ -56,8 +67,32 @@ class RadioFranceIE(InfoExtractor):
}
class FranceCultureIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?radiofrance\.fr/(?:franceculture|fip|francemusique|mouv|franceinter)/podcasts/(?:[^?#]+/)?(?P<display_id>[^?#]+)-(?P<id>\d+)($|[?#])'
class RadioFranceBaseIE(InfoExtractor):
    # Shared URL fragments and page-data helper for the radiofrance.fr extractors
    _VALID_URL_BASE = r'https?://(?:www\.)?radiofrance\.fr'

    # Regex alternation of the station slugs
    _STATIONS_RE = '|'.join(re.escape(station) for station in (
        'franceculture',
        'franceinfo',
        'franceinter',
        'francemusique',
        'fip',
        'mouv',
    ))

    def _extract_data_from_webpage(self, webpage, display_id, key):
        # Parse the `const data = [{...}];` JS array embedded in the page and
        # return the first dict found under <entry>['data'][key], or {} if none.
        page_data = self._search_json(
            r'\bconst\s+data\s*=', webpage, key, display_id,
            contains_pattern=r'(\[\{.*?\}\]);', transform_source=js_to_json)
        match = traverse_obj(page_data, (..., 'data', key, {dict}), get_all=False)
        return match or {}
class FranceCultureIE(RadioFranceBaseIE):
_VALID_URL = rf'''(?x)
{RadioFranceBaseIE._VALID_URL_BASE}
/(?:{RadioFranceBaseIE._STATIONS_RE})
/podcasts/(?:[^?#]+/)?(?P<display_id>[^?#]+)-(?P<id>\d{{6,}})(?:$|[?#])
'''
_TESTS = [
{
'url': 'https://www.radiofrance.fr/franceculture/podcasts/science-en-questions/la-physique-d-einstein-aiderait-elle-a-comprendre-le-cerveau-8440487',
@ -67,14 +102,30 @@ class FranceCultureIE(InfoExtractor):
'ext': 'mp3',
'title': 'La physique dEinstein aiderait-elle à comprendre le cerveau ?',
'description': 'Existerait-il un pont conceptuel entre la physique de lespace-temps et les neurosciences ?',
'thumbnail': 'https://cdn.radiofrance.fr/s3/cruiser-production/2022/05/d184e7a3-4827-4494-bf94-04ed7b120db4/1200x630_gettyimages-200171095-001.jpg',
'thumbnail': r're:^https?://.*\.(?:jpg|png)',
'upload_date': '20220514',
'duration': 2750,
},
},
{
'url': 'https://www.radiofrance.fr/franceinter/podcasts/le-7-9-30/le-7-9-30-du-vendredi-10-mars-2023-2107675',
'info_dict': {
'id': '2107675',
'display_id': 'le-7-9-30-du-vendredi-10-mars-2023',
'title': 'Inflation alimentaire : comment en sortir ? - Régis Debray et Claude Grange - Cybèle Idelot',
'description': 'md5:36ee74351ede77a314fdebb94026b916',
'thumbnail': r're:^https?://.*\.(?:jpg|png)',
'upload_date': '20230310',
'duration': 8977,
'ext': 'mp3',
},
},
{
'url': 'https://www.radiofrance.fr/franceinter/podcasts/la-rafle-du-vel-d-hiv-une-affaire-d-etat/les-racines-du-crime-episode-1-3715507',
'only_matching': True,
}, {
'url': 'https://www.radiofrance.fr/franceinfo/podcasts/le-billet-sciences/sante-bientot-un-vaccin-contre-l-asthme-allergique-3057200',
'only_matching': True,
}
]
@ -89,7 +140,6 @@ class FranceCultureIE(InfoExtractor):
'id': video_id,
'display_id': display_id,
'url': video_data['contentUrl'],
'ext': video_data.get('encodingFormat'),
'vcodec': 'none' if video_data.get('encodingFormat') == 'mp3' else None,
'duration': parse_duration(video_data.get('duration')),
'title': self._html_search_regex(r'(?s)<h1[^>]*itemprop="[^"]*name[^"]*"[^>]*>(.+?)</h1>',
@ -102,3 +152,322 @@ class FranceCultureIE(InfoExtractor):
'upload_date': unified_strdate(self._search_regex(
r'"datePublished"\s*:\s*"([^"]+)', webpage, 'timestamp', fatal=False))
}
class RadioFranceLiveIE(RadioFranceBaseIE):
    # Live streams of a station, or of one of its "radio-*" substations.
    # The `id` group is the station slug; `substation_id` is optional.
    _VALID_URL = rf'''(?x)
        https?://(?:www\.)?radiofrance\.fr
        /(?P<id>{RadioFranceBaseIE._STATIONS_RE})
        /?(?P<substation_id>radio-[\w-]+)?(?:[#?]|$)
    '''

    _TESTS = [{
        'url': 'https://www.radiofrance.fr/franceinter/',
        'info_dict': {
            'id': 'franceinter',
            'title': str,
            'live_status': 'is_live',
            'ext': 'aac',
        },
        'params': {
            'skip_download': 'Livestream',
        },
    }, {
        'url': 'https://www.radiofrance.fr/franceculture',
        'info_dict': {
            'id': 'franceculture',
            'title': str,
            'live_status': 'is_live',
            'ext': 'aac',
        },
        'params': {
            'skip_download': 'Livestream',
        },
    }, {
        'url': 'https://www.radiofrance.fr/mouv/radio-musique-kids-family',
        'info_dict': {
            'id': 'mouv-radio-musique-kids-family',
            'title': str,
            'live_status': 'is_live',
            'ext': 'aac',
        },
        'params': {
            'skip_download': 'Livestream',
        },
    }, {
        'url': 'https://www.radiofrance.fr/mouv/radio-rnb-soul',
        'info_dict': {
            'id': 'mouv-radio-rnb-soul',
            'title': str,
            'live_status': 'is_live',
            'ext': 'aac',
        },
        'params': {
            'skip_download': 'Livestream',
        },
    }, {
        'url': 'https://www.radiofrance.fr/mouv/radio-musique-mix',
        'info_dict': {
            'id': 'mouv-radio-musique-mix',
            'title': str,
            'live_status': 'is_live',
            'ext': 'aac',
        },
        'params': {
            'skip_download': 'Livestream',
        },
    }, {
        'url': 'https://www.radiofrance.fr/fip/radio-rock',
        'info_dict': {
            'id': 'fip-radio-rock',
            'title': str,
            'live_status': 'is_live',
            'ext': 'aac',
        },
        'params': {
            'skip_download': 'Livestream',
        },
    }, {
        'url': 'https://www.radiofrance.fr/mouv',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Build the live-stream info dict for a station or substation URL."""
        mobj = self._match_valid_url(url)
        station_id = mobj.group('id')
        substation_id = mobj.group('substation_id')

        if not substation_id:
            # Plain station page: the live data comes from the station's JSON endpoint
            live_data = self._download_json(
                f'https://www.radiofrance.fr/{station_id}/api/live', station_id)
        else:
            # Substation page: the live data is read out of the webpage itself
            page = self._download_webpage(url, station_id)
            live_data = self._extract_data_from_webpage(page, station_id, 'webRadioData')

        formats = []
        subtitles = {}
        # Sources are looked up both under 'now' and at the top level of the
        # response; entries without a 'url' are dropped by the lambda filter
        sources = traverse_obj(
            live_data, (('now', None), 'media', 'sources', lambda _, v: v['url']))
        for source in sources:
            stream_url = source['url']
            if source.get('format') != 'hls':
                formats.append({
                    'url': stream_url,
                    'abr': source.get('bitrate'),
                })
                continue
            hls_formats, hls_subtitles = self._extract_m3u8_formats_and_subtitles(
                stream_url, station_id, fatal=False)
            formats.extend(hls_formats)
            self._merge_subtitles(hls_subtitles, target=subtitles)

        # Prefer the visual legend; otherwise join the two "now playing" lines
        legend = traverse_obj(live_data, ('visual', 'legend'))
        title = legend or join_nonempty(
            ('now', 'firstLine', 'title'), ('now', 'secondLine', 'title'),
            from_dict=live_data, delim=' - ')

        return {
            'id': join_nonempty(station_id, substation_id),
            'title': title,
            'formats': formats,
            'subtitles': subtitles,
            'is_live': True,
        }
class RadioFrancePlaylistBase(RadioFranceBaseIE):
    """Shared playlist logic; subclasses must set _METADATA_KEY and implement _call_api"""

    def _call_api(self, content_id, cursor, page_num):
        """Fetch one further page of items; must be overridden by subclasses."""
        raise NotImplementedError('This method must be implemented by subclasses')

    def _generate_playlist_entries(self, content_id, content_response):
        """Yield a url_transparent result for every item, following the 'next' cursor.

        `content_response` is the first page; later pages are requested through
        `_call_api`, with page numbers starting at 2.
        """
        page_num = 2
        while True:
            for item in content_response['items']:
                item_info = traverse_obj(item, {
                    'title': 'title',
                    'description': 'standFirst',
                    'timestamp': ('publishedDate', {int_or_none}),
                    'thumbnail': ('visual', 'src'),
                })
                yield self.url_result(
                    f'https://www.radiofrance.fr/{item["path"]}', url_transparent=True, **item_info)

            # The cursor is looked up under 'pagination' first, then at the top level
            cursor = traverse_obj(
                content_response, (('pagination', None), 'next'), get_all=False)
            if not cursor:
                return

            content_response = self._call_api(content_id, cursor, page_num)
            page_num += 1

    def _real_extract(self, url):
        """Resolve the URL path through the path API and return the playlist."""
        display_id = self._match_id(url)
        path_reply = self._download_json(
            'https://www.radiofrance.fr/api/v2.1/path', display_id,
            query={'value': urllib.parse.urlparse(url).path})
        metadata = path_reply['content']
        content_id = metadata['id']
        entries = self._generate_playlist_entries(content_id, metadata[self._METADATA_KEY])

        base_fields = traverse_obj(metadata, {
            'title': 'title',
            'description': 'standFirst',
            'thumbnail': ('visual', 'src'),
        })
        # 'name'/'role', when present, take precedence over 'title'/'standFirst'
        override_fields = traverse_obj(metadata, {
            'title': 'name',
            'description': 'role',
        })
        playlist_fields = {**base_fields, **override_fields}

        return self.playlist_result(
            entries, content_id, display_id=display_id, **playlist_fields)
class RadioFrancePodcastIE(RadioFrancePlaylistBase):
    # Podcast overview pages: /<station>/podcasts/<slug> with no further path segment
    _VALID_URL = rf'''(?x)
        {RadioFranceBaseIE._VALID_URL_BASE}
        /(?:{RadioFranceBaseIE._STATIONS_RE})
        /podcasts/(?P<id>[\w-]+)/?(?:[?#]|$)
    '''

    _TESTS = [{
        'url': 'https://www.radiofrance.fr/franceinfo/podcasts/le-billet-vert',
        'info_dict': {
            'id': 'eaf6ef81-a980-4f1c-a7d1-8a75ecd54b17',
            'display_id': 'le-billet-vert',
            'title': 'Le billet sciences',
            'description': 'md5:eb1007b34b0c0a680daaa71525bbd4c1',
            'thumbnail': r're:^https?://.*\.(?:jpg|png)',
        },
        'playlist_mincount': 11,
    }, {
        'url': 'https://www.radiofrance.fr/franceinter/podcasts/jean-marie-le-pen-l-obsession-nationale',
        'info_dict': {
            'id': '566fd524-3074-4fbc-ac69-8696f2152a54',
            'display_id': 'jean-marie-le-pen-l-obsession-nationale',
            'title': 'Jean-Marie Le Pen, l\'obsession nationale',
            'description': 'md5:a07c0cfb894f6d07a62d0ad12c4b7d73',
            'thumbnail': r're:^https?://.*\.(?:jpg|png)',
        },
        'playlist_count': 7,
    }, {
        'url': 'https://www.radiofrance.fr/franceculture/podcasts/serie-thomas-grjebine',
        'info_dict': {
            'id': '63c1ddc9-9f15-457a-98b2-411bac63f48d',
            'display_id': 'serie-thomas-grjebine',
            'title': 'Thomas Grjebine',
        },
        'playlist_count': 1,
    }, {
        'url': 'https://www.radiofrance.fr/fip/podcasts/certains-l-aiment-fip',
        'info_dict': {
            'id': '143dff38-e956-4a5d-8576-1c0b7242b99e',
            'display_id': 'certains-l-aiment-fip',
            'title': 'Certains laiment Fip',
            'description': 'md5:ff974672ba00d4fd5be80fb001c5b27e',
            'thumbnail': r're:^https?://.*\.(?:jpg|png)',
        },
        'playlist_mincount': 321,
    }, {
        'url': 'https://www.radiofrance.fr/franceinter/podcasts/le-7-9',
        'only_matching': True,
    }, {
        'url': 'https://www.radiofrance.fr/mouv/podcasts/dirty-mix',
        'only_matching': True,
    }]

    # Key of the path-API metadata that holds the first page of items
    _METADATA_KEY = 'expressions'

    def _call_api(self, podcast_id, cursor, page_num):
        """Download the page of podcast expressions identified by `cursor`."""
        api_url = f'https://www.radiofrance.fr/api/v2.1/concepts/{podcast_id}/expressions'
        progress_note = f'Downloading page {page_num}'
        return self._download_json(
            api_url, podcast_id, note=progress_note, query={'pageCursor': cursor})
class RadioFranceProfileIE(RadioFrancePlaylistBase):
    # Person pages: /personnes/<slug>
    _VALID_URL = rf'{RadioFranceBaseIE._VALID_URL_BASE}/personnes/(?P<id>[\w-]+)'

    _TESTS = [{
        'url': 'https://www.radiofrance.fr/personnes/thomas-pesquet?p=3',
        'info_dict': {
            'id': '86c62790-e481-11e2-9f7b-782bcb6744eb',
            'display_id': 'thomas-pesquet',
            'title': 'Thomas Pesquet',
            'description': 'Astronaute à l\'agence spatiale européenne',
        },
        'playlist_mincount': 212,
    }, {
        'url': 'https://www.radiofrance.fr/personnes/eugenie-bastie',
        'info_dict': {
            'id': '9593050b-0183-4972-a0b5-d8f699079e02',
            'display_id': 'eugenie-bastie',
            'title': 'Eugénie Bastié',
            'description': 'Journaliste et essayiste',
            'thumbnail': r're:^https?://.*\.(?:jpg|png)',
        },
        'playlist_mincount': 39,
    }, {
        'url': 'https://www.radiofrance.fr/personnes/lea-salame',
        'only_matching': True,
    }]

    # Key of the path-API metadata that holds the first page of items
    _METADATA_KEY = 'documents'

    def _call_api(self, profile_id, cursor, page_num):
        """Download the page of a person's documents identified by `cursor`.

        The response's ('pagination', 'next') value is additionally stored
        under the top-level 'next' key before the response is returned.
        """
        api_url = f'https://www.radiofrance.fr/api/v2.1/taxonomy/{profile_id}/documents'
        request_query = {
            'relation': 'personality',
            'cursor': cursor,
        }
        page = self._download_json(
            api_url, profile_id, note=f'Downloading page {page_num}', query=request_query)
        page['next'] = traverse_obj(page, ('pagination', 'next'))
        return page
class RadioFranceProgramScheduleIE(RadioFranceBaseIE):
    # Programme grid of a station, optionally for a given ?date=
    _VALID_URL = rf'''(?x)
        {RadioFranceBaseIE._VALID_URL_BASE}
        /(?P<station>{RadioFranceBaseIE._STATIONS_RE})
        /grille-programmes(?:\?date=(?P<date>[\d-]+))?
    '''

    _TESTS = [{
        'url': 'https://www.radiofrance.fr/franceinter/grille-programmes?date=17-02-2023',
        'info_dict': {
            'id': 'franceinter-program-20230217',
            'upload_date': '20230217',
        },
        'playlist_count': 25,
    }, {
        'url': 'https://www.radiofrance.fr/franceculture/grille-programmes?date=01-02-2023',
        'info_dict': {
            'id': 'franceculture-program-20230201',
            'upload_date': '20230201',
        },
        'playlist_count': 25,
    }, {
        'url': 'https://www.radiofrance.fr/mouv/grille-programmes?date=19-03-2023',
        'info_dict': {
            'id': 'mouv-program-20230319',
            'upload_date': '20230319',
        },
        'playlist_count': 3,
    }, {
        'url': 'https://www.radiofrance.fr/francemusique/grille-programmes?date=18-03-2023',
        'info_dict': {
            'id': 'francemusique-program-20230318',
            'upload_date': '20230318',
        },
        'playlist_count': 15,
    }, {
        'url': 'https://www.radiofrance.fr/franceculture/grille-programmes',
        'only_matching': True,
    }]

    def _generate_playlist_entries(self, webpage_url, api_response):
        """Yield a url_transparent result for each grid step that has an expression path."""
        # Steps lacking ['expression']['path'] are dropped by the lambda filter
        steps = traverse_obj(api_response, ('steps', lambda _, v: v['expression']['path']))
        for step in steps:
            episode_url = urljoin(webpage_url, f'/{step["expression"]["path"]}')
            step_info = traverse_obj(step, {
                'title': ('expression', 'title'),
                'thumbnail': ('expression', 'visual', 'src'),
                'timestamp': ('startTime', {int_or_none}),
                'series_id': ('concept', 'id'),
                'series': ('concept', 'title'),
            })
            yield self.url_result(
                episode_url, ie=FranceCultureIE, url_transparent=True, **step_info)

    def _real_extract(self, url):
        """Read the 'grid' data from the schedule page and return it as a playlist."""
        station = self._match_valid_url(url).group('station')

        page = self._download_webpage(url, station)
        grid = self._extract_data_from_webpage(page, station, 'grid')
        # The date comes from the grid data itself, not from the URL
        upload_date = strftime_or_none(grid.get('date'), '%Y%m%d')

        playlist_id = join_nonempty(station, 'program', upload_date)
        return self.playlist_result(
            self._generate_playlist_entries(url, grid), playlist_id, upload_date=upload_date)

View File

@ -1,10 +1,11 @@
import re
from .common import InfoExtractor
from ..utils import parse_qs, remove_start, traverse_obj, ExtractorError
class RbgTumIE(InfoExtractor):
_VALID_URL = r'https://live\.rbg\.tum\.de/w/(?P<id>.+)'
_VALID_URL = r'https://(?:live\.rbg\.tum\.de|tum\.live)/w/(?P<id>[^?#]+)'
_TESTS = [{
# Combined view
'url': 'https://live.rbg.tum.de/w/cpp/22128',
@ -35,16 +36,18 @@ class RbgTumIE(InfoExtractor):
'title': 'Fachschaftsvollversammlung',
'series': 'Fachschaftsvollversammlung Informatik',
}
}, {
'url': 'https://tum.live/w/linalginfo/27102',
'only_matching': True,
}, ]
def _real_extract(self, url):
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
m3u8 = self._html_search_regex(r'(https://.+?\.m3u8)', webpage, 'm3u8')
lecture_title = self._html_search_regex(r'(?si)<h1.*?>(.*)</h1>', webpage, 'title')
lecture_series_title = self._html_search_regex(
r'(?s)<title\b[^>]*>\s*(?:TUM-Live\s\|\s?)?([^:]+):?.*?</title>', webpage, 'series')
m3u8 = self._html_search_regex(r'"(https://[^"]+\.m3u8[^"]*)', webpage, 'm3u8')
lecture_title = self._html_search_regex(r'<h1[^>]*>([^<]+)</h1>', webpage, 'title', fatal=False)
lecture_series_title = remove_start(self._html_extract_title(webpage), 'TUM-Live | ')
formats = self._extract_m3u8_formats(m3u8, video_id, 'mp4', entry_protocol='m3u8_native', m3u8_id='hls')
@ -57,9 +60,9 @@ class RbgTumIE(InfoExtractor):
class RbgTumCourseIE(InfoExtractor):
_VALID_URL = r'https://live\.rbg\.tum\.de/course/(?P<id>.+)'
_VALID_URL = r'https://(?P<hostname>(?:live\.rbg\.tum\.de|tum\.live))/old/course/(?P<id>(?P<year>\d+)/(?P<term>\w+)/(?P<slug>[^/?#]+))'
_TESTS = [{
'url': 'https://live.rbg.tum.de/course/2022/S/fpv',
'url': 'https://live.rbg.tum.de/old/course/2022/S/fpv',
'info_dict': {
'title': 'Funktionale Programmierung und Verifikation (IN0003)',
'id': '2022/S/fpv',
@ -69,7 +72,7 @@ class RbgTumCourseIE(InfoExtractor):
},
'playlist_count': 13,
}, {
'url': 'https://live.rbg.tum.de/course/2022/W/set',
'url': 'https://live.rbg.tum.de/old/course/2022/W/set',
'info_dict': {
'title': 'SET FSMPIC',
'id': '2022/W/set',
@ -78,16 +81,62 @@ class RbgTumCourseIE(InfoExtractor):
'noplaylist': False,
},
'playlist_count': 6,
}, {
'url': 'https://tum.live/old/course/2023/S/linalginfo',
'only_matching': True,
}, ]
def _real_extract(self, url):
course_id = self._match_id(url)
webpage = self._download_webpage(url, course_id)
course_id, hostname, year, term, slug = self._match_valid_url(url).group('id', 'hostname', 'year', 'term', 'slug')
meta = self._download_json(
f'https://{hostname}/api/courses/{slug}/', course_id, fatal=False,
query={'year': year, 'term': term}) or {}
lecture_series_title = meta.get('Name')
lectures = [self.url_result(f'https://{hostname}/w/{slug}/{stream_id}', RbgTumIE)
for stream_id in traverse_obj(meta, ('Streams', ..., 'ID'))]
lecture_series_title = self._html_search_regex(r'(?si)<h1.*?>(.*)</h1>', webpage, 'title')
if not lectures:
webpage = self._download_webpage(url, course_id)
lecture_series_title = remove_start(self._html_extract_title(webpage), 'TUM-Live | ')
lectures = [self.url_result(f'https://{hostname}{lecture_path}', RbgTumIE)
for lecture_path in re.findall(r'href="(/w/[^/"]+/[^/"]+)"', webpage)]
lecture_urls = []
for lecture_url in re.findall(r'(?i)href="/w/(.+)(?<!/cam)(?<!/pres)(?<!/chat)"', webpage):
lecture_urls.append(self.url_result('https://live.rbg.tum.de/w/' + lecture_url, ie=RbgTumIE.ie_key()))
return self.playlist_result(lectures, course_id, lecture_series_title)
return self.playlist_result(lecture_urls, course_id, lecture_series_title)
class RbgTumNewCourseIE(InfoExtractor):
    # Course URLs that carry year/term/slug as query parameters
    _VALID_URL = r'https://(?P<hostname>(?:live\.rbg\.tum\.de|tum\.live))/\?'
    _TESTS = [{
        'url': 'https://live.rbg.tum.de/?year=2022&term=S&slug=fpv&view=3',
        'info_dict': {
            'title': 'Funktionale Programmierung und Verifikation (IN0003)',
            'id': '2022/S/fpv',
        },
        'params': {
            'noplaylist': False,
        },
        'playlist_count': 13,
    }, {
        'url': 'https://live.rbg.tum.de/?year=2022&term=W&slug=set&view=3',
        'info_dict': {
            'title': 'SET FSMPIC',
            'id': '2022/W/set',
        },
        'params': {
            'noplaylist': False,
        },
        'playlist_count': 6,
    }, {
        'url': 'https://tum.live/?year=2023&term=S&slug=linalginfo&view=3',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Rewrite the query-style URL to the /old/course/ form and delegate to RbgTumCourseIE.

        Raises ExtractorError when any of year/term/slug is absent or empty.
        """
        required = ('year', 'term', 'slug')
        params = parse_qs(url)

        missing = [name for name in required if not params.get(name)]
        if missing:
            raise ExtractorError(f'Input URL is missing query parameters: {", ".join(missing)}')

        # Only the first value of each parameter is used
        year, term, slug = (params[name][0] for name in required)
        hostname = self._match_valid_url(url).group('hostname')

        course_url = f'https://{hostname}/old/course/{year}/{term}/{slug}'
        return self.url_result(course_url, RbgTumCourseIE)

View File

@ -319,16 +319,20 @@ class RedditIE(InfoExtractor):
'format_id': 'fallback',
'format_note': 'DASH video, mp4_dash',
}]
formats.extend(self._extract_m3u8_formats(
hls_playlist_url, display_id, 'mp4', m3u8_id='hls', fatal=False))
formats.extend(self._extract_mpd_formats(
dash_playlist_url, display_id, mpd_id='dash', fatal=False))
hls_fmts, subtitles = self._extract_m3u8_formats_and_subtitles(
hls_playlist_url, display_id, 'mp4', m3u8_id='hls', fatal=False)
formats.extend(hls_fmts)
dash_fmts, dash_subs = self._extract_mpd_formats_and_subtitles(
dash_playlist_url, display_id, mpd_id='dash', fatal=False)
formats.extend(dash_fmts)
self._merge_subtitles(dash_subs, target=subtitles)
return {
**info,
'id': video_id,
'display_id': display_id,
'formats': formats,
'subtitles': subtitles,
'duration': int_or_none(reddit_video.get('duration')),
}

View File

@ -1,6 +1,7 @@
from .common import InfoExtractor
from ..utils import (
ExtractorError,
int_or_none,
parse_duration,
traverse_obj,
unified_timestamp,
@ -25,7 +26,7 @@ class RTVSLOIE(InfoExtractor):
'url': 'https://www.rtvslo.si/rtv365/arhiv/174842550?s=tv',
'info_dict': {
'id': '174842550',
'ext': 'flv',
'ext': 'mp4',
'release_timestamp': 1643140032,
'upload_date': '20220125',
'series': 'Dnevnik',
@ -69,7 +70,21 @@ class RTVSLOIE(InfoExtractor):
'tbr': 128000,
'release_date': '20220201',
},
}, {
'url': 'https://365.rtvslo.si/arhiv/razred-zase/148350750',
'info_dict': {
'id': '148350750',
'ext': 'mp4',
'title': 'Prvi šolski dan, mozaična oddaja za mlade',
'series': 'Razred zase',
'series_id': '148185730',
'duration': 1481,
'upload_date': '20121019',
'timestamp': 1350672122,
'release_date': '20121019',
'release_timestamp': 1350672122,
'thumbnail': 'https://img.rtvcdn.si/_up/ava/ava_misc/show_logos/148185730/razred_zase_2014_logo_4d_wide2.jpg',
},
}, {
'url': 'https://4d.rtvslo.si/arhiv/dnevnik/174842550',
'only_matching': True
@ -98,13 +113,14 @@ class RTVSLOIE(InfoExtractor):
media = self._download_json(self._API_BASE.format('getMedia', v_id), v_id, query={'jwt': jwt})['response']
formats = []
skip_protocols = ['smil', 'f4m', 'dash']
adaptive_url = traverse_obj(media, ('addaptiveMedia', 'hls_sec'), expected_type=url_or_none)
if adaptive_url:
formats = self._extract_wowza_formats(adaptive_url, v_id, skip_protocols=['smil'])
formats = self._extract_wowza_formats(adaptive_url, v_id, skip_protocols=skip_protocols)
adaptive_url = traverse_obj(media, ('addaptiveMedia_sl', 'hls_sec'), expected_type=url_or_none)
if adaptive_url:
for f in self._extract_wowza_formats(adaptive_url, v_id, skip_protocols=['smil']):
for f in self._extract_wowza_formats(adaptive_url, v_id, skip_protocols=skip_protocols):
formats.append({
**f,
'format_id': 'sign-' + f['format_id'],
@ -114,19 +130,19 @@ class RTVSLOIE(InfoExtractor):
else f.get('language'))
})
formats.extend(
{
'url': f['streams'][strm],
'ext': traverse_obj(f, 'mediaType', expected_type=str.lower),
'width': f.get('width'),
'height': f.get('height'),
'tbr': f.get('bitrate'),
'filesize': f.get('filesize'),
}
for strm in ('http', 'https')
for f in media.get('mediaFiles') or []
if traverse_obj(f, ('streams', strm))
)
for mediafile in traverse_obj(media, ('mediaFiles', lambda _, v: url_or_none(v['streams']['https']))):
formats.append(traverse_obj(mediafile, {
'url': ('streams', 'https'),
'ext': ('mediaType', {str.lower}),
'width': ('width', {int_or_none}),
'height': ('height', {int_or_none}),
'tbr': ('bitrate', {int_or_none}),
'filesize': ('filesize', {int_or_none}),
}))
for mediafile in traverse_obj(media, ('mediaFiles', lambda _, v: url_or_none(v['streams']['hls_sec']))):
formats.extend(self._extract_wowza_formats(
mediafile['streams']['hls_sec'], v_id, skip_protocols=skip_protocols))
if any('intermission.mp4' in x['url'] for x in formats):
self.raise_geo_restricted(countries=self._GEO_COUNTRIES, metadata_available=True)

View File

@ -1,6 +1,6 @@
import re
from ..utils import parse_duration
from ..utils import parse_duration, unescapeHTML
from .common import InfoExtractor
@ -16,7 +16,8 @@ class Rule34VideoIE(InfoExtractor):
'title': 'Shot It-(mmd hmv)',
'thumbnail': 'https://rule34video.com/contents/videos_screenshots/3065000/3065157/preview.jpg',
'duration': 347.0,
'age_limit': 18
'age_limit': 18,
'tags': 'count:14'
}
},
{
@ -28,7 +29,8 @@ class Rule34VideoIE(InfoExtractor):
'title': 'Lara in Trouble Ep. 7 [WildeerStudio]',
'thumbnail': 'https://rule34video.com/contents/videos_screenshots/3065000/3065296/preview.jpg',
'duration': 938.0,
'age_limit': 18
'age_limit': 18,
'tags': 'count:50'
}
},
]
@ -57,5 +59,7 @@ class Rule34VideoIE(InfoExtractor):
'title': title,
'thumbnail': thumbnail,
'duration': parse_duration(duration),
'age_limit': 18
'age_limit': 18,
'tags': list(map(unescapeHTML, re.findall(
r'<a class="tag_item"[^>]+\bhref="https://rule34video\.com/tags/\d+/"[^>]*>(?P<tag>[^>]*)</a>', webpage))),
}

View File

@ -33,7 +33,7 @@ class RumbleEmbedIE(InfoExtractor):
'upload_date': '20191020',
'channel_url': 'https://rumble.com/c/WMAR',
'channel': 'WMAR',
'thumbnail': 'https://sp.rmbl.ws/s8/1/5/M/z/1/5Mz1a.OvCc-small-WMAR-2-News-Latest-Headline.jpg',
'thumbnail': 'https://sp.rmbl.ws/s8/1/5/M/z/1/5Mz1a.qR4e-small-WMAR-2-News-Latest-Headline.jpg',
'duration': 234,
'uploader': 'WMAR',
'live_status': 'not_live',
@ -84,7 +84,7 @@ class RumbleEmbedIE(InfoExtractor):
'info_dict': {
'id': 'v1essrt',
'ext': 'mp4',
'title': 'startswith:lofi hip hop radio - beats to relax/study',
'title': 'startswith:lofi hip hop radio 📚 - beats to relax/study to',
'timestamp': 1661519399,
'upload_date': '20220826',
'channel_url': 'https://rumble.com/c/LofiGirl',
@ -99,7 +99,7 @@ class RumbleEmbedIE(InfoExtractor):
'url': 'https://rumble.com/embed/v1amumr',
'info_dict': {
'id': 'v1amumr',
'ext': 'webm',
'ext': 'mp4',
'fps': 60,
'title': 'Turning Point USA 2022 Student Action Summit DAY 1 - Rumble Exclusive Live',
'timestamp': 1658518457,
@ -129,7 +129,7 @@ class RumbleEmbedIE(InfoExtractor):
'duration': 92,
'title': '911 Audio From The Man Who Wanted To Kill Supreme Court Justice Kavanaugh',
'channel_url': 'https://rumble.com/c/RichSementa',
'thumbnail': 'https://sp.rmbl.ws/s8/1/P/j/f/A/PjfAe.OvCc-small-911-Audio-From-The-Man-Who-.jpg',
'thumbnail': 'https://sp.rmbl.ws/s8/1/P/j/f/A/PjfAe.qR4e-small-911-Audio-From-The-Man-Who-.jpg',
'timestamp': 1654892716,
'uploader': 'Mr Producer Media',
'upload_date': '20220610',
@ -144,7 +144,7 @@ class RumbleEmbedIE(InfoExtractor):
if embeds:
return embeds
return [f'https://rumble.com/embed/{mobj.group("id")}' for mobj in re.finditer(
r'<script>[^<]*\bRumble\(\s*"play"\s*,\s*{\s*[\'"]?video[\'"]?\s*:\s*[\'"](?P<id>[0-9a-z]+)[\'"]', webpage)]
r'<script>[^<]*\bRumble\(\s*"play"\s*,\s*{[^}]*[\'"]?video[\'"]?\s*:\s*[\'"](?P<id>[0-9a-z]+)[\'"]', webpage)]
def _real_extract(self, url):
video_id = self._match_id(url)
@ -236,7 +236,9 @@ class RumbleEmbedIE(InfoExtractor):
class RumbleIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?rumble\.com/(?P<id>v(?!ideos)[\w.-]+)[^/]*$'
_EMBED_REGEX = [r'<a class=video-item--a href=(?P<url>/v[\w.-]+\.html)>']
_EMBED_REGEX = [
r'<a class=video-item--a href=(?P<url>/v[\w.-]+\.html)>',
r'<a[^>]+class="videostream__link link"[^>]+href=(?P<url>/v[\w.-]+\.html)[^>]*>']
_TESTS = [{
'add_ie': ['RumbleEmbed'],
'url': 'https://rumble.com/vdmum1-moose-the-dog-helps-girls-dig-a-snow-fort.html',
@ -254,6 +256,7 @@ class RumbleIE(InfoExtractor):
'thumbnail': r're:https://.+\.jpg',
'duration': 103,
'like_count': int,
'dislike_count': int,
'view_count': int,
'live_status': 'not_live',
}
@ -278,6 +281,9 @@ class RumbleIE(InfoExtractor):
'channel_url': 'https://rumble.com/c/Redacted',
'live_status': 'not_live',
'thumbnail': 'https://sp.rmbl.ws/s8/1/d/x/2/O/dx2Oi.qR4e-small-The-U.S.-CANNOT-hide-this-i.jpg',
'like_count': int,
'dislike_count': int,
'view_count': int,
},
}, {
'url': 'https://rumble.com/v2e7fju-the-covid-twitter-files-drop-protecting-fauci-while-censoring-the-truth-wma.html',
@ -296,12 +302,15 @@ class RumbleIE(InfoExtractor):
'channel_url': 'https://rumble.com/c/KimIversen',
'channel': 'Kim Iversen',
'thumbnail': 'https://sp.rmbl.ws/s8/1/6/b/w/O/6bwOi.qR4e-small-The-Covid-Twitter-Files-Dro.jpg',
'like_count': int,
'dislike_count': int,
'view_count': int,
},
}]
_WEBPAGE_TESTS = [{
'url': 'https://rumble.com/videos?page=2',
'playlist_count': 25,
'playlist_mincount': 24,
'info_dict': {
'id': 'videos?page=2',
'title': 'All videos',
@ -309,17 +318,16 @@ class RumbleIE(InfoExtractor):
'age_limit': 0,
},
}, {
'url': 'https://rumble.com/live-videos',
'playlist_mincount': 19,
'url': 'https://rumble.com/browse/live',
'playlist_mincount': 25,
'info_dict': {
'id': 'live-videos',
'title': 'Live Videos',
'description': 'Live videos on Rumble.com',
'id': 'live',
'title': 'Browse',
'age_limit': 0,
},
}, {
'url': 'https://rumble.com/search/video?q=rumble&sort=views',
'playlist_count': 24,
'playlist_mincount': 24,
'info_dict': {
'id': 'video?q=rumble&sort=views',
'title': 'Search results for: rumble',
@ -334,19 +342,20 @@ class RumbleIE(InfoExtractor):
if not url_info:
raise UnsupportedError(url)
release_ts_str = self._search_regex(
r'(?:Livestream begins|Streamed on):\s+<time datetime="([^"]+)',
webpage, 'release date', fatal=False, default=None)
view_count_str = self._search_regex(r'<span class="media-heading-info">([\d,]+) Views',
webpage, 'view count', fatal=False, default=None)
return self.url_result(
url_info['url'], ie_key=url_info['ie_key'], url_transparent=True,
view_count=parse_count(view_count_str),
release_timestamp=parse_iso8601(release_ts_str),
like_count=parse_count(get_element_by_class('rumbles-count', webpage)),
description=clean_html(get_element_by_class('media-description', webpage)),
)
return {
'_type': 'url_transparent',
'ie_key': url_info['ie_key'],
'url': url_info['url'],
'release_timestamp': parse_iso8601(self._search_regex(
r'(?:Livestream begins|Streamed on):\s+<time datetime="([^"]+)', webpage, 'release date', default=None)),
'view_count': int_or_none(self._search_regex(
r'"userInteractionCount"\s*:\s*(\d+)', webpage, 'view count', default=None)),
'like_count': parse_count(self._search_regex(
r'<span data-js="rumbles_up_votes">\s*([\d,.KM]+)', webpage, 'like count', default=None)),
'dislike_count': parse_count(self._search_regex(
r'<span data-js="rumbles_down_votes">\s*([\d,.KM]+)', webpage, 'dislike count', default=None)),
'description': clean_html(get_element_by_class('media-description', webpage))
}
class RumbleChannelIE(InfoExtractor):

View File

@ -1,5 +1,5 @@
from .common import InfoExtractor
from ..utils import traverse_obj
from ..utils import traverse_obj, url_or_none
class S4CIE(InfoExtractor):
@ -11,7 +11,8 @@ class S4CIE(InfoExtractor):
'ext': 'mp4',
'title': 'Y Swn',
'description': 'md5:f7681a30e4955b250b3224aa9fe70cf0',
'duration': 5340
'duration': 5340,
'thumbnail': 'https://www.s4c.cymru/amg/1920x1080/Y_Swn_2023S4C_099_ii.jpg'
},
}, {
'url': 'https://www.s4c.cymru/clic/programme/856636948',
@ -21,6 +22,7 @@ class S4CIE(InfoExtractor):
'title': 'Am Dro',
'duration': 2880,
'description': 'md5:100d8686fc9a632a0cb2db52a3433ffe',
'thumbnail': 'https://www.s4c.cymru/amg/1920x1080/Am_Dro_2022-23S4C_P6_4005.jpg'
},
}]
@ -30,7 +32,7 @@ class S4CIE(InfoExtractor):
f'https://www.s4c.cymru/df/full_prog_details?lang=e&programme_id={video_id}',
video_id, fatal=False)
filename = self._download_json(
player_config = self._download_json(
'https://player-api.s4c-cdn.co.uk/player-configuration/prod', video_id, query={
'programme_id': video_id,
'signed': '0',
@ -38,7 +40,13 @@ class S4CIE(InfoExtractor):
'mode': 'od',
'appId': 'clic',
'streamName': '',
}, note='Downloading player config JSON')['filename']
}, note='Downloading player config JSON')
subtitles = {}
for sub in traverse_obj(player_config, ('subtitles', lambda _, v: url_or_none(v['0']))):
subtitles.setdefault(sub.get('3', 'en'), []).append({
'url': sub['0'],
'name': sub.get('1'),
})
m3u8_url = self._download_json(
'https://player-api.s4c-cdn.co.uk/streaming-urls/prod', video_id, query={
'mode': 'od',
@ -46,17 +54,52 @@ class S4CIE(InfoExtractor):
'region': 'WW',
'extra': 'false',
'thirdParty': 'false',
'filename': filename,
'filename': player_config['filename'],
}, note='Downloading streaming urls JSON')['hls']
formats, subtitles = self._extract_m3u8_formats_and_subtitles(m3u8_url, video_id, 'mp4', m3u8_id='hls')
return {
'id': video_id,
'formats': formats,
'formats': self._extract_m3u8_formats(m3u8_url, video_id, 'mp4', m3u8_id='hls'),
'subtitles': subtitles,
'thumbnail': url_or_none(player_config.get('poster')),
**traverse_obj(details, ('full_prog_details', 0, {
'title': (('programme_title', 'series_title'), {str}),
'description': ('full_billing', {str.strip}),
'duration': ('duration', {lambda x: int(x) * 60}),
}), get_all=False),
}
class S4CSeriesIE(InfoExtractor):
    # Series pages: /clic/series/<numeric id>
    _VALID_URL = r'https?://(?:www\.)?s4c\.cymru/clic/series/(?P<id>\d+)'
    # NOTE(review): both tests expect a playlist 'description', but _real_extract
    # below passes only the id and title to playlist_result - confirm that these
    # expectations can actually be satisfied.
    _TESTS = [{
        'url': 'https://www.s4c.cymru/clic/series/864982911',
        'playlist_mincount': 6,
        'info_dict': {
            'id': '864982911',
            'title': 'Iaith ar Daith',
            'description': 'md5:e878ebf660dce89bd2ef521d7ce06397'
        },
    }, {
        'url': 'https://www.s4c.cymru/clic/series/866852587',
        'playlist_mincount': 8,
        'info_dict': {
            'id': '866852587',
            'title': 'FFIT Cymru',
            'description': 'md5:abcb3c129cb68dbb6cd304fd33b07e96'
        },
    }]

    def _real_extract(self, url):
        """Return a playlist with one S4CIE entry per programme id in the series details."""
        series_id = self._match_id(url)
        details_query = {
            'lang': 'e',
            'series_id': series_id,
            'show_prog_in_series': 'Y'
        }
        series_details = self._download_json(
            'https://www.s4c.cymru/df/series_details', series_id,
            query=details_query, note='Downloading series details JSON')

        entries = []
        for episode_id in traverse_obj(series_details, ('other_progs_in_series', ..., 'id')):
            episode_url = f'https://www.s4c.cymru/clic/programme/{episode_id}'
            entries.append(self.url_result(episode_url, S4CIE, episode_id))

        series_title = traverse_obj(
            series_details, ('full_prog_details', 0, 'series_title', {str}))
        return self.playlist_result(entries, series_id, series_title)

View File

@ -1,3 +1,4 @@
import base64
import re
from .common import InfoExtractor
@ -8,7 +9,12 @@ from ..compat import (
from ..utils import (
ExtractorError,
int_or_none,
float_or_none,
url_or_none,
unified_timestamp,
try_get,
urljoin,
traverse_obj,
)
@ -31,13 +37,20 @@ class SohuIE(InfoExtractor):
'id': '409385080',
'ext': 'mp4',
'title': '《2015湖南卫视羊年元宵晚会》唐嫣《花好月圆》',
}
},
'skip': 'no longer available',
}, {
'url': 'http://my.tv.sohu.com/us/232799889/78693464.shtml',
'info_dict': {
'id': '78693464',
'ext': 'mp4',
'title': '【爱范品】第31期MWC见不到的奇葩手机',
'uploader': '爱范儿视频',
'duration': 213,
'timestamp': 1425519600,
'upload_date': '20150305',
'thumbnail': 'http://e3f49eaa46b57.cdn.sohucs.com//group1/M10/83/FA/MTAuMTAuODguODA=/6_14cbccdde5eg104SysCutcloud_78693464_7_0b.jpg',
'tags': ['爱范儿', '爱范品', 'MWC', '手机'],
}
}, {
'note': 'Multipart video',
@ -45,6 +58,12 @@ class SohuIE(InfoExtractor):
'info_dict': {
'id': '78910339',
'title': '【神探苍实战秘籍】第13期 战争之影 赫卡里姆',
'uploader': '小苍cany',
'duration': 744.0,
'timestamp': 1426269360,
'upload_date': '20150313',
'thumbnail': 'http://e3f49eaa46b57.cdn.sohucs.com//group1/M11/89/57/MTAuMTAuODguODA=/6_14cea022a1dg102SysCutcloud_78910339_8_0b.jpg',
'tags': ['小苍MM', '英雄联盟', '实战秘籍'],
},
'playlist': [{
'info_dict': {
@ -75,6 +94,11 @@ class SohuIE(InfoExtractor):
'id': '78932792',
'ext': 'mp4',
'title': 'youtube-dl testing video',
'duration': 360,
'timestamp': 1426348620,
'upload_date': '20150314',
'thumbnail': 'http://e3f49eaa46b57.cdn.sohucs.com//group1/M02/8A/00/MTAuMTAuODguNzk=/6_14cee1be192g102SysCutcloud_78932792_7_7b.jpg',
'tags': [],
},
'params': {
'skip_download': True
@ -100,7 +124,7 @@ class SohuIE(InfoExtractor):
webpage = self._download_webpage(url, video_id)
title = re.sub(r' - 搜狐视频$', '', self._og_search_title(webpage))
title = re.sub(r'( - 高清正版在线观看)? - 搜狐视频$', '', self._og_search_title(webpage))
vid = self._html_search_regex(
r'var vid ?= ?["\'](\d+)["\']',
@ -132,7 +156,9 @@ class SohuIE(InfoExtractor):
allot = format_data['allot']
data = format_data['data']
clips_url = data['clipsURL']
clip_url = traverse_obj(data, (('clipsURL', 'mp4PlayUrl'), i, {url_or_none}), get_all=False)
if not clip_url:
raise ExtractorError(f'Unable to extract url for clip {i}')
su = data['su']
video_url = 'newflv.sohu.ccgslb.net'
@ -142,9 +168,9 @@ class SohuIE(InfoExtractor):
while 'newflv.sohu.ccgslb.net' in video_url:
params = {
'prot': 9,
'file': clips_url[i],
'file': clip_url,
'new': su[i],
'prod': 'flash',
'prod': 'h5n',
'rb': 1,
}
@ -193,6 +219,75 @@ class SohuIE(InfoExtractor):
'entries': playlist,
'id': video_id,
'title': title,
'duration': traverse_obj(vid_data, ('data', 'totalDuration', {float_or_none})),
}
return info
if mytv:
publish_time = unified_timestamp(self._search_regex(
r'publishTime:\s*["\'](\d+-\d+-\d+ \d+:\d+)["\']', webpage, 'publish time', fatal=False))
else:
publish_time = traverse_obj(vid_data, ('tv_application_time', {unified_timestamp}))
return {
'timestamp': publish_time - 8 * 3600 if publish_time else None,
**traverse_obj(vid_data, {
'alt_title': ('data', 'subName', {str}),
'uploader': ('wm_data', 'wm_username', {str}),
'thumbnail': ('data', 'coverImg', {url_or_none}),
'tags': ('data', 'tag', {str.split}),
}),
**info,
}
class SohuVIE(InfoExtractor):
    # tv.sohu.com/v/<id>.html URLs, where <id> is a URL-safe base64 encoded path
    _VALID_URL = r'https?://tv\.sohu\.com/v/(?P<id>[\w=-]+)\.html(?:$|[#?])'

    _TESTS = [{
        'note': 'Multipart video',
        'url': 'https://tv.sohu.com/v/MjAyMzA2MTQvbjYwMTMxNTE5Mi5zaHRtbA==.html',
        'info_dict': {
            'id': '601315192',
            'title': '《淬火丹心》第1集',
            'alt_title': '“点天灯”发生事故',
            'duration': 2701.692,
            'timestamp': 1686758040,
            'upload_date': '20230614',
            'thumbnail': 'http://photocdn.tv.sohu.com/img/20230614/vrsa_hor_1686738763256_454010551.jpg',
        },
        'playlist_mincount': 9,
        'skip': 'Only available in China',
    }, {
        'url': 'https://tv.sohu.com/v/dXMvMjMyNzk5ODg5Lzc4NjkzNDY0LnNodG1s.html',
        'info_dict': {
            'id': '78693464',
            'ext': 'mp4',
            'title': '【爱范品】第31期MWC见不到的奇葩手机',
            'uploader': '爱范儿视频',
            'duration': 213,
            'timestamp': 1425519600,
            'upload_date': '20150305',
            'thumbnail': 'http://e3f49eaa46b57.cdn.sohucs.com//group1/M10/83/FA/MTAuMTAuODguODA=/6_14cbccdde5eg104SysCutcloud_78693464_7_0b.jpg',
            'tags': ['爱范儿', '爱范品', 'MWC', '手机'],
        }
    }, {
        'note': 'Multipart video',
        'url': 'https://tv.sohu.com/v/dXMvMjQyNTYyMTYzLzc4OTEwMzM5LnNodG1s.html?src=pl',
        'info_dict': {
            'id': '78910339',
            'title': '【神探苍实战秘籍】第13期 战争之影 赫卡里姆',
            'uploader': '小苍cany',
            'duration': 744.0,
            'timestamp': 1426269360,
            'upload_date': '20150313',
            'thumbnail': 'http://e3f49eaa46b57.cdn.sohucs.com//group1/M11/89/57/MTAuMTAuODguODA=/6_14cea022a1dg102SysCutcloud_78910339_8_0b.jpg',
            'tags': ['小苍MM', '英雄联盟', '实战秘籍'],
        },
        'playlist_mincount': 3,
    }]

    def _real_extract(self, url):
        """Decode the base64 path from the URL and hand the resulting URL to SohuIE."""
        decoded_path = base64.urlsafe_b64decode(self._match_id(url)).decode()

        # Paths shaped like "<digits>/n<digits>.shtml" go to tv.sohu.com,
        # everything else to my.tv.sohu.com
        if re.match(r'\d+/n\d+\.shtml', decoded_path):
            subdomain = 'tv'
        else:
            subdomain = 'my.tv'

        target_url = urljoin(f'http://{subdomain}.sohu.com/', decoded_path)
        return self.url_result(target_url, SohuIE)

View File

@ -15,7 +15,6 @@ from ..utils import (
UserNotLive,
determine_ext,
format_field,
get_element_by_id,
get_first,
int_or_none,
join_nonempty,
@ -50,8 +49,9 @@ class TikTokBaseIE(InfoExtractor):
return f'https://www.tiktok.com/@{user_id or "_"}/video/{video_id}'
def _get_sigi_state(self, webpage, display_id):
return self._parse_json(get_element_by_id(
'SIGI_STATE|sigi-persisted-data', webpage, escape_value=False), display_id)
return self._search_json(
r'<script[^>]+\bid="(?:SIGI_STATE|sigi-persisted-data)"[^>]*>', webpage,
'sigi state', display_id, end_pattern=r'</script>')
def _call_api_impl(self, ep, query, manifest_app_version, video_id, fatal=True,
note='Downloading API JSON', errnote='Unable to download API page'):

View File

@ -1,10 +1,14 @@
import urllib.parse
from .common import InfoExtractor
from ..utils import (
determine_ext,
extract_attributes,
int_or_none,
parse_duration,
traverse_obj,
try_get,
url_or_none,
)
@ -12,6 +16,36 @@ class TV5MondePlusIE(InfoExtractor):
IE_DESC = 'TV5MONDE+'
_VALID_URL = r'https?://(?:www\.)?(?:tv5mondeplus|revoir\.tv5monde)\.com/toutes-les-videos/[^/]+/(?P<id>[^/?#]+)'
_TESTS = [{
# movie
'url': 'https://revoir.tv5monde.com/toutes-les-videos/cinema/les-novices',
'md5': 'c86f60bf8b75436455b1b205f9745955',
'info_dict': {
'id': 'ZX0ipMyFQq_6D4BA7b',
'display_id': 'les-novices',
'ext': 'mp4',
'title': 'Les novices',
'description': 'md5:2e7c33ba3ad48dabfcc2a956b88bde2b',
'upload_date': '20230821',
'thumbnail': 'https://revoir.tv5monde.com/uploads/media/video_thumbnail/0738/60/01e952b7ccf36b7c6007ec9131588954ab651de9.jpeg',
'duration': 5177,
'episode': 'Les novices',
},
}, {
# series episode
'url': 'https://revoir.tv5monde.com/toutes-les-videos/series-fictions/opj-les-dents-de-la-terre-2',
'info_dict': {
'id': 'wJ0eeEPozr_6D4BA7b',
'display_id': 'opj-les-dents-de-la-terre-2',
'ext': 'mp4',
'title': "OPJ - Les dents de la Terre (2)",
'description': 'md5:288f87fd68d993f814e66e60e5302d9d',
'upload_date': '20230823',
'series': 'OPJ',
'episode': 'Les dents de la Terre (2)',
'duration': 2877,
'thumbnail': 'https://dl-revoir.tv5monde.com/images/1a/5753448.jpg'
},
}, {
# movie
'url': 'https://revoir.tv5monde.com/toutes-les-videos/cinema/ceux-qui-travaillent',
'md5': '32fa0cde16a4480d1251502a66856d5f',
@ -23,6 +57,7 @@ class TV5MondePlusIE(InfoExtractor):
'description': 'md5:570e8bb688036ace873b2d50d24c026d',
'upload_date': '20210819',
},
'skip': 'no longer available',
}, {
# series episode
'url': 'https://revoir.tv5monde.com/toutes-les-videos/series-fictions/vestiaires-caro-actrice',
@ -39,6 +74,7 @@ class TV5MondePlusIE(InfoExtractor):
'params': {
'skip_download': True,
},
'skip': 'no longer available',
}, {
'url': 'https://revoir.tv5monde.com/toutes-les-videos/series-fictions/neuf-jours-en-hiver-neuf-jours-en-hiver',
'only_matching': True,
@ -63,20 +99,45 @@ class TV5MondePlusIE(InfoExtractor):
video_files = self._parse_json(
vpl_data['data-broadcast'], display_id)
formats = []
for video_file in video_files:
v_url = video_file.get('url')
if not v_url:
continue
video_format = video_file.get('format') or determine_ext(v_url)
if video_format == 'm3u8':
formats.extend(self._extract_m3u8_formats(
v_url, display_id, 'mp4', 'm3u8_native',
m3u8_id='hls', fatal=False))
else:
formats.append({
'url': v_url,
'format_id': video_format,
})
video_id = None
def process_video_files(v):
nonlocal video_id
for video_file in v:
v_url = video_file.get('url')
if not v_url:
continue
if video_file.get('type') == 'application/deferred':
d_param = urllib.parse.quote(v_url)
token = video_file.get('token')
if not token:
continue
deferred_json = self._download_json(
f'https://api.tv5monde.com/player/asset/{d_param}/resolve?condenseKS=true', display_id,
note='Downloading deferred info', headers={'Authorization': f'Bearer {token}'}, fatal=False)
v_url = traverse_obj(deferred_json, (0, 'url', {url_or_none}))
if not v_url:
continue
# data-guid from the webpage isn't stable, use the material id from the json urls
video_id = self._search_regex(
r'materials/([\da-zA-Z]{10}_[\da-fA-F]{7})/', v_url, 'video id', default=None)
process_video_files(deferred_json)
video_format = video_file.get('format') or determine_ext(v_url)
if video_format == 'm3u8':
formats.extend(self._extract_m3u8_formats(
v_url, display_id, 'mp4', 'm3u8_native',
m3u8_id='hls', fatal=False))
elif video_format == 'mpd':
formats.extend(self._extract_mpd_formats(
v_url, display_id, fatal=False))
else:
formats.append({
'url': v_url,
'format_id': video_format,
})
process_video_files(video_files)
metadata = self._parse_json(
vpl_data['data-metadata'], display_id)
@ -100,10 +161,11 @@ class TV5MondePlusIE(InfoExtractor):
if upload_date:
upload_date = upload_date.replace('_', '')
video_id = self._search_regex(
(r'data-guid=["\']([\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})',
r'id_contenu["\']\s:\s*(\d+)'), webpage, 'video id',
default=display_id)
if not video_id:
video_id = self._search_regex(
(r'data-guid=["\']([\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})',
r'id_contenu["\']\s:\s*(\d+)'), webpage, 'video id',
default=display_id)
return {
'id': video_id,

View File

@ -22,7 +22,7 @@ from ..utils import (
class TwitCastingIE(InfoExtractor):
_VALID_URL = r'https?://(?:[^/]+\.)?twitcasting\.tv/(?P<uploader_id>[^/]+)/(?:movie|twplayer)/(?P<id>\d+)'
_VALID_URL = r'https?://(?:[^/?#]+\.)?twitcasting\.tv/(?P<uploader_id>[^/?#]+)/(?:movie|twplayer)/(?P<id>\d+)'
_M3U8_HEADERS = {
'Origin': 'https://twitcasting.tv',
'Referer': 'https://twitcasting.tv/',
@ -231,7 +231,7 @@ class TwitCastingIE(InfoExtractor):
class TwitCastingLiveIE(InfoExtractor):
_VALID_URL = r'https?://(?:[^/]+\.)?twitcasting\.tv/(?P<id>[^/]+)/?(?:[#?]|$)'
_VALID_URL = r'https?://(?:[^/?#]+\.)?twitcasting\.tv/(?P<id>[^/?#]+)/?(?:[#?]|$)'
_TESTS = [{
'url': 'https://twitcasting.tv/ivetesangalo',
'only_matching': True,
@ -265,8 +265,15 @@ class TwitCastingLiveIE(InfoExtractor):
class TwitCastingUserIE(InfoExtractor):
_VALID_URL = r'https?://(?:[^/]+\.)?twitcasting\.tv/(?P<id>[^/]+)/show/?(?:[#?]|$)'
# Matches a user's "show" or "archive" listing page; the only capture group is the named ``id``.
# The alternation is a non-capturing group ``(?:...)``. The previous ``(:?...)`` was a typo
# that created a stray capture group and also accepted a literal ":" before "show".
_VALID_URL = r'https?://(?:[^/?#]+\.)?twitcasting\.tv/(?P<id>[^/?#]+)/(?:show|archive)/?(?:[#?]|$)'
_TESTS = [{
'url': 'https://twitcasting.tv/natsuiromatsuri/archive/',
'info_dict': {
'id': 'natsuiromatsuri',
'title': 'natsuiromatsuri - Live History',
},
'playlist_mincount': 235,
}, {
'url': 'https://twitcasting.tv/noriyukicas/show',
'only_matching': True,
}]

View File

@ -1,9 +1,10 @@
import functools
import json
import random
import re
from .common import InfoExtractor
from .periscope import PeriscopeBaseIE, PeriscopeIE
from ..compat import functools # isort: split
from ..compat import (
compat_parse_qs,
compat_urllib_parse_unquote,
@ -147,10 +148,14 @@ class TwitterBaseIE(InfoExtractor):
def is_logged_in(self):
return bool(self._get_cookies(self._API_BASE).get('auth_token'))
@functools.cached_property
def _selected_api(self):
    """First value of the ``api`` extractor argument, looked up under the ``Twitter`` key.

    ``['graphql']`` is passed to ``_configuration_arg`` as its second
    argument (presumably the default used when the argument is not given).
    The result is cached on the instance by ``cached_property``.
    """
    api_choices = self._configuration_arg('api', ['graphql'], ie_key='Twitter')
    return api_choices[0]
def _fetch_guest_token(self, display_id):
guest_token = traverse_obj(self._download_json(
f'{self._API_BASE}guest/activate.json', display_id, 'Downloading guest token', data=b'',
headers=self._set_base_headers(legacy=display_id and self._configuration_arg('legacy_api'))),
headers=self._set_base_headers(legacy=display_id and self._selected_api == 'legacy')),
('guest_token', {str}))
if not guest_token:
raise ExtractorError('Could not retrieve guest token')
@ -295,7 +300,7 @@ class TwitterBaseIE(InfoExtractor):
self.report_login()
def _call_api(self, path, video_id, query={}, graphql=False):
headers = self._set_base_headers(legacy=not graphql and self._configuration_arg('legacy_api'))
headers = self._set_base_headers(legacy=not graphql and self._selected_api == 'legacy')
headers.update({
'x-twitter-auth-type': 'OAuth2Session',
'x-twitter-client-language': 'en',
@ -707,6 +712,7 @@ class TwitterIE(TwitterBaseIE):
'tags': [],
'age_limit': 0,
},
'skip': 'This Tweet is unavailable',
}, {
# not available in Periscope
'url': 'https://twitter.com/ViviEducation/status/1136534865145286656',
@ -721,6 +727,7 @@ class TwitterIE(TwitterBaseIE):
'view_count': int,
},
'add_ie': ['TwitterBroadcast'],
'skip': 'Broadcast no longer exists',
}, {
# unified card
'url': 'https://twitter.com/BrooklynNets/status/1349794411333394432?s=20',
@ -773,9 +780,9 @@ class TwitterIE(TwitterBaseIE):
'url': 'https://twitter.com/UltimaShadowX/status/1577719286659006464',
'info_dict': {
'id': '1577719286659006464',
'title': 'Ultima📛 | #вʟм - Test',
'title': 'Ultima📛| New Era - Test',
'description': 'Test https://t.co/Y3KEZD7Dad',
'uploader': 'Ultima📛 | #вʟм',
'uploader': 'Ultima📛| New Era',
'uploader_id': 'UltimaShadowX',
'uploader_url': 'https://twitter.com/UltimaShadowX',
'upload_date': '20221005',
@ -811,7 +818,7 @@ class TwitterIE(TwitterBaseIE):
'age_limit': 0,
},
}, {
# Adult content, fails if not logged in (GraphQL)
# Adult content, fails if not logged in
'url': 'https://twitter.com/Rizdraws/status/1575199173472927762',
'info_dict': {
'id': '1575199163847000068',
@ -831,9 +838,10 @@ class TwitterIE(TwitterBaseIE):
'age_limit': 18,
'tags': []
},
'params': {'skip_download': 'The media could not be played'},
'skip': 'Requires authentication',
}, {
# Playlist result only with auth
# Playlist result only with graphql API
'url': 'https://twitter.com/Srirachachau/status/1395079556562706435',
'playlist_mincount': 2,
'info_dict': {
@ -898,7 +906,7 @@ class TwitterIE(TwitterBaseIE):
'uploader_id': 'MoniqueCamarra',
'live_status': 'was_live',
'release_timestamp': 1658417414,
'description': 'md5:4dc8e972f1d8b3c6580376fabb02a3ad',
'description': 'md5:acce559345fd49f129c20dbcda3f1201',
'timestamp': 1658407771,
'release_date': '20220721',
'upload_date': '20220721',
@ -1007,10 +1015,10 @@ class TwitterIE(TwitterBaseIE):
'view_count': int,
'thumbnail': 'https://pbs.twimg.com/ext_tw_video_thumb/1600009362759733248/pu/img/XVhFQivj75H_YxxV.jpg?name=orig',
'age_limit': 0,
'uploader': 'Mün The Friend Of YWAP',
'uploader': 'Mün',
'repost_count': int,
'upload_date': '20221206',
'title': 'Mün The Friend Of YWAP - This is a genius ad by Apple. \U0001f525\U0001f525\U0001f525\U0001f525\U0001f525',
'title': 'Mün - This is a genius ad by Apple. \U0001f525\U0001f525\U0001f525\U0001f525\U0001f525',
'comment_count': int,
'like_count': int,
'tags': [],
@ -1019,7 +1027,7 @@ class TwitterIE(TwitterBaseIE):
'timestamp': 1670306984.0,
},
}, {
# url to retweet id w/ legacy api
# retweeted_status (private)
'url': 'https://twitter.com/liberdalau/status/1623739803874349067',
'info_dict': {
'id': '1623274794488659969',
@ -1039,32 +1047,84 @@ class TwitterIE(TwitterBaseIE):
'like_count': int,
'repost_count': int,
},
'params': {'extractor_args': {'twitter': {'legacy_api': ['']}}},
'skip': 'Protected tweet',
}, {
# orig tweet w/ graphql
'url': 'https://twitter.com/liberdalau/status/1623739803874349067',
# retweeted_status
'url': 'https://twitter.com/playstrumpcard/status/1695424220702888009',
'info_dict': {
'id': '1623274794488659969',
'display_id': '1623739803874349067',
'id': '1694928337846538240',
'ext': 'mp4',
'title': '@selfisekai@hackerspace.pl 🐀 - RT @Johnnybull3ts: Me after going viral to over 30million people: Whoopsie-daisy',
'description': 'md5:9258bdbb54793bdc124fe1cd47e96c6a',
'uploader': '@selfisekai@hackerspace.pl 🐀',
'uploader_id': 'liberdalau',
'uploader_url': 'https://twitter.com/liberdalau',
'display_id': '1695424220702888009',
'title': 'md5:e8daa9527bc2b947121395494f786d9d',
'description': 'md5:004f2d37fd58737724ec75bc7e679938',
'uploader': 'Benny Johnson',
'uploader_id': 'bennyjohnson',
'uploader_url': 'https://twitter.com/bennyjohnson',
'age_limit': 0,
'tags': [],
'duration': 8.033,
'timestamp': 1675964711.0,
'upload_date': '20230209',
'thumbnail': r're:https://pbs\.twimg\.com/ext_tw_video_thumb/.+',
'duration': 45.001,
'timestamp': 1692962814.0,
'upload_date': '20230825',
'thumbnail': r're:https://pbs\.twimg\.com/amplify_video_thumb/.+',
'like_count': int,
'view_count': int,
'repost_count': int,
'view_count': int,
'comment_count': int,
},
'skip': 'Protected tweet',
}, {
# retweeted_status w/ legacy API
'url': 'https://twitter.com/playstrumpcard/status/1695424220702888009',
'info_dict': {
'id': '1694928337846538240',
'ext': 'mp4',
'display_id': '1695424220702888009',
'title': 'md5:e8daa9527bc2b947121395494f786d9d',
'description': 'md5:004f2d37fd58737724ec75bc7e679938',
'uploader': 'Benny Johnson',
'uploader_id': 'bennyjohnson',
'uploader_url': 'https://twitter.com/bennyjohnson',
'age_limit': 0,
'tags': [],
'duration': 45.001,
'timestamp': 1692962814.0,
'upload_date': '20230825',
'thumbnail': r're:https://pbs\.twimg\.com/amplify_video_thumb/.+',
'like_count': int,
'repost_count': int,
},
'params': {'extractor_args': {'twitter': {'api': ['legacy']}}},
}, {
# Broadcast embedded in tweet
'url': 'https://twitter.com/JessicaDobsonWX/status/1693057346933600402',
'info_dict': {
'id': '1yNGaNLjEblJj',
'ext': 'mp4',
'title': 'Jessica Dobson - WAVE Weather Now - Saturday 8/19/23 Update',
'uploader': 'Jessica Dobson',
'uploader_id': '1DZEoDwDovRQa',
'thumbnail': r're:^https?://.*\.jpg',
'view_count': int,
},
'add_ie': ['TwitterBroadcast'],
}, {
# Animated gif and quote tweet video, with syndication API
'url': 'https://twitter.com/BAKKOOONN/status/1696256659889565950',
'playlist_mincount': 2,
'info_dict': {
'id': '1696256659889565950',
'title': 'BAKOON - https://t.co/zom968d0a0',
'description': 'https://t.co/zom968d0a0',
'tags': [],
'uploader': 'BAKOON',
'uploader_id': 'BAKKOOONN',
'uploader_url': 'https://twitter.com/BAKKOOONN',
'age_limit': 18,
'timestamp': 1693254077.0,
'upload_date': '20230828',
'like_count': int,
},
'params': {'extractor_args': {'twitter': {'api': ['syndication']}}},
'expected_warnings': ['Not all metadata'],
}, {
# onion route
'url': 'https://twitter3e4tixl4xyajtrzo62zg5vztmjuricljdp2c5kshju4avyoid.onion/TwitterBlue/status/1484226494708662273',
@ -1103,6 +1163,14 @@ class TwitterIE(TwitterBaseIE):
'only_matching': True,
}]
_MEDIA_ID_RE = re.compile(r'_video/(\d+)/')
@property
def _GRAPHQL_ENDPOINT(self):
    """GraphQL endpoint path: ``TweetDetail`` when logged in, ``TweetResultByRestId`` otherwise."""
    return (
        'zZXycP0V6H7m-2r0mOnFcA/TweetDetail' if self.is_logged_in
        else '2ICDjqPd81tulZcYrtpTuQ/TweetResultByRestId')
def _graphql_to_legacy(self, data, twid):
result = traverse_obj(data, (
'threaded_conversation_with_injections_v2', 'instructions', 0, 'entries',
@ -1130,9 +1198,14 @@ class TwitterIE(TwitterBaseIE):
'user': ('core', 'user_results', 'result', 'legacy'),
'card': ('card', 'legacy'),
'quoted_status': ('quoted_status_result', 'result', 'legacy'),
'retweeted_status': ('legacy', 'retweeted_status_result', 'result', 'legacy'),
}, expected_type=dict, default={}))
# extra transformation is needed since result does not match legacy format
# extra transformations needed since result does not match legacy format
if status.get('retweeted_status'):
status['retweeted_status']['user'] = traverse_obj(status, (
'retweeted_status_result', 'result', 'core', 'user_results', 'result', 'legacy', {dict})) or {}
binding_values = {
binding_value.get('key'): binding_value.get('value')
for binding_value in traverse_obj(status, ('card', 'binding_values', ..., {dict}))
@ -1208,33 +1281,42 @@ class TwitterIE(TwitterBaseIE):
}
def _extract_status(self, twid):
if self.is_logged_in:
return self._graphql_to_legacy(
self._call_graphql_api('zZXycP0V6H7m-2r0mOnFcA/TweetDetail', twid), twid)
if self.is_logged_in or self._selected_api == 'graphql':
status = self._graphql_to_legacy(self._call_graphql_api(self._GRAPHQL_ENDPOINT, twid), twid)
try:
if not self._configuration_arg('legacy_api'):
return self._graphql_to_legacy(
self._call_graphql_api('2ICDjqPd81tulZcYrtpTuQ/TweetResultByRestId', twid), twid)
return traverse_obj(self._call_api(f'statuses/show/{twid}.json', twid, {
elif self._selected_api == 'legacy':
status = self._call_api(f'statuses/show/{twid}.json', twid, {
'cards_platform': 'Web-12',
'include_cards': 1,
'include_reply_count': 1,
'include_user_entities': 0,
'tweet_mode': 'extended',
}), 'retweeted_status', None)
})
except ExtractorError as e:
if e.expected:
raise
elif self._selected_api == 'syndication':
self.report_warning(
f'{e.orig_msg}. Falling back to syndication endpoint; some metadata may be missing', twid)
'Not all metadata or media is available via syndication endpoint', twid, only_once=True)
status = self._download_json(
'https://cdn.syndication.twimg.com/tweet-result', twid, 'Downloading syndication JSON',
headers={'User-Agent': 'Googlebot'}, query={
'id': twid,
# TODO: token = ((Number(twid) / 1e15) * Math.PI).toString(36).replace(/(0+|\.)/g, '')
'token': ''.join(random.choices('123456789abcdefghijklmnopqrstuvwxyz', k=10)),
})
if not status:
raise ExtractorError('Syndication endpoint returned empty JSON response')
# Transform the result so its structure matches that of legacy/graphql
media = []
for detail in traverse_obj(status, ((None, 'quoted_tweet'), 'mediaDetails', ..., {dict})):
detail['id_str'] = traverse_obj(detail, (
'video_info', 'variants', ..., 'url', {self._MEDIA_ID_RE.search}, 1), get_all=False) or twid
media.append(detail)
status['extended_entities'] = {'media': media}
status = self._download_json(
'https://cdn.syndication.twimg.com/tweet-result', twid, 'Downloading syndication JSON',
headers={'User-Agent': 'Googlebot'}, query={'id': twid})
status['extended_entities'] = {'media': status.get('mediaDetails')}
return status
else:
raise ExtractorError(f'"{self._selected_api}" is not a valid API selection', expected=True)
return traverse_obj(status, 'retweeted_status', None, expected_type=dict) or {}
def _real_extract(self, url):
twid, selected_index = self._match_valid_url(url).group('id', 'index')
@ -1266,10 +1348,7 @@ class TwitterIE(TwitterBaseIE):
}
def extract_from_video_info(media):
media_id = traverse_obj(media, 'id_str', 'id', (
'video_info', 'variants', ..., 'url',
{functools.partial(re.search, r'_video/(\d+)/')}, 1
), get_all=False, expected_type=str_or_none) or twid
media_id = traverse_obj(media, 'id_str', 'id', expected_type=str_or_none)
self.write_debug(f'Extracting from video info: {media_id}')
formats = []
@ -1503,6 +1582,8 @@ class TwitterBroadcastIE(TwitterBaseIE, PeriscopeBaseIE):
broadcast = self._call_api(
'broadcasts/show.json', broadcast_id,
{'ids': broadcast_id})['broadcasts'][broadcast_id]
if not broadcast:
raise ExtractorError('Broadcast no longer exists', expected=True)
info = self._parse_broadcast_data(broadcast, broadcast_id)
media_key = broadcast['media_key']
source = self._call_api(

View File

@ -38,6 +38,7 @@ class VideaIE(InfoExtractor):
'title': 'Az őrült kígyász 285 kígyót enged szabadon',
'thumbnail': r're:^https?://.*',
'duration': 21,
'age_limit': 0,
},
}, {
'url': 'http://videa.hu/videok/origo/jarmuvek/supercars-elozes-jAHDWfWSJH5XuFhH',
@ -48,6 +49,7 @@ class VideaIE(InfoExtractor):
'title': 'Supercars előzés',
'thumbnail': r're:^https?://.*',
'duration': 64,
'age_limit': 0,
},
}, {
'url': 'http://videa.hu/player?v=8YfIAjxwWGwT8HVQ',
@ -58,6 +60,7 @@ class VideaIE(InfoExtractor):
'title': 'Az őrült kígyász 285 kígyót enged szabadon',
'thumbnail': r're:^https?://.*',
'duration': 21,
'age_limit': 0,
},
}, {
'url': 'http://videa.hu/player/v/8YfIAjxwWGwT8HVQ?autoplay=1',
@ -124,7 +127,7 @@ class VideaIE(InfoExtractor):
query['_t'] = result[:16]
b64_info, handle = self._download_webpage_handle(
'http://videa.hu/videaplayer_get_xml.php', video_id, query=query)
'http://videa.hu/player/xml', video_id, query=query)
if b64_info.startswith('<?xml'):
info = self._parse_xml(b64_info, video_id)
else:

View File

@ -173,6 +173,7 @@ class WDRPageIE(WDRIE): # XXX: Do not subclass from concrete IE
'skip': 'HTTP Error 404: Not Found',
},
{
# FIXME: Asset JSON is directly embedded in webpage
'url': 'http://www1.wdr.de/mediathek/video/live/index.html',
'info_dict': {
'id': 'mdb-2296252',
@ -221,6 +222,8 @@ class WDRPageIE(WDRIE): # XXX: Do not subclass from concrete IE
'id': 'mdb-869971',
'ext': 'mp4',
'title': r're:^COSMO Livestream [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',
'alt_title': 'COSMO Livestream',
'live_status': 'is_live',
'upload_date': '20160101',
},
'params': {
@ -248,6 +251,16 @@ class WDRPageIE(WDRIE): # XXX: Do not subclass from concrete IE
'url': 'https://kinder.wdr.de/tv/die-sendung-mit-dem-elefanten/av/video-folge---astronaut-100.html',
'only_matching': True,
},
{
'url': 'https://www1.wdr.de/mediathek/video/sendungen/rockpalast/video-baroness---freak-valley-festival--100.html',
'info_dict': {
'id': 'mdb-2741028',
'ext': 'mp4',
'title': 'Baroness - Freak Valley Festival 2022',
'alt_title': 'Rockpalast',
'upload_date': '20220725',
},
}
]
def _real_extract(self, url):
@ -259,7 +272,7 @@ class WDRPageIE(WDRIE): # XXX: Do not subclass from concrete IE
# Article with several videos
# for wdr.de the data-extension is in a tag with the class "mediaLink"
# for wdr.de the data-extension-ard is in a tag with the class "mediaLink"
# for wdr.de radio players, in a tag with the class "wdrrPlayerPlayBtn"
# for wdrmaus, in a tag with the class "videoButton" (previously a link
# to the page in a multiline "videoLink"-tag)
@ -268,7 +281,7 @@ class WDRPageIE(WDRIE): # XXX: Do not subclass from concrete IE
(?:
(["\'])(?:mediaLink|wdrrPlayerPlayBtn|videoButton)\b.*?\1[^>]+|
(["\'])videoLink\b.*?\2[\s]*>\n[^\n]*
)data-extension=(["\'])(?P<data>(?:(?!\3).)+)\3
)data-extension(?:-ard)?=(["\'])(?P<data>(?:(?!\3).)+)\3
''', webpage):
media_link_obj = self._parse_json(
mobj.group('data'), display_id, transform_source=js_to_json,
@ -295,7 +308,7 @@ class WDRPageIE(WDRIE): # XXX: Do not subclass from concrete IE
compat_urlparse.urljoin(url, mobj.group('href')),
ie=WDRPageIE.ie_key())
for mobj in re.finditer(
r'<a[^>]+\bhref=(["\'])(?P<href>(?:(?!\1).)+)\1[^>]+\bdata-extension=',
r'<a[^>]+\bhref=(["\'])(?P<href>(?:(?!\1).)+)\1[^>]+\bdata-extension(?:-ard)?=',
webpage) if re.match(self._PAGE_REGEX, mobj.group('href'))
]

View File

@ -1,134 +1,241 @@
from .common import InfoExtractor
import json
import random
import re
import itertools
import urllib.parse
from ..compat import (
compat_parse_qs,
compat_str,
)
from .common import InfoExtractor
from ..utils import (
js_to_json,
int_or_none,
make_archive_id,
mimetype2ext,
parse_resolution,
str_or_none,
strip_jsonp,
traverse_obj,
url_or_none,
urlencode_postdata,
urljoin,
)
class WeiboIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?weibo\.com/[0-9]+/(?P<id>[a-zA-Z0-9]+)'
_TEST = {
'url': 'https://weibo.com/6275294458/Fp6RGfbff?type=comment',
'info_dict': {
'id': 'Fp6RGfbff',
'ext': 'mp4',
'title': 'You should have servants to massage you,... 来自Hosico_猫 - 微博',
}
}
class WeiboBaseIE(InfoExtractor):
def _update_visitor_cookies(self, video_id):
visitor_data = self._download_json(
'https://passport.weibo.com/visitor/genvisitor', video_id,
note='Generating first-visit guest request',
transform_source=strip_jsonp,
data=urlencode_postdata({
'cb': 'gen_callback',
'fp': '{"os":"2","browser":"Gecko57,0,0,0","fonts":"undefined","screenInfo":"1440*900*24","plugins":""}',
}))
def _real_extract(self, url):
video_id = self._match_id(url)
# to get Referer url for genvisitor
webpage, urlh = self._download_webpage_handle(url, video_id)
visitor_url = urlh.url
if 'passport.weibo.com' in visitor_url:
# first visit
visitor_data = self._download_json(
'https://passport.weibo.com/visitor/genvisitor', video_id,
note='Generating first-visit data',
transform_source=strip_jsonp,
headers={'Referer': visitor_url},
data=urlencode_postdata({
'cb': 'gen_callback',
'fp': json.dumps({
'os': '2',
'browser': 'Gecko57,0,0,0',
'fonts': 'undefined',
'screenInfo': '1440*900*24',
'plugins': '',
}),
}))
tid = visitor_data['data']['tid']
cnfd = '%03d' % visitor_data['data']['confidence']
self._download_webpage(
'https://passport.weibo.com/visitor/visitor', video_id,
note='Running first-visit callback',
query={
'a': 'incarnate',
't': tid,
'w': 2,
'c': cnfd,
'cb': 'cross_domain',
'from': 'weibo',
'_rand': random.random(),
})
webpage = self._download_webpage(
url, video_id, note='Revisiting webpage')
title = self._html_extract_title(webpage)
video_formats = compat_parse_qs(self._search_regex(
r'video-sources=\\\"(.+?)\"', webpage, 'video_sources'))
formats = []
supported_resolutions = (480, 720)
for res in supported_resolutions:
vid_urls = video_formats.get(compat_str(res))
if not vid_urls or not isinstance(vid_urls, list):
continue
vid_url = vid_urls[0]
formats.append({
'url': vid_url,
'height': res,
self._download_webpage(
'https://passport.weibo.com/visitor/visitor', video_id,
note='Running first-visit callback to get guest cookies',
query={
'a': 'incarnate',
't': visitor_data['data']['tid'],
'w': 2,
'c': '%03d' % visitor_data['data']['confidence'],
'cb': 'cross_domain',
'from': 'weibo',
'_rand': random.random(),
})
uploader = self._og_search_property(
'nick-name', webpage, 'uploader', default=None)
def _weibo_download_json(self, url, video_id, *args, fatal=True, note='Downloading JSON metadata', **kwargs):
    """Download *url* and parse it as JSON, obtaining guest cookies first if redirected.

    If the response URL's host is ``passport.weibo.com``, visitor cookies are
    generated with ``_update_visitor_cookies`` and the page is requested again.

    Extra positional/keyword arguments are forwarded to the download helpers.
    Returns the parsed JSON. When a download helper reports a non-fatal
    failure by returning ``False``, ``False`` is returned as well, instead
    of failing on tuple-unpacking or JSON-parsing that value.
    """
    response = self._download_webpage_handle(url, video_id, *args, fatal=fatal, note=note, **kwargs)
    if response is False:
        # Non-fatal failure: there is no (webpage, urlh) tuple to unpack
        return False
    webpage, urlh = response

    if urllib.parse.urlparse(urlh.url).netloc == 'passport.weibo.com':
        self._update_visitor_cookies(video_id)
        webpage = self._download_webpage(url, video_id, *args, fatal=fatal, note=note, **kwargs)
        if webpage is False:
            # The retry failed non-fatally; do not hand a bool to the JSON parser
            return False

    return self._parse_json(webpage, video_id, fatal=fatal)
def _extract_formats(self, video_info):
    """Build the list of format dicts from ``page_info.media_info`` of *video_info*.

    Formats are taken from the ``playback_list`` entries whose
    ``play_info.url`` is a valid URL. If that yields nothing, every URL found
    directly in ``media_info`` that contains ``label=`` is used instead, with
    size and bitrate looked up from the matching ``video_details`` entry.
    """
    media_info = traverse_obj(video_info, ('page_info', 'media_info'))
    formats = traverse_obj(media_info, (
        'playback_list', lambda _, v: url_or_none(v['play_info']['url']), 'play_info', {
            'url': 'url',
            'format': ('quality_desc', {str}),
            'format_id': ('label', {str}),
            'ext': ('mime', {mimetype2ext}),
            'tbr': ('bitrate', {int_or_none}, {lambda x: x or None}),
            'vcodec': ('video_codecs', {str}),
            'fps': ('fps', {int_or_none}),
            'width': ('width', {int_or_none}),
            'height': ('height', {int_or_none}),
            'filesize': ('size', {int_or_none}),
            'acodec': ('audio_codecs', {str}),
            'asr': ('audio_sample_rate', {int_or_none}),
            'audio_channels': ('audio_channels', {int_or_none}),
        }))
    if not formats:  # fallback, should be barely used
        for url in set(traverse_obj(media_info, (..., {url_or_none}))):
            if 'label=' not in url:  # filter out non-video urls
                continue
            format_id, resolution = self._search_regex(
                r'label=(\w+)&template=(\d+x\d+)', url, 'format info',
                group=(1, 2), default=(None, None))
            # The lookup may yield nothing (no matching label, or format_id is
            # None), so fall back to an empty dict before unpacking with **.
            # The size is stored under 'filesize', the same key the primary path
            # uses (assumes 'size' has the same unit in both places - TODO confirm)
            details = traverse_obj(media_info, (
                'video_details', lambda _, v: v['label'].startswith(format_id), {
                    'filesize': ('size', {int_or_none}),
                    'tbr': ('bitrate', {int_or_none}),
                }
            ), get_all=False) or {}
            formats.append({
                'url': url,
                'format_id': format_id,
                **parse_resolution(resolution),
                **details,
            })
    return formats
def _parse_video_info(self, video_info, video_id=None):
return {
'id': video_id,
'title': title,
'uploader': uploader,
'formats': formats
'extractor_key': WeiboIE.ie_key(),
'extractor': WeiboIE.IE_NAME,
'formats': self._extract_formats(video_info),
'http_headers': {'Referer': 'https://weibo.com/'},
'_old_archive_ids': [make_archive_id('WeiboMobile', video_id)],
**traverse_obj(video_info, {
'id': (('id', 'id_str', 'mid'), {str_or_none}),
'display_id': ('mblogid', {str_or_none}),
'title': ('page_info', 'media_info', ('video_title', 'kol_title', 'name'), {str}, {lambda x: x or None}),
'description': ('text_raw', {str}),
'duration': ('page_info', 'media_info', 'duration', {int_or_none}),
'timestamp': ('page_info', 'media_info', 'video_publish_time', {int_or_none}),
'thumbnail': ('page_info', 'page_pic', {url_or_none}),
'uploader': ('user', 'screen_name', {str}),
'uploader_id': ('user', ('id', 'id_str'), {str_or_none}),
'uploader_url': ('user', 'profile_url', {lambda x: urljoin('https://weibo.com/', x)}),
'view_count': ('page_info', 'media_info', 'online_users_number', {int_or_none}),
'like_count': ('attitudes_count', {int_or_none}),
'repost_count': ('reposts_count', {int_or_none}),
}, get_all=False),
'tags': traverse_obj(video_info, ('topic_struct', ..., 'topic_title', {str})) or None,
}
class WeiboMobileIE(InfoExtractor):
_VALID_URL = r'https?://m\.weibo\.cn/status/(?P<id>[0-9]+)(\?.+)?'
_TEST = {
'url': 'https://m.weibo.cn/status/4189191225395228?wm=3333_2001&sourcetype=weixin&featurecode=newtitle&from=singlemessage&isappinstalled=0',
class WeiboIE(WeiboBaseIE):
_VALID_URL = r'https?://(?:m\.weibo\.cn/status|(?:www\.)?weibo\.com/\d+)/(?P<id>[a-zA-Z0-9]+)'
_TESTS = [{
'url': 'https://weibo.com/7827771738/N4xlMvjhI',
'info_dict': {
'id': '4910815147462302',
'ext': 'mp4',
'display_id': 'N4xlMvjhI',
'title': '【睡前消息暑假版第一期:拉泰国一把 对中国有好处】',
'description': 'md5:e2637a7673980d68694ea7c43cf12a5f',
'duration': 918,
'timestamp': 1686312819,
'upload_date': '20230609',
'thumbnail': r're:https://.*\.jpg',
'uploader': '睡前视频基地',
'uploader_id': '7827771738',
'uploader_url': 'https://weibo.com/u/7827771738',
'view_count': int,
'like_count': int,
'repost_count': int,
'tags': ['泰国大选远进党获胜', '睡前消息', '暑期版'],
},
}, {
'url': 'https://m.weibo.cn/status/4189191225395228',
'info_dict': {
'id': '4189191225395228',
'ext': 'mp4',
'title': '午睡当然是要甜甜蜜蜜的啦',
'uploader': '柴犬柴犬'
'display_id': 'FBqgOmDxO',
'title': '柴犬柴犬的秒拍视频',
'description': 'md5:80f461ab5cdae6bbdb70efbf5a1db24f',
'duration': 53,
'timestamp': 1514264429,
'upload_date': '20171226',
'thumbnail': r're:https://.*\.jpg',
'uploader': '柴犬柴犬',
'uploader_id': '5926682210',
'uploader_url': 'https://weibo.com/u/5926682210',
'view_count': int,
'like_count': int,
'repost_count': int,
}
}
}, {
'url': 'https://weibo.com/0/4224132150961381',
'note': 'no playback_list example',
'only_matching': True,
}]
def _real_extract(self, url):
video_id = self._match_id(url)
# to get Referer url for genvisitor
webpage = self._download_webpage(url, video_id, note='visit the page')
weibo_info = self._parse_json(self._search_regex(
r'var\s+\$render_data\s*=\s*\[({.*})\]\[0\]\s*\|\|\s*{};',
webpage, 'js_code', flags=re.DOTALL),
video_id, transform_source=js_to_json)
return self._parse_video_info(self._weibo_download_json(
f'https://weibo.com/ajax/statuses/show?id={video_id}', video_id))
status_data = weibo_info.get('status', {})
page_info = status_data.get('page_info')
title = status_data['status_title']
uploader = status_data.get('user', {}).get('screen_name')
return {
'id': video_id,
'title': title,
'uploader': uploader,
'url': page_info['media_info']['stream_url']
class WeiboVideoIE(WeiboBaseIE):
    """Resolve ``weibo.com/tv/show/<oid>`` pages to the status URL handled by WeiboIE.

    The play-info component API is queried for the ``mid`` of the status,
    and extraction is delegated to ``WeiboIE`` through ``url_result``.
    """
    _VALID_URL = r'https?://(?:www\.)?weibo\.com/tv/show/(?P<id>\d+:\d+)'

    _TESTS = [{
        'url': 'https://weibo.com/tv/show/1034:4797699866951785?from=old_pc_videoshow',
        'info_dict': {
            'id': '4797700463137878',
            'ext': 'mp4',
            'display_id': 'LEZDodaiW',
            'title': '稍微了解了一下靡烟miya感觉这东西也太二了',
            'description': '稍微了解了一下靡烟miya感觉这东西也太二了 http://t.cn/A6aerGsM ',
            'duration': 76,
            'timestamp': 1659344278,
            'upload_date': '20220801',
            'thumbnail': r're:https://.*\.jpg',
            'uploader': '君子爱财陈平安',
            'uploader_id': '3905382233',
            'uploader_url': 'https://weibo.com/u/3905382233',
            'view_count': int,
            'like_count': int,
            'repost_count': int,
        },
    }]

    def _real_extract(self, url):
        video_id = self._match_id(url)

        # The id is sent both in the form-encoded body and, with its colon
        # percent-encoded, in the "page" query parameter
        escaped_id = video_id.replace(":", "%3A")
        api_url = f'https://weibo.com/tv/api/component?page=%2Ftv%2Fshow%2F{escaped_id}'
        post_data = f'data={{"Component_Play_Playinfo":{{"oid":"{video_id}"}}}}'.encode()

        api_response = self._weibo_download_json(
            api_url, video_id, headers={'Referer': url}, data=post_data)
        video_info = api_response['data']['Component_Play_Playinfo']

        return self.url_result(f'https://weibo.com/0/{video_info["mid"]}', WeiboIE)
class WeiboUserIE(WeiboBaseIE):
    """Playlist extractor for the videos listed on a ``weibo.com/u/<uid>`` profile."""
    _VALID_URL = r'https?://(?:www\.)?weibo\.com/u/(?P<id>\d+)'

    _TESTS = [{
        'url': 'https://weibo.com/u/2066652961?tabtype=video',
        'info_dict': {
            'id': '2066652961',
            'title': '萧影殿下的视频',
            'description': '萧影殿下的全部视频',
            'uploader': '萧影殿下',
        },
        'playlist_mincount': 195,
    }]

    def _fetch_page(self, uid, cursor=0, page=1):
        """Return the ``data`` object of one waterfall page for *uid* at *cursor*.

        *page* is only used in the progress note.
        """
        page_json = self._weibo_download_json(
            'https://weibo.com/ajax/profile/getWaterFallContent',
            uid, note=f'Downloading videos page {page}',
            query={'uid': uid, 'cursor': cursor})
        return page_json['data']

    def _entries(self, uid, first_page):
        """Yield parsed video info for *first_page* and every following page.

        Iteration ends once ``next_cursor`` is missing, zero or negative.
        """
        page_num, page_data = 1, first_page
        while True:
            for video_info in traverse_obj(page_data, ('list', ..., {dict})):
                yield self._parse_video_info(video_info)

            cursor = page_data.get('next_cursor')
            numeric_cursor = int_or_none(cursor)
            if not numeric_cursor or numeric_cursor < 0:
                return

            page_num += 1
            page_data = self._fetch_page(uid, cursor, page_num)

    def _real_extract(self, url):
        uid = self._match_id(url)
        first_page = self._fetch_page(uid)
        uploader = traverse_obj(first_page, ('list', ..., 'user', 'screen_name', {str}), get_all=False)

        # Playlist metadata is only set when an uploader name is found on the first page
        metainfo = {}
        if uploader:
            metainfo = {
                'title': f'{uploader}的视频',
                'description': f'{uploader}的全部视频',
                'uploader': uploader,
            }

        return self.playlist_result(self._entries(uid, first_page), uid, **metainfo)

View File

@ -9,6 +9,7 @@ from ..utils import (
traverse_obj,
try_call,
unescapeHTML,
url_basename,
url_or_none,
)
@ -45,12 +46,14 @@ class ZaikoIE(ZaikoBaseIE):
'uploader_id': '454',
'uploader': 'ZAIKO ZERO',
'release_timestamp': 1583809200,
'thumbnail': r're:https://[a-z0-9]+.cloudfront.net/[a-z0-9_]+/[a-z0-9_]+',
'thumbnail': r're:^https://[\w.-]+/\w+/\w+',
'thumbnails': 'maxcount:2',
'release_date': '20200310',
'categories': ['Tech House'],
'live_status': 'was_live',
},
'params': {'skip_download': 'm3u8'},
'skip': 'Your account does not have tickets to this event',
}]
def _real_extract(self, url):
@ -83,6 +86,12 @@ class ZaikoIE(ZaikoBaseIE):
if not formats:
self.raise_no_formats(msg, expected=expected)
thumbnail_urls = [
traverse_obj(player_meta, ('initial_event_info', 'poster_url')),
self._og_search_thumbnail(self._download_webpage(
f'https://zaiko.io/event/{video_id}', video_id, 'Downloading event page', fatal=False) or ''),
]
return {
'id': video_id,
'formats': formats,
@ -96,8 +105,8 @@ class ZaikoIE(ZaikoBaseIE):
}),
**traverse_obj(player_meta, ('initial_event_info', {
'alt_title': ('title', {str}),
'thumbnail': ('poster_url', {url_or_none}),
})),
'thumbnails': [{'url': url, 'id': url_basename(url)} for url in thumbnail_urls if url_or_none(url)]
}

View File

@ -127,6 +127,7 @@ class ZoomIE(InfoExtractor):
return {
'id': video_id,
'title': str_or_none(traverse_obj(data, ('meet', 'topic'))),
'duration': int_or_none(data.get('duration')),
'subtitles': subtitles,
'formats': formats,
'http_headers': {

View File

@ -2,6 +2,7 @@ from __future__ import annotations
import contextlib
import functools
import socket
import ssl
import sys
import typing
@ -206,3 +207,59 @@ def wrap_request_errors(func):
e.handler = self
raise
return wrapper
def _socket_connect(ip_addr, timeout, source_address):
    """Create a socket for a single ``getaddrinfo()`` entry and connect it.

    @param ip_addr         5-tuple ``(family, type, proto, canonname, sockaddr)``
                           as returned by ``socket.getaddrinfo()``
    @param timeout         Applied with ``settimeout()`` unless it is
                           ``socket._GLOBAL_DEFAULT_TIMEOUT``
    @param source_address  If truthy, the socket is bound to it before connecting
    @returns               The connected socket
    @raises                Whatever socket setup raised; the socket is always
                           closed before the exception propagates
    """
    af, socktype, proto, _canonname, sa = ip_addr
    sock = socket.socket(af, socktype, proto)
    try:
        if timeout is not socket._GLOBAL_DEFAULT_TIMEOUT:
            sock.settimeout(timeout)
        if source_address:
            sock.bind(source_address)
        sock.connect(sa)
    except BaseException:
        # Not only OSError: bind()/connect() raise TypeError or OverflowError for
        # malformed addresses, and connect() can be interrupted. Close the socket
        # in every case so that the file descriptor is never leaked.
        sock.close()
        raise
    return sock
def create_connection(
    address,
    timeout=socket._GLOBAL_DEFAULT_TIMEOUT,
    source_address=None,
    *,
    _create_socket_func=_socket_connect
):
    """Connect to `address`, trying only addresses compatible with `source_address`.

    @param address              ``(host, port)`` to resolve with ``socket.getaddrinfo()``
    @param timeout              Passed through to `_create_socket_func`
    @param source_address       Optional ``(host, port)`` tuple. When given, only
                                resolved addresses of the same family are tried:
                                IPv6 if the host contains ":", IPv4 otherwise
    @param _create_socket_func  Callable ``(ip_addr, timeout, source_address)``
                                returning a connected socket
    @returns                    The first socket `_create_socket_func` returns
    @raises OSError             If resolution yields no addresses, none of them
                                matches the source address family, or every
                                attempt failed (the last error is re-raised)
    """
    # Work around socket.create_connection() which tries all addresses from getaddrinfo() including IPv6.
    # This filters the addresses based on the given source_address.
    # Based on: https://github.com/python/cpython/blob/main/Lib/socket.py#L810
    host, port = address
    ip_addrs = socket.getaddrinfo(host, port, 0, socket.SOCK_STREAM)
    if not ip_addrs:
        raise OSError('getaddrinfo returns an empty list')
    if source_address is not None:
        af = socket.AF_INET if ':' not in source_address[0] else socket.AF_INET6
        ip_addrs = [addr for addr in ip_addrs if addr[0] == af]
        if not ip_addrs:
            raise OSError(
                f'No remote IPv{4 if af == socket.AF_INET else 6} addresses available for connect. '
                f'Can\'t use "{source_address[0]}" as source address')

    err = None
    for ip_addr in ip_addrs:
        try:
            sock = _create_socket_func(ip_addr, timeout, source_address)
            # Explicitly break __traceback__ reference cycle
            # https://bugs.python.org/issue36820
            err = None
            return sock
        except OSError as e:
            # Remember the failure and fall through to the next candidate address
            err = e

    try:
        raise err
    finally:
        # Explicitly break __traceback__ reference cycle
        # https://bugs.python.org/issue36820
        err = None

View File

@ -23,6 +23,7 @@ from urllib.request import (
from ._helper import (
InstanceStoreMixin,
add_accept_encoding_header,
create_connection,
get_redirect_method,
make_socks_proxy_opts,
select_proxy,
@ -54,44 +55,10 @@ if brotli:
def _create_http_connection(http_class, source_address, *args, **kwargs):
    """Instantiate `http_class` and configure how the connection opens its socket.

    @param http_class      Connection class; called as ``http_class(*args, **kwargs)``
    @param source_address  Local address to connect from, or None. When set, it is
                           stored on the connection as ``(source_address, 0)``
    @returns               The configured connection object

    If the connection exposes a `_create_connection` hook, it is replaced with the
    shared `create_connection` helper so that source-address filtering lives in a
    single place instead of a per-call nested copy.
    """
    hc = http_class(*args, **kwargs)

    if hasattr(hc, '_create_connection'):
        hc._create_connection = create_connection

    if source_address is not None:
        hc.source_address = (source_address, 0)

    return hc
@ -220,13 +187,28 @@ def make_socks_conn_class(base_class, socks_proxy):
proxy_args = make_socks_proxy_opts(socks_proxy)
class SocksConnection(base_class):
def connect(self):
self.sock = sockssocket()
self.sock.setproxy(**proxy_args)
if type(self.timeout) in (int, float): # noqa: E721
self.sock.settimeout(self.timeout)
self.sock.connect((self.host, self.port))
_create_connection = create_connection
def connect(self):
def sock_socket_connect(ip_addr, timeout, source_address):
af, socktype, proto, canonname, sa = ip_addr
sock = sockssocket(af, socktype, proto)
try:
connect_proxy_args = proxy_args.copy()
connect_proxy_args.update({'addr': sa[0], 'port': sa[1]})
sock.setproxy(**connect_proxy_args)
if timeout is not socket._GLOBAL_DEFAULT_TIMEOUT: # noqa: E721
sock.settimeout(timeout)
if source_address:
sock.bind(source_address)
sock.connect((self.host, self.port))
return sock
except socket.error:
sock.close()
raise
self.sock = create_connection(
(proxy_args['addr'], proxy_args['port']), timeout=self.timeout,
source_address=self.source_address, _create_socket_func=sock_socket_connect)
if isinstance(self, http.client.HTTPSConnection):
self.sock = self._context.wrap_socket(self.sock, server_hostname=self.host)
@ -429,7 +411,7 @@ class UrllibRH(RequestHandler, InstanceStoreMixin):
except urllib.error.HTTPError as e:
if isinstance(e.fp, (http.client.HTTPResponse, urllib.response.addinfourl)):
# Prevent file object from being closed when urllib.error.HTTPError is destroyed.
e._closer.file = None
e._closer.close_called = True
raise HTTPError(UrllibResponseAdapter(e.fp), redirect_loop='redirect error' in str(e)) from e
raise # unexpected
except urllib.error.URLError as e:

View File

@ -115,7 +115,7 @@ class _CompatHTTPError(urllib.error.HTTPError, HTTPError):
hdrs=http_error.response.headers,
fp=http_error.response
)
self._closer.file = None # Disable auto close
self._closer.close_called = True # Disable auto close
self._http_error = http_error
HTTPError.__init__(self, http_error.response, redirect_loop=http_error.redirect_loop)

View File

@ -134,26 +134,31 @@ class sockssocket(socket.socket):
self.close()
raise InvalidVersionError(expected_version, got_version)
def _resolve_address(self, destaddr, default, use_remote_dns, family=None):
    """Turn `destaddr` into a packed IP address together with its address family.

    @param destaddr        IP address literal or hostname
    @param default         Value returned in place of a packed address when
                           resolution is left to the proxy
    @param use_remote_dns  Whether the caller allows resolution by the proxy;
                           only honoured if ``self._proxy.remote_dns`` is also set
    @param family          Restrict parsing and resolution to this address family;
                           if falsy, IPv4 is tried first and then IPv6
    @returns               ``(family, packed_address)``; ``(0, default)`` when
                           the name is to be resolved remotely
    """
    candidate_families = (family,) if family else (socket.AF_INET, socket.AF_INET6)
    for f in candidate_families:
        try:
            # Succeeds only if destaddr is already an IP literal of this family
            return f, socket.inet_pton(f, destaddr)
        except OSError:
            continue

    if use_remote_dns and self._proxy.remote_dns:
        return 0, default

    # Resolve locally and use the first result
    res = socket.getaddrinfo(destaddr, None, family=family or 0)
    f, _, _, _, ipaddr = res[0]
    return f, socket.inet_pton(f, ipaddr[0])
def _setup_socks4(self, address, is_4a=False):
destaddr, port = address
ipaddr = self._resolve_address(destaddr, SOCKS4_DEFAULT_DSTIP, use_remote_dns=is_4a)
_, ipaddr = self._resolve_address(destaddr, SOCKS4_DEFAULT_DSTIP, use_remote_dns=is_4a, family=socket.AF_INET)
packet = struct.pack('!BBH', SOCKS4_VERSION, Socks4Command.CMD_CONNECT, port) + ipaddr
username = (self._proxy.username or '').encode()
packet += username + b'\x00'
if is_4a and self._proxy.remote_dns:
if is_4a and self._proxy.remote_dns and ipaddr == SOCKS4_DEFAULT_DSTIP:
packet += destaddr.encode() + b'\x00'
self.sendall(packet)
@ -210,7 +215,7 @@ class sockssocket(socket.socket):
def _setup_socks5(self, address):
destaddr, port = address
ipaddr = self._resolve_address(destaddr, None, use_remote_dns=True)
family, ipaddr = self._resolve_address(destaddr, None, use_remote_dns=True)
self._socks5_auth()
@ -220,8 +225,10 @@ class sockssocket(socket.socket):
destaddr = destaddr.encode()
packet += struct.pack('!B', Socks5AddressType.ATYP_DOMAINNAME)
packet += self._len_and_data(destaddr)
else:
elif family == socket.AF_INET:
packet += struct.pack('!B', Socks5AddressType.ATYP_IPV4) + ipaddr
elif family == socket.AF_INET6:
packet += struct.pack('!B', Socks5AddressType.ATYP_IPV6) + ipaddr
packet += struct.pack('!H', port)
self.sendall(packet)

View File

@ -669,6 +669,7 @@ def sanitize_filename(s, restricted=False, is_id=NO_DEFAULT):
def sanitize_path(s, force=False):
"""Sanitizes and normalizes path on Windows"""
# XXX: this handles drive relative paths (c:sth) incorrectly
if sys.platform == 'win32':
force = False
drive_or_unc, _ = os.path.splitdrive(s)
@ -687,7 +688,10 @@ def sanitize_path(s, force=False):
sanitized_path.insert(0, drive_or_unc + os.path.sep)
elif force and s and s[0] == os.path.sep:
sanitized_path.insert(0, os.path.sep)
return os.path.join(*sanitized_path)
# TODO: Fix behavioral differences <3.12
# The workaround using `normpath` only superficially passes tests
# Ref: https://github.com/python/cpython/pull/100351
return os.path.normpath(os.path.join(*sanitized_path))
def sanitize_url(url, *, scheme='http'):
@ -1256,7 +1260,7 @@ def datetime_from_str(date_str, precision='auto', format='%Y%m%d'):
if precision == 'auto':
auto_precision = True
precision = 'microsecond'
today = datetime_round(datetime.datetime.utcnow(), precision)
today = datetime_round(datetime.datetime.now(datetime.timezone.utc), precision)
if date_str in ('now', 'today'):
return today
if date_str == 'yesterday':
@ -1319,8 +1323,8 @@ def datetime_round(dt, precision='day'):
'second': 1,
}
roundto = lambda x, n: ((x + n / 2) // n) * n
timestamp = calendar.timegm(dt.timetuple())
return datetime.datetime.utcfromtimestamp(roundto(timestamp, unit_seconds[precision]))
timestamp = roundto(calendar.timegm(dt.timetuple()), unit_seconds[precision])
return datetime.datetime.fromtimestamp(timestamp, datetime.timezone.utc)
def hyphenate_date(date_str):
@ -2847,6 +2851,7 @@ def mimetype2ext(mt, default=NO_DEFAULT):
'quicktime': 'mov',
'webm': 'webm',
'vp9': 'vp9',
'video/ogg': 'ogv',
'x-flv': 'flv',
'x-m4v': 'm4v',
'x-matroska': 'mkv',