Merge remote-tracking branch 'upstream/master'

commit d8d31be98e
bergoid, 2023-09-21 22:58:17 +02:00
68 changed files with 2986 additions and 1077 deletions


@@ -13,13 +13,16 @@ jobs:
       matrix:
         os: [ubuntu-latest]
         # CPython 3.11 is in quick-test
-        python-version: ['3.8', '3.9', '3.10', pypy-3.7, pypy-3.8]
+        python-version: ['3.8', '3.9', '3.10', '3.12-dev', pypy-3.7, pypy-3.8, pypy-3.10]
         run-tests-ext: [sh]
         include:
         # atleast one of each CPython/PyPy tests must be in windows
         - os: windows-latest
           python-version: '3.7'
           run-tests-ext: bat
+        - os: windows-latest
+          python-version: '3.12-dev'
+          run-tests-ext: bat
         - os: windows-latest
           python-version: pypy-3.9
           run-tests-ext: bat


@@ -76,7 +76,7 @@
 # NEW FEATURES

-* Forked from [**yt-dlc@f9401f2**](https://github.com/blackjack4494/yt-dlc/commit/f9401f2a91987068139c5f757b12fc711d4c0cee) and merged with [**youtube-dl@42f2d4**](https://github.com/ytdl-org/youtube-dl/commit/07af47960f3bb262ead02490ce65c8c45c01741e) ([exceptions](https://github.com/yt-dlp/yt-dlp/issues/21))
+* Forked from [**yt-dlc@f9401f2**](https://github.com/blackjack4494/yt-dlc/commit/f9401f2a91987068139c5f757b12fc711d4c0cee) and merged with [**youtube-dl@66ab08**](https://github.com/ytdl-org/youtube-dl/commit/66ab0814c4baa2dc79c2dd5287bc0ad61a37c5b9) ([exceptions](https://github.com/yt-dlp/yt-dlp/issues/21))

 * **[SponsorBlock Integration](#sponsorblock-options)**: You can mark/remove sponsor sections in YouTube videos by utilizing the [SponsorBlock](https://sponsor.ajay.app) API

@@ -1854,7 +1854,7 @@ #### rokfinchannel
 * `tab`: Which tab to download - one of `new`, `top`, `videos`, `podcasts`, `streams`, `stacks`

 #### twitter
-* `legacy_api`: Force usage of the legacy Twitter API instead of the GraphQL API for tweet extraction. Has no effect if login cookies are passed
+* `api`: Select one of `graphql` (default), `legacy` or `syndication` as the API for tweet extraction. Has no effect if logged in

 #### stacommu, wrestleuniverse
 * `device_id`: UUID value assigned by the website and used to enforce device limits for paid livestream content. Can be found in browser local storage
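Note: the new `api` value is supplied like any other extractor argument. A minimal sketch using the Python API (the tweet URL is a made-up placeholder); the CLI equivalent is `yt-dlp --extractor-args "twitter:api=syndication" <url>`:

import yt_dlp

# Select the `syndication` API for tweet extraction (sketch, not part of the diff)
opts = {'extractor_args': {'twitter': {'api': ['syndication']}}}
with yt_dlp.YoutubeDL(opts) as ydl:
    ydl.download(['https://twitter.com/i/status/1234567890'])  # placeholder URL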


@@ -68,6 +68,25 @@
     {
         "action": "change",
         "when": "b03fa7834579a01cc5fba48c0e73488a16683d48",
-        "short": "[ie/twitter] Revert 92315c03774cfabb3a921884326beb4b981f786b"
+        "short": "[ie/twitter] Revert 92315c03774cfabb3a921884326beb4b981f786b",
+        "authors": ["pukkandan"]
+    },
+    {
+        "action": "change",
+        "when": "fcd6a76adc49d5cd8783985c7ce35384b72e545f",
+        "short": "[test] Add tests for socks proxies (#7908)",
+        "authors": ["coletdjnz"]
+    },
+    {
+        "action": "change",
+        "when": "4bf912282a34b58b6b35d8f7e6be535770c89c76",
+        "short": "[rh:urllib] Remove dot segments during URL normalization (#7662)",
+        "authors": ["coletdjnz"]
+    },
+    {
+        "action": "change",
+        "when": "59e92b1f1833440bb2190f847eb735cf0f90bc85",
+        "short": "[rh:urllib] Simplify gzip decoding (#7611)",
+        "authors": ["Grub4K"]
     }
 ]


@@ -31,35 +31,27 @@ class CommitGroup(enum.Enum):
     EXTRACTOR = 'Extractor'
     DOWNLOADER = 'Downloader'
     POSTPROCESSOR = 'Postprocessor'
+    NETWORKING = 'Networking'
     MISC = 'Misc.'

-    @classmethod
-    @property
-    def ignorable_prefixes(cls):
-        return ('core', 'downloader', 'extractor', 'misc', 'postprocessor', 'upstream')
-
     @classmethod
     @lru_cache
-    def commit_lookup(cls):
+    def subgroup_lookup(cls):
         return {
             name: group
             for group, names in {
-                cls.PRIORITY: {'priority'},
                 cls.CORE: {
                     'aes',
                     'cache',
                     'compat_utils',
                     'compat',
                     'cookies',
-                    'core',
                     'dependencies',
                     'formats',
                     'jsinterp',
-                    'networking',
                     'outtmpl',
                     'plugins',
                     'update',
-                    'upstream',
                     'utils',
                 },
                 cls.MISC: {

@@ -67,23 +59,40 @@ def commit_lookup(cls):
                 'cleanup',
                 'devscripts',
                 'docs',
-                'misc',
                 'test',
             },
-            cls.EXTRACTOR: {'extractor', 'ie'},
-            cls.DOWNLOADER: {'downloader', 'fd'},
-            cls.POSTPROCESSOR: {'postprocessor', 'pp'},
+            cls.NETWORKING: {
+                'rh',
+            },
         }.items()
         for name in names
     }

     @classmethod
-    def get(cls, value):
-        result = cls.commit_lookup().get(value)
-        if result:
-            logger.debug(f'Mapped {value!r} => {result.name}')
+    @lru_cache
+    def group_lookup(cls):
+        result = {
+            'fd': cls.DOWNLOADER,
+            'ie': cls.EXTRACTOR,
+            'pp': cls.POSTPROCESSOR,
+            'upstream': cls.CORE,
+        }
+        result.update({item.name.lower(): item for item in iter(cls)})
         return result

+    @classmethod
+    def get(cls, value: str) -> tuple[CommitGroup | None, str | None]:
+        group, _, subgroup = (group.strip().lower() for group in value.partition('/'))
+
+        result = cls.group_lookup().get(group)
+        if not result:
+            if subgroup:
+                return None, value
+            subgroup = group
+            result = cls.subgroup_lookup().get(subgroup)
+
+        return result, subgroup or None


 @dataclass
 class Commit:

@@ -198,19 +207,23 @@ def _prepare_cleanup_misc_items(self, items):
         for commit_infos in cleanup_misc_items.values():
             sorted_items.append(CommitInfo(
                 'cleanup', ('Miscellaneous',), ', '.join(
-                    self._format_message_link(None, info.commit.hash).strip()
+                    self._format_message_link(None, info.commit.hash)
                     for info in sorted(commit_infos, key=lambda item: item.commit.hash or '')),
                 [], Commit(None, '', commit_infos[0].commit.authors), []))

         return sorted_items

-    def format_single_change(self, info):
-        message = self._format_message_link(info.message, info.commit.hash)
+    def format_single_change(self, info: CommitInfo):
+        message, sep, rest = info.message.partition('\n')
+        if '[' not in message:
+            # If the message doesn't already contain markdown links, try to add a link to the commit
+            message = self._format_message_link(message, info.commit.hash)
+
         if info.issues:
-            message = message.replace('\n', f' ({self._format_issues(info.issues)})\n', 1)
+            message = f'{message} ({self._format_issues(info.issues)})'

         if info.commit.authors:
-            message = message.replace('\n', f' by {self._format_authors(info.commit.authors)}\n', 1)
+            message = f'{message} by {self._format_authors(info.commit.authors)}'

         if info.fixes:
             fix_message = ', '.join(f'{self._format_message_link(None, fix.hash)}' for fix in info.fixes)

@@ -219,16 +232,14 @@ def format_single_change(self, info):
             if authors != info.commit.authors:
                 fix_message = f'{fix_message} by {self._format_authors(authors)}'

-            message = message.replace('\n', f' (With fixes in {fix_message})\n', 1)
+            message = f'{message} (With fixes in {fix_message})'

-        return message[:-1]
+        return message if not sep else f'{message}{sep}{rest}'

     def _format_message_link(self, message, hash):
         assert message or hash, 'Improperly defined commit message or override'
         message = message if message else hash[:HASH_LENGTH]
-        if not hash:
-            return f'{message}\n'
-        return f'[{message}\n'.replace('\n', f']({self.repo_url}/commit/{hash})\n', 1)
+        return f'[{message}]({self.repo_url}/commit/{hash})' if hash else message

     def _format_issues(self, issues):
         return ', '.join(f'[#{issue}]({self.repo_url}/issues/{issue})' for issue in issues)

@@ -318,7 +329,7 @@ def _get_commits_and_fixes(self, default_author):
         for commitish, revert_commit in reverts.items():
             reverted = commits.pop(commitish, None)
             if reverted:
-                logger.debug(f'{commit} fully reverted {reverted}')
+                logger.debug(f'{commitish} fully reverted {reverted}')
             else:
                 commits[revert_commit.hash] = revert_commit

@@ -337,7 +348,7 @@ def apply_overrides(self, overrides):
         for override in overrides:
             when = override.get('when')
             if when and when not in self and when != self._start:
-                logger.debug(f'Ignored {when!r}, not in commits {self._start!r}')
+                logger.debug(f'Ignored {when!r} override')
                 continue

             override_hash = override.get('hash') or when

@@ -365,7 +376,7 @@ def groups(self):
         for commit in self:
             upstream_re = self.UPSTREAM_MERGE_RE.search(commit.short)
             if upstream_re:
-                commit.short = f'[core/upstream] Merged with youtube-dl {upstream_re.group(1)}'
+                commit.short = f'[upstream] Merged with youtube-dl {upstream_re.group(1)}'

             match = self.MESSAGE_RE.fullmatch(commit.short)
             if not match:

@@ -410,25 +421,20 @@ def details_from_prefix(prefix):
         if not prefix:
             return CommitGroup.CORE, None, ()

-        prefix, _, details = prefix.partition('/')
-        prefix = prefix.strip()
-        details = details.strip()
+        prefix, *sub_details = prefix.split(':')

-        group = CommitGroup.get(prefix.lower())
-        if group is CommitGroup.PRIORITY:
-            prefix, _, details = details.partition('/')
+        group, details = CommitGroup.get(prefix)
+        if group is CommitGroup.PRIORITY and details:
+            details = details.partition('/')[2].strip()

-        if not details and prefix and prefix not in CommitGroup.ignorable_prefixes:
-            logger.debug(f'Replaced details with {prefix!r}')
-            details = prefix or None
+        if details and '/' in details:
+            logger.error(f'Prefix is overnested, using first part: {prefix}')
+            details = details.partition('/')[0].strip()

         if details == 'common':
             details = None
+        elif group is CommitGroup.NETWORKING and details == 'rh':
+            details = 'Request Handler'

-        if details:
-            details, *sub_details = details.split(':')
-        else:
-            sub_details = []

         return group, details, sub_details
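Note: worked examples (mine, derived from the code above) of the new two-stage lookup:

# CommitGroup.get('ie/twitter') -> (CommitGroup.EXTRACTOR, 'twitter')  # 'ie' hits group_lookup
# CommitGroup.get('cookies')    -> (CommitGroup.CORE, 'cookies')       # falls back to subgroup_lookup
# details_from_prefix('rh:urllib'):
#     splits on ':' into ('rh', ['urllib']); CommitGroup.get('rh') -> (NETWORKING, 'rh'),
#     and the 'rh' detail is then renamed, giving (NETWORKING, 'Request Handler', ['urllib'])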


@@ -10,14 +10,14 @@
 import argparse
 import contextlib
 import sys
-from datetime import datetime
+from datetime import datetime, timezone

 from devscripts.utils import read_version, run_process, write_file


 def get_new_version(version, revision):
     if not version:
-        version = datetime.utcnow().strftime('%Y.%m.%d')
+        version = datetime.now(timezone.utc).strftime('%Y.%m.%d')

     if revision:
         assert revision.isdigit(), 'Revision must be a number'
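Note: the same modernization recurs in the YoutubeDL.py and AWS hunks below. A small sketch of the underlying change, assuming Python 3.12 deprecation semantics:

from datetime import datetime, timezone

# datetime.utcnow()/utcfromtimestamp() return naive datetimes and are deprecated
# since Python 3.12; the timezone-aware replacements used throughout this merge:
now_utc = datetime.now(timezone.utc)               # replaces datetime.utcnow()
from_ts = datetime.fromtimestamp(0, timezone.utc)  # replaces datetime.utcfromtimestamp(0)
print(now_utc.strftime('%Y.%m.%d'), from_ts.isoformat())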


@@ -281,17 +281,13 @@ def test_socks4_auth(self, handler, ctx):
                 rh, proxies={'all': f'socks4://user:@{server_address}'})
             assert response['version'] == 4

-    @pytest.mark.parametrize('handler,ctx', [
-        pytest.param('Urllib', 'http', marks=pytest.mark.xfail(
-            reason='socks4a implementation currently broken when destination is not a domain name'))
-    ], indirect=True)
+    @pytest.mark.parametrize('handler,ctx', [('Urllib', 'http')], indirect=True)
     def test_socks4a_ipv4_target(self, handler, ctx):
         with ctx.socks_server(Socks4ProxyHandler) as server_address:
             with handler(proxies={'all': f'socks4a://{server_address}'}) as rh:
                 response = ctx.socks_info_request(rh, target_domain='127.0.0.1')
                 assert response['version'] == 4
-                assert response['ipv4_address'] == '127.0.0.1'
-                assert response['domain_address'] is None
+                assert (response['ipv4_address'] == '127.0.0.1') != (response['domain_address'] == '127.0.0.1')

     @pytest.mark.parametrize('handler,ctx', [('Urllib', 'http')], indirect=True)
     def test_socks4a_domain_target(self, handler, ctx):

@@ -302,10 +298,7 @@ def test_socks4a_domain_target(self, handler, ctx):
             assert response['ipv4_address'] is None
             assert response['domain_address'] == 'localhost'

-    @pytest.mark.parametrize('handler,ctx', [
-        pytest.param('Urllib', 'http', marks=pytest.mark.xfail(
-            reason='source_address is not yet supported for socks4 proxies'))
-    ], indirect=True)
+    @pytest.mark.parametrize('handler,ctx', [('Urllib', 'http')], indirect=True)
     def test_ipv4_client_source_address(self, handler, ctx):
         with ctx.socks_server(Socks4ProxyHandler) as server_address:
             source_address = f'127.0.0.{random.randint(5, 255)}'

@@ -327,10 +320,7 @@ def test_socks4_errors(self, handler, ctx, reply_code):
                 with pytest.raises(ProxyError):
                     ctx.socks_info_request(rh)

-    @pytest.mark.parametrize('handler,ctx', [
-        pytest.param('Urllib', 'http', marks=pytest.mark.xfail(
-            reason='IPv6 socks4 proxies are not yet supported'))
-    ], indirect=True)
+    @pytest.mark.parametrize('handler,ctx', [('Urllib', 'http')], indirect=True)
     def test_ipv6_socks4_proxy(self, handler, ctx):
         with ctx.socks_server(Socks4ProxyHandler, bind_ip='::1') as server_address:
             with handler(proxies={'all': f'socks4://{server_address}'}) as rh:

@@ -342,7 +332,7 @@ def test_ipv6_socks4_proxy(self, handler, ctx):
     @pytest.mark.parametrize('handler,ctx', [('Urllib', 'http')], indirect=True)
     def test_timeout(self, handler, ctx):
         with ctx.socks_server(Socks4ProxyHandler, sleep=2) as server_address:
-            with handler(proxies={'all': f'socks4://{server_address}'}, timeout=1) as rh:
+            with handler(proxies={'all': f'socks4://{server_address}'}, timeout=0.5) as rh:
                 with pytest.raises(TransportError):
                     ctx.socks_info_request(rh)

@@ -383,7 +373,7 @@ def test_socks5_domain_target(self, handler, ctx):
         with ctx.socks_server(Socks5ProxyHandler) as server_address:
             with handler(proxies={'all': f'socks5://{server_address}'}) as rh:
                 response = ctx.socks_info_request(rh, target_domain='localhost')
-                assert response['ipv4_address'] == '127.0.0.1'
+                assert (response['ipv4_address'] == '127.0.0.1') != (response['ipv6_address'] == '::1')
                 assert response['version'] == 5

     @pytest.mark.parametrize('handler,ctx', [('Urllib', 'http')], indirect=True)

@@ -404,22 +394,15 @@ def test_socks5h_ip_target(self, handler, ctx):
             assert response['domain_address'] is None
             assert response['version'] == 5

-    @pytest.mark.parametrize('handler,ctx', [
-        pytest.param('Urllib', 'http', marks=pytest.mark.xfail(
-            reason='IPv6 destination addresses are not yet supported'))
-    ], indirect=True)
+    @pytest.mark.parametrize('handler,ctx', [('Urllib', 'http')], indirect=True)
     def test_socks5_ipv6_destination(self, handler, ctx):
         with ctx.socks_server(Socks5ProxyHandler) as server_address:
             with handler(proxies={'all': f'socks5://{server_address}'}) as rh:
                 response = ctx.socks_info_request(rh, target_domain='[::1]')
                 assert response['ipv6_address'] == '::1'
-                assert response['port'] == 80
                 assert response['version'] == 5

-    @pytest.mark.parametrize('handler,ctx', [
-        pytest.param('Urllib', 'http', marks=pytest.mark.xfail(
-            reason='IPv6 socks5 proxies are not yet supported'))
-    ], indirect=True)
+    @pytest.mark.parametrize('handler,ctx', [('Urllib', 'http')], indirect=True)
     def test_ipv6_socks5_proxy(self, handler, ctx):
         with ctx.socks_server(Socks5ProxyHandler, bind_ip='::1') as server_address:
             with handler(proxies={'all': f'socks5://{server_address}'}) as rh:

@@ -430,10 +413,7 @@ def test_ipv6_socks5_proxy(self, handler, ctx):
     # XXX: is there any feasible way of testing IPv6 source addresses?
     # Same would go for non-proxy source_address test...

-    @pytest.mark.parametrize('handler,ctx', [
-        pytest.param('Urllib', 'http', marks=pytest.mark.xfail(
-            reason='source_address is not yet supported for socks5 proxies'))
-    ], indirect=True)
+    @pytest.mark.parametrize('handler,ctx', [('Urllib', 'http')], indirect=True)
     def test_ipv4_client_source_address(self, handler, ctx):
         with ctx.socks_server(Socks5ProxyHandler) as server_address:
             source_address = f'127.0.0.{random.randint(5, 255)}'
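Note on the rewritten assertions (my reading, not stated in the diff): comparing two booleans with `!=` is a compact exclusive-or, so a test accepts the address being reported in either field, but not in both or neither:

# `response` as returned by ctx.socks_info_request() in the tests above
ipv4_matches = response['ipv4_address'] == '127.0.0.1'
domain_matches = response['domain_address'] == '127.0.0.1'
assert ipv4_matches != domain_matches  # boolean XOR: exactly one must be True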


@@ -2591,7 +2591,7 @@ def _fill_common_fields(self, info_dict, final=True):
                 # Working around out-of-range timestamp values (e.g. negative ones on Windows,
                 # see http://bugs.python.org/issue1646728)
                 with contextlib.suppress(ValueError, OverflowError, OSError):
-                    upload_date = datetime.datetime.utcfromtimestamp(info_dict[ts_key])
+                    upload_date = datetime.datetime.fromtimestamp(info_dict[ts_key], datetime.timezone.utc)
                     info_dict[date_key] = upload_date.strftime('%Y%m%d')

         live_keys = ('is_live', 'was_live')


@@ -15,7 +15,7 @@ def get_package_info(module):
         name=getattr(module, '_yt_dlp__identifier', module.__name__),
         version=str(next(filter(None, (
             getattr(module, attr, None)
-            for attr in ('__version__', 'version_string', 'version')
+            for attr in ('_yt_dlp__version', '__version__', 'version_string', 'version')
         )), None)))


@@ -43,6 +43,8 @@
 try:
     import sqlite3
+    # We need to get the underlying `sqlite` version, see https://github.com/yt-dlp/yt-dlp/issues/8152
+    sqlite3._yt_dlp__version = sqlite3.sqlite_version
 except ImportError:
     # although sqlite3 is part of the standard library, it is possible to compile python without
     # sqlite support. See: https://github.com/yt-dlp/yt-dlp/issues/544
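Note: the two hunks above work together. A minimal sketch of the distinction being handled (version numbers are examples):

import sqlite3

print(sqlite3.version)         # version of the Python sqlite3 bindings, e.g. '2.6.0'
print(sqlite3.sqlite_version)  # version of the underlying SQLite library, e.g. '3.42.0'
# get_package_info() now checks `_yt_dlp__version` first, so the library version is reported.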


@@ -122,7 +122,6 @@
 from .archiveorg import (
     ArchiveOrgIE,
     YoutubeWebArchiveIE,
-    VLiveWebArchiveIE,
 )
 from .arcpublishing import ArcPublishingIE
 from .arkena import ArkenaIE

@@ -165,6 +164,7 @@
     AWAANLiveIE,
     AWAANSeasonIE,
 )
+from .axs import AxsIE
 from .azmedien import AZMedienIE
 from .baidu import BaiduVideoIE
 from .banbye import (

@@ -223,7 +223,11 @@
     BiliBiliPlayerIE,
     BilibiliSpaceVideoIE,
     BilibiliSpaceAudioIE,
-    BilibiliSpacePlaylistIE,
+    BilibiliCollectionListIE,
+    BilibiliSeriesListIE,
+    BilibiliFavoritesListIE,
+    BilibiliWatchlaterIE,
+    BilibiliPlaylistIE,
     BiliIntlIE,
     BiliIntlSeriesIE,
     BiliLiveIE,

@@ -292,9 +296,11 @@
 from .camsoda import CamsodaIE
 from .camtasia import CamtasiaEmbedIE
 from .camwithher import CamWithHerIE
+from .canal1 import Canal1IE
 from .canalalpha import CanalAlphaIE
 from .canalplus import CanalplusIE
 from .canalc2 import Canalc2IE
+from .caracoltv import CaracolTvPlayIE
 from .carambatv import (
     CarambaTVIE,
     CarambaTVPageIE,

@@ -561,6 +567,7 @@
     EpiconIE,
     EpiconSeriesIE,
 )
+from .eplus import EplusIbIE
 from .epoch import EpochIE
 from .eporner import EpornerIE
 from .eroprofile import (

@@ -1501,6 +1508,7 @@
 from .popcorntimes import PopcorntimesIE
 from .popcorntv import PopcornTVIE
 from .porn91 import Porn91IE
+from .pornbox import PornboxIE
 from .porncom import PornComIE
 from .pornflip import PornFlipIE
 from .pornhd import PornHdIE

@@ -1519,7 +1527,7 @@
     PuhuTVIE,
     PuhuTVSerieIE,
 )
-from .pr0gramm import Pr0grammStaticIE, Pr0grammIE
+from .pr0gramm import Pr0grammIE
 from .prankcast import PrankCastIE
 from .premiershiprugby import PremiershipRugbyIE
 from .presstv import PressTVIE

@@ -1555,7 +1563,14 @@
 from .radiode import RadioDeIE
 from .radiojavan import RadioJavanIE
 from .radiobremen import RadioBremenIE
-from .radiofrance import FranceCultureIE, RadioFranceIE
+from .radiofrance import (
+    FranceCultureIE,
+    RadioFranceIE,
+    RadioFranceLiveIE,
+    RadioFrancePodcastIE,
+    RadioFranceProfileIE,
+    RadioFranceProgramScheduleIE,
+)
 from .radiozet import RadioZetPodcastIE
 from .radiokapital import (
     RadioKapitalIE,

@@ -1586,6 +1601,7 @@
 from .rbgtum import (
     RbgTumIE,
     RbgTumCourseIE,
+    RbgTumNewCourseIE,
 )
 from .rcs import (
     RCSIE,

@@ -1710,7 +1726,10 @@
     RuvIE,
     RuvSpilaIE
 )
-from .s4c import S4CIE
+from .s4c import (
+    S4CIE,
+    S4CSeriesIE
+)
 from .safari import (
     SafariIE,
     SafariApiIE,

@@ -1791,7 +1810,10 @@
 from .slutload import SlutloadIE
 from .smotrim import SmotrimIE
 from .snotr import SnotrIE
-from .sohu import SohuIE
+from .sohu import (
+    SohuIE,
+    SohuVIE,
+)
 from .sonyliv import (
     SonyLIVIE,
     SonyLIVSeriesIE,

@@ -2354,7 +2376,8 @@
 )
 from .weibo import (
     WeiboIE,
-    WeiboMobileIE
+    WeiboVideoIE,
+    WeiboUserIE,
 )
 from .weiqitv import WeiqiTVIE
 from .weverse import (


@@ -12,7 +12,7 @@
 import urllib.request
 import urllib.response
 import uuid
+from ..utils.networking import clean_proxies
 from .common import InfoExtractor
 from ..aes import aes_ecb_decrypt
 from ..utils import (

@@ -35,7 +35,10 @@ def add_opener(ydl, handler): # FIXME: Create proper API in .networking
     rh = ydl._request_director.handlers['Urllib']
     if 'abematv-license' in rh._SUPPORTED_URL_SCHEMES:
         return
-    opener = rh._get_instance(cookiejar=ydl.cookiejar, proxies=ydl.proxies)
+    headers = ydl.params['http_headers'].copy()
+    proxies = ydl.proxies.copy()
+    clean_proxies(proxies, headers)
+    opener = rh._get_instance(cookiejar=ydl.cookiejar, proxies=proxies)
     assert isinstance(opener, urllib.request.OpenerDirector)
     opener.add_handler(handler)
     rh._SUPPORTED_URL_SCHEMES = (*rh._SUPPORTED_URL_SCHEMES, 'abematv-license')


@@ -22,8 +22,11 @@ def _call_api(self, asin, data=None, note=None):
         resp = self._download_json(
             f'https://www.amazon.in/minitv/api/web/{"graphql" if data else "prs"}',
-            asin, note=note, headers={'Content-Type': 'application/json'},
-            data=json.dumps(data).encode() if data else None,
+            asin, note=note, headers={
+                'Content-Type': 'application/json',
+                'currentpageurl': '/',
+                'currentplatform': 'dWeb'
+            }, data=json.dumps(data).encode() if data else None,
             query=None if data else {
                 'deviceType': 'A1WMMUXPCUJL4N',
                 'contentId': asin,

@@ -46,7 +49,7 @@ class AmazonMiniTVIE(AmazonMiniTVBaseIE):
             'ext': 'mp4',
             'title': 'May I Kiss You?',
             'language': 'Hindi',
-            'thumbnail': r're:^https?://.*\.jpg$',
+            'thumbnail': r're:^https?://.*\.(?:jpg|png)$',
             'description': 'md5:a549bfc747973e04feb707833474e59d',
             'release_timestamp': 1644710400,
             'release_date': '20220213',

@@ -68,7 +71,7 @@ class AmazonMiniTVIE(AmazonMiniTVBaseIE):
             'ext': 'mp4',
             'title': 'Jahaan',
             'language': 'Hindi',
-            'thumbnail': r're:^https?://.*\.jpg',
+            'thumbnail': r're:^https?://.*\.(?:jpg|png)',
             'description': 'md5:05eb765a77bf703f322f120ec6867339',
             'release_timestamp': 1647475200,
             'release_date': '20220317',


@@ -3,7 +3,6 @@
 import urllib.parse

 from .common import InfoExtractor
-from .naver import NaverBaseIE
 from .youtube import YoutubeBaseInfoExtractor, YoutubeIE
 from ..compat import compat_urllib_parse_unquote
 from ..networking import HEADRequest

@@ -947,237 +946,3 @@ def _real_extract(self, url):
         if not info.get('title'):
             info['title'] = video_id
         return info
class VLiveWebArchiveIE(InfoExtractor):
IE_NAME = 'web.archive:vlive'
IE_DESC = 'web.archive.org saved vlive videos'
_VALID_URL = r'''(?x)
(?:https?://)?web\.archive\.org/
(?:web/)?(?:(?P<date>[0-9]{14})?[0-9A-Za-z_*]*/)? # /web and the version index is optional
(?:https?(?::|%3[Aa])//)?(?:
(?:(?:www|m)\.)?vlive\.tv(?::(?:80|443))?/(?:video|embed)/(?P<id>[0-9]+) # VLive URL
)
'''
_TESTS = [{
'url': 'https://web.archive.org/web/20221221144331/http://www.vlive.tv/video/1326',
'md5': 'cc7314812855ce56de70a06a27314983',
'info_dict': {
'id': '1326',
'ext': 'mp4',
'title': "Girl's Day's Broadcast",
'creator': "Girl's Day",
'view_count': int,
'uploader_id': 'muploader_a',
'uploader_url': None,
'uploader': None,
'upload_date': '20150817',
'thumbnail': r're:^https?://.*\.(?:jpg|png)$',
'timestamp': 1439816449,
'like_count': int,
'channel': 'Girl\'s Day',
'channel_id': 'FDF27',
'comment_count': int,
'release_timestamp': 1439818140,
'release_date': '20150817',
'duration': 1014,
},
'params': {
'skip_download': True,
},
}, {
'url': 'https://web.archive.org/web/20221221182103/http://www.vlive.tv/video/16937',
'info_dict': {
'id': '16937',
'ext': 'mp4',
'title': '첸백시 걍방',
'creator': 'EXO',
'view_count': int,
'subtitles': 'mincount:12',
'uploader_id': 'muploader_j',
'uploader_url': 'http://vlive.tv',
'uploader': None,
'upload_date': '20161112',
'thumbnail': r're:^https?://.*\.(?:jpg|png)$',
'timestamp': 1478923074,
'like_count': int,
'channel': 'EXO',
'channel_id': 'F94BD',
'comment_count': int,
'release_timestamp': 1478924280,
'release_date': '20161112',
'duration': 906,
},
'params': {
'skip_download': True,
},
}, {
'url': 'https://web.archive.org/web/20221127190050/http://www.vlive.tv/video/101870',
'info_dict': {
'id': '101870',
'ext': 'mp4',
'title': '[ⓓ xV] “레벨이들 매력에 반해? 안 반해?” 움직이는 HD 포토 (레드벨벳:Red Velvet)',
'creator': 'Dispatch',
'view_count': int,
'subtitles': 'mincount:6',
'uploader_id': 'V__FRA08071',
'uploader_url': 'http://vlive.tv',
'uploader': None,
'upload_date': '20181130',
'thumbnail': r're:^https?://.*\.(?:jpg|png)$',
'timestamp': 1543601327,
'like_count': int,
'channel': 'Dispatch',
'channel_id': 'C796F3',
'comment_count': int,
'release_timestamp': 1543601040,
'release_date': '20181130',
'duration': 279,
},
'params': {
'skip_download': True,
},
}]
# The wayback machine has special timestamp and "mode" values:
# timestamp:
# 1 = the first capture
# 2 = the last capture
# mode:
# id_ = Identity - perform no alterations of the original resource, return it as it was archived.
_WAYBACK_BASE_URL = 'https://web.archive.org/web/2id_/'
def _download_archived_page(self, url, video_id, *, timestamp='2', **kwargs):
for retry in self.RetryManager():
try:
return self._download_webpage(f'https://web.archive.org/web/{timestamp}id_/{url}', video_id, **kwargs)
except ExtractorError as e:
if isinstance(e.cause, HTTPError) and e.cause.status == 404:
raise ExtractorError('Page was not archived', expected=True)
retry.error = e
continue
def _download_archived_json(self, url, video_id, **kwargs):
page = self._download_archived_page(url, video_id, **kwargs)
if not page:
raise ExtractorError('Page was not archived', expected=True)
else:
return self._parse_json(page, video_id)
def _extract_formats_from_m3u8(self, m3u8_url, params, video_id):
m3u8_doc = self._download_archived_page(m3u8_url, video_id, note='Downloading m3u8', query=params, fatal=False)
if not m3u8_doc:
return
# M3U8 document should be changed to archive domain
m3u8_doc = m3u8_doc.splitlines()
url_base = m3u8_url.rsplit('/', 1)[0]
first_segment = None
for i, line in enumerate(m3u8_doc):
if not line.startswith('#'):
m3u8_doc[i] = f'{self._WAYBACK_BASE_URL}{url_base}/{line}?{urllib.parse.urlencode(params)}'
first_segment = first_segment or m3u8_doc[i]
# Segments may not have been archived. See https://web.archive.org/web/20221127190050/http://www.vlive.tv/video/101870
urlh = self._request_webpage(HEADRequest(first_segment), video_id, errnote=False,
fatal=False, note='Check first segment availablity')
if urlh:
formats, subtitles = self._parse_m3u8_formats_and_subtitles('\n'.join(m3u8_doc), ext='mp4', video_id=video_id)
if subtitles:
self._report_ignoring_subs('m3u8')
return formats
# Closely follows the logic of the ArchiveTeam grab script
# See: https://github.com/ArchiveTeam/vlive-grab/blob/master/vlive.lua
def _real_extract(self, url):
video_id, url_date = self._match_valid_url(url).group('id', 'date')
webpage = self._download_archived_page(f'https://www.vlive.tv/video/{video_id}', video_id, timestamp=url_date)
player_info = self._search_json(r'__PRELOADED_STATE__\s*=', webpage, 'player info', video_id)
user_country = traverse_obj(player_info, ('common', 'userCountry'))
main_script_url = self._search_regex(r'<script\s+src="([^"]+/js/main\.[^"]+\.js)"', webpage, 'main script url')
main_script = self._download_archived_page(main_script_url, video_id, note='Downloading main script')
app_id = self._search_regex(r'appId\s*=\s*"([^"]+)"', main_script, 'app id')
inkey = self._download_archived_json(
f'https://www.vlive.tv/globalv-web/vam-web/video/v1.0/vod/{video_id}/inkey', video_id, note='Fetching inkey', query={
'appId': app_id,
'platformType': 'PC',
'gcc': user_country,
'locale': 'en_US',
}, fatal=False)
vod_id = traverse_obj(player_info, ('postDetail', 'post', 'officialVideo', 'vodId'))
vod_data = self._download_archived_json(
f'https://apis.naver.com/rmcnmv/rmcnmv/vod/play/v2.0/{vod_id}', video_id, note='Fetching vod data', query={
'key': inkey.get('inkey'),
'pid': 'rmcPlayer_16692457559726800', # partially unix time and partially random. Fixed value used by archiveteam project
'sid': '2024',
'ver': '2.0',
'devt': 'html5_pc',
'doct': 'json',
'ptc': 'https',
'sptc': 'https',
'cpt': 'vtt',
'ctls': '%7B%22visible%22%3A%7B%22fullscreen%22%3Atrue%2C%22logo%22%3Afalse%2C%22playbackRate%22%3Afalse%2C%22scrap%22%3Afalse%2C%22playCount%22%3Atrue%2C%22commentCount%22%3Atrue%2C%22title%22%3Atrue%2C%22writer%22%3Atrue%2C%22expand%22%3Afalse%2C%22subtitles%22%3Atrue%2C%22thumbnails%22%3Atrue%2C%22quality%22%3Atrue%2C%22setting%22%3Atrue%2C%22script%22%3Afalse%2C%22logoDimmed%22%3Atrue%2C%22badge%22%3Atrue%2C%22seekingTime%22%3Atrue%2C%22muted%22%3Atrue%2C%22muteButton%22%3Afalse%2C%22viewerNotice%22%3Afalse%2C%22linkCount%22%3Afalse%2C%22createTime%22%3Afalse%2C%22thumbnail%22%3Atrue%7D%2C%22clicked%22%3A%7B%22expand%22%3Afalse%2C%22subtitles%22%3Afalse%7D%7D',
'pv': '4.26.9',
'dr': '1920x1080',
'cpl': 'en_US',
'lc': 'en_US',
'adi': '%5B%7B%22type%22%3A%22pre%22%2C%22exposure%22%3Afalse%2C%22replayExposure%22%3Afalse%7D%5D',
'adu': '%2F',
'videoId': vod_id,
'cc': user_country,
})
formats = []
streams = traverse_obj(vod_data, ('streams', ...))
if len(streams) > 1:
self.report_warning('Multiple streams found. Only the first stream will be downloaded.')
stream = streams[0]
max_stream = max(
stream.get('videos') or [],
key=lambda v: traverse_obj(v, ('bitrate', 'video'), default=0), default=None)
if max_stream is not None:
params = {arg.get('name'): arg.get('value') for arg in stream.get('keys', []) if arg.get('type') == 'param'}
formats = self._extract_formats_from_m3u8(max_stream.get('source'), params, video_id) or []
# For parts of the project MP4 files were archived
max_video = max(
traverse_obj(vod_data, ('videos', 'list', ...)),
key=lambda v: traverse_obj(v, ('bitrate', 'video'), default=0), default=None)
if max_video is not None:
video_url = self._WAYBACK_BASE_URL + max_video.get('source')
urlh = self._request_webpage(HEADRequest(video_url), video_id, errnote=False,
fatal=False, note='Check video availablity')
if urlh:
formats.append({'url': video_url})
return {
'id': video_id,
'formats': formats,
**traverse_obj(player_info, ('postDetail', 'post', {
'title': ('officialVideo', 'title', {str}),
'creator': ('author', 'nickname', {str}),
'channel': ('channel', 'channelName', {str}),
'channel_id': ('channel', 'channelCode', {str}),
'duration': ('officialVideo', 'playTime', {int_or_none}),
'view_count': ('officialVideo', 'playCount', {int_or_none}),
'like_count': ('officialVideo', 'likeCount', {int_or_none}),
'comment_count': ('officialVideo', 'commentCount', {int_or_none}),
'timestamp': ('officialVideo', 'createdAt', {lambda x: int_or_none(x, scale=1000)}),
'release_timestamp': ('officialVideo', 'willStartAt', {lambda x: int_or_none(x, scale=1000)}),
})),
**traverse_obj(vod_data, ('meta', {
'uploader_id': ('user', 'id', {str}),
'uploader': ('user', 'name', {str}),
'uploader_url': ('user', 'url', {url_or_none}),
'thumbnail': ('cover', 'source', {url_or_none}),
}), expected_type=lambda x: x or None),
**NaverBaseIE.process_subtitles(vod_data, lambda x: [self._WAYBACK_BASE_URL + x]),
}


@@ -12,7 +12,7 @@ class AWSIE(InfoExtractor): # XXX: Conventionally, base classes should end with
     def _aws_execute_api(self, aws_dict, video_id, query=None):
         query = query or {}
-        amz_date = datetime.datetime.utcnow().strftime('%Y%m%dT%H%M%SZ')
+        amz_date = datetime.datetime.now(datetime.timezone.utc).strftime('%Y%m%dT%H%M%SZ')
         date = amz_date[:8]
         headers = {
             'Accept': 'application/json',

yt_dlp/extractor/axs.py (new file, 87 lines)

@ -0,0 +1,87 @@
from .common import InfoExtractor
from ..utils import (
float_or_none,
js_to_json,
parse_iso8601,
traverse_obj,
url_or_none,
)
class AxsIE(InfoExtractor):
IE_NAME = 'axs.tv'
_VALID_URL = r'https?://(?:www\.)?axs\.tv/(?:channel/(?:[^/?#]+/)+)?video/(?P<id>[^/?#]+)'
_TESTS = [{
'url': 'https://www.axs.tv/video/5f4dc776b70e4f1c194f22ef/',
'md5': '8d97736ae8e50c64df528e5e676778cf',
'info_dict': {
'id': '5f4dc776b70e4f1c194f22ef',
'title': 'Small Town',
'ext': 'mp4',
'description': 'md5:e314d28bfaa227a4d7ec965fae19997f',
'upload_date': '20230602',
'timestamp': 1685729564,
'duration': 1284.216,
'series': 'Rock & Roll Road Trip with Sammy Hagar',
'season': 2,
'episode': '3',
'thumbnail': 'https://images.dotstudiopro.com/5f4e9d330a0c3b295a7e8394',
},
}, {
'url': 'https://www.axs.tv/channel/rock-star-interview/video/daryl-hall',
'md5': '300ae795cd8f9984652c0949734ffbdc',
'info_dict': {
'id': '5f488148b70e4f392572977c',
'display_id': 'daryl-hall',
'title': 'Daryl Hall',
'ext': 'mp4',
'description': 'md5:e54ecaa0f4b5683fc9259e9e4b196628',
'upload_date': '20230214',
'timestamp': 1676403615,
'duration': 2570.668,
'series': 'The Big Interview with Dan Rather',
'season': 3,
'episode': '5',
'thumbnail': 'https://images.dotstudiopro.com/5f4d1901f340b50d937cec32',
},
}]
def _real_extract(self, url):
display_id = self._match_id(url)
webpage = self._download_webpage(url, display_id)
webpage_json_data = self._search_json(
r'mountObj\s*=', webpage, 'video ID data', display_id,
transform_source=js_to_json)
video_id = webpage_json_data['video_id']
company_id = webpage_json_data['company_id']
meta = self._download_json(
f'https://api.myspotlight.tv/dotplayer/video/{company_id}/{video_id}',
video_id, query={'device_type': 'desktop_web'})['video']
formats = self._extract_m3u8_formats(
meta['video_m3u8'], video_id, 'mp4', m3u8_id='hls')
subtitles = {}
for cc in traverse_obj(meta, ('closeCaption', lambda _, v: url_or_none(v['srtPath']))):
subtitles.setdefault(cc.get('srtShortLang') or 'en', []).append(
{'ext': cc.get('srtExt'), 'url': cc['srtPath']})
return {
'id': video_id,
'display_id': display_id,
'formats': formats,
**traverse_obj(meta, {
'title': ('title', {str}),
'description': ('description', {str}),
'series': ('seriestitle', {str}),
'season': ('season', {int}),
'episode': ('episode', {str}),
'duration': ('duration', {float_or_none}),
'timestamp': ('updated_at', {parse_iso8601}),
'thumbnail': ('thumb', {url_or_none}),
}),
'subtitles': subtitles,
}


@@ -31,7 +31,7 @@ def _extract_playlist(self, playlist_id):

 class BanByeIE(BanByeBaseIE):
-    _VALID_URL = r'https?://(?:www\.)?banbye.com/(?:en/)?watch/(?P<id>\w+)'
+    _VALID_URL = r'https?://(?:www\.)?banbye.com/(?:en/)?watch/(?P<id>[\w-]+)'
     _TESTS = [{
         'url': 'https://banbye.com/watch/v_ytfmvkVYLE8T',
         'md5': '2f4ea15c5ca259a73d909b2cfd558eb5',

@@ -59,7 +59,27 @@ class BanByeIE(BanByeBaseIE):
             'title': 'Krzysztof Karoń',
             'id': 'p_Ld82N6gBw_OJ',
         },
-        'playlist_count': 9,
+        'playlist_mincount': 9,
+    }, {
+        'url': 'https://banbye.com/watch/v_kb6_o1Kyq-CD',
+        'info_dict': {
+            'id': 'v_kb6_o1Kyq-CD',
+            'ext': 'mp4',
+            'title': 'Co tak naprawdę dzieje się we Francji?! Czy Warszawa a potem cała Polska będzie drugim Paryżem?!🤔🇵🇱',
+            'description': 'md5:82be4c0e13eae8ea1ca8b9f2e07226a8',
+            'uploader': 'Marcin Rola - MOIM ZDANIEM!🇵🇱',
+            'channel_id': 'ch_QgWnHvDG2fo5',
+            'channel_url': 'https://banbye.com/channel/ch_QgWnHvDG2fo5',
+            'duration': 597,
+            'timestamp': 1688642656,
+            'upload_date': '20230706',
+            'thumbnail': 'https://cdn.banbye.com/video/v_kb6_o1Kyq-CD/96.webp',
+            'tags': ['Paryż', 'Francja', 'Polska', 'Imigranci', 'Morawiecki', 'Tusk'],
+            'like_count': int,
+            'dislike_count': int,
+            'view_count': int,
+            'comment_count': int,
+        },
     }]

     def _real_extract(self, url):


@@ -15,11 +15,13 @@
     float_or_none,
     get_element_by_class,
     int_or_none,
+    join_nonempty,
     js_to_json,
     parse_duration,
     parse_iso8601,
     parse_qs,
     strip_or_none,
+    traverse_obj,
     try_get,
     unescapeHTML,
     unified_timestamp,

@@ -41,7 +43,6 @@ class BBCCoUkIE(InfoExtractor):
                         iplayer(?:/[^/]+)?/(?:episode/|playlist/)|
                         music/(?:clips|audiovideo/popular)[/#]|
                         radio/player/|
-                        sounds/play/|
                         events/[^/]+/play/[^/]+/
                     )
                     (?P<id>%s)(?!/(?:episodes|broadcasts|clips))

@@ -218,20 +219,6 @@ class BBCCoUkIE(InfoExtractor):
             # rtmp download
             'skip_download': True,
         },
-    }, {
-        'url': 'https://www.bbc.co.uk/sounds/play/m0007jzb',
-        'note': 'Audio',
-        'info_dict': {
-            'id': 'm0007jz9',
-            'ext': 'mp4',
-            'title': 'BBC Proms, 2019, Prom 34: West–Eastern Divan Orchestra',
-            'description': "Live BBC Proms. West–Eastern Divan Orchestra with Daniel Barenboim and Martha Argerich.",
-            'duration': 9840,
-        },
-        'params': {
-            # rtmp download
-            'skip_download': True,
-        }
     }, {
         'url': 'http://www.bbc.co.uk/iplayer/playlist/p01dvks4',
         'only_matching': True,

@@ -844,6 +831,20 @@ class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE
             'upload_date': '20190604',
             'categories': ['Psychology'],
         },
+    }, {
+        # BBC Sounds
+        'url': 'https://www.bbc.co.uk/sounds/play/m001q78b',
+        'info_dict': {
+            'id': 'm001q789',
+            'ext': 'mp4',
+            'title': 'The Night Tracks Mix - Music for the darkling hour',
+            'thumbnail': 'https://ichef.bbci.co.uk/images/ic/raw/p0c00hym.jpg',
+            'chapters': 'count:8',
+            'description': 'md5:815fb51cbdaa270040aab8145b3f1d67',
+            'uploader': 'Radio 3',
+            'duration': 1800,
+            'uploader_id': 'bbc_radio_three',
+        },
     }, { # onion routes
         'url': 'https://www.bbcnewsd73hkzno2ini43t4gblxvycyac5aw4gnv7t2rccijh7745uqd.onion/news/av/world-europe-63208576',
         'only_matching': True,

@@ -1128,6 +1129,13 @@ def _real_extract(self, url):
                 'uploader_id': network.get('id'),
                 'formats': formats,
                 'subtitles': subtitles,
+                'chapters': traverse_obj(preload_state, (
+                    'tracklist', 'tracks', lambda _, v: float_or_none(v['offset']['start']), {
+                        'title': ('titles', {lambda x: join_nonempty(
+                            'primary', 'secondary', 'tertiary', delim=' - ', from_dict=x)}),
+                        'start_time': ('offset', 'start', {float_or_none}),
+                        'end_time': ('offset', 'end', {float_or_none}),
+                    })) or None,
             }

         bbc3_config = self._parse_json(
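Note: a rough illustration (mine) of what the new 'chapters' traversal yields for one preload_state track entry; the exact tracklist shape is assumed from the code above:

track = {'titles': {'primary': 'Artist', 'secondary': 'Track title'},
         'offset': {'start': 60.0, 'end': 243.0}}
# -> {'title': 'Artist - Track title', 'start_time': 60.0, 'end_time': 243.0}
# Entries without a numeric offset.start are filtered out by the lambda predicate.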


@@ -1,6 +1,7 @@
 from .common import InfoExtractor
 from ..utils import (
     int_or_none,
+    traverse_obj,
     unescapeHTML,
 )

@@ -8,7 +9,8 @@
 class BildIE(InfoExtractor):
     _VALID_URL = r'https?://(?:www\.)?bild\.de/(?:[^/]+/)+(?P<display_id>[^/]+)-(?P<id>\d+)(?:,auto=true)?\.bild\.html'
     IE_DESC = 'Bild.de'
-    _TEST = {
+    _TESTS = [{
+        'note': 'static MP4 only',
         'url': 'http://www.bild.de/video/clip/apple-ipad-air/das-koennen-die-neuen-ipads-38184146.bild.html',
         'md5': 'dd495cbd99f2413502a1713a1156ac8a',
         'info_dict': {

@@ -19,7 +21,19 @@ class BildIE(InfoExtractor):
             'thumbnail': r're:^https?://.*\.jpg$',
             'duration': 196,
         }
-    }
+    }, {
+        'note': 'static MP4 and HLS',
+        'url': 'https://www.bild.de/video/clip/news-ausland/deftiger-abgang-vom-10m-turm-bademeister-sorgt-fuer-skandal-85158620.bild.html',
+        'md5': 'fb0ed4f09c495d4ba7ce2eee0bb90de1',
+        'info_dict': {
+            'id': '85158620',
+            'ext': 'mp4',
+            'title': 'Der Sprungturm-Skandal',
+            'description': 'md5:709b543c24dc31bbbffee73bccda34ad',
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'duration': 69,
+        }
+    }]

     def _real_extract(self, url):
         video_id = self._match_id(url)

@@ -27,11 +41,23 @@ def _real_extract(self, url):
         video_data = self._download_json(
             url.split('.bild.html')[0] + ',view=json.bild.html', video_id)

+        formats = []
+        for src in traverse_obj(video_data, ('clipList', 0, 'srces', lambda _, v: v['src'])):
+            src_type = src.get('type')
+            if src_type == 'application/x-mpegURL':
+                formats.extend(
+                    self._extract_m3u8_formats(
+                        src['src'], video_id, 'mp4', m3u8_id='hls', fatal=False))
+            elif src_type == 'video/mp4':
+                formats.append({'url': src['src'], 'format_id': 'http-mp4'})
+            else:
+                self.report_warning(f'Skipping unsupported format type: "{src_type}"')
+
         return {
             'id': video_id,
             'title': unescapeHTML(video_data['title']).strip(),
             'description': unescapeHTML(video_data.get('description')),
-            'url': video_data['clipList'][0]['srces'][0]['src'],
+            'formats': formats,
             'thumbnail': video_data.get('poster'),
             'duration': int_or_none(video_data.get('durationSec')),
         }


@@ -3,6 +3,7 @@
 import hashlib
 import itertools
 import math
+import re
 import time
 import urllib.parse

@@ -14,6 +15,7 @@
     GeoRestrictedError,
     InAdvancePagedList,
     OnDemandPagedList,
+    bool_or_none,
     filter_dict,
     float_or_none,
     format_field,

@@ -34,27 +36,31 @@
     unsmuggle_url,
     url_or_none,
     urlencode_postdata,
+    variadic,
 )


 class BilibiliBaseIE(InfoExtractor):
+    _FORMAT_ID_RE = re.compile(r'-(\d+)\.m4s\?')
+
     def extract_formats(self, play_info):
         format_names = {
             r['quality']: traverse_obj(r, 'new_description', 'display_desc')
             for r in traverse_obj(play_info, ('support_formats', lambda _, v: v['quality']))
         }

-        audios = traverse_obj(play_info, ('dash', 'audio', ...))
+        audios = traverse_obj(play_info, ('dash', (None, 'dolby'), 'audio', ..., {dict}))
         flac_audio = traverse_obj(play_info, ('dash', 'flac', 'audio'))
         if flac_audio:
             audios.append(flac_audio)
         formats = [{
             'url': traverse_obj(audio, 'baseUrl', 'base_url', 'url'),
             'ext': mimetype2ext(traverse_obj(audio, 'mimeType', 'mime_type')),
-            'acodec': audio.get('codecs'),
+            'acodec': traverse_obj(audio, ('codecs', {str.lower})),
             'vcodec': 'none',
             'tbr': float_or_none(audio.get('bandwidth'), scale=1000),
-            'filesize': int_or_none(audio.get('size'))
+            'filesize': int_or_none(audio.get('size')),
+            'format_id': str_or_none(audio.get('id')),
         } for audio in audios]

         formats.extend({

@@ -65,9 +71,13 @@ def extract_formats(self, play_info):
             'height': int_or_none(video.get('height')),
             'vcodec': video.get('codecs'),
             'acodec': 'none' if audios else None,
+            'dynamic_range': {126: 'DV', 125: 'HDR10'}.get(int_or_none(video.get('id'))),
             'tbr': float_or_none(video.get('bandwidth'), scale=1000),
             'filesize': int_or_none(video.get('size')),
             'quality': int_or_none(video.get('id')),
+            'format_id': traverse_obj(
+                video, (('baseUrl', 'base_url'), {self._FORMAT_ID_RE.search}, 1),
+                ('id', {str_or_none}), get_all=False),
             'format': format_names.get(video.get('id')),
         } for video in traverse_obj(play_info, ('dash', 'video', ...)))
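Note: the new format_id logic first tries the numeric ID embedded in the DASH segment URL, then falls back to the quality id. A worked example (the URL is invented):

import re

_FORMAT_ID_RE = re.compile(r'-(\d+)\.m4s\?')
url = 'https://upos-sz.bilivideo.com/upgcxcode/xx/yy-1-30280.m4s?e=abc'
match = _FORMAT_ID_RE.search(url)
print(match.group(1) if match else None)  # -> '30280'; otherwise video['id'] is used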
@@ -149,7 +159,7 @@ def _get_episodes_from_season(self, ss_id, url):

 class BiliBiliIE(BilibiliBaseIE):
-    _VALID_URL = r'https?://www\.bilibili\.com/(?:video/|festival/\w+\?(?:[^#]*&)?bvid=)[aAbB][vV](?P<id>[^/?#&]+)'
+    _VALID_URL = r'https?://(?:www\.)?bilibili\.com/(?:video/|festival/\w+\?(?:[^#]*&)?bvid=)[aAbB][vV](?P<id>[^/?#&]+)'

     _TESTS = [{
         'url': 'https://www.bilibili.com/video/BV13x41117TL',

@@ -245,7 +255,7 @@ class BiliBiliIE(BilibiliBaseIE):
             'description': 'md5:afde2b7ba9025c01d9e3dde10de221e4',
             'duration': 313.557,
             'upload_date': '20220709',
-            'uploader': '小夫Tech',
+            'uploader': '小夫太渴',
             'timestamp': 1657347907,
             'uploader_id': '1326814124',
             'comment_count': int,

@@ -502,7 +512,7 @@ def _real_extract(self, url):

 class BiliBiliBangumiMediaIE(BilibiliBaseIE):
-    _VALID_URL = r'https?://www\.bilibili\.com/bangumi/media/md(?P<id>\d+)'
+    _VALID_URL = r'https?://(?:www\.)?bilibili\.com/bangumi/media/md(?P<id>\d+)'
     _TESTS = [{
         'url': 'https://www.bilibili.com/bangumi/media/md24097891',
         'info_dict': {

@@ -521,7 +531,7 @@ def _real_extract(self, url):

 class BiliBiliBangumiSeasonIE(BilibiliBaseIE):
-    _VALID_URL = r'(?x)https?://www\.bilibili\.com/bangumi/play/ss(?P<id>\d+)'
+    _VALID_URL = r'(?x)https?://(?:www\.)?bilibili\.com/bangumi/play/ss(?P<id>\d+)'
     _TESTS = [{
         'url': 'https://www.bilibili.com/bangumi/play/ss26801',
         'info_dict': {

@@ -672,13 +682,35 @@ def get_entries(page_data):
         return self.playlist_result(paged_list, playlist_id)


-class BilibiliSpacePlaylistIE(BilibiliSpaceBaseIE):
-    _VALID_URL = r'https?://space.bilibili\.com/(?P<mid>\d+)/channel/collectiondetail\?sid=(?P<sid>\d+)'
+class BilibiliSpaceListBaseIE(BilibiliSpaceBaseIE):
+    def _get_entries(self, page_data, bvid_keys, ending_key='bvid'):
+        for bvid in traverse_obj(page_data, (*variadic(bvid_keys, (str, bytes, dict, set)), ..., ending_key, {str})):
+            yield self.url_result(f'https://www.bilibili.com/video/{bvid}', BiliBiliIE, bvid)
+
+    def _get_uploader(self, uid, playlist_id):
+        webpage = self._download_webpage(f'https://space.bilibili.com/{uid}', playlist_id, fatal=False)
+        return self._search_regex(r'(?s)<title\b[^>]*>([^<]+)的个人空间-', webpage, 'uploader', fatal=False)
+
+    def _extract_playlist(self, fetch_page, get_metadata, get_entries):
+        metadata, page_list = super()._extract_playlist(fetch_page, get_metadata, get_entries)
+        metadata.pop('page_count', None)
+        metadata.pop('page_size', None)
+        return metadata, page_list
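Note on `variadic` here (my gloss): it wraps a bare key in a tuple unless a tuple or list is already given, so both call styles spell the same traversal:

# _get_entries(data, 'archives')        -> traverse_obj(data, ('archives', ..., 'bvid', {str}))
# _get_entries(data, ('data', 'list'))  -> traverse_obj(data, ('data', 'list', ..., 'bvid', {str}))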
+class BilibiliCollectionListIE(BilibiliSpaceListBaseIE):
+    _VALID_URL = r'https?://space\.bilibili\.com/(?P<mid>\d+)/channel/collectiondetail/?\?sid=(?P<sid>\d+)'
     _TESTS = [{
         'url': 'https://space.bilibili.com/2142762/channel/collectiondetail?sid=57445',
         'info_dict': {
             'id': '2142762_57445',
-            'title': '《底特律 变人》'
+            'title': '【完结】《底特律 变人》全结局流程解说',
+            'description': '',
+            'uploader': '老戴在此',
+            'uploader_id': '2142762',
+            'timestamp': int,
+            'upload_date': str,
+            'thumbnail': 'https://archive.biliimg.com/bfs/archive/e0e543ae35ad3df863ea7dea526bc32e70f4c091.jpg',
         },
         'playlist_mincount': 31,
     }]

@@ -699,22 +731,251 @@ def get_metadata(page_data):
             return {
                 'page_count': math.ceil(entry_count / page_size),
                 'page_size': page_size,
-                'title': traverse_obj(page_data, ('meta', 'name'))
+                'uploader': self._get_uploader(mid, playlist_id),
+                **traverse_obj(page_data, {
+                    'title': ('meta', 'name', {str}),
+                    'description': ('meta', 'description', {str}),
+                    'uploader_id': ('meta', 'mid', {str_or_none}),
+                    'timestamp': ('meta', 'ptime', {int_or_none}),
+                    'thumbnail': ('meta', 'cover', {url_or_none}),
+                })
             }

         def get_entries(page_data):
-            for entry in page_data.get('archives', []):
-                yield self.url_result(f'https://www.bilibili.com/video/{entry["bvid"]}',
-                                      BiliBiliIE, entry['bvid'])
+            return self._get_entries(page_data, 'archives')

         metadata, paged_list = self._extract_playlist(fetch_page, get_metadata, get_entries)
-        return self.playlist_result(paged_list, playlist_id, metadata['title'])
+        return self.playlist_result(paged_list, playlist_id, **metadata)
class BilibiliSeriesListIE(BilibiliSpaceListBaseIE):
_VALID_URL = r'https?://space\.bilibili\.com/(?P<mid>\d+)/channel/seriesdetail/?\?\bsid=(?P<sid>\d+)'
_TESTS = [{
'url': 'https://space.bilibili.com/1958703906/channel/seriesdetail?sid=547718&ctype=0',
'info_dict': {
'id': '1958703906_547718',
'title': '直播回放',
'description': '直播回放',
'uploader': '靡烟miya',
'uploader_id': '1958703906',
'timestamp': 1637985853,
'upload_date': '20211127',
'modified_timestamp': int,
'modified_date': str,
},
'playlist_mincount': 513,
}]
def _real_extract(self, url):
mid, sid = self._match_valid_url(url).group('mid', 'sid')
playlist_id = f'{mid}_{sid}'
playlist_meta = traverse_obj(self._download_json(
f'https://api.bilibili.com/x/series/series?series_id={sid}', playlist_id, fatal=False
), {
'title': ('data', 'meta', 'name', {str}),
'description': ('data', 'meta', 'description', {str}),
'uploader_id': ('data', 'meta', 'mid', {str_or_none}),
'timestamp': ('data', 'meta', 'ctime', {int_or_none}),
'modified_timestamp': ('data', 'meta', 'mtime', {int_or_none}),
})
def fetch_page(page_idx):
return self._download_json(
'https://api.bilibili.com/x/series/archives',
playlist_id, note=f'Downloading page {page_idx}',
query={'mid': mid, 'series_id': sid, 'pn': page_idx + 1, 'ps': 30})['data']
def get_metadata(page_data):
page_size = page_data['page']['size']
entry_count = page_data['page']['total']
return {
'page_count': math.ceil(entry_count / page_size),
'page_size': page_size,
'uploader': self._get_uploader(mid, playlist_id),
**playlist_meta
}
def get_entries(page_data):
return self._get_entries(page_data, 'archives')
metadata, paged_list = self._extract_playlist(fetch_page, get_metadata, get_entries)
return self.playlist_result(paged_list, playlist_id, **metadata)
class BilibiliFavoritesListIE(BilibiliSpaceListBaseIE):
_VALID_URL = r'https?://(?:space\.bilibili\.com/\d+/favlist/?\?fid=|(?:www\.)?bilibili\.com/medialist/detail/ml)(?P<id>\d+)'
_TESTS = [{
'url': 'https://space.bilibili.com/84912/favlist?fid=1103407912&ftype=create',
'info_dict': {
'id': '1103407912',
'title': '【V2】',
'description': '',
'uploader': '晓月春日',
'uploader_id': '84912',
'timestamp': 1604905176,
'upload_date': '20201109',
'modified_timestamp': int,
'modified_date': str,
'thumbnail': r"re:http://i\d\.hdslb\.com/bfs/archive/14b83c62aa8871b79083df1e9ab4fbc699ad16fe\.jpg",
'view_count': int,
'like_count': int,
},
'playlist_mincount': 22,
}, {
'url': 'https://www.bilibili.com/medialist/detail/ml1103407912',
'only_matching': True,
}]
def _real_extract(self, url):
fid = self._match_id(url)
list_info = self._download_json(
f'https://api.bilibili.com/x/v3/fav/resource/list?media_id={fid}&pn=1&ps=20',
fid, note='Downloading favlist metadata')
if list_info['code'] == -403:
self.raise_login_required(msg='This is a private favorites list. You need to log in as its owner')
entries = self._get_entries(self._download_json(
f'https://api.bilibili.com/x/v3/fav/resource/ids?media_id={fid}',
fid, note='Download favlist entries'), 'data')
return self.playlist_result(entries, fid, **traverse_obj(list_info, ('data', 'info', {
'title': ('title', {str}),
'description': ('intro', {str}),
'uploader': ('upper', 'name', {str}),
'uploader_id': ('upper', 'mid', {str_or_none}),
'timestamp': ('ctime', {int_or_none}),
'modified_timestamp': ('mtime', {int_or_none}),
'thumbnail': ('cover', {url_or_none}),
'view_count': ('cnt_info', 'play', {int_or_none}),
'like_count': ('cnt_info', 'thumb_up', {int_or_none}),
})))
class BilibiliWatchlaterIE(BilibiliSpaceListBaseIE):
_VALID_URL = r'https?://(?:www\.)?bilibili\.com/watchlater/?(?:[?#]|$)'
_TESTS = [{
'url': 'https://www.bilibili.com/watchlater/#/list',
'info_dict': {'id': 'watchlater'},
'playlist_mincount': 0,
'skip': 'login required',
}]
def _real_extract(self, url):
list_id = getattr(self._get_cookies(url).get('DedeUserID'), 'value', 'watchlater')
watchlater_info = self._download_json(
'https://api.bilibili.com/x/v2/history/toview/web?jsonp=jsonp', list_id)
if watchlater_info['code'] == -101:
self.raise_login_required(msg='You need to login to access your watchlater list')
entries = self._get_entries(watchlater_info, ('data', 'list'))
return self.playlist_result(entries, id=list_id, title='稍后再看')
class BilibiliPlaylistIE(BilibiliSpaceListBaseIE):
_VALID_URL = r'https?://(?:www\.)?bilibili\.com/(?:medialist/play|list)/(?P<id>\w+)'
_TESTS = [{
'url': 'https://www.bilibili.com/list/1958703906?sid=547718',
'info_dict': {
'id': '5_547718',
'title': '直播回放',
'uploader': '靡烟miya',
'uploader_id': '1958703906',
'timestamp': 1637985853,
'upload_date': '20211127',
},
'playlist_mincount': 513,
}, {
'url': 'https://www.bilibili.com/medialist/play/1958703906?business=space_series&business_id=547718&desc=1',
'info_dict': {
'id': '5_547718',
},
'playlist_mincount': 513,
'skip': 'redirect url',
}, {
'url': 'https://www.bilibili.com/list/ml1103407912',
'info_dict': {
'id': '3_1103407912',
'title': '【V2】',
'uploader': '晓月春日',
'uploader_id': '84912',
'timestamp': 1604905176,
'upload_date': '20201109',
'thumbnail': r"re:http://i\d\.hdslb\.com/bfs/archive/14b83c62aa8871b79083df1e9ab4fbc699ad16fe\.jpg",
},
'playlist_mincount': 22,
}, {
'url': 'https://www.bilibili.com/medialist/play/ml1103407912',
'info_dict': {
'id': '3_1103407912',
},
'playlist_mincount': 22,
'skip': 'redirect url',
}, {
'url': 'https://www.bilibili.com/list/watchlater',
'info_dict': {'id': 'watchlater'},
'playlist_mincount': 0,
'skip': 'login required',
}, {
'url': 'https://www.bilibili.com/medialist/play/watchlater',
'info_dict': {'id': 'watchlater'},
'playlist_mincount': 0,
'skip': 'login required',
}]
def _extract_medialist(self, query, list_id):
for page_num in itertools.count(1):
page_data = self._download_json(
'https://api.bilibili.com/x/v2/medialist/resource/list',
list_id, query=query, note=f'getting playlist {query["biz_id"]} page {page_num}'
)['data']
yield from self._get_entries(page_data, 'media_list', ending_key='bv_id')
query['oid'] = traverse_obj(page_data, ('media_list', -1, 'id'))
if not page_data.get('has_more', False):
break
def _real_extract(self, url):
list_id = self._match_id(url)
webpage = self._download_webpage(url, list_id)
initial_state = self._search_json(r'window\.__INITIAL_STATE__\s*=', webpage, 'initial state', list_id)
if traverse_obj(initial_state, ('error', 'code', {int_or_none})) != 200:
error_code = traverse_obj(initial_state, ('error', 'trueCode', {int_or_none}))
error_message = traverse_obj(initial_state, ('error', 'message', {str_or_none}))
if error_code == -400 and list_id == 'watchlater':
self.raise_login_required('You need to login to access your watchlater playlist')
elif error_code == -403:
self.raise_login_required('This is a private playlist. You need to login as its owner')
elif error_code == 11010:
raise ExtractorError('Playlist is no longer available', expected=True)
raise ExtractorError(f'Could not access playlist: {error_code} {error_message}')
query = {
'ps': 20,
'with_current': False,
**traverse_obj(initial_state, {
'type': ('playlist', 'type', {int_or_none}),
'biz_id': ('playlist', 'id', {int_or_none}),
'tid': ('tid', {int_or_none}),
'sort_field': ('sortFiled', {int_or_none}),
'desc': ('desc', {bool_or_none}, {str_or_none}, {str.lower}),
})
}
metadata = {
'id': f'{query["type"]}_{query["biz_id"]}',
**traverse_obj(initial_state, ('mediaListInfo', {
'title': ('title', {str}),
'uploader': ('upper', 'name', {str}),
'uploader_id': ('upper', 'mid', {str_or_none}),
'timestamp': ('ctime', {int_or_none}),
'thumbnail': ('cover', {url_or_none}),
})),
}
return self.playlist_result(self._extract_medialist(query, list_id), **metadata)
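A self-contained sketch (hypothetical fetch function, invented data) of the cursor-style pagination that _extract_medialist implements above: each page reports the id of its last item, which becomes the `oid` cursor for the next request, until `has_more` turns false.

import itertools

def fetch(query):  # stand-in for the medialist API request
    items = [{'id': i, 'bv_id': f'BV{i}'} for i in range(query['oid'] + 1, query['oid'] + 3)]
    return {'media_list': items, 'has_more': items[-1]['id'] < 6}

def entries():
    query = {'oid': 0}
    for page_num in itertools.count(1):
        page = fetch(query)
        yield from (item['bv_id'] for item in page['media_list'])
        query['oid'] = page['media_list'][-1]['id']  # advance the cursor
        if not page.get('has_more'):
            break

print(list(entries()))  # ['BV1', ..., 'BV6'] across three pages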
class BilibiliCategoryIE(InfoExtractor):
IE_NAME = 'Bilibili category extractor'
_MAX_RESULTS = 1000000
-_VALID_URL = r'https?://www\.bilibili\.com/v/[a-zA-Z]+\/[a-zA-Z]+'
_VALID_URL = r'https?://(?:www\.)?bilibili\.com/v/[a-zA-Z]+\/[a-zA-Z]+'
_TESTS = [{
'url': 'https://www.bilibili.com/v/kichiku/mad',
'info_dict': {
@ -1399,7 +1660,7 @@ def _real_extract(self, url):
class BiliLiveIE(InfoExtractor):
-_VALID_URL = r'https?://live.bilibili.com/(?:blanc/)?(?P<id>\d+)'
_VALID_URL = r'https?://live\.bilibili\.com/(?:blanc/)?(?P<id>\d+)'
_TESTS = [{
'url': 'https://live.bilibili.com/196',


@ -1,56 +1,170 @@
import functools
import re

from .common import InfoExtractor
from ..utils import (
clean_html,
extract_attributes,
get_element_text_and_html_by_tag,
get_elements_by_class,
join_nonempty,
js_to_json,
-determine_ext,
mimetype2ext,
unified_strdate,
url_or_none,
urljoin,
variadic,
)
from ..utils.traversal import traverse_obj
def html_get_element(tag=None, cls=None):
assert tag or cls, 'One of tag or class is required'
if cls:
func = functools.partial(get_elements_by_class, cls, tag=tag)
else:
func = functools.partial(get_element_text_and_html_by_tag, tag)
def html_get_element_wrapper(html):
return variadic(func(html))[0]
return html_get_element_wrapper
class BpbIE(InfoExtractor):
IE_DESC = 'Bundeszentrale für politische Bildung'
-_VALID_URL = r'https?://(?:www\.)?bpb\.de/mediathek/(?P<id>[0-9]+)/'
_VALID_URL = r'https?://(?:www\.|m\.)?bpb\.de/(?:[^/?#]+/)*(?P<id>\d+)(?:[/?#]|$)'
-_TEST = {
_TESTS = [{
'url': 'http://www.bpb.de/mediathek/297/joachim-gauck-zu-1989-und-die-erinnerung-an-die-ddr',
-'md5': 'c4f84c8a8044ca9ff68bb8441d300b3f',
'info_dict': {
'id': '297',
'ext': 'mp4',
'creator': 'Kooperative Berlin',
'description': 'md5:f4f75885ba009d3e2b156247a8941ce6',
'release_date': '20160115',
'series': 'Interview auf dem Geschichtsforum 1989 | 2009',
'tags': ['Friedliche Revolution', 'Erinnerungskultur', 'Vergangenheitspolitik', 'DDR 1949 - 1990', 'Freiheitsrecht', 'BStU', 'Deutschland'],
'thumbnail': 'https://www.bpb.de/cache/images/7/297_teaser_16x9_1240.jpg?8839D',
'title': 'Joachim Gauck zu 1989 und die Erinnerung an die DDR',
-'description': 'Joachim Gauck, erster Beauftragter für die Stasi-Unterlagen, spricht auf dem Geschichtsforum über die friedliche Revolution 1989 und eine "gewisse Traurigkeit" im Umgang mit der DDR-Vergangenheit.'
'uploader': 'Bundeszentrale für politische Bildung',
},
}, {
'url': 'https://www.bpb.de/mediathek/video/522184/krieg-flucht-und-falschmeldungen-wirstattdesinformation-2/',
'info_dict': {
'id': '522184',
'ext': 'mp4',
'creator': 'Institute for Strategic Dialogue Germany gGmbH (ISD)',
'description': 'md5:f83c795ff8f825a69456a9e51fc15903',
'release_date': '20230621',
'tags': ['Desinformation', 'Ukraine', 'Russland', 'Geflüchtete'],
'thumbnail': 'https://www.bpb.de/cache/images/4/522184_teaser_16x9_1240.png?EABFB',
'title': 'md5:9b01ccdbf58dbf9e5c9f6e771a803b1c',
'uploader': 'Bundeszentrale für politische Bildung',
},
}, {
'url': 'https://www.bpb.de/lernen/bewegtbild-und-politische-bildung/webvideo/518789/krieg-flucht-und-falschmeldungen-wirstattdesinformation-1/',
'info_dict': {
'id': '518789',
'ext': 'mp4',
'creator': 'Institute for Strategic Dialogue Germany gGmbH (ISD)',
'description': 'md5:85228aed433e84ff0ff9bc582abd4ea8',
'release_date': '20230302',
'tags': ['Desinformation', 'Ukraine', 'Russland', 'Geflüchtete'],
'thumbnail': 'https://www.bpb.de/cache/images/9/518789_teaser_16x9_1240.jpeg?56D0D',
'title': 'md5:3e956f264bb501f6383f10495a401da4',
'uploader': 'Bundeszentrale für politische Bildung',
},
}, {
'url': 'https://www.bpb.de/mediathek/podcasts/apuz-podcast/539727/apuz-20-china/',
'only_matching': True,
}, {
'url': 'https://www.bpb.de/mediathek/audio/315813/folge-1-eine-einfuehrung/',
'info_dict': {
'id': '315813',
'ext': 'mp3',
'creator': 'Axel Schröder',
'description': 'md5:eda9d1af34e5912efef5baf54fba4427',
'release_date': '20200921',
'series': 'Auf Endlagersuche. Der deutsche Weg zu einem sicheren Atommülllager',
'tags': ['Atomenergie', 'Endlager', 'hoch-radioaktiver Abfall', 'Endlagersuche', 'Atommüll', 'Atomendlager', 'Gorleben', 'Deutschland'],
'thumbnail': 'https://www.bpb.de/cache/images/3/315813_teaser_16x9_1240.png?92A94',
'title': 'Folge 1: Eine Einführung',
'uploader': 'Bundeszentrale für politische Bildung',
},
}, {
'url': 'https://www.bpb.de/517806/die-weltanschauung-der-neuen-rechten/',
'info_dict': {
'id': '517806',
'ext': 'mp3',
'creator': 'Bundeszentrale für politische Bildung',
'description': 'md5:594689600e919912aade0b2871cc3fed',
'release_date': '20230127',
'series': 'Vorträge des Fachtags "Modernisierer. Grenzgänger. Anstifter. Sechs Jahrzehnte \'Neue Rechte\'"',
'tags': ['Rechtsextremismus', 'Konservatismus', 'Konservativismus', 'neue Rechte', 'Rechtspopulismus', 'Schnellroda', 'Deutschland'],
'thumbnail': 'https://www.bpb.de/cache/images/6/517806_teaser_16x9_1240.png?7A7A0',
'title': 'Die Weltanschauung der "Neuen Rechten"',
'uploader': 'Bundeszentrale für politische Bildung',
},
}, {
'url': 'https://www.bpb.de/mediathek/reihen/zahlen-und-fakten-soziale-situation-filme/520153/zahlen-und-fakten-die-soziale-situation-in-deutschland-migration/',
'only_matching': True,
}]
_TITLE_RE = re.compile('(?P<title>[^<]*)<[^>]+>(?P<series>[^<]*)')
def _parse_vue_attributes(self, name, string, video_id):
attributes = extract_attributes(self._search_regex(rf'(<{name}(?:"[^"]*?"|[^>])*>)', string, name))
for key, value in attributes.items():
if key.startswith(':'):
attributes[key] = self._parse_json(value, video_id, transform_source=js_to_json, fatal=False)
return attributes
@staticmethod
def _process_source(source):
url = url_or_none(source['src'])
if not url:
return None
source_type = source.get('type', '')
extension = mimetype2ext(source_type)
is_video = source_type.startswith('video')
note = url.rpartition('.')[0].rpartition('_')[2] if is_video else None
return {
'url': url,
'ext': extension,
'vcodec': None if is_video else 'none',
'quality': 10 if note == 'high' else 0,
'format_note': note,
'format_id': join_nonempty(extension, note),
}
def _real_extract(self, url):
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)

-title = self._html_search_regex(
-r'<h2 class="white">(.*?)</h2>', webpage, 'title')
-video_info_dicts = re.findall(
-r"({\s*src\s*:\s*'https?://film\.bpb\.de/[^}]+})", webpage)
-formats = []
-for video_info in video_info_dicts:
-video_info = self._parse_json(
-video_info, video_id, transform_source=js_to_json, fatal=False)
-if not video_info:
-continue
-video_url = video_info.get('src')
-if not video_url:
-continue
-quality = 'high' if '_high' in video_url else 'low'
-formats.append({
-'url': video_url,
-'quality': 10 if quality == 'high' else 0,
-'format_note': quality,
-'format_id': '%s-%s' % (quality, determine_ext(video_url)),
-})
title_result = traverse_obj(webpage, ({html_get_element(cls='opening-header__title')}, {self._TITLE_RE.match}))
json_lds = list(self._yield_json_ld(webpage, video_id, fatal=False))

return {
'id': video_id,
-'formats': formats,
-'title': title,
-'description': self._og_search_description(webpage),
'title': traverse_obj(title_result, ('title', {str.strip})) or None,
# This metadata could be interpreted otherwise, but it fits "series" the most
'series': traverse_obj(title_result, ('series', {str.strip})) or None,
'description': join_nonempty(*traverse_obj(webpage, [(
{html_get_element(cls='opening-intro')},
[{html_get_element(tag='bpb-accordion-item')}, {html_get_element(cls='text-content')}],
), {clean_html}]), delim='\n\n') or None,
'creator': self._html_search_meta('author', webpage),
'uploader': self._html_search_meta('publisher', webpage),
'release_date': unified_strdate(self._html_search_meta('date', webpage)),
'tags': traverse_obj(json_lds, (..., 'keywords', {lambda x: x.split(',')}, ...)),
**traverse_obj(self._parse_vue_attributes('bpb-player', webpage, video_id), {
'formats': (':sources', ..., {self._process_source}),
'thumbnail': ('poster', {lambda x: urljoin(url, x)}),
}),
}
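A hedged sketch (sample tag invented) of the _parse_vue_attributes technique above: Vue-style attributes whose names start with ':' carry JS object literals, which js_to_json turns into parseable JSON.

import json
from yt_dlp.utils import extract_attributes, js_to_json

tag = '<bpb-player :sources="[{src: \'a.mp4\', type: \'video/mp4\'}]" poster="/p.jpg">'
attrs = extract_attributes(tag)
# keys starting with ':' hold JS literals; plain keys stay as strings
attrs[':sources'] = json.loads(js_to_json(attrs[':sources']))
print(attrs[':sources'][0]['src'], attrs['poster'])  # a.mp4 /p.jpg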


@ -0,0 +1,39 @@
from .common import InfoExtractor
class Canal1IE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.|noticias\.)?canal1\.com\.co/(?:[^?#&])+/(?P<id>[\w-]+)'
_TESTS = [{
'url': 'https://canal1.com.co/noticias/napa-i-una-cadena-de-produccion-de-arroz-que-se-quedo-en-veremos-y-abandonada-en-el-departamento-del-choco/',
'info_dict': {
'id': '63b39f6b354977084b85ab54',
'display_id': 'napa-i-una-cadena-de-produccion-de-arroz-que-se-quedo-en-veremos-y-abandonada-en-el-departamento-del-choco',
'title': 'Ñapa I Una cadena de producción de arroz que se quedó en veremos y abandonada en el departamento del Chocó',
'description': 'md5:bc49c6d64d20610ea1e7daf079a0d013',
'thumbnail': r're:^https?://[^?#]+63b39f6b354977084b85ab54',
'ext': 'mp4',
},
}, {
'url': 'https://noticias.canal1.com.co/noticias/tres-i-el-triste-record-que-impuso-elon-musk-el-dueno-de-tesla-y-de-twitter/',
'info_dict': {
'id': '63b39e93f5fd223aa32250fb',
'display_id': 'tres-i-el-triste-record-que-impuso-elon-musk-el-dueno-de-tesla-y-de-twitter',
'title': 'Tres I El triste récord que impuso Elon Musk, el dueño de Tesla y de Twitter',
'description': 'md5:d9f691f131a21ce6767ca6c05d17d791',
'thumbnail': r're:^https?://[^?#]+63b39e93f5fd223aa32250fb',
'ext': 'mp4',
},
}, {
# Geo-restricted to Colombia
'url': 'https://canal1.com.co/programas/guerreros-canal-1/video-inedito-guerreros-despedida-kewin-zarate/',
'only_matching': True,
}]
def _real_extract(self, url):
display_id = self._match_id(url)
webpage = self._download_webpage(url, display_id)
return self.url_result(
self._search_regex(r'"embedUrl"\s*:\s*"([^"]+)', webpage, 'embed url'),
display_id=display_id, url_transparent=True)
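A small illustration (invented HTML) of the lookup in Canal1IE._real_extract above: the page's JSON-LD carries an "embedUrl" key whose value is handed to another extractor via url_result.

import re

webpage = '<script type="application/ld+json">{"embedUrl": "https://player.example/abc"}</script>'
embed_url = re.search(r'"embedUrl"\s*:\s*"([^"]+)', webpage).group(1)
print(embed_url)  # https://player.example/abc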


@ -0,0 +1,136 @@
import base64
import json
import uuid
from .common import InfoExtractor
from ..utils import (
int_or_none,
js_to_json,
traverse_obj,
urljoin,
)
class CaracolTvPlayIE(InfoExtractor):
_VALID_URL = r'https?://play\.caracoltv\.com/videoDetails/(?P<id>[^/?#]+)'
_NETRC_MACHINE = 'caracoltv-play'
_TESTS = [{
'url': 'https://play.caracoltv.com/videoDetails/OTo4NGFmNjUwOWQ2ZmM0NTg2YWRiOWU0MGNhOWViOWJkYQ==',
'info_dict': {
'id': 'OTo4NGFmNjUwOWQ2ZmM0NTg2YWRiOWU0MGNhOWViOWJkYQ==',
'title': 'La teoría del promedio',
'description': 'md5:1cdd6d2c13f19ef0d9649ab81a023ac3',
},
'playlist_count': 6,
}, {
'url': 'https://play.caracoltv.com/videoDetails/OTo3OWM4ZTliYzQxMmM0MTMxYTk4Mjk2YjdjNGQ4NGRkOQ==/ella?season=0',
'info_dict': {
'id': 'OTo3OWM4ZTliYzQxMmM0MTMxYTk4Mjk2YjdjNGQ4NGRkOQ==',
'title': 'Ella',
'description': 'md5:a639b1feb5ddcc0cff92a489b4e544b8',
},
'playlist_count': 10,
}, {
'url': 'https://play.caracoltv.com/videoDetails/OTpiYTY1YTVmOTI5MzI0ZWJhOGZiY2Y3MmRlOWZlYmJkOA==/la-vuelta-al-mundo-en-80-risas-2022?season=0',
'info_dict': {
'id': 'OTpiYTY1YTVmOTI5MzI0ZWJhOGZiY2Y3MmRlOWZlYmJkOA==',
'title': 'La vuelta al mundo en 80 risas 2022',
'description': 'md5:e97aac36106e5c37ebf947b3350106a4',
},
'playlist_count': 17,
}, {
'url': 'https://play.caracoltv.com/videoDetails/MzoxX3BwbjRmNjB1',
'only_matching': True,
}]
_USER_TOKEN = None
def _extract_app_token(self, webpage):
config_js_path = self._search_regex(
r'<script[^>]+src\s*=\s*"([^"]+coreConfig.js[^"]+)', webpage, 'config js url', fatal=False)
mediation_config = {} if not config_js_path else self._search_json(
r'mediation\s*:', self._download_webpage(
urljoin('https://play.caracoltv.com/', config_js_path), None, fatal=False, note='Extracting JS config'),
'mediation_config', None, transform_source=js_to_json, fatal=False)
key = traverse_obj(
mediation_config, ('live', 'key')) or '795cd9c089a1fc48094524a5eba85a3fca1331817c802f601735907c8bbb4f50'
secret = traverse_obj(
mediation_config, ('live', 'secret')) or '64dec00a6989ba83d087621465b5e5d38bdac22033b0613b659c442c78976fa0'
return base64.b64encode(f'{key}:{secret}'.encode()).decode()
def _perform_login(self, email, password):
webpage = self._download_webpage('https://play.caracoltv.com/', None, fatal=False)
app_token = self._extract_app_token(webpage)
bearer_token = self._download_json(
'https://eu-gateway.inmobly.com/applications/oauth', None, data=b'', note='Retrieving bearer token',
headers={'Authorization': f'Basic {app_token}'})['token']
self._USER_TOKEN = self._download_json(
'https://eu-gateway.inmobly.com/user/login', None, note='Performing login', headers={
'Content-Type': 'application/json',
'Authorization': f'Bearer {bearer_token}',
}, data=json.dumps({
'device_data': {
'device_id': str(uuid.uuid4()),
'device_token': '',
'device_type': 'web'
},
'login_data': {
'enabled': True,
'email': email,
'password': password,
}
}).encode())['user_token']
def _extract_video(self, video_data, series_id=None, season_id=None, season_number=None):
formats, subtitles = self._extract_m3u8_formats_and_subtitles(video_data['stream_url'], series_id, 'mp4')
return {
'id': video_data['id'],
'title': video_data.get('name'),
'description': video_data.get('description'),
'formats': formats,
'subtitles': subtitles,
'thumbnails': traverse_obj(
video_data, ('extra_thumbs', ..., {'url': 'thumb_url', 'height': 'height', 'width': 'width'})),
'series_id': series_id,
'season_id': season_id,
'season_number': int_or_none(season_number),
'episode_number': int_or_none(video_data.get('item_order')),
'is_live': video_data.get('entry_type') == 3,
}
def _extract_series_seasons(self, seasons, series_id):
for season in seasons:
api_response = self._download_json(
'https://eu-gateway.inmobly.com/feed', series_id, query={'season_id': season['id']},
headers={'Authorization': f'Bearer {self._USER_TOKEN}'})
season_number = season.get('order')
for episode in api_response['items']:
yield self._extract_video(episode, series_id, season['id'], season_number)
def _real_extract(self, url):
series_id = self._match_id(url)
if self._USER_TOKEN is None:
self._perform_login('guest@inmobly.com', 'Test@gus1')
api_response = self._download_json(
'https://eu-gateway.inmobly.com/feed', series_id, query={'include_ids': series_id},
headers={'Authorization': f'Bearer {self._USER_TOKEN}'})['items'][0]
if not api_response.get('seasons'):
return self._extract_video(api_response)
return self.playlist_result(
self._extract_series_seasons(api_response['seasons'], series_id),
series_id, **traverse_obj(api_response, {
'title': 'name',
'description': 'description',
}))
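For reference, a sketch (demo values, not the real credentials) of the Basic-auth string built in _extract_app_token above: key and secret are joined with ':' and base64-encoded.

import base64

key, secret = 'demo-key', 'demo-secret'
token = base64.b64encode(f'{key}:{secret}'.encode()).decode()
print(token)  # ZGVtby1rZXk6ZGVtby1zZWNyZXQ=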


@ -339,12 +339,12 @@ def _new_claims_token(self, email, password):
data = json.dumps({'jwt': sig}).encode()
headers = {'content-type': 'application/json', 'ott-device-type': 'web'}
resp = self._download_json('https://services.radio-canada.ca/ott/cbc-api/v2/token',
-None, data=data, headers=headers)
None, data=data, headers=headers, expected_status=426)
cbc_access_token = resp['accessToken']

headers = {'content-type': 'application/json', 'ott-device-type': 'web', 'ott-access-token': cbc_access_token}
resp = self._download_json('https://services.radio-canada.ca/ott/cbc-api/v2/profile',
-None, headers=headers)
None, headers=headers, expected_status=426)
return resp['claimsToken']

def _get_claims_token_expiry(self):


@ -90,10 +90,17 @@ class CCCPlaylistIE(InfoExtractor):
'id': '30c3',
},
'playlist_count': 135,
}, {
'url': 'https://media.ccc.de/c/DS2023',
'info_dict': {
'title': 'Datenspuren 2023',
'id': 'DS2023',
},
'playlist_count': 37
}]

def _real_extract(self, url):
-playlist_id = self._match_id(url).lower()
playlist_id = self._match_id(url)
conf = self._download_json(
'https://media.ccc.de/public/conferences/' + playlist_id,


@ -1,31 +1,72 @@
import time
import hashlib
-import re
import urllib
import uuid

from .common import InfoExtractor
from .openload import PhantomJSwrapper
from ..utils import (
ExtractorError,
UserNotLive,
determine_ext,
int_or_none,
js_to_json,
parse_resolution,
str_or_none,
traverse_obj,
unescapeHTML,
-unified_strdate,
url_or_none,
urlencode_postdata,
urljoin,
)
-class DouyuTVIE(InfoExtractor):
-IE_DESC = '斗鱼'
class DouyuBaseIE(InfoExtractor):
def _download_cryptojs_md5(self, video_id):
for url in [
'https://cdnjs.cloudflare.com/ajax/libs/crypto-js/3.1.2/rollups/md5.js',
'https://cdn.bootcdn.net/ajax/libs/crypto-js/3.1.2/rollups/md5.js',
]:
js_code = self._download_webpage(
url, video_id, note='Downloading signing dependency', fatal=False)
if js_code:
self.cache.store('douyu', 'crypto-js-md5', js_code)
return js_code
raise ExtractorError('Unable to download JS dependency (crypto-js/md5)')
def _get_cryptojs_md5(self, video_id):
return self.cache.load('douyu', 'crypto-js-md5') or self._download_cryptojs_md5(video_id)
def _calc_sign(self, sign_func, video_id, a):
b = uuid.uuid4().hex
c = round(time.time())
js_script = f'{self._get_cryptojs_md5(video_id)};{sign_func};console.log(ub98484234("{a}","{b}","{c}"))'
phantom = PhantomJSwrapper(self)
result = phantom.execute(js_script, video_id,
note='Executing JS signing script').strip()
return {i: v[0] for i, v in urllib.parse.parse_qs(result).items()}
def _search_js_sign_func(self, webpage, fatal=True):
# The greedy look-behind ensures last possible script tag is matched
return self._search_regex(
r'(?:<script.*)?<script[^>]*>(.*?ub98484234.*?)</script>', webpage, 'JS sign func', fatal=fatal)
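A short sketch (made-up output string) of the last step in _calc_sign above: the signing script prints a query string, which urllib.parse.parse_qs turns into the form fields for the getH5Play request.

import urllib.parse

result = 'v=220120222&did=abc123&tt=1700000000&sign=deadbeef'
fields = {k: v[0] for k, v in urllib.parse.parse_qs(result).items()}
print(fields['sign'])  # deadbeef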
class DouyuTVIE(DouyuBaseIE):
IE_DESC = '斗鱼直播'
_VALID_URL = r'https?://(?:www\.)?douyu(?:tv)?\.com/(topic/\w+\?rid=|(?:[^/]+/))*(?P<id>[A-Za-z0-9]+)'
_TESTS = [{
-'url': 'http://www.douyutv.com/iseven',
'url': 'https://www.douyu.com/pigff',
'info_dict': {
-'id': '17732',
'id': '24422',
-'display_id': 'iseven',
'display_id': 'pigff',
-'ext': 'flv',
'ext': 'mp4',
-'title': 're:^清晨醒脑!根本停不下来! [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',
'title': 're:^【PIGFF】.* [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',
-'description': r're:.*m7show@163\.com.*',
'description': r'≥15级牌子看鱼吧置顶帖进粉丝vx群',
-'thumbnail': r're:^https?://.*\.png',
'thumbnail': str,
-'uploader': '7师傅',
'uploader': 'pigff',
'is_live': True,
'live_status': 'is_live',
},
'params': {
'skip_download': True,
@ -85,15 +126,43 @@ class DouyuTVIE(InfoExtractor):
'only_matching': True,
}]
def _get_sign_func(self, room_id, video_id):
return self._download_json(
f'https://www.douyu.com/swf_api/homeH5Enc?rids={room_id}', video_id,
note='Getting signing script')['data'][f'room{room_id}']
def _extract_stream_formats(self, stream_formats):
formats = []
for stream_info in traverse_obj(stream_formats, (..., 'data')):
stream_url = urljoin(
traverse_obj(stream_info, 'rtmp_url'), traverse_obj(stream_info, 'rtmp_live'))
if stream_url:
rate_id = traverse_obj(stream_info, ('rate', {int_or_none}))
rate_info = traverse_obj(stream_info, ('multirates', lambda _, v: v['rate'] == rate_id), get_all=False)
ext = determine_ext(stream_url)
formats.append({
'url': stream_url,
'format_id': str_or_none(rate_id),
'ext': 'mp4' if ext == 'm3u8' else ext,
'protocol': 'm3u8_native' if ext == 'm3u8' else 'https',
'quality': rate_id % -10000 if rate_id is not None else None,
**traverse_obj(rate_info, {
'format': ('name', {str_or_none}),
'tbr': ('bit', {int_or_none}),
}),
})
return formats
def _real_extract(self, url):
video_id = self._match_id(url)
-if video_id.isdigit():
-room_id = video_id
-else:
-page = self._download_webpage(url, video_id)
-room_id = self._html_search_regex(
-r'"room_id\\?"\s*:\s*(\d+),', page, 'room id')
webpage = self._download_webpage(url, video_id)
room_id = self._search_regex(r'\$ROOM\.room_id\s*=\s*(\d+)', webpage, 'room id')

if self._search_regex(r'"videoLoop"\s*:\s*(\d+)', webpage, 'loop', default='') == '1':
raise UserNotLive('The channel is auto-playing VODs', video_id=video_id)
if self._search_regex(r'\$ROOM\.show_status\s*=\s*(\d+)', webpage, 'status', default='') == '2':
raise UserNotLive(video_id=video_id)

# Grab metadata from API
params = {
@ -102,110 +171,136 @@ def _real_extract(self, url):
'time': int(time.time()),
}
params['auth'] = hashlib.md5(
-f'room/{video_id}?{urllib.parse.urlencode(params)}zNzMV1y4EMxOHS6I5WKm'.encode()).hexdigest()
f'room/{room_id}?{urllib.parse.urlencode(params)}zNzMV1y4EMxOHS6I5WKm'.encode()).hexdigest()
-room = self._download_json(
room = traverse_obj(self._download_json(
f'http://www.douyutv.com/api/v1/room/{room_id}', video_id,
-note='Downloading room info', query=params)['data']
note='Downloading room info', query=params, fatal=False), 'data')

# 1 = live, 2 = offline
-if room.get('show_status') == '2':
-raise ExtractorError('Live stream is offline', expected=True)
if traverse_obj(room, 'show_status') == '2':
raise UserNotLive(video_id=video_id)

-video_url = urljoin('https://hls3-akm.douyucdn.cn/', self._search_regex(r'(live/.*)', room['hls_url'], 'URL'))
-formats, subs = self._extract_m3u8_formats_and_subtitles(video_url, room_id)
-title = unescapeHTML(room['room_name'])
-description = room.get('show_details')
-thumbnail = room.get('room_src')
-uploader = room.get('nickname')
js_sign_func = self._search_js_sign_func(webpage, fatal=False) or self._get_sign_func(room_id, video_id)
form_data = {
'rate': 0,
**self._calc_sign(js_sign_func, video_id, room_id),
}
stream_formats = [self._download_json(
f'https://www.douyu.com/lapi/live/getH5Play/{room_id}',
video_id, note="Downloading livestream format",
data=urlencode_postdata(form_data))]

for rate_id in traverse_obj(stream_formats[0], ('data', 'multirates', ..., 'rate')):
if rate_id != traverse_obj(stream_formats[0], ('data', 'rate')):
form_data['rate'] = rate_id
stream_formats.append(self._download_json(
f'https://www.douyu.com/lapi/live/getH5Play/{room_id}',
video_id, note=f'Downloading livestream format {rate_id}',
data=urlencode_postdata(form_data)))

return {
'id': room_id,
-'display_id': video_id,
-'title': title,
-'description': description,
-'thumbnail': thumbnail,
-'uploader': uploader,
-'subtitles': subs,
-'formats': formats,
'formats': self._extract_stream_formats(stream_formats),
'is_live': True,
**traverse_obj(room, {
'display_id': ('url', {str}, {lambda i: i[1:]}),
'title': ('room_name', {unescapeHTML}),
'description': ('show_details', {str}),
'uploader': ('nickname', {str}),
'thumbnail': ('room_src', {url_or_none}),
})
}
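A note in sketch form on the `rate_id % -10000` trick used by _extract_stream_formats above: Python's modulo takes the sign of the divisor, so rate 0 (the source stream) keeps quality 0 while every other rate id maps to a large negative value, ordered among themselves by rate id.

for rate_id in (0, 2, 3, 4):
    print(rate_id, rate_id % -10000)  # 0 -> 0, 2 -> -9998, 3 -> -9997, 4 -> -9996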
-class DouyuShowIE(InfoExtractor):
class DouyuShowIE(DouyuBaseIE):
_VALID_URL = r'https?://v(?:mobile)?\.douyu\.com/show/(?P<id>[0-9a-zA-Z]+)'
_TESTS = [{
-'url': 'https://v.douyu.com/show/rjNBdvnVXNzvE2yw',
-'md5': '0c2cfd068ee2afe657801269b2d86214',
'url': 'https://v.douyu.com/show/mPyq7oVNe5Yv1gLY',
'info_dict': {
-'id': 'rjNBdvnVXNzvE2yw',
'id': 'mPyq7oVNe5Yv1gLY',
'ext': 'mp4',
-'title': '陈一发儿:砒霜 我有个室友系列04-01 22点场',
'title': '四川人小时候的味道“蒜苗回锅肉”,传统菜不能丢,要常做来吃',
-'duration': 7150.08,
'duration': 633,
-'thumbnail': r're:^https?://.*\.jpg$',
'thumbnail': str,
-'uploader': '陈一发儿',
'uploader': '美食作家王刚V',
-'uploader_id': 'XrZwYelr5wbK',
'uploader_id': 'OVAO4NVx1m7Q',
-'uploader_url': 'https://v.douyu.com/author/XrZwYelr5wbK',
-'upload_date': '20170402',
'timestamp': 1661850002,
'upload_date': '20220830',
'view_count': int,
'tags': ['美食', '美食综合'],
},
}, {
'url': 'https://vmobile.douyu.com/show/rjNBdvnVXNzvE2yw',
'only_matching': True,
}]
_FORMATS = {
'super': '原画',
'high': '超清',
'normal': '高清',
}
_QUALITIES = {
'super': -1,
'high': -2,
'normal': -3,
}
_RESOLUTIONS = {
'super': '1920x1080',
'high': '1280x720',
'normal': '852x480',
}
def _real_extract(self, url):
url = url.replace('vmobile.', 'v.')
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)

-room_info = self._parse_json(self._search_regex(
-r'var\s+\$ROOM\s*=\s*({.+});', webpage, 'room info'), video_id)
-video_info = None
-for trial in range(5):
-# Sometimes Douyu rejects our request. Let's try it more times
-try:
-video_info = self._download_json(
-'https://vmobile.douyu.com/video/getInfo', video_id,
-query={'vid': video_id},
-headers={
-'Referer': url,
-'x-requested-with': 'XMLHttpRequest',
-})
-break
-except ExtractorError:
-self._sleep(1, video_id)
-if not video_info:
-raise ExtractorError('Can\'t fetch video info')
-formats = self._extract_m3u8_formats(
-video_info['data']['video_url'], video_id,
-entry_protocol='m3u8_native', ext='mp4')
-upload_date = unified_strdate(self._html_search_regex(
-r'<em>上传时间:</em><span>([^<]+)</span>', webpage,
-'upload date', fatal=False))
-uploader = uploader_id = uploader_url = None
-mobj = re.search(
-r'(?m)<a[^>]+href="/author/([0-9a-zA-Z]+)".+?<strong[^>]+title="([^"]+)"',
-webpage)
-if mobj:
-uploader_id, uploader = mobj.groups()
-uploader_url = urljoin(url, '/author/' + uploader_id)
video_info = self._search_json(
r'<script>\s*window\.\$DATA\s*=', webpage,
'video info', video_id, transform_source=js_to_json)

js_sign_func = self._search_js_sign_func(webpage)
form_data = {
'vid': video_id,
**self._calc_sign(js_sign_func, video_id, video_info['ROOM']['point_id']),
}
url_info = self._download_json(
'https://v.douyu.com/api/stream/getStreamUrl', video_id,
data=urlencode_postdata(form_data), note="Downloading video formats")

formats = []
for name, url in traverse_obj(url_info, ('data', 'thumb_video', {dict.items}, ...)):
video_url = traverse_obj(url, ('url', {url_or_none}))
if video_url:
ext = determine_ext(video_url)
formats.append({
'format': self._FORMATS.get(name),
'format_id': name,
'url': video_url,
'quality': self._QUALITIES.get(name),
'ext': 'mp4' if ext == 'm3u8' else ext,
'protocol': 'm3u8_native' if ext == 'm3u8' else 'https',
**parse_resolution(self._RESOLUTIONS.get(name))
})
else:
self.to_screen(
f'"{self._FORMATS.get(name, name)}" format may require logging in. {self._login_hint()}')

return {
'id': video_id,
-'title': room_info['name'],
'formats': formats,
-'duration': room_info.get('duration'),
-'thumbnail': room_info.get('pic'),
-'upload_date': upload_date,
-'uploader': uploader,
-'uploader_id': uploader_id,
-'uploader_url': uploader_url,
**traverse_obj(video_info, ('DATA', {
'title': ('content', 'title', {str}),
'uploader': ('content', 'author', {str}),
'uploader_id': ('content', 'up_id', {str_or_none}),
'duration': ('content', 'video_duration', {int_or_none}),
'thumbnail': ('content', 'video_pic', {url_or_none}),
'timestamp': ('content', 'create_time', {int_or_none}),
'view_count': ('content', 'view_num', {int_or_none}),
'tags': ('videoTag', ..., 'tagName', {str}),
}))
}

yt_dlp/extractor/eplus.py (new file, 96 lines)

@ -0,0 +1,96 @@
from .common import InfoExtractor
from ..utils import (
ExtractorError,
try_call,
unified_timestamp,
)
class EplusIbIE(InfoExtractor):
IE_NAME = 'eplus:inbound'
IE_DESC = 'e+ (イープラス) overseas'
_VALID_URL = r'https?://live\.eplus\.jp/ex/player\?ib=(?P<id>(?:\w|%2B|%2F){86}%3D%3D)'
_TESTS = [{
'url': 'https://live.eplus.jp/ex/player?ib=YEFxb3Vyc2Dombnjg7blkrLlrablnJLjgrnjgq%2Fjg7zjg6vjgqLjgqTjg4njg6vlkIzlpb3kvJpgTGllbGxhIQ%3D%3D',
'info_dict': {
'id': '354502-0001-002',
'title': 'LoveLive!Series Presents COUNTDOWN LoveLive! 2021→2022LIVE with a smile!【Streaming+(配信)】',
'live_status': 'was_live',
'release_date': '20211231',
'release_timestamp': 1640952000,
'description': str,
},
'params': {
'skip_download': True,
'ignore_no_formats_error': True,
},
'expected_warnings': [
'Could not find the playlist URL. This event may not be accessible',
'No video formats found!',
'Requested format is not available',
],
}]
def _real_extract(self, url):
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
data_json = self._search_json(r'<script>\s*var app\s*=', webpage, 'data json', video_id)
delivery_status = data_json.get('delivery_status')
archive_mode = data_json.get('archive_mode')
release_timestamp = try_call(lambda: unified_timestamp(data_json['event_datetime']) - 32400)
release_timestamp_str = data_json.get('event_datetime_text') # JST
self.write_debug(f'delivery_status = {delivery_status}, archive_mode = {archive_mode}')
if delivery_status == 'PREPARING':
live_status = 'is_upcoming'
elif delivery_status == 'STARTED':
live_status = 'is_live'
elif delivery_status == 'STOPPED':
if archive_mode != 'ON':
raise ExtractorError(
'This event has ended and there is no archive for this event', expected=True)
live_status = 'post_live'
elif delivery_status == 'WAIT_CONFIRM_ARCHIVED':
live_status = 'post_live'
elif delivery_status == 'CONFIRMED_ARCHIVE':
live_status = 'was_live'
else:
self.report_warning(f'Unknown delivery_status {delivery_status}, treat it as a live')
live_status = 'is_live'
formats = []
m3u8_playlist_urls = self._search_json(
r'var listChannels\s*=', webpage, 'hls URLs', video_id, contains_pattern=r'\[.+\]', default=[])
if not m3u8_playlist_urls:
if live_status == 'is_upcoming':
self.raise_no_formats(
f'Could not find the playlist URL. This live event will begin at {release_timestamp_str} JST', expected=True)
else:
self.raise_no_formats(
'Could not find the playlist URL. This event may not be accessible', expected=True)
elif live_status == 'is_upcoming':
self.raise_no_formats(f'This live event will begin at {release_timestamp_str} JST', expected=True)
elif live_status == 'post_live':
self.raise_no_formats('This event has ended, and the archive will be available shortly', expected=True)
else:
for m3u8_playlist_url in m3u8_playlist_urls:
formats.extend(self._extract_m3u8_formats(m3u8_playlist_url, video_id))
# FIXME: HTTP request headers need to be updated to continue download
warning = 'Due to technical limitations, the download will be interrupted after one hour'
if live_status == 'is_live':
self.report_warning(warning)
elif live_status == 'was_live':
self.report_warning(f'{warning}. You can restart to continue the download')
return {
'id': data_json['app_id'],
'title': data_json.get('app_name'),
'formats': formats,
'live_status': live_status,
'description': data_json.get('content'),
'release_timestamp': release_timestamp,
}
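A sketch of the timezone handling above: event_datetime is reported in JST (UTC+9), so 9 * 3600 = 32400 seconds are subtracted to obtain a UTC epoch value. The sample string mirrors the test data.

from yt_dlp.utils import unified_timestamp

jst_text = '2021/12/31 21:00:00'  # assumed JST wall-clock time
print(unified_timestamp(jst_text) - 32400)  # 1640952000 == 2021-12-31T12:00:00Z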


@ -11,8 +11,8 @@ class ExpressenIE(InfoExtractor):
_VALID_URL = r'''(?x)
https?://
(?:www\.)?(?:expressen|di)\.se/
-(?:(?:tvspelare/video|videoplayer/embed)/)?
(?:(?:tvspelare/video|video-?player/embed)/)?
-tv/(?:[^/]+/)*
(?:tv|nyheter)/(?:[^/?#]+/)*
(?P<id>[^/?#&]+)
'''
_EMBED_REGEX = [r'<iframe[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//(?:www\.)?(?:expressen|di)\.se/(?:tvspelare/video|videoplayer/embed)/tv/.+?)\1']
@ -42,6 +42,12 @@ class ExpressenIE(InfoExtractor):
}, {
'url': 'https://www.di.se/videoplayer/embed/tv/ditv/borsmorgon/implantica-rusar-70--under-borspremiaren-hor-styrelsemedlemmen/?embed=true&external=true&autoplay=true&startVolume=0&partnerId=di',
'only_matching': True,
}, {
'url': 'https://www.expressen.se/video-player/embed/tv/nyheter/ekero-fodda-olof-gustafsson-forvaltar-knarkbaronen-pablo-escobars-namn',
'only_matching': True,
}, {
'url': 'https://www.expressen.se/nyheter/efter-egna-telefonbluffen-escobar-stammer-klarna/',
'only_matching': True,
}]

def _real_extract(self, url):


@ -74,6 +74,22 @@ class FacebookIE(InfoExtractor):
_VIDEO_PAGE_TAHOE_TEMPLATE = 'https://www.facebook.com/video/tahoe/async/%s/?chain=true&isvideo=true&payloadtype=primary'

_TESTS = [{
'url': 'https://www.facebook.com/radiokicksfm/videos/3676516585958356/',
'info_dict': {
'id': '3676516585958356',
'ext': 'mp4',
'title': 'dr Adam Przygoda',
'description': 'md5:34675bda53336b1d16400265c2bb9b3b',
'uploader': 'RADIO KICKS FM',
'upload_date': '20230818',
'timestamp': 1692346159,
'thumbnail': r're:^https?://.*',
'uploader_id': '100063551323670',
'duration': 3132.184,
'view_count': int,
'concurrent_view_count': 0,
},
}, {
'url': 'https://www.facebook.com/video.php?v=637842556329505&fref=nf',
'md5': '6a40d33c0eccbb1af76cf0485a052659',
'info_dict': {
@ -97,7 +113,7 @@ class FacebookIE(InfoExtractor):
'upload_date': '20140506',
'timestamp': 1399398998,
'thumbnail': r're:^https?://.*',
-'uploader_id': 'pfbid04scW44U4P9iTyLZAGy8y8W3pR3i2VugvHCimiRudUAVbN3MPp9eXBaYFcgVworZwl',
'uploader_id': 'pfbid028wxorhX2ErLFJ578N6P3crHD3PHmXTCqCvfBpsnbSLmbokwSY75p5hWBjHGkG4zxl',
'duration': 131.03,
'concurrent_view_count': int,
},
@ -179,7 +195,7 @@ class FacebookIE(InfoExtractor):
'timestamp': 1486648217,
'upload_date': '20170209',
'uploader': 'Yaroslav Korpan',
-'uploader_id': 'pfbid029y8j22EwH3ikeqgH3SEP9G3CAi9kmWKgXJJG9s5geV7mo3J2bvURqHCdgucRgAyhl',
'uploader_id': 'pfbid06AScABAWcW91qpiuGrLt99Ef9tvwHoXP6t8KeFYEqkSfreMtfa9nTveh8b2ZEVSWl',
'concurrent_view_count': int,
'thumbnail': r're:^https?://.*',
'view_count': int,
@ -274,7 +290,7 @@ class FacebookIE(InfoExtractor):
'title': 'Josef',
'thumbnail': r're:^https?://.*',
'concurrent_view_count': int,
-'uploader_id': 'pfbid02gXHbDwxumkaKJQaTGUf3znYfYzTuidGEWawiramNx4YamSj2afwYSRkpcjtHtMRJl',
'uploader_id': 'pfbid0cibUN6tV7DYgdbJdsUFN46wc4jKpVSPAvJQhFofGqBGmVn3V3JtAs2tfUwziw2hUl',
'timestamp': 1549275572,
'duration': 3.413,
'uploader': 'Josef Novak',
@ -401,9 +417,9 @@ def _extract_from_url(self, url, video_id):
def extract_metadata(webpage):
post_data = [self._parse_json(j, video_id, fatal=False) for j in re.findall(
-r'handleWithCustomApplyEach\(\s*ScheduledApplyEach\s*,\s*(\{.+?\})\s*\);', webpage)]
r'data-sjs>({.*?ScheduledServerJS.*?})</script>', webpage)]
post = traverse_obj(post_data, (
-..., 'require', ..., ..., ..., '__bbox', 'result', 'data'), expected_type=dict) or []
..., 'require', ..., ..., ..., '__bbox', 'require', ..., ..., ..., '__bbox', 'result', 'data'), expected_type=dict) or []
media = traverse_obj(post, (..., 'attachments', ..., lambda k, v: (
k == 'media' and str(v['id']) == video_id and v['__typename'] == 'Video')), expected_type=dict)
title = get_first(media, ('title', 'text'))
@ -489,18 +505,17 @@ def process_formats(info):
# with non-browser User-Agent.
for f in info['formats']:
f.setdefault('http_headers', {})['User-Agent'] = 'facebookexternalhit/1.1'
info['_format_sort_fields'] = ('res', 'quality')

def extract_relay_data(_filter):
return self._parse_json(self._search_regex(
-r'handleWithCustomApplyEach\([^,]+,\s*({.*?%s.*?})\);' % _filter,
r'data-sjs>({.*?%s.*?})</script>' % _filter,
webpage, 'replay data', default='{}'), video_id, fatal=False) or {}

def extract_relay_prefetched_data(_filter):
-replay_data = extract_relay_data(_filter)
-for require in (replay_data.get('require') or []):
-if require[0] == 'RelayPrefetchedStreamCache':
-return try_get(require, lambda x: x[3][1]['__bbox']['result']['data'], dict) or {}
return traverse_obj(extract_relay_data(_filter), (
'require', (None, (..., ..., ..., '__bbox', 'require')),
lambda _, v: 'RelayPrefetchedStreamCache' in v, ..., ...,
'__bbox', 'result', 'data', {dict}), get_all=False) or {}
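A simplified sketch (invented payload; the real path also tries a nested '__bbox' require) of the branching traversal that replaces the old loop: the lambda keeps require entries mentioning RelayPrefetchedStreamCache, the `...` steps descend into them, and get_all=False returns the first '__bbox' result.

from yt_dlp.utils import traverse_obj

data = {'require': [['RelayPrefetchedStreamCache', None, None,
                     [None, {'__bbox': {'result': {'data': {'id': '42'}}}}]]]}
found = traverse_obj(data, (
    'require', lambda _, v: 'RelayPrefetchedStreamCache' in v, ..., ...,
    '__bbox', 'result', 'data', {dict}), get_all=False)
print(found)  # {'id': '42'}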
if not video_data:
server_js_data = self._parse_json(self._search_regex([
@ -511,7 +526,7 @@ def extract_relay_prefetched_data(_filter):
if not video_data:
data = extract_relay_prefetched_data(
-r'"(?:dash_manifest|playable_url(?:_quality_hd)?)"\s*:\s*"[^"]+"')
r'"(?:dash_manifest|playable_url(?:_quality_hd)?)')
if data:
entries = []
@ -526,7 +541,8 @@ def parse_graphql_video(video):
formats = []
q = qualities(['sd', 'hd'])
for key, format_id in (('playable_url', 'sd'), ('playable_url_quality_hd', 'hd'),
-('playable_url_dash', '')):
('playable_url_dash', ''), ('browser_native_hd_url', 'hd'),
('browser_native_sd_url', 'sd')):
playable_url = video.get(key)
if not playable_url:
continue
@ -535,7 +551,8 @@ def parse_graphql_video(video):
else:
formats.append({
'format_id': format_id,
-'quality': q(format_id),
# sd, hd formats w/o resolution info should be deprioritized below DASH
'quality': q(format_id) - 3,
'url': playable_url,
})
extract_dash_manifest(video, formats)
@ -702,9 +719,11 @@ def parse_attachment(attachment, key='media'):
for src_type in ('src', 'src_no_ratelimit'):
src = f[0].get('%s_%s' % (quality, src_type))
if src:
-preference = -10 if format_id == 'progressive' else -1
# sd, hd formats w/o resolution info should be deprioritized below DASH
# TODO: investigate if progressive or src formats still exist
preference = -10 if format_id == 'progressive' else -3
if quality == 'hd':
-preference += 5
preference += 1
formats.append({
'format_id': '%s_%s_%s' % (format_id, quality, src_type),
'url': src,


@ -60,6 +60,7 @@ class Funker530IE(InfoExtractor):
def _real_extract(self, url):
display_id = self._match_id(url)
webpage = self._download_webpage(url, display_id)
info = {}
rumble_url = list(RumbleEmbedIE._extract_embed_urls(url, webpage))
if rumble_url:
info = {'url': rumble_url[0], 'ie_key': RumbleEmbedIE.ie_key()}


@ -2370,7 +2370,7 @@ def _extract_kvs(self, url, webpage, video_id):
'id': flashvars['video_id'],
'display_id': display_id,
'title': title,
-'thumbnail': thumbnail,
'thumbnail': urljoin(url, thumbnail),
'formats': formats,
}


@ -66,7 +66,7 @@ def _entries(self, file_id):
query_params = {
'contentId': file_id,
'token': self._TOKEN,
-'websiteToken': 12345,
'websiteToken': '7fd94ds12fds4',  # From https://gofile.io/dist/js/alljs.js
}
password = self.get_param('videopassword')
if password:


@ -383,9 +383,9 @@ def __get_current_timestamp():
months = [None, 'Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
days = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']
-time_now = datetime.datetime.utcnow()
time_now = datetime.datetime.now(datetime.timezone.utc)
format_string = "{} {} {} %H:%M:%S UTC %Y".format(days[time_now.weekday()], months[time_now.month], time_now.day)
-time_string = datetime.datetime.utcnow().strftime(format_string)
time_string = time_now.strftime(format_string)
return time_string
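A quick sketch of why this change matters: utcnow() returns a naive datetime (and is deprecated since Python 3.12), while now(timezone.utc) is timezone-aware; the strftime output is unchanged.

import datetime

aware = datetime.datetime.now(datetime.timezone.utc)
print(aware.tzinfo)  # UTC
print(datetime.datetime.utcnow().tzinfo)  # None (naive)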
def __str__(self):


@ -1,9 +1,9 @@
from .common import InfoExtractor
-from ..compat import compat_str
from ..utils import (
int_or_none,
parse_age_limit,
parse_iso8601,
time_seconds,
update_url_query,
)
@ -11,15 +11,14 @@
class IndavideoEmbedIE(InfoExtractor):
_VALID_URL = r'https?://(?:(?:embed\.)?indavideo\.hu/player/video/|assets\.indavideo\.hu/swf/player\.swf\?.*\b(?:v(?:ID|id))=)(?P<id>[\da-f]+)'
# Some example URLs covered by generic extractor:
-# http://indavideo.hu/video/Vicces_cica_1
-# http://index.indavideo.hu/video/2015_0728_beregszasz
-# http://auto.indavideo.hu/video/Sajat_utanfutoban_a_kis_tacsko
-# http://erotika.indavideo.hu/video/Amator_tini_punci
-# http://film.indavideo.hu/video/f_hrom_nagymamm_volt
-# http://palyazat.indavideo.hu/video/Embertelen_dal_Dodgem_egyuttes
-_EMBED_REGEX = [r'<iframe[^>]+\bsrc=["\'](?P<url>(?:https?:)?//embed\.indavideo\.hu/player/video/[\da-f]+)']
# https://indavideo.hu/video/Vicces_cica_1
# https://index.indavideo.hu/video/Hod_Nemetorszagban
# https://auto.indavideo.hu/video/Sajat_utanfutoban_a_kis_tacsko
# https://film.indavideo.hu/video/f_farkaslesen
# https://palyazat.indavideo.hu/video/Embertelen_dal_Dodgem_egyuttes
_EMBED_REGEX = [r'<iframe[^>]+\bsrc=["\'](?P<url>(?:https?:)//embed\.indavideo\.hu/player/video/[\da-f]+)']
_TESTS = [{
-'url': 'http://indavideo.hu/player/video/1bdc3c6d80/',
'url': 'https://indavideo.hu/player/video/1bdc3c6d80/',
'md5': 'c8a507a1c7410685f83a06eaeeaafeab',
'info_dict': {
'id': '1837039',
@ -36,21 +35,33 @@ class IndavideoEmbedIE(InfoExtractor):
'tags': ['tánc', 'cica', 'cuki', 'cukiajanlo', 'newsroom'],
},
}, {
-'url': 'http://embed.indavideo.hu/player/video/1bdc3c6d80?autostart=1&hide=1',
'url': 'https://embed.indavideo.hu/player/video/1bdc3c6d80?autostart=1&hide=1',
'only_matching': True,
-}, {
-'url': 'http://assets.indavideo.hu/swf/player.swf?v=fe25e500&vID=1bdc3c6d80&autostart=1&hide=1&i=1',
-'only_matching': True,
}]
_WEBPAGE_TESTS = [{
'url': 'https://indavideo.hu/video/Vicces_cica_1',
'info_dict': {
'id': '1335611',
'ext': 'mp4',
'title': 'Vicces cica',
'description': 'Játszik a tablettel. :D',
'thumbnail': r're:^https?://.*\.jpg$',
'uploader': 'Jet_Pack',
'uploader_id': '491217',
'timestamp': 1390821212,
'upload_date': '20140127',
'duration': 7,
'age_limit': 0,
'tags': ['cica', 'Jet_Pack'],
},
}]
def _real_extract(self, url):
video_id = self._match_id(url)
video = self._download_json(
-'https://amfphp.indavideo.hu/SYm0json.php/player.playerHandler.getVideoData/%s' % video_id,
-video_id)['data']
-title = video['title']
f'https://amfphp.indavideo.hu/SYm0json.php/player.playerHandler.getVideoData/{video_id}/',
video_id, query={'_': time_seconds()})['data']

video_urls = []
@ -60,33 +71,21 @@ def _real_extract(self, url):
elif isinstance(video_files, dict):
video_urls.extend(video_files.values())
-video_file = video.get('video_file')
-if video:
-video_urls.append(video_file)
video_urls = list(set(video_urls))
-video_prefix = video_urls[0].rsplit('/', 1)[0]
-for flv_file in video.get('flv_files', []):
-flv_url = '%s/%s' % (video_prefix, flv_file)
-if flv_url not in video_urls:
-video_urls.append(flv_url)
-filesh = video.get('filesh')
filesh = video.get('filesh') or {}

formats = []
for video_url in video_urls:
height = int_or_none(self._search_regex(
r'\.(\d{3,4})\.mp4(?:\?|$)', video_url, 'height', default=None))
-if filesh:
-if not height:
-continue
-token = filesh.get(compat_str(height))
-if token is None:
-continue
-video_url = update_url_query(video_url, {'token': token})
if not height and len(filesh) == 1:
height = int_or_none(list(filesh.keys())[0])
token = filesh.get(str(height))
if token is None:
continue
formats.append({
-'url': video_url,
'url': update_url_query(video_url, {'token': token}),
'height': height,
})
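A minimal sketch (invented data) of the filesh token lookup above: each rendition height maps to a token that must be appended to the stream URL.

from yt_dlp.utils import update_url_query

filesh = {'360': 'tok360', '720': 'tok720'}
video_url = 'https://example.com/v.720.mp4'
print(update_url_query(video_url, {'token': filesh['720']}))
# https://example.com/v.720.mp4?token=tok720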
@ -103,7 +102,7 @@ def _real_extract(self, url):
return {
'id': video.get('id') or video_id,
-'title': title,
'title': video.get('title'),
'description': video.get('description'),
'thumbnails': thumbnails,
'uploader': video.get('user_name'),


@ -57,8 +57,8 @@ class LecturioIE(LecturioBaseIE):
_VALID_URL = r'''(?x)
https://
(?:
-app\.lecturio\.com/([^/]+/(?P<nt>[^/?#&]+)\.lecture|(?:\#/)?lecture/c/\d+/(?P<id>\d+))|
app\.lecturio\.com/([^/?#]+/(?P<nt>[^/?#&]+)\.lecture|(?:\#/)?lecture/c/\d+/(?P<id>\d+))|
-(?:www\.)?lecturio\.de/[^/]+/(?P<nt_de>[^/?#&]+)\.vortrag
(?:www\.)?lecturio\.de/(?:[^/?#]+/)+(?P<nt_de>[^/?#&]+)\.vortrag
)
'''
_TESTS = [{
@ -73,6 +73,9 @@ class LecturioIE(LecturioBaseIE):
}, {
'url': 'https://www.lecturio.de/jura/oeffentliches-recht-staatsexamen.vortrag',
'only_matching': True,
}, {
'url': 'https://www.lecturio.de/jura/oeffentliches-recht-at-1-staatsexamen/oeffentliches-recht-staatsexamen.vortrag',
'only_matching': True,
}, {
'url': 'https://app.lecturio.com/#/lecture/c/6434/39634',
'only_matching': True,


@ -17,11 +17,12 @@ class MassengeschmackTVIE(InfoExtractor):
_TEST = {
'url': 'https://massengeschmack.tv/play/fktv202',
-'md5': 'a9e054db9c2b5a08f0a0527cc201e8d3',
'md5': '9996f314994a49fefe5f39aa1b07ae21',
'info_dict': {
'id': 'fktv202',
'ext': 'mp4',
-'title': 'Fernsehkritik-TV - Folge 202',
'title': 'Fernsehkritik-TV #202',
'thumbnail': 'https://cache.massengeschmack.tv/img/mag/fktv202.jpg'
},
}
@ -29,9 +30,6 @@ def _real_extract(self, url):
episode = self._match_id(url)
webpage = self._download_webpage(url, episode)
-title = clean_html(self._html_search_regex(
-'<h3>([^<]+)</h3>', webpage, 'title'))
-thumbnail = self._search_regex(r'POSTER\s*=\s*"([^"]+)', webpage, 'thumbnail', fatal=False)
sources = self._parse_json(self._search_regex(r'(?s)MEDIA\s*=\s*(\[.+?\]);', webpage, 'media'), episode, js_to_json)

formats = []
@ -67,7 +65,8 @@ def _real_extract(self, url):
return { return {
'id': episode, 'id': episode,
'title': title, 'title': clean_html(self._html_search_regex(
r'<span[^>]+\bid=["\']clip-title["\'][^>]*>([^<]+)', webpage, 'title', fatal=False)),
'formats': formats, 'formats': formats,
'thumbnail': thumbnail, 'thumbnail': self._search_regex(r'POSTER\s*=\s*"([^"]+)', webpage, 'thumbnail', fatal=False),
} }
@ -1,5 +1,8 @@
from ..utils import ( from ..utils import (
unified_strdate ExtractorError,
traverse_obj,
unified_strdate,
url_or_none,
) )
from .common import InfoExtractor from .common import InfoExtractor
from ..compat import ( from ..compat import (
@ -15,7 +18,7 @@ class MediaKlikkIE(InfoExtractor):
(?P<id>[^/#?_]+)''' (?P<id>[^/#?_]+)'''
_TESTS = [{ _TESTS = [{
# mediaklikk. date in html. # (old) mediaklikk. date in html.
'url': 'https://mediaklikk.hu/video/hazajaro-delnyugat-bacska-a-duna-menten-palankatol-doroszloig/', 'url': 'https://mediaklikk.hu/video/hazajaro-delnyugat-bacska-a-duna-menten-palankatol-doroszloig/',
'info_dict': { 'info_dict': {
'id': '4754129', 'id': '4754129',
@ -23,9 +26,21 @@ class MediaKlikkIE(InfoExtractor):
'ext': 'mp4', 'ext': 'mp4',
'upload_date': '20210901', 'upload_date': '20210901',
'thumbnail': 'http://mediaklikk.hu/wp-content/uploads/sites/4/2014/02/hazajarouj_JO.jpg' 'thumbnail': 'http://mediaklikk.hu/wp-content/uploads/sites/4/2014/02/hazajarouj_JO.jpg'
},
'skip': 'Webpage redirects to 404 page',
}, {
# mediaklikk. date in html.
'url': 'https://mediaklikk.hu/video/hazajaro-fabova-hegyseg-kishont-koronaja/',
'info_dict': {
'id': '6696133',
'title': 'Hazajáró, Fabova-hegység - Kishont koronája',
'display_id': 'hazajaro-fabova-hegyseg-kishont-koronaja',
'ext': 'mp4',
'upload_date': '20230903',
'thumbnail': 'https://mediaklikk.hu/wp-content/uploads/sites/4/2014/02/hazajarouj_JO.jpg'
} }
}, { }, {
# m4sport # (old) m4sport
'url': 'https://m4sport.hu/video/2021/08/30/gyemant-liga-parizs/', 'url': 'https://m4sport.hu/video/2021/08/30/gyemant-liga-parizs/',
'info_dict': { 'info_dict': {
'id': '4754999', 'id': '4754999',
@ -33,6 +48,18 @@ class MediaKlikkIE(InfoExtractor):
'ext': 'mp4', 'ext': 'mp4',
'upload_date': '20210830', 'upload_date': '20210830',
'thumbnail': 'http://m4sport.hu/wp-content/uploads/sites/4/2021/08/vlcsnap-2021-08-30-18h21m20s10-1024x576.jpg' 'thumbnail': 'http://m4sport.hu/wp-content/uploads/sites/4/2021/08/vlcsnap-2021-08-30-18h21m20s10-1024x576.jpg'
},
'skip': 'Webpage redirects to 404 page',
}, {
# m4sport
'url': 'https://m4sport.hu/sportkozvetitesek/video/2023/09/08/atletika-gyemant-liga-brusszel/',
'info_dict': {
'id': '6711136',
'title': 'Atlétika Gyémánt Liga, Brüsszel',
'display_id': 'atletika-gyemant-liga-brusszel',
'ext': 'mp4',
'upload_date': '20230908',
'thumbnail': 'https://m4sport.hu/wp-content/uploads/sites/4/2023/09/vlcsnap-2023-09-08-22h43m18s691.jpg'
} }
}, { }, {
# m4sport with *video/ url and no date # m4sport with *video/ url and no date
@ -40,20 +67,33 @@ class MediaKlikkIE(InfoExtractor):
'info_dict': { 'info_dict': {
'id': '4492099', 'id': '4492099',
'title': 'Real Madrid - Chelsea 1-1', 'title': 'Real Madrid - Chelsea 1-1',
'display_id': 'real-madrid-chelsea-1-1',
'ext': 'mp4', 'ext': 'mp4',
'thumbnail': 'http://m4sport.hu/wp-content/uploads/sites/4/2021/04/Sequence-01.Still001-1024x576.png' 'thumbnail': 'https://m4sport.hu/wp-content/uploads/sites/4/2021/04/Sequence-01.Still001-1024x576.png'
} }
}, { }, {
# hirado # (old) hirado
'url': 'https://hirado.hu/videok/felteteleket-szabott-a-fovaros/', 'url': 'https://hirado.hu/videok/felteteleket-szabott-a-fovaros/',
'info_dict': { 'info_dict': {
'id': '4760120', 'id': '4760120',
'title': 'Feltételeket szabott a főváros', 'title': 'Feltételeket szabott a főváros',
'ext': 'mp4', 'ext': 'mp4',
'thumbnail': 'http://hirado.hu/wp-content/uploads/sites/4/2021/09/vlcsnap-2021-09-01-20h20m37s165.jpg' 'thumbnail': 'http://hirado.hu/wp-content/uploads/sites/4/2021/09/vlcsnap-2021-09-01-20h20m37s165.jpg'
},
'skip': 'Webpage redirects to video list page',
}, {
# hirado
'url': 'https://hirado.hu/belfold/video/2023/09/11/marad-az-eves-elszamolas-a-napelemekre-beruhazo-csaladoknal',
'info_dict': {
'id': '6716068',
'title': 'Marad az éves elszámolás a napelemekre beruházó családoknál',
'display_id': 'marad-az-eves-elszamolas-a-napelemekre-beruhazo-csaladoknal',
'ext': 'mp4',
'upload_date': '20230911',
'thumbnail': 'https://hirado.hu/wp-content/uploads/sites/4/2023/09/vlcsnap-2023-09-11-09h16m09s882.jpg'
} }
}, { }, {
# petofilive # (old) petofilive
'url': 'https://petofilive.hu/video/2021/06/07/tha-shudras-az-akusztikban/', 'url': 'https://petofilive.hu/video/2021/06/07/tha-shudras-az-akusztikban/',
'info_dict': { 'info_dict': {
'id': '4571948', 'id': '4571948',
@ -61,6 +101,18 @@ class MediaKlikkIE(InfoExtractor):
'ext': 'mp4', 'ext': 'mp4',
'upload_date': '20210607', 'upload_date': '20210607',
'thumbnail': 'http://petofilive.hu/wp-content/uploads/sites/4/2021/06/vlcsnap-2021-06-07-22h14m23s915-1024x576.jpg' 'thumbnail': 'http://petofilive.hu/wp-content/uploads/sites/4/2021/06/vlcsnap-2021-06-07-22h14m23s915-1024x576.jpg'
},
'skip': 'Webpage redirects to empty page',
}, {
# petofilive
'url': 'https://petofilive.hu/video/2023/09/09/futball-fesztival-a-margitszigeten/',
'info_dict': {
'id': '6713233',
'title': 'Futball Fesztivál a Margitszigeten',
'display_id': 'futball-fesztival-a-margitszigeten',
'ext': 'mp4',
'upload_date': '20230909',
'thumbnail': 'https://petofilive.hu/wp-content/uploads/sites/4/2023/09/Clipboard11-2.jpg'
} }
}] }]
@ -84,8 +136,12 @@ def _real_extract(self, url):
player_data['video'] = player_data.pop('token') player_data['video'] = player_data.pop('token')
player_page = self._download_webpage('https://player.mediaklikk.hu/playernew/player.php', video_id, query=player_data) player_page = self._download_webpage('https://player.mediaklikk.hu/playernew/player.php', video_id, query=player_data)
playlist_url = self._proto_relative_url(compat_urllib_parse_unquote( player_json = self._search_json(
self._html_search_regex(r'\"file\":\s*\"(\\?/\\?/.*playlist\.m3u8)\"', player_page, 'playlist_url')).replace('\\/', '/')) r'\bpl\.setup\s*\(', player_page, 'player json', video_id, end_pattern=r'\);')
playlist_url = traverse_obj(
player_json, ('playlist', lambda _, v: v['type'] == 'hls', 'file', {url_or_none}), get_all=False)
if not playlist_url:
raise ExtractorError('Unable to extract playlist url')
formats = self._extract_wowza_formats( formats = self._extract_wowza_formats(
playlist_url, video_id, skip_protocols=['f4m', 'smil', 'dash']) playlist_url, video_id, skip_protocols=['f4m', 'smil', 'dash'])
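
Rather than regexing an escaped URL out of the MediaKlikk player page, the new code parses the `pl.setup(...)` argument with `_search_json` and walks it with `traverse_obj`, keeping the first `type == 'hls'` entry whose `file` is a valid URL. The traversal in isolation, on a hypothetical player config:

```python
from yt_dlp.utils import traverse_obj, url_or_none

player_json = {  # hypothetical shape of the pl.setup() argument
    'playlist': [
        {'type': 'dash', 'file': 'https://player.mediaklikk.hu/a.mpd'},
        {'type': 'hls', 'file': 'https://player.mediaklikk.hu/a/playlist.m3u8'},
    ],
}
playlist_url = traverse_obj(
    player_json,
    ('playlist', lambda _, v: v['type'] == 'hls', 'file', {url_or_none}),
    get_all=False)  # first match only
assert playlist_url.endswith('playlist.m3u8')
```
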
@ -14,7 +14,7 @@ class MediaStreamBaseIE(InfoExtractor):
_BASE_URL_RE = r'https?://mdstrm\.com/(?:embed|live-stream)' _BASE_URL_RE = r'https?://mdstrm\.com/(?:embed|live-stream)'
def _extract_mediastream_urls(self, webpage): def _extract_mediastream_urls(self, webpage):
yield from traverse_obj(list(self._yield_json_ld(webpage, None)), ( yield from traverse_obj(list(self._yield_json_ld(webpage, None, fatal=False)), (
lambda _, v: v['@type'] == 'VideoObject', ('embedUrl', 'contentUrl'), lambda _, v: v['@type'] == 'VideoObject', ('embedUrl', 'contentUrl'),
{lambda x: x if re.match(rf'{self._BASE_URL_RE}/\w+', x) else None})) {lambda x: x if re.match(rf'{self._BASE_URL_RE}/\w+', x) else None}))
@ -106,8 +106,12 @@ def _real_extract(self, url):
video_id = self._match_id(url) video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id) webpage = self._download_webpage(url, video_id)
if 'Debido a tu ubicación no puedes ver el contenido' in webpage: for message in [
self.raise_geo_restricted() 'Debido a tu ubicación no puedes ver el contenido',
'You are not allowed to watch this video: Geo Fencing Restriction'
]:
if message in webpage:
self.raise_geo_restricted()
player_config = self._search_json(r'window\.MDSTRM\.OPTIONS\s*=', webpage, 'metadata', video_id) player_config = self._search_json(r'window\.MDSTRM\.OPTIONS\s*=', webpage, 'metadata', video_id)
@ -20,7 +20,7 @@ class MixcloudBaseIE(InfoExtractor):
def _call_api(self, object_type, object_fields, display_id, username, slug=None): def _call_api(self, object_type, object_fields, display_id, username, slug=None):
lookup_key = object_type + 'Lookup' lookup_key = object_type + 'Lookup'
return self._download_json( return self._download_json(
'https://www.mixcloud.com/graphql', display_id, query={ 'https://app.mixcloud.com/graphql', display_id, query={
'query': '''{ 'query': '''{
%s(lookup: {username: "%s"%s}) { %s(lookup: {username: "%s"%s}) {
%s %s
@ -46,7 +46,15 @@ class MixcloudIE(MixcloudBaseIE):
'view_count': int, 'view_count': int,
'timestamp': 1321359578, 'timestamp': 1321359578,
'upload_date': '20111115', 'upload_date': '20111115',
'uploader_url': 'https://www.mixcloud.com/dholbach/',
'artist': 'Submorphics & Chino , Telekinesis, Porter Robinson, Enei, Breakage ft Jess Mills',
'duration': 3723,
'tags': [],
'comment_count': int,
'repost_count': int,
'like_count': int,
}, },
'params': {'skip_download': 'm3u8'},
}, { }, {
'url': 'http://www.mixcloud.com/gillespeterson/caribou-7-inch-vinyl-mix-chat/', 'url': 'http://www.mixcloud.com/gillespeterson/caribou-7-inch-vinyl-mix-chat/',
'info_dict': { 'info_dict': {
@ -60,7 +68,14 @@ class MixcloudIE(MixcloudBaseIE):
'view_count': int, 'view_count': int,
'timestamp': 1422987057, 'timestamp': 1422987057,
'upload_date': '20150203', 'upload_date': '20150203',
'uploader_url': 'https://www.mixcloud.com/gillespeterson/',
'duration': 2992,
'tags': [],
'comment_count': int,
'repost_count': int,
'like_count': int,
}, },
'params': {'skip_download': '404 playback error on site'},
}, { }, {
'url': 'https://beta.mixcloud.com/RedLightRadio/nosedrip-15-red-light-radio-01-18-2016/', 'url': 'https://beta.mixcloud.com/RedLightRadio/nosedrip-15-red-light-radio-01-18-2016/',
'only_matching': True, 'only_matching': True,
@ -259,9 +274,9 @@ def _real_extract(self, url):
cloudcast_url = cloudcast.get('url') cloudcast_url = cloudcast.get('url')
if not cloudcast_url: if not cloudcast_url:
continue continue
slug = try_get(cloudcast, lambda x: x['slug'], compat_str) item_slug = try_get(cloudcast, lambda x: x['slug'], compat_str)
owner_username = try_get(cloudcast, lambda x: x['owner']['username'], compat_str) owner_username = try_get(cloudcast, lambda x: x['owner']['username'], compat_str)
video_id = '%s_%s' % (owner_username, slug) if slug and owner_username else None video_id = f'{owner_username}_{item_slug}' if item_slug and owner_username else None
entries.append(self.url_result( entries.append(self.url_result(
cloudcast_url, MixcloudIE.ie_key(), video_id)) cloudcast_url, MixcloudIE.ie_key(), video_id))
@ -284,7 +299,7 @@ class MixcloudUserIE(MixcloudPlaylistBaseIE):
'info_dict': { 'info_dict': {
'id': 'dholbach_uploads', 'id': 'dholbach_uploads',
'title': 'Daniel Holbach (uploads)', 'title': 'Daniel Holbach (uploads)',
'description': 'md5:b60d776f0bab534c5dabe0a34e47a789', 'description': 'md5:a3f468a60ac8c3e1f8616380fc469b2b',
}, },
'playlist_mincount': 36, 'playlist_mincount': 36,
}, { }, {
@ -292,7 +307,7 @@ class MixcloudUserIE(MixcloudPlaylistBaseIE):
'info_dict': { 'info_dict': {
'id': 'dholbach_uploads', 'id': 'dholbach_uploads',
'title': 'Daniel Holbach (uploads)', 'title': 'Daniel Holbach (uploads)',
'description': 'md5:b60d776f0bab534c5dabe0a34e47a789', 'description': 'md5:a3f468a60ac8c3e1f8616380fc469b2b',
}, },
'playlist_mincount': 36, 'playlist_mincount': 36,
}, { }, {
@ -300,7 +315,7 @@ class MixcloudUserIE(MixcloudPlaylistBaseIE):
'info_dict': { 'info_dict': {
'id': 'dholbach_favorites', 'id': 'dholbach_favorites',
'title': 'Daniel Holbach (favorites)', 'title': 'Daniel Holbach (favorites)',
'description': 'md5:b60d776f0bab534c5dabe0a34e47a789', 'description': 'md5:a3f468a60ac8c3e1f8616380fc469b2b',
}, },
# 'params': { # 'params': {
# 'playlist_items': '1-100', # 'playlist_items': '1-100',
@ -323,9 +338,9 @@ class MixcloudUserIE(MixcloudPlaylistBaseIE):
'info_dict': { 'info_dict': {
'id': 'FirstEar_stream', 'id': 'FirstEar_stream',
'title': 'First Ear (stream)', 'title': 'First Ear (stream)',
'description': 'Curators of good music\r\n\r\nfirstearmusic.com', 'description': 'we maraud for ears',
}, },
'playlist_mincount': 271, 'playlist_mincount': 269,
}] }]
_TITLE_KEY = 'displayName' _TITLE_KEY = 'displayName'
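
`_call_api` builds the GraphQL document by string interpolation and sends it as a GET query parameter; the change above only moves the endpoint from www to app.mixcloud.com. A rough standalone equivalent, assuming the endpoint tolerates a plain unauthenticated GET (the extractor routes this through yt-dlp's downloader, so treat it as a sketch):

```python
import json
import urllib.parse
import urllib.request

def mixcloud_lookup(object_type, object_fields, username, slug=None):
    # lookup_key is object_type + 'Lookup', as in MixcloudBaseIE._call_api
    lookup = f'username: "{username}"' + (f', slug: "{slug}"' if slug else '')
    document = '{ %sLookup(lookup: {%s}) { %s } }' % (object_type, lookup, object_fields)
    url = 'https://app.mixcloud.com/graphql?' + urllib.parse.urlencode({'query': document})
    with urllib.request.urlopen(url) as resp:
        return json.load(resp)

# e.g. mixcloud_lookup('cloudcast', 'name audioLength', 'dholbach', 'some-slug')
```
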
@ -151,7 +151,7 @@ def _real_extract(self, url):
'd': 'days', 'd': 'days',
} }
kwargs = {_AGO_UNITS.get(uploaded_ago[-1]): delta} kwargs = {_AGO_UNITS.get(uploaded_ago[-1]): delta}
upload_date = (datetime.datetime.utcnow() - datetime.timedelta(**kwargs)).strftime('%Y%m%d') upload_date = (datetime.datetime.now(datetime.timezone.utc) - datetime.timedelta(**kwargs)).strftime('%Y%m%d')
comment_count = len(re.findall(r'''class\s*=\s*['"]media-comment-contents\b''', webpage)) comment_count = len(re.findall(r'''class\s*=\s*['"]media-comment-contents\b''', webpage))
uploader_id = self._html_search_regex( uploader_id = self._html_search_regex(
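
This one-line change is part of a wider cleanup: `datetime.utcnow()` returns a naive datetime and is deprecated as of Python 3.12, while `datetime.now(timezone.utc)` returns an aware one with the same wall-clock value:

```python
import datetime

now = datetime.datetime.now(datetime.timezone.utc)  # aware, tzinfo=UTC
assert now.tzinfo is datetime.timezone.utc

# Same derived value as the old utcnow()-based code:
upload_date = (now - datetime.timedelta(days=3)).strftime('%Y%m%d')
```
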
@ -33,7 +33,7 @@ def _real_extract(self, url):
class N1InfoIIE(InfoExtractor): class N1InfoIIE(InfoExtractor):
IE_NAME = 'N1Info:article' IE_NAME = 'N1Info:article'
_VALID_URL = r'https?://(?:(?:(?:ba|rs|hr)\.)?n1info\.(?:com|si)|nova\.rs)/(?:[^/]+/){1,2}(?P<id>[^/]+)' _VALID_URL = r'https?://(?:(?:\w+\.)?n1info\.\w+|nova\.rs)/(?:[^/?#]+/){1,2}(?P<id>[^/?#]+)'
_TESTS = [{ _TESTS = [{
# Youtube embedded # Youtube embedded
'url': 'https://rs.n1info.com/sport-klub/tenis/kako-je-djokovic-propustio-istorijsku-priliku-video/', 'url': 'https://rs.n1info.com/sport-klub/tenis/kako-je-djokovic-propustio-istorijsku-priliku-video/',
@ -94,6 +94,16 @@ class N1InfoIIE(InfoExtractor):
'upload_date': '20211102', 'upload_date': '20211102',
'timestamp': 1635861677, 'timestamp': 1635861677,
}, },
}, {
'url': 'https://n1info.rs/vesti/cuta-biti-u-kosovskoj-mitrovici-znaci-da-te-docekaju-eksplozivnim-napravama/',
'info_dict': {
'id': '1332368',
'ext': 'mp4',
'title': 'Ćuta: Biti u Kosovskoj Mitrovici znači da te dočekaju eksplozivnim napravama',
'upload_date': '20230620',
'timestamp': 1687290536,
'thumbnail': 'https://cdn.brid.tv/live/partners/26827/snapshot/1332368_th_6492013a8356f_1687290170.jpg'
},
}, { }, {
'url': 'https://hr.n1info.com/vijesti/pravobraniteljica-o-ubojstvu-u-zagrebu-radi-se-o-doista-nezapamcenoj-situaciji/', 'url': 'https://hr.n1info.com/vijesti/pravobraniteljica-o-ubojstvu-u-zagrebu-radi-se-o-doista-nezapamcenoj-situaciji/',
'only_matching': True, 'only_matching': True,
@ -105,19 +115,35 @@ def _real_extract(self, url):
title = self._html_search_regex(r'<h1[^>]+>(.+?)</h1>', webpage, 'title') title = self._html_search_regex(r'<h1[^>]+>(.+?)</h1>', webpage, 'title')
timestamp = unified_timestamp(self._html_search_meta('article:published_time', webpage)) timestamp = unified_timestamp(self._html_search_meta('article:published_time', webpage))
plugin_data = self._html_search_meta('BridPlugin', webpage)
videos = re.findall(r'(?m)(<video[^>]+>)', webpage)
entries = [] entries = []
for video in videos: if plugin_data:
video_data = extract_attributes(video) site_id = self._html_search_regex(r'site:(\d+)', webpage, 'site id')
entries.append({ for video_data in re.findall(r'\$bp\("Brid_\d+", (.+)\);', webpage):
'_type': 'url_transparent', video_id = self._parse_json(video_data, title)['video']
'url': video_data.get('data-url'), entries.append({
'id': video_data.get('id'), 'id': video_id,
'title': title, 'title': title,
'thumbnail': video_data.get('data-thumbnail'), 'timestamp': timestamp,
'timestamp': timestamp, 'thumbnail': self._html_search_meta('thumbnailURL', webpage),
'ie_key': 'N1InfoAsset'}) 'formats': self._extract_m3u8_formats(
f'https://cdn-uc.brid.tv/live/partners/{site_id}/streaming/{video_id}/{video_id}.m3u8',
video_id, fatal=False),
})
else:
# Old player still present in older articles
videos = re.findall(r'(?m)(<video[^>]+>)', webpage)
for video in videos:
video_data = extract_attributes(video)
entries.append({
'_type': 'url_transparent',
'url': video_data.get('data-url'),
'id': video_data.get('id'),
'title': title,
'thumbnail': video_data.get('data-thumbnail'),
'timestamp': timestamp,
'ie_key': 'N1InfoAsset',
})
embedded_videos = re.findall(r'(<iframe[^>]+>)', webpage) embedded_videos = re.findall(r'(<iframe[^>]+>)', webpage)
for embedded_video in embedded_videos: for embedded_video in embedded_videos:
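
The new N1Info path pulls the partner id from a `site:<id>` marker and a video id from each `$bp(...)` player call, then derives the HLS manifest from the CDN template seen in the hunk above. A sketch against a trimmed, hypothetical page snippet:

```python
import json
import re

webpage = '''<meta name="BridPlugin" content="site:26827">
<script>$bp("Brid_19624", {"video": 1332368, "autoplay": true});</script>'''

site_id = re.search(r'site:(\d+)', webpage).group(1)
for player_args in re.findall(r'\$bp\("Brid_\d+", (.+)\);', webpage):
    video_id = json.loads(player_args)['video']
    print(f'https://cdn-uc.brid.tv/live/partners/{site_id}'
          f'/streaming/{video_id}/{video_id}.m3u8')
```
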
@ -21,7 +21,7 @@
class NaverBaseIE(InfoExtractor): class NaverBaseIE(InfoExtractor):
_CAPTION_EXT_RE = r'\.(?:ttml|vtt)' _CAPTION_EXT_RE = r'\.(?:ttml|vtt)'
@staticmethod # NB: Used in VLiveWebArchiveIE, WeverseIE @staticmethod # NB: Used in WeverseIE
def process_subtitles(vod_data, process_url): def process_subtitles(vod_data, process_url):
ret = {'subtitles': {}, 'automatic_captions': {}} ret = {'subtitles': {}, 'automatic_captions': {}}
for caption in traverse_obj(vod_data, ('captions', 'list', ...)): for caption in traverse_obj(vod_data, ('captions', 'list', ...)):
@ -265,6 +265,26 @@ class NitterIE(InfoExtractor):
'repost_count': int, 'repost_count': int,
'comment_count': int, 'comment_count': int,
} }
}, { # no OpenGraph title
'url': f'https://{current_instance}/LocalBateman/status/1678455464038735895#m',
'info_dict': {
'id': '1678455464038735895',
'ext': 'mp4',
'title': 'Your Typical Local Man - Local man, what did Romanians ever do to you?',
'description': 'Local man, what did Romanians ever do to you?',
'thumbnail': r're:^https?://.*\.jpg$',
'uploader': 'Your Typical Local Man',
'uploader_id': 'LocalBateman',
'uploader_url': f'https://{current_instance}/LocalBateman',
'upload_date': '20230710',
'timestamp': 1689009900,
'view_count': int,
'like_count': int,
'repost_count': int,
'comment_count': int,
},
'expected_warnings': ['Ignoring subtitle tracks found in the HLS manifest'],
'params': {'skip_download': 'm3u8'},
} }
] ]
@ -292,7 +312,7 @@ def _real_extract(self, url):
'ext': ext 'ext': ext
}] }]
title = description = self._og_search_description(full_webpage) or self._html_search_regex( title = description = self._og_search_description(full_webpage, default=None) or self._html_search_regex(
r'<div class="tweet-content[^>]+>([^<]+)</div>', webpage, 'title', fatal=False) r'<div class="tweet-content[^>]+>([^<]+)</div>', webpage, 'title', fatal=False)
uploader_id = self._html_search_regex( uploader_id = self._html_search_regex(
@ -6,7 +6,6 @@
determine_ext, determine_ext,
int_or_none, int_or_none,
js_to_json, js_to_json,
qualities,
traverse_obj, traverse_obj,
unified_strdate, unified_strdate,
url_or_none, url_or_none,
@ -49,77 +48,52 @@ def _real_extract(self, url):
duration = None duration = None
formats = [] formats = []
player = self._parse_json( def process_format_list(format_list, format_id=""):
self._search_regex( nonlocal formats, has_drm
(r'(?:(?:replacePlaceholders|processAdTagModifier).*?:\s*)?(?:replacePlaceholders|processAdTagModifier)\s*\(\s*(?P<json>{.*?})\s*\)(?:\s*\))?\s*,', if not isinstance(format_list, list):
r'Player\.init\s*\([^,]+,(?P<cndn>\s*\w+\s*\?)?\s*(?P<json>{(?(cndn).+?|.+)})\s*(?(cndn):|,\s*{.+?}\s*\)\s*;)'), format_list = [format_list]
webpage, 'player', default='{}', group='json'), video_id, fatal=False) for format_dict in format_list:
if player: if not isinstance(format_dict, dict):
for format_id, format_list in player['tracks'].items(): continue
if not isinstance(format_list, list): if (not self.get_param('allow_unplayable_formats')
format_list = [format_list] and traverse_obj(format_dict, ('drm', 'keySystem'))):
for format_dict in format_list: has_drm = True
if not isinstance(format_dict, dict): continue
continue format_url = url_or_none(format_dict.get('src'))
if (not self.get_param('allow_unplayable_formats') format_type = format_dict.get('type')
and traverse_obj(format_dict, ('drm', 'keySystem'))): ext = determine_ext(format_url)
has_drm = True if (format_type == 'application/x-mpegURL'
continue or format_id == 'HLS' or ext == 'm3u8'):
format_url = url_or_none(format_dict.get('src')) formats.extend(self._extract_m3u8_formats(
format_type = format_dict.get('type') format_url, video_id, 'mp4',
ext = determine_ext(format_url) entry_protocol='m3u8_native', m3u8_id='hls',
if (format_type == 'application/x-mpegURL' fatal=False))
or format_id == 'HLS' or ext == 'm3u8'): elif (format_type == 'application/dash+xml'
formats.extend(self._extract_m3u8_formats( or format_id == 'DASH' or ext == 'mpd'):
format_url, video_id, 'mp4', formats.extend(self._extract_mpd_formats(
entry_protocol='m3u8_native', m3u8_id='hls', format_url, video_id, mpd_id='dash', fatal=False))
fatal=False)) else:
elif (format_type == 'application/dash+xml' formats.append({
or format_id == 'DASH' or ext == 'mpd'):
formats.extend(self._extract_mpd_formats(
format_url, video_id, mpd_id='dash', fatal=False))
else:
formats.append({
'url': format_url,
})
duration = int_or_none(player.get('duration'))
else:
# Old path, not actual as of 08.04.2020
bitrates = self._parse_json(
self._search_regex(
r'(?s)(?:src|bitrates)\s*=\s*({.+?})\s*;', webpage, 'formats'),
video_id, transform_source=js_to_json)
QUALITIES = ('lq', 'mq', 'hq', 'hd')
quality_key = qualities(QUALITIES)
for format_id, format_list in bitrates.items():
if not isinstance(format_list, list):
format_list = [format_list]
for format_url in format_list:
format_url = url_or_none(format_url)
if not format_url:
continue
if format_id == 'hls':
formats.extend(self._extract_m3u8_formats(
format_url, video_id, ext='mp4',
entry_protocol='m3u8_native', m3u8_id='hls',
fatal=False))
continue
f = {
'url': format_url, 'url': format_url,
} })
f_id = format_id
for quality in QUALITIES: player = self._search_json(
if '%s.mp4' % quality in format_url: r'player:', webpage, 'player', video_id, fatal=False, end_pattern=r';\s*</script>')
f_id += '-%s' % quality if player:
f.update({ for src in traverse_obj(player, ('lib', 'source', 'sources', ...)):
'quality': quality_key(quality), process_format_list(src)
'format_note': quality.upper(), duration = traverse_obj(player, ('sourceInfo', 'duration', {int_or_none}))
}) if not formats and not has_drm:
break # older code path, in use before August 2023
f['format_id'] = f_id player = self._parse_json(
formats.append(f) self._search_regex(
(r'(?:(?:replacePlaceholders|processAdTagModifier).*?:\s*)?(?:replacePlaceholders|processAdTagModifier)\s*\(\s*(?P<json>{.*?})\s*\)(?:\s*\))?\s*,',
r'Player\.init\s*\([^,]+,(?P<cndn>\s*\w+\s*\?)?\s*(?P<json>{(?(cndn).+?|.+)})\s*(?(cndn):|,\s*{.+?}\s*\)\s*;)'),
webpage, 'player', group='json'), video_id)
if player:
for format_id, format_list in player['tracks'].items():
process_format_list(format_list, format_id)
duration = int_or_none(player.get('duration'))
if not formats and has_drm: if not formats and has_drm:
self.report_drm(video_id) self.report_drm(video_id)
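
This refactor folds the duplicated per-format loop into a `process_format_list` closure shared by the new `player:` JSON path and the legacy fallback. `nonlocal` is only strictly needed to rebind `has_drm`; `formats` is merely mutated in place. The pattern reduced to its essentials:

```python
def extract_formats(sources, legacy_tracks):
    formats, has_drm = [], False

    def process_format_list(format_list, format_id=''):
        nonlocal has_drm  # rebinding a bool in the enclosing scope
        if not isinstance(format_list, list):
            format_list = [format_list]
        for fmt in format_list:
            if not isinstance(fmt, dict):
                continue
            if fmt.get('drm'):
                has_drm = True
                continue
            formats.append({'url': fmt['src'], 'format_id': format_id or None})

    for src in sources:              # preferred, newer code path
        process_format_list(src)
    if not formats and not has_drm:  # legacy fallback
        for format_id, format_list in legacy_tracks.items():
            process_format_list(format_list, format_id)
    return formats, has_drm

fmts, drm = extract_formats(
    [{'src': 'https://example.com/a.m3u8'}],
    {'HLS': [{'src': 'https://example.com/b.m3u8'}]})
assert len(fmts) == 1 and not drm
```
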
@ -1,7 +1,7 @@
import calendar import calendar
import json import json
import functools import functools
from datetime import datetime from datetime import datetime, timezone
from random import random from random import random
from .common import InfoExtractor from .common import InfoExtractor
@ -243,7 +243,7 @@ def _mark_watched(self, base_url, video_id, delivery_info):
invocation_id = delivery_info.get('InvocationId') invocation_id = delivery_info.get('InvocationId')
stream_id = traverse_obj(delivery_info, ('Delivery', 'Streams', ..., 'PublicID'), get_all=False, expected_type=str) stream_id = traverse_obj(delivery_info, ('Delivery', 'Streams', ..., 'PublicID'), get_all=False, expected_type=str)
if invocation_id and stream_id and duration: if invocation_id and stream_id and duration:
timestamp_str = f'/Date({calendar.timegm(datetime.utcnow().timetuple())}000)/' timestamp_str = f'/Date({calendar.timegm(datetime.now(timezone.utc).timetuple())}000)/'
data = { data = {
'streamRequests': [ 'streamRequests': [
{ {
yt_dlp/extractor/pornbox.py (new file, 113 lines)
@ -0,0 +1,113 @@
from .common import InfoExtractor
from ..compat import functools
from ..utils import (
int_or_none,
parse_duration,
parse_iso8601,
qualities,
str_or_none,
traverse_obj,
url_or_none,
)
class PornboxIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?pornbox\.com/application/watch-page/(?P<id>[0-9]+)'
_TESTS = [{
'url': 'https://pornbox.com/application/watch-page/212108',
'md5': '3ff6b6e206f263be4c5e987a3162ac6e',
'info_dict': {
'id': '212108',
'ext': 'mp4',
'title': 'md5:ececc5c6e6c9dd35d290c45fed05fd49',
'uploader': 'Lily Strong',
'timestamp': 1665871200,
'upload_date': '20221015',
'age_limit': 18,
'availability': 'needs_auth',
'duration': 1505,
'cast': ['Lily Strong', 'John Strong'],
'tags': 'count:11',
'description': 'md5:589c7f33e183aa8aa939537300efb859',
'thumbnail': r're:^https?://cdn-image\.gtflixtv\.com.*\.jpg.*$'
}
}, {
'url': 'https://pornbox.com/application/watch-page/216045',
'info_dict': {
'id': '216045',
'title': 'md5:3e48528e73a9a2b12f7a2772ed0b26a2',
'description': 'md5:3e631dcaac029f15ed434e402d1b06c7',
'uploader': 'VK Studio',
'timestamp': 1618264800,
'upload_date': '20210412',
'age_limit': 18,
'availability': 'premium_only',
'duration': 2710,
'cast': 'count:3',
'tags': 'count:29',
'thumbnail': r're:^https?://cdn-image\.gtflixtv\.com.*\.jpg.*$',
'subtitles': 'count:6'
},
'params': {
'skip_download': True,
'ignore_no_formats_error': True
},
'expected_warnings': [
'You are either not logged in or do not have access to this scene',
'No video formats found', 'Requested format is not available']
}]
def _real_extract(self, url):
video_id = self._match_id(url)
public_data = self._download_json(f'https://pornbox.com/contents/{video_id}', video_id)
subtitles = {country_code: [{
'url': f'https://pornbox.com/contents/{video_id}/subtitles/{country_code}',
'ext': 'srt'
}] for country_code in traverse_obj(public_data, ('subtitles', ..., {str}))}
is_free_scene = traverse_obj(
public_data, ('price', 'is_available_for_free', {bool}), default=False)
metadata = {
'id': video_id,
**traverse_obj(public_data, {
'title': ('scene_name', {str.strip}),
'description': ('small_description', {str.strip}),
'uploader': 'studio',
'duration': ('runtime', {parse_duration}),
'cast': (('models', 'male_models'), ..., 'model_name'),
'thumbnail': ('player_poster', {url_or_none}),
'tags': ('niches', ..., 'niche'),
}),
'age_limit': 18,
'timestamp': parse_iso8601(traverse_obj(
public_data, ('studios', 'release_date'), 'publish_date')),
'availability': self._availability(needs_auth=True, needs_premium=not is_free_scene),
'subtitles': subtitles,
}
if not public_data.get('is_purchased') and not is_free_scene:
self.raise_login_required(
'You are either not logged in or do not have access to this scene', metadata_available=True)
return metadata
media_id = traverse_obj(public_data, (
'medias', lambda _, v: v['title'] == 'Full video', 'media_id', {int}), get_all=False)
if not media_id:
self.raise_no_formats('Could not find stream id', video_id=video_id)
stream_data = self._download_json(
f'https://pornbox.com/media/{media_id}/stream', video_id=video_id, note='Getting manifest urls')
get_quality = qualities(['web', 'vga', 'hd', '1080p', '4k', '8k'])
metadata['formats'] = traverse_obj(stream_data, ('qualities', lambda _, v: v['src'], {
'url': 'src',
'vbr': ('bitrate', {functools.partial(int_or_none, scale=1000)}),
'format_id': ('quality', {str_or_none}),
'quality': ('quality', {get_quality}),
'width': ('size', {lambda x: int(x[:-1])}),
}))
return metadata
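
The format list above is produced entirely by `traverse_obj`'s dict-mapping form: the lambda keeps only entries with a `src`, then each output key names a path plus an optional `{transform}`. A self-contained run with invented stream data:

```python
import functools

from yt_dlp.utils import int_or_none, qualities, str_or_none, traverse_obj

stream_data = {'qualities': [  # hypothetical API payload
    {'src': 'https://cdn.example.com/hd.mp4', 'bitrate': 4000000,
     'quality': 'hd', 'size': '1280p'},
    {'src': 'https://cdn.example.com/4k.mp4', 'bitrate': 16000000,
     'quality': '4k', 'size': '3840p'},
    {'quality': 'web'},  # no src -> filtered out by the lambda
]}

get_quality = qualities(['web', 'vga', 'hd', '1080p', '4k', '8k'])
formats = traverse_obj(stream_data, ('qualities', lambda _, v: v['src'], {
    'url': 'src',
    'vbr': ('bitrate', {functools.partial(int_or_none, scale=1000)}),
    'format_id': ('quality', {str_or_none}),
    'quality': ('quality', {get_quality}),
    'width': ('size', {lambda x: int(x[:-1])}),
}))

assert formats[0]['vbr'] == 4000                      # bps scaled to kbps
assert formats[1]['quality'] > formats[0]['quality']  # '4k' ranks above 'hd'
```
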
@ -1,97 +1,155 @@
import re import json
from datetime import date
from urllib.parse import unquote
from .common import InfoExtractor from .common import InfoExtractor
from ..utils import merge_dicts from ..compat import functools
from ..utils import ExtractorError, make_archive_id, urljoin
from ..utils.traversal import traverse_obj
class Pr0grammStaticIE(InfoExtractor):
# Possible urls:
# https://pr0gramm.com/static/5466437
_VALID_URL = r'https?://pr0gramm\.com/static/(?P<id>[0-9]+)'
_TEST = {
'url': 'https://pr0gramm.com/static/5466437',
'md5': '52fa540d70d3edc286846f8ca85938aa',
'info_dict': {
'id': '5466437',
'ext': 'mp4',
'title': 'pr0gramm-5466437 by g11st',
'uploader': 'g11st',
'upload_date': '20221221',
}
}
def _real_extract(self, url):
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
# Fetch media sources
entries = self._parse_html5_media_entries(url, webpage, video_id)
media_info = entries[0]
# Fetch author
uploader = self._html_search_regex(r'by\W+([\w-]+)\W+', webpage, 'uploader')
# Fetch approx upload timestamp from filename
# Have None-defaults in case the extraction fails
uploadDay = None
uploadMon = None
uploadYear = None
uploadTimestr = None
# (//img.pr0gramm.com/2022/12/21/62ae8aa5e2da0ebf.mp4)
m = re.search(r'//img\.pr0gramm\.com/(?P<year>[\d]+)/(?P<mon>[\d]+)/(?P<day>[\d]+)/\w+\.\w{,4}', webpage)
if (m):
# Up to a day of accuracy should suffice...
uploadDay = m.groupdict().get('day')
uploadMon = m.groupdict().get('mon')
uploadYear = m.groupdict().get('year')
uploadTimestr = uploadYear + uploadMon + uploadDay
return merge_dicts({
'id': video_id,
'title': 'pr0gramm-%s%s' % (video_id, (' by ' + uploader) if uploader else ''),
'uploader': uploader,
'upload_date': uploadTimestr
}, media_info)
# This extractor is for the primary url (used for sharing, and appears in the
# location bar) Since this page loads the DOM via JS, yt-dl can't find any
# video information here. So let's redirect to a compatibility version of
# the site, which does contain the <video>-element by itself, without requiring
# js to be ran.
class Pr0grammIE(InfoExtractor): class Pr0grammIE(InfoExtractor):
# Possible urls: _VALID_URL = r'https?://pr0gramm\.com\/(?:[^/?#]+/)+(?P<id>[\d]+)(?:[/?#:]|$)'
# https://pr0gramm.com/new/546637 _TESTS = [{
# https://pr0gramm.com/new/video/546637 # Tags require account
# https://pr0gramm.com/top/546637
# https://pr0gramm.com/top/video/546637
# https://pr0gramm.com/user/g11st/uploads/5466437
# https://pr0gramm.com/user/froschler/dafur-ist-man-hier/5091290
# https://pr0gramm.com/user/froschler/reinziehen-1elf/5232030
# https://pr0gramm.com/user/froschler/1elf/5232030
# https://pr0gramm.com/new/5495710:comment62621020 <- this is not the id!
# https://pr0gramm.com/top/fruher war alles damals/5498175
_VALID_URL = r'https?:\/\/pr0gramm\.com\/(?!static/\d+).+?\/(?P<id>[\d]+)(:|$)'
_TEST = {
'url': 'https://pr0gramm.com/new/video/5466437', 'url': 'https://pr0gramm.com/new/video/5466437',
'info_dict': { 'info_dict': {
'id': '5466437', 'id': '5466437',
'ext': 'mp4', 'ext': 'mp4',
'title': 'pr0gramm-5466437 by g11st', 'title': 'pr0gramm-5466437 by g11st',
'tags': ['Neon Genesis Evangelion', 'Touhou Project', 'Fly me to the Moon', 'Marisad', 'Marisa Kirisame', 'video', 'sound', 'Marisa', 'Anime'],
'uploader': 'g11st', 'uploader': 'g11st',
'uploader_id': 394718,
'upload_timestamp': 1671590240,
'upload_date': '20221221', 'upload_date': '20221221',
} 'like_count': int,
} 'dislike_count': int,
'age_limit': 0,
'thumbnail': r're:^https://thumb\.pr0gramm\.com/.*\.jpg',
},
}, {
# Tags require account
'url': 'https://pr0gramm.com/new/3052805:comment28391322',
'info_dict': {
'id': '3052805',
'ext': 'mp4',
'title': 'pr0gramm-3052805 by Hansking1',
'tags': 'count:15',
'uploader': 'Hansking1',
'uploader_id': 385563,
'upload_timestamp': 1552930408,
'upload_date': '20190318',
'like_count': int,
'dislike_count': int,
'age_limit': 0,
'thumbnail': r're:^https://thumb\.pr0gramm\.com/.*\.jpg',
},
}, {
# Requires verified account
'url': 'https://pr0gramm.com/new/Gianna%20Michaels/5848332',
'info_dict': {
'id': '5848332',
'ext': 'mp4',
'title': 'pr0gramm-5848332 by erd0pfel',
'tags': 'count:18',
'uploader': 'erd0pfel',
'uploader_id': 349094,
'upload_timestamp': 1694489652,
'upload_date': '20230912',
'like_count': int,
'dislike_count': int,
'age_limit': 18,
'thumbnail': r're:^https://thumb\.pr0gramm\.com/.*\.jpg',
},
}, {
'url': 'https://pr0gramm.com/static/5466437',
'only_matching': True,
}, {
'url': 'https://pr0gramm.com/new/rowan%20atkinson%20herr%20bohne/3052805',
'only_matching': True,
}, {
'url': 'https://pr0gramm.com/user/froschler/dafur-ist-man-hier/5091290',
'only_matching': True,
}]
def _generic_title(): BASE_URL = 'https://pr0gramm.com'
return "oof"
@functools.cached_property
def _is_logged_in(self):
return 'pp' in self._get_cookies(self.BASE_URL)
@functools.cached_property
def _maximum_flags(self):
# We need to guess the flags for the content otherwise the api will raise an error
# We can guess the maximum allowed flags for the account from the cookies
# Bitflags are (msbf): nsfp, nsfl, nsfw, sfw
flags = 0b0001
if self._is_logged_in:
flags |= 0b1000
cookies = self._get_cookies(self.BASE_URL)
if 'me' not in cookies:
self._download_webpage(self.BASE_URL, None, 'Refreshing verification information')
if traverse_obj(cookies, ('me', {lambda x: x.value}, {unquote}, {json.loads}, 'verified')):
flags |= 0b0110
return flags
def _call_api(self, endpoint, video_id, query={}, note='Downloading API json'):
data = self._download_json(
f'https://pr0gramm.com/api/items/{endpoint}',
video_id, note, query=query, expected_status=403)
error = traverse_obj(data, ('error', {str}))
if error in ('nsfwRequired', 'nsflRequired', 'nsfpRequired', 'verificationRequired'):
if not self._is_logged_in:
self.raise_login_required()
raise ExtractorError(f'Unverified account cannot access NSFW/NSFL ({error})', expected=True)
elif error:
message = traverse_obj(data, ('msg', {str})) or error
raise ExtractorError(f'API returned error: {message}', expected=True)
return data
def _real_extract(self, url): def _real_extract(self, url):
video_id = self._match_id(url) video_id = self._match_id(url)
video_info = traverse_obj(
self._call_api('get', video_id, {'id': video_id, 'flags': self._maximum_flags}),
('items', 0, {dict}))
return self.url_result( source = urljoin('https://img.pr0gramm.com', video_info.get('image'))
'https://pr0gramm.com/static/' + video_id, if not source or not source.endswith('mp4'):
video_id=video_id, self.raise_no_formats('Could not extract a video', expected=bool(source), video_id=video_id)
ie=Pr0grammStaticIE.ie_key())
tags = None
if self._is_logged_in:
metadata = self._call_api('info', video_id, {'itemId': video_id})
tags = traverse_obj(metadata, ('tags', ..., 'tag', {str}))
# Sorted by "confidence", higher confidence = earlier in list
confidences = traverse_obj(metadata, ('tags', ..., 'confidence', ({int}, {float})))
if confidences:
tags = [tag for _, tag in sorted(zip(confidences, tags), reverse=True)]
return {
'id': video_id,
'title': f'pr0gramm-{video_id} by {video_info.get("user")}',
'formats': [{
'url': source,
'ext': 'mp4',
**traverse_obj(video_info, {
'width': ('width', {int}),
'height': ('height', {int}),
}),
}],
'tags': tags,
'age_limit': 18 if traverse_obj(video_info, ('flags', {0b110.__and__})) else 0,
'_old_archive_ids': [make_archive_id('Pr0grammStatic', video_id)],
**traverse_obj(video_info, {
'uploader': ('user', {str}),
'uploader_id': ('userId', {int}),
'like_count': ('up', {int}),
'dislike_count': ('down', {int}),
'upload_timestamp': ('created', {int}),
'upload_date': ('created', {int}, {date.fromtimestamp}, {lambda x: x.strftime('%Y%m%d')}),
'thumbnail': ('thumb', {lambda x: urljoin('https://thumb.pr0gramm.com', x)})
}),
}
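
pr0gramm encodes content ratings as a 4-bit mask (msb-first: nsfp, nsfl, nsfw, sfw), and `_maximum_flags` guesses the widest mask the current session may request from its cookies. The flag arithmetic in isolation, with the cookie checks replaced by plain booleans:

```python
SFW, NSFW, NSFL, NSFP = 0b0001, 0b0010, 0b0100, 0b1000

def maximum_flags(logged_in, verified):
    flags = SFW  # anonymous sessions may only request SFW items
    if logged_in:
        flags |= NSFP
        if verified:
            flags |= NSFW | NSFL
    return flags

assert maximum_flags(False, False) == 0b0001
assert maximum_flags(True, False) == 0b1001
assert maximum_flags(True, True) == 0b1111
# The same mask drives the age_limit check: flags & (NSFW | NSFL), i.e. 0b110
assert bool(0b0110 & (NSFW | NSFL))
```
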
@ -1,7 +1,18 @@
import itertools
import re import re
import urllib.parse
from .common import InfoExtractor from .common import InfoExtractor
from ..utils import parse_duration, unified_strdate from ..utils import (
int_or_none,
join_nonempty,
js_to_json,
parse_duration,
strftime_or_none,
traverse_obj,
unified_strdate,
urljoin,
)
class RadioFranceIE(InfoExtractor): class RadioFranceIE(InfoExtractor):
@ -56,8 +67,32 @@ def _real_extract(self, url):
} }
class FranceCultureIE(InfoExtractor): class RadioFranceBaseIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?radiofrance\.fr/(?:franceculture|fip|francemusique|mouv|franceinter)/podcasts/(?:[^?#]+/)?(?P<display_id>[^?#]+)-(?P<id>\d+)($|[?#])' _VALID_URL_BASE = r'https?://(?:www\.)?radiofrance\.fr'
_STATIONS_RE = '|'.join(map(re.escape, (
'franceculture',
'franceinfo',
'franceinter',
'francemusique',
'fip',
'mouv',
)))
def _extract_data_from_webpage(self, webpage, display_id, key):
return traverse_obj(self._search_json(
r'\bconst\s+data\s*=', webpage, key, display_id,
contains_pattern=r'(\[\{.*?\}\]);', transform_source=js_to_json),
(..., 'data', key, {dict}), get_all=False) or {}
class FranceCultureIE(RadioFranceBaseIE):
_VALID_URL = rf'''(?x)
{RadioFranceBaseIE._VALID_URL_BASE}
/(?:{RadioFranceBaseIE._STATIONS_RE})
/podcasts/(?:[^?#]+/)?(?P<display_id>[^?#]+)-(?P<id>\d{{6,}})(?:$|[?#])
'''
_TESTS = [ _TESTS = [
{ {
'url': 'https://www.radiofrance.fr/franceculture/podcasts/science-en-questions/la-physique-d-einstein-aiderait-elle-a-comprendre-le-cerveau-8440487', 'url': 'https://www.radiofrance.fr/franceculture/podcasts/science-en-questions/la-physique-d-einstein-aiderait-elle-a-comprendre-le-cerveau-8440487',
@ -67,14 +102,30 @@ class FranceCultureIE(InfoExtractor):
'ext': 'mp3', 'ext': 'mp3',
'title': 'La physique dEinstein aiderait-elle à comprendre le cerveau ?', 'title': 'La physique dEinstein aiderait-elle à comprendre le cerveau ?',
'description': 'Existerait-il un pont conceptuel entre la physique de lespace-temps et les neurosciences ?', 'description': 'Existerait-il un pont conceptuel entre la physique de lespace-temps et les neurosciences ?',
'thumbnail': 'https://cdn.radiofrance.fr/s3/cruiser-production/2022/05/d184e7a3-4827-4494-bf94-04ed7b120db4/1200x630_gettyimages-200171095-001.jpg', 'thumbnail': r're:^https?://.*\.(?:jpg|png)',
'upload_date': '20220514', 'upload_date': '20220514',
'duration': 2750, 'duration': 2750,
}, },
}, },
{
'url': 'https://www.radiofrance.fr/franceinter/podcasts/le-7-9-30/le-7-9-30-du-vendredi-10-mars-2023-2107675',
'info_dict': {
'id': '2107675',
'display_id': 'le-7-9-30-du-vendredi-10-mars-2023',
'title': 'Inflation alimentaire : comment en sortir ? - Régis Debray et Claude Grange - Cybèle Idelot',
'description': 'md5:36ee74351ede77a314fdebb94026b916',
'thumbnail': r're:^https?://.*\.(?:jpg|png)',
'upload_date': '20230310',
'duration': 8977,
'ext': 'mp3',
},
},
{ {
'url': 'https://www.radiofrance.fr/franceinter/podcasts/la-rafle-du-vel-d-hiv-une-affaire-d-etat/les-racines-du-crime-episode-1-3715507', 'url': 'https://www.radiofrance.fr/franceinter/podcasts/la-rafle-du-vel-d-hiv-une-affaire-d-etat/les-racines-du-crime-episode-1-3715507',
'only_matching': True, 'only_matching': True,
}, {
'url': 'https://www.radiofrance.fr/franceinfo/podcasts/le-billet-sciences/sante-bientot-un-vaccin-contre-l-asthme-allergique-3057200',
'only_matching': True,
} }
] ]
@ -89,7 +140,6 @@ def _real_extract(self, url):
'id': video_id, 'id': video_id,
'display_id': display_id, 'display_id': display_id,
'url': video_data['contentUrl'], 'url': video_data['contentUrl'],
'ext': video_data.get('encodingFormat'),
'vcodec': 'none' if video_data.get('encodingFormat') == 'mp3' else None, 'vcodec': 'none' if video_data.get('encodingFormat') == 'mp3' else None,
'duration': parse_duration(video_data.get('duration')), 'duration': parse_duration(video_data.get('duration')),
'title': self._html_search_regex(r'(?s)<h1[^>]*itemprop="[^"]*name[^"]*"[^>]*>(.+?)</h1>', 'title': self._html_search_regex(r'(?s)<h1[^>]*itemprop="[^"]*name[^"]*"[^>]*>(.+?)</h1>',
@ -102,3 +152,322 @@ def _real_extract(self, url):
'upload_date': unified_strdate(self._search_regex( 'upload_date': unified_strdate(self._search_regex(
r'"datePublished"\s*:\s*"([^"]+)', webpage, 'timestamp', fatal=False)) r'"datePublished"\s*:\s*"([^"]+)', webpage, 'timestamp', fatal=False))
} }
class RadioFranceLiveIE(RadioFranceBaseIE):
_VALID_URL = rf'''(?x)
https?://(?:www\.)?radiofrance\.fr
/(?P<id>{RadioFranceBaseIE._STATIONS_RE})
/?(?P<substation_id>radio-[\w-]+)?(?:[#?]|$)
'''
_TESTS = [{
'url': 'https://www.radiofrance.fr/franceinter/',
'info_dict': {
'id': 'franceinter',
'title': str,
'live_status': 'is_live',
'ext': 'aac',
},
'params': {
'skip_download': 'Livestream',
},
}, {
'url': 'https://www.radiofrance.fr/franceculture',
'info_dict': {
'id': 'franceculture',
'title': str,
'live_status': 'is_live',
'ext': 'aac',
},
'params': {
'skip_download': 'Livestream',
},
}, {
'url': 'https://www.radiofrance.fr/mouv/radio-musique-kids-family',
'info_dict': {
'id': 'mouv-radio-musique-kids-family',
'title': str,
'live_status': 'is_live',
'ext': 'aac',
},
'params': {
'skip_download': 'Livestream',
},
}, {
'url': 'https://www.radiofrance.fr/mouv/radio-rnb-soul',
'info_dict': {
'id': 'mouv-radio-rnb-soul',
'title': str,
'live_status': 'is_live',
'ext': 'aac',
},
'params': {
'skip_download': 'Livestream',
},
}, {
'url': 'https://www.radiofrance.fr/mouv/radio-musique-mix',
'info_dict': {
'id': 'mouv-radio-musique-mix',
'title': str,
'live_status': 'is_live',
'ext': 'aac',
},
'params': {
'skip_download': 'Livestream',
},
}, {
'url': 'https://www.radiofrance.fr/fip/radio-rock',
'info_dict': {
'id': 'fip-radio-rock',
'title': str,
'live_status': 'is_live',
'ext': 'aac',
},
'params': {
'skip_download': 'Livestream',
},
}, {
'url': 'https://www.radiofrance.fr/mouv',
'only_matching': True,
}]
def _real_extract(self, url):
station_id, substation_id = self._match_valid_url(url).group('id', 'substation_id')
if substation_id:
webpage = self._download_webpage(url, station_id)
api_response = self._extract_data_from_webpage(webpage, station_id, 'webRadioData')
else:
api_response = self._download_json(
f'https://www.radiofrance.fr/{station_id}/api/live', station_id)
formats, subtitles = [], {}
for media_source in traverse_obj(api_response, (('now', None), 'media', 'sources', lambda _, v: v['url'])):
if media_source.get('format') == 'hls':
fmts, subs = self._extract_m3u8_formats_and_subtitles(media_source['url'], station_id, fatal=False)
formats.extend(fmts)
self._merge_subtitles(subs, target=subtitles)
else:
formats.append({
'url': media_source['url'],
'abr': media_source.get('bitrate'),
})
return {
'id': join_nonempty(station_id, substation_id),
'title': traverse_obj(api_response, ('visual', 'legend')) or join_nonempty(
('now', 'firstLine', 'title'), ('now', 'secondLine', 'title'), from_dict=api_response, delim=' - '),
'formats': formats,
'subtitles': subtitles,
'is_live': True,
}
class RadioFrancePlaylistBase(RadioFranceBaseIE):
"""Subclasses must set _METADATA_KEY"""
def _call_api(self, content_id, cursor, page_num):
raise NotImplementedError('This method must be implemented by subclasses')
def _generate_playlist_entries(self, content_id, content_response):
for page_num in itertools.count(2):
for entry in content_response['items']:
yield self.url_result(
f'https://www.radiofrance.fr/{entry["path"]}', url_transparent=True, **traverse_obj(entry, {
'title': 'title',
'description': 'standFirst',
'timestamp': ('publishedDate', {int_or_none}),
'thumbnail': ('visual', 'src'),
}))
next_cursor = traverse_obj(content_response, (('pagination', None), 'next'), get_all=False)
if not next_cursor:
break
content_response = self._call_api(content_id, next_cursor, page_num)
def _real_extract(self, url):
display_id = self._match_id(url)
metadata = self._download_json(
'https://www.radiofrance.fr/api/v2.1/path', display_id,
query={'value': urllib.parse.urlparse(url).path})['content']
content_id = metadata['id']
return self.playlist_result(
self._generate_playlist_entries(content_id, metadata[self._METADATA_KEY]), content_id,
display_id=display_id, **{**traverse_obj(metadata, {
'title': 'title',
'description': 'standFirst',
'thumbnail': ('visual', 'src'),
}), **traverse_obj(metadata, {
'title': 'name',
'description': 'role',
})})
class RadioFrancePodcastIE(RadioFrancePlaylistBase):
_VALID_URL = rf'''(?x)
{RadioFranceBaseIE._VALID_URL_BASE}
/(?:{RadioFranceBaseIE._STATIONS_RE})
/podcasts/(?P<id>[\w-]+)/?(?:[?#]|$)
'''
_TESTS = [{
'url': 'https://www.radiofrance.fr/franceinfo/podcasts/le-billet-vert',
'info_dict': {
'id': 'eaf6ef81-a980-4f1c-a7d1-8a75ecd54b17',
'display_id': 'le-billet-vert',
'title': 'Le billet sciences',
'description': 'md5:eb1007b34b0c0a680daaa71525bbd4c1',
'thumbnail': r're:^https?://.*\.(?:jpg|png)',
},
'playlist_mincount': 11,
}, {
'url': 'https://www.radiofrance.fr/franceinter/podcasts/jean-marie-le-pen-l-obsession-nationale',
'info_dict': {
'id': '566fd524-3074-4fbc-ac69-8696f2152a54',
'display_id': 'jean-marie-le-pen-l-obsession-nationale',
'title': 'Jean-Marie Le Pen, l\'obsession nationale',
'description': 'md5:a07c0cfb894f6d07a62d0ad12c4b7d73',
'thumbnail': r're:^https?://.*\.(?:jpg|png)',
},
'playlist_count': 7,
}, {
'url': 'https://www.radiofrance.fr/franceculture/podcasts/serie-thomas-grjebine',
'info_dict': {
'id': '63c1ddc9-9f15-457a-98b2-411bac63f48d',
'display_id': 'serie-thomas-grjebine',
'title': 'Thomas Grjebine',
},
'playlist_count': 1,
}, {
'url': 'https://www.radiofrance.fr/fip/podcasts/certains-l-aiment-fip',
'info_dict': {
'id': '143dff38-e956-4a5d-8576-1c0b7242b99e',
'display_id': 'certains-l-aiment-fip',
'title': 'Certains laiment Fip',
'description': 'md5:ff974672ba00d4fd5be80fb001c5b27e',
'thumbnail': r're:^https?://.*\.(?:jpg|png)',
},
'playlist_mincount': 321,
}, {
'url': 'https://www.radiofrance.fr/franceinter/podcasts/le-7-9',
'only_matching': True,
}, {
'url': 'https://www.radiofrance.fr/mouv/podcasts/dirty-mix',
'only_matching': True,
}]
_METADATA_KEY = 'expressions'
def _call_api(self, podcast_id, cursor, page_num):
return self._download_json(
f'https://www.radiofrance.fr/api/v2.1/concepts/{podcast_id}/expressions', podcast_id,
note=f'Downloading page {page_num}', query={'pageCursor': cursor})
class RadioFranceProfileIE(RadioFrancePlaylistBase):
_VALID_URL = rf'{RadioFranceBaseIE._VALID_URL_BASE}/personnes/(?P<id>[\w-]+)'
_TESTS = [{
'url': 'https://www.radiofrance.fr/personnes/thomas-pesquet?p=3',
'info_dict': {
'id': '86c62790-e481-11e2-9f7b-782bcb6744eb',
'display_id': 'thomas-pesquet',
'title': 'Thomas Pesquet',
'description': 'Astronaute à l\'agence spatiale européenne',
},
'playlist_mincount': 212,
}, {
'url': 'https://www.radiofrance.fr/personnes/eugenie-bastie',
'info_dict': {
'id': '9593050b-0183-4972-a0b5-d8f699079e02',
'display_id': 'eugenie-bastie',
'title': 'Eugénie Bastié',
'description': 'Journaliste et essayiste',
'thumbnail': r're:^https?://.*\.(?:jpg|png)',
},
'playlist_mincount': 39,
}, {
'url': 'https://www.radiofrance.fr/personnes/lea-salame',
'only_matching': True,
}]
_METADATA_KEY = 'documents'
def _call_api(self, profile_id, cursor, page_num):
resp = self._download_json(
f'https://www.radiofrance.fr/api/v2.1/taxonomy/{profile_id}/documents', profile_id,
note=f'Downloading page {page_num}', query={
'relation': 'personality',
'cursor': cursor,
})
resp['next'] = traverse_obj(resp, ('pagination', 'next'))
return resp
class RadioFranceProgramScheduleIE(RadioFranceBaseIE):
_VALID_URL = rf'''(?x)
{RadioFranceBaseIE._VALID_URL_BASE}
/(?P<station>{RadioFranceBaseIE._STATIONS_RE})
/grille-programmes(?:\?date=(?P<date>[\d-]+))?
'''
_TESTS = [{
'url': 'https://www.radiofrance.fr/franceinter/grille-programmes?date=17-02-2023',
'info_dict': {
'id': 'franceinter-program-20230217',
'upload_date': '20230217',
},
'playlist_count': 25,
}, {
'url': 'https://www.radiofrance.fr/franceculture/grille-programmes?date=01-02-2023',
'info_dict': {
'id': 'franceculture-program-20230201',
'upload_date': '20230201',
},
'playlist_count': 25,
}, {
'url': 'https://www.radiofrance.fr/mouv/grille-programmes?date=19-03-2023',
'info_dict': {
'id': 'mouv-program-20230319',
'upload_date': '20230319',
},
'playlist_count': 3,
}, {
'url': 'https://www.radiofrance.fr/francemusique/grille-programmes?date=18-03-2023',
'info_dict': {
'id': 'francemusique-program-20230318',
'upload_date': '20230318',
},
'playlist_count': 15,
}, {
'url': 'https://www.radiofrance.fr/franceculture/grille-programmes',
'only_matching': True,
}]
def _generate_playlist_entries(self, webpage_url, api_response):
for entry in traverse_obj(api_response, ('steps', lambda _, v: v['expression']['path'])):
yield self.url_result(
urljoin(webpage_url, f'/{entry["expression"]["path"]}'), ie=FranceCultureIE,
url_transparent=True, **traverse_obj(entry, {
'title': ('expression', 'title'),
'thumbnail': ('expression', 'visual', 'src'),
'timestamp': ('startTime', {int_or_none}),
'series_id': ('concept', 'id'),
'series': ('concept', 'title'),
}))
def _real_extract(self, url):
station, date = self._match_valid_url(url).group('station', 'date')
webpage = self._download_webpage(url, station)
grid_data = self._extract_data_from_webpage(webpage, station, 'grid')
upload_date = strftime_or_none(grid_data.get('date'), '%Y%m%d')
return self.playlist_result(
self._generate_playlist_entries(url, grid_data),
join_nonempty(station, 'program', upload_date), upload_date=upload_date)
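
Both RadioFrance playlist classes share a cursor-paginated generator: yield the current page's entries, then follow `pagination.next` until the API stops returning a cursor; `itertools.count(2)` only numbers the follow-up pages for download notes. A stubbed-out skeleton of that control flow:

```python
import itertools

PAGES = {  # hypothetical two-page API, keyed by cursor
    None: {'items': ['ep1', 'ep2'], 'next': 'cursor-2'},
    'cursor-2': {'items': ['ep3'], 'next': None},
}

def call_api(cursor, page_num):
    # Stands in for _call_api's 'Downloading page N' request
    return PAGES[cursor]

def generate_entries(response):
    for page_num in itertools.count(2):
        yield from response['items']
        next_cursor = response.get('next')
        if not next_cursor:
            break
        response = call_api(next_cursor, page_num)

assert list(generate_entries(PAGES[None])) == ['ep1', 'ep2', 'ep3']
```
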
@ -1,10 +1,11 @@
import re import re
from .common import InfoExtractor from .common import InfoExtractor
from ..utils import parse_qs, remove_start, traverse_obj, ExtractorError
class RbgTumIE(InfoExtractor): class RbgTumIE(InfoExtractor):
_VALID_URL = r'https://live\.rbg\.tum\.de/w/(?P<id>.+)' _VALID_URL = r'https://(?:live\.rbg\.tum\.de|tum\.live)/w/(?P<id>[^?#]+)'
_TESTS = [{ _TESTS = [{
# Combined view # Combined view
'url': 'https://live.rbg.tum.de/w/cpp/22128', 'url': 'https://live.rbg.tum.de/w/cpp/22128',
@ -35,16 +36,18 @@ class RbgTumIE(InfoExtractor):
'title': 'Fachschaftsvollversammlung', 'title': 'Fachschaftsvollversammlung',
'series': 'Fachschaftsvollversammlung Informatik', 'series': 'Fachschaftsvollversammlung Informatik',
} }
}, {
'url': 'https://tum.live/w/linalginfo/27102',
'only_matching': True,
}, ] }, ]
def _real_extract(self, url): def _real_extract(self, url):
video_id = self._match_id(url) video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id) webpage = self._download_webpage(url, video_id)
m3u8 = self._html_search_regex(r'(https://.+?\.m3u8)', webpage, 'm3u8') m3u8 = self._html_search_regex(r'"(https://[^"]+\.m3u8[^"]*)', webpage, 'm3u8')
lecture_title = self._html_search_regex(r'(?si)<h1.*?>(.*)</h1>', webpage, 'title') lecture_title = self._html_search_regex(r'<h1[^>]*>([^<]+)</h1>', webpage, 'title', fatal=False)
lecture_series_title = self._html_search_regex( lecture_series_title = remove_start(self._html_extract_title(webpage), 'TUM-Live | ')
r'(?s)<title\b[^>]*>\s*(?:TUM-Live\s\|\s?)?([^:]+):?.*?</title>', webpage, 'series')
formats = self._extract_m3u8_formats(m3u8, video_id, 'mp4', entry_protocol='m3u8_native', m3u8_id='hls') formats = self._extract_m3u8_formats(m3u8, video_id, 'mp4', entry_protocol='m3u8_native', m3u8_id='hls')
@ -57,9 +60,9 @@ def _real_extract(self, url):
class RbgTumCourseIE(InfoExtractor): class RbgTumCourseIE(InfoExtractor):
_VALID_URL = r'https://live\.rbg\.tum\.de/course/(?P<id>.+)' _VALID_URL = r'https://(?P<hostname>(?:live\.rbg\.tum\.de|tum\.live))/old/course/(?P<id>(?P<year>\d+)/(?P<term>\w+)/(?P<slug>[^/?#]+))'
_TESTS = [{ _TESTS = [{
'url': 'https://live.rbg.tum.de/course/2022/S/fpv', 'url': 'https://live.rbg.tum.de/old/course/2022/S/fpv',
'info_dict': { 'info_dict': {
'title': 'Funktionale Programmierung und Verifikation (IN0003)', 'title': 'Funktionale Programmierung und Verifikation (IN0003)',
'id': '2022/S/fpv', 'id': '2022/S/fpv',
@ -69,7 +72,7 @@ class RbgTumCourseIE(InfoExtractor):
}, },
'playlist_count': 13, 'playlist_count': 13,
}, { }, {
'url': 'https://live.rbg.tum.de/course/2022/W/set', 'url': 'https://live.rbg.tum.de/old/course/2022/W/set',
'info_dict': { 'info_dict': {
'title': 'SET FSMPIC', 'title': 'SET FSMPIC',
'id': '2022/W/set', 'id': '2022/W/set',
@ -78,16 +81,62 @@ class RbgTumCourseIE(InfoExtractor):
'noplaylist': False, 'noplaylist': False,
}, },
'playlist_count': 6, 'playlist_count': 6,
}, {
'url': 'https://tum.live/old/course/2023/S/linalginfo',
'only_matching': True,
}, ] }, ]
def _real_extract(self, url): def _real_extract(self, url):
course_id = self._match_id(url) course_id, hostname, year, term, slug = self._match_valid_url(url).group('id', 'hostname', 'year', 'term', 'slug')
webpage = self._download_webpage(url, course_id) meta = self._download_json(
f'https://{hostname}/api/courses/{slug}/', course_id, fatal=False,
query={'year': year, 'term': term}) or {}
lecture_series_title = meta.get('Name')
lectures = [self.url_result(f'https://{hostname}/w/{slug}/{stream_id}', RbgTumIE)
for stream_id in traverse_obj(meta, ('Streams', ..., 'ID'))]
lecture_series_title = self._html_search_regex(r'(?si)<h1.*?>(.*)</h1>', webpage, 'title') if not lectures:
webpage = self._download_webpage(url, course_id)
lecture_series_title = remove_start(self._html_extract_title(webpage), 'TUM-Live | ')
lectures = [self.url_result(f'https://{hostname}{lecture_path}', RbgTumIE)
for lecture_path in re.findall(r'href="(/w/[^/"]+/[^/"]+)"', webpage)]
lecture_urls = [] return self.playlist_result(lectures, course_id, lecture_series_title)
for lecture_url in re.findall(r'(?i)href="/w/(.+)(?<!/cam)(?<!/pres)(?<!/chat)"', webpage):
lecture_urls.append(self.url_result('https://live.rbg.tum.de/w/' + lecture_url, ie=RbgTumIE.ie_key()))
return self.playlist_result(lecture_urls, course_id, lecture_series_title)
class RbgTumNewCourseIE(InfoExtractor):
_VALID_URL = r'https://(?P<hostname>(?:live\.rbg\.tum\.de|tum\.live))/\?'
_TESTS = [{
'url': 'https://live.rbg.tum.de/?year=2022&term=S&slug=fpv&view=3',
'info_dict': {
'title': 'Funktionale Programmierung und Verifikation (IN0003)',
'id': '2022/S/fpv',
},
'params': {
'noplaylist': False,
},
'playlist_count': 13,
}, {
'url': 'https://live.rbg.tum.de/?year=2022&term=W&slug=set&view=3',
'info_dict': {
'title': 'SET FSMPIC',
'id': '2022/W/set',
},
'params': {
'noplaylist': False,
},
'playlist_count': 6,
}, {
'url': 'https://tum.live/?year=2023&term=S&slug=linalginfo&view=3',
'only_matching': True,
}]
def _real_extract(self, url):
query = parse_qs(url)
errors = [key for key in ('year', 'term', 'slug') if not query.get(key)]
if errors:
raise ExtractorError(f'Input URL is missing query parameters: {", ".join(errors)}')
year, term, slug = query['year'][0], query['term'][0], query['slug'][0]
hostname = self._match_valid_url(url).group('hostname')
return self.url_result(f'https://{hostname}/old/course/{year}/{term}/{slug}', RbgTumCourseIE)
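
New-style tum.live course links carry the course key purely in the query string, so `RbgTumNewCourseIE` validates `year`/`term`/`slug` and bounces to the canonical `/old/course/...` URL handled above. The normalization on its own, using yt-dlp's `parse_qs`:

```python
from yt_dlp.utils import ExtractorError, parse_qs

def canonical_course_url(url, hostname='live.rbg.tum.de'):
    query = parse_qs(url)
    missing = [key for key in ('year', 'term', 'slug') if not query.get(key)]
    if missing:
        raise ExtractorError(f'Input URL is missing query parameters: {", ".join(missing)}')
    year, term, slug = (query[k][0] for k in ('year', 'term', 'slug'))
    return f'https://{hostname}/old/course/{year}/{term}/{slug}'

assert (canonical_course_url('https://live.rbg.tum.de/?year=2022&term=S&slug=fpv&view=3')
        == 'https://live.rbg.tum.de/old/course/2022/S/fpv')
```
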
@ -319,16 +319,20 @@ def add_thumbnail(src):
'format_id': 'fallback', 'format_id': 'fallback',
'format_note': 'DASH video, mp4_dash', 'format_note': 'DASH video, mp4_dash',
}] }]
formats.extend(self._extract_m3u8_formats( hls_fmts, subtitles = self._extract_m3u8_formats_and_subtitles(
hls_playlist_url, display_id, 'mp4', m3u8_id='hls', fatal=False)) hls_playlist_url, display_id, 'mp4', m3u8_id='hls', fatal=False)
formats.extend(self._extract_mpd_formats( formats.extend(hls_fmts)
dash_playlist_url, display_id, mpd_id='dash', fatal=False)) dash_fmts, dash_subs = self._extract_mpd_formats_and_subtitles(
dash_playlist_url, display_id, mpd_id='dash', fatal=False)
formats.extend(dash_fmts)
self._merge_subtitles(dash_subs, target=subtitles)
return { return {
**info, **info,
'id': video_id, 'id': video_id,
'display_id': display_id, 'display_id': display_id,
'formats': formats, 'formats': formats,
'subtitles': subtitles,
'duration': int_or_none(reddit_video.get('duration')), 'duration': int_or_none(reddit_video.get('duration')),
} }
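
Switching Reddit to the `_and_subtitles` extractor variants yields one subtitles dict per manifest, and `_merge_subtitles` folds the DASH captions into the HLS ones so a single dict reaches the info dict. At its core the merge is per-language concatenation that skips tracks already present; a minimal stand-in (the real helper dedupes on `(url, data)` pairs, so treat this as a sketch):

```python
def merge_subtitles(target, extra):
    # Essence of InfoExtractor._merge_subtitles: per-language concatenation,
    # skipping tracks whose URL is already known for that language
    for lang, tracks in (extra or {}).items():
        known = {t.get('url') for t in target.get(lang, [])}
        target.setdefault(lang, []).extend(
            t for t in tracks if t.get('url') not in known)
    return target

subtitles = {'en': [{'url': 'https://v.redd.it/x/hls_en.m3u8', 'ext': 'vtt'}]}
dash_subs = {'en': [{'url': 'https://v.redd.it/x/dash_en.vtt', 'ext': 'vtt'}],
             'de': [{'url': 'https://v.redd.it/x/dash_de.vtt', 'ext': 'vtt'}]}
merge_subtitles(subtitles, dash_subs)
assert len(subtitles['en']) == 2 and 'de' in subtitles
```
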
@@ -1,6 +1,7 @@
 from .common import InfoExtractor
 from ..utils import (
     ExtractorError,
+    int_or_none,
     parse_duration,
     traverse_obj,
     unified_timestamp,
@@ -25,7 +26,7 @@ class RTVSLOIE(InfoExtractor):
         'url': 'https://www.rtvslo.si/rtv365/arhiv/174842550?s=tv',
         'info_dict': {
             'id': '174842550',
-            'ext': 'flv',
+            'ext': 'mp4',
             'release_timestamp': 1643140032,
             'upload_date': '20220125',
             'series': 'Dnevnik',
@@ -69,7 +70,21 @@ class RTVSLOIE(InfoExtractor):
             'tbr': 128000,
             'release_date': '20220201',
         },
+    }, {
+        'url': 'https://365.rtvslo.si/arhiv/razred-zase/148350750',
+        'info_dict': {
+            'id': '148350750',
+            'ext': 'mp4',
+            'title': 'Prvi šolski dan, mozaična oddaja za mlade',
+            'series': 'Razred zase',
+            'series_id': '148185730',
+            'duration': 1481,
+            'upload_date': '20121019',
+            'timestamp': 1350672122,
+            'release_date': '20121019',
+            'release_timestamp': 1350672122,
+            'thumbnail': 'https://img.rtvcdn.si/_up/ava/ava_misc/show_logos/148185730/razred_zase_2014_logo_4d_wide2.jpg',
+        },
     }, {
         'url': 'https://4d.rtvslo.si/arhiv/dnevnik/174842550',
         'only_matching': True
@@ -98,13 +113,14 @@ def _real_extract(self, url):
         media = self._download_json(self._API_BASE.format('getMedia', v_id), v_id, query={'jwt': jwt})['response']

         formats = []
+        skip_protocols = ['smil', 'f4m', 'dash']
         adaptive_url = traverse_obj(media, ('addaptiveMedia', 'hls_sec'), expected_type=url_or_none)
         if adaptive_url:
-            formats = self._extract_wowza_formats(adaptive_url, v_id, skip_protocols=['smil'])
+            formats = self._extract_wowza_formats(adaptive_url, v_id, skip_protocols=skip_protocols)

         adaptive_url = traverse_obj(media, ('addaptiveMedia_sl', 'hls_sec'), expected_type=url_or_none)
         if adaptive_url:
-            for f in self._extract_wowza_formats(adaptive_url, v_id, skip_protocols=['smil']):
+            for f in self._extract_wowza_formats(adaptive_url, v_id, skip_protocols=skip_protocols):
                 formats.append({
                     **f,
                     'format_id': 'sign-' + f['format_id'],
@@ -114,19 +130,19 @@ def _real_extract(self, url):
                         else f.get('language'))
                 })

-        formats.extend(
-            {
-                'url': f['streams'][strm],
-                'ext': traverse_obj(f, 'mediaType', expected_type=str.lower),
-                'width': f.get('width'),
-                'height': f.get('height'),
-                'tbr': f.get('bitrate'),
-                'filesize': f.get('filesize'),
-            }
-            for strm in ('http', 'https')
-            for f in media.get('mediaFiles') or []
-            if traverse_obj(f, ('streams', strm))
-        )
+        for mediafile in traverse_obj(media, ('mediaFiles', lambda _, v: url_or_none(v['streams']['https']))):
+            formats.append(traverse_obj(mediafile, {
+                'url': ('streams', 'https'),
+                'ext': ('mediaType', {str.lower}),
+                'width': ('width', {int_or_none}),
+                'height': ('height', {int_or_none}),
+                'tbr': ('bitrate', {int_or_none}),
+                'filesize': ('filesize', {int_or_none}),
+            }))
+
+        for mediafile in traverse_obj(media, ('mediaFiles', lambda _, v: url_or_none(v['streams']['hls_sec']))):
+            formats.extend(self._extract_wowza_formats(
+                mediafile['streams']['hls_sec'], v_id, skip_protocols=skip_protocols))

         if any('intermission.mp4' in x['url'] for x in formats):
             self.raise_geo_restricted(countries=self._GEO_COUNTRIES, metadata_available=True)
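
The rewrite above uses traverse_obj's dict-mapping form: a mapping of output key to traversal path (with {callable} transforms) turns a raw API object into format fields in one call. An illustration with a made-up media object:

    from yt_dlp.utils import int_or_none, traverse_obj

    mediafile = {'streams': {'https': 'https://example.com/v.mp4'},
                 'mediaType': 'MP4', 'width': '1280', 'height': '720', 'bitrate': '1800'}
    print(traverse_obj(mediafile, {
        'url': ('streams', 'https'),
        'ext': ('mediaType', {str.lower}),
        'width': ('width', {int_or_none}),
        'height': ('height', {int_or_none}),
        'tbr': ('bitrate', {int_or_none}),
    }))
    # {'url': 'https://example.com/v.mp4', 'ext': 'mp4', 'width': 1280, 'height': 720, 'tbr': 1800}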


@@ -1,6 +1,6 @@
 import re

-from ..utils import parse_duration
+from ..utils import parse_duration, unescapeHTML
 from .common import InfoExtractor
@@ -16,7 +16,8 @@ class Rule34VideoIE(InfoExtractor):
             'title': 'Shot It-(mmd hmv)',
             'thumbnail': 'https://rule34video.com/contents/videos_screenshots/3065000/3065157/preview.jpg',
             'duration': 347.0,
-            'age_limit': 18
+            'age_limit': 18,
+            'tags': 'count:14'
         }
     },
     {
@@ -28,7 +29,8 @@ class Rule34VideoIE(InfoExtractor):
             'title': 'Lara in Trouble Ep. 7 [WildeerStudio]',
             'thumbnail': 'https://rule34video.com/contents/videos_screenshots/3065000/3065296/preview.jpg',
             'duration': 938.0,
-            'age_limit': 18
+            'age_limit': 18,
+            'tags': 'count:50'
         }
     },
 ]
@@ -57,5 +59,7 @@ def _real_extract(self, url):
             'title': title,
             'thumbnail': thumbnail,
             'duration': parse_duration(duration),
-            'age_limit': 18
+            'age_limit': 18,
+            'tags': list(map(unescapeHTML, re.findall(
+                r'<a class="tag_item"[^>]+\bhref="https://rule34video\.com/tags/\d+/"[^>]*>(?P<tag>[^>]*)</a>', webpage))),
         }
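
The new tag extraction pairs re.findall with unescapeHTML so HTML entities in tag names are decoded. In isolation (sample markup made up):

    import re

    from yt_dlp.utils import unescapeHTML

    webpage = '<a class="tag_item" href="https://rule34video.com/tags/123/">tag &amp; name</a>'
    tags = list(map(unescapeHTML, re.findall(
        r'<a class="tag_item"[^>]+\bhref="https://rule34video\.com/tags/\d+/"[^>]*>(?P<tag>[^>]*)</a>', webpage)))
    print(tags)  # ['tag & name']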


@@ -33,7 +33,7 @@ class RumbleEmbedIE(InfoExtractor):
             'upload_date': '20191020',
             'channel_url': 'https://rumble.com/c/WMAR',
             'channel': 'WMAR',
-            'thumbnail': 'https://sp.rmbl.ws/s8/1/5/M/z/1/5Mz1a.OvCc-small-WMAR-2-News-Latest-Headline.jpg',
+            'thumbnail': 'https://sp.rmbl.ws/s8/1/5/M/z/1/5Mz1a.qR4e-small-WMAR-2-News-Latest-Headline.jpg',
             'duration': 234,
             'uploader': 'WMAR',
             'live_status': 'not_live',
@@ -84,7 +84,7 @@ class RumbleEmbedIE(InfoExtractor):
         'info_dict': {
             'id': 'v1essrt',
             'ext': 'mp4',
-            'title': 'startswith:lofi hip hop radio - beats to relax/study',
+            'title': 'startswith:lofi hip hop radio 📚 - beats to relax/study to',
             'timestamp': 1661519399,
             'upload_date': '20220826',
             'channel_url': 'https://rumble.com/c/LofiGirl',
@@ -99,7 +99,7 @@ class RumbleEmbedIE(InfoExtractor):
         'url': 'https://rumble.com/embed/v1amumr',
         'info_dict': {
             'id': 'v1amumr',
-            'ext': 'webm',
+            'ext': 'mp4',
             'fps': 60,
             'title': 'Turning Point USA 2022 Student Action Summit DAY 1 - Rumble Exclusive Live',
             'timestamp': 1658518457,
@@ -129,7 +129,7 @@ class RumbleEmbedIE(InfoExtractor):
             'duration': 92,
             'title': '911 Audio From The Man Who Wanted To Kill Supreme Court Justice Kavanaugh',
             'channel_url': 'https://rumble.com/c/RichSementa',
-            'thumbnail': 'https://sp.rmbl.ws/s8/1/P/j/f/A/PjfAe.OvCc-small-911-Audio-From-The-Man-Who-.jpg',
+            'thumbnail': 'https://sp.rmbl.ws/s8/1/P/j/f/A/PjfAe.qR4e-small-911-Audio-From-The-Man-Who-.jpg',
             'timestamp': 1654892716,
             'uploader': 'Mr Producer Media',
             'upload_date': '20220610',
@@ -144,7 +144,7 @@ def _extract_embed_urls(cls, url, webpage):
         if embeds:
             return embeds
         return [f'https://rumble.com/embed/{mobj.group("id")}' for mobj in re.finditer(
-            r'<script>[^<]*\bRumble\(\s*"play"\s*,\s*{\s*[\'"]?video[\'"]?\s*:\s*[\'"](?P<id>[0-9a-z]+)[\'"]', webpage)]
+            r'<script>[^<]*\bRumble\(\s*"play"\s*,\s*{[^}]*[\'"]?video[\'"]?\s*:\s*[\'"](?P<id>[0-9a-z]+)[\'"]', webpage)]

     def _real_extract(self, url):
         video_id = self._match_id(url)
@@ -236,7 +236,9 @@ def _real_extract(self, url):

 class RumbleIE(InfoExtractor):
     _VALID_URL = r'https?://(?:www\.)?rumble\.com/(?P<id>v(?!ideos)[\w.-]+)[^/]*$'
-    _EMBED_REGEX = [r'<a class=video-item--a href=(?P<url>/v[\w.-]+\.html)>']
+    _EMBED_REGEX = [
+        r'<a class=video-item--a href=(?P<url>/v[\w.-]+\.html)>',
+        r'<a[^>]+class="videostream__link link"[^>]+href=(?P<url>/v[\w.-]+\.html)[^>]*>']
     _TESTS = [{
         'add_ie': ['RumbleEmbed'],
         'url': 'https://rumble.com/vdmum1-moose-the-dog-helps-girls-dig-a-snow-fort.html',
@@ -254,6 +256,7 @@ class RumbleIE(InfoExtractor):
             'thumbnail': r're:https://.+\.jpg',
             'duration': 103,
             'like_count': int,
+            'dislike_count': int,
             'view_count': int,
             'live_status': 'not_live',
         }
@@ -278,6 +281,9 @@ class RumbleIE(InfoExtractor):
             'channel_url': 'https://rumble.com/c/Redacted',
             'live_status': 'not_live',
             'thumbnail': 'https://sp.rmbl.ws/s8/1/d/x/2/O/dx2Oi.qR4e-small-The-U.S.-CANNOT-hide-this-i.jpg',
+            'like_count': int,
+            'dislike_count': int,
+            'view_count': int,
         },
     }, {
         'url': 'https://rumble.com/v2e7fju-the-covid-twitter-files-drop-protecting-fauci-while-censoring-the-truth-wma.html',
@@ -296,12 +302,15 @@ class RumbleIE(InfoExtractor):
             'channel_url': 'https://rumble.com/c/KimIversen',
             'channel': 'Kim Iversen',
             'thumbnail': 'https://sp.rmbl.ws/s8/1/6/b/w/O/6bwOi.qR4e-small-The-Covid-Twitter-Files-Dro.jpg',
+            'like_count': int,
+            'dislike_count': int,
+            'view_count': int,
         },
     }]

     _WEBPAGE_TESTS = [{
         'url': 'https://rumble.com/videos?page=2',
-        'playlist_count': 25,
+        'playlist_mincount': 24,
         'info_dict': {
             'id': 'videos?page=2',
             'title': 'All videos',
@@ -309,17 +318,16 @@ class RumbleIE(InfoExtractor):
             'age_limit': 0,
         },
     }, {
-        'url': 'https://rumble.com/live-videos',
-        'playlist_mincount': 19,
+        'url': 'https://rumble.com/browse/live',
+        'playlist_mincount': 25,
         'info_dict': {
-            'id': 'live-videos',
-            'title': 'Live Videos',
-            'description': 'Live videos on Rumble.com',
+            'id': 'live',
+            'title': 'Browse',
             'age_limit': 0,
         },
     }, {
         'url': 'https://rumble.com/search/video?q=rumble&sort=views',
-        'playlist_count': 24,
+        'playlist_mincount': 24,
         'info_dict': {
             'id': 'video?q=rumble&sort=views',
             'title': 'Search results for: rumble',
@@ -334,19 +342,20 @@ def _real_extract(self, url):
         if not url_info:
             raise UnsupportedError(url)

-        release_ts_str = self._search_regex(
-            r'(?:Livestream begins|Streamed on):\s+<time datetime="([^"]+)',
-            webpage, 'release date', fatal=False, default=None)
-        view_count_str = self._search_regex(r'<span class="media-heading-info">([\d,]+) Views',
-                                            webpage, 'view count', fatal=False, default=None)
-
-        return self.url_result(
-            url_info['url'], ie_key=url_info['ie_key'], url_transparent=True,
-            view_count=parse_count(view_count_str),
-            release_timestamp=parse_iso8601(release_ts_str),
-            like_count=parse_count(get_element_by_class('rumbles-count', webpage)),
-            description=clean_html(get_element_by_class('media-description', webpage)),
-        )
+        return {
+            '_type': 'url_transparent',
+            'ie_key': url_info['ie_key'],
+            'url': url_info['url'],
+            'release_timestamp': parse_iso8601(self._search_regex(
+                r'(?:Livestream begins|Streamed on):\s+<time datetime="([^"]+)', webpage, 'release date', default=None)),
+            'view_count': int_or_none(self._search_regex(
+                r'"userInteractionCount"\s*:\s*(\d+)', webpage, 'view count', default=None)),
+            'like_count': parse_count(self._search_regex(
+                r'<span data-js="rumbles_up_votes">\s*([\d,.KM]+)', webpage, 'like count', default=None)),
+            'dislike_count': parse_count(self._search_regex(
+                r'<span data-js="rumbles_down_votes">\s*([\d,.KM]+)', webpage, 'dislike count', default=None)),
+            'description': clean_html(get_element_by_class('media-description', webpage))
+        }


 class RumbleChannelIE(InfoExtractor):
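
The reworked metadata scraping leans on parse_count, which normalizes abbreviated counter strings like the rumbles_up_votes spans, while the raw "userInteractionCount" digits go through int_or_none. Quick behavior check (values illustrative):

    from yt_dlp.utils import int_or_none, parse_count

    print(parse_count('1.2K'))   # 1200
    print(parse_count('3,456'))  # 3456
    print(int_or_none('42'))     # 42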


@@ -1,5 +1,5 @@
 from .common import InfoExtractor
-from ..utils import traverse_obj
+from ..utils import traverse_obj, url_or_none


 class S4CIE(InfoExtractor):
@@ -11,7 +11,8 @@ class S4CIE(InfoExtractor):
             'ext': 'mp4',
             'title': 'Y Swn',
             'description': 'md5:f7681a30e4955b250b3224aa9fe70cf0',
-            'duration': 5340
+            'duration': 5340,
+            'thumbnail': 'https://www.s4c.cymru/amg/1920x1080/Y_Swn_2023S4C_099_ii.jpg'
         },
     }, {
         'url': 'https://www.s4c.cymru/clic/programme/856636948',
@@ -21,6 +22,7 @@ class S4CIE(InfoExtractor):
             'title': 'Am Dro',
             'duration': 2880,
             'description': 'md5:100d8686fc9a632a0cb2db52a3433ffe',
+            'thumbnail': 'https://www.s4c.cymru/amg/1920x1080/Am_Dro_2022-23S4C_P6_4005.jpg'
         },
     }]
@@ -30,7 +32,7 @@ def _real_extract(self, url):
             f'https://www.s4c.cymru/df/full_prog_details?lang=e&programme_id={video_id}',
             video_id, fatal=False)

-        filename = self._download_json(
+        player_config = self._download_json(
             'https://player-api.s4c-cdn.co.uk/player-configuration/prod', video_id, query={
                 'programme_id': video_id,
                 'signed': '0',
@@ -38,7 +40,13 @@ def _real_extract(self, url):
                 'mode': 'od',
                 'appId': 'clic',
                 'streamName': '',
-            }, note='Downloading player config JSON')['filename']
+            }, note='Downloading player config JSON')
+        subtitles = {}
+        for sub in traverse_obj(player_config, ('subtitles', lambda _, v: url_or_none(v['0']))):
+            subtitles.setdefault(sub.get('3', 'en'), []).append({
+                'url': sub['0'],
+                'name': sub.get('1'),
+            })
         m3u8_url = self._download_json(
             'https://player-api.s4c-cdn.co.uk/streaming-urls/prod', video_id, query={
                 'mode': 'od',
@@ -46,17 +54,52 @@ def _real_extract(self, url):
                 'region': 'WW',
                 'extra': 'false',
                 'thirdParty': 'false',
-                'filename': filename,
+                'filename': player_config['filename'],
             }, note='Downloading streaming urls JSON')['hls']
-        formats, subtitles = self._extract_m3u8_formats_and_subtitles(m3u8_url, video_id, 'mp4', m3u8_id='hls')

         return {
             'id': video_id,
-            'formats': formats,
+            'formats': self._extract_m3u8_formats(m3u8_url, video_id, 'mp4', m3u8_id='hls'),
             'subtitles': subtitles,
+            'thumbnail': url_or_none(player_config.get('poster')),
             **traverse_obj(details, ('full_prog_details', 0, {
                 'title': (('programme_title', 'series_title'), {str}),
                 'description': ('full_billing', {str.strip}),
                 'duration': ('duration', {lambda x: int(x) * 60}),
             }), get_all=False),
         }
+
+
+class S4CSeriesIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?s4c\.cymru/clic/series/(?P<id>\d+)'
+    _TESTS = [{
+        'url': 'https://www.s4c.cymru/clic/series/864982911',
+        'playlist_mincount': 6,
+        'info_dict': {
+            'id': '864982911',
+            'title': 'Iaith ar Daith',
+            'description': 'md5:e878ebf660dce89bd2ef521d7ce06397'
+        },
+    }, {
+        'url': 'https://www.s4c.cymru/clic/series/866852587',
+        'playlist_mincount': 8,
+        'info_dict': {
+            'id': '866852587',
+            'title': 'FFIT Cymru',
+            'description': 'md5:abcb3c129cb68dbb6cd304fd33b07e96'
+        },
+    }]
+
+    def _real_extract(self, url):
+        series_id = self._match_id(url)
+        series_details = self._download_json(
+            'https://www.s4c.cymru/df/series_details', series_id, query={
+                'lang': 'e',
+                'series_id': series_id,
+                'show_prog_in_series': 'Y'
+            }, note='Downloading series details JSON')
+
+        return self.playlist_result(
+            [self.url_result(f'https://www.s4c.cymru/clic/programme/{episode_id}', S4CIE, episode_id)
+             for episode_id in traverse_obj(series_details, ('other_progs_in_series', ..., 'id'))],
+            series_id, traverse_obj(series_details, ('full_prog_details', 0, 'series_title', {str})))
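
Judging by the code above, the S4C player config apparently stores each subtitle track under positional string keys ('0' = URL, '1' = display name, '3' = language code); the setdefault pattern groups the tracks per language. A standalone sketch with made-up data:

    subtitles = {}
    for sub in [{'0': 'https://example.com/en.vtt', '1': 'English', '3': 'en'},
                {'0': 'https://example.com/cy.vtt', '1': 'Cymraeg', '3': 'cy'}]:
        subtitles.setdefault(sub.get('3', 'en'), []).append({
            'url': sub['0'],
            'name': sub.get('1'),
        })
    print(subtitles)  # one list of tracks per language code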


@@ -1,3 +1,4 @@
+import base64
 import re

 from .common import InfoExtractor
@@ -8,7 +9,12 @@
 from ..utils import (
     ExtractorError,
     int_or_none,
+    float_or_none,
+    url_or_none,
+    unified_timestamp,
     try_get,
+    urljoin,
+    traverse_obj,
 )
@@ -31,13 +37,20 @@ class SohuIE(InfoExtractor):
             'id': '409385080',
             'ext': 'mp4',
             'title': '《2015湖南卫视羊年元宵晚会》唐嫣《花好月圆》',
-        }
+        },
+        'skip': 'no longer available',
     }, {
         'url': 'http://my.tv.sohu.com/us/232799889/78693464.shtml',
         'info_dict': {
             'id': '78693464',
             'ext': 'mp4',
             'title': '【爱范品】第31期MWC见不到的奇葩手机',
+            'uploader': '爱范儿视频',
+            'duration': 213,
+            'timestamp': 1425519600,
+            'upload_date': '20150305',
+            'thumbnail': 'http://e3f49eaa46b57.cdn.sohucs.com//group1/M10/83/FA/MTAuMTAuODguODA=/6_14cbccdde5eg104SysCutcloud_78693464_7_0b.jpg',
+            'tags': ['爱范儿', '爱范品', 'MWC', '手机'],
         }
     }, {
         'note': 'Multipart video',
@@ -45,6 +58,12 @@ class SohuIE(InfoExtractor):
         'info_dict': {
             'id': '78910339',
             'title': '【神探苍实战秘籍】第13期 战争之影 赫卡里姆',
+            'uploader': '小苍cany',
+            'duration': 744.0,
+            'timestamp': 1426269360,
+            'upload_date': '20150313',
+            'thumbnail': 'http://e3f49eaa46b57.cdn.sohucs.com//group1/M11/89/57/MTAuMTAuODguODA=/6_14cea022a1dg102SysCutcloud_78910339_8_0b.jpg',
+            'tags': ['小苍MM', '英雄联盟', '实战秘籍'],
         },
         'playlist': [{
             'info_dict': {
@@ -75,6 +94,11 @@ class SohuIE(InfoExtractor):
             'id': '78932792',
             'ext': 'mp4',
             'title': 'youtube-dl testing video',
+            'duration': 360,
+            'timestamp': 1426348620,
+            'upload_date': '20150314',
+            'thumbnail': 'http://e3f49eaa46b57.cdn.sohucs.com//group1/M02/8A/00/MTAuMTAuODguNzk=/6_14cee1be192g102SysCutcloud_78932792_7_7b.jpg',
+            'tags': [],
         },
         'params': {
             'skip_download': True
@@ -100,7 +124,7 @@ def _fetch_data(vid_id, mytv=False):

         webpage = self._download_webpage(url, video_id)

-        title = re.sub(r' - 搜狐视频$', '', self._og_search_title(webpage))
+        title = re.sub(r'( - 高清正版在线观看)? - 搜狐视频$', '', self._og_search_title(webpage))

         vid = self._html_search_regex(
             r'var vid ?= ?["\'](\d+)["\']',
@@ -132,7 +156,9 @@ def _fetch_data(vid_id, mytv=False):
             allot = format_data['allot']
             data = format_data['data']

-            clips_url = data['clipsURL']
+            clip_url = traverse_obj(data, (('clipsURL', 'mp4PlayUrl'), i, {url_or_none}), get_all=False)
+            if not clip_url:
+                raise ExtractorError(f'Unable to extract url for clip {i}')
             su = data['su']

             video_url = 'newflv.sohu.ccgslb.net'
@@ -142,9 +168,9 @@ def _fetch_data(vid_id, mytv=False):
             while 'newflv.sohu.ccgslb.net' in video_url:
                 params = {
                     'prot': 9,
-                    'file': clips_url[i],
+                    'file': clip_url,
                     'new': su[i],
-                    'prod': 'flash',
+                    'prod': 'h5n',
                     'rb': 1,
                 }
@@ -193,6 +219,75 @@ def _fetch_data(vid_id, mytv=False):
             'entries': playlist,
             'id': video_id,
             'title': title,
+            'duration': traverse_obj(vid_data, ('data', 'totalDuration', {float_or_none})),
         }

-        return info
+        if mytv:
+            publish_time = unified_timestamp(self._search_regex(
+                r'publishTime:\s*["\'](\d+-\d+-\d+ \d+:\d+)["\']', webpage, 'publish time', fatal=False))
+        else:
+            publish_time = traverse_obj(vid_data, ('tv_application_time', {unified_timestamp}))
+        return {
+            'timestamp': publish_time - 8 * 3600 if publish_time else None,
+            **traverse_obj(vid_data, {
+                'alt_title': ('data', 'subName', {str}),
+                'uploader': ('wm_data', 'wm_username', {str}),
+                'thumbnail': ('data', 'coverImg', {url_or_none}),
+                'tags': ('data', 'tag', {str.split}),
+            }),
+            **info,
+        }
+
+
+class SohuVIE(InfoExtractor):
+    _VALID_URL = r'https?://tv\.sohu\.com/v/(?P<id>[\w=-]+)\.html(?:$|[#?])'
+
+    _TESTS = [{
+        'note': 'Multipart video',
+        'url': 'https://tv.sohu.com/v/MjAyMzA2MTQvbjYwMTMxNTE5Mi5zaHRtbA==.html',
+        'info_dict': {
+            'id': '601315192',
+            'title': '《淬火丹心》第1集',
+            'alt_title': '“点天灯”发生事故',
+            'duration': 2701.692,
+            'timestamp': 1686758040,
+            'upload_date': '20230614',
+            'thumbnail': 'http://photocdn.tv.sohu.com/img/20230614/vrsa_hor_1686738763256_454010551.jpg',
+        },
+        'playlist_mincount': 9,
+        'skip': 'Only available in China',
+    }, {
+        'url': 'https://tv.sohu.com/v/dXMvMjMyNzk5ODg5Lzc4NjkzNDY0LnNodG1s.html',
+        'info_dict': {
+            'id': '78693464',
+            'ext': 'mp4',
+            'title': '【爱范品】第31期MWC见不到的奇葩手机',
+            'uploader': '爱范儿视频',
+            'duration': 213,
+            'timestamp': 1425519600,
+            'upload_date': '20150305',
+            'thumbnail': 'http://e3f49eaa46b57.cdn.sohucs.com//group1/M10/83/FA/MTAuMTAuODguODA=/6_14cbccdde5eg104SysCutcloud_78693464_7_0b.jpg',
+            'tags': ['爱范儿', '爱范品', 'MWC', '手机'],
+        }
+    }, {
+        'note': 'Multipart video',
+        'url': 'https://tv.sohu.com/v/dXMvMjQyNTYyMTYzLzc4OTEwMzM5LnNodG1s.html?src=pl',
+        'info_dict': {
+            'id': '78910339',
+            'title': '【神探苍实战秘籍】第13期 战争之影 赫卡里姆',
+            'uploader': '小苍cany',
+            'duration': 744.0,
+            'timestamp': 1426269360,
+            'upload_date': '20150313',
+            'thumbnail': 'http://e3f49eaa46b57.cdn.sohucs.com//group1/M11/89/57/MTAuMTAuODguODA=/6_14cea022a1dg102SysCutcloud_78910339_8_0b.jpg',
+            'tags': ['小苍MM', '英雄联盟', '实战秘籍'],
+        },
+        'playlist_mincount': 3,
+    }]
+
+    def _real_extract(self, url):
+        encoded_id = self._match_id(url)
+        path = base64.urlsafe_b64decode(encoded_id).decode()
+        subdomain = 'tv' if re.match(r'\d+/n\d+\.shtml', path) else 'my.tv'
+        return self.url_result(urljoin(f'http://{subdomain}.sohu.com/', path), SohuIE)
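
Two details worth noting above: the `- 8 * 3600` adjustment apparently converts the site's UTC+8 publish times into UTC epoch values, and the new SohuVIE URLs embed the legacy path as URL-safe base64. The decode step in isolation:

    import base64

    encoded_id = 'dXMvMjMyNzk5ODg5Lzc4NjkzNDY0LnNodG1s'  # from the test URL above
    print(base64.urlsafe_b64decode(encoded_id).decode())  # us/232799889/78693464.shtml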


@@ -15,7 +15,6 @@
     UserNotLive,
     determine_ext,
     format_field,
-    get_element_by_id,
     get_first,
     int_or_none,
     join_nonempty,
@@ -50,8 +49,9 @@ def _create_url(user_id, video_id):
         return f'https://www.tiktok.com/@{user_id or "_"}/video/{video_id}'

     def _get_sigi_state(self, webpage, display_id):
-        return self._parse_json(get_element_by_id(
-            'SIGI_STATE|sigi-persisted-data', webpage, escape_value=False), display_id)
+        return self._search_json(
+            r'<script[^>]+\bid="(?:SIGI_STATE|sigi-persisted-data)"[^>]*>', webpage,
+            'sigi state', display_id, end_pattern=r'</script>')

     def _call_api_impl(self, ep, query, manifest_app_version, video_id, fatal=True,
                        note='Downloading API JSON', errnote='Unable to download API page'):
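
For readers unfamiliar with _search_json, the replacement call behaves roughly like this plain re + json sketch (simplified; the real helper is more careful about nested and partial JSON):

    import json
    import re

    webpage = '<script id="SIGI_STATE">{"AppContext": {"lang": "en"}}</script>'  # dummy page
    sigi_state = json.loads(re.search(
        r'<script[^>]+\bid="(?:SIGI_STATE|sigi-persisted-data)"[^>]*>(.+?)</script>', webpage).group(1))
    print(sigi_state['AppContext']['lang'])  # en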


@@ -1,10 +1,14 @@
+import urllib.parse
+
 from .common import InfoExtractor
 from ..utils import (
     determine_ext,
     extract_attributes,
     int_or_none,
     parse_duration,
+    traverse_obj,
     try_get,
+    url_or_none,
 )
@@ -12,6 +16,36 @@ class TV5MondePlusIE(InfoExtractor):
     IE_DESC = 'TV5MONDE+'
     _VALID_URL = r'https?://(?:www\.)?(?:tv5mondeplus|revoir\.tv5monde)\.com/toutes-les-videos/[^/]+/(?P<id>[^/?#]+)'
     _TESTS = [{
+        # movie
+        'url': 'https://revoir.tv5monde.com/toutes-les-videos/cinema/les-novices',
+        'md5': 'c86f60bf8b75436455b1b205f9745955',
+        'info_dict': {
+            'id': 'ZX0ipMyFQq_6D4BA7b',
+            'display_id': 'les-novices',
+            'ext': 'mp4',
+            'title': 'Les novices',
+            'description': 'md5:2e7c33ba3ad48dabfcc2a956b88bde2b',
+            'upload_date': '20230821',
+            'thumbnail': 'https://revoir.tv5monde.com/uploads/media/video_thumbnail/0738/60/01e952b7ccf36b7c6007ec9131588954ab651de9.jpeg',
+            'duration': 5177,
+            'episode': 'Les novices',
+        },
+    }, {
+        # series episode
+        'url': 'https://revoir.tv5monde.com/toutes-les-videos/series-fictions/opj-les-dents-de-la-terre-2',
+        'info_dict': {
+            'id': 'wJ0eeEPozr_6D4BA7b',
+            'display_id': 'opj-les-dents-de-la-terre-2',
+            'ext': 'mp4',
+            'title': "OPJ - Les dents de la Terre (2)",
+            'description': 'md5:288f87fd68d993f814e66e60e5302d9d',
+            'upload_date': '20230823',
+            'series': 'OPJ',
+            'episode': 'Les dents de la Terre (2)',
+            'duration': 2877,
+            'thumbnail': 'https://dl-revoir.tv5monde.com/images/1a/5753448.jpg'
+        },
+    }, {
         # movie
         'url': 'https://revoir.tv5monde.com/toutes-les-videos/cinema/ceux-qui-travaillent',
         'md5': '32fa0cde16a4480d1251502a66856d5f',
@@ -23,6 +57,7 @@ class TV5MondePlusIE(InfoExtractor):
             'description': 'md5:570e8bb688036ace873b2d50d24c026d',
             'upload_date': '20210819',
         },
+        'skip': 'no longer available',
     }, {
         # series episode
         'url': 'https://revoir.tv5monde.com/toutes-les-videos/series-fictions/vestiaires-caro-actrice',
@@ -39,6 +74,7 @@ class TV5MondePlusIE(InfoExtractor):
         'params': {
             'skip_download': True,
         },
+        'skip': 'no longer available',
     }, {
         'url': 'https://revoir.tv5monde.com/toutes-les-videos/series-fictions/neuf-jours-en-hiver-neuf-jours-en-hiver',
         'only_matching': True,
@@ -63,20 +99,45 @@ def _real_extract(self, url):
         video_files = self._parse_json(
             vpl_data['data-broadcast'], display_id)
         formats = []
-        for video_file in video_files:
-            v_url = video_file.get('url')
-            if not v_url:
-                continue
-            video_format = video_file.get('format') or determine_ext(v_url)
-            if video_format == 'm3u8':
-                formats.extend(self._extract_m3u8_formats(
-                    v_url, display_id, 'mp4', 'm3u8_native',
-                    m3u8_id='hls', fatal=False))
-            else:
-                formats.append({
-                    'url': v_url,
-                    'format_id': video_format,
-                })
+        video_id = None
+
+        def process_video_files(v):
+            nonlocal video_id
+            for video_file in v:
+                v_url = video_file.get('url')
+                if not v_url:
+                    continue
+
+                if video_file.get('type') == 'application/deferred':
+                    d_param = urllib.parse.quote(v_url)
+                    token = video_file.get('token')
+                    if not token:
+                        continue
+                    deferred_json = self._download_json(
+                        f'https://api.tv5monde.com/player/asset/{d_param}/resolve?condenseKS=true', display_id,
+                        note='Downloading deferred info', headers={'Authorization': f'Bearer {token}'}, fatal=False)
+                    v_url = traverse_obj(deferred_json, (0, 'url', {url_or_none}))
+                    if not v_url:
+                        continue
+                    # data-guid from the webpage isn't stable, use the material id from the json urls
+                    video_id = self._search_regex(
+                        r'materials/([\da-zA-Z]{10}_[\da-fA-F]{7})/', v_url, 'video id', default=None)
+                    process_video_files(deferred_json)
+
+                video_format = video_file.get('format') or determine_ext(v_url)
+                if video_format == 'm3u8':
+                    formats.extend(self._extract_m3u8_formats(
+                        v_url, display_id, 'mp4', 'm3u8_native',
+                        m3u8_id='hls', fatal=False))
+                elif video_format == 'mpd':
+                    formats.extend(self._extract_mpd_formats(
+                        v_url, display_id, fatal=False))
+                else:
+                    formats.append({
+                        'url': v_url,
+                        'format_id': video_format,
+                    })
+
+        process_video_files(video_files)

         metadata = self._parse_json(
             vpl_data['data-metadata'], display_id)
@@ -100,10 +161,11 @@ def _real_extract(self, url):
         if upload_date:
             upload_date = upload_date.replace('_', '')

-        video_id = self._search_regex(
-            (r'data-guid=["\']([\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})',
-             r'id_contenu["\']\s:\s*(\d+)'), webpage, 'video id',
-            default=display_id)
+        if not video_id:
+            video_id = self._search_regex(
+                (r'data-guid=["\']([\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})',
+                 r'id_contenu["\']\s:\s*(\d+)'), webpage, 'video id',
+                default=display_id)

         return {
             'id': video_id,
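
The refactor turns the format loop into a closure that can recurse into deferred assets, reporting the discovered id back through nonlocal rather than a return value. The pattern in miniature:

    def outer():
        video_id = None

        def inner():
            nonlocal video_id  # rebind the enclosing variable instead of shadowing it
            video_id = 'ZX0ipMyFQq_6D4BA7b'

        inner()
        return video_id

    print(outer())  # ZX0ipMyFQq_6D4BA7b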


@@ -22,7 +22,7 @@


 class TwitCastingIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:[^/]+\.)?twitcasting\.tv/(?P<uploader_id>[^/]+)/(?:movie|twplayer)/(?P<id>\d+)'
+    _VALID_URL = r'https?://(?:[^/?#]+\.)?twitcasting\.tv/(?P<uploader_id>[^/?#]+)/(?:movie|twplayer)/(?P<id>\d+)'
     _M3U8_HEADERS = {
         'Origin': 'https://twitcasting.tv',
         'Referer': 'https://twitcasting.tv/',
@@ -231,7 +231,7 @@ def find_dmu(x):


 class TwitCastingLiveIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:[^/]+\.)?twitcasting\.tv/(?P<id>[^/]+)/?(?:[#?]|$)'
+    _VALID_URL = r'https?://(?:[^/?#]+\.)?twitcasting\.tv/(?P<id>[^/?#]+)/?(?:[#?]|$)'
     _TESTS = [{
         'url': 'https://twitcasting.tv/ivetesangalo',
         'only_matching': True,
@@ -265,8 +265,15 @@ def _real_extract(self, url):


 class TwitCastingUserIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:[^/]+\.)?twitcasting\.tv/(?P<id>[^/]+)/show/?(?:[#?]|$)'
+    _VALID_URL = r'https?://(?:[^/?#]+\.)?twitcasting\.tv/(?P<id>[^/?#]+)/(:?show|archive)/?(?:[#?]|$)'
     _TESTS = [{
+        'url': 'https://twitcasting.tv/natsuiromatsuri/archive/',
+        'info_dict': {
+            'id': 'natsuiromatsuri',
+            'title': 'natsuiromatsuri - Live History',
+        },
+        'playlist_mincount': 235,
+    }, {
         'url': 'https://twitcasting.tv/noriyukicas/show',
         'only_matching': True,
     }]
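
The `[^/]` to `[^/?#]` tightening matters because an id captured with `[^/]+` happily swallows a trailing query string; stopping at `?` and `#` keeps the id clean:

    import re

    print(re.match(r'(?P<id>[^/]+)', 'noriyukicas?page=2').group('id'))    # noriyukicas?page=2
    print(re.match(r'(?P<id>[^/?#]+)', 'noriyukicas?page=2').group('id'))  # noriyukicas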


@@ -1,9 +1,10 @@
+import functools
 import json
+import random
 import re

 from .common import InfoExtractor
 from .periscope import PeriscopeBaseIE, PeriscopeIE
-from ..compat import functools  # isort: split
 from ..compat import (
     compat_parse_qs,
     compat_urllib_parse_unquote,
@@ -147,10 +148,14 @@ def _search_dimensions_in_video_url(a_format, video_url):
     def is_logged_in(self):
         return bool(self._get_cookies(self._API_BASE).get('auth_token'))

+    @functools.cached_property
+    def _selected_api(self):
+        return self._configuration_arg('api', ['graphql'], ie_key='Twitter')[0]
+
     def _fetch_guest_token(self, display_id):
         guest_token = traverse_obj(self._download_json(
             f'{self._API_BASE}guest/activate.json', display_id, 'Downloading guest token', data=b'',
-            headers=self._set_base_headers(legacy=display_id and self._configuration_arg('legacy_api'))),
+            headers=self._set_base_headers(legacy=display_id and self._selected_api == 'legacy')),
             ('guest_token', {str}))
         if not guest_token:
             raise ExtractorError('Could not retrieve guest token')
@@ -295,7 +300,7 @@ def input_dict(subtask_id, text):
         self.report_login()

     def _call_api(self, path, video_id, query={}, graphql=False):
-        headers = self._set_base_headers(legacy=not graphql and self._configuration_arg('legacy_api'))
+        headers = self._set_base_headers(legacy=not graphql and self._selected_api == 'legacy')
         headers.update({
             'x-twitter-auth-type': 'OAuth2Session',
             'x-twitter-client-language': 'en',
@@ -707,6 +712,7 @@ class TwitterIE(TwitterBaseIE):
             'tags': [],
             'age_limit': 0,
         },
+        'skip': 'This Tweet is unavailable',
     }, {
         # not available in Periscope
         'url': 'https://twitter.com/ViviEducation/status/1136534865145286656',
@@ -721,6 +727,7 @@ class TwitterIE(TwitterBaseIE):
             'view_count': int,
         },
         'add_ie': ['TwitterBroadcast'],
+        'skip': 'Broadcast no longer exists',
     }, {
         # unified card
         'url': 'https://twitter.com/BrooklynNets/status/1349794411333394432?s=20',
@@ -773,9 +780,9 @@ class TwitterIE(TwitterBaseIE):
         'url': 'https://twitter.com/UltimaShadowX/status/1577719286659006464',
         'info_dict': {
             'id': '1577719286659006464',
-            'title': 'Ultima📛 | #вʟм - Test',
+            'title': 'Ultima📛| New Era - Test',
             'description': 'Test https://t.co/Y3KEZD7Dad',
-            'uploader': 'Ultima📛 | #вʟм',
+            'uploader': 'Ultima📛| New Era',
             'uploader_id': 'UltimaShadowX',
             'uploader_url': 'https://twitter.com/UltimaShadowX',
             'upload_date': '20221005',
@@ -811,7 +818,7 @@ class TwitterIE(TwitterBaseIE):
             'age_limit': 0,
         },
     }, {
-        # Adult content, fails if not logged in (GraphQL)
+        # Adult content, fails if not logged in
         'url': 'https://twitter.com/Rizdraws/status/1575199173472927762',
         'info_dict': {
             'id': '1575199163847000068',
@@ -831,9 +838,10 @@ class TwitterIE(TwitterBaseIE):
             'age_limit': 18,
             'tags': []
         },
+        'params': {'skip_download': 'The media could not be played'},
         'skip': 'Requires authentication',
     }, {
-        # Playlist result only with auth
+        # Playlist result only with graphql API
         'url': 'https://twitter.com/Srirachachau/status/1395079556562706435',
         'playlist_mincount': 2,
         'info_dict': {
@@ -898,7 +906,7 @@ class TwitterIE(TwitterBaseIE):
             'uploader_id': 'MoniqueCamarra',
             'live_status': 'was_live',
             'release_timestamp': 1658417414,
-            'description': 'md5:4dc8e972f1d8b3c6580376fabb02a3ad',
+            'description': 'md5:acce559345fd49f129c20dbcda3f1201',
             'timestamp': 1658407771,
             'release_date': '20220721',
             'upload_date': '20220721',
@@ -1007,10 +1015,10 @@ class TwitterIE(TwitterBaseIE):
             'view_count': int,
             'thumbnail': 'https://pbs.twimg.com/ext_tw_video_thumb/1600009362759733248/pu/img/XVhFQivj75H_YxxV.jpg?name=orig',
             'age_limit': 0,
-            'uploader': 'Mün The Friend Of YWAP',
+            'uploader': 'Mün',
             'repost_count': int,
             'upload_date': '20221206',
-            'title': 'Mün The Friend Of YWAP - This is a genius ad by Apple. \U0001f525\U0001f525\U0001f525\U0001f525\U0001f525',
+            'title': 'Mün - This is a genius ad by Apple. \U0001f525\U0001f525\U0001f525\U0001f525\U0001f525',
             'comment_count': int,
             'like_count': int,
             'tags': [],
@@ -1019,7 +1027,7 @@ class TwitterIE(TwitterBaseIE):
             'timestamp': 1670306984.0,
         },
     }, {
-        # url to retweet id w/ legacy api
+        # retweeted_status (private)
         'url': 'https://twitter.com/liberdalau/status/1623739803874349067',
         'info_dict': {
             'id': '1623274794488659969',
@@ -1039,32 +1047,84 @@ class TwitterIE(TwitterBaseIE):
             'like_count': int,
             'repost_count': int,
         },
-        'params': {'extractor_args': {'twitter': {'legacy_api': ['']}}},
         'skip': 'Protected tweet',
     }, {
-        # orig tweet w/ graphql
-        'url': 'https://twitter.com/liberdalau/status/1623739803874349067',
-        'info_dict': {
-            'id': '1623274794488659969',
-            'display_id': '1623739803874349067',
-            'ext': 'mp4',
-            'title': '@selfisekai@hackerspace.pl 🐀 - RT @Johnnybull3ts: Me after going viral to over 30million people: Whoopsie-daisy',
-            'description': 'md5:9258bdbb54793bdc124fe1cd47e96c6a',
-            'uploader': '@selfisekai@hackerspace.pl 🐀',
-            'uploader_id': 'liberdalau',
-            'uploader_url': 'https://twitter.com/liberdalau',
-            'age_limit': 0,
-            'tags': [],
-            'duration': 8.033,
-            'timestamp': 1675964711.0,
-            'upload_date': '20230209',
-            'thumbnail': r're:https://pbs\.twimg\.com/ext_tw_video_thumb/.+',
-            'like_count': int,
-            'view_count': int,
-            'repost_count': int,
-            'comment_count': int,
-        },
-        'skip': 'Protected tweet',
+        # retweeted_status
+        'url': 'https://twitter.com/playstrumpcard/status/1695424220702888009',
+        'info_dict': {
+            'id': '1694928337846538240',
+            'ext': 'mp4',
+            'display_id': '1695424220702888009',
+            'title': 'md5:e8daa9527bc2b947121395494f786d9d',
+            'description': 'md5:004f2d37fd58737724ec75bc7e679938',
+            'uploader': 'Benny Johnson',
+            'uploader_id': 'bennyjohnson',
+            'uploader_url': 'https://twitter.com/bennyjohnson',
+            'age_limit': 0,
+            'tags': [],
+            'duration': 45.001,
+            'timestamp': 1692962814.0,
+            'upload_date': '20230825',
+            'thumbnail': r're:https://pbs\.twimg\.com/amplify_video_thumb/.+',
+            'like_count': int,
+            'repost_count': int,
+            'view_count': int,
+            'comment_count': int,
+        },
+    }, {
+        # retweeted_status w/ legacy API
+        'url': 'https://twitter.com/playstrumpcard/status/1695424220702888009',
+        'info_dict': {
+            'id': '1694928337846538240',
+            'ext': 'mp4',
+            'display_id': '1695424220702888009',
+            'title': 'md5:e8daa9527bc2b947121395494f786d9d',
+            'description': 'md5:004f2d37fd58737724ec75bc7e679938',
+            'uploader': 'Benny Johnson',
+            'uploader_id': 'bennyjohnson',
+            'uploader_url': 'https://twitter.com/bennyjohnson',
+            'age_limit': 0,
+            'tags': [],
+            'duration': 45.001,
+            'timestamp': 1692962814.0,
+            'upload_date': '20230825',
+            'thumbnail': r're:https://pbs\.twimg\.com/amplify_video_thumb/.+',
+            'like_count': int,
+            'repost_count': int,
+        },
+        'params': {'extractor_args': {'twitter': {'api': ['legacy']}}},
+    }, {
+        # Broadcast embedded in tweet
+        'url': 'https://twitter.com/JessicaDobsonWX/status/1693057346933600402',
+        'info_dict': {
+            'id': '1yNGaNLjEblJj',
+            'ext': 'mp4',
+            'title': 'Jessica Dobson - WAVE Weather Now - Saturday 8/19/23 Update',
+            'uploader': 'Jessica Dobson',
+            'uploader_id': '1DZEoDwDovRQa',
+            'thumbnail': r're:^https?://.*\.jpg',
+            'view_count': int,
+        },
+        'add_ie': ['TwitterBroadcast'],
+    }, {
+        # Animated gif and quote tweet video, with syndication API
+        'url': 'https://twitter.com/BAKKOOONN/status/1696256659889565950',
+        'playlist_mincount': 2,
+        'info_dict': {
+            'id': '1696256659889565950',
+            'title': 'BAKOON - https://t.co/zom968d0a0',
+            'description': 'https://t.co/zom968d0a0',
+            'tags': [],
+            'uploader': 'BAKOON',
+            'uploader_id': 'BAKKOOONN',
+            'uploader_url': 'https://twitter.com/BAKKOOONN',
+            'age_limit': 18,
+            'timestamp': 1693254077.0,
+            'upload_date': '20230828',
+            'like_count': int,
+        },
+        'params': {'extractor_args': {'twitter': {'api': ['syndication']}}},
+        'expected_warnings': ['Not all metadata'],
     }, {
         # onion route
         'url': 'https://twitter3e4tixl4xyajtrzo62zg5vztmjuricljdp2c5kshju4avyoid.onion/TwitterBlue/status/1484226494708662273',
@@ -1103,6 +1163,14 @@ class TwitterIE(TwitterBaseIE):
         'only_matching': True,
     }]

+    _MEDIA_ID_RE = re.compile(r'_video/(\d+)/')
+
+    @property
+    def _GRAPHQL_ENDPOINT(self):
+        if self.is_logged_in:
+            return 'zZXycP0V6H7m-2r0mOnFcA/TweetDetail'
+        return '2ICDjqPd81tulZcYrtpTuQ/TweetResultByRestId'
+
     def _graphql_to_legacy(self, data, twid):
         result = traverse_obj(data, (
             'threaded_conversation_with_injections_v2', 'instructions', 0, 'entries',
@@ -1130,9 +1198,14 @@ def _graphql_to_legacy(self, data, twid):
             'user': ('core', 'user_results', 'result', 'legacy'),
             'card': ('card', 'legacy'),
             'quoted_status': ('quoted_status_result', 'result', 'legacy'),
+            'retweeted_status': ('legacy', 'retweeted_status_result', 'result', 'legacy'),
         }, expected_type=dict, default={}))

-        # extra transformation is needed since result does not match legacy format
+        # extra transformations needed since result does not match legacy format
+        if status.get('retweeted_status'):
+            status['retweeted_status']['user'] = traverse_obj(status, (
+                'retweeted_status_result', 'result', 'core', 'user_results', 'result', 'legacy', {dict})) or {}
+
         binding_values = {
             binding_value.get('key'): binding_value.get('value')
             for binding_value in traverse_obj(status, ('card', 'binding_values', ..., {dict}))
@@ -1208,33 +1281,42 @@ def _build_graphql_query(self, media_id):
         }

     def _extract_status(self, twid):
-        if self.is_logged_in:
-            return self._graphql_to_legacy(
-                self._call_graphql_api('zZXycP0V6H7m-2r0mOnFcA/TweetDetail', twid), twid)
-
-        try:
-            if not self._configuration_arg('legacy_api'):
-                return self._graphql_to_legacy(
-                    self._call_graphql_api('2ICDjqPd81tulZcYrtpTuQ/TweetResultByRestId', twid), twid)
-            return traverse_obj(self._call_api(f'statuses/show/{twid}.json', twid, {
-                'cards_platform': 'Web-12',
-                'include_cards': 1,
-                'include_reply_count': 1,
-                'include_user_entities': 0,
-                'tweet_mode': 'extended',
-            }), 'retweeted_status', None)
-        except ExtractorError as e:
-            if e.expected:
-                raise
-            self.report_warning(
-                f'{e.orig_msg}. Falling back to syndication endpoint; some metadata may be missing', twid)
-
-        status = self._download_json(
-            'https://cdn.syndication.twimg.com/tweet-result', twid, 'Downloading syndication JSON',
-            headers={'User-Agent': 'Googlebot'}, query={'id': twid})
-        status['extended_entities'] = {'media': status.get('mediaDetails')}
-        return status
+        if self.is_logged_in or self._selected_api == 'graphql':
+            status = self._graphql_to_legacy(self._call_graphql_api(self._GRAPHQL_ENDPOINT, twid), twid)
+
+        elif self._selected_api == 'legacy':
+            status = self._call_api(f'statuses/show/{twid}.json', twid, {
+                'cards_platform': 'Web-12',
+                'include_cards': 1,
+                'include_reply_count': 1,
+                'include_user_entities': 0,
+                'tweet_mode': 'extended',
+            })
+
+        elif self._selected_api == 'syndication':
+            self.report_warning(
+                'Not all metadata or media is available via syndication endpoint', twid, only_once=True)
+            status = self._download_json(
+                'https://cdn.syndication.twimg.com/tweet-result', twid, 'Downloading syndication JSON',
+                headers={'User-Agent': 'Googlebot'}, query={
+                    'id': twid,
+                    # TODO: token = ((Number(twid) / 1e15) * Math.PI).toString(36).replace(/(0+|\.)/g, '')
+                    'token': ''.join(random.choices('123456789abcdefghijklmnopqrstuvwxyz', k=10)),
+                })
+            if not status:
+                raise ExtractorError('Syndication endpoint returned empty JSON response')
+            # Transform the result so its structure matches that of legacy/graphql
+            media = []
+            for detail in traverse_obj(status, ((None, 'quoted_tweet'), 'mediaDetails', ..., {dict})):
+                detail['id_str'] = traverse_obj(detail, (
+                    'video_info', 'variants', ..., 'url', {self._MEDIA_ID_RE.search}, 1), get_all=False) or twid
+                media.append(detail)
+            status['extended_entities'] = {'media': media}
+
+        else:
+            raise ExtractorError(f'"{self._selected_api}" is not a valid API selection', expected=True)
+
+        return traverse_obj(status, 'retweeted_status', None, expected_type=dict) or {}

     def _real_extract(self, url):
         twid, selected_index = self._match_valid_url(url).group('id', 'index')
@@ -1266,10 +1348,7 @@ def _real_extract(self, url):
         }

         def extract_from_video_info(media):
-            media_id = traverse_obj(media, 'id_str', 'id', (
-                'video_info', 'variants', ..., 'url',
-                {functools.partial(re.search, r'_video/(\d+)/')}, 1
-            ), get_all=False, expected_type=str_or_none) or twid
+            media_id = traverse_obj(media, 'id_str', 'id', expected_type=str_or_none)
             self.write_debug(f'Extracting from video info: {media_id}')

             formats = []
@@ -1503,6 +1582,8 @@ def _real_extract(self, url):
         broadcast = self._call_api(
             'broadcasts/show.json', broadcast_id,
             {'ids': broadcast_id})['broadcasts'][broadcast_id]
+        if not broadcast:
+            raise ExtractorError('Broadcast no longer exists', expected=True)
         info = self._parse_broadcast_data(broadcast, broadcast_id)
         media_key = broadcast['media_key']
         source = self._call_api(
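
The TODO in the syndication branch records the token formula the web client computes. A rough Python transliteration follows, purely for illustration; JavaScript's Number.prototype.toString(36) emits a value-dependent number of fractional digits, which is presumably why the extractor settles for a random token instead:

    import math

    def syndication_token(twid: str) -> str:
        # ((Number(twid) / 1e15) * Math.PI).toString(36).replace(/(0+|\.)/g, '')
        value = int(twid) / 1e15 * math.pi
        digits = '0123456789abcdefghijklmnopqrstuvwxyz'
        integer = int(value)
        frac = value - integer
        out = ''
        while integer:
            integer, rem = divmod(integer, 36)
            out = digits[rem] + out
        out = (out or '0') + '.'
        for _ in range(11):  # approximates the JS fractional digit count
            frac *= 36
            digit = int(frac)
            frac -= digit
            out += digits[digit]
        return out.replace('.', '').replace('0', '')

    print(syndication_token('1696256659889565950'))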


@@ -38,6 +38,7 @@ class VideaIE(InfoExtractor):
             'title': 'Az őrült kígyász 285 kígyót enged szabadon',
             'thumbnail': r're:^https?://.*',
             'duration': 21,
+            'age_limit': 0,
         },
     }, {
         'url': 'http://videa.hu/videok/origo/jarmuvek/supercars-elozes-jAHDWfWSJH5XuFhH',
@@ -48,6 +49,7 @@ class VideaIE(InfoExtractor):
             'title': 'Supercars előzés',
             'thumbnail': r're:^https?://.*',
             'duration': 64,
+            'age_limit': 0,
         },
     }, {
         'url': 'http://videa.hu/player?v=8YfIAjxwWGwT8HVQ',
@@ -58,6 +60,7 @@ class VideaIE(InfoExtractor):
             'title': 'Az őrült kígyász 285 kígyót enged szabadon',
             'thumbnail': r're:^https?://.*',
             'duration': 21,
+            'age_limit': 0,
         },
     }, {
         'url': 'http://videa.hu/player/v/8YfIAjxwWGwT8HVQ?autoplay=1',
@@ -124,7 +127,7 @@ def _real_extract(self, url):
             query['_t'] = result[:16]

         b64_info, handle = self._download_webpage_handle(
-            'http://videa.hu/videaplayer_get_xml.php', video_id, query=query)
+            'http://videa.hu/player/xml', video_id, query=query)
         if b64_info.startswith('<?xml'):
             info = self._parse_xml(b64_info, video_id)
         else:


@@ -173,6 +173,7 @@ class WDRPageIE(WDRIE):  # XXX: Do not subclass from concrete IE
         'skip': 'HTTP Error 404: Not Found',
     },
     {
+        # FIXME: Asset JSON is directly embedded in webpage
         'url': 'http://www1.wdr.de/mediathek/video/live/index.html',
         'info_dict': {
             'id': 'mdb-2296252',
@@ -221,6 +222,8 @@ class WDRPageIE(WDRIE):  # XXX: Do not subclass from concrete IE
             'id': 'mdb-869971',
             'ext': 'mp4',
             'title': r're:^COSMO Livestream [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',
+            'alt_title': 'COSMO Livestream',
+            'live_status': 'is_live',
             'upload_date': '20160101',
         },
         'params': {
@@ -248,6 +251,16 @@ class WDRPageIE(WDRIE):  # XXX: Do not subclass from concrete IE
         'url': 'https://kinder.wdr.de/tv/die-sendung-mit-dem-elefanten/av/video-folge---astronaut-100.html',
         'only_matching': True,
     },
+    {
+        'url': 'https://www1.wdr.de/mediathek/video/sendungen/rockpalast/video-baroness---freak-valley-festival--100.html',
+        'info_dict': {
+            'id': 'mdb-2741028',
+            'ext': 'mp4',
+            'title': 'Baroness - Freak Valley Festival 2022',
+            'alt_title': 'Rockpalast',
+            'upload_date': '20220725',
+        },
+    }
 ]

 def _real_extract(self, url):
@@ -259,7 +272,7 @@ def _real_extract(self, url):

         # Article with several videos
-        # for wdr.de the data-extension is in a tag with the class "mediaLink"
+        # for wdr.de the data-extension-ard is in a tag with the class "mediaLink"
         # for wdr.de radio players, in a tag with the class "wdrrPlayerPlayBtn"
         # for wdrmaus, in a tag with the class "videoButton" (previously a link
         # to the page in a multiline "videoLink"-tag)
@@ -268,7 +281,7 @@ def _real_extract(self, url):
             (?:
                 (["\'])(?:mediaLink|wdrrPlayerPlayBtn|videoButton)\b.*?\1[^>]+|
                 (["\'])videoLink\b.*?\2[\s]*>\n[^\n]*
-            )data-extension=(["\'])(?P<data>(?:(?!\3).)+)\3
+            )data-extension(?:-ard)?=(["\'])(?P<data>(?:(?!\3).)+)\3
             ''', webpage):
         media_link_obj = self._parse_json(
             mobj.group('data'), display_id, transform_source=js_to_json,
@@ -295,7 +308,7 @@ def _real_extract(self, url):
                 compat_urlparse.urljoin(url, mobj.group('href')),
                 ie=WDRPageIE.ie_key())
             for mobj in re.finditer(
-                r'<a[^>]+\bhref=(["\'])(?P<href>(?:(?!\1).)+)\1[^>]+\bdata-extension=',
+                r'<a[^>]+\bhref=(["\'])(?P<href>(?:(?!\1).)+)\1[^>]+\bdata-extension(?:-ard)?=',
                 webpage) if re.match(self._PAGE_REGEX, mobj.group('href'))
         ]
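
The `(?:-ard)?` additions accept both attribute spellings that WDR pages now use; a two-line check:

    import re

    for snippet in ('<a href="/video.html" data-extension="{}">', '<a href="/video.html" data-extension-ard="{}">'):
        print(bool(re.search(r'\bdata-extension(?:-ard)?=', snippet)))  # True / True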


@@ -1,134 +1,241 @@
 import json
 import random
-import re
+import itertools
+import urllib.parse

 from .common import InfoExtractor
-from ..compat import (
-    compat_parse_qs,
-    compat_str,
-)
 from ..utils import (
-    js_to_json,
+    int_or_none,
+    make_archive_id,
+    mimetype2ext,
+    parse_resolution,
+    str_or_none,
     strip_jsonp,
+    traverse_obj,
+    url_or_none,
     urlencode_postdata,
+    urljoin,
 )


-class WeiboIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:www\.)?weibo\.com/[0-9]+/(?P<id>[a-zA-Z0-9]+)'
-    _TEST = {
-        'url': 'https://weibo.com/6275294458/Fp6RGfbff?type=comment',
-        'info_dict': {
-            'id': 'Fp6RGfbff',
-            'ext': 'mp4',
-            'title': 'You should have servants to massage you,... 来自Hosico_猫 - 微博',
-        }
-    }
-
-    def _real_extract(self, url):
-        video_id = self._match_id(url)
-        # to get Referer url for genvisitor
-        webpage, urlh = self._download_webpage_handle(url, video_id)
-
-        visitor_url = urlh.url
-
-        if 'passport.weibo.com' in visitor_url:
-            # first visit
-            visitor_data = self._download_json(
-                'https://passport.weibo.com/visitor/genvisitor', video_id,
-                note='Generating first-visit data',
-                transform_source=strip_jsonp,
-                headers={'Referer': visitor_url},
-                data=urlencode_postdata({
-                    'cb': 'gen_callback',
-                    'fp': json.dumps({
-                        'os': '2',
-                        'browser': 'Gecko57,0,0,0',
-                        'fonts': 'undefined',
-                        'screenInfo': '1440*900*24',
-                        'plugins': '',
-                    }),
-                }))
-
-            tid = visitor_data['data']['tid']
-            cnfd = '%03d' % visitor_data['data']['confidence']
-
-            self._download_webpage(
-                'https://passport.weibo.com/visitor/visitor', video_id,
-                note='Running first-visit callback',
-                query={
-                    'a': 'incarnate',
-                    't': tid,
-                    'w': 2,
-                    'c': cnfd,
-                    'cb': 'cross_domain',
-                    'from': 'weibo',
-                    '_rand': random.random(),
-                })
-
-            webpage = self._download_webpage(
-                url, video_id, note='Revisiting webpage')
-
-        title = self._html_extract_title(webpage)
-
-        video_formats = compat_parse_qs(self._search_regex(
-            r'video-sources=\\\"(.+?)\"', webpage, 'video_sources'))
-
-        formats = []
-        supported_resolutions = (480, 720)
-        for res in supported_resolutions:
-            vid_urls = video_formats.get(compat_str(res))
-            if not vid_urls or not isinstance(vid_urls, list):
-                continue
-
-            vid_url = vid_urls[0]
-            formats.append({
-                'url': vid_url,
-                'height': res,
-            })
-
-        uploader = self._og_search_property(
-            'nick-name', webpage, 'uploader', default=None)
-
-        return {
-            'id': video_id,
-            'title': title,
-            'uploader': uploader,
-            'formats': formats
-        }
-
-
-class WeiboMobileIE(InfoExtractor):
-    _VALID_URL = r'https?://m\.weibo\.cn/status/(?P<id>[0-9]+)(\?.+)?'
-    _TEST = {
-        'url': 'https://m.weibo.cn/status/4189191225395228?wm=3333_2001&sourcetype=weixin&featurecode=newtitle&from=singlemessage&isappinstalled=0',
-        'info_dict': {
-            'id': '4189191225395228',
-            'ext': 'mp4',
-            'title': '午睡当然是要甜甜蜜蜜的啦',
-            'uploader': '柴犬柴犬'
-        }
-    }
-
-    def _real_extract(self, url):
-        video_id = self._match_id(url)
-        # to get Referer url for genvisitor
-        webpage = self._download_webpage(url, video_id, note='visit the page')
-
-        weibo_info = self._parse_json(self._search_regex(
-            r'var\s+\$render_data\s*=\s*\[({.*})\]\[0\]\s*\|\|\s*{};',
-            webpage, 'js_code', flags=re.DOTALL),
-            video_id, transform_source=js_to_json)
-
-        status_data = weibo_info.get('status', {})
-        page_info = status_data.get('page_info')
-        title = status_data['status_title']
-        uploader = status_data.get('user', {}).get('screen_name')
-
-        return {
-            'id': video_id,
-            'title': title,
-            'uploader': uploader,
-            'url': page_info['media_info']['stream_url']
-        }
+class WeiboBaseIE(InfoExtractor):
+    def _update_visitor_cookies(self, video_id):
+        visitor_data = self._download_json(
+            'https://passport.weibo.com/visitor/genvisitor', video_id,
+            note='Generating first-visit guest request',
+            transform_source=strip_jsonp,
+            data=urlencode_postdata({
+                'cb': 'gen_callback',
+                'fp': '{"os":"2","browser":"Gecko57,0,0,0","fonts":"undefined","screenInfo":"1440*900*24","plugins":""}',
+            }))
+
+        self._download_webpage(
+            'https://passport.weibo.com/visitor/visitor', video_id,
+            note='Running first-visit callback to get guest cookies',
+            query={
+                'a': 'incarnate',
+                't': visitor_data['data']['tid'],
+                'w': 2,
+                'c': '%03d' % visitor_data['data']['confidence'],
+                'cb': 'cross_domain',
+                'from': 'weibo',
+                '_rand': random.random(),
+            })
+
+    def _weibo_download_json(self, url, video_id, *args, fatal=True, note='Downloading JSON metadata', **kwargs):
+        webpage, urlh = self._download_webpage_handle(url, video_id, *args, fatal=fatal, note=note, **kwargs)
+        if urllib.parse.urlparse(urlh.url).netloc == 'passport.weibo.com':
+            self._update_visitor_cookies(video_id)
+            webpage = self._download_webpage(url, video_id, *args, fatal=fatal, note=note, **kwargs)
+        return self._parse_json(webpage, video_id, fatal=fatal)
+
+    def _extract_formats(self, video_info):
+        media_info = traverse_obj(video_info, ('page_info', 'media_info'))
+        formats = traverse_obj(media_info, (
+            'playback_list', lambda _, v: url_or_none(v['play_info']['url']), 'play_info', {
+                'url': 'url',
+                'format': ('quality_desc', {str}),
+                'format_id': ('label', {str}),
+                'ext': ('mime', {mimetype2ext}),
+                'tbr': ('bitrate', {int_or_none}, {lambda x: x or None}),
+                'vcodec': ('video_codecs', {str}),
+                'fps': ('fps', {int_or_none}),
+                'width': ('width', {int_or_none}),
+                'height': ('height', {int_or_none}),
+                'filesize': ('size', {int_or_none}),
+                'acodec': ('audio_codecs', {str}),
+                'asr': ('audio_sample_rate', {int_or_none}),
+                'audio_channels': ('audio_channels', {int_or_none}),
+            }))
+        if not formats:  # fallback, should be barely used
+            for url in set(traverse_obj(media_info, (..., {url_or_none}))):
+                if 'label=' in url:  # filter out non-video urls
+                    format_id, resolution = self._search_regex(
+                        r'label=(\w+)&template=(\d+x\d+)', url, 'format info',
+                        group=(1, 2), default=(None, None))
+                    formats.append({
+                        'url': url,
+                        'format_id': format_id,
+                        **parse_resolution(resolution),
+                        **traverse_obj(media_info, (
+                            'video_details', lambda _, v: v['label'].startswith(format_id), {
+                                'size': ('size', {int_or_none}),
+                                'tbr': ('bitrate', {int_or_none}),
+                            }
+                        ), get_all=False),
+                    })
+        return formats
+
+    def _parse_video_info(self, video_info, video_id=None):
+        return {
+            'id': video_id,
+            'extractor_key': WeiboIE.ie_key(),
+            'extractor': WeiboIE.IE_NAME,
+            'formats': self._extract_formats(video_info),
+            'http_headers': {'Referer': 'https://weibo.com/'},
+            '_old_archive_ids': [make_archive_id('WeiboMobile', video_id)],
+            **traverse_obj(video_info, {
+                'id': (('id', 'id_str', 'mid'), {str_or_none}),
+                'display_id': ('mblogid', {str_or_none}),
+                'title': ('page_info', 'media_info', ('video_title', 'kol_title', 'name'), {str}, {lambda x: x or None}),
+                'description': ('text_raw', {str}),
+                'duration': ('page_info', 'media_info', 'duration', {int_or_none}),
+                'timestamp': ('page_info', 'media_info', 'video_publish_time', {int_or_none}),
+                'thumbnail': ('page_info', 'page_pic', {url_or_none}),
+                'uploader': ('user', 'screen_name', {str}),
+                'uploader_id': ('user', ('id', 'id_str'), {str_or_none}),
+                'uploader_url': ('user', 'profile_url', {lambda x: urljoin('https://weibo.com/', x)}),
+                'view_count': ('page_info', 'media_info', 'online_users_number', {int_or_none}),
+                'like_count': ('attitudes_count', {int_or_none}),
+                'repost_count': ('reposts_count', {int_or_none}),
+            }, get_all=False),
+            'tags': traverse_obj(video_info, ('topic_struct', ..., 'topic_title', {str})) or None,
+        }
+
+
+class WeiboIE(WeiboBaseIE):
+    _VALID_URL = r'https?://(?:m\.weibo\.cn/status|(?:www\.)?weibo\.com/\d+)/(?P<id>[a-zA-Z0-9]+)'
+    _TESTS = [{
+        'url': 'https://weibo.com/7827771738/N4xlMvjhI',
+        'info_dict': {
+            'id': '4910815147462302',
+            'ext': 'mp4',
+            'display_id': 'N4xlMvjhI',
+            'title': '【睡前消息暑假版第一期:拉泰国一把 对中国有好处】',
+            'description': 'md5:e2637a7673980d68694ea7c43cf12a5f',
+            'duration': 918,
+            'timestamp': 1686312819,
+            'upload_date': '20230609',
+            'thumbnail': r're:https://.*\.jpg',
+            'uploader': '睡前视频基地',
+            'uploader_id': '7827771738',
+            'uploader_url': 'https://weibo.com/u/7827771738',
+            'view_count': int,
+            'like_count': int,
+            'repost_count': int,
+            'tags': ['泰国大选远进党获胜', '睡前消息', '暑期版'],
+        },
+    }, {
+        'url': 'https://m.weibo.cn/status/4189191225395228',
+        'info_dict': {
+            'id': '4189191225395228',
+            'ext': 'mp4',
+            'display_id': 'FBqgOmDxO',
+            'title': '柴犬柴犬的秒拍视频',
+            'description': 'md5:80f461ab5cdae6bbdb70efbf5a1db24f',
+            'duration': 53,
+            'timestamp': 1514264429,
+            'upload_date': '20171226',
+            'thumbnail': r're:https://.*\.jpg',
+            'uploader': '柴犬柴犬',
+            'uploader_id': '5926682210',
+            'uploader_url': 'https://weibo.com/u/5926682210',
+            'view_count': int,
+            'like_count': int,
+            'repost_count': int,
+        }
+    }, {
+        'url': 'https://weibo.com/0/4224132150961381',
+        'note': 'no playback_list example',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        return self._parse_video_info(self._weibo_download_json(
+            f'https://weibo.com/ajax/statuses/show?id={video_id}', video_id))
+
+
+class WeiboVideoIE(WeiboBaseIE):
+    _VALID_URL = r'https?://(?:www\.)?weibo\.com/tv/show/(?P<id>\d+:\d+)'
+    _TESTS = [{
+        'url': 'https://weibo.com/tv/show/1034:4797699866951785?from=old_pc_videoshow',
+        'info_dict': {
+            'id': '4797700463137878',
+            'ext': 'mp4',
+            'display_id': 'LEZDodaiW',
+            'title': '稍微了解了一下靡烟miya感觉这东西也太二了',
+            'description': '稍微了解了一下靡烟miya感觉这东西也太二了 http://t.cn/A6aerGsM ',
+            'duration': 76,
+            'timestamp': 1659344278,
+            'upload_date': '20220801',
+            'thumbnail': r're:https://.*\.jpg',
+            'uploader': '君子爱财陈平安',
+            'uploader_id': '3905382233',
+            'uploader_url': 'https://weibo.com/u/3905382233',
+            'view_count': int,
+            'like_count': int,
+            'repost_count': int,
+        }
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        post_data = f'data={{"Component_Play_Playinfo":{{"oid":"{video_id}"}}}}'.encode()
+        video_info = self._weibo_download_json(
+            f'https://weibo.com/tv/api/component?page=%2Ftv%2Fshow%2F{video_id.replace(":", "%3A")}',
+            video_id, headers={'Referer': url}, data=post_data)['data']['Component_Play_Playinfo']
+        return self.url_result(f'https://weibo.com/0/{video_info["mid"]}', WeiboIE)
+
+
+class WeiboUserIE(WeiboBaseIE):
+    _VALID_URL = r'https?://(?:www\.)?weibo\.com/u/(?P<id>\d+)'
+    _TESTS = [{
+        'url': 'https://weibo.com/u/2066652961?tabtype=video',
+        'info_dict': {
+            'id': '2066652961',
+            'title': '萧影殿下的视频',
+            'description': '萧影殿下的全部视频',
+            'uploader': '萧影殿下',
+        },
+        'playlist_mincount': 195,
}]
def _fetch_page(self, uid, cursor=0, page=1):
return self._weibo_download_json(
'https://weibo.com/ajax/profile/getWaterFallContent',
uid, note=f'Downloading videos page {page}',
query={'uid': uid, 'cursor': cursor})['data']
def _entries(self, uid, first_page):
cursor = 0
for page in itertools.count(1):
response = first_page if page == 1 else self._fetch_page(uid, cursor, page)
for video_info in traverse_obj(response, ('list', ..., {dict})):
yield self._parse_video_info(video_info)
cursor = response.get('next_cursor')
if (int_or_none(cursor) or -1) < 0:
break
def _real_extract(self, url):
uid = self._match_id(url)
first_page = self._fetch_page(uid)
uploader = traverse_obj(first_page, ('list', ..., 'user', 'screen_name', {str}), get_all=False)
metainfo = {
'title': f'{uploader}的视频',
'description': f'{uploader}的全部视频',
'uploader': uploader,
} if uploader else {}
return self.playlist_result(self._entries(uid, first_page), uid, **metainfo)
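
A note on the traverse_obj pattern that drives _extract_formats and _parse_video_info above: a function inside the path filters branched values, and a trailing dict acts as a per-value template whose None results are dropped. A minimal sketch on a made-up payload (all field values below are illustrative, not real Weibo API data):

    from yt_dlp.utils import int_or_none, traverse_obj

    media_info = {'playback_list': [
        {'play_info': {'url': 'https://example.com/v.mp4', 'label': 'mp4_1080p', 'bitrate': 2437}},
        {'play_info': {'url': None, 'label': 'entry without url, dropped by the filter'}},
    ]}
    formats = traverse_obj(media_info, (
        'playback_list', lambda _, v: v['play_info']['url'], 'play_info', {
            'url': 'url',
            'format_id': ('label', {str}),
            'tbr': ('bitrate', {int_or_none}),
        }))
    assert formats == [{'url': 'https://example.com/v.mp4', 'format_id': 'mp4_1080p', 'tbr': 2437}]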

View file

@@ -9,6 +9,7 @@
     traverse_obj,
     try_call,
     unescapeHTML,
+    url_basename,
     url_or_none,
 )
@@ -45,12 +46,14 @@ class ZaikoIE(ZaikoBaseIE):
             'uploader_id': '454',
             'uploader': 'ZAIKO ZERO',
             'release_timestamp': 1583809200,
-            'thumbnail': r're:https://[a-z0-9]+.cloudfront.net/[a-z0-9_]+/[a-z0-9_]+',
+            'thumbnail': r're:^https://[\w.-]+/\w+/\w+',
+            'thumbnails': 'maxcount:2',
             'release_date': '20200310',
             'categories': ['Tech House'],
             'live_status': 'was_live',
         },
         'params': {'skip_download': 'm3u8'},
+        'skip': 'Your account does not have tickets to this event',
     }]
 
     def _real_extract(self, url):
@@ -83,6 +86,12 @@ def _real_extract(self, url):
         if not formats:
             self.raise_no_formats(msg, expected=expected)
 
+        thumbnail_urls = [
+            traverse_obj(player_meta, ('initial_event_info', 'poster_url')),
+            self._og_search_thumbnail(self._download_webpage(
+                f'https://zaiko.io/event/{video_id}', video_id, 'Downloading event page', fatal=False) or ''),
+        ]
+
         return {
             'id': video_id,
             'formats': formats,
@@ -96,8 +105,8 @@ def _real_extract(self, url):
             }),
             **traverse_obj(player_meta, ('initial_event_info', {
                 'alt_title': ('title', {str}),
-                'thumbnail': ('poster_url', {url_or_none}),
             })),
+            'thumbnails': [{'url': url, 'id': url_basename(url)} for url in thumbnail_urls if url_or_none(url)]
         }
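
For illustration, the thumbnails list assembled above keeps only URL-like entries and derives each thumbnail id from the URL basename; a toy run (the URL is made up):

    from yt_dlp.utils import url_basename, url_or_none

    thumbnail_urls = ['https://media.zaiko.io/posters/abc123', None]
    thumbnails = [{'url': url, 'id': url_basename(url)}
                  for url in thumbnail_urls if url_or_none(url)]
    assert thumbnails == [{'url': 'https://media.zaiko.io/posters/abc123', 'id': 'abc123'}]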

View file

@@ -127,6 +127,7 @@ def _real_extract(self, url):
         return {
             'id': video_id,
             'title': str_or_none(traverse_obj(data, ('meet', 'topic'))),
+            'duration': int_or_none(data.get('duration')),
             'subtitles': subtitles,
             'formats': formats,
             'http_headers': {

View file

@@ -2,6 +2,7 @@
 import contextlib
 import functools
+import socket
 import ssl
 import sys
 import typing
@@ -206,3 +207,59 @@ def wrapper(self, *args, **kwargs):
             e.handler = self
             raise
     return wrapper
+
+
+def _socket_connect(ip_addr, timeout, source_address):
+    af, socktype, proto, canonname, sa = ip_addr
+    sock = socket.socket(af, socktype, proto)
+    try:
+        if timeout is not socket._GLOBAL_DEFAULT_TIMEOUT:
+            sock.settimeout(timeout)
+        if source_address:
+            sock.bind(source_address)
+        sock.connect(sa)
+        return sock
+    except socket.error:
+        sock.close()
+        raise
+
+
+def create_connection(
+    address,
+    timeout=socket._GLOBAL_DEFAULT_TIMEOUT,
+    source_address=None,
+    *,
+    _create_socket_func=_socket_connect
+):
+    # Work around socket.create_connection() which tries all addresses from getaddrinfo() including IPv6.
+    # This filters the addresses based on the given source_address.
+    # Based on: https://github.com/python/cpython/blob/main/Lib/socket.py#L810
+    host, port = address
+    ip_addrs = socket.getaddrinfo(host, port, 0, socket.SOCK_STREAM)
+    if not ip_addrs:
+        raise socket.error('getaddrinfo returns an empty list')
+
+    if source_address is not None:
+        af = socket.AF_INET if ':' not in source_address[0] else socket.AF_INET6
+        ip_addrs = [addr for addr in ip_addrs if addr[0] == af]
+        if not ip_addrs:
+            raise OSError(
+                f'No remote IPv{4 if af == socket.AF_INET else 6} addresses available for connect. '
+                f'Can\'t use "{source_address[0]}" as source address')
+
+    err = None
+    for ip_addr in ip_addrs:
+        try:
+            sock = _create_socket_func(ip_addr, timeout, source_address)
+            # Explicitly break __traceback__ reference cycle
+            # https://bugs.python.org/issue36820
+            err = None
+            return sock
+        except socket.error as e:
+            err = e
+
+    try:
+        raise err
+    finally:
+        # Explicitly break __traceback__ reference cycle
+        # https://bugs.python.org/issue36820
+        err = None
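
A usage sketch of the new helper, assuming a dual-stack host: binding to an IPv4 source address (the addresses below are illustrative) filters the getaddrinfo() candidates so no doomed IPv6 connect() is ever attempted:

    # 192.0.2.x is a documentation-range address, used purely for illustration
    sock = create_connection(('example.com', 443), timeout=10,
                             source_address=('192.0.2.10', 0))
    sock.close()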

View file

@@ -23,6 +23,7 @@
 from ._helper import (
     InstanceStoreMixin,
     add_accept_encoding_header,
+    create_connection,
     get_redirect_method,
     make_socks_proxy_opts,
     select_proxy,
@@ -54,44 +55,10 @@
 def _create_http_connection(http_class, source_address, *args, **kwargs):
     hc = http_class(*args, **kwargs)
 
+    if hasattr(hc, '_create_connection'):
+        hc._create_connection = create_connection
+
     if source_address is not None:
-        # This is to workaround _create_connection() from socket where it will try all
-        # address data from getaddrinfo() including IPv6. This filters the result from
-        # getaddrinfo() based on the source_address value.
-        # This is based on the cpython socket.create_connection() function.
-        # https://github.com/python/cpython/blob/master/Lib/socket.py#L691
-        def _create_connection(address, timeout=socket._GLOBAL_DEFAULT_TIMEOUT, source_address=None):
-            host, port = address
-            err = None
-            addrs = socket.getaddrinfo(host, port, 0, socket.SOCK_STREAM)
-            af = socket.AF_INET if '.' in source_address[0] else socket.AF_INET6
-            ip_addrs = [addr for addr in addrs if addr[0] == af]
-            if addrs and not ip_addrs:
-                ip_version = 'v4' if af == socket.AF_INET else 'v6'
-                raise OSError(
-                    "No remote IP%s addresses available for connect, can't use '%s' as source address"
-                    % (ip_version, source_address[0]))
-            for res in ip_addrs:
-                af, socktype, proto, canonname, sa = res
-                sock = None
-                try:
-                    sock = socket.socket(af, socktype, proto)
-                    if timeout is not socket._GLOBAL_DEFAULT_TIMEOUT:
-                        sock.settimeout(timeout)
-                    sock.bind(source_address)
-                    sock.connect(sa)
-                    err = None  # Explicitly break reference cycle
-                    return sock
-                except OSError as _:
-                    err = _
-                    if sock is not None:
-                        sock.close()
-            if err is not None:
-                raise err
-            else:
-                raise OSError('getaddrinfo returns an empty list')
-
-        if hasattr(hc, '_create_connection'):
-            hc._create_connection = _create_connection
         hc.source_address = (source_address, 0)
 
     return hc
@@ -220,13 +187,28 @@ def make_socks_conn_class(base_class, socks_proxy):
     proxy_args = make_socks_proxy_opts(socks_proxy)
 
     class SocksConnection(base_class):
-        def connect(self):
-            self.sock = sockssocket()
-            self.sock.setproxy(**proxy_args)
-            if type(self.timeout) in (int, float):  # noqa: E721
-                self.sock.settimeout(self.timeout)
-            self.sock.connect((self.host, self.port))
+        _create_connection = create_connection
+
+        def connect(self):
+            def sock_socket_connect(ip_addr, timeout, source_address):
+                af, socktype, proto, canonname, sa = ip_addr
+                sock = sockssocket(af, socktype, proto)
+                try:
+                    connect_proxy_args = proxy_args.copy()
+                    connect_proxy_args.update({'addr': sa[0], 'port': sa[1]})
+                    sock.setproxy(**connect_proxy_args)
+                    if timeout is not socket._GLOBAL_DEFAULT_TIMEOUT:  # noqa: E721
+                        sock.settimeout(timeout)
+                    if source_address:
+                        sock.bind(source_address)
+                    sock.connect((self.host, self.port))
+                    return sock
+                except socket.error:
+                    sock.close()
+                    raise
+
+            self.sock = create_connection(
+                (proxy_args['addr'], proxy_args['port']), timeout=self.timeout,
+                source_address=self.source_address, _create_socket_func=sock_socket_connect)
 
             if isinstance(self, http.client.HTTPSConnection):
                 self.sock = self._context.wrap_socket(self.sock, server_hostname=self.host)
@@ -429,7 +411,7 @@ def _send(self, request):
         except urllib.error.HTTPError as e:
             if isinstance(e.fp, (http.client.HTTPResponse, urllib.response.addinfourl)):
                 # Prevent file object from being closed when urllib.error.HTTPError is destroyed.
-                e._closer.file = None
+                e._closer.close_called = True
                 raise HTTPError(UrllibResponseAdapter(e.fp), redirect_loop='redirect error' in str(e)) from e
             raise  # unexpected
         except urllib.error.URLError as e:
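
For context, a hedged sketch of how the rewritten SocksConnection is meant to be composed, from within this module's namespace (the proxy URL and target host are placeholders):

    import http.client

    # Placeholder SOCKS5 proxy URL; make_socks_proxy_opts() parses it
    SocksHTTPSConnection = make_socks_conn_class(
        http.client.HTTPSConnection, 'socks5://127.0.0.1:1080')
    conn = SocksHTTPSConnection('example.com', 443, timeout=10)
    conn.connect()  # connects to the proxy via create_connection(), then TLS-wraps the tunnel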

View file

@@ -115,7 +115,7 @@ def __init__(self, http_error: HTTPError):
             hdrs=http_error.response.headers,
             fp=http_error.response
         )
-        self._closer.file = None  # Disable auto close
+        self._closer.close_called = True  # Disable auto close
         self._http_error = http_error
 
         HTTPError.__init__(self, http_error.response, redirect_loop=http_error.redirect_loop)

View file

@@ -134,26 +134,31 @@ def _check_response_version(self, expected_version, got_version):
             self.close()
             raise InvalidVersionError(expected_version, got_version)
 
-    def _resolve_address(self, destaddr, default, use_remote_dns):
-        try:
-            return socket.inet_aton(destaddr)
-        except OSError:
-            if use_remote_dns and self._proxy.remote_dns:
-                return default
-            else:
-                return socket.inet_aton(socket.gethostbyname(destaddr))
+    def _resolve_address(self, destaddr, default, use_remote_dns, family=None):
+        for f in (family,) if family else (socket.AF_INET, socket.AF_INET6):
+            try:
+                return f, socket.inet_pton(f, destaddr)
+            except OSError:
+                continue
+
+        if use_remote_dns and self._proxy.remote_dns:
+            return 0, default
+        else:
+            res = socket.getaddrinfo(destaddr, None, family=family or 0)
+            f, _, _, _, ipaddr = res[0]
+            return f, socket.inet_pton(f, ipaddr[0])
 
     def _setup_socks4(self, address, is_4a=False):
         destaddr, port = address
 
-        ipaddr = self._resolve_address(destaddr, SOCKS4_DEFAULT_DSTIP, use_remote_dns=is_4a)
+        _, ipaddr = self._resolve_address(destaddr, SOCKS4_DEFAULT_DSTIP, use_remote_dns=is_4a, family=socket.AF_INET)
 
         packet = struct.pack('!BBH', SOCKS4_VERSION, Socks4Command.CMD_CONNECT, port) + ipaddr
 
         username = (self._proxy.username or '').encode()
         packet += username + b'\x00'
 
-        if is_4a and self._proxy.remote_dns:
+        if is_4a and self._proxy.remote_dns and ipaddr == SOCKS4_DEFAULT_DSTIP:
             packet += destaddr.encode() + b'\x00'
 
         self.sendall(packet)
@@ -210,7 +215,7 @@ def _socks5_auth(self):
     def _setup_socks5(self, address):
         destaddr, port = address
 
-        ipaddr = self._resolve_address(destaddr, None, use_remote_dns=True)
+        family, ipaddr = self._resolve_address(destaddr, None, use_remote_dns=True)
 
         self._socks5_auth()
@@ -220,8 +225,10 @@ def _setup_socks5(self, address):
             destaddr = destaddr.encode()
             packet += struct.pack('!B', Socks5AddressType.ATYP_DOMAINNAME)
             packet += self._len_and_data(destaddr)
-        else:
+        elif family == socket.AF_INET:
             packet += struct.pack('!B', Socks5AddressType.ATYP_IPV4) + ipaddr
+        elif family == socket.AF_INET6:
+            packet += struct.pack('!B', Socks5AddressType.ATYP_IPV6) + ipaddr
         packet += struct.pack('!H', port)
 
         self.sendall(packet)
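
The rewritten _resolve_address probes both address families with socket.inet_pton(), which is what lets SOCKS5 requests carry ATYP_IPV6 payloads. The probe is easy to verify standalone (the addresses are illustrative):

    import socket

    for dest in ('127.0.0.1', '2001:db8::1'):
        for family in (socket.AF_INET, socket.AF_INET6):
            try:
                packed = socket.inet_pton(family, dest)
            except OSError:
                continue
            print(dest, family, len(packed))  # 4 packed bytes for IPv4, 16 for IPv6
            break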

View file

@@ -669,6 +669,7 @@ def replace_insane(char):
 
 def sanitize_path(s, force=False):
     """Sanitizes and normalizes path on Windows"""
+    # XXX: this handles drive relative paths (c:sth) incorrectly
     if sys.platform == 'win32':
         force = False
         drive_or_unc, _ = os.path.splitdrive(s)
@@ -687,7 +688,10 @@ def sanitize_path(s, force=False):
         sanitized_path.insert(0, drive_or_unc + os.path.sep)
     elif force and s and s[0] == os.path.sep:
         sanitized_path.insert(0, os.path.sep)
-    return os.path.join(*sanitized_path)
+    # TODO: Fix behavioral differences <3.12
+    # The workaround using `normpath` only superficially passes tests
+    # Ref: https://github.com/python/cpython/pull/100351
+    return os.path.normpath(os.path.join(*sanitized_path))
 
 
 def sanitize_url(url, *, scheme='http'):
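
The added normpath() matters because joining the sanitized segments can leave '.' and '..' components in place; a quick illustration on POSIX:

    import os

    parts = ['foo', '..', 'bar', '.', 'baz#']
    print(os.path.join(*parts))                    # foo/../bar/./baz#
    print(os.path.normpath(os.path.join(*parts)))  # bar/baz#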
@@ -1256,7 +1260,7 @@ def datetime_from_str(date_str, precision='auto', format='%Y%m%d'):
     if precision == 'auto':
         auto_precision = True
         precision = 'microsecond'
-    today = datetime_round(datetime.datetime.utcnow(), precision)
+    today = datetime_round(datetime.datetime.now(datetime.timezone.utc), precision)
     if date_str in ('now', 'today'):
         return today
     if date_str == 'yesterday':
@@ -1319,8 +1323,8 @@ def datetime_round(dt, precision='day'):
         'second': 1,
     }
     roundto = lambda x, n: ((x + n / 2) // n) * n
-    timestamp = calendar.timegm(dt.timetuple())
-    return datetime.datetime.utcfromtimestamp(roundto(timestamp, unit_seconds[precision]))
+    timestamp = roundto(calendar.timegm(dt.timetuple()), unit_seconds[precision])
+    return datetime.datetime.fromtimestamp(timestamp, datetime.timezone.utc)
 
 
 def hyphenate_date(date_str):
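
The datetime_round() change folds the rounding into the timestamp and returns an aware UTC datetime instead of a naive one; the arithmetic itself is unchanged. A worked example with precision='hour' (the timestamp is arbitrary):

    import calendar
    import datetime

    dt = datetime.datetime(2023, 9, 21, 13, 40, tzinfo=datetime.timezone.utc)
    unit = 3600  # unit_seconds['hour']
    timestamp = ((calendar.timegm(dt.timetuple()) + unit / 2) // unit) * unit
    print(datetime.datetime.fromtimestamp(timestamp, datetime.timezone.utc))
    # 2023-09-21 14:00:00+00:00 -- 13:40 rounds up to the nearest hour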
@@ -2847,6 +2851,7 @@ def mimetype2ext(mt, default=NO_DEFAULT):
         'quicktime': 'mov',
         'webm': 'webm',
         'vp9': 'vp9',
+        'video/ogg': 'ogv',
         'x-flv': 'flv',
         'x-m4v': 'm4v',
         'x-matroska': 'mkv',