Merge branch 'master' into yt-live-from-start-range

This commit is contained in:
bashonly 2024-02-29 04:58:30 -06:00 committed by GitHub
commit 6fc6349ef0
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
109 changed files with 5788 additions and 1875 deletions

View file

@ -107,10 +107,10 @@ jobs:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- uses: actions/setup-python@v4
- uses: actions/setup-python@v5
with:
python-version: "3.10"
- uses: conda-incubator/setup-miniconda@v2
- uses: conda-incubator/setup-miniconda@v3
with:
miniforge-variant: Mambaforge
use-mamba: true
@ -121,16 +121,14 @@ jobs:
- name: Install Requirements
run: |
sudo apt -y install zip pandoc man sed
reqs=$(mktemp)
cat > "$reqs" << EOF
cat > ./requirements.txt << EOF
python=3.10.*
pyinstaller
cffi
brotli-python
secretstorage
EOF
sed -E '/^(brotli|secretstorage).*/d' requirements.txt >> "$reqs"
mamba create -n build --file "$reqs"
python devscripts/install_deps.py --print \
--exclude brotli --exclude brotlicffi \
--include secretstorage --include pyinstaller >> ./requirements.txt
mamba create -n build --file ./requirements.txt
- name: Prepare
run: |
@ -144,9 +142,9 @@ jobs:
run: |
unset LD_LIBRARY_PATH # Harmful; set by setup-python
conda activate build
python pyinst.py --onedir
python -m bundle.pyinstaller --onedir
(cd ./dist/yt-dlp_linux && zip -r ../yt-dlp_linux.zip .)
python pyinst.py
python -m bundle.pyinstaller
mv ./dist/yt-dlp_linux ./yt-dlp_linux
mv ./dist/yt-dlp_linux.zip ./yt-dlp_linux.zip
@ -164,13 +162,15 @@ jobs:
done
- name: Upload artifacts
uses: actions/upload-artifact@v3
uses: actions/upload-artifact@v4
with:
name: build-bin-${{ github.job }}
path: |
yt-dlp
yt-dlp.tar.gz
yt-dlp_linux
yt-dlp_linux.zip
compression-level: 0
linux_arm:
needs: process
@ -201,17 +201,18 @@ jobs:
dockerRunArgs: --volume "${PWD}/repo:/repo"
install: | # Installing Python 3.10 from the Deadsnakes repo raises errors
apt update
apt -y install zlib1g-dev python3.8 python3.8-dev python3.8-distutils python3-pip
apt -y install zlib1g-dev libffi-dev python3.8 python3.8-dev python3.8-distutils python3-pip
python3.8 -m pip install -U pip setuptools wheel
# Cannot access requirements.txt from the repo directory at this stage
python3.8 -m pip install -U Pyinstaller mutagen pycryptodomex websockets brotli certifi secretstorage
# Cannot access any files from the repo directory at this stage
python3.8 -m pip install -U Pyinstaller mutagen pycryptodomex websockets brotli certifi secretstorage cffi
run: |
cd repo
python3.8 -m pip install -U Pyinstaller secretstorage -r requirements.txt # Cached version may be out of date
python3.8 devscripts/install_deps.py -o --include build
python3.8 devscripts/install_deps.py --include pyinstaller --include secretstorage # Cached version may be out of date
python3.8 devscripts/update-version.py -c "${{ inputs.channel }}" -r "${{ needs.process.outputs.origin }}" "${{ inputs.version }}"
python3.8 devscripts/make_lazy_extractors.py
python3.8 pyinst.py
python3.8 -m bundle.pyinstaller
if ${{ vars.UPDATE_TO_VERIFICATION && 'true' || 'false' }}; then
arch="${{ (matrix.architecture == 'armv7' && 'armv7l') || matrix.architecture }}"
@ -224,10 +225,12 @@ jobs:
fi
- name: Upload artifacts
uses: actions/upload-artifact@v3
uses: actions/upload-artifact@v4
with:
name: build-bin-linux_${{ matrix.architecture }}
path: | # run-on-arch-action designates armv7l as armv7
repo/dist/yt-dlp_linux_${{ (matrix.architecture == 'armv7' && 'armv7l') || matrix.architecture }}
compression-level: 0
macos:
needs: process
@ -240,9 +243,10 @@ jobs:
- name: Install Requirements
run: |
brew install coreutils
python3 -m pip install -U --user pip setuptools wheel
python3 devscripts/install_deps.py --user -o --include build
python3 devscripts/install_deps.py --print --include pyinstaller > requirements.txt
# We need to ignore wheels otherwise we break universal2 builds
python3 -m pip install -U --user --no-binary :all: Pyinstaller -r requirements.txt
python3 -m pip install -U --user --no-binary :all: -r requirements.txt
- name: Prepare
run: |
@ -250,9 +254,9 @@ jobs:
python3 devscripts/make_lazy_extractors.py
- name: Build
run: |
python3 pyinst.py --target-architecture universal2 --onedir
python3 -m bundle.pyinstaller --target-architecture universal2 --onedir
(cd ./dist/yt-dlp_macos && zip -r ../yt-dlp_macos.zip .)
python3 pyinst.py --target-architecture universal2
python3 -m bundle.pyinstaller --target-architecture universal2
- name: Verify --update-to
if: vars.UPDATE_TO_VERIFICATION
@ -265,11 +269,13 @@ jobs:
[[ "$version" != "$downgraded_version" ]]
- name: Upload artifacts
uses: actions/upload-artifact@v3
uses: actions/upload-artifact@v4
with:
name: build-bin-${{ github.job }}
path: |
dist/yt-dlp_macos
dist/yt-dlp_macos.zip
compression-level: 0
macos_legacy:
needs: process
@ -293,8 +299,8 @@ jobs:
- name: Install Requirements
run: |
brew install coreutils
python3 -m pip install -U --user pip setuptools wheel
python3 -m pip install -U --user Pyinstaller -r requirements.txt
python3 devscripts/install_deps.py --user -o --include build
python3 devscripts/install_deps.py --user --include pyinstaller
- name: Prepare
run: |
@ -302,7 +308,7 @@ jobs:
python3 devscripts/make_lazy_extractors.py
- name: Build
run: |
python3 pyinst.py
python3 -m bundle.pyinstaller
mv dist/yt-dlp_macos dist/yt-dlp_macos_legacy
- name: Verify --update-to
@ -316,10 +322,12 @@ jobs:
[[ "$version" != "$downgraded_version" ]]
- name: Upload artifacts
uses: actions/upload-artifact@v3
uses: actions/upload-artifact@v4
with:
name: build-bin-${{ github.job }}
path: |
dist/yt-dlp_macos_legacy
compression-level: 0
windows:
needs: process
@ -328,13 +336,14 @@ jobs:
steps:
- uses: actions/checkout@v4
- uses: actions/setup-python@v4
- uses: actions/setup-python@v5
with: # 3.8 is used for Win7 support
python-version: "3.8"
- name: Install Requirements
run: | # Custom pyinstaller built with https://github.com/yt-dlp/pyinstaller-builds
python -m pip install -U pip setuptools wheel py2exe
pip install -U "https://yt-dlp.github.io/Pyinstaller-Builds/x86_64/pyinstaller-5.8.0-py3-none-any.whl" -r requirements.txt
python devscripts/install_deps.py -o --include build
python devscripts/install_deps.py --include py2exe
python -m pip install -U "https://yt-dlp.github.io/Pyinstaller-Builds/x86_64/pyinstaller-5.8.0-py3-none-any.whl"
- name: Prepare
run: |
@ -342,10 +351,10 @@ jobs:
python devscripts/make_lazy_extractors.py
- name: Build
run: |
python setup.py py2exe
python -m bundle.py2exe
Move-Item ./dist/yt-dlp.exe ./dist/yt-dlp_min.exe
python pyinst.py
python pyinst.py --onedir
python -m bundle.pyinstaller
python -m bundle.pyinstaller --onedir
Compress-Archive -Path ./dist/yt-dlp/* -DestinationPath ./dist/yt-dlp_win.zip
- name: Verify --update-to
@ -362,12 +371,14 @@ jobs:
}
- name: Upload artifacts
uses: actions/upload-artifact@v3
uses: actions/upload-artifact@v4
with:
name: build-bin-${{ github.job }}
path: |
dist/yt-dlp.exe
dist/yt-dlp_min.exe
dist/yt-dlp_win.zip
compression-level: 0
windows32:
needs: process
@ -376,14 +387,15 @@ jobs:
steps:
- uses: actions/checkout@v4
- uses: actions/setup-python@v4
- uses: actions/setup-python@v5
with:
python-version: "3.8"
architecture: "x86"
- name: Install Requirements
run: |
python -m pip install -U pip setuptools wheel
pip install -U "https://yt-dlp.github.io/Pyinstaller-Builds/i686/pyinstaller-5.8.0-py3-none-any.whl" -r requirements.txt
python devscripts/install_deps.py -o --include build
python devscripts/install_deps.py
python -m pip install -U "https://yt-dlp.github.io/Pyinstaller-Builds/i686/pyinstaller-5.8.0-py3-none-any.whl"
- name: Prepare
run: |
@ -391,7 +403,7 @@ jobs:
python devscripts/make_lazy_extractors.py
- name: Build
run: |
python pyinst.py
python -m bundle.pyinstaller
- name: Verify --update-to
if: vars.UPDATE_TO_VERIFICATION
@ -407,10 +419,12 @@ jobs:
}
- name: Upload artifacts
uses: actions/upload-artifact@v3
uses: actions/upload-artifact@v4
with:
name: build-bin-${{ github.job }}
path: |
dist/yt-dlp_x86.exe
compression-level: 0
meta_files:
if: inputs.meta_files && always() && !cancelled()
@ -424,7 +438,11 @@ jobs:
- windows32
runs-on: ubuntu-latest
steps:
- uses: actions/download-artifact@v3
- uses: actions/download-artifact@v4
with:
path: artifact
pattern: build-bin-*
merge-multiple: true
- name: Make SHA2-SUMS files
run: |
@ -459,8 +477,11 @@ jobs:
done
- name: Upload artifacts
uses: actions/upload-artifact@v3
uses: actions/upload-artifact@v4
with:
name: build-${{ github.job }}
path: |
SHA*SUMS*
_update_spec
SHA*SUMS*
compression-level: 0
overwrite: true

View file

@ -49,11 +49,11 @@ jobs:
steps:
- uses: actions/checkout@v4
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v4
uses: actions/setup-python@v5
with:
python-version: ${{ matrix.python-version }}
- name: Install test requirements
run: pip install pytest -r requirements.txt
run: python3 ./devscripts/install_deps.py --include dev
- name: Run tests
continue-on-error: False
run: |

View file

@ -11,11 +11,11 @@ jobs:
steps:
- uses: actions/checkout@v4
- name: Set up Python
uses: actions/setup-python@v4
uses: actions/setup-python@v5
with:
python-version: 3.9
- name: Install test requirements
run: pip install pytest -r requirements.txt
run: python3 ./devscripts/install_deps.py --include dev
- name: Run tests
continue-on-error: true
run: python3 ./devscripts/run_tests.py download
@ -38,11 +38,11 @@ jobs:
steps:
- uses: actions/checkout@v4
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v4
uses: actions/setup-python@v5
with:
python-version: ${{ matrix.python-version }}
- name: Install test requirements
run: pip install pytest -r requirements.txt
run: python3 ./devscripts/install_deps.py --include dev
- name: Run tests
continue-on-error: true
run: python3 ./devscripts/run_tests.py download

View file

@ -11,11 +11,11 @@ jobs:
steps:
- uses: actions/checkout@v4
- name: Set up Python 3.8
uses: actions/setup-python@v4
uses: actions/setup-python@v5
with:
python-version: '3.8'
- name: Install test requirements
run: pip install pytest -r requirements.txt
run: python3 ./devscripts/install_deps.py --include dev
- name: Run tests
run: |
python3 -m yt_dlp -v || true
@ -26,10 +26,10 @@ jobs:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- uses: actions/setup-python@v4
- uses: actions/setup-python@v5
- name: Install flake8
run: pip install flake8
run: python3 ./devscripts/install_deps.py -o --include dev
- name: Make lazy extractors
run: python devscripts/make_lazy_extractors.py
run: python3 ./devscripts/make_lazy_extractors.py
- name: Run flake8
run: flake8 .

View file

@ -6,8 +6,10 @@ on:
paths:
- "yt_dlp/**.py"
- "!yt_dlp/version.py"
- "setup.py"
- "pyinst.py"
- "bundle/*.py"
- "pyproject.toml"
- "Makefile"
- ".github/workflows/build.yml"
concurrency:
group: release-master
permissions:

View file

@ -18,7 +18,14 @@ jobs:
- name: Check for new commits
id: check_for_new_commits
run: |
relevant_files=("yt_dlp/*.py" ':!yt_dlp/version.py' "setup.py" "pyinst.py")
relevant_files=(
"yt_dlp/*.py"
':!yt_dlp/version.py'
"bundle/*.py"
"pyproject.toml"
"Makefile"
".github/workflows/build.yml"
)
echo "commit=$(git log --format=%H -1 --since="24 hours ago" -- "${relevant_files[@]}")" | tee "$GITHUB_OUTPUT"
release:

View file

@ -71,7 +71,7 @@ jobs:
with:
fetch-depth: 0
- uses: actions/setup-python@v4
- uses: actions/setup-python@v5
with:
python-version: "3.10"
@ -246,15 +246,16 @@ jobs:
steps:
- uses: actions/checkout@v4
- uses: actions/setup-python@v4
with:
fetch-depth: 0
- uses: actions/setup-python@v5
with:
python-version: "3.10"
- name: Install Requirements
run: |
sudo apt -y install pandoc man
python -m pip install -U pip setuptools wheel twine
python -m pip install -U -r requirements.txt
python devscripts/install_deps.py -o --include build
- name: Prepare
env:
@ -266,14 +267,19 @@ jobs:
run: |
python devscripts/update-version.py -c "${{ env.channel }}" -r "${{ env.target_repo }}" -s "${{ env.suffix }}" "${{ env.version }}"
python devscripts/make_lazy_extractors.py
sed -i -E "s/(name=')[^']+(', # package name)/\1${{ env.pypi_project }}\2/" setup.py
sed -i -E '0,/(name = ")[^"]+(")/s//\1${{ env.pypi_project }}\2/' pyproject.toml
- name: Build
run: |
rm -rf dist/*
make pypi-files
printf '%s\n\n' \
'Official repository: <https://github.com/yt-dlp/yt-dlp>' \
'**PS**: Some links in this document will not work since this is a copy of the README.md from Github' > ./README.md.new
cat ./README.md >> ./README.md.new && mv -f ./README.md.new ./README.md
python devscripts/set-variant.py pip -M "You installed yt-dlp with pip or using the wheel from PyPi; Use that to update"
python setup.py sdist bdist_wheel
make clean-cache
python -m build --no-isolation .
- name: Publish to PyPI
uses: pypa/gh-action-pypi-publish@release/v1
@ -290,8 +296,12 @@ jobs:
- uses: actions/checkout@v4
with:
fetch-depth: 0
- uses: actions/download-artifact@v3
- uses: actions/setup-python@v4
- uses: actions/download-artifact@v4
with:
path: artifact
pattern: build-*
merge-multiple: true
- uses: actions/setup-python@v5
with:
python-version: "3.10"

View file

@ -1,10 +0,0 @@
include AUTHORS
include Changelog.md
include LICENSE
include README.md
include completions/*/*
include supportedsites.md
include yt-dlp.1
include requirements.txt
recursive-include devscripts *
recursive-include test *

View file

@ -6,11 +6,11 @@ doc: README.md CONTRIBUTING.md issuetemplates supportedsites
ot: offlinetest
tar: yt-dlp.tar.gz
# Keep this list in sync with MANIFEST.in
# Keep this list in sync with pyproject.toml includes/artifacts
# intended use: when building a source distribution,
# make pypi-files && python setup.py sdist
# make pypi-files && python3 -m build -sn .
pypi-files: AUTHORS Changelog.md LICENSE README.md README.txt supportedsites \
completions yt-dlp.1 requirements.txt setup.cfg devscripts/* test/*
completions yt-dlp.1 pyproject.toml setup.cfg devscripts/* test/*
.PHONY: all clean install test tar pypi-files completions ot offlinetest codetest supportedsites
@ -21,7 +21,7 @@ clean-test:
*.mp4 *.mpga *.oga *.ogg *.opus *.png *.sbv *.srt *.swf *.swp *.tt *.ttml *.url *.vtt *.wav *.webloc *.webm *.webp
clean-dist:
rm -rf yt-dlp.1.temp.md yt-dlp.1 README.txt MANIFEST build/ dist/ .coverage cover/ yt-dlp.tar.gz completions/ \
yt_dlp/extractor/lazy_extractors.py *.spec CONTRIBUTING.md.tmp yt-dlp yt-dlp.exe yt_dlp.egg-info/ AUTHORS .mailmap
yt_dlp/extractor/lazy_extractors.py *.spec CONTRIBUTING.md.tmp yt-dlp yt-dlp.exe yt_dlp.egg-info/ AUTHORS
clean-cache:
find . \( \
-type d -name .pytest_cache -o -type d -name __pycache__ -o -name "*.pyc" -o -name "*.class" \
@ -37,12 +37,15 @@ BINDIR ?= $(PREFIX)/bin
MANDIR ?= $(PREFIX)/man
SHAREDIR ?= $(PREFIX)/share
PYTHON ?= /usr/bin/env python3
GNUTAR ?= tar
# set SYSCONFDIR to /etc if PREFIX=/usr or PREFIX=/usr/local
SYSCONFDIR = $(shell if [ $(PREFIX) = /usr -o $(PREFIX) = /usr/local ]; then echo /etc; else echo $(PREFIX)/etc; fi)
# set markdown input format to "markdown-smart" for pandoc version 2 and to "markdown" for pandoc prior to version 2
MARKDOWN = $(shell if [ `pandoc -v | head -n1 | cut -d" " -f2 | head -c1` = "2" ]; then echo markdown-smart; else echo markdown; fi)
# set markdown input format to "markdown-smart" for pandoc version 2+ and to "markdown" for pandoc prior to version 2
PANDOC_VERSION_CMD = pandoc -v 2>/dev/null | head -n1 | cut -d' ' -f2 | head -c1
PANDOC_VERSION != $(PANDOC_VERSION_CMD)
PANDOC_VERSION ?= $(shell $(PANDOC_VERSION_CMD))
MARKDOWN_CMD = if [ "$(PANDOC_VERSION)" = "1" -o "$(PANDOC_VERSION)" = "0" ]; then echo markdown; else echo markdown-smart; fi
MARKDOWN != $(MARKDOWN_CMD)
MARKDOWN ?= $(shell $(MARKDOWN_CMD))
install: lazy-extractors yt-dlp yt-dlp.1 completions
mkdir -p $(DESTDIR)$(BINDIR)
@ -73,24 +76,28 @@ test:
offlinetest: codetest
$(PYTHON) -m pytest -k "not download"
# XXX: This is hard to maintain
CODE_FOLDERS = yt_dlp yt_dlp/downloader yt_dlp/extractor yt_dlp/postprocessor yt_dlp/compat yt_dlp/compat/urllib yt_dlp/utils yt_dlp/dependencies yt_dlp/networking
yt-dlp: yt_dlp/*.py yt_dlp/*/*.py
CODE_FOLDERS_CMD = find yt_dlp -type f -name '__init__.py' | sed 's,/__init__.py,,' | grep -v '/__' | sort
CODE_FOLDERS != $(CODE_FOLDERS_CMD)
CODE_FOLDERS ?= $(shell $(CODE_FOLDERS_CMD))
CODE_FILES_CMD = for f in $(CODE_FOLDERS) ; do echo "$$f" | sed 's,$$,/*.py,' ; done
CODE_FILES != $(CODE_FILES_CMD)
CODE_FILES ?= $(shell $(CODE_FILES_CMD))
yt-dlp: $(CODE_FILES)
mkdir -p zip
for d in $(CODE_FOLDERS) ; do \
mkdir -p zip/$$d ;\
cp -pPR $$d/*.py zip/$$d/ ;\
done
touch -t 200001010101 zip/yt_dlp/*.py zip/yt_dlp/*/*.py
(cd zip && touch -t 200001010101 $(CODE_FILES))
mv zip/yt_dlp/__main__.py zip/
cd zip ; zip -q ../yt-dlp yt_dlp/*.py yt_dlp/*/*.py __main__.py
(cd zip && zip -q ../yt-dlp $(CODE_FILES) __main__.py)
rm -rf zip
echo '#!$(PYTHON)' > yt-dlp
cat yt-dlp.zip >> yt-dlp
rm yt-dlp.zip
chmod a+x yt-dlp
README.md: yt_dlp/*.py yt_dlp/*/*.py devscripts/make_readme.py
README.md: $(CODE_FILES) devscripts/make_readme.py
COLUMNS=80 $(PYTHON) yt_dlp/__main__.py --ignore-config --help | $(PYTHON) devscripts/make_readme.py
CONTRIBUTING.md: README.md devscripts/make_contributing.py
@ -115,24 +122,26 @@ yt-dlp.1: README.md devscripts/prepare_manpage.py
pandoc -s -f $(MARKDOWN) -t man yt-dlp.1.temp.md -o yt-dlp.1
rm -f yt-dlp.1.temp.md
completions/bash/yt-dlp: yt_dlp/*.py yt_dlp/*/*.py devscripts/bash-completion.in
completions/bash/yt-dlp: $(CODE_FILES) devscripts/bash-completion.in
mkdir -p completions/bash
$(PYTHON) devscripts/bash-completion.py
completions/zsh/_yt-dlp: yt_dlp/*.py yt_dlp/*/*.py devscripts/zsh-completion.in
completions/zsh/_yt-dlp: $(CODE_FILES) devscripts/zsh-completion.in
mkdir -p completions/zsh
$(PYTHON) devscripts/zsh-completion.py
completions/fish/yt-dlp.fish: yt_dlp/*.py yt_dlp/*/*.py devscripts/fish-completion.in
completions/fish/yt-dlp.fish: $(CODE_FILES) devscripts/fish-completion.in
mkdir -p completions/fish
$(PYTHON) devscripts/fish-completion.py
_EXTRACTOR_FILES = $(shell find yt_dlp/extractor -name '*.py' -and -not -name 'lazy_extractors.py')
_EXTRACTOR_FILES_CMD = find yt_dlp/extractor -name '*.py' -and -not -name 'lazy_extractors.py'
_EXTRACTOR_FILES != $(_EXTRACTOR_FILES_CMD)
_EXTRACTOR_FILES ?= $(shell $(_EXTRACTOR_FILES_CMD))
yt_dlp/extractor/lazy_extractors.py: devscripts/make_lazy_extractors.py devscripts/lazy_load_template.py $(_EXTRACTOR_FILES)
$(PYTHON) devscripts/make_lazy_extractors.py $@
yt-dlp.tar.gz: all
@tar -czf yt-dlp.tar.gz --transform "s|^|yt-dlp/|" --owner 0 --group 0 \
@$(GNUTAR) -czf yt-dlp.tar.gz --transform "s|^|yt-dlp/|" --owner 0 --group 0 \
--exclude '*.DS_Store' \
--exclude '*.kate-swp' \
--exclude '*.pyc' \
@ -144,12 +153,8 @@ yt-dlp.tar.gz: all
-- \
README.md supportedsites.md Changelog.md LICENSE \
CONTRIBUTING.md Collaborators.md CONTRIBUTORS AUTHORS \
Makefile MANIFEST.in yt-dlp.1 README.txt completions \
setup.py setup.cfg yt-dlp yt_dlp requirements.txt \
devscripts test
Makefile yt-dlp.1 README.txt completions .gitignore \
setup.cfg yt-dlp yt_dlp pyproject.toml devscripts test
AUTHORS: .mailmap
git shortlog -s -n | cut -f2 | sort > AUTHORS
.mailmap:
git shortlog -s -e -n | awk '!(out[$$NF]++) { $$1="";sub(/^[ \t]+/,""); print}' > .mailmap
AUTHORS:
git shortlog -s -n HEAD | cut -f2 | sort > AUTHORS

View file

@ -167,7 +167,8 @@ ### Differences in default behavior
* `--compat-options youtube-dl`: Same as `--compat-options all,-multistreams,-playlist-match-filter,-manifest-filesize-approx`
* `--compat-options youtube-dlc`: Same as `--compat-options all,-no-live-chat,-no-youtube-channel-redirect,-playlist-match-filter,-manifest-filesize-approx`
* `--compat-options 2021`: Same as `--compat-options 2022,no-certifi,filename-sanitization,no-youtube-prefer-utc-upload-date`
* `--compat-options 2022`: Same as `--compat-options playlist-match-filter,no-external-downloader-progress,prefer-legacy-http-handler,manifest-filesize-approx`. Use this to enable all future compat options
* `--compat-options 2022`: Same as `--compat-options 2023,playlist-match-filter,no-external-downloader-progress,prefer-legacy-http-handler,manifest-filesize-approx`
* `--compat-options 2023`: Currently does nothing. Use this to enable all future compat options
# INSTALLATION
@ -320,19 +321,21 @@ ### Deprecated
## COMPILE
### Standalone PyInstaller Builds
To build the standalone executable, you must have Python and `pyinstaller` (plus any of yt-dlp's [optional dependencies](#dependencies) if needed). Once you have all the necessary dependencies installed, simply run `pyinst.py`. The executable will be built for the same architecture (x86/ARM, 32/64 bit) as the Python used.
To build the standalone executable, you must have Python and `pyinstaller` (plus any of yt-dlp's [optional dependencies](#dependencies) if needed). The executable will be built for the same architecture (x86/ARM, 32/64 bit) as the Python used. You can run the following commands:
python3 -m pip install -U pyinstaller -r requirements.txt
python3 devscripts/make_lazy_extractors.py
python3 pyinst.py
```
python3 devscripts/install_deps.py --include pyinstaller
python3 devscripts/make_lazy_extractors.py
python3 -m bundle.pyinstaller
```
On some systems, you may need to use `py` or `python` instead of `python3`.
`pyinst.py` accepts any arguments that can be passed to `pyinstaller`, such as `--onefile/-F` or `--onedir/-D`, which is further [documented here](https://pyinstaller.org/en/stable/usage.html#what-to-generate).
`bundle/pyinstaller.py` accepts any arguments that can be passed to `pyinstaller`, such as `--onefile/-F` or `--onedir/-D`, which is further [documented here](https://pyinstaller.org/en/stable/usage.html#what-to-generate).
**Note**: Pyinstaller versions below 4.4 [do not support](https://github.com/pyinstaller/pyinstaller#requirements-and-tested-platforms) Python installed from the Windows store without using a virtual environment.
**Important**: Running `pyinstaller` directly **without** using `pyinst.py` is **not** officially supported. This may or may not work correctly.
**Important**: Running `pyinstaller` directly **without** using `bundle/pyinstaller.py` is **not** officially supported. This may or may not work correctly.
### Platform-independent Binary (UNIX)
You will need the build tools `python` (3.8+), `zip`, `make` (GNU), `pandoc`\* and `pytest`\*.
@ -345,14 +348,17 @@ ### Standalone Py2Exe Builds (Windows)
While we provide the option to build with [py2exe](https://www.py2exe.org), it is recommended to build [using PyInstaller](#standalone-pyinstaller-builds) instead since the py2exe builds **cannot contain `pycryptodomex`/`certifi` and needs VC++14** on the target computer to run.
If you wish to build it anyway, install Python and py2exe, and then simply run `setup.py py2exe`
If you wish to build it anyway, install Python (if it is not already installed) and you can run the following commands:
py -m pip install -U py2exe -r requirements.txt
py devscripts/make_lazy_extractors.py
py setup.py py2exe
```
py devscripts/install_deps.py --include py2exe
py devscripts/make_lazy_extractors.py
py -m bundle.py2exe
```
### Related scripts
* **`devscripts/install_deps.py`** - Install dependencies for yt-dlp.
* **`devscripts/update-version.py`** - Update the version number based on current date.
* **`devscripts/set-variant.py`** - Set the build variant of the executable.
* **`devscripts/make_changelog.py`** - Create a markdown changelog using short commit messages and update `CONTRIBUTORS` file.
@ -1305,7 +1311,8 @@ # OUTPUT TEMPLATE
- `display_id` (string): An alternative identifier for the video
- `uploader` (string): Full name of the video uploader
- `license` (string): License name the video is licensed under
- `creator` (string): The creator of the video
- `creators` (list): The creators of the video
- `creator` (string): The creators of the video; comma-separated
- `timestamp` (numeric): UNIX timestamp of the moment the video became available
- `upload_date` (string): Video upload date in UTC (YYYYMMDD)
- `release_timestamp` (numeric): UNIX timestamp of the moment the video was released
@ -1379,11 +1386,16 @@ # OUTPUT TEMPLATE
- `track` (string): Title of the track
- `track_number` (numeric): Number of the track within an album or a disc
- `track_id` (string): Id of the track
- `artist` (string): Artist(s) of the track
- `genre` (string): Genre(s) of the track
- `artists` (list): Artist(s) of the track
- `artist` (string): Artist(s) of the track; comma-separated
- `genres` (list): Genre(s) of the track
- `genre` (string): Genre(s) of the track; comma-separated
- `composers` (list): Composer(s) of the piece
- `composer` (string): Composer(s) of the piece; comma-separated
- `album` (string): Title of the album the track belongs to
- `album_type` (string): Type of the album
- `album_artist` (string): List of all artists appeared on the album
- `album_artists` (list): All artists appeared on the album
- `album_artist` (string): All artists appeared on the album; comma-separated
- `disc_number` (numeric): Number of the disc or other physical medium the track belongs to
Available only when using `--download-sections` and for `chapter:` prefix when using `--split-chapters` for videos with internal chapters:
@ -1761,10 +1773,11 @@ # MODIFYING METADATA
`description`, `synopsis` | `description`
`purl`, `comment` | `webpage_url`
`track` | `track_number`
`artist` | `artist`, `creator`, `uploader` or `uploader_id`
`genre` | `genre`
`artist` | `artist`, `artists`, `creator`, `creators`, `uploader` or `uploader_id`
`composer` | `composer` or `composers`
`genre` | `genre` or `genres`
`album` | `album`
`album_artist` | `album_artist`
`album_artist` | `album_artist` or `album_artists`
`disc` | `disc_number`
`show` | `series`
`season_number` | `season_number`

1
bundle/__init__.py Normal file
View file

@ -0,0 +1 @@
# Empty file

59
bundle/py2exe.py Executable file
View file

@ -0,0 +1,59 @@
#!/usr/bin/env python3
# Allow execution from anywhere
import os
import sys
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
import warnings
from py2exe import freeze
from devscripts.utils import read_version
VERSION = read_version()
def main():
warnings.warn(
'py2exe builds do not support pycryptodomex and needs VC++14 to run. '
'It is recommended to run "pyinst.py" to build using pyinstaller instead')
return freeze(
console=[{
'script': './yt_dlp/__main__.py',
'dest_base': 'yt-dlp',
'icon_resources': [(1, 'devscripts/logo.ico')],
}],
version_info={
'version': VERSION,
'description': 'A youtube-dl fork with additional features and patches',
'comments': 'Official repository: <https://github.com/yt-dlp/yt-dlp>',
'product_name': 'yt-dlp',
'product_version': VERSION,
},
options={
'bundle_files': 0,
'compressed': 1,
'optimize': 2,
'dist_dir': './dist',
'excludes': [
# py2exe cannot import Crypto
'Crypto',
'Cryptodome',
# py2exe appears to confuse this with our socks library.
# We don't use pysocks and urllib3.contrib.socks would fail to import if tried.
'urllib3.contrib.socks'
],
'dll_excludes': ['w9xpopen.exe', 'crypt32.dll'],
# Modules that are only imported dynamically must be added here
'includes': ['yt_dlp.compat._legacy', 'yt_dlp.compat._deprecated',
'yt_dlp.utils._legacy', 'yt_dlp.utils._deprecated'],
},
zipfile=None,
)
if __name__ == '__main__':
main()

2
pyinst.py → bundle/pyinstaller.py Normal file → Executable file
View file

@ -4,7 +4,7 @@
import os
import sys
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
import platform

66
devscripts/install_deps.py Executable file
View file

@ -0,0 +1,66 @@
#!/usr/bin/env python3
# Allow execution from anywhere
import os
import sys
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
import argparse
import re
import subprocess
from devscripts.tomlparse import parse_toml
from devscripts.utils import read_file
def parse_args():
parser = argparse.ArgumentParser(description='Install dependencies for yt-dlp')
parser.add_argument(
'input', nargs='?', metavar='TOMLFILE', default='pyproject.toml', help='Input file (default: %(default)s)')
parser.add_argument(
'-e', '--exclude', metavar='REQUIREMENT', action='append', help='Exclude a required dependency')
parser.add_argument(
'-i', '--include', metavar='GROUP', action='append', help='Include an optional dependency group')
parser.add_argument(
'-o', '--only-optional', action='store_true', help='Only install optional dependencies')
parser.add_argument(
'-p', '--print', action='store_true', help='Only print a requirements.txt to stdout')
parser.add_argument(
'-u', '--user', action='store_true', help='Install with pip as --user')
return parser.parse_args()
def main():
args = parse_args()
toml_data = parse_toml(read_file(args.input))
deps = toml_data['project']['dependencies']
targets = deps.copy() if not args.only_optional else []
for exclude in args.exclude or []:
for dep in deps:
simplified_dep = re.match(r'[\w-]+', dep)[0]
if dep in targets and (exclude.lower() == simplified_dep.lower() or exclude == dep):
targets.remove(dep)
optional_deps = toml_data['project']['optional-dependencies']
for include in args.include or []:
group = optional_deps.get(include)
if group:
targets.extend(group)
if args.print:
for target in targets:
print(target)
return
pip_args = [sys.executable, '-m', 'pip', 'install', '-U']
if args.user:
pip_args.append('--user')
pip_args.extend(targets)
return subprocess.call(pip_args)
if __name__ == '__main__':
sys.exit(main())

189
devscripts/tomlparse.py Executable file
View file

@ -0,0 +1,189 @@
#!/usr/bin/env python3
"""
Simple parser for spec compliant toml files
A simple toml parser for files that comply with the spec.
Should only be used to parse `pyproject.toml` for `install_deps.py`.
IMPORTANT: INVALID FILES OR MULTILINE STRINGS ARE NOT SUPPORTED!
"""
from __future__ import annotations
import datetime
import json
import re
WS = r'(?:[\ \t]*)'
STRING_RE = re.compile(r'"(?:\\.|[^\\"\n])*"|\'[^\'\n]*\'')
SINGLE_KEY_RE = re.compile(rf'{STRING_RE.pattern}|[A-Za-z0-9_-]+')
KEY_RE = re.compile(rf'{WS}(?:{SINGLE_KEY_RE.pattern}){WS}(?:\.{WS}(?:{SINGLE_KEY_RE.pattern}){WS})*')
EQUALS_RE = re.compile(rf'={WS}')
WS_RE = re.compile(WS)
_SUBTABLE = rf'(?P<subtable>^\[(?P<is_list>\[)?(?P<path>{KEY_RE.pattern})\]\]?)'
EXPRESSION_RE = re.compile(rf'^(?:{_SUBTABLE}|{KEY_RE.pattern}=)', re.MULTILINE)
LIST_WS_RE = re.compile(rf'{WS}((#[^\n]*)?\n{WS})*')
LEFTOVER_VALUE_RE = re.compile(r'[^,}\]\t\n#]+')
def parse_key(value: str):
for match in SINGLE_KEY_RE.finditer(value):
if match[0][0] == '"':
yield json.loads(match[0])
elif match[0][0] == '\'':
yield match[0][1:-1]
else:
yield match[0]
def get_target(root: dict, paths: list[str], is_list=False):
target = root
for index, key in enumerate(paths, 1):
use_list = is_list and index == len(paths)
result = target.get(key)
if result is None:
result = [] if use_list else {}
target[key] = result
if isinstance(result, dict):
target = result
elif use_list:
target = {}
result.append(target)
else:
target = result[-1]
assert isinstance(target, dict)
return target
def parse_enclosed(data: str, index: int, end: str, ws_re: re.Pattern):
index += 1
if match := ws_re.match(data, index):
index = match.end()
while data[index] != end:
index = yield True, index
if match := ws_re.match(data, index):
index = match.end()
if data[index] == ',':
index += 1
if match := ws_re.match(data, index):
index = match.end()
assert data[index] == end
yield False, index + 1
def parse_value(data: str, index: int):
if data[index] == '[':
result = []
indices = parse_enclosed(data, index, ']', LIST_WS_RE)
valid, index = next(indices)
while valid:
index, value = parse_value(data, index)
result.append(value)
valid, index = indices.send(index)
return index, result
if data[index] == '{':
result = {}
indices = parse_enclosed(data, index, '}', WS_RE)
valid, index = next(indices)
while valid:
valid, index = indices.send(parse_kv_pair(data, index, result))
return index, result
if match := STRING_RE.match(data, index):
return match.end(), json.loads(match[0]) if match[0][0] == '"' else match[0][1:-1]
match = LEFTOVER_VALUE_RE.match(data, index)
assert match
value = match[0].strip()
for func in [
int,
float,
datetime.time.fromisoformat,
datetime.date.fromisoformat,
datetime.datetime.fromisoformat,
{'true': True, 'false': False}.get,
]:
try:
value = func(value)
break
except Exception:
pass
return match.end(), value
def parse_kv_pair(data: str, index: int, target: dict):
match = KEY_RE.match(data, index)
if not match:
return None
*keys, key = parse_key(match[0])
match = EQUALS_RE.match(data, match.end())
assert match
index = match.end()
index, value = parse_value(data, index)
get_target(target, keys)[key] = value
return index
def parse_toml(data: str):
root = {}
target = root
index = 0
while True:
match = EXPRESSION_RE.search(data, index)
if not match:
break
if match.group('subtable'):
index = match.end()
path, is_list = match.group('path', 'is_list')
target = get_target(root, list(parse_key(path)), bool(is_list))
continue
index = parse_kv_pair(data, match.start(), target)
assert index is not None
return root
def main():
import argparse
from pathlib import Path
parser = argparse.ArgumentParser()
parser.add_argument('infile', type=Path, help='The TOML file to read as input')
args = parser.parse_args()
with args.infile.open('r', encoding='utf-8') as file:
data = file.read()
def default(obj):
if isinstance(obj, (datetime.date, datetime.time, datetime.datetime)):
return obj.isoformat()
print(json.dumps(parse_toml(data), default=default))
if __name__ == '__main__':
main()

View file

@ -1,5 +1,118 @@
[build-system]
build-backend = 'setuptools.build_meta'
# https://github.com/yt-dlp/yt-dlp/issues/5941
# https://github.com/pypa/distutils/issues/17
requires = ['setuptools > 50']
requires = ["hatchling"]
build-backend = "hatchling.build"
[project]
name = "yt-dlp"
maintainers = [
{name = "pukkandan", email = "pukkandan.ytdlp@gmail.com"},
{name = "Grub4K", email = "contact@grub4k.xyz"},
{name = "bashonly", email = "bashonly@protonmail.com"},
]
description = "A youtube-dl fork with additional features and patches"
readme = "README.md"
requires-python = ">=3.8"
keywords = [
"youtube-dl",
"video-downloader",
"youtube-downloader",
"sponsorblock",
"youtube-dlc",
"yt-dlp",
]
license = {file = "LICENSE"}
classifiers = [
"Topic :: Multimedia :: Video",
"Development Status :: 5 - Production/Stable",
"Environment :: Console",
"Programming Language :: Python",
"Programming Language :: Python :: 3 :: Only",
"Programming Language :: Python :: 3.8",
"Programming Language :: Python :: 3.9",
"Programming Language :: Python :: 3.10",
"Programming Language :: Python :: 3.11",
"Programming Language :: Python :: 3.12",
"Programming Language :: Python :: Implementation",
"Programming Language :: Python :: Implementation :: CPython",
"Programming Language :: Python :: Implementation :: PyPy",
"License :: OSI Approved :: The Unlicense (Unlicense)",
"Operating System :: OS Independent",
]
dynamic = ["version"]
dependencies = [
"brotli; implementation_name=='cpython'",
"brotlicffi; implementation_name!='cpython'",
"certifi",
"mutagen",
"pycryptodomex",
"requests>=2.31.0,<3",
"urllib3>=1.26.17,<3",
"websockets>=12.0",
]
[project.optional-dependencies]
secretstorage = [
"cffi",
"secretstorage",
]
build = [
"build",
"hatchling",
"pip",
"wheel",
]
dev = [
"flake8",
"isort",
"pytest",
]
pyinstaller = ["pyinstaller>=6.3"]
py2exe = ["py2exe>=0.12"]
[project.urls]
Documentation = "https://github.com/yt-dlp/yt-dlp#readme"
Repository = "https://github.com/yt-dlp/yt-dlp"
Tracker = "https://github.com/yt-dlp/yt-dlp/issues"
Funding = "https://github.com/yt-dlp/yt-dlp/blob/master/Collaborators.md#collaborators"
[project.scripts]
yt-dlp = "yt_dlp:main"
[project.entry-points.pyinstaller40]
hook-dirs = "yt_dlp.__pyinstaller:get_hook_dirs"
[tool.hatch.build.targets.sdist]
include = [
"/yt_dlp",
"/devscripts",
"/test",
"/.gitignore", # included by default, needed for auto-excludes
"/Changelog.md",
"/LICENSE", # included as license
"/pyproject.toml", # included by default
"/README.md", # included as readme
"/setup.cfg",
"/supportedsites.md",
]
artifacts = [
"/yt_dlp/extractor/lazy_extractors.py",
"/completions",
"/AUTHORS", # included by default
"/README.txt",
"/yt-dlp.1",
]
[tool.hatch.build.targets.wheel]
packages = ["yt_dlp"]
artifacts = ["/yt_dlp/extractor/lazy_extractors.py"]
[tool.hatch.build.targets.wheel.shared-data]
"completions/bash/yt-dlp" = "share/bash-completion/completions/yt-dlp"
"completions/zsh/_yt-dlp" = "share/zsh/site-functions/_yt-dlp"
"completions/fish/yt-dlp.fish" = "share/fish/vendor_completions.d/yt-dlp.fish"
"README.txt" = "share/doc/yt_dlp/README.txt"
"yt-dlp.1" = "share/man/man1/yt-dlp.1"
[tool.hatch.version]
path = "yt_dlp/version.py"
pattern = "_pkg_version = '(?P<version>[^']+)'"

View file

@ -1,8 +0,0 @@
mutagen
pycryptodomex
brotli; implementation_name=='cpython'
brotlicffi; implementation_name!='cpython'
certifi
requests>=2.31.0,<3
urllib3>=1.26.17,<3
websockets>=12.0

View file

@ -1,7 +1,3 @@
[wheel]
universal = true
[flake8]
exclude = build,venv,.tox,.git,.pytest_cache
ignore = E402,E501,E731,E741,W503

183
setup.py
View file

@ -1,183 +0,0 @@
#!/usr/bin/env python3
# Allow execution from anywhere
import os
import sys
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
import subprocess
import warnings
try:
from setuptools import Command, find_packages, setup
setuptools_available = True
except ImportError:
from distutils.core import Command, setup
setuptools_available = False
from devscripts.utils import read_file, read_version
VERSION = read_version(varname='_pkg_version')
DESCRIPTION = 'A youtube-dl fork with additional features and patches'
LONG_DESCRIPTION = '\n\n'.join((
'Official repository: <https://github.com/yt-dlp/yt-dlp>',
'**PS**: Some links in this document will not work since this is a copy of the README.md from Github',
read_file('README.md')))
REQUIREMENTS = read_file('requirements.txt').splitlines()
def packages():
if setuptools_available:
return find_packages(exclude=('youtube_dl', 'youtube_dlc', 'test', 'ytdlp_plugins', 'devscripts'))
return [
'yt_dlp', 'yt_dlp.extractor', 'yt_dlp.downloader', 'yt_dlp.postprocessor', 'yt_dlp.compat',
]
def py2exe_params():
warnings.warn(
'py2exe builds do not support pycryptodomex and needs VC++14 to run. '
'It is recommended to run "pyinst.py" to build using pyinstaller instead')
return {
'console': [{
'script': './yt_dlp/__main__.py',
'dest_base': 'yt-dlp',
'icon_resources': [(1, 'devscripts/logo.ico')],
}],
'version_info': {
'version': VERSION,
'description': DESCRIPTION,
'comments': LONG_DESCRIPTION.split('\n')[0],
'product_name': 'yt-dlp',
'product_version': VERSION,
},
'options': {
'bundle_files': 0,
'compressed': 1,
'optimize': 2,
'dist_dir': './dist',
'excludes': [
# py2exe cannot import Crypto
'Crypto',
'Cryptodome',
# py2exe appears to confuse this with our socks library.
# We don't use pysocks and urllib3.contrib.socks would fail to import if tried.
'urllib3.contrib.socks'
],
'dll_excludes': ['w9xpopen.exe', 'crypt32.dll'],
# Modules that are only imported dynamically must be added here
'includes': ['yt_dlp.compat._legacy', 'yt_dlp.compat._deprecated',
'yt_dlp.utils._legacy', 'yt_dlp.utils._deprecated'],
},
'zipfile': None,
}
def build_params():
files_spec = [
('share/bash-completion/completions', ['completions/bash/yt-dlp']),
('share/zsh/site-functions', ['completions/zsh/_yt-dlp']),
('share/fish/vendor_completions.d', ['completions/fish/yt-dlp.fish']),
('share/doc/yt_dlp', ['README.txt']),
('share/man/man1', ['yt-dlp.1'])
]
data_files = []
for dirname, files in files_spec:
resfiles = []
for fn in files:
if not os.path.exists(fn):
warnings.warn(f'Skipping file {fn} since it is not present. Try running " make pypi-files " first')
else:
resfiles.append(fn)
data_files.append((dirname, resfiles))
params = {'data_files': data_files}
if setuptools_available:
params['entry_points'] = {
'console_scripts': ['yt-dlp = yt_dlp:main'],
'pyinstaller40': ['hook-dirs = yt_dlp.__pyinstaller:get_hook_dirs'],
}
else:
params['scripts'] = ['yt-dlp']
return params
class build_lazy_extractors(Command):
description = 'Build the extractor lazy loading module'
user_options = []
def initialize_options(self):
pass
def finalize_options(self):
pass
def run(self):
if self.dry_run:
print('Skipping build of lazy extractors in dry run mode')
return
subprocess.run([sys.executable, 'devscripts/make_lazy_extractors.py'])
def main():
if sys.argv[1:2] == ['py2exe']:
params = py2exe_params()
try:
from py2exe import freeze
except ImportError:
import py2exe # noqa: F401
warnings.warn('You are using an outdated version of py2exe. Support for this version will be removed in the future')
params['console'][0].update(params.pop('version_info'))
params['options'] = {'py2exe': params.pop('options')}
else:
return freeze(**params)
else:
params = build_params()
setup(
name='yt-dlp', # package name (do not change/remove comment)
version=VERSION,
maintainer='pukkandan',
maintainer_email='pukkandan.ytdlp@gmail.com',
description=DESCRIPTION,
long_description=LONG_DESCRIPTION,
long_description_content_type='text/markdown',
url='https://github.com/yt-dlp/yt-dlp',
packages=packages(),
install_requires=REQUIREMENTS,
python_requires='>=3.8',
project_urls={
'Documentation': 'https://github.com/yt-dlp/yt-dlp#readme',
'Source': 'https://github.com/yt-dlp/yt-dlp',
'Tracker': 'https://github.com/yt-dlp/yt-dlp/issues',
'Funding': 'https://github.com/yt-dlp/yt-dlp/blob/master/Collaborators.md#collaborators',
},
classifiers=[
'Topic :: Multimedia :: Video',
'Development Status :: 5 - Production/Stable',
'Environment :: Console',
'Programming Language :: Python',
'Programming Language :: Python :: 3.8',
'Programming Language :: Python :: 3.9',
'Programming Language :: Python :: 3.10',
'Programming Language :: Python :: 3.11',
'Programming Language :: Python :: 3.12',
'Programming Language :: Python :: Implementation',
'Programming Language :: Python :: Implementation :: CPython',
'Programming Language :: Python :: Implementation :: PyPy',
'License :: Public Domain',
'Operating System :: OS Independent',
],
cmdclass={'build_lazy_extractors': build_lazy_extractors},
**params
)
main()

View file

@ -223,6 +223,10 @@ def sanitize(key, value):
if test_info_dict.get('display_id') == test_info_dict.get('id'):
test_info_dict.pop('display_id')
# Remove deprecated fields
for old in YoutubeDL._deprecated_multivalue_fields.keys():
test_info_dict.pop(old, None)
# release_year may be generated from release_date
if try_call(lambda: test_info_dict['release_year'] == int(test_info_dict['release_date'][:4])):
test_info_dict.pop('release_year')

View file

@ -941,7 +941,7 @@ def test_match_filter(self):
def get_videos(filter_=None):
ydl = YDL({'match_filter': filter_, 'simulate': True})
for v in videos:
ydl.process_ie_result(v, download=True)
ydl.process_ie_result(v.copy(), download=True)
return [v['id'] for v in ydl.downloaded_info_dicts]
res = get_videos()

View file

@ -13,6 +13,7 @@
import http.cookiejar
import http.server
import io
import logging
import pathlib
import random
import ssl
@ -180,6 +181,12 @@ def do_GET(self):
self.send_header('Location', '/a/b/./../../headers')
self.send_header('Content-Length', '0')
self.end_headers()
elif self.path == '/redirect_dotsegments_absolute':
self.send_response(301)
# redirect to /headers but with dot segments before - absolute url
self.send_header('Location', f'http://127.0.0.1:{http_server_port(self.server)}/a/b/./../../headers')
self.send_header('Content-Length', '0')
self.end_headers()
elif self.path.startswith('/redirect_'):
self._redirect()
elif self.path.startswith('/method'):
@ -345,16 +352,17 @@ def test_percent_encode(self, handler):
res.close()
@pytest.mark.parametrize('handler', ['Urllib', 'Requests'], indirect=True)
def test_remove_dot_segments(self, handler):
with handler() as rh:
@pytest.mark.parametrize('path', [
'/a/b/./../../headers',
'/redirect_dotsegments',
# https://github.com/yt-dlp/yt-dlp/issues/9020
'/redirect_dotsegments_absolute',
])
def test_remove_dot_segments(self, handler, path):
with handler(verbose=True) as rh:
# This isn't a comprehensive test,
# but it should be enough to check whether the handler is removing dot segments
res = validate_and_send(rh, Request(f'http://127.0.0.1:{self.http_port}/a/b/./../../headers'))
assert res.status == 200
assert res.url == f'http://127.0.0.1:{self.http_port}/headers'
res.close()
res = validate_and_send(rh, Request(f'http://127.0.0.1:{self.http_port}/redirect_dotsegments'))
# but it should be enough to check whether the handler is removing dot segments in required scenarios
res = validate_and_send(rh, Request(f'http://127.0.0.1:{self.http_port}{path}'))
assert res.status == 200
assert res.url == f'http://127.0.0.1:{self.http_port}/headers'
res.close()
@ -745,6 +753,25 @@ def test_certificate_nocombined_pass(self, handler):
})
class TestRequestHandlerMisc:
"""Misc generic tests for request handlers, not related to request or validation testing"""
@pytest.mark.parametrize('handler,logger_name', [
('Requests', 'urllib3'),
('Websockets', 'websockets.client'),
('Websockets', 'websockets.server')
], indirect=['handler'])
def test_remove_logging_handler(self, handler, logger_name):
# Ensure any logging handlers, which may contain a YoutubeDL instance,
# are removed when we close the request handler
# See: https://github.com/yt-dlp/yt-dlp/issues/8922
logging_handlers = logging.getLogger(logger_name).handlers
before_count = len(logging_handlers)
rh = handler()
assert len(logging_handlers) == before_count + 1
rh.close()
assert len(logging_handlers) == before_count
class TestUrllibRequestHandler(TestRequestHandlerBase):
@pytest.mark.parametrize('handler', ['Urllib'], indirect=True)
def test_file_urls(self, handler):
@ -820,6 +847,7 @@ def test_httplib_validation_errors(self, handler, req, match, version_check):
assert not isinstance(exc_info.value, TransportError)
@pytest.mark.parametrize('handler', ['Requests'], indirect=True)
class TestRequestsRequestHandler(TestRequestHandlerBase):
@pytest.mark.parametrize('raised,expected', [
(lambda: requests.exceptions.ConnectTimeout(), TransportError),
@ -836,7 +864,6 @@ class TestRequestsRequestHandler(TestRequestHandlerBase):
(lambda: requests.exceptions.RequestException(), RequestError)
# (lambda: requests.exceptions.TooManyRedirects(), HTTPError) - Needs a response object
])
@pytest.mark.parametrize('handler', ['Requests'], indirect=True)
def test_request_error_mapping(self, handler, monkeypatch, raised, expected):
with handler() as rh:
def mock_get_instance(*args, **kwargs):
@ -870,7 +897,6 @@ def request(self, *args, **kwargs):
'3 bytes read, 5 more expected'
),
])
@pytest.mark.parametrize('handler', ['Requests'], indirect=True)
def test_response_error_mapping(self, handler, monkeypatch, raised, expected, match):
from requests.models import Response as RequestsResponse
from urllib3.response import HTTPResponse as Urllib3Response
@ -889,6 +915,21 @@ def mock_read(*args, **kwargs):
assert exc_info.type is expected
def test_close(self, handler, monkeypatch):
rh = handler()
session = rh._get_instance(cookiejar=rh.cookiejar)
called = False
original_close = session.close
def mock_close(*args, **kwargs):
nonlocal called
called = True
return original_close(*args, **kwargs)
monkeypatch.setattr(session, 'close', mock_close)
rh.close()
assert called
def run_validation(handler, error, req, **handler_kwargs):
with handler(**handler_kwargs) as rh:
@ -1198,6 +1239,19 @@ def some_preference(rh, request):
assert director.send(Request('http://')).read() == b''
assert director.send(Request('http://', headers={'prefer': '1'})).read() == b'supported'
def test_close(self, monkeypatch):
director = RequestDirector(logger=FakeLogger())
director.add_handler(FakeRH(logger=FakeLogger()))
called = False
def mock_close(*args, **kwargs):
nonlocal called
called = True
monkeypatch.setattr(director.handlers[FakeRH.RH_KEY], 'close', mock_close)
director.close()
assert called
# XXX: do we want to move this to test_YoutubeDL.py?
class TestYoutubeDLNetworking:

View file

@ -8,13 +8,9 @@
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
import contextlib
import io
import platform
import random
import ssl
import urllib.error
import warnings
from yt_dlp.cookies import YoutubeDLCookieJar
from yt_dlp.dependencies import certifi
@ -30,7 +26,6 @@
from yt_dlp.networking.exceptions import (
HTTPError,
IncompleteRead,
_CompatHTTPError,
)
from yt_dlp.socks import ProxyType
from yt_dlp.utils.networking import HTTPHeaderDict
@ -179,11 +174,10 @@ class TestNetworkingExceptions:
def create_response(status):
return Response(fp=io.BytesIO(b'test'), url='http://example.com', headers={'tesT': 'test'}, status=status)
@pytest.mark.parametrize('http_error_class', [HTTPError, lambda r: _CompatHTTPError(HTTPError(r))])
def test_http_error(self, http_error_class):
def test_http_error(self):
response = self.create_response(403)
error = http_error_class(response)
error = HTTPError(response)
assert error.status == 403
assert str(error) == error.msg == 'HTTP Error 403: Forbidden'
@ -194,80 +188,12 @@ def test_http_error(self, http_error_class):
assert data == b'test'
assert repr(error) == '<HTTPError 403: Forbidden>'
@pytest.mark.parametrize('http_error_class', [HTTPError, lambda *args, **kwargs: _CompatHTTPError(HTTPError(*args, **kwargs))])
def test_redirect_http_error(self, http_error_class):
def test_redirect_http_error(self):
response = self.create_response(301)
error = http_error_class(response, redirect_loop=True)
error = HTTPError(response, redirect_loop=True)
assert str(error) == error.msg == 'HTTP Error 301: Moved Permanently (redirect loop detected)'
assert error.reason == 'Moved Permanently'
def test_compat_http_error(self):
response = self.create_response(403)
error = _CompatHTTPError(HTTPError(response))
assert isinstance(error, HTTPError)
assert isinstance(error, urllib.error.HTTPError)
@contextlib.contextmanager
def raises_deprecation_warning():
with warnings.catch_warnings(record=True) as w:
warnings.simplefilter('always')
yield
if len(w) == 0:
pytest.fail('Did not raise DeprecationWarning')
if len(w) > 1:
pytest.fail(f'Raised multiple warnings: {w}')
if not issubclass(w[-1].category, DeprecationWarning):
pytest.fail(f'Expected DeprecationWarning, got {w[-1].category}')
w.clear()
with raises_deprecation_warning():
assert error.code == 403
with raises_deprecation_warning():
assert error.getcode() == 403
with raises_deprecation_warning():
assert error.hdrs is error.response.headers
with raises_deprecation_warning():
assert error.info() is error.response.headers
with raises_deprecation_warning():
assert error.headers is error.response.headers
with raises_deprecation_warning():
assert error.filename == error.response.url
with raises_deprecation_warning():
assert error.url == error.response.url
with raises_deprecation_warning():
assert error.geturl() == error.response.url
# Passthrough file operations
with raises_deprecation_warning():
assert error.read() == b'test'
with raises_deprecation_warning():
assert not error.closed
with raises_deprecation_warning():
# Technically Response operations are also passed through, which should not be used.
assert error.get_header('test') == 'test'
# Should not raise a warning
error.close()
@pytest.mark.skipif(
platform.python_implementation() == 'PyPy', reason='garbage collector works differently in pypy')
def test_compat_http_error_autoclose(self):
# Compat HTTPError should not autoclose response
response = self.create_response(403)
_CompatHTTPError(HTTPError(response))
assert not response.closed
def test_incomplete_read_error(self):
error = IncompleteRead(4, 3, cause='test')
assert isinstance(error, IncompleteRead)

View file

@ -45,7 +45,6 @@
NoSupportingHandlers,
RequestError,
SSLError,
_CompatHTTPError,
network_exceptions,
)
from .plugins import directories as plugin_directories
@ -586,6 +585,13 @@ class YoutubeDL:
'http_headers', 'stretched_ratio', 'no_resume', 'has_drm', 'extra_param_to_segment_url', 'hls_aes', 'downloader_options',
'page_url', 'app', 'play_path', 'tc_url', 'flash_version', 'rtmp_live', 'rtmp_conn', 'rtmp_protocol', 'rtmp_real_time'
}
_deprecated_multivalue_fields = {
'album_artist': 'album_artists',
'artist': 'artists',
'composer': 'composers',
'creator': 'creators',
'genre': 'genres',
}
_format_selection_exts = {
'audio': set(MEDIA_EXTENSIONS.common_audio),
'video': set(MEDIA_EXTENSIONS.common_video + ('3gp', )),
@ -689,7 +695,6 @@ def process_color_policy(stream):
self.params['http_headers'] = HTTPHeaderDict(std_headers, self.params.get('http_headers'))
self._load_cookies(self.params['http_headers'].get('Cookie')) # compat
self.params['http_headers'].pop('Cookie', None)
self._request_director = self.build_request_director(_REQUEST_HANDLERS.values(), _RH_PREFERENCES)
if auto_init and auto_init != 'no_verbose_header':
self.print_debug_header()
@ -963,6 +968,7 @@ def __exit__(self, *args):
def close(self):
self.save_cookies()
self._request_director.close()
del self._request_director
def trouble(self, message=None, tb=None, is_error=True):
"""Determine action to take when a download problem appears.
@ -2457,7 +2463,7 @@ def selector_function(ctx):
# for extractors with incomplete formats (audio only (soundcloud)
# or video only (imgur)) best/worst will fallback to
# best/worst {video,audio}-only format
matches = formats
matches = list(filter(lambda f: f.get('vcodec') != 'none' or f.get('acodec') != 'none', formats))
elif seperate_fallback and not ctx['has_merged_format']:
# for compatibility with youtube-dl when there is no pre-merged format
matches = list(filter(seperate_fallback, formats))
@ -2646,6 +2652,14 @@ def _fill_common_fields(self, info_dict, final=True):
if final and info_dict.get('%s_number' % field) is not None and not info_dict.get(field):
info_dict[field] = '%s %d' % (field.capitalize(), info_dict['%s_number' % field])
for old_key, new_key in self._deprecated_multivalue_fields.items():
if new_key in info_dict and old_key in info_dict:
self.deprecation_warning(f'Do not return {old_key!r} when {new_key!r} is present')
elif old_value := info_dict.get(old_key):
info_dict[new_key] = old_value.split(', ')
elif new_value := info_dict.get(new_key):
info_dict[old_key] = ', '.join(v.replace(',', '\N{FULLWIDTH COMMA}') for v in new_value)
def _raise_pending_errors(self, info):
err = info.pop('__pending_error', None)
if err:
@ -3489,7 +3503,8 @@ def ffmpeg_fixup(cndn, msg, cls):
or info_dict.get('is_live') and self.params.get('hls_use_mpegts') is None,
'Possible MPEG-TS in MP4 container or malformed AAC timestamps',
FFmpegFixupM3u8PP)
ffmpeg_fixup(info_dict.get('is_live') and downloader == 'dashsegments',
ffmpeg_fixup(downloader == 'dashsegments'
and (info_dict.get('is_live') or info_dict.get('is_dash_periods')),
'Possible duplicate MOOV atoms', FFmpegFixupDuplicateMoovPP)
ffmpeg_fixup(downloader == 'web_socket_fragment', 'Malformed timestamps detected', FFmpegFixupTimestampPP)
@ -4115,8 +4130,6 @@ def urlopen(self, req):
'SSLV3_ALERT_HANDSHAKE_FAILURE: The server may not support the current cipher list. '
'Try using --legacy-server-connect', cause=e) from e
raise
except HTTPError as e: # TODO: Remove in a future release
raise _CompatHTTPError(e) from e
def build_request_director(self, handlers, preferences=None):
logger = _YDLLogger(self)
@ -4152,6 +4165,10 @@ def build_request_director(self, handlers, preferences=None):
director.preferences.add(lambda rh, _: 500 if rh.RH_KEY == 'Urllib' else 0)
return director
@functools.cached_property
def _request_director(self):
return self.build_request_director(_REQUEST_HANDLERS.values(), _RH_PREFERENCES)
def encode(self, s):
if isinstance(s, bytes):
return s # Already encoded

View file

@ -15,7 +15,7 @@
import time
import traceback
from .compat import compat_shlex_quote
from .compat import compat_os_name, compat_shlex_quote
from .cookies import SUPPORTED_BROWSERS, SUPPORTED_KEYRINGS
from .downloader.external import get_external_downloader
from .extractor import list_extractor_classes
@ -991,7 +991,28 @@ def _real_main(argv=None):
if pre_process:
return ydl._download_retcode
ydl.warn_if_short_id(sys.argv[1:] if argv is None else argv)
args = sys.argv[1:] if argv is None else argv
ydl.warn_if_short_id(args)
# Show a useful error message and wait for keypress if not launched from shell on Windows
if not args and compat_os_name == 'nt' and getattr(sys, 'frozen', False):
import ctypes.wintypes
import msvcrt
kernel32 = ctypes.WinDLL('Kernel32')
buffer = (1 * ctypes.wintypes.DWORD)()
attached_processes = kernel32.GetConsoleProcessList(buffer, 1)
# If we only have a single process attached, then the executable was double clicked
# When using `pyinstaller` with `--onefile`, two processes get attached
is_onefile = hasattr(sys, '_MEIPASS') and os.path.basename(sys._MEIPASS).startswith('_MEI')
if attached_processes == 1 or is_onefile and attached_processes == 2:
print(parser._generate_error_message(
'Do not double-click the executable, instead call it from a command line.\n'
'Please read the README for further information on how to use yt-dlp: '
'https://github.com/yt-dlp/yt-dlp#readme'))
msvcrt.getch()
_exit(2)
parser.error(
'You must provide at least one URL.\n'
'Type yt-dlp --help to see a list of all options.')

View file

@ -31,4 +31,4 @@ def get_hidden_imports():
hiddenimports = list(get_hidden_imports())
print(f'Adding imports: {hiddenimports}')
excludedimports = ['youtube_dl', 'youtube_dlc', 'test', 'ytdlp_plugins', 'devscripts']
excludedimports = ['youtube_dl', 'youtube_dlc', 'test', 'ytdlp_plugins', 'devscripts', 'bundle']

View file

@ -35,6 +35,7 @@
from ..dependencies import brotli as compat_brotli # noqa: F401
from ..dependencies import websockets as compat_websockets # noqa: F401
from ..dependencies.Cryptodome import AES as compat_pycrypto_AES # noqa: F401
from ..networking.exceptions import HTTPError as compat_HTTPError # noqa: F401
passthrough_module(__name__, '...utils', ('WINDOWS_VT_MODE', 'windows_enable_vt_mode'))
@ -70,7 +71,6 @@ def compat_setenv(key, value, env=os.environ):
compat_HTMLParser = compat_html_parser_HTMLParser = html.parser.HTMLParser
compat_http_client = http.client
compat_http_server = http.server
compat_HTTPError = urllib.error.HTTPError
compat_input = input
compat_integer_types = (int, )
compat_itertools_count = itertools.count
@ -88,7 +88,7 @@ def compat_setenv(key, value, env=os.environ):
compat_subprocess_get_DEVNULL = lambda: subprocess.DEVNULL
compat_tokenize_tokenize = tokenize.tokenize
compat_urllib_error = urllib.error
compat_urllib_HTTPError = urllib.error.HTTPError
compat_urllib_HTTPError = compat_HTTPError
compat_urllib_parse = urllib.parse
compat_urllib_parse_parse_qs = urllib.parse.parse_qs
compat_urllib_parse_quote = urllib.parse.quote

View file

@ -1,6 +1,7 @@
import base64
import collections
import contextlib
import glob
import http.cookiejar
import http.cookies
import io
@ -23,7 +24,8 @@
aes_gcm_decrypt_and_verify_bytes,
unpad_pkcs7,
)
from .compat import functools
from .compat import functools # isort: split
from .compat import compat_os_name
from .dependencies import (
_SECRETSTORAGE_UNAVAILABLE_REASON,
secretstorage,
@ -31,6 +33,7 @@
)
from .minicurses import MultilinePrinter, QuietMultilinePrinter
from .utils import (
DownloadError,
Popen,
error_to_str,
expand_path,
@ -122,13 +125,14 @@ def _extract_firefox_cookies(profile, container, logger):
return YoutubeDLCookieJar()
if profile is None:
search_root = _firefox_browser_dir()
search_roots = list(_firefox_browser_dirs())
elif _is_path(profile):
search_root = profile
search_roots = [profile]
else:
search_root = os.path.join(_firefox_browser_dir(), profile)
search_roots = [os.path.join(path, profile) for path in _firefox_browser_dirs()]
search_root = ', '.join(map(repr, search_roots))
cookie_database_path = _find_most_recently_used_file(search_root, 'cookies.sqlite', logger)
cookie_database_path = _newest(_firefox_cookie_dbs(search_roots))
if cookie_database_path is None:
raise FileNotFoundError(f'could not find firefox cookies database in {search_root}')
logger.debug(f'Extracting cookies from: "{cookie_database_path}"')
@ -182,12 +186,21 @@ def _extract_firefox_cookies(profile, container, logger):
cursor.connection.close()
def _firefox_browser_dir():
def _firefox_browser_dirs():
if sys.platform in ('cygwin', 'win32'):
return os.path.expandvars(R'%APPDATA%\Mozilla\Firefox\Profiles')
yield os.path.expandvars(R'%APPDATA%\Mozilla\Firefox\Profiles')
elif sys.platform == 'darwin':
return os.path.expanduser('~/Library/Application Support/Firefox/Profiles')
return os.path.expanduser('~/.mozilla/firefox')
yield os.path.expanduser('~/Library/Application Support/Firefox/Profiles')
else:
yield from map(os.path.expanduser, ('~/.mozilla/firefox', '~/snap/firefox/common/.mozilla/firefox'))
def _firefox_cookie_dbs(roots):
for root in map(os.path.abspath, roots):
for pattern in ('', '*/', 'Profiles/*/'):
yield from glob.iglob(os.path.join(root, pattern, 'cookies.sqlite'))
def _get_chromium_based_browser_settings(browser_name):
@ -268,7 +281,7 @@ def _extract_chrome_cookies(browser_name, profile, keyring, logger):
logger.error(f'{browser_name} does not support profiles')
search_root = config['browser_dir']
cookie_database_path = _find_most_recently_used_file(search_root, 'Cookies', logger)
cookie_database_path = _newest(_find_files(search_root, 'Cookies', logger))
if cookie_database_path is None:
raise FileNotFoundError(f'could not find {browser_name} cookies database in "{search_root}"')
logger.debug(f'Extracting cookies from: "{cookie_database_path}"')
@ -307,6 +320,12 @@ def _extract_chrome_cookies(browser_name, profile, keyring, logger):
counts['unencrypted'] = unencrypted_cookies
logger.debug(f'cookie version breakdown: {counts}')
return jar
except PermissionError as error:
if compat_os_name == 'nt' and error.errno == 13:
message = 'Could not copy Chrome cookie database. See https://github.com/yt-dlp/yt-dlp/issues/7271 for more info'
logger.error(message)
raise DownloadError(message) # force exit
raise
finally:
if cursor is not None:
cursor.connection.close()
@ -947,7 +966,7 @@ def _get_windows_v10_key(browser_root, logger):
References:
- [1] https://chromium.googlesource.com/chromium/src/+/refs/heads/main/components/os_crypt/sync/os_crypt_win.cc
"""
path = _find_most_recently_used_file(browser_root, 'Local State', logger)
path = _newest(_find_files(browser_root, 'Local State', logger))
if path is None:
logger.error('could not find local state file')
return None
@ -1049,17 +1068,20 @@ def _get_column_names(cursor, table_name):
return [row[1].decode() for row in table_info]
def _find_most_recently_used_file(root, filename, logger):
def _newest(files):
return max(files, key=lambda path: os.lstat(path).st_mtime, default=None)
def _find_files(root, filename, logger):
# if there are multiple browser profiles, take the most recently used one
i, paths = 0, []
i = 0
with _create_progress_bar(logger) as progress_bar:
for curr_root, dirs, files in os.walk(root):
for curr_root, _, files in os.walk(root):
for file in files:
i += 1
progress_bar.print(f'Searching for "{filename}": {i: 6d} files searched')
if file == filename:
paths.append(os.path.join(curr_root, file))
return None if not paths else max(paths, key=lambda path: os.lstat(path).st_mtime)
yield os.path.join(curr_root, file)
def _merge_cookie_jars(jars):
@ -1073,7 +1095,7 @@ def _merge_cookie_jars(jars):
def _is_path(value):
return os.path.sep in value
return any(sep in value for sep in (os.path.sep, os.path.altsep) if sep)
def _parse_browser_specification(browser_name, profile=None, keyring=None, container=None):

View file

@ -369,7 +369,10 @@ def fin_fragments():
return output.getvalue().encode()
self.download_and_append_fragments(
ctx, fragments, info_dict, pack_func=pack_fragment, finish_func=fin_fragments)
if len(fragments) == 1:
self.download_and_append_fragments(ctx, fragments, info_dict)
else:
self.download_and_append_fragments(
ctx, fragments, info_dict, pack_func=pack_fragment, finish_func=fin_fragments)
else:
return self.download_and_append_fragments(ctx, fragments, info_dict)

View file

@ -138,6 +138,10 @@
ARDMediathekCollectionIE,
ARDIE,
)
from .art19 import (
Art19IE,
Art19ShowIE,
)
from .arte import (
ArteTVIE,
ArteTVEmbedIE,
@ -253,6 +257,7 @@
from .bloomberg import BloombergIE
from .bokecc import BokeCCIE
from .bongacams import BongaCamsIE
from .boosty import BoostyIE
from .bostonglobe import BostonGlobeIE
from .box import BoxIE
from .boxcast import BoxCastVideoIE
@ -369,11 +374,11 @@
from .cliprs import ClipRsIE
from .closertotruth import CloserToTruthIE
from .cloudflarestream import CloudflareStreamIE
from .cloudycdn import CloudyCDNIE
from .clubic import ClubicIE
from .clyp import ClypIE
from .cmt import CMTIE
from .cnbc import (
CNBCIE,
CNBCVideoIE,
)
from .cnn import (
@ -564,6 +569,7 @@
EroProfileIE,
EroProfileAlbumIE,
)
from .err import ERRJupiterIE
from .ertgr import (
ERTFlixCodenameIE,
ERTFlixIE,
@ -588,6 +594,7 @@
FacebookPluginsVideoIE,
FacebookRedirectURLIE,
FacebookReelIE,
FacebookAdsIE,
)
from .fancode import (
FancodeVodIE,
@ -610,6 +617,7 @@
from .filmweb import FilmwebIE
from .firsttv import FirstTVIE
from .fivetv import FiveTVIE
from .flextv import FlexTVIE
from .flickr import FlickrIE
from .floatplane import (
FloatplaneIE,
@ -1000,6 +1008,11 @@
LRTVODIE,
LRTStreamIE
)
from .lsm import (
LSMLREmbedIE,
LSMLTVEmbedIE,
LSMReplayIE
)
from .lumni import (
LumniIE
)
@ -1111,6 +1124,7 @@
MotherlessIE,
MotherlessGroupIE,
MotherlessGalleryIE,
MotherlessUploaderIE,
)
from .motorsport import MotorsportIE
from .moviepilot import MoviepilotIE
@ -1137,6 +1151,11 @@
MusicdexArtistIE,
MusicdexPlaylistIE,
)
from .mx3 import (
Mx3IE,
Mx3NeoIE,
Mx3VolksmusikIE,
)
from .mxplayer import (
MxplayerIE,
MxplayerShowIE,
@ -1229,7 +1248,10 @@
NexxIE,
NexxEmbedIE,
)
from .nfb import NFBIE
from .nfb import (
NFBIE,
NFBSeriesIE,
)
from .nfhsnetwork import NFHSNetworkIE
from .nfl import (
NFLIE,
@ -1266,6 +1288,7 @@
NicovideoTagURLIE,
NiconicoLiveIE,
)
from .ninaprotocol import NinaProtocolIE
from .ninecninemedia import (
NineCNineMediaIE,
CPTwentyFourIE,
@ -1330,6 +1353,12 @@
NYTimesIE,
NYTimesArticleIE,
NYTimesCookingIE,
NYTimesCookingRecipeIE,
)
from .nuum import (
NuumLiveIE,
NuumTabIE,
NuumMediaIE,
)
from .nuvid import NuvidIE
from .nzherald import NZHeraldIE
@ -1372,6 +1401,7 @@
from .orf import (
ORFTVthekIE,
ORFFM4StoryIE,
ORFONIE,
ORFRadioIE,
ORFPodcastIE,
ORFIPTVIE,
@ -1496,7 +1526,7 @@
PuhuTVSerieIE,
)
from .pr0gramm import Pr0grammIE
from .prankcast import PrankCastIE
from .prankcast import PrankCastIE, PrankCastPostIE
from .premiershiprugby import PremiershipRugbyIE
from .presstv import PressTVIE
from .projectveritas import ProjectVeritasIE
@ -1593,6 +1623,7 @@
RedBullIE,
)
from .reddit import RedditIE
from .redge import RedCDNLivxIE
from .redgifs import (
RedGifsIE,
RedGifsSearchIE,
@ -1727,6 +1758,7 @@
)
from .scrolller import ScrolllerIE
from .seeker import SeekerIE
from .sejmpl import SejmIE
from .senalcolombia import SenalColombiaLiveIE
from .senategov import SenateISVPIE, SenateGovIE
from .sendtonews import SendtoNewsIE
@ -2289,11 +2321,6 @@
WashingtonPostIE,
WashingtonPostArticleIE,
)
from .wasdtv import (
WASDTVStreamIE,
WASDTVRecordIE,
WASDTVClipIE,
)
from .wat import WatIE
from .wdr import (
WDRIE,
@ -2472,6 +2499,7 @@
Zee5SeriesIE,
)
from .zeenews import ZeeNewsIE
from .zetland import ZetlandDKArticleIE
from .zhihu import ZhihuIE
from .zingmp3 import (
ZingMp3IE,

View file

@ -3,6 +3,7 @@
import json
import os
import random
import time
from .common import InfoExtractor
from ..aes import aes_cbc_decrypt_bytes, unpad_pkcs7
@ -17,6 +18,7 @@
int_or_none,
intlist_to_bytes,
long_to_bytes,
parse_iso8601,
pkcs1pad,
strip_or_none,
str_or_none,
@ -185,7 +187,10 @@ def _real_extract(self, url):
user = options['user']
if not user.get('hasAccess'):
self.raise_login_required()
start_date = traverse_obj(options, ('video', 'startDate', {str}))
if (parse_iso8601(start_date) or 0) > time.time():
raise ExtractorError(f'This video is not available yet. Release date: {start_date}', expected=True)
self.raise_login_required('This video requires a subscription', method='password')
token = self._download_json(
user.get('refreshTokenUrl') or (self._PLAYER_BASE_URL + 'refresh/token'),
@ -267,6 +272,9 @@ def _real_extract(self, url):
f['language'] = 'de'
formats.extend(m3u8_formats)
if not formats:
self.raise_login_required('This video requires a subscription', method='password')
video = (self._download_json(
self._API_BASE_URL + 'video/%s' % video_id, video_id,
'Downloading additional video metadata', fatal=False) or {}).get('video') or {}

View file

@ -22,7 +22,7 @@ class AltCensoredIE(InfoExtractor):
'title': "QUELLES SONT LES CONSÉQUENCES DE L'HYPERSEXUALISATION DE LA SOCIÉTÉ ?",
'display_id': 'k0srjLSkga8.webm',
'release_date': '20180403',
'creator': 'Virginie Vota',
'creators': ['Virginie Vota'],
'release_year': 2018,
'upload_date': '20230318',
'uploader': 'admin@altcensored.com',
@ -32,7 +32,7 @@ class AltCensoredIE(InfoExtractor):
'duration': 926.09,
'thumbnail': 'https://archive.org/download/youtube-k0srjLSkga8/youtube-k0srjLSkga8.thumbs/k0srjLSkga8_000925.jpg',
'view_count': int,
'categories': ['News & Politics'],
'categories': ['News & Politics'], # FIXME
}
}]
@ -62,14 +62,21 @@ class AltCensoredChannelIE(InfoExtractor):
'title': 'Virginie Vota',
'id': 'UCFPTO55xxHqFqkzRZHu4kcw',
},
'playlist_count': 91
'playlist_count': 85,
}, {
'url': 'https://altcensored.com/channel/UC9CcJ96HKMWn0LZlcxlpFTw',
'info_dict': {
'title': 'yukikaze775',
'id': 'UC9CcJ96HKMWn0LZlcxlpFTw',
},
'playlist_count': 4
'playlist_count': 4,
}, {
'url': 'https://altcensored.com/channel/UCfYbb7nga6-icsFWWgS-kWw',
'info_dict': {
'title': 'Mister Metokur',
'id': 'UCfYbb7nga6-icsFWWgS-kWw',
},
'playlist_count': 121,
}]
def _real_extract(self, url):
@ -78,7 +85,7 @@ def _real_extract(self, url):
url, channel_id, 'Download channel webpage', 'Unable to get channel webpage')
title = self._html_search_meta('altcen_title', webpage, 'title', fatal=False)
page_count = int_or_none(self._html_search_regex(
r'<a[^>]+href="/channel/\w+/page/(\d+)">(?:\1)</a>',
r'<a[^>]+href="/channel/[\w-]+/page/(\d+)">(?:\1)</a>',
webpage, 'page count', default='1'))
def page_func(page_num):

View file

@ -78,14 +78,14 @@ class Ant1NewsGrArticleIE(AntennaBaseIE):
_TESTS = [{
'url': 'https://www.ant1news.gr/afieromata/article/549468/o-tzeims-mpont-sta-meteora-oi-apeiles-kai-o-xesikomos-ton-kalogeron',
'md5': '294f18331bb516539d72d85a82887dcc',
'md5': '57eb8d12181f0fa2b14b0b138e1de9b6',
'info_dict': {
'id': '_xvg/m_cmbatw=',
'ext': 'mp4',
'title': 'md5:a93e8ecf2e4073bfdffcb38f59945411',
'timestamp': 1603092840,
'upload_date': '20201019',
'thumbnail': 'https://ant1media.azureedge.net/imgHandler/640/756206d2-d640-40e2-b201-3555abdfc0db.jpg',
'timestamp': 1666166520,
'upload_date': '20221019',
'thumbnail': 'https://ant1media.azureedge.net/imgHandler/1920/756206d2-d640-40e2-b201-3555abdfc0db.jpg',
},
}, {
'url': 'https://ant1news.gr/Society/article/620286/symmoria-anilikon-dikigoros-thymaton-ithelan-na-toys-apoteleiosoyn',
@ -117,7 +117,7 @@ class Ant1NewsGrEmbedIE(AntennaBaseIE):
_BASE_PLAYER_URL_RE = r'(?:https?:)?//(?:[a-zA-Z0-9\-]+\.)?(?:antenna|ant1news)\.gr/templates/pages/player'
_VALID_URL = rf'{_BASE_PLAYER_URL_RE}\?([^#]+&)?cid=(?P<id>[^#&]+)'
_EMBED_REGEX = [rf'<iframe[^>]+?src=(?P<_q1>["\'])(?P<url>{_BASE_PLAYER_URL_RE}\?(?:(?!(?P=_q1)).)+)(?P=_q1)']
_API_PATH = '/news/templates/data/jsonPlayer'
_API_PATH = '/templates/data/jsonPlayer'
_TESTS = [{
'url': 'https://www.antenna.gr/templates/pages/player?cid=3f_li_c_az_jw_y_u=&w=670&h=377',

View file

@ -300,7 +300,7 @@ def _real_extract(self, url):
is_logged_in = bool(self._get_cookies('https://archive.org').get('logged-in-sig'))
if extension in KNOWN_EXTENSIONS and (not f.get('private') or is_logged_in):
entry['formats'].append({
'url': 'https://archive.org/download/' + identifier + '/' + f['name'],
'url': 'https://archive.org/download/' + identifier + '/' + urllib.parse.quote(f['name']),
'format': f.get('format'),
'width': int_or_none(f.get('width')),
'height': int_or_none(f.get('height')),

View file

@ -8,6 +8,7 @@
determine_ext,
int_or_none,
join_nonempty,
jwt_decode_hs256,
make_archive_id,
parse_duration,
parse_iso8601,
@ -238,6 +239,7 @@ class ARDBetaMediathekIE(InfoExtractor):
(?P<id>[a-zA-Z0-9]+)
/?(?:[?#]|$)'''
_GEO_COUNTRIES = ['DE']
_TOKEN_URL = 'https://sso.ardmediathek.de/sso/token'
_TESTS = [{
'url': 'https://www.ardmediathek.de/video/filme-im-mdr/liebe-auf-vier-pfoten/mdr-fernsehen/Y3JpZDovL21kci5kZS9zZW5kdW5nLzI4MjA0MC80MjIwOTEtNDAyNTM0',
@ -359,12 +361,27 @@ def _extract_episode_info(self, title):
def _real_extract(self, url):
display_id = self._match_id(url)
query = {'embedded': 'false', 'mcV6': 'true'}
headers = {}
if self._get_cookies(self._TOKEN_URL).get('ams'):
token = self._download_json(
self._TOKEN_URL, display_id, 'Fetching token for age verification',
'Unable to fetch age verification token', fatal=False)
id_token = traverse_obj(token, ('idToken', {str}))
decoded_token = traverse_obj(id_token, ({jwt_decode_hs256}, {dict}))
user_id = traverse_obj(decoded_token, (('user_id', 'sub'), {str}), get_all=False)
if not user_id:
self.report_warning('Unable to extract token, continuing without authentication')
else:
headers['x-authorization'] = f'Bearer {id_token}'
query['userId'] = user_id
if decoded_token.get('age_rating') != 18:
self.report_warning('Account is not verified as 18+; video may be unavailable')
page_data = self._download_json(
f'https://api.ardmediathek.de/page-gateway/pages/ard/item/{display_id}', display_id, query={
'embedded': 'false',
'mcV6': 'true',
})
f'https://api.ardmediathek.de/page-gateway/pages/ard/item/{display_id}',
display_id, query=query, headers=headers)
# For user convenience we use the old contentId instead of the longer crid
# Ref: https://github.com/yt-dlp/yt-dlp/issues/8731#issuecomment-1874398283
@ -383,7 +400,7 @@ def _real_extract(self, url):
media_data = traverse_obj(player_data, ('mediaCollection', 'embedded', {dict}))
if player_data.get('blockedByFsk'):
self.raise_no_formats('This video is only available after 22:00', expected=True)
self.raise_login_required('This video is only available for age verified users or after 22:00')
formats = []
subtitles = {}

303
yt_dlp/extractor/art19.py Normal file
View file

@ -0,0 +1,303 @@
import re
from .common import InfoExtractor
from ..utils import float_or_none, int_or_none, parse_iso8601, url_or_none
from ..utils.traversal import traverse_obj
class Art19IE(InfoExtractor):
_UUID_REGEX = r'[\da-f]{8}-?[\da-f]{4}-?[\da-f]{4}-?[\da-f]{4}-?[\da-f]{12}'
_VALID_URL = [
rf'https?://(?:www\.)?art19\.com/shows/[^/#?]+/episodes/(?P<id>{_UUID_REGEX})',
rf'https?://rss\.art19\.com/episodes/(?P<id>{_UUID_REGEX})\.mp3',
]
_EMBED_REGEX = [rf'<iframe[^>]+\bsrc=[\'"](?P<url>{_VALID_URL[0]})']
_TESTS = [{
'url': 'https://rss.art19.com/episodes/5ba1413c-48b8-472b-9cc3-cfd952340bdb.mp3',
'info_dict': {
'id': '5ba1413c-48b8-472b-9cc3-cfd952340bdb',
'ext': 'mp3',
'title': 'Why Did DeSantis Drop Out?',
'series': 'The Daily Briefing',
'release_timestamp': 1705941275,
'description': 'md5:da38961da4a3f7e419471365e3c6b49f',
'episode': 'Episode 582',
'thumbnail': r're:^https?://content\.production\.cdn\.art19\.com.*\.jpeg$',
'series_id': 'ed52a0ab-08b1-4def-8afc-549e4d93296d',
'upload_date': '20240122',
'timestamp': 1705940815,
'episode_number': 582,
'modified_date': '20240122',
'episode_id': '5ba1413c-48b8-472b-9cc3-cfd952340bdb',
'modified_timestamp': 1705941275,
'release_date': '20240122',
'duration': 527.4,
},
}, {
'url': 'https://art19.com/shows/scamfluencers/episodes/8319b776-4153-4d22-8630-631f204a03dd',
'info_dict': {
'id': '8319b776-4153-4d22-8630-631f204a03dd',
'ext': 'mp3',
'title': 'Martha Stewart: The Homemaker Hustler Part 2',
'modified_date': '20240116',
'upload_date': '20240105',
'modified_timestamp': 1705435802,
'episode_id': '8319b776-4153-4d22-8630-631f204a03dd',
'series_id': 'd3c9b8ca-26b3-42f4-9bd8-21d1a9031e75',
'thumbnail': r're:^https?://content\.production\.cdn\.art19\.com.*\.jpeg$',
'description': 'md5:4aa7cfd1358dc57e729835bc208d7893',
'release_timestamp': 1705305660,
'release_date': '20240115',
'timestamp': 1704481536,
'episode_number': 88,
'series': 'Scamfluencers',
'duration': 2588.37501,
'episode': 'Episode 88',
},
}]
_WEBPAGE_TESTS = [{
'url': 'https://www.nu.nl/formule-1/6291456/verstappen-wordt-een-synoniem-voor-formule-1.html',
'info_dict': {
'id': '7d42626a-7301-47db-bb8a-3b6f054d77d7',
'ext': 'mp3',
'title': "'Verstappen wordt een synoniem voor Formule 1'",
'season': 'Seizoen 6',
'description': 'md5:39a7159a31c4cda312b2e893bdd5c071',
'episode_id': '7d42626a-7301-47db-bb8a-3b6f054d77d7',
'duration': 3061.82111,
'series_id': '93f4e113-2a60-4609-a564-755058fa40d8',
'release_date': '20231126',
'modified_timestamp': 1701156004,
'thumbnail': r're:^https?://content\.production\.cdn\.art19\.com.*\.jpeg$',
'season_number': 6,
'episode_number': 52,
'modified_date': '20231128',
'upload_date': '20231126',
'timestamp': 1701025981,
'season_id': '36097c1e-7455-490d-a2fe-e2f10b4d5f26',
'series': 'De Boordradio',
'release_timestamp': 1701026308,
'episode': 'Episode 52',
},
}, {
'url': 'https://www.wishtv.com/podcast-episode/larry-bucshon-announces-retirement-from-congress/',
'info_dict': {
'id': '8da368bd-08d1-46d0-afaa-c134a4af7dc0',
'ext': 'mp3',
'title': 'Larry Bucshon announces retirement from congress',
'upload_date': '20240115',
'episode_number': 148,
'episode': 'Episode 148',
'thumbnail': r're:^https?://content\.production\.cdn\.art19\.com.*\.jpeg$',
'release_date': '20240115',
'timestamp': 1705328205,
'release_timestamp': 1705329275,
'series': 'All INdiana Politics',
'modified_date': '20240117',
'modified_timestamp': 1705458901,
'series_id': 'c4af6c27-b10f-4ff2-9f84-0f407df86ff1',
'episode_id': '8da368bd-08d1-46d0-afaa-c134a4af7dc0',
'description': 'md5:53b5239e4d14973a87125c217c255b2a',
'duration': 1256.18848,
},
}]
@classmethod
def _extract_embed_urls(cls, url, webpage):
yield from super()._extract_embed_urls(url, webpage)
for episode_id in re.findall(
rf'<div[^>]+\bclass=[\'"][^\'"]*art19-web-player[^\'"]*[\'"][^>]+\bdata-episode-id=[\'"]({cls._UUID_REGEX})[\'"]', webpage):
yield f'https://rss.art19.com/episodes/{episode_id}.mp3'
def _real_extract(self, url):
episode_id = self._match_id(url)
player_metadata = self._download_json(
f'https://art19.com/episodes/{episode_id}', episode_id,
note='Downloading player metadata', fatal=False,
headers={'Accept': 'application/vnd.art19.v0+json'})
rss_metadata = self._download_json(
f'https://rss.art19.com/episodes/{episode_id}.json', episode_id, fatal=False,
note='Downloading RSS metadata')
formats = [{
'format_id': 'direct',
'url': f'https://rss.art19.com/episodes/{episode_id}.mp3',
'vcodec': 'none',
'acodec': 'mp3',
}]
for fmt_id, fmt_data in traverse_obj(rss_metadata, ('content', 'media', {dict.items}, ...)):
if fmt_id == 'waveform_bin':
continue
fmt_url = traverse_obj(fmt_data, ('url', {url_or_none}))
if not fmt_url:
continue
formats.append({
'format_id': fmt_id,
'url': fmt_url,
'vcodec': 'none',
'acodec': fmt_id,
'quality': -2 if fmt_id == 'ogg' else -1,
})
return {
'id': episode_id,
'formats': formats,
**traverse_obj(player_metadata, ('episode', {
'title': ('title', {str}),
'description': ('description_plain', {str}),
'episode_id': ('id', {str}),
'episode_number': ('episode_number', {int_or_none}),
'season_id': ('season_id', {str}),
'series_id': ('series_id', {str}),
'timestamp': ('created_at', {parse_iso8601}),
'release_timestamp': ('released_at', {parse_iso8601}),
'modified_timestamp': ('updated_at', {parse_iso8601})
})),
**traverse_obj(rss_metadata, ('content', {
'title': ('episode_title', {str}),
'description': ('episode_description_plain', {str}),
'episode_id': ('episode_id', {str}),
'episode_number': ('episode_number', {int_or_none}),
'season': ('season_title', {str}),
'season_id': ('season_id', {str}),
'season_number': ('season_number', {int_or_none}),
'series': ('series_title', {str}),
'series_id': ('series_id', {str}),
'thumbnail': ('cover_image', {url_or_none}),
'duration': ('duration', {float_or_none}),
})),
}
class Art19ShowIE(InfoExtractor):
_VALID_URL_BASE = r'https?://(?:www\.)?art19\.com/shows/(?P<id>[\w-]+)(?:/embed)?/?'
_VALID_URL = [
rf'{_VALID_URL_BASE}(?:$|[#?])',
r'https?://rss\.art19\.com/(?P<id>[\w-]+)/?(?:$|[#?])',
]
_EMBED_REGEX = [rf'<iframe[^>]+\bsrc=[\'"](?P<url>{_VALID_URL_BASE}[^\'"])']
_TESTS = [{
'url': 'https://www.art19.com/shows/5898c087-a14f-48dc-b6fc-a2280a1ff6e0/',
'info_dict': {
'_type': 'playlist',
'id': '5898c087-a14f-48dc-b6fc-a2280a1ff6e0',
'display_id': 'echt-gebeurd',
'title': 'Echt Gebeurd',
'description': 'md5:5fd11dc80b76e51ffd34b6067fd5e560',
'timestamp': 1492642167,
'upload_date': '20170419',
'modified_timestamp': int,
'modified_date': str,
'tags': 'count:7',
},
'playlist_mincount': 425,
}, {
'url': 'https://www.art19.com/shows/echt-gebeurd',
'info_dict': {
'_type': 'playlist',
'id': '5898c087-a14f-48dc-b6fc-a2280a1ff6e0',
'display_id': 'echt-gebeurd',
'title': 'Echt Gebeurd',
'description': 'md5:5fd11dc80b76e51ffd34b6067fd5e560',
'timestamp': 1492642167,
'upload_date': '20170419',
'modified_timestamp': int,
'modified_date': str,
'tags': 'count:7',
},
'playlist_mincount': 425,
}, {
'url': 'https://rss.art19.com/scamfluencers',
'info_dict': {
'_type': 'playlist',
'id': 'd3c9b8ca-26b3-42f4-9bd8-21d1a9031e75',
'display_id': 'scamfluencers',
'title': 'Scamfluencers',
'description': 'md5:7d239d670c0ced6dadbf71c4caf764b7',
'timestamp': 1647368573,
'upload_date': '20220315',
'modified_timestamp': int,
'modified_date': str,
'tags': [],
},
'playlist_mincount': 90,
}, {
'url': 'https://art19.com/shows/enthuellt/embed',
'info_dict': {
'_type': 'playlist',
'id': 'e2cacf57-bb8a-4263-aa81-719bcdd4f80c',
'display_id': 'enthuellt',
'title': 'Enthüllt',
'description': 'md5:17752246643414a2fd51744fc9a1c08e',
'timestamp': 1601645860,
'upload_date': '20201002',
'modified_timestamp': int,
'modified_date': str,
'tags': 'count:10',
},
'playlist_mincount': 10,
}]
_WEBPAGE_TESTS = [{
'url': 'https://deconstructingyourself.com/deconstructing-yourself-podcast',
'info_dict': {
'_type': 'playlist',
'id': 'cfbb9b01-c295-4adb-8726-adde7c03cf21',
'display_id': 'deconstructing-yourself',
'title': 'Deconstructing Yourself',
'description': 'md5:dab5082b28b248a35476abf64768854d',
'timestamp': 1570581181,
'upload_date': '20191009',
'modified_timestamp': int,
'modified_date': str,
'tags': 'count:5',
},
'playlist_mincount': 80,
}, {
'url': 'https://chicagoreader.com/columns-opinion/podcasts/ben-joravsky-show-podcast-episodes/',
'info_dict': {
'_type': 'playlist',
'id': '9dfa2c37-ab87-4c13-8388-4897914313ec',
'display_id': 'the-ben-joravsky-show',
'title': 'The Ben Joravsky Show',
'description': 'md5:c0f3ec0ee0dbea764390e521adc8780a',
'timestamp': 1550875095,
'upload_date': '20190222',
'modified_timestamp': int,
'modified_date': str,
'tags': ['Chicago Politics', 'chicago', 'Ben Joravsky'],
},
'playlist_mincount': 1900,
}]
@classmethod
def _extract_embed_urls(cls, url, webpage):
yield from super()._extract_embed_urls(url, webpage)
for series_id in re.findall(
r'<div[^>]+\bclass=[\'"][^\'"]*art19-web-player[^\'"]*[\'"][^>]+\bdata-series-id=[\'"]([\w-]+)[\'"]', webpage):
yield f'https://art19.com/shows/{series_id}'
def _real_extract(self, url):
series_id = self._match_id(url)
series_metadata = self._download_json(
f'https://art19.com/series/{series_id}', series_id, note='Downloading series metadata',
headers={'Accept': 'application/vnd.art19.v0+json'})
return {
'_type': 'playlist',
'entries': [
self.url_result(f'https://rss.art19.com/episodes/{episode_id}.mp3', Art19IE)
for episode_id in traverse_obj(series_metadata, ('series', 'episode_ids', ..., {str}))
],
**traverse_obj(series_metadata, ('series', {
'id': ('id', {str}),
'display_id': ('slug', {str}),
'title': ('title', {str}),
'description': ('description_plain', {str}),
'timestamp': ('created_at', {parse_iso8601}),
'modified_timestamp': ('updated_at', {parse_iso8601}),
})),
'tags': traverse_obj(series_metadata, ('tags', ..., 'name', {str})),
}

View file

@ -7,6 +7,7 @@
import re
import time
import urllib.parse
import uuid
from .common import InfoExtractor, SearchInfoExtractor
from ..dependencies import Cryptodome
@ -1304,6 +1305,26 @@ class BilibiliPlaylistIE(BilibiliSpaceListBaseIE):
'upload_date': '20211127',
},
'playlist_mincount': 513,
}, {
'url': 'https://www.bilibili.com/list/1958703906?sid=547718&oid=687146339&bvid=BV1DU4y1r7tz',
'info_dict': {
'id': 'BV1DU4y1r7tz',
'ext': 'mp4',
'title': '【直播回放】8.20晚9:30 3d发布喵 2022年8月20日21点场',
'upload_date': '20220820',
'description': '',
'timestamp': 1661016330,
'uploader_id': '1958703906',
'uploader': '靡烟miya',
'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)$',
'duration': 9552.903,
'tags': list,
'comment_count': int,
'view_count': int,
'like_count': int,
'_old_archive_ids': ['bilibili 687146339_part1'],
},
'params': {'noplaylist': True},
}, {
'url': 'https://www.bilibili.com/medialist/play/1958703906?business=space_series&business_id=547718&desc=1',
'info_dict': {
@ -1355,6 +1376,11 @@ def _extract_medialist(self, query, list_id):
def _real_extract(self, url):
list_id = self._match_id(url)
bvid = traverse_obj(parse_qs(url), ('bvid', 0))
if not self._yes_playlist(list_id, bvid):
return self.url_result(f'https://www.bilibili.com/video/{bvid}', BiliBiliIE)
webpage = self._download_webpage(url, list_id)
initial_state = self._search_json(r'window\.__INITIAL_STATE__\s*=', webpage, 'initial state', list_id)
if traverse_obj(initial_state, ('error', 'code', {int_or_none})) != 200:
@ -1464,8 +1490,37 @@ class BiliBiliSearchIE(SearchInfoExtractor):
IE_DESC = 'Bilibili video search'
_MAX_RESULTS = 100000
_SEARCH_KEY = 'bilisearch'
_TESTS = [{
'url': 'bilisearch3:靡烟 出道一年,我怎么还在等你单推的女人睡觉后开播啊',
'playlist_count': 3,
'info_dict': {
'id': '靡烟 出道一年,我怎么还在等你单推的女人睡觉后开播啊',
'title': '靡烟 出道一年,我怎么还在等你单推的女人睡觉后开播啊',
},
'playlist': [{
'info_dict': {
'id': 'BV1n44y1Q7sc',
'ext': 'mp4',
'title': '“出道一年我怎么还在等你单推的女人睡觉后开播啊”【一分钟了解靡烟miya】',
'timestamp': 1669889987,
'upload_date': '20221201',
'description': 'md5:43343c0973defff527b5a4b403b4abf9',
'tags': list,
'uploader': '靡烟miya',
'duration': 123.156,
'uploader_id': '1958703906',
'comment_count': int,
'view_count': int,
'like_count': int,
'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)$',
'_old_archive_ids': ['bilibili 988222410_part1'],
},
}],
}]
def _search_results(self, query):
if not self._get_cookies('https://api.bilibili.com').get('buvid3'):
self._set_cookie('.bilibili.com', 'buvid3', f'{uuid.uuid4()}infoc')
for page_num in itertools.count(1):
videos = self._download_json(
'https://api.bilibili.com/x/web-interface/search/type', query,
@ -1941,7 +1996,7 @@ def _extract_video_metadata(self, url, video_id, season_id):
'title': get_element_by_class(
'bstar-meta__title', webpage) or self._html_search_meta('og:title', webpage),
'description': get_element_by_class(
'bstar-meta__desc', webpage) or self._html_search_meta('og:description'),
'bstar-meta__desc', webpage) or self._html_search_meta('og:description', webpage),
}, self._search_json_ld(webpage, video_id, default={}))
def _get_comments_reply(self, root_id, next_id=0, display_id=None):

209
yt_dlp/extractor/boosty.py Normal file
View file

@ -0,0 +1,209 @@
from .common import InfoExtractor
from .youtube import YoutubeIE
from ..utils import (
ExtractorError,
int_or_none,
qualities,
str_or_none,
url_or_none,
)
from ..utils.traversal import traverse_obj
class BoostyIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?boosty\.to/(?P<user>[^/#?]+)/posts/(?P<post_id>[^/#?]+)'
_TESTS = [{
# single ok_video
'url': 'https://boosty.to/kuplinov/posts/e55d050c-e3bb-4873-a7db-ac7a49b40c38',
'info_dict': {
'id': 'd7473824-352e-48e2-ae53-d4aa39459968',
'title': 'phasma_3',
'channel': 'Kuplinov',
'channel_id': '7958701',
'timestamp': 1655031975,
'upload_date': '20220612',
'release_timestamp': 1655049000,
'release_date': '20220612',
'modified_timestamp': 1668680993,
'modified_date': '20221117',
'tags': ['куплинов', 'phasmophobia'],
'like_count': int,
'ext': 'mp4',
'duration': 105,
'view_count': int,
'thumbnail': r're:^https://i\.mycdn\.me/videoPreview\?',
},
}, {
# multiple ok_video
'url': 'https://boosty.to/maddyson/posts/0c652798-3b35-471f-8b48-a76a0b28736f',
'info_dict': {
'id': '0c652798-3b35-471f-8b48-a76a0b28736f',
'title': 'то что не пропустил юта6',
'channel': 'Илья Давыдов',
'channel_id': '6808257',
'timestamp': 1694017040,
'upload_date': '20230906',
'release_timestamp': 1694017040,
'release_date': '20230906',
'modified_timestamp': 1694071178,
'modified_date': '20230907',
'like_count': int,
},
'playlist_count': 3,
'playlist': [{
'info_dict': {
'id': 'cc325a9f-a563-41c6-bf47-516c1b506c9a',
'title': 'то что не пропустил юта6',
'channel': 'Илья Давыдов',
'channel_id': '6808257',
'timestamp': 1694017040,
'upload_date': '20230906',
'release_timestamp': 1694017040,
'release_date': '20230906',
'modified_timestamp': 1694071178,
'modified_date': '20230907',
'like_count': int,
'ext': 'mp4',
'duration': 31204,
'view_count': int,
'thumbnail': r're:^https://i\.mycdn\.me/videoPreview\?',
},
}, {
'info_dict': {
'id': 'd07b0a72-9493-4512-b54e-55ce468fd4b7',
'title': 'то что не пропустил юта6',
'channel': 'Илья Давыдов',
'channel_id': '6808257',
'timestamp': 1694017040,
'upload_date': '20230906',
'release_timestamp': 1694017040,
'release_date': '20230906',
'modified_timestamp': 1694071178,
'modified_date': '20230907',
'like_count': int,
'ext': 'mp4',
'duration': 25704,
'view_count': int,
'thumbnail': r're:^https://i\.mycdn\.me/videoPreview\?',
},
}, {
'info_dict': {
'id': '4a3bba32-78c8-422a-9432-2791aff60b42',
'title': 'то что не пропустил юта6',
'channel': 'Илья Давыдов',
'channel_id': '6808257',
'timestamp': 1694017040,
'upload_date': '20230906',
'release_timestamp': 1694017040,
'release_date': '20230906',
'modified_timestamp': 1694071178,
'modified_date': '20230907',
'like_count': int,
'ext': 'mp4',
'duration': 31867,
'view_count': int,
'thumbnail': r're:^https://i\.mycdn\.me/videoPreview\?',
},
}],
}, {
# single external video (youtube)
'url': 'https://boosty.to/denischuzhoy/posts/6094a487-bcec-4cf8-a453-43313b463c38',
'info_dict': {
'id': 'EXelTnve5lY',
'title': 'Послание Президента Федеральному Собранию | Класс народа',
'upload_date': '20210425',
'channel': 'Денис Чужой',
'tags': 'count:10',
'like_count': int,
'ext': 'mp4',
'duration': 816,
'view_count': int,
'thumbnail': r're:^https://i\.ytimg\.com/',
'age_limit': 0,
'availability': 'public',
'categories': list,
'channel_follower_count': int,
'channel_id': 'UCCzVNbWZfYpBfyofCCUD_0w',
'channel_is_verified': bool,
'channel_url': r're:^https://www\.youtube\.com/',
'comment_count': int,
'description': str,
'heatmap': 'count:100',
'live_status': str,
'playable_in_embed': bool,
'uploader': str,
'uploader_id': str,
'uploader_url': r're:^https://www\.youtube\.com/',
},
}]
_MP4_TYPES = ('tiny', 'lowest', 'low', 'medium', 'high', 'full_hd', 'quad_hd', 'ultra_hd')
def _extract_formats(self, player_urls, video_id):
formats = []
quality = qualities(self._MP4_TYPES)
for player_url in traverse_obj(player_urls, lambda _, v: url_or_none(v['url'])):
url = player_url['url']
format_type = player_url.get('type')
if format_type in ('hls', 'hls_live', 'live_ondemand_hls', 'live_playback_hls'):
formats.extend(self._extract_m3u8_formats(url, video_id, m3u8_id='hls', fatal=False))
elif format_type in ('dash', 'dash_live', 'live_playback_dash'):
formats.extend(self._extract_mpd_formats(url, video_id, mpd_id='dash', fatal=False))
elif format_type in self._MP4_TYPES:
formats.append({
'url': url,
'ext': 'mp4',
'format_id': format_type,
'quality': quality(format_type),
})
else:
self.report_warning(f'Unknown format type: {format_type!r}')
return formats
def _real_extract(self, url):
user, post_id = self._match_valid_url(url).group('user', 'post_id')
post = self._download_json(
f'https://api.boosty.to/v1/blog/{user}/post/{post_id}', post_id,
note='Downloading post data', errnote='Unable to download post data')
post_title = post.get('title')
if not post_title:
self.report_warning('Unable to extract post title. Falling back to parsing html page')
webpage = self._download_webpage(url, video_id=post_id)
post_title = self._og_search_title(webpage, default=None) or self._html_extract_title(webpage)
common_metadata = {
'title': post_title,
**traverse_obj(post, {
'channel': ('user', 'name', {str}),
'channel_id': ('user', 'id', {str_or_none}),
'timestamp': ('createdAt', {int_or_none}),
'release_timestamp': ('publishTime', {int_or_none}),
'modified_timestamp': ('updatedAt', {int_or_none}),
'tags': ('tags', ..., 'title', {str}),
'like_count': ('count', 'likes', {int_or_none}),
}),
}
entries = []
for item in traverse_obj(post, ('data', ..., {dict})):
item_type = item.get('type')
if item_type == 'video' and url_or_none(item.get('url')):
entries.append(self.url_result(item['url'], YoutubeIE))
elif item_type == 'ok_video':
video_id = item.get('id') or post_id
entries.append({
'id': video_id,
'formats': self._extract_formats(item.get('playerUrls'), video_id),
**common_metadata,
**traverse_obj(item, {
'title': ('title', {str}),
'duration': ('duration', {int_or_none}),
'view_count': ('viewsCounter', {int_or_none}),
'thumbnail': (('previewUrl', 'defaultPreview'), {url_or_none}),
}, get_all=False)})
if not entries:
raise ExtractorError('No videos found', expected=True)
if len(entries) == 1:
return entries[0]
return self.playlist_result(entries, post_id, post_title, **common_metadata)

View file

@ -1,6 +1,7 @@
from .common import InfoExtractor
from ..utils import (
clean_html,
determine_ext,
int_or_none,
parse_duration,
parse_resolution,
@ -60,6 +61,7 @@ def _real_extract(self, url):
'http://dinamics.ccma.cat/pvideo/media.jsp', media_id, query={
'media': media_type,
'idint': media_id,
'format': 'dm',
})
formats = []
@ -69,6 +71,10 @@ def _real_extract(self, url):
format_url = url_or_none(format_.get('file'))
if not format_url:
continue
if determine_ext(format_url) == 'mpd':
formats.extend(self._extract_mpd_formats(
format_url, media_id, mpd_id='dash', fatal=False))
continue
label = format_.get('label')
f = parse_resolution(label)
f.update({

View file

@ -67,7 +67,10 @@ def _real_extract(self, url):
html = self._download_webpage(url, video_id)
idetails = self._search_nextjs_data(html, video_id)['props']['pageProps']['idetails']
if idetails.get('err_code') == 1200:
err_code = idetails.get('err_code')
if err_code == 1002:
self.raise_login_required()
elif err_code == 1200:
self.raise_geo_restricted(
'This video is not available from your location due to geo restriction. '
'You may be able to bypass it by using the /details/ page instead of the /watch/ page',

View file

@ -4,27 +4,25 @@
class CloudflareStreamIE(InfoExtractor):
_SUBDOMAIN_RE = r'(?:(?:watch|iframe|customer-\w+)\.)?'
_DOMAIN_RE = r'(?:cloudflarestream\.com|(?:videodelivery|bytehighway)\.net)'
_EMBED_RE = r'embed\.%s/embed/[^/]+\.js\?.*?\bvideo=' % _DOMAIN_RE
_EMBED_RE = rf'embed\.{_DOMAIN_RE}/embed/[^/]+\.js\?.*?\bvideo='
_ID_RE = r'[\da-f]{32}|[\w-]+\.[\w-]+\.[\w-]+'
_VALID_URL = r'''(?x)
https?://
(?:
(?:watch\.)?%s/|
%s
)
(?P<id>%s)
''' % (_DOMAIN_RE, _EMBED_RE, _ID_RE)
_EMBED_REGEX = [fr'<script[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//{_EMBED_RE}(?:{_ID_RE}).*?)\1']
_VALID_URL = rf'https?://(?:{_SUBDOMAIN_RE}{_DOMAIN_RE}/|{_EMBED_RE})(?P<id>{_ID_RE})'
_EMBED_REGEX = [
rf'<script[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//{_EMBED_RE}(?:{_ID_RE}).*?)\1',
rf'<iframe[^>]+\bsrc=["\'](?P<url>https?://{_SUBDOMAIN_RE}{_DOMAIN_RE}/[\da-f]{{32}})',
]
_TESTS = [{
'url': 'https://embed.cloudflarestream.com/embed/we4g.fla9.latest.js?video=31c9291ab41fac05471db4e73aa11717',
'info_dict': {
'id': '31c9291ab41fac05471db4e73aa11717',
'ext': 'mp4',
'title': '31c9291ab41fac05471db4e73aa11717',
'thumbnail': 'https://videodelivery.net/31c9291ab41fac05471db4e73aa11717/thumbnails/thumbnail.jpg',
},
'params': {
'skip_download': True,
'skip_download': 'm3u8',
},
}, {
'url': 'https://watch.cloudflarestream.com/9df17203414fd1db3e3ed74abbe936c1',
@ -35,6 +33,21 @@ class CloudflareStreamIE(InfoExtractor):
}, {
'url': 'https://embed.videodelivery.net/embed/r4xu.fla9.latest.js?video=81d80727f3022488598f68d323c1ad5e',
'only_matching': True,
}, {
'url': 'https://customer-aw5py76sw8wyqzmh.cloudflarestream.com/2463f6d3e06fa29710a337f5f5389fd8/iframe',
'only_matching': True,
}]
_WEBPAGE_TESTS = [{
'url': 'https://upride.cc/incident/shoulder-pass-at-light/',
'info_dict': {
'id': 'eaef9dea5159cf968be84241b5cedfe7',
'ext': 'mp4',
'title': 'eaef9dea5159cf968be84241b5cedfe7',
'thumbnail': 'https://videodelivery.net/eaef9dea5159cf968be84241b5cedfe7/thumbnails/thumbnail.jpg',
},
'params': {
'skip_download': 'm3u8',
},
}]
def _real_extract(self, url):

View file

@ -0,0 +1,79 @@
from .common import InfoExtractor
from ..utils import (
int_or_none,
parse_iso8601,
url_or_none,
urlencode_postdata,
)
from ..utils.traversal import traverse_obj
class CloudyCDNIE(InfoExtractor):
_VALID_URL = r'(?:https?:)?//embed\.cloudycdn\.services/(?P<site_id>[^/?#]+)/media/(?P<id>[\w-]+)'
_EMBED_REGEX = [rf'<iframe[^>]+\bsrc=[\'"](?P<url>{_VALID_URL})']
_TESTS = [{
'url': 'https://embed.cloudycdn.services/ltv/media/46k_d23-6000-105?',
'md5': '64f72a360ca530d5ed89c77646c9eee5',
'info_dict': {
'id': '46k_d23-6000-105',
'ext': 'mp4',
'timestamp': 1700589151,
'duration': 1442,
'upload_date': '20231121',
'title': 'D23-6000-105_cetstud',
'thumbnail': 'https://store.cloudycdn.services/tmsp00060/assets/media/660858/placeholder1700589200.jpg',
}
}, {
'url': 'https://embed.cloudycdn.services/izm/media/26e_lv-8-5-1',
'md5': '798828a479151e2444d8dcfbec76e482',
'info_dict': {
'id': '26e_lv-8-5-1',
'ext': 'mp4',
'title': 'LV-8-5-1',
'timestamp': 1669767167,
'thumbnail': 'https://store.cloudycdn.services/tmsp00120/assets/media/488306/placeholder1679423604.jpg',
'duration': 1205,
'upload_date': '20221130',
}
}]
_WEBPAGE_TESTS = [{
'url': 'https://www.tavaklase.lv/video/es-esmu-mina-um-2/',
'md5': '63074e8e6c84ac2a01f2fb8bf03b8f43',
'info_dict': {
'id': 'cqd_lib-2',
'ext': 'mp4',
'upload_date': '20230223',
'duration': 629,
'thumbnail': 'https://store.cloudycdn.services/tmsp00120/assets/media/518407/placeholder1678748124.jpg',
'timestamp': 1677181513,
'title': 'LIB-2',
}
}]
def _real_extract(self, url):
site_id, video_id = self._match_valid_url(url).group('site_id', 'id')
data = self._download_json(
f'https://player.cloudycdn.services/player/{site_id}/media/{video_id}/',
video_id, data=urlencode_postdata({
'version': '6.4.0',
'referer': url,
}))
formats, subtitles = [], {}
for m3u8_url in traverse_obj(data, ('source', 'sources', ..., 'src', {url_or_none})):
fmts, subs = self._extract_m3u8_formats_and_subtitles(m3u8_url, video_id, fatal=False)
formats.extend(fmts)
self._merge_subtitles(subs, target=subtitles)
return {
'id': video_id,
'formats': formats,
'subtitles': subtitles,
**traverse_obj(data, {
'title': ('name', {str}),
'duration': ('duration', {int_or_none}),
'timestamp': ('upload_date', {parse_iso8601}),
'thumbnail': ('source', 'poster', {url_or_none}),
}),
}

View file

@ -1,68 +1,97 @@
from .common import InfoExtractor
from ..utils import smuggle_url
class CNBCIE(InfoExtractor):
_VALID_URL = r'https?://video\.cnbc\.com/gallery/\?video=(?P<id>[0-9]+)'
_TEST = {
'url': 'http://video.cnbc.com/gallery/?video=3000503714',
'info_dict': {
'id': '3000503714',
'ext': 'mp4',
'title': 'Fighting zombies is big business',
'description': 'md5:0c100d8e1a7947bd2feec9a5550e519e',
'timestamp': 1459332000,
'upload_date': '20160330',
'uploader': 'NBCU-CNBC',
},
'params': {
# m3u8 download
'skip_download': True,
},
'skip': 'Dead link',
}
def _real_extract(self, url):
video_id = self._match_id(url)
return {
'_type': 'url_transparent',
'ie_key': 'ThePlatform',
'url': smuggle_url(
'http://link.theplatform.com/s/gZWlPC/media/guid/2408950221/%s?mbr=true&manifest=m3u' % video_id,
{'force_smil_url': True}),
'id': video_id,
}
from ..utils import int_or_none, parse_iso8601, str_or_none, url_or_none
from ..utils.traversal import traverse_obj
class CNBCVideoIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?cnbc\.com(?P<path>/video/(?:[^/]+/)+(?P<id>[^./?#&]+)\.html)'
_TEST = {
'url': 'https://www.cnbc.com/video/2018/07/19/trump-i-dont-necessarily-agree-with-raising-rates.html',
_VALID_URL = r'https?://(?:www\.)?cnbc\.com/video/(?:[^/?#]+/)+(?P<id>[^./?#&]+)\.html'
_TESTS = [{
'url': 'https://www.cnbc.com/video/2023/12/07/mcdonalds-just-unveiled-cosmcsits-new-spinoff-brand.html',
'info_dict': {
'id': '7000031301',
'ext': 'mp4',
'title': "Trump: I don't necessarily agree with raising rates",
'description': 'md5:878d8f0b4ebb5bb1dda3514b91b49de3',
'timestamp': 1531958400,
'upload_date': '20180719',
'uploader': 'NBCU-CNBC',
'id': '107344774',
'display_id': 'mcdonalds-just-unveiled-cosmcsits-new-spinoff-brand',
'modified_timestamp': 1702053483,
'timestamp': 1701977810,
'channel': 'News Videos',
'upload_date': '20231207',
'description': 'md5:882c001d85cb43d7579b514307b3e78b',
'release_timestamp': 1701977375,
'modified_date': '20231208',
'release_date': '20231207',
'duration': 65,
'author': 'Sean Conlon',
'title': 'Here\'s a first look at McDonald\'s new spinoff brand, CosMc\'s',
'thumbnail': 'https://image.cnbcfm.com/api/v1/image/107344192-1701894812493-CosMcsskyHero_2336x1040_hero-desktop.jpg?v=1701894855',
},
'params': {
'skip_download': True,
'expected_warnings': ['Unable to download f4m manifest'],
}, {
'url': 'https://www.cnbc.com/video/2023/12/08/jim-cramer-shares-his-take-on-seattles-tech-scene.html',
'info_dict': {
'author': 'Jim Cramer',
'channel': 'Mad Money with Jim Cramer',
'description': 'md5:72925be21b952e95eba51178dddf4e3e',
'duration': 299.0,
'ext': 'mp4',
'id': '107345451',
'display_id': 'jim-cramer-shares-his-take-on-seattles-tech-scene',
'thumbnail': 'https://image.cnbcfm.com/api/v1/image/107345481-1702079431MM-B-120823.jpg?v=1702079430',
'timestamp': 1702080139,
'title': 'Jim Cramer shares his take on Seattle\'s tech scene',
'release_date': '20231208',
'upload_date': '20231209',
'modified_timestamp': 1702080139,
'modified_date': '20231209',
'release_timestamp': 1702073551,
},
'skip': 'Dead link',
}
'expected_warnings': ['Unable to download f4m manifest'],
}, {
'url': 'https://www.cnbc.com/video/2023/12/08/the-epicenter-of-ai-is-in-seattle-says-jim-cramer.html',
'info_dict': {
'author': 'Jim Cramer',
'channel': 'Mad Money with Jim Cramer',
'description': 'md5:72925be21b952e95eba51178dddf4e3e',
'duration': 113.0,
'ext': 'mp4',
'id': '107345474',
'display_id': 'the-epicenter-of-ai-is-in-seattle-says-jim-cramer',
'thumbnail': 'https://image.cnbcfm.com/api/v1/image/107345486-Screenshot_2023-12-08_at_70339_PM.png?v=1702080248',
'timestamp': 1702080535,
'title': 'The epicenter of AI is in Seattle, says Jim Cramer',
'release_timestamp': 1702077347,
'modified_timestamp': 1702080535,
'release_date': '20231208',
'upload_date': '20231209',
'modified_date': '20231209',
},
'expected_warnings': ['Unable to download f4m manifest'],
}]
def _real_extract(self, url):
path, display_id = self._match_valid_url(url).groups()
video_id = self._download_json(
'https://webql-redesign.cnbcfm.com/graphql', display_id, query={
'query': '''{
page(path: "%s") {
vcpsId
}
}''' % path,
})['data']['page']['vcpsId']
return self.url_result(
'http://video.cnbc.com/gallery/?video=%d' % video_id,
CNBCIE.ie_key())
display_id = self._match_id(url)
webpage = self._download_webpage(url, display_id)
data = self._search_json(r'window\.__s_data=', webpage, 'video data', display_id)
player_data = traverse_obj(data, (
'page', 'page', 'layout', ..., 'columns', ..., 'modules',
lambda _, v: v['name'] == 'clipPlayer', 'data', {dict}), get_all=False)
return {
'id': display_id,
'display_id': display_id,
'formats': self._extract_akamai_formats(player_data['playbackURL'], display_id),
**self._search_json_ld(webpage, display_id, fatal=False),
**traverse_obj(player_data, {
'id': ('id', {str_or_none}),
'title': ('title', {str}),
'description': ('description', {str}),
'author': ('author', ..., 'name', {str}),
'timestamp': ('datePublished', {parse_iso8601}),
'release_timestamp': ('uploadDate', {parse_iso8601}),
'modified_timestamp': ('dateLastPublished', {parse_iso8601}),
'thumbnail': ('thumbnail', {url_or_none}),
'duration': ('duration', {int_or_none}),
'channel': ('section', 'title', {str}),
}, get_all=False),
}

View file

@ -247,6 +247,8 @@ class InfoExtractor:
(For internal use only)
* http_chunk_size Chunk size for HTTP downloads
* ffmpeg_args Extra arguments for ffmpeg downloader
* is_dash_periods Whether the format is a result of merging
multiple DASH periods.
RTMP formats can also have the additional fields: page_url,
app, play_path, tc_url, flash_version, rtmp_live, rtmp_conn,
rtmp_protocol, rtmp_real_time
@ -278,7 +280,7 @@ class InfoExtractor:
description: Full video description.
uploader: Full name of the video uploader.
license: License name the video is licensed under.
creator: The creator of the video.
creators: List of creators of the video.
timestamp: UNIX timestamp of the moment the video was uploaded
upload_date: Video upload date in UTC (YYYYMMDD).
If not explicitly set, calculated from timestamp
@ -422,16 +424,16 @@ class InfoExtractor:
track_number: Number of the track within an album or a disc, as an integer.
track_id: Id of the track (useful in case of custom indexing, e.g. 6.iii),
as a unicode string.
artist: Artist(s) of the track.
genre: Genre(s) of the track.
artists: List of artists of the track.
composers: List of composers of the piece.
genres: List of genres of the track.
album: Title of the album the track belongs to.
album_type: Type of the album (e.g. "Demo", "Full-length", "Split", "Compilation", etc).
album_artist: List of all artists appeared on the album (e.g.
"Ash Borer / Fell Voices" or "Various Artists", useful for splits
and compilations).
album_artists: List of all artists appeared on the album.
E.g. ["Ash Borer", "Fell Voices"] or ["Various Artists"].
Useful for splits and compilations.
disc_number: Number of the disc or other physical medium the track belongs to,
as an integer.
composer: Composer of the piece
The following fields should only be set for clips that should be cut from the original video:
@ -442,6 +444,18 @@ class InfoExtractor:
rows: Number of rows in each storyboard fragment, as an integer
columns: Number of columns in each storyboard fragment, as an integer
The following fields are deprecated and should not be set by new code:
composer: Use "composers" instead.
Composer(s) of the piece, comma-separated.
artist: Use "artists" instead.
Artist(s) of the track, comma-separated.
genre: Use "genres" instead.
Genre(s) of the track, comma-separated.
album_artist: Use "album_artists" instead.
All artists appeared on the album, comma-separated.
creator: Use "creators" instead.
The creator of the video.
Unless mentioned otherwise, the fields should be Unicode strings.
Unless mentioned otherwise, None is equivalent to absence of information.
@ -2530,7 +2544,11 @@ def _extract_mpd_formats(self, *args, **kwargs):
self._report_ignoring_subs('DASH')
return fmts
def _extract_mpd_formats_and_subtitles(
def _extract_mpd_formats_and_subtitles(self, *args, **kwargs):
periods = self._extract_mpd_periods(*args, **kwargs)
return self._merge_mpd_periods(periods)
def _extract_mpd_periods(
self, mpd_url, video_id, mpd_id=None, note=None, errnote=None,
fatal=True, data=None, headers={}, query={}):
@ -2543,17 +2561,16 @@ def _extract_mpd_formats_and_subtitles(
errnote='Failed to download MPD manifest' if errnote is None else errnote,
fatal=fatal, data=data, headers=headers, query=query)
if res is False:
return [], {}
return []
mpd_doc, urlh = res
if mpd_doc is None:
return [], {}
return []
# We could have been redirected to a new url when we retrieved our mpd file.
mpd_url = urlh.url
mpd_base_url = base_url(mpd_url)
return self._parse_mpd_formats_and_subtitles(
mpd_doc, mpd_id, mpd_base_url, mpd_url)
return self._parse_mpd_periods(mpd_doc, mpd_id, mpd_base_url, mpd_url)
def _parse_mpd_formats(self, *args, **kwargs):
fmts, subs = self._parse_mpd_formats_and_subtitles(*args, **kwargs)
@ -2561,8 +2578,39 @@ def _parse_mpd_formats(self, *args, **kwargs):
self._report_ignoring_subs('DASH')
return fmts
def _parse_mpd_formats_and_subtitles(
self, mpd_doc, mpd_id=None, mpd_base_url='', mpd_url=None):
def _parse_mpd_formats_and_subtitles(self, *args, **kwargs):
periods = self._parse_mpd_periods(*args, **kwargs)
return self._merge_mpd_periods(periods)
def _merge_mpd_periods(self, periods):
"""
Combine all formats and subtitles from an MPD manifest into a single list,
by concatenate streams with similar formats.
"""
formats, subtitles = {}, {}
for period in periods:
for f in period['formats']:
assert 'is_dash_periods' not in f, 'format already processed'
f['is_dash_periods'] = True
format_key = tuple(v for k, v in f.items() if k not in (
('format_id', 'fragments', 'manifest_stream_number')))
if format_key not in formats:
formats[format_key] = f
elif 'fragments' in f:
formats[format_key].setdefault('fragments', []).extend(f['fragments'])
if subtitles and period['subtitles']:
self.report_warning(bug_reports_message(
'Found subtitles in multiple periods in the DASH manifest; '
'if part of the subtitles are missing,'
), only_once=True)
for sub_lang, sub_info in period['subtitles'].items():
subtitles.setdefault(sub_lang, []).extend(sub_info)
return list(formats.values()), subtitles
def _parse_mpd_periods(self, mpd_doc, mpd_id=None, mpd_base_url='', mpd_url=None):
"""
Parse formats from MPD manifest.
References:
@ -2643,14 +2691,17 @@ def extract_Initialization(source):
mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
availability_start_time = unified_timestamp(
mpd_doc.get('availabilityStartTime'), with_milliseconds=True) or 0
formats, subtitles = [], {}
stream_numbers = collections.defaultdict(int)
for period in mpd_doc.findall(_add_ns('Period')):
for period_idx, period in enumerate(mpd_doc.findall(_add_ns('Period'))):
# segmentIngestTime is completely out of spec, but YT Livestream do this
segment_ingest_time = period.get('{http://youtube.com/yt/2012/10/10}segmentIngestTime')
if segment_ingest_time:
availability_start_time = unified_timestamp(segment_ingest_time, with_milliseconds=True)
period_entry = {
'id': period.get('id', f'period-{period_idx}'),
'formats': [],
'subtitles': collections.defaultdict(list),
}
period_duration = parse_duration(period.get('duration')) or mpd_duration
period_ms_info = extract_multisegment_info(period, {
'start_number': 1,
@ -2908,11 +2959,10 @@ def add_segment_url():
if content_type in ('video', 'audio', 'image/jpeg'):
f['manifest_stream_number'] = stream_numbers[f['url']]
stream_numbers[f['url']] += 1
formats.append(f)
period_entry['formats'].append(f)
elif content_type == 'text':
subtitles.setdefault(lang or 'und', []).append(f)
return formats, subtitles
period_entry['subtitles'][lang or 'und'].append(f)
yield period_entry
def _extract_ism_formats(self, *args, **kwargs):
fmts, subs = self._extract_ism_formats_and_subtitles(*args, **kwargs)

View file

@ -33,10 +33,7 @@ def _real_extract(self, url):
webpage = self._download_webpage(
'http://embed.crooksandliars.com/embed/%s' % video_id, video_id)
manifest = self._parse_json(
self._search_regex(
r'var\s+manifest\s*=\s*({.+?})\n', webpage, 'manifest JSON'),
video_id)
manifest = self._search_json(r'var\s+manifest\s*=', webpage, 'manifest JSON', video_id)
quality = qualities(('webm_low', 'mp4_low', 'webm_high', 'mp4_high'))

View file

@ -1,8 +1,10 @@
from .common import InfoExtractor
from ..utils import (
encode_base_n,
ExtractorError,
encode_base_n,
get_elements_by_class,
int_or_none,
join_nonempty,
merge_dicts,
parse_duration,
str_to_int,
@ -81,6 +83,7 @@ def calc_hash(s):
sources = video['sources']
formats = []
has_av1 = bool(get_elements_by_class('download-av1', webpage))
for kind, formats_dict in sources.items():
if not isinstance(formats_dict, dict):
continue
@ -106,6 +109,14 @@ def calc_hash(s):
'height': height,
'fps': fps,
})
if has_av1:
formats.append({
'url': src.replace('.mp4', '-av1.mp4'),
'format_id': join_nonempty('av1', format_id),
'height': height,
'fps': fps,
'vcodec': 'av1',
})
json_ld = self._search_json_ld(webpage, display_id, default={})

224
yt_dlp/extractor/err.py Normal file
View file

@ -0,0 +1,224 @@
from .common import InfoExtractor
from ..utils import (
clean_html,
int_or_none,
str_or_none,
url_or_none,
)
from ..utils.traversal import traverse_obj
class ERRJupiterIE(InfoExtractor):
_VALID_URL = r'https?://(?:jupiter(?:pluss)?|lasteekraan)\.err\.ee/(?P<id>\d+)'
_TESTS = [{
'note': 'Jupiter: Movie: siin-me-oleme',
'url': 'https://jupiter.err.ee/1211107/siin-me-oleme',
'md5': '9b45d1682a98853acaa1e1b0c791f425',
'info_dict': {
'id': '1211107',
'ext': 'mp4',
'title': 'Siin me oleme!',
'alt_title': '',
'description': 'md5:1825b795f5f7584241aeb59e5bbb4f70',
'release_date': '20231226',
'upload_date': '20201217',
'modified_date': '20201217',
'release_timestamp': 1703577600,
'timestamp': 1608210000,
'modified_timestamp': 1608220800,
'release_year': 1978,
},
}, {
'note': 'Jupiter: Series: Impulss',
'url': 'https://jupiter.err.ee/1609145945/impulss',
'md5': 'a378486df07ed1ba74e46cc861886243',
'info_dict': {
'id': '1609145945',
'ext': 'mp4',
'title': 'Impulss',
'alt_title': 'Loteriipilet hooldekodusse',
'description': 'md5:fa8a2ed0cdccb130211513443ee4d571',
'release_date': '20231107',
'upload_date': '20231026',
'modified_date': '20231118',
'release_timestamp': 1699380000,
'timestamp': 1698327601,
'modified_timestamp': 1700311802,
'series': 'Impulss',
'season': 'Season 1',
'season_number': 1,
'episode': 'Loteriipilet hooldekodusse',
'episode_number': 6,
'series_id': '1609108187',
'release_year': 2023,
'episode_id': '1609145945',
},
}, {
'note': 'Jupiter: Radio Show: mnemoturniir episode',
'url': 'https://jupiter.err.ee/1037919/mnemoturniir',
'md5': 'f1eb95fe66f9620ff84e81bbac37076a',
'info_dict': {
'id': '1037919',
'ext': 'm4a',
'title': 'Mnemoturniir',
'alt_title': '',
'description': 'md5:626db52394e7583c26ab74d6a34d9982',
'release_date': '20240121',
'upload_date': '20240108',
'modified_date': '20240121',
'release_timestamp': 1705827900,
'timestamp': 1704675602,
'modified_timestamp': 1705827601,
'series': 'Mnemoturniir',
'season': 'Season 0',
'season_number': 0,
'episode': 'Episode 0',
'episode_number': 0,
'series_id': '1037919',
'release_year': 2024,
'episode_id': '1609215101',
},
}, {
'note': 'Jupiter+: Clip: bolee-zelenyj-tallinn',
'url': 'https://jupiterpluss.err.ee/1609180445/bolee-zelenyj-tallinn',
'md5': '1b812270c4daf6ce51c06bfeaf33ed95',
'info_dict': {
'id': '1609180445',
'ext': 'mp4',
'title': 'Более зеленый Таллинн',
'alt_title': '',
'description': 'md5:fd34d9bf939c28c4a725b19a7f0d6320',
'release_date': '20231224',
'upload_date': '20231130',
'modified_date': '20231207',
'release_timestamp': 1703423400,
'timestamp': 1701338400,
'modified_timestamp': 1701967200,
'release_year': 2023,
},
}, {
'note': 'Jupiter+: Series: The Sniffer',
'url': 'https://jupiterpluss.err.ee/1608311387/njuhach',
'md5': '2abdeb7131ce551bce49e8d0cea08536',
'info_dict': {
'id': '1608311387',
'ext': 'mp4',
'title': 'Нюхач',
'alt_title': '',
'description': 'md5:8c5c7d8f32ec6e54cd498c9e59ca83bc',
'release_date': '20230601',
'upload_date': '20210818',
'modified_date': '20210903',
'release_timestamp': 1685633400,
'timestamp': 1629318000,
'modified_timestamp': 1630686000,
'release_year': 2013,
'episode': 'Episode 1',
'episode_id': '1608311390',
'episode_number': 1,
'season': 'Season 1',
'season_number': 1,
'series': 'Нюхач',
'series_id': '1608311387',
},
}, {
'note': 'Jupiter+: Podcast: lesnye-istorii-aisty',
'url': 'https://jupiterpluss.err.ee/1608990335/lesnye-istorii-aisty',
'md5': '8b46d7e4510b254a14b7a52211b5bf96',
'info_dict': {
'id': '1608990335',
'ext': 'm4a',
'title': 'Лесные истории | Аисты',
'alt_title': '',
'description': 'md5:065e721623e271e7a63e6540d409ca6b',
'release_date': '20230609',
'upload_date': '20230527',
'modified_date': '20230608',
'release_timestamp': 1686308700,
'timestamp': 1685145600,
'modified_timestamp': 1686252600,
'release_year': 2023,
'episode': 'Episode 0',
'episode_id': '1608990335',
'episode_number': 0,
'season': 'Season 0',
'season_number': 0,
'series': 'Лесные истории | Аисты',
'series_id': '1037497',
}
}, {
'note': 'Lasteekraan: Pätu',
'url': 'https://lasteekraan.err.ee/1092243/patu',
'md5': 'a67eb9b9bcb3d201718c15d1638edf77',
'info_dict': {
'id': '1092243',
'ext': 'mp4',
'title': 'Pätu',
'alt_title': '',
'description': 'md5:64a7b5a80afd7042d3f8ec48c77befd9',
'release_date': '20230614',
'upload_date': '20200520',
'modified_date': '20200520',
'release_timestamp': 1686745800,
'timestamp': 1589975640,
'modified_timestamp': 1589975640,
'release_year': 1990,
'episode': 'Episode 1',
'episode_id': '1092243',
'episode_number': 1,
'season': 'Season 1',
'season_number': 1,
'series': 'Pätu',
'series_id': '1092236',
},
}]
def _real_extract(self, url):
video_id = self._match_id(url)
data = self._download_json(
'https://services.err.ee/api/v2/vodContent/getContentPageData', video_id,
query={'contentId': video_id})['data']['mainContent']
media_data = traverse_obj(data, ('medias', ..., {dict}), get_all=False)
if traverse_obj(media_data, ('restrictions', 'drm', {bool})):
self.report_drm(video_id)
formats, subtitles = [], {}
for format_url in set(traverse_obj(media_data, ('src', ('hls', 'hls2', 'hlsNew'), {url_or_none}))):
fmts, subs = self._extract_m3u8_formats_and_subtitles(
format_url, video_id, 'mp4', m3u8_id='hls', fatal=False)
formats.extend(fmts)
self._merge_subtitles(subs, target=subtitles)
for format_url in set(traverse_obj(media_data, ('src', ('dash', 'dashNew'), {url_or_none}))):
fmts, subs = self._extract_mpd_formats_and_subtitles(
format_url, video_id, mpd_id='dash', fatal=False)
formats.extend(fmts)
self._merge_subtitles(subs, target=subtitles)
if format_url := traverse_obj(media_data, ('src', 'file', {url_or_none})):
formats.append({
'url': format_url,
'format_id': 'http',
})
return {
'id': video_id,
'formats': formats,
'subtitles': subtitles,
**traverse_obj(data, {
'title': ('heading', {str}),
'alt_title': ('subHeading', {str}),
'description': (('lead', 'body'), {clean_html}, {lambda x: x or None}),
'timestamp': ('created', {int_or_none}),
'modified_timestamp': ('updated', {int_or_none}),
'release_timestamp': (('scheduleStart', 'publicStart'), {int_or_none}),
'release_year': ('year', {int_or_none}),
}, get_all=False),
**(traverse_obj(data, {
'series': ('heading', {str}),
'series_id': ('rootContentId', {str_or_none}),
'episode': ('subHeading', {str}),
'season_number': ('season', {int_or_none}),
'episode_number': ('episode', {int_or_none}),
'episode_id': ('id', {str_or_none}),
}) if data.get('type') == 'episode' else {}),
}

View file

@ -20,6 +20,7 @@
get_element_by_id,
get_first,
int_or_none,
join_nonempty,
js_to_json,
merge_dicts,
parse_count,
@ -43,6 +44,7 @@ class FacebookIE(InfoExtractor):
(?:[^#]*?\#!/)?
(?:
(?:
permalink\.php|
video/video\.php|
photo\.php|
video\.php|
@ -52,6 +54,7 @@ class FacebookIE(InfoExtractor):
)\?(?:.*?)(?:v|video_id|story_fbid)=|
[^/]+/videos/(?:[^/]+/)?|
[^/]+/posts/|
events/(?:[^/]+/)?|
groups/[^/]+/(?:permalink|posts)/|
watchparty/
)|
@ -248,6 +251,7 @@ class FacebookIE(InfoExtractor):
'duration': 148.435,
},
}, {
# data.node.comet_sections.content.story.attachments[].styles.attachment.media
'url': 'https://www.facebook.com/attn/posts/pfbid0j1Czf2gGDVqeQ8KiMLFm3pWN8GxsQmeRrVhimWDzMuKQoR8r4b1knNsejELmUgyhl',
'info_dict': {
'id': '6968553779868435',
@ -262,6 +266,22 @@ class FacebookIE(InfoExtractor):
'thumbnail': r're:^https?://.*',
'timestamp': 1701975646,
},
}, {
# data.node.comet_sections.content.story.attachments[].styles.attachment.media
'url': 'https://www.facebook.com/permalink.php?story_fbid=pfbid0fqQuVEQyXRa9Dp4RcaTR14KHU3uULHV1EK7eckNXSH63JMuoALsAvVCJ97zAGitil&id=100068861234290',
'info_dict': {
'id': '270103405756416',
'ext': 'mp4',
'title': 'Lela Evans',
'description': 'Today Makkovik\'s own Pilot Mandy Smith made her inaugural landing on the airstrip in her hometown. What a proud moment as we all cheered and...',
'thumbnail': r're:^https?://.*',
'uploader': 'Lela Evans',
'uploader_id': 'pfbid0shZJipuigyy5mqrUJn9ub5LJFWNHvan5prtyi3LrDuuuJ4NwrURgnQHYR9fywBepl',
'upload_date': '20231228',
'timestamp': 1703804085,
'duration': 394.347,
'view_count': int,
},
}, {
'url': 'https://www.facebook.com/story.php?story_fbid=pfbid0Fnzhm8UuzjBYpPMNFzaSpFE9UmLdU4fJN8qTANi1Dmtj5q7DNrL5NERXfsAzDEV7l&id=100073071055552',
'only_matching': True,
@ -380,6 +400,18 @@ class FacebookIE(InfoExtractor):
},
'playlist_count': 1,
'skip': 'Requires logging in',
}, {
# data.event.cover_media_renderer.cover_video
'url': 'https://m.facebook.com/events/1509582499515440',
'info_dict': {
'id': '637246984455045',
'ext': 'mp4',
'title': 'ANALISI IN CAMPO OSCURO " Coaguli nel sangue dei vaccinati"',
'description': 'Other event by Comitato Liberi Pensatori on Tuesday, October 18 2022',
'thumbnail': r're:^https?://.*',
'uploader': 'Comitato Liberi Pensatori',
'uploader_id': '100065709540881',
},
}]
_SUPPORTED_PAGLETS_REGEX = r'(?:pagelet_group_mall|permalink_video_pagelet|hyperfeed_story_id_[0-9a-f]+)'
_api_config = {
@ -454,38 +486,10 @@ def extract_metadata(webpage):
r'data-sjs>({.*?ScheduledServerJS.*?})</script>', webpage)]
post = traverse_obj(post_data, (
..., 'require', ..., ..., ..., '__bbox', 'require', ..., ..., ..., '__bbox', 'result', 'data'), expected_type=dict) or []
automatic_captions, subtitles = {}, {}
subs_data = traverse_obj(post, (..., 'video', ..., 'attachments', ..., lambda k, v: (
k == 'media' and str(v['id']) == video_id and v['__typename'] == 'Video')))
is_video_broadcast = get_first(subs_data, 'is_video_broadcast', expected_type=bool)
captions = get_first(subs_data, 'video_available_captions_locales', 'captions_url')
if url_or_none(captions): # if subs_data only had a 'captions_url'
locale = self._html_search_meta(['og:locale', 'twitter:locale'], webpage, 'locale', default='en_US')
subtitles[locale] = [{'url': captions}]
# or else subs_data had 'video_available_captions_locales', a list of dicts
for caption in traverse_obj(captions, (
{lambda x: sorted(x, key=lambda c: c['locale'])}, lambda _, v: v['captions_url'])
):
lang = caption.get('localized_language') or ''
subs = {
'url': caption['captions_url'],
'name': format_field(caption, 'localized_country', f'{lang} (%s)', default=lang),
}
if caption.get('localized_creation_method') or is_video_broadcast:
automatic_captions.setdefault(caption['locale'], []).append(subs)
else:
subtitles.setdefault(caption['locale'], []).append(subs)
media = traverse_obj(post, (..., 'attachments', ..., lambda k, v: (
k == 'media' and str(v['id']) == video_id and v['__typename'] == 'Video')), expected_type=dict)
title = get_first(media, ('title', 'text'))
description = get_first(media, ('creation_story', 'comet_sections', 'message', 'story', 'message', 'text'))
uploader_data = (
get_first(media, ('owner', {dict}))
or get_first(post, (..., 'video', lambda k, v: k == 'owner' and v['name']))
or get_first(post, ('node', 'actors', ..., {dict})) or {})
page_title = title or self._html_search_regex((
r'<h2\s+[^>]*class="uiHeaderTitle"[^>]*>(?P<content>[^<]*)</h2>',
r'(?s)<span class="fbPhotosPhotoCaption".*?id="fbPhotoPageCaption"><span class="hasCaption">(?P<content>.*?)</span>',
@ -494,11 +498,16 @@ def extract_metadata(webpage):
description = description or self._html_search_meta(
['description', 'og:description', 'twitter:description'],
webpage, 'description', default=None)
uploader_data = (
get_first(media, ('owner', {dict}))
or get_first(post, ('video', 'creation_story', 'attachments', ..., 'media', lambda k, v: k == 'owner' and v['name']))
or get_first(post, (..., 'video', lambda k, v: k == 'owner' and v['name']))
or get_first(post, ('node', 'actors', ..., {dict}))
or get_first(post, ('event', 'event_creator', {dict})) or {})
uploader = uploader_data.get('name') or (
clean_html(get_element_by_id('fbPhotoPageAuthorName', webpage))
or self._search_regex(
(r'ownerName\s*:\s*"([^"]+)"', *self._og_regexes('title')), webpage, 'uploader', fatal=False))
timestamp = int_or_none(self._search_regex(
r'<abbr[^>]+data-utime=["\'](\d+)', webpage,
'timestamp', default=None))
@ -520,8 +529,6 @@ def extract_metadata(webpage):
webpage, 'view count', default=None)),
'concurrent_view_count': get_first(post, (
('video', (..., ..., 'attachments', ..., 'media')), 'liveViewerCount', {int_or_none})),
'automatic_captions': automatic_captions,
'subtitles': subtitles,
}
info_json_ld = self._search_json_ld(webpage, video_id, default={})
@ -563,7 +570,11 @@ def process_formats(info):
# Downloads with browser's User-Agent are rate limited. Working around
# with non-browser User-Agent.
for f in info['formats']:
# Downloads with browser's User-Agent are rate limited. Working around
# with non-browser User-Agent.
f.setdefault('http_headers', {})['User-Agent'] = 'facebookexternalhit/1.1'
# Formats larger than ~500MB will return error 403 unless chunk size is regulated
f.setdefault('downloader_options', {})['http_chunk_size'] = 250 << 20
def extract_relay_data(_filter):
return self._parse_json(self._search_regex(
@ -573,8 +584,8 @@ def extract_relay_data(_filter):
def extract_relay_prefetched_data(_filter):
return traverse_obj(extract_relay_data(_filter), (
'require', (None, (..., ..., ..., '__bbox', 'require')),
lambda _, v: 'RelayPrefetchedStreamCache' in v, ..., ...,
'__bbox', 'result', 'data', {dict}), get_all=False) or {}
lambda _, v: any(key.startswith('RelayPrefetchedStreamCache') for key in v),
..., ..., '__bbox', 'result', 'data', {dict}), get_all=False) or {}
if not video_data:
server_js_data = self._parse_json(self._search_regex([
@ -615,6 +626,29 @@ def parse_graphql_video(video):
'url': playable_url,
})
extract_dash_manifest(video, formats)
automatic_captions, subtitles = {}, {}
is_broadcast = traverse_obj(video, ('is_video_broadcast', {bool}))
for caption in traverse_obj(video, (
'video_available_captions_locales',
{lambda x: sorted(x, key=lambda c: c['locale'])},
lambda _, v: url_or_none(v['captions_url'])
)):
lang = caption.get('localized_language') or 'und'
subs = {
'url': caption['captions_url'],
'name': format_field(caption, 'localized_country', f'{lang} (%s)', default=lang),
}
if caption.get('localized_creation_method') or is_broadcast:
automatic_captions.setdefault(caption['locale'], []).append(subs)
else:
subtitles.setdefault(caption['locale'], []).append(subs)
captions_url = traverse_obj(video, ('captions_url', {url_or_none}))
if captions_url and not automatic_captions and not subtitles:
locale = self._html_search_meta(
['og:locale', 'twitter:locale'], webpage, 'locale', default='en_US')
(automatic_captions if is_broadcast else subtitles)[locale] = [{'url': captions_url}]
info = {
'id': v_id,
'formats': formats,
@ -624,6 +658,8 @@ def parse_graphql_video(video):
'timestamp': traverse_obj(video, 'publish_time', 'creation_time', expected_type=int_or_none),
'duration': (float_or_none(video.get('playable_duration_in_ms'), 1000)
or float_or_none(video.get('length_in_second'))),
'automatic_captions': automatic_captions,
'subtitles': subtitles,
}
process_formats(info)
description = try_get(video, lambda x: x['savable_description']['text'])
@ -658,7 +694,8 @@ def parse_attachment(attachment, key='media'):
for edge in edges:
parse_attachment(edge, key='node')
video = data.get('video') or {}
video = traverse_obj(data, (
'event', 'cover_media_renderer', 'cover_video'), 'video', expected_type=dict) or {}
if video:
attachments = try_get(video, [
lambda x: x['story']['attachments'],
@ -677,6 +714,9 @@ def parse_attachment(attachment, key='media'):
# honor precise duration in video info
if video_info.get('duration'):
webpage_info['duration'] = video_info['duration']
# preserve preferred_thumbnail in video info
if video_info.get('thumbnail'):
webpage_info['thumbnail'] = video_info['thumbnail']
return merge_dicts(webpage_info, video_info)
if not video_data:
@ -907,3 +947,114 @@ def _real_extract(self, url):
video_id = self._match_id(url)
return self.url_result(
f'https://m.facebook.com/watch/?v={video_id}&_rdr', FacebookIE, video_id)
class FacebookAdsIE(InfoExtractor):
_VALID_URL = r'https?://(?:[\w-]+\.)?facebook\.com/ads/library/?\?(?:[^#]+&)?id=(?P<id>\d+)'
IE_NAME = 'facebook:ads'
_TESTS = [{
'url': 'https://www.facebook.com/ads/library/?id=899206155126718',
'info_dict': {
'id': '899206155126718',
'ext': 'mp4',
'title': 'video by Kandao',
'uploader': 'Kandao',
'uploader_id': '774114102743284',
'uploader_url': r're:^https?://.*',
'timestamp': 1702548330,
'thumbnail': r're:^https?://.*',
'upload_date': '20231214',
'like_count': int,
}
}, {
'url': 'https://www.facebook.com/ads/library/?id=893637265423481',
'info_dict': {
'id': '893637265423481',
'title': 'Jusqu\u2019\u00e0 -25% sur une s\u00e9lection de vins p\u00e9tillants italiens ',
'uploader': 'Eataly Paris Marais',
'uploader_id': '2086668958314152',
'uploader_url': r're:^https?://.*',
'timestamp': 1703571529,
'upload_date': '20231226',
'like_count': int,
},
'playlist_count': 3,
}, {
'url': 'https://es-la.facebook.com/ads/library/?id=901230958115569',
'only_matching': True,
}, {
'url': 'https://m.facebook.com/ads/library/?id=901230958115569',
'only_matching': True,
}]
_FORMATS_MAP = {
'watermarked_video_sd_url': ('sd-wmk', 'SD, watermarked'),
'video_sd_url': ('sd', None),
'watermarked_video_hd_url': ('hd-wmk', 'HD, watermarked'),
'video_hd_url': ('hd', None),
}
def _extract_formats(self, video_dict):
formats = []
for format_key, format_url in traverse_obj(video_dict, (
{dict.items}, lambda _, v: v[0] in self._FORMATS_MAP and url_or_none(v[1])
)):
formats.append({
'format_id': self._FORMATS_MAP[format_key][0],
'format_note': self._FORMATS_MAP[format_key][1],
'url': format_url,
'ext': 'mp4',
'quality': qualities(tuple(self._FORMATS_MAP))(format_key),
})
return formats
def _real_extract(self, url):
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
post_data = [self._parse_json(j, video_id, fatal=False)
for j in re.findall(r's\.handle\(({.*})\);requireLazy\(', webpage)]
data = traverse_obj(post_data, (
..., 'require', ..., ..., ..., 'props', 'deeplinkAdCard', 'snapshot', {dict}), get_all=False)
if not data:
raise ExtractorError('Unable to extract ad data')
title = data.get('title')
if not title or title == '{{product.name}}':
title = join_nonempty('display_format', 'page_name', delim=' by ', from_dict=data)
info_dict = traverse_obj(data, {
'description': ('link_description', {str}, {lambda x: x if x != '{{product.description}}' else None}),
'uploader': ('page_name', {str}),
'uploader_id': ('page_id', {str_or_none}),
'uploader_url': ('page_profile_uri', {url_or_none}),
'timestamp': ('creation_time', {int_or_none}),
'like_count': ('page_like_count', {int_or_none}),
})
entries = []
for idx, entry in enumerate(traverse_obj(
data, (('videos', 'cards'), lambda _, v: any([url_or_none(v[f]) for f in self._FORMATS_MAP]))), 1
):
entries.append({
'id': f'{video_id}_{idx}',
'title': entry.get('title') or title,
'description': entry.get('link_description') or info_dict.get('description'),
'thumbnail': url_or_none(entry.get('video_preview_image_url')),
'formats': self._extract_formats(entry),
})
if len(entries) == 1:
info_dict.update(entries[0])
elif len(entries) > 1:
info_dict.update({
'title': entries[0]['title'],
'entries': entries,
'_type': 'playlist',
})
info_dict['id'] = video_id
return info_dict

View file

@ -0,0 +1,62 @@
from .common import InfoExtractor
from ..networking.exceptions import HTTPError
from ..utils import (
ExtractorError,
UserNotLive,
parse_iso8601,
str_or_none,
traverse_obj,
url_or_none,
)
class FlexTVIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?flextv\.co\.kr/channels/(?P<id>\d+)/live'
_TESTS = [{
'url': 'https://www.flextv.co.kr/channels/231638/live',
'info_dict': {
'id': '231638',
'ext': 'mp4',
'title': r're:^214하나만\.\.\. ',
'thumbnail': r're:^https?://.+\.jpg',
'upload_date': r're:\d{8}',
'timestamp': int,
'live_status': 'is_live',
'channel': 'Hi별',
'channel_id': '244396',
},
'skip': 'The channel is offline',
}, {
'url': 'https://www.flextv.co.kr/channels/746/live',
'only_matching': True,
}]
def _real_extract(self, url):
channel_id = self._match_id(url)
try:
stream_data = self._download_json(
f'https://api.flextv.co.kr/api/channels/{channel_id}/stream',
channel_id, query={'option': 'all'})
except ExtractorError as e:
if isinstance(e.cause, HTTPError) and e.cause.status == 400:
raise UserNotLive(video_id=channel_id)
raise
playlist_url = stream_data['sources'][0]['url']
formats, subtitles = self._extract_m3u8_formats_and_subtitles(
playlist_url, channel_id, 'mp4')
return {
'id': channel_id,
'formats': formats,
'subtitles': subtitles,
'is_live': True,
**traverse_obj(stream_data, {
'title': ('stream', 'title', {str}),
'timestamp': ('stream', 'createdAt', {parse_iso8601}),
'thumbnail': ('thumbUrl', {url_or_none}),
'channel': ('owner', 'name', {str}),
'channel_id': ('owner', 'id', {str_or_none}),
}),
}

View file

@ -11,6 +11,7 @@
join_nonempty,
parse_codecs,
parse_iso8601,
url_or_none,
urljoin,
)
from ..utils.traversal import traverse_obj
@ -108,6 +109,64 @@ class FloatplaneIE(InfoExtractor):
'availability': 'subscriber_only',
},
'params': {'skip_download': 'm3u8'},
}, {
'url': 'https://www.floatplane.com/post/65B5PNoBtf',
'info_dict': {
'id': '65B5PNoBtf',
'description': 'I recorded the inbuilt demo mode for your 90\'s enjoyment, thanks for being Floaties!',
'display_id': '65B5PNoBtf',
'like_count': int,
'release_timestamp': 1701249480,
'uploader': 'The Trash Network',
'availability': 'subscriber_only',
'uploader_id': '61bc20c9a131fb692bf2a513',
'uploader_url': 'https://www.floatplane.com/channel/TheTrashNetwork/home',
'channel_url': 'https://www.floatplane.com/channel/TheTrashNetwork/home/thedrumthing',
'comment_count': int,
'title': 'The $50 electronic drum kit.',
'channel_id': '64424fe73cd58cbcf8d8e131',
'thumbnail': 'https://pbs.floatplane.com/blogPost_thumbnails/65B5PNoBtf/725555379422705_1701247052743.jpeg',
'dislike_count': int,
'channel': 'The Drum Thing',
'release_date': '20231129',
},
'playlist_count': 2,
'playlist': [{
'info_dict': {
'id': 'ISPJjexylS',
'ext': 'mp4',
'release_date': '20231129',
'release_timestamp': 1701249480,
'title': 'The $50 electronic drum kit. .mov',
'channel_id': '64424fe73cd58cbcf8d8e131',
'thumbnail': 'https://pbs.floatplane.com/video_thumbnails/ISPJjexylS/335202812134041_1701249383392.jpeg',
'availability': 'subscriber_only',
'uploader': 'The Trash Network',
'duration': 622,
'channel': 'The Drum Thing',
'uploader_id': '61bc20c9a131fb692bf2a513',
'channel_url': 'https://www.floatplane.com/channel/TheTrashNetwork/home/thedrumthing',
'uploader_url': 'https://www.floatplane.com/channel/TheTrashNetwork/home',
},
}, {
'info_dict': {
'id': 'qKfxu6fEpu',
'ext': 'aac',
'release_date': '20231129',
'release_timestamp': 1701249480,
'title': 'Roland TD-7 Demo.m4a',
'channel_id': '64424fe73cd58cbcf8d8e131',
'availability': 'subscriber_only',
'uploader': 'The Trash Network',
'duration': 114,
'channel': 'The Drum Thing',
'uploader_id': '61bc20c9a131fb692bf2a513',
'channel_url': 'https://www.floatplane.com/channel/TheTrashNetwork/home/thedrumthing',
'uploader_url': 'https://www.floatplane.com/channel/TheTrashNetwork/home',
},
}],
'skip': 'requires subscription: "The Trash Network"',
'params': {'skip_download': 'm3u8'},
}]
def _real_initialize(self):
@ -124,6 +183,22 @@ def _real_extract(self, url):
if not any(traverse_obj(post_data, ('metadata', ('hasVideo', 'hasAudio')))):
raise ExtractorError('Post does not contain a video or audio track', expected=True)
uploader_url = format_field(
post_data, [('creator', 'urlname')], 'https://www.floatplane.com/channel/%s/home') or None
common_info = {
'uploader_url': uploader_url,
'channel_url': urljoin(f'{uploader_url}/', traverse_obj(post_data, ('channel', 'urlname'))),
'availability': self._availability(needs_subscription=True),
**traverse_obj(post_data, {
'uploader': ('creator', 'title', {str}),
'uploader_id': ('creator', 'id', {str}),
'channel': ('channel', 'title', {str}),
'channel_id': ('channel', 'id', {str}),
'release_timestamp': ('releaseDate', {parse_iso8601}),
}),
}
items = []
for media in traverse_obj(post_data, (('videoAttachments', 'audioAttachments'), ...)):
media_id = media['id']
@ -150,11 +225,11 @@ def format_path(params):
formats = []
for quality in traverse_obj(stream, ('resource', 'data', 'qualityLevels', ...)):
url = urljoin(stream['cdn'], format_path(traverse_obj(
stream, ('resource', 'data', 'qualityLevelParams', quality['name']))))
stream, ('resource', 'data', 'qualityLevelParams', quality['name'], {dict}))))
formats.append({
**traverse_obj(quality, {
'format_id': 'name',
'format_note': 'label',
'format_id': ('name', {str}),
'format_note': ('label', {str}),
'width': ('width', {int}),
'height': ('height', {int}),
}),
@ -164,38 +239,28 @@ def format_path(params):
})
items.append({
**common_info,
'id': media_id,
**traverse_obj(metadata, {
'title': 'title',
'title': ('title', {str}),
'duration': ('duration', {int_or_none}),
'thumbnail': ('thumbnail', 'path'),
'thumbnail': ('thumbnail', 'path', {url_or_none}),
}),
'formats': formats,
})
uploader_url = format_field(
post_data, [('creator', 'urlname')], 'https://www.floatplane.com/channel/%s/home') or None
channel_url = urljoin(f'{uploader_url}/', traverse_obj(post_data, ('channel', 'urlname')))
post_info = {
**common_info,
'id': post_id,
'display_id': post_id,
**traverse_obj(post_data, {
'title': 'title',
'title': ('title', {str}),
'description': ('text', {clean_html}),
'uploader': ('creator', 'title'),
'uploader_id': ('creator', 'id'),
'channel': ('channel', 'title'),
'channel_id': ('channel', 'id'),
'like_count': ('likes', {int_or_none}),
'dislike_count': ('dislikes', {int_or_none}),
'comment_count': ('comments', {int_or_none}),
'release_timestamp': ('releaseDate', {parse_iso8601}),
'thumbnail': ('thumbnail', 'path'),
'thumbnail': ('thumbnail', 'path', {url_or_none}),
}),
'uploader_url': uploader_url,
'channel_url': channel_url,
'availability': self._availability(needs_subscription=True),
}
if len(items) > 1:

View file

@ -1,25 +1,29 @@
from .common import InfoExtractor
from .nexx import NexxIE
from ..utils import (
int_or_none,
str_or_none,
)
class FunkIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.|origin\.)?funk\.net/(?:channel|playlist)/[^/]+/(?P<display_id>[0-9a-z-]+)-(?P<id>\d+)'
_TESTS = [{
'url': 'https://www.funk.net/channel/ba-793/die-lustigsten-instrumente-aus-dem-internet-teil-2-1155821',
'md5': '8dd9d9ab59b4aa4173b3197f2ea48e81',
'md5': '8610449476156f338761a75391b0017d',
'info_dict': {
'id': '1155821',
'ext': 'mp4',
'title': 'Die LUSTIGSTEN INSTRUMENTE aus dem Internet - Teil 2',
'description': 'md5:a691d0413ef4835588c5b03ded670c1f',
'description': 'md5:2a03b67596eda0d1b5125c299f45e953',
'timestamp': 1514507395,
'upload_date': '20171229',
'duration': 426.0,
'cast': ['United Creators PMB GmbH'],
'thumbnail': 'https://assets.nexx.cloud/media/75/56/79/3YKUSJN1LACN0CRxL.jpg',
'display_id': 'die-lustigsten-instrumente-aus-dem-internet-teil-2',
'alt_title': 'Die LUSTIGSTEN INSTRUMENTE aus dem Internet Teil 2',
'season_number': 0,
'season': 'Season 0',
'episode_number': 0,
'episode': 'Episode 0',
},
}, {
'url': 'https://www.funk.net/playlist/neuesteVideos/kameras-auf-dem-fusion-festival-1618699',
'only_matching': True,
@ -27,18 +31,10 @@ class FunkIE(InfoExtractor):
def _real_extract(self, url):
display_id, nexx_id = self._match_valid_url(url).groups()
video = self._download_json(
'https://www.funk.net/api/v4.0/videos/' + nexx_id, nexx_id)
return {
'_type': 'url_transparent',
'url': 'nexx:741:' + nexx_id,
'url': f'nexx:741:{nexx_id}',
'ie_key': NexxIE.ie_key(),
'id': nexx_id,
'title': video.get('title'),
'description': video.get('description'),
'duration': int_or_none(video.get('duration')),
'channel_id': str_or_none(video.get('channelId')),
'display_id': display_id,
'tags': video.get('tags'),
'thumbnail': video.get('imageUrlLandscape'),
}

View file

@ -66,7 +66,7 @@ def _entries(self, file_id):
query_params = {
'contentId': file_id,
'token': self._TOKEN,
'websiteToken': '7fd94ds12fds4', # From https://gofile.io/dist/js/alljs.js
'wt': '4fd6sg89d7s6', # From https://gofile.io/dist/js/alljs.js
}
password = self.get_param('videopassword')
if password:

View file

@ -40,6 +40,22 @@ class GoPlayIE(InfoExtractor):
'title': 'A Family for the Holidays',
},
'skip': 'This video is only available for registered users'
}, {
'url': 'https://www.goplay.be/video/de-mol/de-mol-s11/de-mol-s11-aflevering-1#autoplay',
'info_dict': {
'id': '03eb8f2f-153e-41cb-9805-0d3a29dab656',
'ext': 'mp4',
'title': 'S11 - Aflevering 1',
'episode': 'Episode 1',
'series': 'De Mol',
'season_number': 11,
'episode_number': 1,
'season': 'Season 11'
},
'params': {
'skip_download': True
},
'skip': 'This video is only available for registered users'
}]
_id_token = None
@ -77,16 +93,39 @@ def _real_extract(self, url):
api = self._download_json(
f'https://api.goplay.be/web/v1/videos/long-form/{video_id}',
video_id, headers={'Authorization': 'Bearer %s' % self._id_token})
video_id, headers={
'Authorization': 'Bearer %s' % self._id_token,
**self.geo_verification_headers(),
})
formats, subs = self._extract_m3u8_formats_and_subtitles(
api['manifestUrls']['hls'], video_id, ext='mp4', m3u8_id='HLS')
if 'manifestUrls' in api:
formats, subtitles = self._extract_m3u8_formats_and_subtitles(
api['manifestUrls']['hls'], video_id, ext='mp4', m3u8_id='HLS')
else:
if 'ssai' not in api:
raise ExtractorError('expecting Google SSAI stream')
ssai_content_source_id = api['ssai']['contentSourceID']
ssai_video_id = api['ssai']['videoID']
dai = self._download_json(
f'https://dai.google.com/ondemand/dash/content/{ssai_content_source_id}/vid/{ssai_video_id}/streams',
video_id, data=b'{"api-key":"null"}',
headers={'content-type': 'application/json'})
periods = self._extract_mpd_periods(dai['stream_manifest'], video_id)
# skip pre-roll and mid-roll ads
periods = [p for p in periods if '-ad-' not in p['id']]
formats, subtitles = self._merge_mpd_periods(periods)
info_dict.update({
'id': video_id,
'formats': formats,
'subtitles': subtitles,
})
return info_dict

View file

@ -13,7 +13,7 @@ class LeFigaroVideoEmbedIE(InfoExtractor):
_TESTS = [{
'url': 'https://video.lefigaro.fr/embed/figaro/video/les-francais-ne-veulent-ils-plus-travailler-suivez-en-direct-le-club-le-figaro-idees/',
'md5': 'e94de44cd80818084352fcf8de1ce82c',
'md5': 'a0c3069b7e4c4526abf0053a7713f56f',
'info_dict': {
'id': 'g9j7Eovo',
'title': 'Les Français ne veulent-ils plus travailler ? Retrouvez Le Club Le Figaro Idées',
@ -26,7 +26,7 @@ class LeFigaroVideoEmbedIE(InfoExtractor):
},
}, {
'url': 'https://video.lefigaro.fr/embed/figaro/video/intelligence-artificielle-faut-il-sen-mefier/',
'md5': '0b3f10332b812034b3a3eda1ef877c5f',
'md5': '319c662943dd777bab835cae1e2d73a5',
'info_dict': {
'id': 'LeAgybyc',
'title': 'Intelligence artificielle : faut-il sen méfier ?',
@ -41,7 +41,7 @@ class LeFigaroVideoEmbedIE(InfoExtractor):
_WEBPAGE_TESTS = [{
'url': 'https://video.lefigaro.fr/figaro/video/suivez-en-direct-le-club-le-figaro-international-avec-philippe-gelie-9/',
'md5': '3972ddf2d5f8b98699f191687258e2f9',
'md5': '6289f9489efb969e38245f31721596fe',
'info_dict': {
'id': 'QChnbPYA',
'title': 'Où en est le couple franco-allemand ? Retrouvez Le Club Le Figaro International',
@ -55,7 +55,7 @@ class LeFigaroVideoEmbedIE(InfoExtractor):
},
}, {
'url': 'https://video.lefigaro.fr/figaro/video/la-philosophe-nathalie-sarthou-lajus-est-linvitee-du-figaro-live/',
'md5': '3ac0a0769546ee6be41ab52caea5d9a9',
'md5': 'f6df814cae53e85937621599d2967520',
'info_dict': {
'id': 'QJzqoNbf',
'title': 'La philosophe Nathalie Sarthou-Lajus est linvitée du Figaro Live',
@ -73,7 +73,8 @@ def _real_extract(self, url):
display_id = self._match_id(url)
webpage = self._download_webpage(url, display_id)
player_data = self._search_nextjs_data(webpage, display_id)['props']['pageProps']['pageData']['playerData']
player_data = self._search_nextjs_data(
webpage, display_id)['props']['pageProps']['initialProps']['pageData']['playerData']
return self.url_result(
f'jwplatform:{player_data["videoId"]}', title=player_data.get('title'),

View file

@ -3,16 +3,15 @@
from .common import InfoExtractor
from ..utils import (
clean_html,
extract_attributes,
ExtractorError,
extract_attributes,
float_or_none,
get_element_by_class,
int_or_none,
srt_subtitles_timecode,
strip_or_none,
mimetype2ext,
traverse_obj,
try_get,
url_or_none,
urlencode_postdata,
urljoin,
)
@ -83,15 +82,29 @@ def _get_video_id(self, video_data, course_slug, video_slug):
class LinkedInIE(LinkedInBaseIE):
_VALID_URL = r'https?://(?:www\.)?linkedin\.com/posts/.+?(?P<id>\d+)'
_VALID_URL = r'https?://(?:www\.)?linkedin\.com/posts/[^/?#]+-(?P<id>\d+)-\w{4}/?(?:[?#]|$)'
_TESTS = [{
'url': 'https://www.linkedin.com/posts/mishalkhawaja_sendinblueviews-toronto-digitalmarketing-ugcPost-6850898786781339649-mM20',
'info_dict': {
'id': '6850898786781339649',
'ext': 'mp4',
'title': 'Mishal K. on LinkedIn: #sendinblueviews #toronto #digitalmarketing',
'description': 'md5:be125430bab1c574f16aeb186a4d5b19',
'creator': 'Mishal K.'
'title': 'Mishal K. on LinkedIn: #sendinblueviews #toronto #digitalmarketing #nowhiring #sendinblue…',
'description': 'md5:2998a31f6f479376dd62831f53a80f71',
'uploader': 'Mishal K.',
'thumbnail': 're:^https?://media.licdn.com/dms/image/.*$',
'like_count': int
},
}, {
'url': 'https://www.linkedin.com/posts/the-mathworks_2_what-is-mathworks-cloud-center-activity-7151241570371948544-4Gu7',
'info_dict': {
'id': '7151241570371948544',
'ext': 'mp4',
'title': 'MathWorks on LinkedIn: What Is MathWorks Cloud Center?',
'description': 'md5:95f9d4eeb6337882fb47eefe13d7a40c',
'uploader': 'MathWorks',
'thumbnail': 're:^https?://media.licdn.com/dms/image/.*$',
'like_count': int,
'subtitles': 'mincount:1'
},
}]
@ -99,26 +112,30 @@ def _real_extract(self, url):
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
title = self._html_extract_title(webpage)
description = clean_html(get_element_by_class('share-update-card__update-text', webpage))
like_count = int_or_none(get_element_by_class('social-counts-reactions__social-counts-numRections', webpage))
creator = strip_or_none(clean_html(get_element_by_class('comment__actor-name', webpage)))
sources = self._parse_json(extract_attributes(self._search_regex(r'(<video[^>]+>)', webpage, 'video'))['data-sources'], video_id)
video_attrs = extract_attributes(self._search_regex(r'(<video[^>]+>)', webpage, 'video'))
sources = self._parse_json(video_attrs['data-sources'], video_id)
formats = [{
'url': source['src'],
'ext': mimetype2ext(source.get('type')),
'tbr': float_or_none(source.get('data-bitrate'), scale=1000),
} for source in sources]
subtitles = {'en': [{
'url': video_attrs['data-captions-url'],
'ext': 'vtt',
}]} if url_or_none(video_attrs.get('data-captions-url')) else {}
return {
'id': video_id,
'formats': formats,
'title': title,
'like_count': like_count,
'creator': creator,
'title': self._og_search_title(webpage, default=None) or self._html_extract_title(webpage),
'like_count': int_or_none(self._search_regex(
r'\bdata-num-reactions="(\d+)"', webpage, 'reactions', default=None)),
'uploader': traverse_obj(
self._yield_json_ld(webpage, video_id),
(lambda _, v: v['@type'] == 'SocialMediaPosting', 'author', 'name', {str}), get_all=False),
'thumbnail': self._og_search_thumbnail(webpage),
'description': description,
'description': self._og_search_description(webpage, default=None),
'subtitles': subtitles,
}

282
yt_dlp/extractor/lsm.py Normal file
View file

@ -0,0 +1,282 @@
import re
import urllib.parse
from .common import InfoExtractor
from ..utils import (
ExtractorError,
determine_ext,
int_or_none,
js_to_json,
parse_iso8601,
parse_qs,
str_or_none,
url_or_none,
urljoin,
)
from ..utils.traversal import traverse_obj
class LSMLREmbedIE(InfoExtractor):
_VALID_URL = r'''(?x)
https?://(?:
(?:latvijasradio|lr1|lr2|klasika|lr4|naba|radioteatris)\.lsm|
pieci
)\.lv/[^/?#]+/(?:
pleijeris|embed
)/?\?(?:[^#]+&)?(?:show|id)=(?P<id>\d+)'''
_TESTS = [{
'url': 'https://latvijasradio.lsm.lv/lv/embed/?theme=black&size=16x9&showCaptions=0&id=183522',
'md5': '719b33875cd1429846eeeaeec6df2830',
'info_dict': {
'id': 'a342781',
'ext': 'mp3',
'duration': 1823,
'title': '#138 Nepilnīgā kompensējamo zāļu sistēma pat mēnešiem dzenā pacientus pa aptiekām',
'thumbnail': 'https://pic.latvijasradio.lv/public/assets/media/9/d/gallery_fd4675ac.jpg',
}
}, {
'url': 'https://radioteatris.lsm.lv/lv/embed/?id=&show=1270&theme=white&size=16x9',
'info_dict': {
'id': '1270',
},
'playlist_count': 3,
'playlist': [{
'md5': '2e61b6eceff00d14d57fdbbe6ab24cac',
'info_dict': {
'id': 'a297397',
'ext': 'mp3',
'title': 'Eriks Emanuels Šmits "Pilāta evaņģēlijs". 1. daļa',
'thumbnail': 'https://radioteatris.lsm.lv/public/assets/shows/62f131ae81e3c.jpg',
'duration': 3300,
},
}],
}, {
'url': 'https://radioteatris.lsm.lv/lv/embed/?id=&show=1269&theme=white&size=16x9',
'md5': '24810d4a961da2295d9860afdcaf4f5a',
'info_dict': {
'id': 'a230690',
'ext': 'mp3',
'title': 'Jens Ahlboms "Spārni". Radioizrāde ar Mārtiņa Freimaņa mūziku',
'thumbnail': 'https://radioteatris.lsm.lv/public/assets/shows/62f13023a457c.jpg',
'duration': 1788,
}
}, {
'url': 'https://lr1.lsm.lv/lv/embed/?id=166557&show=0&theme=white&size=16x9',
'info_dict': {
'id': '166557',
},
'playlist_count': 2,
'playlist': [{
'md5': '6a8b0927572f443f09c6e50a3ad65f2d',
'info_dict': {
'id': 'a303104',
'ext': 'mp3',
'thumbnail': 'https://pic.latvijasradio.lv/public/assets/media/c/5/gallery_a83ad2c2.jpg',
'title': 'Krustpunktā Lielā intervija: Valsts prezidents Egils Levits',
'duration': 3222,
},
}, {
'md5': '5d5e191e718b7644e5118b7b4e093a6d',
'info_dict': {
'id': 'v303104',
'ext': 'mp4',
'thumbnail': 'https://pic.latvijasradio.lv/public/assets/media/c/5/gallery_a83ad2c2.jpg',
'title': 'Krustpunktā Lielā intervija: Valsts prezidents Egils Levits - Video Version',
'duration': 3222,
},
}],
}, {
'url': 'https://lr1.lsm.lv/lv/embed/?id=183522&show=0&theme=white&size=16x9',
'only_matching': True,
}, {
'url': 'https://lr2.lsm.lv/lv/embed/?id=182126&show=0&theme=white&size=16x9',
'only_matching': True,
}, {
'url': 'https://klasika.lsm.lv/lv/embed/?id=110806&show=0&theme=white&size=16x9',
'only_matching': True,
}, {
'url': 'https://lr4.lsm.lv/lv/embed/?id=184282&show=0&theme=white&size=16x9',
'only_matching': True,
}, {
'url': 'https://pieci.lv/lv/embed/?id=168896&show=0&theme=white&size=16x9',
'only_matching': True,
}, {
'url': 'https://naba.lsm.lv/lv/embed/?id=182901&show=0&theme=white&size=16x9',
'only_matching': True,
}, {
'url': 'https://radioteatris.lsm.lv/lv/embed/?id=176439&show=0&theme=white&size=16x9',
'only_matching': True,
}, {
'url': 'https://lr1.lsm.lv/lv/pleijeris/?embed=0&id=48205&time=00%3A00&idx=0',
'only_matching': True,
}]
def _real_extract(self, url):
query = parse_qs(url)
video_id = traverse_obj(query, (
('show', 'id'), 0, {int_or_none}, {lambda x: x or None}, {str_or_none}), get_all=False)
webpage = self._download_webpage(url, video_id)
player_data, media_data = self._search_regex(
r'LR\.audio\.Player\s*\([^{]*(?P<player>\{.*?\}),(?P<media>\{.*\})\);',
webpage, 'player json', group=('player', 'media'))
player_json = self._parse_json(
player_data, video_id, transform_source=js_to_json, fatal=False) or {}
media_json = self._parse_json(media_data, video_id, transform_source=js_to_json)
entries = []
for item in traverse_obj(media_json, (('audio', 'video'), lambda _, v: v['id'])):
formats = []
for source_url in traverse_obj(item, ('sources', ..., 'file', {url_or_none})):
if determine_ext(source_url) == 'm3u8':
formats.extend(self._extract_m3u8_formats(source_url, video_id, fatal=False))
else:
formats.append({'url': source_url})
id_ = item['id']
title = item.get('title')
if id_.startswith('v') and not title:
title = traverse_obj(
media_json, ('audio', lambda _, v: v['id'][1:] == id_[1:], 'title',
{lambda x: x and f'{x} - Video Version'}), get_all=False)
entries.append({
'formats': formats,
'thumbnail': urljoin(url, player_json.get('poster')),
'id': id_,
'title': title,
'duration': traverse_obj(item, ('duration', {int_or_none})),
})
if len(entries) == 1:
return entries[0]
return self.playlist_result(entries, video_id)
class LSMLTVEmbedIE(InfoExtractor):
_VALID_URL = r'https?://ltv\.lsm\.lv/embed\?(?:[^#]+&)?c=(?P<id>[^#&]+)'
_TESTS = [{
'url': 'https://ltv.lsm.lv/embed?c=eyJpdiI6IjQzbHVUeHAyaDJiamFjcjdSUUFKdnc9PSIsInZhbHVlIjoiMHl3SnJNRmd2TmFIdnZwOGtGUUpzODFzUEZ4SVVsN2xoRjliSW9vckUyMWZIWG8vbWVzaFFkY0lhNmRjbjRpaCIsIm1hYyI6ImMzNjdhMzFhNTFhZmY1ZmE0NWI5YmFjZGI1YmJiNGEyNjgzNDM4MjUzMWEwM2FmMDMyZDMwYWM1MDFjZmM5MGIiLCJ0YWciOiIifQ==',
'md5': '64f72a360ca530d5ed89c77646c9eee5',
'info_dict': {
'id': '46k_d23-6000-105',
'ext': 'mp4',
'timestamp': 1700589151,
'duration': 1442,
'upload_date': '20231121',
'title': 'D23-6000-105_cetstud',
'thumbnail': 'https://store.cloudycdn.services/tmsp00060/assets/media/660858/placeholder1700589200.jpg',
}
}, {
'url': 'https://ltv.lsm.lv/embed?enablesdkjs=1&c=eyJpdiI6IncwVzZmUFk2MU12enVWK1I3SUcwQ1E9PSIsInZhbHVlIjoid3FhV29vamc3T2sxL1RaRmJ5Rm1GTXozU0o2dVczdUtLK0cwZEZJMDQ2a3ZIRG5DK2pneGlnbktBQy9uazVleHN6VXhxdWIweWNvcHRDSnlISlNYOHlVZ1lpcTUrcWZSTUZPQW14TVdkMW9aOUtRWVNDcFF4eWpHNGcrT0VZbUNFQStKQk91cGpndW9FVjJIa0lpbkh3PT0iLCJtYWMiOiIyZGI1NDJlMWRlM2QyMGNhOGEwYTM2MmNlN2JlOGRhY2QyYjdkMmEzN2RlOTEzYTVkNzI1ODlhZDlhZjU4MjQ2IiwidGFnIjoiIn0=',
'md5': 'a1711e190fe680fdb68fd8413b378e87',
'info_dict': {
'id': 'wUnFArIPDSY',
'ext': 'mp4',
'uploader': 'LTV_16plus',
'release_date': '20220514',
'channel_url': 'https://www.youtube.com/channel/UCNMrnafwXD2XKeeQOyfkFCw',
'view_count': int,
'availability': 'public',
'thumbnail': 'https://i.ytimg.com/vi/wUnFArIPDSY/maxresdefault.jpg',
'release_timestamp': 1652544074,
'title': 'EIROVĪZIJA SALĀTOS',
'live_status': 'was_live',
'uploader_id': '@LTV16plus',
'comment_count': int,
'channel_id': 'UCNMrnafwXD2XKeeQOyfkFCw',
'channel_follower_count': int,
'categories': ['Entertainment'],
'duration': 5269,
'upload_date': '20220514',
'age_limit': 0,
'channel': 'LTV_16plus',
'playable_in_embed': True,
'tags': [],
'uploader_url': 'https://www.youtube.com/@LTV16plus',
'like_count': int,
'description': 'md5:7ff0c42ba971e3c13e4b8a2ff03b70b5',
}
}]
def _real_extract(self, url):
video_id = urllib.parse.unquote(self._match_id(url))
webpage = self._download_webpage(url, video_id)
data = self._search_json(
r'window\.ltvEmbedPayload\s*=', webpage, 'embed json', video_id)
embed_type = traverse_obj(data, ('source', 'name', {str}))
if embed_type == 'telia':
ie_key = 'CloudyCDN'
embed_url = traverse_obj(data, ('source', 'embed_url', {url_or_none}))
elif embed_type == 'youtube':
ie_key = 'Youtube'
embed_url = traverse_obj(data, ('source', 'id', {str}))
else:
raise ExtractorError(f'Unsupported embed type {embed_type!r}')
return self.url_result(
embed_url, ie_key, video_id, **traverse_obj(data, {
'title': ('parentInfo', 'title'),
'duration': ('parentInfo', 'duration', {int_or_none}),
'thumbnail': ('source', 'poster', {url_or_none}),
}))
class LSMReplayIE(InfoExtractor):
_VALID_URL = r'https?://replay\.lsm\.lv/[^/?#]+/(?:ieraksts|statja)/[^/?#]+/(?P<id>\d+)'
_TESTS = [{
'url': 'https://replay.lsm.lv/lv/ieraksts/ltv/311130/4-studija-zolitudes-tragedija-un-incupes-stacija',
'md5': '64f72a360ca530d5ed89c77646c9eee5',
'info_dict': {
'id': '46k_d23-6000-105',
'ext': 'mp4',
'timestamp': 1700586300,
'description': 'md5:0f1b14798cc39e1ae578bd0eb268f759',
'duration': 1442,
'upload_date': '20231121',
'title': '4. studija. Zolitūdes traģēdija un Inčupes stacija',
'thumbnail': 'https://ltv.lsm.lv/storage/media/8/7/large/5/1f9604e1.jpg',
}
}, {
'url': 'https://replay.lsm.lv/lv/ieraksts/lr/183522/138-nepilniga-kompensejamo-zalu-sistema-pat-menesiem-dzena-pacientus-pa-aptiekam',
'md5': '719b33875cd1429846eeeaeec6df2830',
'info_dict': {
'id': 'a342781',
'ext': 'mp3',
'duration': 1823,
'title': '#138 Nepilnīgā kompensējamo zāļu sistēma pat mēnešiem dzenā pacientus pa aptiekām',
'thumbnail': 'https://pic.latvijasradio.lv/public/assets/media/9/d/large_fd4675ac.jpg',
'upload_date': '20231102',
'timestamp': 1698921060,
'description': 'md5:7bac3b2dd41e44325032943251c357b1',
}
}, {
'url': 'https://replay.lsm.lv/ru/statja/ltv/311130/4-studija-zolitudes-tragedija-un-incupes-stacija',
'only_matching': True,
}]
def _fix_nuxt_data(self, webpage):
return re.sub(r'Object\.create\(null(?:,(\{.+\}))?\)', lambda m: m.group(1) or 'null', webpage)
def _real_extract(self, url):
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
data = self._search_nuxt_data(
self._fix_nuxt_data(webpage), video_id, context_name='__REPLAY__')
return {
'_type': 'url_transparent',
'id': video_id,
**traverse_obj(data, {
'url': ('playback', 'service', 'url', {url_or_none}),
'title': ('mediaItem', 'title'),
'description': ('mediaItem', ('lead', 'body')),
'duration': ('mediaItem', 'duration', {int_or_none}),
'timestamp': ('mediaItem', 'aired_at', {parse_iso8601}),
'thumbnail': ('mediaItem', 'largeThumbnail', {url_or_none}),
}, get_all=False),
}

View file

@ -28,12 +28,24 @@ class MagellanTVIE(InfoExtractor):
'tags': ['Investigation', 'True Crime', 'Justice', 'Europe'],
},
'params': {'skip_download': 'm3u8'},
}, {
'url': 'https://www.magellantv.com/watch/celebration-nation',
'info_dict': {
'id': 'celebration-nation',
'ext': 'mp4',
'tags': ['Art & Culture', 'Human Interest', 'Anthropology', 'China', 'History'],
'duration': 2640.0,
'title': 'Ancestors',
},
'params': {'skip_download': 'm3u8'},
}]
def _real_extract(self, url):
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
data = self._search_nextjs_data(webpage, video_id)['props']['pageProps']['reactContext']['video']['detail']
data = traverse_obj(self._search_nextjs_data(webpage, video_id), (
'props', 'pageProps', 'reactContext',
(('video', 'detail'), ('series', 'currentEpisode')), {dict}), get_all=False)
formats, subtitles = self._extract_m3u8_formats_and_subtitles(data['jwpVideoUrl'], video_id)
return {

View file

@ -8,7 +8,8 @@
float_or_none,
int_or_none,
str_or_none,
traverse_obj
traverse_obj,
update_url_query,
)
@ -16,7 +17,7 @@ class MedalTVIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?medal\.tv/games/[^/?#&]+/clips/(?P<id>[^/?#&]+)'
_TESTS = [{
'url': 'https://medal.tv/games/valorant/clips/jTBFnLKdLy15K',
'md5': '6930f8972914b6b9fdc2bb3918098ba0',
'md5': '03e4911fdcf7fce563090705c2e79267',
'info_dict': {
'id': 'jTBFnLKdLy15K',
'ext': 'mp4',
@ -33,8 +34,8 @@ class MedalTVIE(InfoExtractor):
'duration': 13,
}
}, {
'url': 'https://medal.tv/games/cod%20cold%20war/clips/2mA60jWAGQCBH',
'md5': '3d19d426fe0b2d91c26e412684e66a06',
'url': 'https://medal.tv/games/cod-cold-war/clips/2mA60jWAGQCBH',
'md5': 'fc7a3e4552ae8993c1c4006db46be447',
'info_dict': {
'id': '2mA60jWAGQCBH',
'ext': 'mp4',
@ -52,7 +53,7 @@ class MedalTVIE(InfoExtractor):
'duration': 23,
}
}, {
'url': 'https://medal.tv/games/cod%20cold%20war/clips/2um24TWdty0NA',
'url': 'https://medal.tv/games/cod-cold-war/clips/2um24TWdty0NA',
'md5': 'b6dc76b78195fff0b4f8bf4a33ec2148',
'info_dict': {
'id': '2um24TWdty0NA',
@ -81,7 +82,7 @@ class MedalTVIE(InfoExtractor):
def _real_extract(self, url):
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
webpage = self._download_webpage(update_url_query(url, {'mobilebypass': 'true'}), video_id)
hydration_data = self._search_json(
r'<script[^>]*>[^<]*\bhydrationData\s*=', webpage,

View file

@ -177,6 +177,7 @@ def _real_extract(self, url):
class MotherlessPaginatedIE(InfoExtractor):
_EXTRA_QUERY = {}
_PAGE_SIZE = 60
def _correct_path(self, url, item_id):
@ -199,7 +200,7 @@ def _real_extract(self, url):
def get_page(idx):
page = idx + 1
current_page = webpage if not idx else self._download_webpage(
real_url, item_id, note=f'Downloading page {page}', query={'page': page})
real_url, item_id, note=f'Downloading page {page}', query={'page': page, **self._EXTRA_QUERY})
yield from self._extract_entries(current_page, real_url)
return self.playlist_result(
@ -213,7 +214,7 @@ class MotherlessGroupIE(MotherlessPaginatedIE):
'url': 'http://motherless.com/gv/movie_scenes',
'info_dict': {
'id': 'movie_scenes',
'title': 'Movie Scenes',
'title': 'Movie Scenes - Videos - Hot and sexy scenes from "regular" movies... Beautiful actresses fully',
},
'playlist_mincount': 540,
}, {
@ -244,7 +245,7 @@ class MotherlessGalleryIE(MotherlessPaginatedIE):
'id': '338999F',
'title': 'Random',
},
'playlist_mincount': 190,
'playlist_mincount': 171,
}, {
'url': 'https://motherless.com/GVABD6213',
'info_dict': {
@ -270,3 +271,27 @@ class MotherlessGalleryIE(MotherlessPaginatedIE):
def _correct_path(self, url, item_id):
return urllib.parse.urljoin(url, f'/GV{item_id}')
class MotherlessUploaderIE(MotherlessPaginatedIE):
_VALID_URL = r'https?://(?:www\.)?motherless\.com/u/(?P<id>\w+)/?(?:$|[?#])'
_TESTS = [{
'url': 'https://motherless.com/u/Mrgo4hrs2023',
'info_dict': {
'id': 'Mrgo4hrs2023',
'title': "Mrgo4hrs2023's Uploads - Videos",
},
'playlist_mincount': 32,
}, {
'url': 'https://motherless.com/u/Happy_couple?t=v',
'info_dict': {
'id': 'Happy_couple',
'title': "Happy_couple's Uploads - Videos",
},
'playlist_mincount': 8,
}]
_EXTRA_QUERY = {'t': 'v'}
def _correct_path(self, url, item_id):
return urllib.parse.urljoin(url, f'/u/{item_id}?t=v')

171
yt_dlp/extractor/mx3.py Normal file
View file

@ -0,0 +1,171 @@
import re
from .common import InfoExtractor
from ..networking import HEADRequest
from ..utils import (
get_element_by_class,
int_or_none,
try_call,
url_or_none,
urlhandle_detect_ext,
)
from ..utils.traversal import traverse_obj
class Mx3BaseIE(InfoExtractor):
_VALID_URL_TMPL = r'https?://(?:www\.)?%s/t/(?P<id>\w+)'
_FORMATS = [{
'url': 'player_asset',
'format_id': 'default',
'quality': 0,
}, {
'url': 'player_asset?quality=hd',
'format_id': 'hd',
'quality': 1,
}, {
'url': 'download',
'format_id': 'download',
'quality': 2,
}, {
'url': 'player_asset?quality=source',
'format_id': 'source',
'quality': 2,
}]
def _extract_formats(self, track_id):
formats = []
for fmt in self._FORMATS:
format_url = f'https://{self._DOMAIN}/tracks/{track_id}/{fmt["url"]}'
urlh = self._request_webpage(
HEADRequest(format_url), track_id, fatal=False, expected_status=404,
note=f'Checking for format {fmt["format_id"]}')
if urlh and urlh.status == 200:
formats.append({
**fmt,
'url': format_url,
'ext': urlhandle_detect_ext(urlh),
'filesize': int_or_none(urlh.headers.get('Content-Length')),
})
return formats
def _real_extract(self, url):
track_id = self._match_id(url)
webpage = self._download_webpage(url, track_id)
more_info = get_element_by_class('single-more-info', webpage)
data = self._download_json(f'https://{self._DOMAIN}/t/{track_id}.json', track_id, fatal=False)
def get_info_field(name):
return self._html_search_regex(
rf'<dt[^>]*>\s*{name}\s*</dt>\s*<dd[^>]*>(.*?)</dd>',
more_info, name, default=None, flags=re.DOTALL)
return {
'id': track_id,
'formats': self._extract_formats(track_id),
'genre': self._html_search_regex(
r'<div\b[^>]+class="single-band-genre"[^>]*>([^<]+)</div>', webpage, 'genre', default=None),
'release_year': int_or_none(get_info_field('Year of creation')),
'description': get_info_field('Description'),
'tags': try_call(lambda: get_info_field('Tag').split(', '), list),
**traverse_obj(data, {
'title': ('title', {str}),
'artist': (('performer_name', 'artist'), {str}),
'album_artist': ('artist', {str}),
'composer': ('composer_name', {str}),
'thumbnail': (('picture_url_xlarge', 'picture_url'), {url_or_none}),
}, get_all=False),
}
class Mx3IE(Mx3BaseIE):
_DOMAIN = 'mx3.ch'
_VALID_URL = Mx3BaseIE._VALID_URL_TMPL % re.escape(_DOMAIN)
_TESTS = [{
'url': 'https://mx3.ch/t/1Cru',
'md5': '7ba09e9826b4447d4e1ce9d69e0e295f',
'info_dict': {
'id': '1Cru',
'ext': 'wav',
'artist': 'Godina',
'album_artist': 'Tortue Tortue',
'composer': 'Olivier Godinat',
'genre': 'Rock',
'thumbnail': 'https://mx3.ch/pictures/mx3/file/0101/4643/square_xlarge/1-s-envoler-1.jpg?1630272813',
'title': "S'envoler",
'release_year': 2021,
'tags': [],
}
}, {
'url': 'https://mx3.ch/t/1LIY',
'md5': '48293cb908342547827f963a5a2e9118',
'info_dict': {
'id': '1LIY',
'ext': 'mov',
'artist': 'Tania Kimfumu',
'album_artist': 'The Broots',
'composer': 'Emmanuel Diserens',
'genre': 'Electro',
'thumbnail': 'https://mx3.ch/pictures/mx3/file/0110/0003/video_xlarge/frame_0000.png?1686963670',
'title': 'The Broots-Larytta remix "Begging For Help"',
'release_year': 2023,
'tags': ['the broots', 'cassata records', 'larytta'],
'description': '"Begging for Help" Larytta Remix Official Video\nRealized By Kali Donkilie in 2023',
}
}, {
'url': 'https://mx3.ch/t/1C6E',
'md5': '1afcd578493ddb8e5008e94bb6d97e25',
'info_dict': {
'id': '1C6E',
'ext': 'wav',
'artist': 'Alien Bubblegum',
'album_artist': 'Alien Bubblegum',
'composer': 'Alien Bubblegum',
'genre': 'Punk',
'thumbnail': 'https://mx3.ch/pictures/mx3/file/0101/1551/square_xlarge/pandora-s-box-cover-with-title.png?1627054733',
'title': 'Wide Awake',
'release_year': 2021,
'tags': ['alien bubblegum', 'bubblegum', 'alien', 'pop punk', 'poppunk'],
}
}]
class Mx3NeoIE(Mx3BaseIE):
_DOMAIN = 'neo.mx3.ch'
_VALID_URL = Mx3BaseIE._VALID_URL_TMPL % re.escape(_DOMAIN)
_TESTS = [{
'url': 'https://neo.mx3.ch/t/1hpd',
'md5': '6d9986bbae5cac3296ec8813bf965eb2',
'info_dict': {
'id': '1hpd',
'ext': 'wav',
'artist': 'Baptiste Lopez',
'album_artist': 'Kammerorchester Basel',
'composer': 'Jannik Giger',
'genre': 'Composition, Orchestra',
'title': 'Troisième œil. Für Kammerorchester (2023)',
'thumbnail': 'https://neo.mx3.ch/pictures/neo/file/0000/0241/square_xlarge/kammerorchester-basel-group-photo-2_c_-lukasz-rajchert.jpg?1560341252',
'release_year': 2023,
'tags': [],
}
}]
class Mx3VolksmusikIE(Mx3BaseIE):
_DOMAIN = 'volksmusik.mx3.ch'
_VALID_URL = Mx3BaseIE._VALID_URL_TMPL % re.escape(_DOMAIN)
_TESTS = [{
'url': 'https://volksmusik.mx3.ch/t/Zx',
'md5': 'dd967a7b0c1ef898f3e072cf9c2eae3c',
'info_dict': {
'id': 'Zx',
'ext': 'mp3',
'artist': 'Ländlerkapelle GrischArt',
'album_artist': 'Ländlerkapelle GrischArt',
'composer': 'Urs Glauser',
'genre': 'Instrumental, Graubünden',
'title': 'Chämilouf',
'thumbnail': 'https://volksmusik.mx3.ch/pictures/vxm/file/0000/3815/square_xlarge/grischart1.jpg?1450530120',
'release_year': 2012,
'tags': [],
}
}]

View file

@ -1,6 +1,7 @@
import itertools
import json
from .art19 import Art19IE
from .common import InfoExtractor
from ..networking.exceptions import HTTPError
from ..utils import (
@ -112,7 +113,8 @@ def _extract_video_metadata(self, episode):
class NebulaIE(NebulaBaseIE):
_VALID_URL = rf'{_BASE_URL_RE}/videos/(?P<id>[-\w]+)'
IE_NAME = 'nebula:video'
_VALID_URL = rf'{_BASE_URL_RE}/videos/(?P<id>[\w-]+)'
_TESTS = [{
'url': 'https://nebula.tv/videos/that-time-disney-remade-beauty-and-the-beast',
'info_dict': {
@ -236,8 +238,8 @@ def _real_extract(self, url):
class NebulaClassIE(NebulaBaseIE):
IE_NAME = 'nebula:class'
_VALID_URL = rf'{_BASE_URL_RE}/(?P<id>[-\w]+)/(?P<ep>\d+)'
IE_NAME = 'nebula:media'
_VALID_URL = rf'{_BASE_URL_RE}/(?!(?:myshows|library|videos)/)(?P<id>[\w-]+)/(?P<ep>[\w-]+)/?(?:$|[?#])'
_TESTS = [{
'url': 'https://nebula.tv/copyright-for-fun-and-profit/14',
'info_dict': {
@ -253,6 +255,46 @@ class NebulaClassIE(NebulaBaseIE):
'title': 'Photos, Sculpture, and Video',
},
'params': {'skip_download': 'm3u8'},
}, {
'url': 'https://nebula.tv/extremitiespodcast/pyramiden-the-high-arctic-soviet-ghost-town',
'info_dict': {
'ext': 'mp3',
'id': '018f65f0-0033-4021-8f87-2d132beb19aa',
'description': 'md5:05d2b23ab780c955e2511a2b9127acff',
'series_id': '335e8159-d663-491a-888f-1732285706ac',
'modified_timestamp': 1599091504,
'episode_id': '018f65f0-0033-4021-8f87-2d132beb19aa',
'series': 'Extremities',
'modified_date': '20200903',
'upload_date': '20200902',
'title': 'Pyramiden: The High-Arctic Soviet Ghost Town',
'release_timestamp': 1571237958,
'thumbnail': r're:^https?://content\.production\.cdn\.art19\.com.*\.jpeg$',
'duration': 1546.05714,
'timestamp': 1599085608,
'release_date': '20191016',
},
}, {
'url': 'https://nebula.tv/thelayover/the-layover-episode-1',
'info_dict': {
'ext': 'mp3',
'id': '9d74a762-00bb-45a8-9e8d-9ed47c04a1d0',
'episode_number': 1,
'thumbnail': r're:^https?://content\.production\.cdn\.art19\.com.*\.jpeg$',
'release_date': '20230304',
'modified_date': '20230403',
'series': 'The Layover',
'episode_id': '9d74a762-00bb-45a8-9e8d-9ed47c04a1d0',
'modified_timestamp': 1680554566,
'duration': 3130.46401,
'release_timestamp': 1677943800,
'title': 'The Layover — Episode 1',
'series_id': '874303a5-4900-4626-a4b6-2aacac34466a',
'upload_date': '20230303',
'episode': 'Episode 1',
'timestamp': 1677883672,
'description': 'md5:002cca89258e3bc7c268d5b8c24ba482',
},
}]
def _real_extract(self, url):
@ -268,16 +310,38 @@ def _real_extract(self, url):
metadata = self._call_api(
f'https://content.api.nebula.app/content/{slug}/{episode}/?include=lessons',
slug, note='Fetching video metadata')
return {
**self._extract_video_metadata(metadata),
**self._extract_formats(metadata['id'], slug),
}
slug, note='Fetching class/podcast metadata')
content_type = metadata.get('type')
if content_type == 'lesson':
return {
**self._extract_video_metadata(metadata),
**self._extract_formats(metadata['id'], slug),
}
elif content_type == 'podcast_episode':
episode_url = metadata['episode_url']
if not episode_url and metadata.get('premium'):
self.raise_login_required()
if Art19IE.suitable(episode_url):
return self.url_result(episode_url, Art19IE)
return traverse_obj(metadata, {
'id': ('id', {str}),
'url': ('episode_url', {url_or_none}),
'title': ('title', {str}),
'description': ('description', {str}),
'timestamp': ('published_at', {parse_iso8601}),
'duration': ('duration', {int_or_none}),
'channel_id': ('channel_id', {str}),
'chnanel': ('channel_title', {str}),
'thumbnail': ('assets', 'regular', {url_or_none}),
})
raise ExtractorError(f'Unexpected content type {content_type!r}')
class NebulaSubscriptionsIE(NebulaBaseIE):
IE_NAME = 'nebula:subscriptions'
_VALID_URL = rf'{_BASE_URL_RE}/(?P<id>myshows|library/latest-videos)'
_VALID_URL = rf'{_BASE_URL_RE}/(?P<id>myshows|library/latest-videos)/?(?:$|[?#])'
_TESTS = [{
'url': 'https://nebula.tv/myshows',
'playlist_mincount': 1,
@ -310,7 +374,7 @@ def _real_extract(self, url):
class NebulaChannelIE(NebulaBaseIE):
IE_NAME = 'nebula:channel'
_VALID_URL = rf'{_BASE_URL_RE}/(?!myshows|library|videos/)(?P<id>[-\w]+)/?(?:$|[?#])'
_VALID_URL = rf'{_BASE_URL_RE}/(?!myshows|library|videos)(?P<id>[\w-]+)/?(?:$|[?#])'
_TESTS = [{
'url': 'https://nebula.tv/tom-scott-presents-money',
'info_dict': {
@ -343,6 +407,14 @@ class NebulaChannelIE(NebulaBaseIE):
'description': 'md5:6690248223eed044a9f11cd5a24f9742',
},
'playlist_count': 23,
}, {
'url': 'https://nebula.tv/trussissuespodcast',
'info_dict': {
'id': 'trussissuespodcast',
'title': 'The TLDR News Podcast',
'description': 'md5:a08c4483bc0b705881d3e0199e721385',
},
'playlist_mincount': 80,
}]
def _generate_playlist_entries(self, collection_id, collection_slug):
@ -365,6 +437,17 @@ def _generate_class_entries(self, channel):
lesson.get('share_url') or f'https://nebula.tv/{metadata["class_slug"]}/{metadata["slug"]}',
{'id': lesson['id']}), NebulaClassIE, url_transparent=True, **metadata)
def _generate_podcast_entries(self, collection_id, collection_slug):
next_url = f'https://content.api.nebula.app/podcast_channels/{collection_id}/podcast_episodes/?ordering=-published_at&premium=true'
for page_num in itertools.count(1):
episodes = self._call_api(next_url, collection_slug, note=f'Retrieving podcast page {page_num}')
for episode in traverse_obj(episodes, ('results', lambda _, v: url_or_none(v['share_url']))):
yield self.url_result(episode['share_url'], NebulaClassIE)
next_url = episodes.get('next')
if not next_url:
break
def _real_extract(self, url):
collection_slug = self._match_id(url)
channel = self._call_api(
@ -373,6 +456,8 @@ def _real_extract(self, url):
if channel.get('type') == 'class':
entries = self._generate_class_entries(channel)
elif channel.get('type') == 'podcast_channel':
entries = self._generate_podcast_entries(channel['id'], collection_slug)
else:
entries = self._generate_playlist_entries(channel['id'], collection_slug)

View file

@ -1,33 +1,38 @@
import datetime
from .common import InfoExtractor
from .youtube import YoutubeIE
from ..utils import parse_iso8601, url_or_none
from ..utils.traversal import traverse_obj
class NerdCubedFeedIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?nerdcubed\.co\.uk/feed\.json'
_VALID_URL = r'https?://(?:www\.)?nerdcubed\.co\.uk/?(?:$|[#?])'
_TEST = {
'url': 'http://www.nerdcubed.co.uk/feed.json',
'url': 'http://www.nerdcubed.co.uk/',
'info_dict': {
'id': 'nerdcubed-feed',
'title': 'nerdcubed.co.uk feed',
},
'playlist_mincount': 1300,
'playlist_mincount': 5500,
}
def _extract_video(self, feed_entry):
return self.url_result(
f'https://www.youtube.com/watch?v={feed_entry["id"]}', YoutubeIE,
**traverse_obj(feed_entry, {
'id': ('id', {str}),
'title': ('title', {str}),
'description': ('description', {str}),
'timestamp': ('publishedAt', {parse_iso8601}),
'channel': ('source', 'name', {str}),
'channel_id': ('source', 'id', {str}),
'channel_url': ('source', 'url', {str}),
'thumbnail': ('thumbnail', 'source', {url_or_none}),
}), url_transparent=True)
def _real_extract(self, url):
feed = self._download_json(url, url, 'Downloading NerdCubed JSON feed')
video_id = 'nerdcubed-feed'
feed = self._download_json('https://www.nerdcubed.co.uk/_/cdn/videos.json', video_id)
entries = [{
'_type': 'url',
'title': feed_entry['title'],
'uploader': feed_entry['source']['name'] if feed_entry['source'] else None,
'upload_date': datetime.datetime.strptime(feed_entry['date'], '%Y-%m-%d').strftime('%Y%m%d'),
'url': 'http://www.youtube.com/watch?v=' + feed_entry['youtube_id'],
} for feed_entry in feed]
return {
'_type': 'playlist',
'title': 'nerdcubed.co.uk feed',
'id': 'nerdcubed-feed',
'entries': entries,
}
return self.playlist_result(
map(self._extract_video, traverse_obj(feed, ('videos', lambda _, v: v['id']))),
video_id, 'nerdcubed.co.uk feed')

View file

@ -3,15 +3,15 @@
from .common import InfoExtractor
from ..utils import (
OnDemandPagedList,
clean_html,
extract_attributes,
get_element_by_id,
int_or_none,
parse_count,
parse_duration,
traverse_obj,
unified_timestamp,
OnDemandPagedList,
try_get,
)
@ -263,19 +263,16 @@ class NewgroundsUserIE(InfoExtractor):
def _fetch_page(self, channel_id, url, page):
page += 1
posts_info = self._download_json(
f'{url}/page/{page}', channel_id,
f'{url}?page={page}', channel_id,
note=f'Downloading page {page}', headers={
'Accept': 'application/json, text/javascript, */*; q = 0.01',
'X-Requested-With': 'XMLHttpRequest',
})
sequence = posts_info.get('sequence', [])
for year in sequence:
posts = try_get(posts_info, lambda x: x['years'][str(year)]['items'])
for post in posts:
path, media_id = self._search_regex(
r'<a[^>]+\bhref=["\'][^"\']+((?:portal/view|audio/listen)/(\d+))[^>]+>',
post, 'url', group=(1, 2))
yield self.url_result(f'https://www.newgrounds.com/{path}', NewgroundsIE.ie_key(), media_id)
for post in traverse_obj(posts_info, ('items', ..., ..., {str})):
path, media_id = self._search_regex(
r'<a[^>]+\bhref=["\'][^"\']+((?:portal/view|audio/listen)/(\d+))[^>]+>',
post, 'url', group=(1, 2))
yield self.url_result(f'https://www.newgrounds.com/{path}', NewgroundsIE.ie_key(), media_id)
def _real_extract(self, url):
channel_id = self._match_id(url)

View file

@ -1,10 +1,54 @@
from .common import InfoExtractor
from ..utils import int_or_none
from ..utils import (
int_or_none,
join_nonempty,
merge_dicts,
parse_count,
url_or_none,
urljoin,
)
from ..utils.traversal import traverse_obj
class NFBIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?nfb\.ca/film/(?P<id>[^/?#&]+)'
class NFBBaseIE(InfoExtractor):
_VALID_URL_BASE = r'https?://(?:www\.)?(?P<site>nfb|onf)\.ca'
_GEO_COUNTRIES = ['CA']
def _extract_ep_data(self, webpage, video_id, fatal=False):
return self._search_json(
r'const\s+episodesData\s*=', webpage, 'episode data', video_id,
contains_pattern=r'\[\s*{(?s:.+)}\s*\]', fatal=fatal) or []
def _extract_ep_info(self, data, video_id, slug=None):
info = traverse_obj(data, (lambda _, v: video_id in v['embed_url'], {
'description': ('description', {str}),
'thumbnail': ('thumbnail_url', {url_or_none}),
'uploader': ('data_layer', 'episodeMaker', {str}),
'release_year': ('data_layer', 'episodeYear', {int_or_none}),
'episode': ('data_layer', 'episodeTitle', {str}),
'season': ('data_layer', 'seasonTitle', {str}),
'season_number': ('data_layer', 'seasonTitle', {parse_count}),
'series': ('data_layer', 'seriesTitle', {str}),
}), get_all=False)
return {
**info,
'id': video_id,
'title': join_nonempty('series', 'episode', from_dict=info, delim=' - '),
'episode_number': int_or_none(self._search_regex(
r'[/-]e(?:pisode)?-?(\d+)(?:[/-]|$)', slug or video_id, 'episode number', default=None)),
}
class NFBIE(NFBBaseIE):
IE_NAME = 'nfb'
IE_DESC = 'nfb.ca and onf.ca films and episodes'
_VALID_URL = [
rf'{NFBBaseIE._VALID_URL_BASE}/(?P<type>film)/(?P<id>[^/?#&]+)',
rf'{NFBBaseIE._VALID_URL_BASE}/(?P<type>series?)/(?P<id>[^/?#&]+/s(?:ea|ai)son\d+/episode\d+)',
]
_TESTS = [{
'note': 'NFB film',
'url': 'https://www.nfb.ca/film/trafficopter/',
'info_dict': {
'id': 'trafficopter',
@ -14,29 +58,192 @@ class NFBIE(InfoExtractor):
'thumbnail': r're:^https?://.*\.jpg$',
'uploader': 'Barrie Howells',
'release_year': 1972,
'duration': 600.0,
},
'params': {'skip_download': 'm3u8'},
}, {
'note': 'ONF film',
'url': 'https://www.onf.ca/film/mal-du-siecle/',
'info_dict': {
'id': 'mal-du-siecle',
'ext': 'mp4',
'title': 'Le mal du siècle',
'description': 'md5:1abf774d77569ebe603419f2d344102b',
'thumbnail': r're:^https?://.*\.jpg$',
'uploader': 'Catherine Lepage',
'release_year': 2019,
'duration': 300.0,
},
'params': {'skip_download': 'm3u8'},
}, {
'note': 'NFB episode with English title',
'url': 'https://www.nfb.ca/series/true-north-inside-the-rise-of-toronto-basketball/season1/episode9/',
'info_dict': {
'id': 'true-north-episode9-true-north-finale-making-it',
'ext': 'mp4',
'title': 'True North: Inside the Rise of Toronto Basketball - Finale: Making It',
'description': 'We catch up with each player in the midst of their journey as they reflect on their road ahead.',
'series': 'True North: Inside the Rise of Toronto Basketball',
'release_year': 2018,
'season': 'Season 1',
'season_number': 1,
'episode': 'Finale: Making It',
'episode_number': 9,
'uploader': 'Ryan Sidhoo',
'thumbnail': r're:^https?://.*\.jpg$',
},
'params': {'skip_download': 'm3u8'},
}, {
'note': 'ONF episode with French title',
'url': 'https://www.onf.ca/serie/direction-nord-la-montee-du-basketball-a-toronto/saison1/episode9/',
'info_dict': {
'id': 'direction-nord-episode-9',
'ext': 'mp4',
'title': 'Direction nord La montée du basketball à Toronto - Finale : Réussir',
'description': 'md5:349a57419b71432b97bf6083d92b029d',
'series': 'Direction nord La montée du basketball à Toronto',
'release_year': 2018,
'season': 'Saison 1',
'season_number': 1,
'episode': 'Finale : Réussir',
'episode_number': 9,
'uploader': 'Ryan Sidhoo',
'thumbnail': r're:^https?://.*\.jpg$',
},
'params': {'skip_download': 'm3u8'},
}, {
'note': 'NFB episode with French title (needs geo-bypass)',
'url': 'https://www.nfb.ca/series/etoile-du-nord/saison1/episode1/',
'info_dict': {
'id': 'etoile-du-nord-episode-1-lobservation',
'ext': 'mp4',
'title': 'Étoile du Nord - L\'observation',
'description': 'md5:161a4617260dee3de70f509b2c9dd21b',
'series': 'Étoile du Nord',
'release_year': 2023,
'season': 'Saison 1',
'season_number': 1,
'episode': 'L\'observation',
'episode_number': 1,
'uploader': 'Patrick Bossé',
'thumbnail': r're:^https?://.*\.jpg$',
},
'params': {'skip_download': 'm3u8'},
}, {
'note': 'ONF episode with English title (needs geo-bypass)',
'url': 'https://www.onf.ca/serie/north-star/season1/episode1/',
'info_dict': {
'id': 'north-star-episode-1-observation',
'ext': 'mp4',
'title': 'North Star - Observation',
'description': 'md5:c727f370839d8a817392b9e3f23655c7',
'series': 'North Star',
'release_year': 2023,
'season': 'Season 1',
'season_number': 1,
'episode': 'Observation',
'episode_number': 1,
'uploader': 'Patrick Bossé',
'thumbnail': r're:^https?://.*\.jpg$',
},
'params': {'skip_download': 'm3u8'},
}, {
'note': 'NFB episode with /film/ URL and English title (needs geo-bypass)',
'url': 'https://www.nfb.ca/film/north-star-episode-1-observation/',
'info_dict': {
'id': 'north-star-episode-1-observation',
'ext': 'mp4',
'title': 'North Star - Observation',
'description': 'md5:c727f370839d8a817392b9e3f23655c7',
'series': 'North Star',
'release_year': 2023,
'season': 'Season 1',
'season_number': 1,
'episode': 'Observation',
'episode_number': 1,
'uploader': 'Patrick Bossé',
'thumbnail': r're:^https?://.*\.jpg$',
},
'params': {'skip_download': 'm3u8'},
}, {
'note': 'ONF episode with /film/ URL and French title (needs geo-bypass)',
'url': 'https://www.onf.ca/film/etoile-du-nord-episode-1-lobservation/',
'info_dict': {
'id': 'etoile-du-nord-episode-1-lobservation',
'ext': 'mp4',
'title': 'Étoile du Nord - L\'observation',
'description': 'md5:161a4617260dee3de70f509b2c9dd21b',
'series': 'Étoile du Nord',
'release_year': 2023,
'season': 'Saison 1',
'season_number': 1,
'episode': 'L\'observation',
'episode_number': 1,
'uploader': 'Patrick Bossé',
'thumbnail': r're:^https?://.*\.jpg$',
},
'params': {'skip_download': 'm3u8'},
}, {
'note': 'Season 2 episode w/o episode num in id, extract from json ld',
'url': 'https://www.onf.ca/film/liste-des-choses-qui-existent-saison-2-ours',
'info_dict': {
'id': 'liste-des-choses-qui-existent-saison-2-ours',
'ext': 'mp4',
'title': 'La liste des choses qui existent - L\'ours en peluche',
'description': 'md5:d5e8d8fc5f3a7385a9cf0f509b37e28a',
'series': 'La liste des choses qui existent',
'release_year': 2022,
'season': 'Saison 2',
'season_number': 2,
'episode': 'L\'ours en peluche',
'episode_number': 12,
'uploader': 'Francis Papillon',
'thumbnail': r're:^https?://.*\.jpg$',
},
'params': {'skip_download': 'm3u8'},
}, {
'note': 'NFB film /embed/player/ page',
'url': 'https://www.nfb.ca/film/afterlife/embed/player/',
'info_dict': {
'id': 'afterlife',
'ext': 'mp4',
'title': 'Afterlife',
'description': 'md5:84951394f594f1fb1e62d9c43242fdf5',
'release_year': 1978,
'duration': 420.0,
'uploader': 'Ishu Patel',
'thumbnail': r're:^https?://.*\.jpg$',
},
'params': {'skip_download': 'm3u8'},
}]
def _real_extract(self, url):
video_id = self._match_id(url)
site, type_, slug = self._match_valid_url(url).group('site', 'type', 'id')
# Need to construct the URL since we match /embed/player/ URLs as well
webpage, urlh = self._download_webpage_handle(f'https://www.{site}.ca/{type_}/{slug}/', slug)
# type_ can change from film to serie(s) after redirect; new slug may have episode number
type_, slug = self._match_valid_url(urlh.url).group('type', 'id')
webpage = self._download_webpage('https://www.nfb.ca/film/%s/' % video_id, video_id)
embed_url = urljoin(f'https://www.{site}.ca', self._html_search_regex(
r'<[^>]+\bid=["\']player-iframe["\'][^>]*\bsrc=["\']([^"\']+)', webpage, 'embed url'))
video_id = self._match_id(embed_url) # embed url has unique slug
player = self._download_webpage(embed_url, video_id, 'Downloading player page')
if 'MESSAGE_GEOBLOCKED' in player:
self.raise_geo_restricted(countries=self._GEO_COUNTRIES)
iframe = self._html_search_regex(
r'<[^>]+\bid=["\']player-iframe["\'][^>]*src=["\']([^"\']+)',
webpage, 'iframe', default=None, fatal=True)
if iframe.startswith('/'):
iframe = f'https://www.nfb.ca{iframe}'
formats, subtitles = self._extract_m3u8_formats_and_subtitles(
self._html_search_regex(r'source:\s*\'([^\']+)', player, 'm3u8 url'),
video_id, 'mp4', m3u8_id='hls')
player = self._download_webpage(iframe, video_id)
if dv_source := self._html_search_regex(r'dvSource:\s*\'([^\']+)', player, 'dv', default=None):
fmts, subs = self._extract_m3u8_formats_and_subtitles(
dv_source, video_id, 'mp4', m3u8_id='dv', preference=-2, fatal=False)
for fmt in fmts:
fmt['format_note'] = 'described video'
formats.extend(fmts)
self._merge_subtitles(subs, target=subtitles)
source = self._html_search_regex(
r'source:\s*\'([^\']+)',
player, 'source', default=None, fatal=True)
formats, subtitles = self._extract_m3u8_formats_and_subtitles(source, video_id, ext='mp4')
return {
info = {
'id': video_id,
'title': self._html_search_regex(
r'<[^>]+\bid=["\']titleHeader["\'][^>]*>\s*<h1[^>]*>\s*([^<]+?)\s*</h1>',
@ -45,14 +252,49 @@ def _real_extract(self, url):
r'<[^>]+\bid=["\']tabSynopsis["\'][^>]*>\s*<p[^>]*>\s*([^<]+)',
webpage, 'description', default=None),
'thumbnail': self._html_search_regex(
r'poster:\s*\'([^\']+)',
player, 'thumbnail', default=None),
r'poster:\s*\'([^\']+)', player, 'thumbnail', default=None),
'uploader': self._html_search_regex(
r'<[^>]+\bitemprop=["\']name["\'][^>]*>([^<]+)',
webpage, 'uploader', default=None),
r'<[^>]+\bitemprop=["\']name["\'][^>]*>([^<]+)', webpage, 'uploader', default=None),
'release_year': int_or_none(self._html_search_regex(
r'<[^>]+\bitemprop=["\']datePublished["\'][^>]*>([^<]+)',
webpage, 'release_year', default=None)),
} if type_ == 'film' else self._extract_ep_info(self._extract_ep_data(webpage, video_id, slug), video_id)
return merge_dicts({
'formats': formats,
'subtitles': subtitles,
}
}, info, self._search_json_ld(webpage, video_id, default={}))
class NFBSeriesIE(NFBBaseIE):
IE_NAME = 'nfb:series'
IE_DESC = 'nfb.ca and onf.ca series'
_VALID_URL = rf'{NFBBaseIE._VALID_URL_BASE}/(?P<type>series?)/(?P<id>[^/?#&]+)/?(?:[?#]|$)'
_TESTS = [{
'url': 'https://www.nfb.ca/series/true-north-inside-the-rise-of-toronto-basketball/',
'playlist_mincount': 9,
'info_dict': {
'id': 'true-north-inside-the-rise-of-toronto-basketball',
},
}, {
'url': 'https://www.onf.ca/serie/la-liste-des-choses-qui-existent-serie/',
'playlist_mincount': 26,
'info_dict': {
'id': 'la-liste-des-choses-qui-existent-serie',
},
}]
def _entries(self, episodes):
for episode in traverse_obj(episodes, lambda _, v: NFBIE.suitable(v['embed_url'])):
mobj = NFBIE._match_valid_url(episode['embed_url'])
yield self.url_result(
mobj[0], NFBIE, **self._extract_ep_info([episode], mobj.group('id')))
def _real_extract(self, url):
site, type_, series_id = self._match_valid_url(url).group('site', 'type', 'id')
season_path = 'saison' if type_ == 'serie' else 'season'
webpage = self._download_webpage(
f'https://www.{site}.ca/{type_}/{series_id}/{season_path}1/episode1', series_id)
episodes = self._extract_ep_data(webpage, series_id, fatal=True)
return self.playlist_result(self._entries(episodes), series_id)

View file

@ -9,6 +9,7 @@
join_nonempty,
parse_duration,
traverse_obj,
try_call,
unescapeHTML,
unified_timestamp,
url_or_none,
@ -473,22 +474,21 @@ class NhkRadiruIE(InfoExtractor):
IE_DESC = 'NHK らじる (Radiru/Rajiru)'
_VALID_URL = r'https?://www\.nhk\.or\.jp/radio/(?:player/ondemand|ondemand/detail)\.html\?p=(?P<site>[\da-zA-Z]+)_(?P<corner>[\da-zA-Z]+)(?:_(?P<headline>[\da-zA-Z]+))?'
_TESTS = [{
'url': 'https://www.nhk.or.jp/radio/player/ondemand.html?p=0449_01_3853544',
'skip': 'Episode expired on 2023-04-16',
'url': 'https://www.nhk.or.jp/radio/player/ondemand.html?p=0449_01_3926210',
'skip': 'Episode expired on 2024-02-24',
'info_dict': {
'channel': 'NHK-FM',
'uploader': 'NHK-FM',
'description': 'md5:94b08bdeadde81a97df4ec882acce3e9',
'title': 'ジャズ・トゥナイト シリーズJAZZジャイアンツ 56 ジョニー・ホッジス',
'id': '0449_01_3926210',
'ext': 'm4a',
'id': '0449_01_3853544',
'series': 'ジャズ・トゥナイト',
'uploader': 'NHK-FM',
'channel': 'NHK-FM',
'thumbnail': 'https://www.nhk.or.jp/prog/img/449/g449.jpg',
'timestamp': 1680969600,
'title': 'ジャズ・トゥナイト NEWジャズ特集',
'upload_date': '20230408',
'release_timestamp': 1680962400,
'release_date': '20230408',
'was_live': True,
'release_date': '20240217',
'description': 'md5:a456ee8e5e59e6dd2a7d32e62386e811',
'timestamp': 1708185600,
'release_timestamp': 1708178400,
'upload_date': '20240217',
},
}, {
# playlist, airs every weekday so it should _hopefully_ be okay forever
@ -519,7 +519,8 @@ class NhkRadiruIE(InfoExtractor):
'series': 'らじる文庫 by ラジオ深夜便 ',
'release_timestamp': 1481126700,
'upload_date': '20211101',
}
},
'expected_warnings': ['Unable to download JSON metadata', 'Failed to get extended description'],
}, {
# news
'url': 'https://www.nhk.or.jp/radio/player/ondemand.html?p=F261_01_3855109',
@ -539,9 +540,28 @@ class NhkRadiruIE(InfoExtractor):
},
}]
_API_URL_TMPL = None
def _extract_extended_description(self, episode_id, episode):
service, _, area = traverse_obj(episode, ('aa_vinfo2', {str}, {lambda x: (x or '').partition(',')}))
aa_vinfo3 = traverse_obj(episode, ('aa_vinfo3', {str}))
detail_url = try_call(
lambda: self._API_URL_TMPL.format(service=service, area=area, dateid=aa_vinfo3))
if not detail_url:
return
full_meta = traverse_obj(
self._download_json(detail_url, episode_id, 'Downloading extended metadata', fatal=False),
('list', service, 0, {dict})) or {}
return join_nonempty('subtitle', 'content', 'act', 'music', delim='\n\n', from_dict=full_meta)
def _extract_episode_info(self, headline, programme_id, series_meta):
episode_id = f'{programme_id}_{headline["headline_id"]}'
episode = traverse_obj(headline, ('file_list', 0, {dict}))
description = self._extract_extended_description(episode_id, episode)
if not description:
self.report_warning('Failed to get extended description, falling back to summary')
description = traverse_obj(episode, ('file_title_sub', {str}))
return {
**series_meta,
@ -551,14 +571,21 @@ def _extract_episode_info(self, headline, programme_id, series_meta):
'was_live': True,
'series': series_meta.get('title'),
'thumbnail': url_or_none(headline.get('headline_image')) or series_meta.get('thumbnail'),
'description': description,
**traverse_obj(episode, {
'title': 'file_title',
'description': 'file_title_sub',
'timestamp': ('open_time', {unified_timestamp}),
'release_timestamp': ('aa_vinfo4', {lambda x: x.split('_')[0]}, {unified_timestamp}),
}),
}
def _real_initialize(self):
if self._API_URL_TMPL:
return
api_config = self._download_xml(
'https://www.nhk.or.jp/radio/config/config_web.xml', None, 'Downloading API config', fatal=False)
NhkRadiruIE._API_URL_TMPL = try_call(lambda: f'https:{api_config.find(".//url_program_detail").text}')
def _real_extract(self, url):
site_id, corner_id, headline_id = self._match_valid_url(url).group('site', 'corner', 'headline')
programme_id = f'{site_id}_{corner_id}'

View file

@ -172,9 +172,6 @@ class NiconicoIE(InfoExtractor):
_VALID_URL = r'https?://(?:(?:www\.|secure\.|sp\.)?nicovideo\.jp/watch|nico\.ms)/(?P<id>(?:[a-z]{2})?[0-9]+)'
_NETRC_MACHINE = 'niconico'
_COMMENT_API_ENDPOINTS = (
'https://nvcomment.nicovideo.jp/legacy/api.json',
'https://nmsg.nicovideo.jp/api.json',)
_API_HEADERS = {
'X-Frontend-ID': '6',
'X-Frontend-Version': '0',
@ -470,93 +467,16 @@ def get_video_info(*items, get_first=True, **kwargs):
parse_duration(self._html_search_meta('video:duration', webpage, 'video duration', default=None))
or get_video_info('duration')),
'webpage_url': url_or_none(url) or f'https://www.nicovideo.jp/watch/{video_id}',
'subtitles': self.extract_subtitles(video_id, api_data, session_api_data),
'subtitles': self.extract_subtitles(video_id, api_data),
}
def _get_subtitles(self, video_id, api_data, session_api_data):
comment_user_key = traverse_obj(api_data, ('comment', 'keys', 'userKey'))
user_id_str = session_api_data.get('serviceUserId')
thread_ids = traverse_obj(api_data, ('comment', 'threads', lambda _, v: v['isActive']))
legacy_danmaku = self._extract_legacy_comments(video_id, thread_ids, user_id_str, comment_user_key) or []
new_comments = traverse_obj(api_data, ('comment', 'nvComment'))
new_danmaku = self._extract_new_comments(
new_comments.get('server'), video_id,
new_comments.get('params'), new_comments.get('threadKey'))
if not legacy_danmaku and not new_danmaku:
self.report_warning(f'Failed to get comments. {bug_reports_message()}')
return
return {
'comments': [{
'ext': 'json',
'data': json.dumps(legacy_danmaku + new_danmaku),
}],
}
def _extract_legacy_comments(self, video_id, threads, user_id, user_key):
auth_data = {
'user_id': user_id,
'userkey': user_key,
} if user_id and user_key else {'user_id': ''}
api_url = traverse_obj(threads, (..., 'server'), get_all=False)
# Request Start
post_data = [{'ping': {'content': 'rs:0'}}]
for i, thread in enumerate(threads):
thread_id = thread['id']
thread_fork = thread['fork']
# Post Start (2N)
post_data.append({'ping': {'content': f'ps:{i * 2}'}})
post_data.append({'thread': {
'fork': thread_fork,
'language': 0,
'nicoru': 3,
'scores': 1,
'thread': thread_id,
'version': '20090904',
'with_global': 1,
**auth_data,
}})
# Post Final (2N)
post_data.append({'ping': {'content': f'pf:{i * 2}'}})
# Post Start (2N+1)
post_data.append({'ping': {'content': f'ps:{i * 2 + 1}'}})
post_data.append({'thread_leaves': {
# format is '<bottom of minute range>-<top of minute range>:<comments per minute>,<total last comments'
# unfortunately NND limits (deletes?) comment returns this way, so you're only able to grab the last 1000 per language
'content': '0-999999:999999,999999,nicoru:999999',
'fork': thread_fork,
'language': 0,
'nicoru': 3,
'scores': 1,
'thread': thread_id,
**auth_data,
}})
# Post Final (2N+1)
post_data.append({'ping': {'content': f'pf:{i * 2 + 1}'}})
# Request Final
post_data.append({'ping': {'content': 'rf:0'}})
return self._download_json(
f'{api_url}/api.json', video_id, data=json.dumps(post_data).encode(), fatal=False,
headers={
'Referer': f'https://www.nicovideo.jp/watch/{video_id}',
'Origin': 'https://www.nicovideo.jp',
'Content-Type': 'text/plain;charset=UTF-8',
},
note='Downloading comments', errnote=f'Failed to access endpoint {api_url}')
def _extract_new_comments(self, endpoint, video_id, params, thread_key):
comments = self._download_json(
f'{endpoint}/v1/threads', video_id, data=json.dumps({
def _get_subtitles(self, video_id, api_data):
comments_info = traverse_obj(api_data, ('comment', 'nvComment', {dict})) or {}
danmaku = traverse_obj(self._download_json(
f'{comments_info.get("server")}/v1/threads', video_id, data=json.dumps({
'additionals': {},
'params': params,
'threadKey': thread_key,
'params': comments_info.get('params'),
'threadKey': comments_info.get('threadKey'),
}).encode(), fatal=False,
headers={
'Referer': 'https://www.nicovideo.jp/',
@ -566,8 +486,19 @@ def _extract_new_comments(self, endpoint, video_id, params, thread_key):
'x-frontend-id': '6',
'x-frontend-version': '0',
},
note='Downloading comments (new)', errnote='Failed to download comments (new)')
return traverse_obj(comments, ('data', 'threads', ..., 'comments', ...))
note='Downloading comments', errnote='Failed to download comments'),
('data', 'threads', ..., 'comments', ...))
if not danmaku:
self.report_warning(f'Failed to get comments. {bug_reports_message()}')
return
return {
'comments': [{
'ext': 'json',
'data': json.dumps(danmaku),
}],
}
class NiconicoPlaylistBaseIE(InfoExtractor):

View file

@ -0,0 +1,225 @@
from .common import InfoExtractor
from ..utils import int_or_none, mimetype2ext, parse_iso8601, url_or_none
from ..utils.traversal import traverse_obj
class NinaProtocolIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?ninaprotocol\.com/releases/(?P<id>[^/#?]+)'
_TESTS = [{
'url': 'https://www.ninaprotocol.com/releases/3SvsMM3y4oTPZ5DXFJnLkCAqkxz34hjzFxqms1vu9XBJ',
'info_dict': {
'id': '3SvsMM3y4oTPZ5DXFJnLkCAqkxz34hjzFxqms1vu9XBJ',
'title': 'The Spatulas - March Chant',
'tags': ['punk', 'postpresentmedium', 'cambridge'],
'uploader_id': '2bGjgdKUddJoj2shYGqfNcUfoSoABP21RJoiwGMZDq3A',
'channel': 'ppm',
'description': 'md5:bb9f9d39d8f786449cd5d0ff7c5772db',
'album': 'The Spatulas - March Chant',
'thumbnail': 'https://www.arweave.net/VyZA6CBeUuqP174khvSrD44Eosi3MLVyWN42uaQKg50',
'timestamp': 1701417610,
'uploader': 'ppmrecs',
'channel_id': '4ceG4zsb7VVxBTGPtZMqDZWGHo3VUg2xRvzC2b17ymWP',
'display_id': 'the-spatulas-march-chant',
'upload_date': '20231201',
'album_artist': 'Post Present Medium ',
},
'playlist': [{
'info_dict': {
'id': '3SvsMM3y4oTPZ5DXFJnLkCAqkxz34hjzFxqms1vu9XBJ_1',
'title': 'March Chant In April',
'track': 'March Chant In April',
'ext': 'mp3',
'duration': 152,
'track_number': 1,
'uploader_id': '2bGjgdKUddJoj2shYGqfNcUfoSoABP21RJoiwGMZDq3A',
'uploader': 'ppmrecs',
'thumbnail': 'https://www.arweave.net/VyZA6CBeUuqP174khvSrD44Eosi3MLVyWN42uaQKg50',
'timestamp': 1701417610,
'channel': 'ppm',
'album': 'The Spatulas - March Chant',
'tags': ['punk', 'postpresentmedium', 'cambridge'],
'channel_id': '4ceG4zsb7VVxBTGPtZMqDZWGHo3VUg2xRvzC2b17ymWP',
'upload_date': '20231201',
'album_artist': 'Post Present Medium ',
}
}, {
'info_dict': {
'id': '3SvsMM3y4oTPZ5DXFJnLkCAqkxz34hjzFxqms1vu9XBJ_2',
'title': 'Rescue Mission',
'track': 'Rescue Mission',
'ext': 'mp3',
'duration': 212,
'track_number': 2,
'album_artist': 'Post Present Medium ',
'uploader': 'ppmrecs',
'tags': ['punk', 'postpresentmedium', 'cambridge'],
'thumbnail': 'https://www.arweave.net/VyZA6CBeUuqP174khvSrD44Eosi3MLVyWN42uaQKg50',
'channel': 'ppm',
'upload_date': '20231201',
'channel_id': '4ceG4zsb7VVxBTGPtZMqDZWGHo3VUg2xRvzC2b17ymWP',
'timestamp': 1701417610,
'album': 'The Spatulas - March Chant',
'uploader_id': '2bGjgdKUddJoj2shYGqfNcUfoSoABP21RJoiwGMZDq3A',
}
}, {
'info_dict': {
'id': '3SvsMM3y4oTPZ5DXFJnLkCAqkxz34hjzFxqms1vu9XBJ_3',
'title': 'Slinger Style',
'track': 'Slinger Style',
'ext': 'mp3',
'duration': 179,
'track_number': 3,
'timestamp': 1701417610,
'upload_date': '20231201',
'channel_id': '4ceG4zsb7VVxBTGPtZMqDZWGHo3VUg2xRvzC2b17ymWP',
'uploader_id': '2bGjgdKUddJoj2shYGqfNcUfoSoABP21RJoiwGMZDq3A',
'thumbnail': 'https://www.arweave.net/VyZA6CBeUuqP174khvSrD44Eosi3MLVyWN42uaQKg50',
'album_artist': 'Post Present Medium ',
'album': 'The Spatulas - March Chant',
'tags': ['punk', 'postpresentmedium', 'cambridge'],
'uploader': 'ppmrecs',
'channel': 'ppm',
}
}, {
'info_dict': {
'id': '3SvsMM3y4oTPZ5DXFJnLkCAqkxz34hjzFxqms1vu9XBJ_4',
'title': 'Psychic Signal',
'track': 'Psychic Signal',
'ext': 'mp3',
'duration': 220,
'track_number': 4,
'tags': ['punk', 'postpresentmedium', 'cambridge'],
'upload_date': '20231201',
'album': 'The Spatulas - March Chant',
'thumbnail': 'https://www.arweave.net/VyZA6CBeUuqP174khvSrD44Eosi3MLVyWN42uaQKg50',
'timestamp': 1701417610,
'album_artist': 'Post Present Medium ',
'channel_id': '4ceG4zsb7VVxBTGPtZMqDZWGHo3VUg2xRvzC2b17ymWP',
'channel': 'ppm',
'uploader_id': '2bGjgdKUddJoj2shYGqfNcUfoSoABP21RJoiwGMZDq3A',
'uploader': 'ppmrecs',
}
}, {
'info_dict': {
'id': '3SvsMM3y4oTPZ5DXFJnLkCAqkxz34hjzFxqms1vu9XBJ_5',
'title': 'Curvy Color',
'track': 'Curvy Color',
'ext': 'mp3',
'duration': 148,
'track_number': 5,
'timestamp': 1701417610,
'uploader_id': '2bGjgdKUddJoj2shYGqfNcUfoSoABP21RJoiwGMZDq3A',
'thumbnail': 'https://www.arweave.net/VyZA6CBeUuqP174khvSrD44Eosi3MLVyWN42uaQKg50',
'album': 'The Spatulas - March Chant',
'album_artist': 'Post Present Medium ',
'channel': 'ppm',
'tags': ['punk', 'postpresentmedium', 'cambridge'],
'uploader': 'ppmrecs',
'channel_id': '4ceG4zsb7VVxBTGPtZMqDZWGHo3VUg2xRvzC2b17ymWP',
'upload_date': '20231201',
}
}, {
'info_dict': {
'id': '3SvsMM3y4oTPZ5DXFJnLkCAqkxz34hjzFxqms1vu9XBJ_6',
'title': 'Caveman Star',
'track': 'Caveman Star',
'ext': 'mp3',
'duration': 121,
'track_number': 6,
'channel_id': '4ceG4zsb7VVxBTGPtZMqDZWGHo3VUg2xRvzC2b17ymWP',
'thumbnail': 'https://www.arweave.net/VyZA6CBeUuqP174khvSrD44Eosi3MLVyWN42uaQKg50',
'tags': ['punk', 'postpresentmedium', 'cambridge'],
'album_artist': 'Post Present Medium ',
'uploader': 'ppmrecs',
'timestamp': 1701417610,
'uploader_id': '2bGjgdKUddJoj2shYGqfNcUfoSoABP21RJoiwGMZDq3A',
'album': 'The Spatulas - March Chant',
'channel': 'ppm',
'upload_date': '20231201',
},
}],
}, {
'url': 'https://www.ninaprotocol.com/releases/f-g-s-american-shield',
'info_dict': {
'id': '76PZnJwaMgViQHYfA4NYJXds7CmW6vHQKAtQUxGene6J',
'description': 'md5:63f08d5db558b4b36e1896f317062721',
'title': 'F.G.S. - American Shield',
'uploader_id': 'Ej3rozs11wYqFk1Gs6oggGCkGLz8GzBhmJfnUxf6gPci',
'channel_id': '6JuksCZPXuP16wJ1BUfwuukJzh42C7guhLrFPPkVJfyE',
'channel': 'tinkscough',
'tags': [],
'album_artist': 'F.G.S.',
'album': 'F.G.S. - American Shield',
'thumbnail': 'https://www.arweave.net/YJpgImkXLT9SbpFb576KuZ5pm6bdvs452LMs3Rx6lm8',
'display_id': 'f-g-s-american-shield',
'uploader': 'flannerysilva',
'timestamp': 1702395858,
'upload_date': '20231212',
},
'playlist_count': 1,
}, {
'url': 'https://www.ninaprotocol.com/releases/time-to-figure-things-out',
'info_dict': {
'id': '6Zi1nC5hj6b13NkpxVYwRhFy6mYA7oLBbe9DMrgGDcYh',
'display_id': 'time-to-figure-things-out',
'description': 'md5:960202ed01c3134bb8958f1008527e35',
'timestamp': 1706283607,
'title': 'DJ STEPDAD - time to figure things out',
'album_artist': 'DJ STEPDAD',
'uploader': 'tddvsss',
'upload_date': '20240126',
'album': 'time to figure things out',
'uploader_id': 'AXQNRgTyYsySyAMFDwxzumuGjfmoXshorCesjpquwCBi',
'thumbnail': 'https://www.arweave.net/O4i8bcKVqJVZvNeHHFp6r8knpFGh9ZwEgbeYacr4nss',
'tags': [],
},
'playlist_count': 4,
}]
def _real_extract(self, url):
video_id = self._match_id(url)
release = self._download_json(
f'https://api.ninaprotocol.com/v1/releases/{video_id}', video_id)['release']
video_id = release.get('publicKey') or video_id
common_info = traverse_obj(release, {
'album': ('metadata', 'properties', 'title', {str}),
'album_artist': ((('hub', 'data'), 'publisherAccount'), 'displayName', {str}),
'timestamp': ('datetime', {parse_iso8601}),
'thumbnail': ('metadata', 'image', {url_or_none}),
'uploader': ('publisherAccount', 'handle', {str}),
'uploader_id': ('publisherAccount', 'publicKey', {str}),
'channel': ('hub', 'handle', {str}),
'channel_id': ('hub', 'publicKey', {str}),
}, get_all=False)
common_info['tags'] = traverse_obj(release, ('metadata', 'properties', 'tags', ..., {str}))
entries = []
for track_num, track in enumerate(traverse_obj(release, (
'metadata', 'properties', 'files', lambda _, v: url_or_none(v['uri']))), 1):
entries.append({
'id': f'{video_id}_{track_num}',
'url': track['uri'],
**traverse_obj(track, {
'title': ('track_title', {str}),
'track': ('track_title', {str}),
'ext': ('type', {mimetype2ext}),
'track_number': ('track', {int_or_none}),
'duration': ('duration', {int_or_none}),
}),
'vcodec': 'none',
**common_info,
})
return {
'_type': 'playlist',
'id': video_id,
'entries': entries,
**traverse_obj(release, {
'display_id': ('slug', {str}),
'title': ('metadata', 'name', {str}),
'description': ('metadata', 'description', {str}),
}),
**common_info,
}

View file

@ -135,14 +135,15 @@ class NovaIE(InfoExtractor):
_VALID_URL = r'https?://(?:[^.]+\.)?(?P<site>tv(?:noviny)?|tn|novaplus|vymena|fanda|krasna|doma|prask)\.nova\.cz/(?:[^/]+/)+(?P<id>[^/]+?)(?:\.html|/|$)'
_TESTS = [{
'url': 'http://tn.nova.cz/clanek/tajemstvi-ukryte-v-podzemi-specialni-nemocnice-v-prazske-krci.html#player_13260',
'md5': '249baab7d0104e186e78b0899c7d5f28',
'md5': 'da8f3f1fcdaf9fb0f112a32a165760a3',
'info_dict': {
'id': '1757139',
'display_id': 'tajemstvi-ukryte-v-podzemi-specialni-nemocnice-v-prazske-krci',
'id': '8OvQqEvV3MW',
'display_id': '8OvQqEvV3MW',
'ext': 'mp4',
'title': 'Podzemní nemocnice v pražské Krči',
'description': 'md5:f0a42dd239c26f61c28f19e62d20ef53',
'thumbnail': r're:^https?://.*\.(?:jpg)',
'duration': 151,
}
}, {
'url': 'http://fanda.nova.cz/clanek/fun-and-games/krvavy-epos-zaklinac-3-divoky-hon-vychazi-vyhrajte-ho-pro-sebe.html',
@ -210,7 +211,7 @@ def _real_extract(self, url):
# novaplus
embed_id = self._search_regex(
r'<iframe[^>]+\bsrc=["\'](?:https?:)?//media\.cms\.nova\.cz/embed/([^/?#&]+)',
r'<iframe[^>]+\bsrc=["\'](?:https?:)?//media(?:tn)?\.cms\.nova\.cz/embed/([^/?#&"\']+)',
webpage, 'embed url', default=None)
if embed_id:
return {

View file

@ -35,6 +35,7 @@ class NTVRuIE(InfoExtractor):
'duration': 172,
'view_count': int,
},
'skip': '404 Not Found',
}, {
'url': 'http://www.ntv.ru/peredacha/segodnya/m23700/o232416',
'md5': '82dbd49b38e3af1d00df16acbeab260c',
@ -78,7 +79,8 @@ class NTVRuIE(InfoExtractor):
}]
_VIDEO_ID_REGEXES = [
r'<meta property="og:url" content="http://www\.ntv\.ru/video/(\d+)',
r'<meta property="og:url" content="https?://www\.ntv\.ru/video/(\d+)',
r'<meta property="og:video:(?:url|iframe)" content="https?://www\.ntv\.ru/embed/(\d+)',
r'<video embed=[^>]+><id>(\d+)</id>',
r'<video restriction[^>]+><key>(\d+)</key>',
]

199
yt_dlp/extractor/nuum.py Normal file
View file

@ -0,0 +1,199 @@
import functools
from .common import InfoExtractor
from ..utils import (
ExtractorError,
OnDemandPagedList,
UserNotLive,
filter_dict,
int_or_none,
parse_iso8601,
str_or_none,
url_or_none,
)
from ..utils.traversal import traverse_obj
class NuumBaseIE(InfoExtractor):
def _call_api(self, path, video_id, description, query={}):
response = self._download_json(
f'https://nuum.ru/api/v2/{path}', video_id, query=query,
note=f'Downloading {description} metadata',
errnote=f'Unable to download {description} metadata')
if error := response.get('error'):
raise ExtractorError(f'API returned error: {error!r}')
return response['result']
def _get_channel_info(self, channel_name):
return self._call_api(
'broadcasts/public', video_id=channel_name, description='channel',
query={
'with_extra': 'true',
'channel_name': channel_name,
'with_deleted': 'true',
})
def _parse_video_data(self, container, extract_formats=True):
stream = traverse_obj(container, ('media_container_streams', 0, {dict})) or {}
media = traverse_obj(stream, ('stream_media', 0, {dict})) or {}
media_url = traverse_obj(media, (
'media_meta', ('media_archive_url', 'media_url'), {url_or_none}), get_all=False)
video_id = str(container['media_container_id'])
is_live = media.get('media_status') == 'RUNNING'
formats, subtitles = None, None
if extract_formats:
formats, subtitles = self._extract_m3u8_formats_and_subtitles(
media_url, video_id, 'mp4', live=is_live)
return filter_dict({
'id': video_id,
'is_live': is_live,
'formats': formats,
'subtitles': subtitles,
**traverse_obj(container, {
'title': ('media_container_name', {str}),
'description': ('media_container_description', {str}),
'timestamp': ('created_at', {parse_iso8601}),
'channel': ('media_container_channel', 'channel_name', {str}),
'channel_id': ('media_container_channel', 'channel_id', {str_or_none}),
}),
**traverse_obj(stream, {
'view_count': ('stream_total_viewers', {int_or_none}),
'concurrent_view_count': ('stream_current_viewers', {int_or_none}),
}),
**traverse_obj(media, {
'duration': ('media_duration', {int_or_none}),
'thumbnail': ('media_meta', ('media_preview_archive_url', 'media_preview_url'), {url_or_none}),
}, get_all=False),
})
class NuumMediaIE(NuumBaseIE):
IE_NAME = 'nuum:media'
_VALID_URL = r'https?://nuum\.ru/(?:streams|videos|clips)/(?P<id>[\d]+)'
_TESTS = [{
'url': 'https://nuum.ru/streams/1592713-7-days-to-die',
'only_matching': True,
}, {
'url': 'https://nuum.ru/videos/1567547-toxi-hurtz',
'md5': 'f1d9118a30403e32b702a204eb03aca3',
'info_dict': {
'id': '1567547',
'ext': 'mp4',
'title': 'Toxi$ - Hurtz',
'description': '',
'timestamp': 1702631651,
'upload_date': '20231215',
'thumbnail': r're:^https?://.+\.jpg',
'view_count': int,
'concurrent_view_count': int,
'channel_id': '6911',
'channel': 'toxis',
'duration': 116,
},
}, {
'url': 'https://nuum.ru/clips/1552564-pro-misu',
'md5': 'b248ae1565b1e55433188f11beeb0ca1',
'info_dict': {
'id': '1552564',
'ext': 'mp4',
'title': 'Про Мису 🙃',
'timestamp': 1701971828,
'upload_date': '20231207',
'thumbnail': r're:^https?://.+\.jpg',
'view_count': int,
'concurrent_view_count': int,
'channel_id': '3320',
'channel': 'Misalelik',
'duration': 41,
},
}]
def _real_extract(self, url):
video_id = self._match_id(url)
video_data = self._call_api(f'media-containers/{video_id}', video_id, 'media')
return self._parse_video_data(video_data)
class NuumLiveIE(NuumBaseIE):
IE_NAME = 'nuum:live'
_VALID_URL = r'https?://nuum\.ru/channel/(?P<id>[^/#?]+)/?(?:$|[#?])'
_TESTS = [{
'url': 'https://nuum.ru/channel/mts_live',
'only_matching': True,
}]
def _real_extract(self, url):
channel = self._match_id(url)
channel_info = self._get_channel_info(channel)
if traverse_obj(channel_info, ('channel', 'channel_is_live')) is False:
raise UserNotLive(video_id=channel)
info = self._parse_video_data(channel_info['media_container'])
return {
'webpage_url': f'https://nuum.ru/streams/{info["id"]}',
'extractor_key': NuumMediaIE.ie_key(),
'extractor': NuumMediaIE.IE_NAME,
**info,
}
class NuumTabIE(NuumBaseIE):
IE_NAME = 'nuum:tab'
_VALID_URL = r'https?://nuum\.ru/channel/(?P<id>[^/#?]+)/(?P<type>streams|videos|clips)'
_TESTS = [{
'url': 'https://nuum.ru/channel/dankon_/clips',
'info_dict': {
'id': 'dankon__clips',
'title': 'Dankon_',
},
'playlist_mincount': 29,
}, {
'url': 'https://nuum.ru/channel/dankon_/videos',
'info_dict': {
'id': 'dankon__videos',
'title': 'Dankon_',
},
'playlist_mincount': 2,
}, {
'url': 'https://nuum.ru/channel/dankon_/streams',
'info_dict': {
'id': 'dankon__streams',
'title': 'Dankon_',
},
'playlist_mincount': 1,
}]
_PAGE_SIZE = 50
def _fetch_page(self, channel_id, tab_type, tab_id, page):
CONTAINER_TYPES = {
'clips': ['SHORT_VIDEO', 'REVIEW_VIDEO'],
'videos': ['LONG_VIDEO'],
'streams': ['SINGLE'],
}
media_containers = self._call_api(
'media-containers', video_id=tab_id, description=f'{tab_type} tab page {page + 1}',
query={
'limit': self._PAGE_SIZE,
'offset': page * self._PAGE_SIZE,
'channel_id': channel_id,
'media_container_status': 'STOPPED',
'media_container_type': CONTAINER_TYPES[tab_type],
})
for container in traverse_obj(media_containers, (..., {dict})):
metadata = self._parse_video_data(container, extract_formats=False)
yield self.url_result(f'https://nuum.ru/videos/{metadata["id"]}', NuumMediaIE, **metadata)
def _real_extract(self, url):
channel_name, tab_type = self._match_valid_url(url).group('id', 'type')
tab_id = f'{channel_name}_{tab_type}'
channel_data = self._get_channel_info(channel_name)['channel']
return self.playlist_result(OnDemandPagedList(functools.partial(
self._fetch_page, channel_data['channel_id'], tab_type, tab_id), self._PAGE_SIZE),
playlist_id=tab_id, playlist_title=channel_data.get('channel_name'))

View file

@ -1,50 +1,93 @@
import hmac
import hashlib
import base64
import json
import uuid
from .common import InfoExtractor
from ..utils import (
ExtractorError,
clean_html,
determine_ext,
extract_attributes,
float_or_none,
get_elements_html_by_class,
int_or_none,
js_to_json,
merge_dicts,
mimetype2ext,
parse_iso8601,
remove_end,
remove_start,
str_or_none,
traverse_obj,
url_or_none,
)
class NYTimesBaseIE(InfoExtractor):
_SECRET = b'pX(2MbU2);4N{7J8)>YwKRJ+/pQ3JkiU2Q^V>mFYv6g6gYvt6v'
_DNS_NAMESPACE = uuid.UUID('36dd619a-56dc-595b-9e09-37f4152c7b5d')
_TOKEN = 'MIIBIjANBgkqhkiG9w0BAQEFAAOCAQ8AMIIBCgKCAQEAuNIzKBOFB77aT/jN/FQ+/QVKWq5V1ka1AYmCR9hstz1pGNPH5ajOU9gAqta0T89iPnhjwla+3oec/Z3kGjxbpv6miQXufHFq3u2RC6HyU458cLat5kVPSOQCe3VVB5NRpOlRuwKHqn0txfxnwSSj8mqzstR997d3gKB//RO9zE16y3PoWlDQXkASngNJEWvL19iob/xwAkfEWCjyRILWFY0JYX3AvLMSbq7wsqOCE5srJpo7rRU32zsByhsp1D5W9OYqqwDmflsgCEQy2vqTsJjrJohuNg+urMXNNZ7Y3naMoqttsGDrWVxtPBafKMI8pM2ReNZBbGQsQXRzQNo7+QIDAQAB'
_GRAPHQL_API = 'https://samizdat-graphql.nytimes.com/graphql/v2'
_GRAPHQL_QUERY = '''query VideoQuery($id: String!) {
video(id: $id) {
... on Video {
bylines {
renderedRepresentation
}
duration
firstPublished
promotionalHeadline
promotionalMedia {
... on Image {
crops {
name
renditions {
name
width
height
url
}
}
}
}
renditions {
type
width
height
url
bitrate
}
summary
}
}
}'''
def _extract_video_from_id(self, video_id):
# Authorization generation algorithm is reverse engineered from `signer` in
# http://graphics8.nytimes.com/video/vhs/vhs-2.x.min.js
path = '/svc/video/api/v3/video/' + video_id
hm = hmac.new(self._SECRET, (path + ':vhs').encode(), hashlib.sha512).hexdigest()
video_data = self._download_json('http://www.nytimes.com' + path, video_id, 'Downloading video JSON', headers={
'Authorization': 'NYTV ' + base64.b64encode(hm.encode()).decode(),
'X-NYTV': 'vhs',
}, fatal=False)
if not video_data:
video_data = self._download_json(
'http://www.nytimes.com/svc/video/api/v2/video/' + video_id,
video_id, 'Downloading video JSON')
def _call_api(self, media_id):
# reference: `id-to-uri.js`
video_uuid = uuid.uuid5(self._DNS_NAMESPACE, 'video')
media_uuid = uuid.uuid5(video_uuid, media_id)
title = video_data['headline']
return traverse_obj(self._download_json(
self._GRAPHQL_API, media_id, 'Downloading JSON from GraphQL API', data=json.dumps({
'query': self._GRAPHQL_QUERY,
'variables': {'id': f'nyt://video/{media_uuid}'},
}, separators=(',', ':')).encode(), headers={
'Content-Type': 'application/json',
'Nyt-App-Type': 'vhs',
'Nyt-App-Version': 'v3.52.21',
'Nyt-Token': self._TOKEN,
'Origin': 'https://nytimes.com',
}, fatal=False), ('data', 'video', {dict})) or {}
def get_file_size(file_size):
if isinstance(file_size, int):
return file_size
elif isinstance(file_size, dict):
return int(file_size.get('value', 0))
else:
return None
def _extract_thumbnails(self, thumbs):
return traverse_obj(thumbs, (lambda _, v: url_or_none(v['url']), {
'url': 'url',
'width': ('width', {int_or_none}),
'height': ('height', {int_or_none}),
}), default=None)
def _extract_formats_and_subtitles(self, video_id, content_media_json):
urls = []
formats = []
subtitles = {}
for video in video_data.get('renditions', []):
for video in traverse_obj(content_media_json, ('renditions', ..., {dict})):
video_url = video.get('url')
format_id = video.get('type')
if not video_url or format_id == 'thumbs' or video_url in urls:
@ -56,11 +99,9 @@ def get_file_size(file_size):
video_url, video_id, 'mp4', 'm3u8_native',
m3u8_id=format_id or 'hls', fatal=False)
formats.extend(m3u8_fmts)
subtitles = self._merge_subtitles(subtitles, m3u8_subs)
self._merge_subtitles(m3u8_subs, target=subtitles)
elif ext == 'mpd':
continue
# formats.extend(self._extract_mpd_formats(
# video_url, video_id, format_id or 'dash', fatal=False))
continue # all mpd urls give 404 errors
else:
formats.append({
'url': video_url,
@ -68,55 +109,50 @@ def get_file_size(file_size):
'vcodec': video.get('videoencoding') or video.get('video_codec'),
'width': int_or_none(video.get('width')),
'height': int_or_none(video.get('height')),
'filesize': get_file_size(video.get('file_size') or video.get('fileSize')),
'filesize': traverse_obj(video, (
('file_size', 'fileSize'), (None, ('value')), {int_or_none}), get_all=False),
'tbr': int_or_none(video.get('bitrate'), 1000) or None,
'ext': ext,
})
thumbnails = []
for image in video_data.get('images', []):
image_url = image.get('url')
if not image_url:
continue
thumbnails.append({
'url': 'http://www.nytimes.com/' + image_url,
'width': int_or_none(image.get('width')),
'height': int_or_none(image.get('height')),
})
return formats, subtitles
publication_date = video_data.get('publication_date')
timestamp = parse_iso8601(publication_date[:-8]) if publication_date else None
def _extract_video(self, media_id):
data = self._call_api(media_id)
formats, subtitles = self._extract_formats_and_subtitles(media_id, data)
return {
'id': video_id,
'title': title,
'description': video_data.get('summary'),
'timestamp': timestamp,
'uploader': video_data.get('byline'),
'duration': float_or_none(video_data.get('duration'), 1000),
'id': media_id,
'title': data.get('promotionalHeadline'),
'description': data.get('summary'),
'timestamp': parse_iso8601(data.get('firstPublished')),
'duration': float_or_none(data.get('duration'), scale=1000),
'creator': ', '.join(traverse_obj(data, ( # TODO: change to 'creators'
'bylines', ..., 'renderedRepresentation', {lambda x: remove_start(x, 'By ')}))),
'formats': formats,
'subtitles': subtitles,
'thumbnails': thumbnails,
'thumbnails': self._extract_thumbnails(
traverse_obj(data, ('promotionalMedia', 'crops', ..., 'renditions', ...))),
}
class NYTimesIE(NYTimesBaseIE):
_VALID_URL = r'https?://(?:(?:www\.)?nytimes\.com/video/(?:[^/]+/)+?|graphics8\.nytimes\.com/bcvideo/\d+(?:\.\d+)?/iframe/embed\.html\?videoId=)(?P<id>\d+)'
_EMBED_REGEX = [r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//graphics8\.nytimes\.com/bcvideo/[^/]+/iframe/embed\.html.+?)\1>']
_TESTS = [{
'url': 'http://www.nytimes.com/video/opinion/100000002847155/verbatim-what-is-a-photocopier.html?playlistId=100000001150263',
'md5': 'd665342765db043f7e225cff19df0f2d',
'md5': 'a553aa344014e3723d33893d89d4defc',
'info_dict': {
'id': '100000002847155',
'ext': 'mov',
'ext': 'mp4',
'title': 'Verbatim: What Is a Photocopier?',
'description': 'md5:93603dada88ddbda9395632fdc5da260',
'timestamp': 1398631707,
'upload_date': '20140427',
'uploader': 'Brett Weiner',
'timestamp': 1398646132,
'upload_date': '20140428',
'creator': 'Brett Weiner',
'thumbnail': r're:https?://\w+\.nyt.com/images/.+\.jpg',
'duration': 419,
}
},
}, {
'url': 'http://www.nytimes.com/video/travel/100000003550828/36-hours-in-dubai.html',
'only_matching': True,
@ -125,138 +161,260 @@ class NYTimesIE(NYTimesBaseIE):
def _real_extract(self, url):
video_id = self._match_id(url)
return self._extract_video_from_id(video_id)
return self._extract_video(video_id)
class NYTimesArticleIE(NYTimesBaseIE):
_VALID_URL = r'https?://(?:www\.)?nytimes\.com/(.(?<!video))*?/(?:[^/]+/)*(?P<id>[^.]+)(?:\.html)?'
_VALID_URL = r'https?://(?:www\.)?nytimes\.com/\d{4}/\d{2}/\d{2}/(?!books|podcasts)[^/?#]+/(?:\w+/)?(?P<id>[^./?#]+)(?:\.html)?'
_TESTS = [{
'url': 'http://www.nytimes.com/2015/04/14/business/owner-of-gravity-payments-a-credit-card-processor-is-setting-a-new-minimum-wage-70000-a-year.html?_r=0',
'md5': 'e2076d58b4da18e6a001d53fd56db3c9',
'md5': '3eb5ddb1d6f86254fe4f233826778737',
'info_dict': {
'id': '100000003628438',
'ext': 'mov',
'title': 'New Minimum Wage: $70,000 a Year',
'description': 'Dan Price, C.E.O. of Gravity Payments, surprised his 120-person staff by announcing that he planned over the next three years to raise the salary of every employee to $70,000 a year.',
'timestamp': 1429033037,
'ext': 'mp4',
'title': 'One Companys New Minimum Wage: $70,000 a Year',
'description': 'md5:89ba9ab67ca767bb92bf823d1f138433',
'timestamp': 1429047468,
'upload_date': '20150414',
'uploader': 'Matthew Williams',
}
'creator': 'Patricia Cohen',
'thumbnail': r're:https?://\w+\.nyt.com/images/.*\.jpg',
'duration': 119.0,
},
}, {
'url': 'http://www.nytimes.com/2016/10/14/podcasts/revelations-from-the-final-weeks.html',
'md5': 'e0d52040cafb07662acf3c9132db3575',
# article with audio and no video
'url': 'https://www.nytimes.com/2023/09/29/health/mosquitoes-genetic-engineering.html',
'md5': '2365b3555c8aa7f4dd34ca735ad02e6a',
'info_dict': {
'id': '100000004709062',
'title': 'The Run-Up: He Was Like an Octopus',
'id': '100000009110381',
'ext': 'mp3',
'description': 'md5:fb5c6b93b12efc51649b4847fe066ee4',
'series': 'The Run-Up',
'episode': 'He Was Like an Octopus',
'episode_number': 20,
'duration': 2130,
}
'title': 'The Gamble: Can Genetically Modified Mosquitoes End Disease?',
'description': 'md5:9ff8b47acbaf7f3ca8c732f5c815be2e',
'timestamp': 1695960700,
'upload_date': '20230929',
'creator': 'Stephanie Nolen, Natalija Gormalova',
'thumbnail': r're:https?://\w+\.nyt.com/images/.*\.jpg',
'duration': 1322,
},
}, {
'url': 'http://www.nytimes.com/2016/10/16/books/review/inside-the-new-york-times-book-review-the-rise-of-hitler.html',
'url': 'https://www.nytimes.com/2023/11/29/business/dealbook/kamala-harris-biden-voters.html',
'md5': '3eb5ddb1d6f86254fe4f233826778737',
'info_dict': {
'id': '100000004709479',
'title': 'The Rise of Hitler',
'ext': 'mp3',
'description': 'md5:bce877fd9e3444990cb141875fab0028',
'creator': 'Pamela Paul',
'duration': 3475,
'id': '100000009202270',
'ext': 'mp4',
'title': 'Kamala Harris Defends Biden Policies, but Says More Work Needed to Reach Voters',
'description': 'md5:de4212a7e19bb89e4fb14210ca915f1f',
'timestamp': 1701290997,
'upload_date': '20231129',
'uploader': 'By The New York Times',
'creator': 'Katie Rogers',
'thumbnail': r're:https?://\w+\.nyt.com/images/.*\.jpg',
'duration': 97.631,
},
'params': {
'skip_download': True,
'skip_download': 'm3u8',
},
}, {
'url': 'http://www.nytimes.com/news/minute/2014/03/17/times-minute-whats-next-in-crimea/?_php=true&_type=blogs&_php=true&_type=blogs&_r=1',
# multiple videos in the same article
'url': 'https://www.nytimes.com/2023/12/02/business/air-traffic-controllers-safety.html',
'info_dict': {
'id': 'air-traffic-controllers-safety',
'title': 'Drunk and Asleep on the Job: Air Traffic Controllers Pushed to the Brink',
'description': 'md5:549e5a5e935bf7d048be53ba3d2c863d',
'upload_date': '20231202',
'creator': 'Emily Steel, Sydney Ember',
'timestamp': 1701511264,
},
'playlist_count': 3,
}, {
'url': 'https://www.nytimes.com/2023/12/02/business/media/netflix-squid-game-challenge.html',
'only_matching': True,
}]
def _extract_podcast_from_json(self, json, page_id, webpage):
podcast_audio = self._parse_json(
json, page_id, transform_source=js_to_json)
def _extract_content_from_block(self, block):
details = traverse_obj(block, {
'id': ('sourceId', {str}),
'uploader': ('bylines', ..., 'renderedRepresentation', {str}),
'duration': (None, (('duration', {lambda x: float_or_none(x, scale=1000)}), ('length', {int_or_none}))),
'timestamp': ('firstPublished', {parse_iso8601}),
'series': ('podcastSeries', {str}),
}, get_all=False)
audio_data = podcast_audio['data']
track = audio_data['track']
episode_title = track['title']
video_url = track['source']
description = track.get('description') or self._html_search_meta(
['og:description', 'twitter:description'], webpage)
podcast_title = audio_data.get('podcast', {}).get('title')
title = ('%s: %s' % (podcast_title, episode_title)
if podcast_title else episode_title)
episode = audio_data.get('podcast', {}).get('episode') or ''
episode_number = int_or_none(self._search_regex(
r'[Ee]pisode\s+(\d+)', episode, 'episode number', default=None))
formats, subtitles = self._extract_formats_and_subtitles(details.get('id'), block)
# audio articles will have an url and no formats
url = traverse_obj(block, ('fileUrl', {url_or_none}))
if not formats and url:
formats.append({'url': url, 'vcodec': 'none'})
return {
'id': remove_start(podcast_audio.get('target'), 'FT') or page_id,
'url': video_url,
'title': title,
'description': description,
'creator': track.get('credit'),
'series': podcast_title,
'episode': episode_title,
'episode_number': episode_number,
'duration': int_or_none(track.get('duration')),
**details,
'thumbnails': self._extract_thumbnails(traverse_obj(
block, ('promotionalMedia', 'crops', ..., 'renditions', ...))),
'formats': formats,
'subtitles': subtitles
}
def _real_extract(self, url):
page_id = self._match_id(url)
webpage = self._download_webpage(url, page_id)
art_json = self._search_json(
r'window\.__preloadedData\s*=', webpage, 'media details', page_id,
transform_source=lambda x: x.replace('undefined', 'null'))['initialData']['data']['article']
video_id = self._search_regex(
r'data-videoid=["\'](\d+)', webpage, 'video id',
default=None, fatal=False)
if video_id is not None:
return self._extract_video_from_id(video_id)
blocks = traverse_obj(art_json, (
'sprinkledBody', 'content', ..., ('ledeMedia', None),
lambda _, v: v['__typename'] in ('Video', 'Audio')))
if not blocks:
raise ExtractorError('Unable to extract any media blocks from webpage')
podcast_data = self._search_regex(
(r'NYTD\.FlexTypes\.push\s*\(\s*({.+?})\s*\)\s*;\s*</script',
r'NYTD\.FlexTypes\.push\s*\(\s*({.+})\s*\)\s*;'),
webpage, 'podcast data')
return self._extract_podcast_from_json(podcast_data, page_id, webpage)
common_info = {
'title': remove_end(self._html_extract_title(webpage), ' - The New York Times'),
'description': traverse_obj(art_json, (
'sprinkledBody', 'content', ..., 'summary', 'content', ..., 'text', {str}),
get_all=False) or self._html_search_meta(['og:description', 'twitter:description'], webpage),
'timestamp': traverse_obj(art_json, ('firstPublished', {parse_iso8601})),
'creator': ', '.join(
traverse_obj(art_json, ('bylines', ..., 'creators', ..., 'displayName'))), # TODO: change to 'creators' (list)
'thumbnails': self._extract_thumbnails(traverse_obj(
art_json, ('promotionalMedia', 'assetCrops', ..., 'renditions', ...))),
}
entries = []
for block in blocks:
entries.append(merge_dicts(self._extract_content_from_block(block), common_info))
if len(entries) > 1:
return self.playlist_result(entries, page_id, **common_info)
return {
'id': page_id,
**entries[0],
}
class NYTimesCookingIE(NYTimesBaseIE):
_VALID_URL = r'https?://cooking\.nytimes\.com/(?:guid|recip)es/(?P<id>\d+)'
IE_NAME = 'NYTimesCookingGuide'
_VALID_URL = r'https?://cooking\.nytimes\.com/guides/(?P<id>[\w-]+)'
_TESTS = [{
'url': 'https://cooking.nytimes.com/recipes/1017817-cranberry-curd-tart',
'md5': 'dab81fa2eaeb3f9ed47498bdcfcdc1d3',
'url': 'https://cooking.nytimes.com/guides/13-how-to-cook-a-turkey',
'info_dict': {
'id': '100000004756089',
'ext': 'mov',
'timestamp': 1479383008,
'uploader': 'By SHAW LASH, ADAM SAEWITZ and JAMES HERRON',
'title': 'Cranberry Tart',
'upload_date': '20161117',
'description': 'If you are a fan of lemon curd or the classic French tarte au citron, you will love this cranberry version.',
'id': '13-how-to-cook-a-turkey',
'title': 'How to Cook a Turkey',
'description': 'md5:726cfd3f9b161bdf5c279879e8050ca0',
},
'playlist_count': 2,
}, {
# single video example
'url': 'https://cooking.nytimes.com/guides/50-how-to-make-mac-and-cheese',
'md5': '64415805fe0b8640fce6b0b9def5989a',
'info_dict': {
'id': '100000005835845',
'ext': 'mp4',
'title': 'How to Make Mac and Cheese',
'description': 'md5:b8f2f33ec1fb7523b21367147c9594f1',
'timestamp': 1522950315,
'upload_date': '20180405',
'duration': 9.51,
'creator': 'Alison Roman',
'thumbnail': r're:https?://\w+\.nyt.com/images/.*\.jpg',
},
}, {
'url': 'https://cooking.nytimes.com/guides/13-how-to-cook-a-turkey',
'md5': '4b2e8c70530a89b8d905a2b572316eb8',
'url': 'https://cooking.nytimes.com/guides/20-how-to-frost-a-cake',
'md5': '64415805fe0b8640fce6b0b9def5989a',
'info_dict': {
'id': '100000003951728',
'ext': 'mov',
'timestamp': 1445509539,
'description': 'Turkey guide',
'upload_date': '20151022',
'title': 'Turkey',
}
'id': '20-how-to-frost-a-cake',
'title': 'How to Frost a Cake',
'description': 'md5:a31fe3b98a8ce7b98aae097730c269cd',
},
'playlist_count': 8,
}]
def _real_extract(self, url):
page_id = self._match_id(url)
webpage = self._download_webpage(url, page_id)
title = self._html_search_meta(['og:title', 'twitter:title'], webpage)
description = self._html_search_meta(['og:description', 'twitter:description'], webpage)
video_id = self._search_regex(
r'data-video-id=["\'](\d+)', webpage, 'video id')
lead_video_id = self._search_regex(
r'data-video-player-id="(\d+)"></div>', webpage, 'lead video')
media_ids = traverse_obj(
get_elements_html_by_class('video-item', webpage), (..., {extract_attributes}, 'data-video-id'))
return self._extract_video_from_id(video_id)
if media_ids:
media_ids.append(lead_video_id)
return self.playlist_result(
[self._extract_video(media_id) for media_id in media_ids], page_id, title, description)
return {
**self._extract_video(lead_video_id),
'title': title,
'description': description,
'creator': self._search_regex( # TODO: change to 'creators'
r'<span itemprop="author">([^<]+)</span></p>', webpage, 'author', default=None),
}
class NYTimesCookingRecipeIE(InfoExtractor):
_VALID_URL = r'https?://cooking\.nytimes\.com/recipes/(?P<id>\d+)'
_TESTS = [{
'url': 'https://cooking.nytimes.com/recipes/1017817-cranberry-curd-tart',
'md5': '579e83bbe8e61e9de67f80edba8a78a8',
'info_dict': {
'id': '1017817',
'ext': 'mp4',
'title': 'Cranberry Curd Tart',
'description': 'md5:ad77a3fc321db636256d4343c5742152',
'timestamp': 1447804800,
'upload_date': '20151118',
'creator': 'David Tanis',
'thumbnail': r're:https?://\w+\.nyt.com/images/.*\.jpg',
},
}, {
'url': 'https://cooking.nytimes.com/recipes/1024781-neapolitan-checkerboard-cookies',
'md5': '58df35998241dcf0620e99e646331b42',
'info_dict': {
'id': '1024781',
'ext': 'mp4',
'title': 'Neapolitan Checkerboard Cookies',
'description': 'md5:ba12394c585ababea951cb6d2fcc6631',
'timestamp': 1701302400,
'upload_date': '20231130',
'creator': 'Sue Li',
'thumbnail': r're:https?://\w+\.nyt.com/images/.*\.jpg',
},
}, {
'url': 'https://cooking.nytimes.com/recipes/1019516-overnight-oats',
'md5': '2fe7965a3adc899913b8e25ada360823',
'info_dict': {
'id': '1019516',
'ext': 'mp4',
'timestamp': 1546387200,
'description': 'md5:8856ce10239161bd2596ac335b9f9bfb',
'upload_date': '20190102',
'title': 'Overnight Oats',
'creator': 'Genevieve Ko',
'thumbnail': r're:https?://\w+\.nyt.com/images/.*\.jpg',
},
}]
def _real_extract(self, url):
page_id = self._match_id(url)
webpage = self._download_webpage(url, page_id)
recipe_data = self._search_nextjs_data(webpage, page_id)['props']['pageProps']['recipe']
formats, subtitles = self._extract_m3u8_formats_and_subtitles(
recipe_data['videoSrc'], page_id, 'mp4', m3u8_id='hls')
return {
**traverse_obj(recipe_data, {
'id': ('id', {str_or_none}),
'title': ('title', {str}),
'description': ('topnote', {clean_html}),
'timestamp': ('publishedAt', {int_or_none}),
'creator': ('contentAttribution', 'cardByline', {str}),
}),
'formats': formats,
'subtitles': subtitles,
'thumbnails': [{'url': thumb_url} for thumb_url in traverse_obj(
recipe_data, ('image', 'crops', 'recipe', ..., {url_or_none}))],
}

View file

@ -1,4 +1,6 @@
from .common import InfoExtractor
from .jwplatform import JWPlatformIE
from ..utils import make_archive_id
class OneFootballIE(InfoExtractor):
@ -7,41 +9,43 @@ class OneFootballIE(InfoExtractor):
_TESTS = [{
'url': 'https://onefootball.com/en/video/highlights-fc-zuerich-3-3-fc-basel-34012334',
'info_dict': {
'id': '34012334',
'id': 'Y2VtcWAT',
'ext': 'mp4',
'title': 'Highlights: FC Zürich 3-3 FC Basel',
'description': 'md5:33d9855cb790702c4fe42a513700aba8',
'thumbnail': 'https://photobooth-api.onefootball.com/api/screenshot/https:%2F%2Fperegrine-api.onefootball.com%2Fv2%2Fphotobooth%2Fcms%2Fen%2F34012334',
'timestamp': 1635874604,
'upload_date': '20211102'
'thumbnail': 'https://cdn.jwplayer.com/v2/media/Y2VtcWAT/poster.jpg?width=720',
'timestamp': 1635874895,
'upload_date': '20211102',
'duration': 375.0,
'tags': ['Football', 'Soccer', 'OneFootball'],
'_old_archive_ids': ['onefootball 34012334'],
},
'params': {'skip_download': True}
'params': {'skip_download': True},
'expected_warnings': ['Failed to download m3u8 information'],
}, {
'url': 'https://onefootball.com/en/video/klopp-fumes-at-var-decisions-in-west-ham-defeat-34041020',
'info_dict': {
'id': '34041020',
'id': 'leVJrMho',
'ext': 'mp4',
'title': 'Klopp fumes at VAR decisions in West Ham defeat',
'description': 'md5:9c50371095a01ad3f63311c73d8f51a5',
'thumbnail': 'https://photobooth-api.onefootball.com/api/screenshot/https:%2F%2Fperegrine-api.onefootball.com%2Fv2%2Fphotobooth%2Fcms%2Fen%2F34041020',
'timestamp': 1636314103,
'upload_date': '20211107'
'thumbnail': 'https://cdn.jwplayer.com/v2/media/leVJrMho/poster.jpg?width=720',
'timestamp': 1636315232,
'upload_date': '20211107',
'duration': 93.0,
'tags': ['Football', 'Soccer', 'OneFootball'],
'_old_archive_ids': ['onefootball 34041020'],
},
'params': {'skip_download': True}
}]
def _real_extract(self, url):
id = self._match_id(url)
webpage = self._download_webpage(url, id)
data_json = self._search_json_ld(webpage, id)
m3u8_url = self._html_search_regex(r'(https://cdn\.jwplayer\.com/manifests/.+\.m3u8)', webpage, 'm3u8_url')
formats, subtitles = self._extract_m3u8_formats_and_subtitles(m3u8_url, id)
return {
'id': id,
'title': data_json.get('title'),
'description': data_json.get('description'),
'thumbnail': data_json.get('thumbnail'),
'timestamp': data_json.get('timestamp'),
'formats': formats,
'subtitles': subtitles,
}
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
data_json = self._search_json_ld(webpage, video_id, fatal=False)
data_json.pop('url', None)
m3u8_url = self._html_search_regex(r'(https://cdn\.jwplayer\.com/manifests/\w+\.m3u8)', webpage, 'm3u8_url')
return self.url_result(
m3u8_url, JWPlatformIE, video_id, _old_archive_ids=[make_archive_id(self, video_id)],
**data_json, url_transparent=True)

View file

@ -12,6 +12,8 @@
class OpenRecBaseIE(InfoExtractor):
_M3U8_HEADERS = {'Referer': 'https://www.openrec.tv/'}
def _extract_pagestore(self, webpage, video_id):
return self._parse_json(
self._search_regex(r'(?m)window\.pageStore\s*=\s*(\{.+?\});$', webpage, 'window.pageStore'), video_id)
@ -21,7 +23,7 @@ def _expand_media(self, video_id, media):
if not m3u8_url:
continue
yield from self._extract_m3u8_formats(
m3u8_url, video_id, ext='mp4', m3u8_id=name)
m3u8_url, video_id, ext='mp4', m3u8_id=name, headers=self._M3U8_HEADERS)
def _extract_movie(self, webpage, video_id, name, is_live):
window_stores = self._extract_pagestore(webpage, video_id)
@ -60,6 +62,7 @@ def _extract_movie(self, webpage, video_id, name, is_live):
'uploader_id': get_first(movie_stores, ('channel', 'user', 'id')),
'timestamp': int_or_none(get_first(movie_stores, ['publishedAt', 'time']), scale=1000) or unified_timestamp(get_first(movie_stores, 'publishedAt')),
'is_live': is_live,
'http_headers': self._M3U8_HEADERS,
}
@ -110,7 +113,7 @@ def _real_extract(self, url):
raise ExtractorError('Cannot extract title')
formats = self._extract_m3u8_formats(
capture_data.get('source'), video_id, ext='mp4')
capture_data.get('source'), video_id, ext='mp4', headers=self._M3U8_HEADERS)
return {
'id': video_id,
@ -121,6 +124,7 @@ def _real_extract(self, url):
'uploader': traverse_obj(movie_store, ('channel', 'name'), expected_type=compat_str),
'uploader_id': traverse_obj(movie_store, ('channel', 'id'), expected_type=compat_str),
'upload_date': unified_strdate(capture_data.get('createdAt')),
'http_headers': self._M3U8_HEADERS,
}

View file

@ -1,3 +1,4 @@
import base64
import functools
import re
@ -565,3 +566,66 @@ def _real_extract(self, url):
})
return self.playlist_result(entries)
class ORFONIE(InfoExtractor):
IE_NAME = 'orf:on'
_VALID_URL = r'https?://on\.orf\.at/video/(?P<id>\d{8})/(?P<slug>[\w-]+)'
_TESTS = [{
'url': 'https://on.orf.at/video/14210000/school-of-champions-48',
'info_dict': {
'id': '14210000',
'ext': 'mp4',
'duration': 2651.08,
'thumbnail': 'https://api-tvthek.orf.at/assets/segments/0167/98/thumb_16697671_segments_highlight_teaser.jpeg',
'title': 'School of Champions (4/8)',
'description': 'md5:d09ad279fc2e8502611e7648484b6afd',
'media_type': 'episode',
'timestamp': 1706472362,
'upload_date': '20240128',
}
}]
def _extract_video(self, video_id, display_id):
encrypted_id = base64.b64encode(f'3dSlfek03nsLKdj4Jsd{video_id}'.encode()).decode()
api_json = self._download_json(
f'https://api-tvthek.orf.at/api/v4.3/public/episode/encrypted/{encrypted_id}', display_id)
formats, subtitles = [], {}
for manifest_type in traverse_obj(api_json, ('sources', {dict.keys}, ...)):
for manifest_url in traverse_obj(api_json, ('sources', manifest_type, ..., 'src', {url_or_none})):
if manifest_type == 'hls':
fmts, subs = self._extract_m3u8_formats_and_subtitles(
manifest_url, display_id, fatal=False, m3u8_id='hls')
elif manifest_type == 'dash':
fmts, subs = self._extract_mpd_formats_and_subtitles(
manifest_url, display_id, fatal=False, mpd_id='dash')
else:
continue
formats.extend(fmts)
self._merge_subtitles(subs, target=subtitles)
return {
'id': video_id,
'formats': formats,
'subtitles': subtitles,
**traverse_obj(api_json, {
'duration': ('duration_second', {float_or_none}),
'title': (('title', 'headline'), {str}),
'description': (('description', 'teaser_text'), {str}),
'media_type': ('video_type', {str}),
}, get_all=False),
}
def _real_extract(self, url):
video_id, display_id = self._match_valid_url(url).group('id', 'slug')
webpage = self._download_webpage(url, display_id)
return {
'id': video_id,
'title': self._html_search_meta(['og:title', 'twitter:title'], webpage, default=None),
'description': self._html_search_meta(
['description', 'og:description', 'twitter:description'], webpage, default=None),
**self._search_json_ld(webpage, display_id, fatal=False),
**self._extract_video(video_id, display_id),
}

View file

@ -275,7 +275,7 @@ def _real_extract(self, url):
'ext': ext,
'url': post_file['url'],
}
elif name == 'video':
elif name == 'video' or determine_ext(post_file.get('url')) == 'm3u8':
formats, subtitles = self._extract_m3u8_formats_and_subtitles(post_file['url'], video_id)
return {
**info,

File diff suppressed because it is too large Load diff

View file

@ -1,10 +1,18 @@
import json
from .common import InfoExtractor
from ..utils import int_or_none, traverse_obj
from ..utils import (
ExtractorError,
int_or_none,
parse_qs,
traverse_obj,
update_url_query,
urlencode_postdata,
)
class PlaySuisseIE(InfoExtractor):
_NETRC_MACHINE = 'playsuisse'
_VALID_URL = r'https?://(?:www\.)?playsuisse\.ch/(?:watch|detail)/(?:[^#]*[?&]episodeId=)?(?P<id>[0-9]+)'
_TESTS = [
{
@ -134,12 +142,47 @@ class PlaySuisseIE(InfoExtractor):
id
url
}'''
_LOGIN_BASE_URL = 'https://login.srgssr.ch/srgssrlogin.onmicrosoft.com'
_LOGIN_PATH = 'B2C_1A__SignInV2'
_ID_TOKEN = None
def _perform_login(self, username, password):
login_page = self._download_webpage(
'https://www.playsuisse.ch/api/sso/login', None, note='Downloading login page',
query={'x': 'x', 'locale': 'de', 'redirectUrl': 'https://www.playsuisse.ch/'})
settings = self._search_json(r'var\s+SETTINGS\s*=', login_page, 'settings', None)
csrf_token = settings['csrf']
query = {'tx': settings['transId'], 'p': self._LOGIN_PATH}
status = traverse_obj(self._download_json(
f'{self._LOGIN_BASE_URL}/{self._LOGIN_PATH}/SelfAsserted', None, 'Logging in',
query=query, headers={'X-CSRF-TOKEN': csrf_token}, data=urlencode_postdata({
'request_type': 'RESPONSE',
'signInName': username,
'password': password
}), expected_status=400), ('status', {int_or_none}))
if status == 400:
raise ExtractorError('Invalid username or password', expected=True)
urlh = self._request_webpage(
f'{self._LOGIN_BASE_URL}/{self._LOGIN_PATH}/api/CombinedSigninAndSignup/confirmed',
None, 'Downloading ID token', query={
'rememberMe': 'false',
'csrf_token': csrf_token,
**query,
'diags': '',
})
self._ID_TOKEN = traverse_obj(parse_qs(urlh.url), ('id_token', 0))
if not self._ID_TOKEN:
raise ExtractorError('Login failed')
def _get_media_data(self, media_id):
# NOTE In the web app, the "locale" header is used to switch between languages,
# However this doesn't seem to take effect when passing the header here.
response = self._download_json(
'https://4bbepzm4ef.execute-api.eu-central-1.amazonaws.com/prod/graphql',
'https://www.playsuisse.ch/api/graphql',
media_id, data=json.dumps({
'operationName': 'AssetWatch',
'query': self._GRAPHQL_QUERY,
@ -150,6 +193,9 @@ def _get_media_data(self, media_id):
return response['data']['assetV2']
def _real_extract(self, url):
if not self._ID_TOKEN:
self.raise_login_required(method='password')
media_id = self._match_id(url)
media_data = self._get_media_data(media_id)
info = self._extract_single(media_data)
@ -168,7 +214,8 @@ def _extract_single(self, media_data):
if not media.get('url') or media.get('type') != 'HLS':
continue
f, subs = self._extract_m3u8_formats_and_subtitles(
media['url'], media_data['id'], 'mp4', m3u8_id='HLS', fatal=False)
update_url_query(media['url'], {'id_token': self._ID_TOKEN}),
media_data['id'], 'mp4', m3u8_id='HLS', fatal=False)
formats.extend(f)
self._merge_subtitles(subs, target=subtitles)

View file

@ -87,8 +87,8 @@ def _login(self, host):
def is_logged(webpage):
return any(re.search(p, webpage) for p in (
r'class=["\']signOut',
r'>Sign\s+[Oo]ut\s*<'))
r'id="profileMenuDropdown"',
r'class="ph-icon-logout"'))
if is_logged(login_page):
self._logged_in = True

View file

@ -18,7 +18,6 @@
class Pr0grammIE(InfoExtractor):
_VALID_URL = r'https?://pr0gramm\.com\/(?:[^/?#]+/)+(?P<id>[\d]+)(?:[/?#:]|$)'
_TESTS = [{
# Tags require account
'url': 'https://pr0gramm.com/new/video/5466437',
'info_dict': {
'id': '5466437',
@ -36,7 +35,6 @@ class Pr0grammIE(InfoExtractor):
'_old_archive_ids': ['pr0grammstatic 5466437'],
},
}, {
# Tags require account
'url': 'https://pr0gramm.com/new/3052805:comment28391322',
'info_dict': {
'id': '3052805',
@ -71,6 +69,23 @@ class Pr0grammIE(InfoExtractor):
'thumbnail': r're:^https://thumb\.pr0gramm\.com/.*\.jpg',
'_old_archive_ids': ['pr0grammstatic 5848332'],
},
}, {
'url': 'https://pr0gramm.com/top/5895149',
'info_dict': {
'id': '5895149',
'ext': 'mp4',
'title': 'pr0gramm-5895149 by algoholigSeeManThrower',
'tags': 'count:19',
'uploader': 'algoholigSeeManThrower',
'uploader_id': 457556,
'upload_timestamp': 1697580902,
'upload_date': '20231018',
'like_count': int,
'dislike_count': int,
'age_limit': 0,
'thumbnail': 'https://thumb.pr0gramm.com/2023/10/18/db47bb3db5e1a1b3.jpg',
'_old_archive_ids': ['pr0grammstatic 5895149'],
},
}, {
'url': 'https://pr0gramm.com/static/5466437',
'only_matching': True,
@ -92,15 +107,15 @@ def _is_logged_in(self):
def _maximum_flags(self):
# We need to guess the flags for the content otherwise the api will raise an error
# We can guess the maximum allowed flags for the account from the cookies
# Bitflags are (msbf): nsfp, nsfl, nsfw, sfw
flags = 0b0001
# Bitflags are (msbf): pol, nsfp, nsfl, nsfw, sfw
flags = 0b10001
if self._is_logged_in:
flags |= 0b1000
flags |= 0b01000
cookies = self._get_cookies(self.BASE_URL)
if 'me' not in cookies:
self._download_webpage(self.BASE_URL, None, 'Refreshing verification information')
if traverse_obj(cookies, ('me', {lambda x: x.value}, {unquote}, {json.loads}, 'verified')):
flags |= 0b0110
flags |= 0b00110
return flags
@ -134,14 +149,12 @@ def _real_extract(self, url):
if not source or not source.endswith('mp4'):
self.raise_no_formats('Could not extract a video', expected=bool(source), video_id=video_id)
tags = None
if self._is_logged_in:
metadata = self._call_api('info', video_id, {'itemId': video_id}, note='Downloading tags')
tags = traverse_obj(metadata, ('tags', ..., 'tag', {str}))
# Sorted by "confidence", higher confidence = earlier in list
confidences = traverse_obj(metadata, ('tags', ..., 'confidence', ({int}, {float})))
if confidences:
tags = [tag for _, tag in sorted(zip(confidences, tags), reverse=True)]
metadata = self._call_api('info', video_id, {'itemId': video_id}, note='Downloading tags')
tags = traverse_obj(metadata, ('tags', ..., 'tag', {str}))
# Sorted by "confidence", higher confidence = earlier in list
confidences = traverse_obj(metadata, ('tags', ..., 'confidence', ({int}, {float})))
if confidences:
tags = [tag for _, tag in sorted(zip(confidences, tags), reverse=True)]
formats = traverse_obj(video_info, ('variants', ..., {
'format_id': ('name', {str}),

View file

@ -1,5 +1,8 @@
import json
from .common import InfoExtractor
from ..utils import parse_iso8601, traverse_obj, try_call
from ..utils import float_or_none, parse_iso8601, str_or_none, try_call
from ..utils.traversal import traverse_obj
class PrankCastIE(InfoExtractor):
@ -64,3 +67,71 @@ def _real_extract(self, url):
'categories': [json_info.get('broadcast_category')],
'tags': try_call(lambda: json_info['broadcast_tags'].split(','))
}
class PrankCastPostIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?prankcast\.com/[^/?#]+/posts/(?P<id>\d+)-(?P<display_id>[^/?#]+)'
_TESTS = [{
'url': 'https://prankcast.com/devonanustart/posts/6214-happy-national-rachel-day-',
'info_dict': {
'id': '6214',
'ext': 'mp3',
'title': 'Happy National Rachel Day!',
'display_id': 'happy-national-rachel-day-',
'timestamp': 1704333938,
'uploader': 'Devonanustart',
'channel_id': '4',
'duration': 13175,
'cast': ['Devonanustart'],
'description': '',
'categories': ['prank call'],
'upload_date': '20240104'
}
}, {
'url': 'https://prankcast.com/despicabledogs/posts/6217-jake-the-work-crow-',
'info_dict': {
'id': '6217',
'ext': 'mp3',
'title': 'Jake the Work Crow!',
'display_id': 'jake-the-work-crow-',
'timestamp': 1704346592,
'uploader': 'despicabledogs',
'channel_id': '957',
'duration': 263.287,
'cast': ['despicabledogs'],
'description': 'https://imgur.com/a/vtxLvKU',
'categories': [],
'upload_date': '20240104'
}
}]
def _real_extract(self, url):
video_id, display_id = self._match_valid_url(url).group('id', 'display_id')
webpage = self._download_webpage(url, video_id)
post = self._search_nextjs_data(webpage, video_id)['props']['pageProps']['ssr_data_posts']
content = self._parse_json(post['post_contents_json'], video_id)[0]
uploader = post.get('user_name')
guests_json = traverse_obj(content, ('guests_json', {json.loads}, {dict})) or {}
return {
'id': video_id,
'title': post.get('post_title') or self._og_search_title(webpage),
'display_id': display_id,
'url': content.get('url'),
'timestamp': parse_iso8601(content.get('start_date') or content.get('crdate'), ' '),
'uploader': uploader,
'channel_id': str_or_none(post.get('user_id')),
'duration': float_or_none(content.get('duration')),
'cast': list(filter(None, [uploader] + traverse_obj(guests_json, (..., 'name')))),
'description': post.get('post_body'),
'categories': list(filter(None, [content.get('category')])),
'tags': try_call(lambda: list(filter('', post['post_tags'].split(',')))),
'subtitles': {
'live_chat': [{
'url': f'https://prankcast.com/api/private/chat/select-broadcast?id={post["content_id"]}&cache=',
'ext': 'json',
}],
} if post.get('content_id') else None
}

View file

@ -1,5 +1,6 @@
import base64
import random
import re
import urllib.parse
from .common import InfoExtractor
@ -11,6 +12,7 @@
unified_timestamp,
update_url_query,
)
from ..utils.traversal import traverse_obj
class RadikoBaseIE(InfoExtractor):
@ -159,6 +161,12 @@ def _extract_formats(self, video_id, station, is_onair, ft, cursor, auth_token,
return formats
def _extract_performers(self, prog):
performers = traverse_obj(prog, (
'pfm/text()', ..., {lambda x: re.split(r'[//、 ,]', x)}, ..., {str.strip}))
# TODO: change 'artist' fields to 'artists' and return traversal list instead of str
return ', '.join(performers) or None
class RadikoIE(RadikoBaseIE):
_VALID_URL = r'https?://(?:www\.)?radiko\.jp/#!/ts/(?P<station>[A-Z0-9-]+)/(?P<id>\d+)'
@ -186,10 +194,12 @@ def _real_extract(self, url):
return {
'id': video_id,
'title': try_call(lambda: prog.find('title').text),
'artist': self._extract_performers(prog),
'description': clean_html(try_call(lambda: prog.find('info').text)),
'uploader': try_call(lambda: station_program.find('.//name').text),
'uploader_id': station,
'timestamp': vid_int,
'duration': try_call(lambda: unified_timestamp(radio_end, False) - unified_timestamp(radio_begin, False)),
'is_live': True,
'formats': self._extract_formats(
video_id=video_id, station=station, is_onair=False,
@ -243,6 +253,7 @@ def _real_extract(self, url):
return {
'id': station,
'title': title,
'artist': self._extract_performers(prog),
'description': description,
'uploader': station_name,
'uploader_id': station,

View file

@ -1,6 +1,7 @@
import re
from .common import InfoExtractor
from ..networking import HEADRequest
from ..utils import (
clean_html,
determine_ext,
@ -91,7 +92,7 @@ def fix_cdata(s):
self.raise_geo_restricted(countries=self._GEO_COUNTRIES, metadata_available=True)
if not audio_only and not is_live:
formats.extend(self._create_http_urls(media_url, relinker_url, formats))
formats.extend(self._create_http_urls(media_url, relinker_url, formats, video_id))
return filter_dict({
'is_live': is_live,
@ -99,7 +100,7 @@ def fix_cdata(s):
'formats': formats,
})
def _create_http_urls(self, manifest_url, relinker_url, fmts):
def _create_http_urls(self, manifest_url, relinker_url, fmts, video_id):
_MANIFEST_REG = r'/(?P<id>\w+)(?:_(?P<quality>[\d\,]+))?(?:\.mp4)?(?:\.csmil)?/playlist\.m3u8'
_MP4_TMPL = '%s&overrideUserAgentRule=mp4-%s'
_QUALITY = {
@ -166,6 +167,14 @@ def get_format_info(tbr):
'fps': 25,
}
# Check if MP4 download is available
try:
self._request_webpage(
HEADRequest(_MP4_TMPL % (relinker_url, '*')), video_id, 'Checking MP4 availability')
except ExtractorError as e:
self.to_screen(f'{video_id}: MP4 direct download is not available: {e.cause}')
return []
# filter out single-stream formats
fmts = [f for f in fmts
if not f.get('vcodec') == 'none' and not f.get('acodec') == 'none']

135
yt_dlp/extractor/redge.py Normal file
View file

@ -0,0 +1,135 @@
import functools
from .common import InfoExtractor
from ..networking import HEADRequest
from ..utils import (
float_or_none,
int_or_none,
join_nonempty,
parse_qs,
update_url_query,
)
from ..utils.traversal import traverse_obj
class RedCDNLivxIE(InfoExtractor):
_VALID_URL = r'https?://[^.]+\.(?:dcs\.redcdn|atmcdn)\.pl/(?:live(?:dash|hls|ss)|nvr)/o2/(?P<tenant>[^/?#]+)/(?P<id>[^?#]+)\.livx'
IE_NAME = 'redcdnlivx'
_TESTS = [{
'url': 'https://r.dcs.redcdn.pl/livedash/o2/senat/ENC02/channel.livx?indexMode=true&startTime=638272860000&stopTime=638292544000',
'info_dict': {
'id': 'ENC02-638272860000-638292544000',
'ext': 'mp4',
'title': 'ENC02',
'duration': 19683.982,
'live_status': 'was_live',
},
}, {
'url': 'https://r.dcs.redcdn.pl/livedash/o2/sejm/ENC18/live.livx?indexMode=true&startTime=722333096000&stopTime=722335562000',
'info_dict': {
'id': 'ENC18-722333096000-722335562000',
'ext': 'mp4',
'title': 'ENC18',
'duration': 2463.995,
'live_status': 'was_live',
},
}, {
'url': 'https://r.dcs.redcdn.pl/livehls/o2/sportevolution/live/triathlon2018/warsaw.livx/playlist.m3u8?startTime=550305000000&stopTime=550327620000',
'info_dict': {
'id': 'triathlon2018-warsaw-550305000000-550327620000',
'ext': 'mp4',
'title': 'triathlon2018/warsaw',
'duration': 22619.98,
'live_status': 'was_live',
},
}, {
'url': 'https://n-25-12.dcs.redcdn.pl/nvr/o2/sejm/Migacz-ENC01/1.livx?startTime=722347200000&stopTime=722367345000',
'only_matching': True,
}, {
'url': 'https://redir.atmcdn.pl/nvr/o2/sejm/ENC08/1.livx?startTime=503831270000&stopTime=503840040000',
'only_matching': True,
}]
"""
Known methods (first in url path):
- `livedash` - DASH MPD
- `livehls` - HTTP Live Streaming
- `livess` - IIS Smooth Streaming
- `nvr` - CCTV mode, directly returns a file, typically flv, avc1, aac
- `sc` - shoutcast/icecast (audio streams, like radio)
"""
def _real_extract(self, url):
tenant, path = self._match_valid_url(url).group('tenant', 'id')
qs = parse_qs(url)
start_time = traverse_obj(qs, ('startTime', 0, {int_or_none}))
stop_time = traverse_obj(qs, ('stopTime', 0, {int_or_none}))
def livx_mode(mode):
suffix = ''
if mode == 'livess':
suffix = '/manifest'
elif mode == 'livehls':
suffix = '/playlist.m3u8'
file_qs = {}
if start_time:
file_qs['startTime'] = start_time
if stop_time:
file_qs['stopTime'] = stop_time
if mode == 'nvr':
file_qs['nolimit'] = 1
elif mode != 'sc':
file_qs['indexMode'] = 'true'
return update_url_query(f'https://r.dcs.redcdn.pl/{mode}/o2/{tenant}/{path}.livx{suffix}', file_qs)
# no id or title for a transmission. making ones up.
title = path \
.replace('/live', '').replace('live/', '') \
.replace('/channel', '').replace('channel/', '') \
.strip('/')
video_id = join_nonempty(title.replace('/', '-'), start_time, stop_time)
formats = []
# downloading the manifest separately here instead of _extract_ism_formats to also get some stream metadata
ism_res = self._download_xml_handle(
livx_mode('livess'), video_id,
note='Downloading ISM manifest',
errnote='Failed to download ISM manifest',
fatal=False)
ism_doc = None
if ism_res is not False:
ism_doc, ism_urlh = ism_res
formats, _ = self._parse_ism_formats_and_subtitles(ism_doc, ism_urlh.url, 'ss')
nvr_urlh = self._request_webpage(
HEADRequest(livx_mode('nvr')), video_id, 'Follow flv file redirect', fatal=False,
expected_status=lambda _: True)
if nvr_urlh and nvr_urlh.status == 200:
formats.append({
'url': nvr_urlh.url,
'ext': 'flv',
'format_id': 'direct-0',
'preference': -1, # might be slow
})
formats.extend(self._extract_mpd_formats(livx_mode('livedash'), video_id, mpd_id='dash', fatal=False))
formats.extend(self._extract_m3u8_formats(
livx_mode('livehls'), video_id, m3u8_id='hls', ext='mp4', fatal=False))
time_scale = traverse_obj(ism_doc, ('@TimeScale', {int_or_none})) or 10000000
duration = traverse_obj(
ism_doc, ('@Duration', {functools.partial(float_or_none, scale=time_scale)})) or None
live_status = None
if traverse_obj(ism_doc, '@IsLive') == 'TRUE':
live_status = 'is_live'
elif duration:
live_status = 'was_live'
return {
'id': video_id,
'title': title,
'formats': formats,
'duration': duration,
'live_status': live_status,
}

View file

@ -7,11 +7,12 @@
str_to_int,
unified_strdate,
url_or_none,
urljoin,
)
class RedTubeIE(InfoExtractor):
_VALID_URL = r'https?://(?:(?:\w+\.)?redtube\.com/|embed\.redtube\.com/\?.*?\bid=)(?P<id>[0-9]+)'
_VALID_URL = r'https?://(?:(?:\w+\.)?redtube\.com(?:\.br)?/|embed\.redtube\.com/\?.*?\bid=)(?P<id>[0-9]+)'
_EMBED_REGEX = [r'<iframe[^>]+?src=["\'](?P<url>(?:https?:)?//embed\.redtube\.com/\?.*?\bid=\d+)']
_TESTS = [{
'url': 'https://www.redtube.com/38864951',
@ -34,6 +35,9 @@ class RedTubeIE(InfoExtractor):
}, {
'url': 'http://it.redtube.com/66418',
'only_matching': True,
}, {
'url': 'https://www.redtube.com.br/103224331',
'only_matching': True,
}]
def _real_extract(self, url):
@ -79,7 +83,7 @@ def _real_extract(self, url):
'media definitions', default='{}'),
video_id, fatal=False)
for media in medias if isinstance(medias, list) else []:
format_url = url_or_none(media.get('videoUrl'))
format_url = urljoin('https://www.redtube.com', media.get('videoUrl'))
if not format_url:
continue
format_id = media.get('format')

View file

@ -247,17 +247,17 @@ class MujRozhlasIE(RozhlasBaseIE):
'url': 'https://www.mujrozhlas.cz/vykopavky/ach-jo-zase-teleci-rizek-je-mnohem-min-cesky-nez-jsme-si-mysleli',
'md5': '6f8fd68663e64936623e67c152a669e0',
'info_dict': {
'id': '10739193',
'id': '10787730',
'ext': 'mp3',
'title': 'Ach jo, zase to telecí! Řízek je mnohem míň český, než jsme si mysleli',
'description': 'md5:db7141e9caaedc9041ec7cefb9a62908',
'timestamp': 1684915200,
'modified_timestamp': 1684922446,
'modified_timestamp': 1687550432,
'series': 'Vykopávky',
'thumbnail': 'https://portal.rozhlas.cz/sites/default/files/images/84377046610af6ddc54d910b1dd7a22b.jpg',
'channel_id': 'radio-wave',
'upload_date': '20230524',
'modified_date': '20230524',
'modified_date': '20230623',
},
}, {
# serial extraction
@ -277,6 +277,26 @@ class MujRozhlasIE(RozhlasBaseIE):
'title': 'Nespavci',
'description': 'md5:c430adcbf9e2b9eac88b745881e814dc',
},
}, {
# serialPart
'url': 'https://www.mujrozhlas.cz/povidka/gustavo-adolfo-becquer-hora-duchu',
'info_dict': {
'id': '8889035',
'ext': 'm4a',
'title': 'Gustavo Adolfo Bécquer: Hora duchů',
'description': 'md5:343a15257b376c276e210b78e900ffea',
'chapter': 'Hora duchů a Polibek dva tajemné příběhy Gustava Adolfa Bécquera',
'thumbnail': 'https://portal.rozhlas.cz/sites/default/files/images/2adfe1387fb140634be725c1ccf26214.jpg',
'timestamp': 1708173000,
'episode': 'Episode 1',
'episode_number': 1,
'series': 'Povídka',
'modified_date': '20240217',
'upload_date': '20240217',
'modified_timestamp': 1708173198,
'channel_id': 'vltava',
},
'params': {'skip_download': 'dash'},
}]
def _call_api(self, path, item_id, msg='API JSON'):
@ -322,7 +342,7 @@ def _real_extract(self, url):
entity = info['siteEntityBundle']
if entity == 'episode':
if entity in ('episode', 'serialPart'):
return self._extract_audio_entry(self._call_api(
'episodes', info['contentId'], 'episode info API JSON'))

View file

@ -9,7 +9,6 @@
get_element_html_by_class,
get_elements_by_class,
int_or_none,
join_nonempty,
parse_count,
parse_duration,
unescapeHTML,
@ -18,10 +17,10 @@
class Rule34VideoIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?rule34video\.com/videos/(?P<id>\d+)'
_VALID_URL = r'https?://(?:www\.)?rule34video\.com/videos?/(?P<id>\d+)'
_TESTS = [
{
'url': 'https://rule34video.com/videos/3065157/shot-it-mmd-hmv/',
'url': 'https://rule34video.com/video/3065157/shot-it-mmd-hmv/',
'md5': 'ffccac2c23799dabbd192621ae4d04f3',
'info_dict': {
'id': '3065157',
@ -57,7 +56,7 @@ class Rule34VideoIE(InfoExtractor):
'comment_count': int,
'timestamp': 1640131200,
'description': '',
'creator': 'WildeerStudio',
'creators': ['WildeerStudio'],
'upload_date': '20211222',
'uploader': 'CerZule',
'uploader_url': 'https://rule34video.com/members/36281/',
@ -81,13 +80,13 @@ def _real_extract(self, url):
'quality': quality,
})
categories, creator, uploader, uploader_url = [None] * 4
categories, creators, uploader, uploader_url = [None] * 4
for col in get_elements_by_class('col', webpage):
label = clean_html(get_element_by_class('label', col))
if label == 'Categories:':
categories = list(map(clean_html, get_elements_by_class('item', col)))
elif label == 'Artist:':
creator = join_nonempty(*map(clean_html, get_elements_by_class('item', col)), delim=', ')
creators = list(map(clean_html, get_elements_by_class('item', col)))
elif label == 'Uploaded By:':
uploader = clean_html(get_element_by_class('name', col))
uploader_url = extract_attributes(get_element_html_by_class('name', col) or '').get('href')
@ -115,7 +114,7 @@ def _real_extract(self, url):
'comment_count': int_or_none(self._search_regex(
r'[^(]+\((\d+)\)', get_element_by_attribute('href', '#tab_comments', webpage), 'comment count', fatal=False)),
'age_limit': 18,
'creator': creator,
'creators': creators,
'uploader': uploader,
'uploader_url': uploader_url,
'categories': categories,

View file

@ -383,7 +383,7 @@ def entries(self, url, playlist_id):
if isinstance(e.cause, HTTPError) and e.cause.status == 404:
break
raise
for video_url in re.findall(r'class=video-item--a\s?href=([^>]+\.html)', webpage):
for video_url in re.findall(r'class="[^>"]*videostream__link[^>]+href="([^"]+\.html)"', webpage):
yield self.url_result('https://rumble.com' + video_url)
def _real_extract(self, url):

View file

@ -5,7 +5,10 @@
class ScreencastifyIE(InfoExtractor):
_VALID_URL = r'https?://watch\.screencastify\.com/v/(?P<id>[^/?#]+)'
_VALID_URL = [
r'https?://watch\.screencastify\.com/v/(?P<id>[^/?#]+)',
r'https?://app\.screencastify\.com/v[23]/watch/(?P<id>[^/?#]+)',
]
_TESTS = [{
'url': 'https://watch.screencastify.com/v/sYVkZip3quLKhHw4Ybk8',
'info_dict': {
@ -19,6 +22,21 @@ class ScreencastifyIE(InfoExtractor):
'params': {
'skip_download': 'm3u8',
},
}, {
'url': 'https://app.screencastify.com/v3/watch/J5N7H11wofDN1jZUCr3t',
'info_dict': {
'id': 'J5N7H11wofDN1jZUCr3t',
'ext': 'mp4',
'uploader': 'Scott Piesen',
'description': '',
'title': 'Lesson Recording 1-17 Burrr...',
},
'params': {
'skip_download': 'm3u8',
},
}, {
'url': 'https://app.screencastify.com/v2/watch/BQ26VbUdfbQLhKzkktOk',
'only_matching': True,
}]
def _real_extract(self, url):

218
yt_dlp/extractor/sejmpl.py Normal file
View file

@ -0,0 +1,218 @@
import datetime
from .common import InfoExtractor
from .redge import RedCDNLivxIE
from ..utils import (
clean_html,
join_nonempty,
js_to_json,
strip_or_none,
update_url_query,
)
from ..utils.traversal import traverse_obj
def is_dst(date):
last_march = datetime.datetime(date.year, 3, 31)
last_october = datetime.datetime(date.year, 10, 31)
last_sunday_march = last_march - datetime.timedelta(days=last_march.isoweekday() % 7)
last_sunday_october = last_october - datetime.timedelta(days=last_october.isoweekday() % 7)
return last_sunday_march.replace(hour=2) <= date <= last_sunday_october.replace(hour=3)
def rfc3339_to_atende(date):
date = datetime.datetime.fromisoformat(date)
date = date + datetime.timedelta(hours=1 if is_dst(date) else 0)
return int((date.timestamp() - 978307200) * 1000)
class SejmIE(InfoExtractor):
_VALID_URL = (
r'https?://(?:www\.)?sejm\.gov\.pl/[Ss]ejm(?P<term>\d+)\.nsf/transmisje(?:_arch)?\.xsp(?:\?[^#]*)?#(?P<id>[\dA-F]+)',
r'https?://(?:www\.)?sejm\.gov\.pl/[Ss]ejm(?P<term>\d+)\.nsf/transmisje(?:_arch)?\.xsp\?(?:[^#]+&)?unid=(?P<id>[\dA-F]+)',
r'https?://sejm-embed\.redcdn\.pl/[Ss]ejm(?P<term>\d+)\.nsf/VideoFrame\.xsp/(?P<id>[\dA-F]+)',
)
IE_NAME = 'sejm'
_TESTS = [{
# multiple cameras, polish SL iterpreter
'url': 'https://www.sejm.gov.pl/Sejm10.nsf/transmisje_arch.xsp#6181EF1AD9CEEBB5C1258A6D006452B5',
'info_dict': {
'id': '6181EF1AD9CEEBB5C1258A6D006452B5',
'title': '1. posiedzenie Sejmu X kadencji',
'duration': 20145,
'live_status': 'was_live',
'location': 'Sala Posiedzeń',
},
'playlist': [{
'info_dict': {
'id': 'ENC01-722340000000-722360145000',
'ext': 'mp4',
'duration': 20145,
'title': '1. posiedzenie Sejmu X kadencji - ENC01',
'live_status': 'was_live',
},
}, {
'info_dict': {
'id': 'ENC30-722340000000-722360145000',
'ext': 'mp4',
'duration': 20145,
'title': '1. posiedzenie Sejmu X kadencji - ENC30',
'live_status': 'was_live',
},
}, {
'info_dict': {
'id': 'ENC31-722340000000-722360145000',
'ext': 'mp4',
'duration': 20145,
'title': '1. posiedzenie Sejmu X kadencji - ENC31',
'live_status': 'was_live',
},
}, {
'info_dict': {
'id': 'ENC32-722340000000-722360145000',
'ext': 'mp4',
'duration': 20145,
'title': '1. posiedzenie Sejmu X kadencji - ENC32',
'live_status': 'was_live',
},
}, {
# sign lang interpreter
'info_dict': {
'id': 'Migacz-ENC01-1-722340000000-722360145000',
'ext': 'mp4',
'duration': 20145,
'title': '1. posiedzenie Sejmu X kadencji - Migacz-ENC01',
'live_status': 'was_live',
},
}],
}, {
'url': 'https://www.sejm.gov.pl/Sejm8.nsf/transmisje.xsp?unid=9377A9D65518E9A5C125808E002E9FF2',
'info_dict': {
'id': '9377A9D65518E9A5C125808E002E9FF2',
'title': 'Debata "Lepsza Polska: obywatelska"',
'description': 'KP .Nowoczesna',
'duration': 8770,
'live_status': 'was_live',
'location': 'sala kolumnowa im. Kazimierza Pużaka (bud. C-D)',
},
'playlist': [{
'info_dict': {
'id': 'ENC08-1-503831270000-503840040000',
'ext': 'mp4',
'duration': 8770,
'title': 'Debata "Lepsza Polska: obywatelska" - ENC08',
'live_status': 'was_live',
},
}],
}, {
# 7th term is very special, since it does not use redcdn livx
'url': 'https://www.sejm.gov.pl/sejm7.nsf/transmisje_arch.xsp?rok=2015&month=11#A6E6D475ECCC6FE5C1257EF90034817F',
'info_dict': {
'id': 'A6E6D475ECCC6FE5C1257EF90034817F',
'title': 'Konferencja prasowa - Stanowisko SLD ws. składu nowego rządu',
'description': 'SLD - Biuro Prasowe Klubu',
'duration': 514,
'location': 'sala 101/bud. C',
'live_status': 'was_live',
},
'playlist': [{
'info_dict': {
'id': 'A6E6D475ECCC6FE5C1257EF90034817F',
'ext': 'mp4',
'title': 'Konferencja prasowa - Stanowisko SLD ws. składu nowego rządu',
'duration': 514,
},
}],
}, {
'url': 'https://sejm-embed.redcdn.pl/Sejm10.nsf/VideoFrame.xsp/FED58EABB97FBD53C1258A7400386492',
'only_matching': True,
}]
def _real_extract(self, url):
term, video_id = self._match_valid_url(url).group('term', 'id')
frame = self._download_webpage(
f'https://sejm-embed.redcdn.pl/Sejm{term}.nsf/VideoFrame.xsp/{video_id}',
video_id)
# despite it says "transmisje_arch", it works for live streams too!
data = self._download_json(
f'https://www.sejm.gov.pl/Sejm{term}.nsf/transmisje_arch.xsp/json/{video_id}',
video_id)
params = data['params']
title = strip_or_none(data.get('title'))
if data.get('status') == 'VIDEO_ENDED':
live_status = 'was_live'
elif data.get('status') == 'VIDEO_PLAYING':
live_status = 'is_live'
else:
live_status = None
self.report_warning(f'unknown status: {data.get("status")}')
start_time = rfc3339_to_atende(params['start'])
# current streams have a stop time of *expected* end of session, but actual times
# can change during the transmission. setting a stop_time would artificially
# end the stream at that time, while the session actually keeps going.
if live_status == 'was_live':
stop_time = rfc3339_to_atende(params['stop'])
duration = (stop_time - start_time) // 1000
else:
stop_time, duration = None, None
entries = []
def add_entry(file, legacy_file=False):
if not file:
return
file = self._proto_relative_url(file)
if not legacy_file:
file = update_url_query(file, {'startTime': start_time})
if stop_time is not None:
file = update_url_query(file, {'stopTime': stop_time})
stream_id = self._search_regex(r'/o2/sejm/([^/]+)/[^./]+\.livx', file, 'stream id')
common_info = {
'url': file,
'duration': duration,
}
if legacy_file:
entries.append({
**common_info,
'id': video_id,
'title': title,
})
else:
entries.append({
**common_info,
'_type': 'url_transparent',
'ie_key': RedCDNLivxIE.ie_key(),
'id': stream_id,
'title': join_nonempty(title, stream_id, delim=' - '),
})
cameras = self._search_json(
r'var\s+cameras\s*=', frame, 'camera list', video_id,
contains_pattern=r'\[(?s:.+)\]', transform_source=js_to_json,
fatal=False) or []
for camera_file in traverse_obj(cameras, (..., 'file', {dict})):
if camera_file.get('flv'):
add_entry(camera_file['flv'])
elif camera_file.get('mp4'):
# this is only a thing in 7th term. no streams before, and starting 8th it's redcdn livx
add_entry(camera_file['mp4'], legacy_file=True)
else:
self.report_warning('Unknown camera stream type found')
if params.get('mig'):
add_entry(self._search_regex(r"var sliUrl\s*=\s*'([^']+)'", frame, 'sign language interpreter url', fatal=False))
return {
'_type': 'playlist',
'entries': entries,
'id': video_id,
'title': title,
'description': clean_html(data.get('desc')) or None,
'duration': duration,
'live_status': live_status,
'location': strip_or_none(data.get('location')),
}

View file

@ -7,8 +7,6 @@
determine_ext,
dict_get,
int_or_none,
str_or_none,
strip_or_none,
traverse_obj,
try_get,
unified_timestamp,
@ -388,15 +386,55 @@ def _real_extract(self, url):
dict_get(series, ('longDescription', 'shortDescription')))
class SVTPageIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?svt\.se/(?P<path>(?:[^/]+/)*(?P<id>[^/?&#]+))'
class SVTPageIE(SVTBaseIE):
_VALID_URL = r'https?://(?:www\.)?svt\.se/(?:[^/?#]+/)*(?P<id>[^/?&#]+)'
_TESTS = [{
'url': 'https://www.svt.se/nyheter/lokalt/skane/viktor-18-forlorade-armar-och-ben-i-sepsis-vill-ateruppta-karaten-och-bli-svetsare',
'info_dict': {
'title': 'Viktor, 18, förlorade armar och ben i sepsis vill återuppta karaten och bli svetsare',
'id': 'viktor-18-forlorade-armar-och-ben-i-sepsis-vill-ateruppta-karaten-och-bli-svetsare',
},
'playlist_count': 2,
}, {
'url': 'https://www.svt.se/nyheter/lokalt/skane/forsvarsmakten-om-trafikkaoset-pa-e22-kunde-inte-varit-dar-snabbare',
'info_dict': {
'id': 'jXvk42E',
'title': 'Försvarsmakten om trafikkaoset på E22: Kunde inte varit där snabbare',
'ext': 'mp4',
"duration": 80,
'age_limit': 0,
'timestamp': 1704370009,
'episode': 'Försvarsmakten om trafikkaoset på E22: Kunde inte varit där snabbare',
'series': 'Lokala Nyheter Skåne',
'upload_date': '20240104'
},
'params': {
'skip_download': True,
}
}, {
'url': 'https://www.svt.se/nyheter/svtforum/2023-tungt-ar-for-svensk-media',
'info_dict': {
'title': '2023 tungt år för svensk media',
'id': 'ewqAZv4',
'ext': 'mp4',
"duration": 3074,
'age_limit': 0,
'series': '',
'timestamp': 1702980479,
'upload_date': '20231219',
'episode': 'Mediestudier'
},
'params': {
'skip_download': True,
}
}, {
'url': 'https://www.svt.se/sport/ishockey/bakom-masken-lehners-kamp-mot-mental-ohalsa',
'info_dict': {
'id': '25298267',
'title': 'Bakom masken Lehners kamp mot mental ohälsa',
},
'playlist_count': 4,
'skip': 'Video is gone'
}, {
'url': 'https://www.svt.se/nyheter/utrikes/svenska-andrea-ar-en-mil-fran-branderna-i-kalifornien',
'info_dict': {
@ -404,6 +442,7 @@ class SVTPageIE(InfoExtractor):
'title': 'Svenska Andrea redo att fly sitt hem i Kalifornien',
},
'playlist_count': 2,
'skip': 'Video is gone'
}, {
# only programTitle
'url': 'http://www.svt.se/sport/ishockey/jagr-tacklar-giroux-under-intervjun',
@ -414,6 +453,7 @@ class SVTPageIE(InfoExtractor):
'duration': 27,
'age_limit': 0,
},
'skip': 'Video is gone'
}, {
'url': 'https://www.svt.se/nyheter/lokalt/vast/svt-testar-tar-nagon-upp-skrapet-1',
'only_matching': True,
@ -427,26 +467,23 @@ def suitable(cls, url):
return False if SVTIE.suitable(url) or SVTPlayIE.suitable(url) else super(SVTPageIE, cls).suitable(url)
def _real_extract(self, url):
path, display_id = self._match_valid_url(url).groups()
display_id = self._match_id(url)
article = self._download_json(
'https://api.svt.se/nss-api/page/' + path, display_id,
query={'q': 'articles'})['articles']['content'][0]
webpage = self._download_webpage(url, display_id)
title = self._og_search_title(webpage)
entries = []
urql_state = self._search_json(
r'window\.svt\.nyh\.urqlState\s*=', webpage, 'json data', display_id)
def _process_content(content):
if content.get('_type') in ('VIDEOCLIP', 'VIDEOEPISODE'):
video_id = compat_str(content['image']['svtId'])
entries.append(self.url_result(
'svt:' + video_id, SVTPlayIE.ie_key(), video_id))
data = traverse_obj(urql_state, (..., 'data', {str}, {json.loads}), get_all=False) or {}
for media in article.get('media', []):
_process_content(media)
def entries():
for video_id in set(traverse_obj(data, (
'page', (('topMedia', 'svtId'), ('body', ..., 'video', 'svtId')), {str}
))):
info = self._extract_video(
self._download_json(f'https://api.svt.se/video/{video_id}', video_id), video_id)
info['title'] = title
yield info
for obj in article.get('structuredBody', []):
_process_content(obj.get('content') or {})
return self.playlist_result(
entries, str_or_none(article.get('id')),
strip_or_none(article.get('title')))
return self.playlist_result(entries(), display_id, title)

View file

@ -1,5 +1,5 @@
from .common import InfoExtractor
from ..utils import int_or_none, traverse_obj
from ..utils import ExtractorError, int_or_none, traverse_obj
class SwearnetEpisodeIE(InfoExtractor):
@ -51,7 +51,13 @@ def _real_extract(self, url):
display_id, season_number, episode_number = self._match_valid_url(url).group('id', 'season_num', 'episode_num')
webpage = self._download_webpage(url, display_id)
external_id = self._search_regex(r'externalid\s*=\s*"([^"]+)', webpage, 'externalid')
try:
external_id = self._search_regex(r'externalid\s*=\s*"([^"]+)', webpage, 'externalid')
except ExtractorError:
if 'Upgrade Now' in webpage:
self.raise_login_required()
raise
json_data = self._download_json(
f'https://play.vidyard.com/player/{external_id}.json', display_id)['payload']['chapters'][0]

View file

@ -6,7 +6,7 @@
import time
from .common import InfoExtractor
from ..compat import compat_urllib_parse_unquote, compat_urllib_parse_urlparse
from ..compat import compat_urllib_parse_urlparse
from ..networking import HEADRequest
from ..utils import (
ExtractorError,
@ -15,7 +15,6 @@
UserNotLive,
determine_ext,
format_field,
get_first,
int_or_none,
join_nonempty,
merge_dicts,
@ -219,8 +218,8 @@ def audio_meta(url):
def extract_addr(addr, add_meta={}):
parsed_meta, res = parse_url_key(addr.get('url_key', ''))
if res:
known_resolutions.setdefault(res, {}).setdefault('height', add_meta.get('height') or addr.get('height'))
known_resolutions[res].setdefault('width', add_meta.get('width') or addr.get('width'))
known_resolutions.setdefault(res, {}).setdefault('height', int_or_none(addr.get('height')))
known_resolutions[res].setdefault('width', int_or_none(addr.get('width')))
parsed_meta.update(known_resolutions.get(res, {}))
add_meta.setdefault('height', int_or_none(res[:-1]))
return [{
@ -237,22 +236,26 @@ def extract_addr(addr, add_meta={}):
# Hack: Add direct video links first to prioritize them when removing duplicate formats
formats = []
width = int_or_none(video_info.get('width'))
height = int_or_none(video_info.get('height'))
if video_info.get('play_addr'):
formats.extend(extract_addr(video_info['play_addr'], {
'format_id': 'play_addr',
'format_note': 'Direct video',
'vcodec': 'h265' if traverse_obj(
video_info, 'is_bytevc1', 'is_h265') else 'h264', # TODO: Check for "direct iOS" videos, like https://www.tiktok.com/@cookierun_dev/video/7039716639834656002
'width': video_info.get('width'),
'height': video_info.get('height'),
'width': width,
'height': height,
}))
if video_info.get('download_addr'):
formats.extend(extract_addr(video_info['download_addr'], {
download_addr = video_info['download_addr']
dl_width = int_or_none(download_addr.get('width'))
formats.extend(extract_addr(download_addr, {
'format_id': 'download_addr',
'format_note': 'Download video%s' % (', watermarked' if video_info.get('has_watermark') else ''),
'vcodec': 'h264',
'width': video_info.get('width'),
'height': video_info.get('height'),
'width': dl_width or width,
'height': try_call(lambda: int(dl_width / 0.5625)) or height, # download_addr['height'] is wrong
'preference': -2 if video_info.get('has_watermark') else -1,
}))
if video_info.get('play_addr_h264'):
@ -315,9 +318,6 @@ def extract_addr(addr, add_meta={}):
return {
'id': aweme_id,
'extractor_key': TikTokIE.ie_key(),
'extractor': TikTokIE.IE_NAME,
'webpage_url': self._create_url(author_info.get('uid'), aweme_id),
**traverse_obj(aweme_detail, {
'title': ('desc', {str}),
'description': ('desc', {str}),
@ -921,20 +921,23 @@ class DouyinIE(TikTokBaseIE):
_VALID_URL = r'https?://(?:www\.)?douyin\.com/video/(?P<id>[0-9]+)'
_TESTS = [{
'url': 'https://www.douyin.com/video/6961737553342991651',
'md5': 'a97db7e3e67eb57bf40735c022ffa228',
'md5': '9ecce7bc5b302601018ecb2871c63a75',
'info_dict': {
'id': '6961737553342991651',
'ext': 'mp4',
'title': '#杨超越 小小水手带你去远航❤️',
'description': '#杨超越 小小水手带你去远航❤️',
'uploader': '6897520xka',
'uploader_id': '110403406559',
'uploader_url': 'https://www.douyin.com/user/MS4wLjABAAAAEKnfa654JAJ_N5lgZDQluwsxmY0lhfmEYNQBBkwGG98',
'channel_id': 'MS4wLjABAAAAEKnfa654JAJ_N5lgZDQluwsxmY0lhfmEYNQBBkwGG98',
'creator': '杨超越',
'duration': 19782,
'creators': ['杨超越'],
'duration': 19,
'timestamp': 1620905839,
'upload_date': '20210513',
'track': '@杨超越创作的原声',
'artists': ['杨超越'],
'view_count': int,
'like_count': int,
'repost_count': int,
@ -943,20 +946,23 @@ class DouyinIE(TikTokBaseIE):
},
}, {
'url': 'https://www.douyin.com/video/6982497745948921092',
'md5': '34a87ebff3833357733da3fe17e37c0e',
'md5': '15c5e660b7048af3707304e3cc02bbb5',
'info_dict': {
'id': '6982497745948921092',
'ext': 'mp4',
'title': '这个夏日和小羊@杨超越 一起遇见白色幻想',
'description': '这个夏日和小羊@杨超越 一起遇见白色幻想',
'uploader': '0731chaoyue',
'uploader_id': '408654318141572',
'uploader_url': 'https://www.douyin.com/user/MS4wLjABAAAAZJpnglcjW2f_CMVcnqA_6oVBXKWMpH0F8LIHuUu8-lA',
'channel_id': 'MS4wLjABAAAAZJpnglcjW2f_CMVcnqA_6oVBXKWMpH0F8LIHuUu8-lA',
'creator': '杨超越工作室',
'duration': 42479,
'creators': ['杨超越工作室'],
'duration': 42,
'timestamp': 1625739481,
'upload_date': '20210708',
'track': '@杨超越工作室创作的原声',
'artists': ['杨超越工作室'],
'view_count': int,
'like_count': int,
'repost_count': int,
@ -965,20 +971,23 @@ class DouyinIE(TikTokBaseIE):
},
}, {
'url': 'https://www.douyin.com/video/6953975910773099811',
'md5': 'dde3302460f19db59c47060ff013b902',
'md5': '0e6443758b8355db9a3c34864a4276be',
'info_dict': {
'id': '6953975910773099811',
'ext': 'mp4',
'title': '#一起看海 出现在你的夏日里',
'description': '#一起看海 出现在你的夏日里',
'uploader': '6897520xka',
'uploader_id': '110403406559',
'uploader_url': 'https://www.douyin.com/user/MS4wLjABAAAAEKnfa654JAJ_N5lgZDQluwsxmY0lhfmEYNQBBkwGG98',
'channel_id': 'MS4wLjABAAAAEKnfa654JAJ_N5lgZDQluwsxmY0lhfmEYNQBBkwGG98',
'creator': '杨超越',
'duration': 17343,
'creators': ['杨超越'],
'duration': 17,
'timestamp': 1619098692,
'upload_date': '20210422',
'track': '@杨超越创作的原声',
'artists': ['杨超越'],
'view_count': int,
'like_count': int,
'repost_count': int,
@ -1004,20 +1013,23 @@ class DouyinIE(TikTokBaseIE):
'skip': 'No longer available',
}, {
'url': 'https://www.douyin.com/video/6963263655114722595',
'md5': 'cf9f11f0ec45d131445ec2f06766e122',
'md5': '1440bcf59d8700f8e014da073a4dfea8',
'info_dict': {
'id': '6963263655114722595',
'ext': 'mp4',
'title': '#哪个爱豆的105度最甜 换个角度看看我哈哈',
'description': '#哪个爱豆的105度最甜 换个角度看看我哈哈',
'uploader': '6897520xka',
'uploader_id': '110403406559',
'uploader_url': 'https://www.douyin.com/user/MS4wLjABAAAAEKnfa654JAJ_N5lgZDQluwsxmY0lhfmEYNQBBkwGG98',
'channel_id': 'MS4wLjABAAAAEKnfa654JAJ_N5lgZDQluwsxmY0lhfmEYNQBBkwGG98',
'creator': '杨超越',
'duration': 15115,
'creators': ['杨超越'],
'duration': 15,
'timestamp': 1621261163,
'upload_date': '20210517',
'track': '@杨超越创作的原声',
'artists': ['杨超越'],
'view_count': int,
'like_count': int,
'repost_count': int,
@ -1025,34 +1037,23 @@ class DouyinIE(TikTokBaseIE):
'thumbnail': r're:https?://.+\.jpe?g',
},
}]
_APP_VERSIONS = [('23.3.0', '230300')]
_APP_NAME = 'aweme'
_AID = 1128
_API_HOSTNAME = 'aweme.snssdk.com'
_UPLOADER_URL_FORMAT = 'https://www.douyin.com/user/%s'
_WEBPAGE_HOST = 'https://www.douyin.com/'
def _real_extract(self, url):
video_id = self._match_id(url)
try:
return self._extract_aweme_app(video_id)
except ExtractorError as e:
e.expected = True
self.to_screen(f'{e}; trying with webpage')
webpage = self._download_webpage(url, video_id)
render_data = self._search_json(
r'<script [^>]*\bid=[\'"]RENDER_DATA[\'"][^>]*>', webpage, 'render data', video_id,
contains_pattern=r'%7B(?s:.+)%7D', fatal=False, transform_source=compat_urllib_parse_unquote)
if not render_data:
detail = traverse_obj(self._download_json(
'https://www.douyin.com/aweme/v1/web/aweme/detail/', video_id,
'Downloading web detail JSON', 'Failed to download web detail JSON',
query={'aweme_id': video_id}, fatal=False), ('aweme_detail', {dict}))
if not detail:
# TODO: Run verification challenge code to generate signature cookies
cookies = self._get_cookies(self._WEBPAGE_HOST)
expected = not cookies.get('s_v_web_id') or not cookies.get('ttwid')
raise ExtractorError(
'Fresh cookies (not necessarily logged in) are needed', expected=expected)
'Fresh cookies (not necessarily logged in) are needed',
expected=not self._get_cookies(self._WEBPAGE_HOST).get('s_v_web_id'))
return self._parse_aweme_video_web(get_first(render_data, ('aweme', 'detail')), url, video_id)
return self._parse_aweme_video_app(detail)
class TikTokVMIE(InfoExtractor):

View file

@ -21,7 +21,7 @@
class TVPIE(InfoExtractor):
IE_NAME = 'tvp'
IE_DESC = 'Telewizja Polska'
_VALID_URL = r'https?://(?:[^/]+\.)?(?:tvp(?:parlament)?\.(?:pl|info)|tvpworld\.com|swipeto\.pl)/(?:(?!\d+/)[^/]+/)*(?P<id>\d+)'
_VALID_URL = r'https?://(?:[^/]+\.)?(?:tvp(?:parlament)?\.(?:pl|info)|tvpworld\.com|swipeto\.pl)/(?:(?!\d+/)[^/]+/)*(?P<id>\d+)(?:[/?#]|$)'
_TESTS = [{
# TVPlayer 2 in js wrapper
@ -514,7 +514,7 @@ def _parse_video(self, video, with_url=True):
class TVPVODVideoIE(TVPVODBaseIE):
IE_NAME = 'tvp:vod'
_VALID_URL = r'https?://vod\.tvp\.pl/[a-z\d-]+,\d+/[a-z\d-]+(?<!-odcinki)(?:-odcinki,\d+/odcinek-\d+,S\d+E\d+)?,(?P<id>\d+)(?:\?[^#]+)?(?:#.+)?$'
_VALID_URL = r'https?://vod\.tvp\.pl/(?P<category>[a-z\d-]+,\d+)/[a-z\d-]+(?<!-odcinki)(?:-odcinki,\d+/odcinek-\d+,S\d+E\d+)?,(?P<id>\d+)/?(?:[?#]|$)'
_TESTS = [{
'url': 'https://vod.tvp.pl/dla-dzieci,24/laboratorium-alchemika-odcinki,309338/odcinek-24,S01E24,311357',
@ -560,12 +560,23 @@ class TVPVODVideoIE(TVPVODBaseIE):
'thumbnail': 're:https?://.+',
},
'params': {'skip_download': 'm3u8'},
}, {
'url': 'https://vod.tvp.pl/live,1/tvp-world,399731',
'info_dict': {
'id': '399731',
'ext': 'mp4',
'title': r're:TVP WORLD \d{4}-\d{2}-\d{2} \d{2}:\d{2}',
'live_status': 'is_live',
'thumbnail': 're:https?://.+',
},
}]
def _real_extract(self, url):
video_id = self._match_id(url)
category, video_id = self._match_valid_url(url).group('category', 'id')
info_dict = self._parse_video(self._call_api(f'vods/{video_id}', video_id), with_url=False)
is_live = category == 'live,1'
entity = 'lives' if is_live else 'vods'
info_dict = self._parse_video(self._call_api(f'{entity}/{video_id}', video_id), with_url=False)
playlist = self._call_api(f'{video_id}/videos/playlist', video_id, query={'videoType': 'MOVIE'})
@ -582,6 +593,8 @@ def _real_extract(self, url):
'ext': 'ttml',
})
info_dict['is_live'] = is_live
return info_dict

View file

@ -100,9 +100,13 @@ def _extract_variant_formats(self, variant, video_id):
if not variant_url:
return [], {}
elif '.m3u8' in variant_url:
return self._extract_m3u8_formats_and_subtitles(
fmts, subs = self._extract_m3u8_formats_and_subtitles(
variant_url, video_id, 'mp4', 'm3u8_native',
m3u8_id='hls', fatal=False)
for f in traverse_obj(fmts, lambda _, v: v['vcodec'] == 'none' and v.get('tbr') is None):
if mobj := re.match(r'hls-[Aa]udio-(?P<bitrate>\d{4,})', f['format_id']):
f['tbr'] = int_or_none(mobj.group('bitrate'), 1000)
return fmts, subs
else:
tbr = int_or_none(dict_get(variant, ('bitrate', 'bit_rate')), 1000) or None
f = {
@ -471,6 +475,7 @@ class TwitterIE(TwitterBaseIE):
'title': 'FREE THE NIPPLE - FTN supporters on Hollywood Blvd today!',
'thumbnail': r're:^https?://.*\.jpg',
'description': 'FTN supporters on Hollywood Blvd today! http://t.co/c7jHH749xJ',
'channel_id': '549749560',
'uploader': 'FREE THE NIPPLE',
'uploader_id': 'freethenipple',
'duration': 12.922,
@ -484,6 +489,7 @@ class TwitterIE(TwitterBaseIE):
'age_limit': 18,
'_old_archive_ids': ['twitter 643211948184596480'],
},
'skip': 'Requires authentication',
}, {
'url': 'https://twitter.com/giphz/status/657991469417025536/photo/1',
'md5': 'f36dcd5fb92bf7057f155e7d927eeb42',
@ -506,6 +512,7 @@ class TwitterIE(TwitterBaseIE):
'ext': 'mp4',
'title': r're:Star Wars.*A new beginning is coming December 18.*',
'description': 'A new beginning is coming December 18. Watch the official 60 second #TV spot for #StarWars: #TheForceAwakens. https://t.co/OkSqT2fjWJ',
'channel_id': '20106852',
'uploader_id': 'starwars',
'uploader': r're:Star Wars.*',
'timestamp': 1447395772,
@ -551,6 +558,7 @@ class TwitterIE(TwitterBaseIE):
'title': 'jaydin donte geer - BEAT PROD: @suhmeduh #Damndaniel',
'description': 'BEAT PROD: @suhmeduh https://t.co/HBrQ4AfpvZ #Damndaniel https://t.co/byBooq2ejZ',
'thumbnail': r're:^https?://.*\.jpg',
'channel_id': '1383165541',
'uploader': 'jaydin donte geer',
'uploader_id': 'jaydingeer',
'duration': 30.0,
@ -591,6 +599,7 @@ class TwitterIE(TwitterBaseIE):
'ext': 'mp4',
'title': 'Captain America - @King0fNerd Are you sure you made the right choice? Find out in theaters.',
'description': '@King0fNerd Are you sure you made the right choice? Find out in theaters. https://t.co/GpgYi9xMJI',
'channel_id': '701615052',
'uploader_id': 'CaptainAmerica',
'uploader': 'Captain America',
'duration': 3.17,
@ -627,6 +636,7 @@ class TwitterIE(TwitterBaseIE):
'ext': 'mp4',
'title': 'عالم الأخبار - كلمة تاريخية بجلسة الجناسي التاريخية.. النائب خالد مؤنس العتيبي للمعارضين : اتقوا الله .. الظلم ظلمات يوم القيامة',
'description': 'كلمة تاريخية بجلسة الجناسي التاريخية.. النائب خالد مؤنس العتيبي للمعارضين : اتقوا الله .. الظلم ظلمات يوم القيامة https://t.co/xg6OhpyKfN',
'channel_id': '2526757026',
'uploader': 'عالم الأخبار',
'uploader_id': 'news_al3alm',
'duration': 277.4,
@ -651,6 +661,7 @@ class TwitterIE(TwitterBaseIE):
'title': 'Préfet de Guadeloupe - [Direct] #Maria Le centre se trouve actuellement au sud de Basse-Terre. Restez confinés. Réfugiez-vous dans la pièce la + sûre.',
'thumbnail': r're:^https?://.*\.jpg',
'description': '[Direct] #Maria Le centre se trouve actuellement au sud de Basse-Terre. Restez confinés. Réfugiez-vous dans la pièce la + sûre. https://t.co/mwx01Rs4lo',
'channel_id': '2319432498',
'uploader': 'Préfet de Guadeloupe',
'uploader_id': 'Prefet971',
'duration': 47.48,
@ -677,6 +688,7 @@ class TwitterIE(TwitterBaseIE):
'title': 're:.*?Shep is on a roll today.*?',
'thumbnail': r're:^https?://.*\.jpg',
'description': 'md5:37b9f2ff31720cef23b2bd42ee8a0f09',
'channel_id': '255036353',
'uploader': 'Lis Power',
'uploader_id': 'LisPower1',
'duration': 111.278,
@ -741,6 +753,7 @@ class TwitterIE(TwitterBaseIE):
'title': 'md5:d1c4941658e4caaa6cb579260d85dcba',
'thumbnail': r're:^https?://.*\.jpg',
'description': 'md5:71ead15ec44cee55071547d6447c6a3e',
'channel_id': '18552281',
'uploader': 'Brooklyn Nets',
'uploader_id': 'BrooklynNets',
'duration': 324.484,
@ -763,10 +776,11 @@ class TwitterIE(TwitterBaseIE):
'id': '1577855447914409984',
'display_id': '1577855540407197696',
'ext': 'mp4',
'title': 'md5:9d198efb93557b8f8d5b78c480407214',
'title': 'md5:466a3a8b049b5f5a13164ce915484b51',
'description': 'md5:b9c3699335447391d11753ab21c70a74',
'upload_date': '20221006',
'uploader': 'oshtru',
'channel_id': '143077138',
'uploader': 'Oshtru',
'uploader_id': 'oshtru',
'uploader_url': 'https://twitter.com/oshtru',
'thumbnail': r're:^https?://.*\.jpg',
@ -784,9 +798,10 @@ class TwitterIE(TwitterBaseIE):
'url': 'https://twitter.com/UltimaShadowX/status/1577719286659006464',
'info_dict': {
'id': '1577719286659006464',
'title': 'Ultima - Test',
'title': 'Ultima Reload - Test',
'description': 'Test https://t.co/Y3KEZD7Dad',
'uploader': 'Ultima',
'channel_id': '168922496',
'uploader': 'Ultima Reload',
'uploader_id': 'UltimaShadowX',
'uploader_url': 'https://twitter.com/UltimaShadowX',
'upload_date': '20221005',
@ -808,6 +823,7 @@ class TwitterIE(TwitterBaseIE):
'title': 'md5:eec26382babd0f7c18f041db8ae1c9c9',
'thumbnail': r're:^https?://.*\.jpg',
'description': 'md5:95aea692fda36a12081b9629b02daa92',
'channel_id': '1094109584',
'uploader': 'Max Olson',
'uploader_id': 'MesoMax919',
'uploader_url': 'https://twitter.com/MesoMax919',
@ -830,6 +846,7 @@ class TwitterIE(TwitterBaseIE):
'ext': 'mp4',
'title': str,
'description': str,
'channel_id': '1217167793541480450',
'uploader': str,
'uploader_id': 'Rizdraws',
'uploader_url': 'https://twitter.com/Rizdraws',
@ -840,7 +857,8 @@ class TwitterIE(TwitterBaseIE):
'repost_count': int,
'comment_count': int,
'age_limit': 18,
'tags': []
'tags': [],
'_old_archive_ids': ['twitter 1575199173472927762'],
},
'params': {'skip_download': 'The media could not be played'},
'skip': 'Requires authentication',
@ -852,6 +870,7 @@ class TwitterIE(TwitterBaseIE):
'id': '1395079556562706435',
'title': str,
'tags': [],
'channel_id': '21539378',
'uploader': str,
'like_count': int,
'upload_date': '20210519',
@ -869,6 +888,7 @@ class TwitterIE(TwitterBaseIE):
'info_dict': {
'id': '1578353380363501568',
'title': str,
'channel_id': '2195866214',
'uploader_id': 'DavidToons_',
'repost_count': int,
'like_count': int,
@ -888,6 +908,7 @@ class TwitterIE(TwitterBaseIE):
'id': '1578401165338976258',
'title': str,
'description': 'md5:659a6b517a034b4cee5d795381a2dc41',
'channel_id': '19338359',
'uploader': str,
'uploader_id': 'primevideouk',
'timestamp': 1665155137,
@ -929,6 +950,7 @@ class TwitterIE(TwitterBaseIE):
'description': 'md5:591c19ce66fadc2359725d5cd0d1052c',
'comment_count': int,
'uploader_id': 'CTVJLaidlaw',
'channel_id': '80082014',
'repost_count': int,
'tags': ['colorectalcancer', 'cancerjourney', 'imnotaquitter'],
'upload_date': '20221208',
@ -946,6 +968,7 @@ class TwitterIE(TwitterBaseIE):
'title': 'md5:7662a0a27ce6faa3e5b160340f3cfab1',
'thumbnail': r're:^https?://.+\.jpg',
'timestamp': 1670459604.0,
'channel_id': '80082014',
'uploader_id': 'CTVJLaidlaw',
'uploader': 'Jocelyn Laidlaw',
'repost_count': int,
@ -972,6 +995,7 @@ class TwitterIE(TwitterBaseIE):
'title': '뽀 - 아 최우제 이동속도 봐',
'description': '아 최우제 이동속도 봐 https://t.co/dxu2U5vXXB',
'duration': 24.598,
'channel_id': '1281839411068432384',
'uploader': '',
'uploader_id': 's2FAKER',
'uploader_url': 'https://twitter.com/s2FAKER',
@ -985,6 +1009,7 @@ class TwitterIE(TwitterBaseIE):
'comment_count': int,
'_old_archive_ids': ['twitter 1621117700482416640'],
},
'skip': 'Requires authentication',
}, {
'url': 'https://twitter.com/hlo_again/status/1599108751385972737/video/2',
'info_dict': {
@ -992,6 +1017,7 @@ class TwitterIE(TwitterBaseIE):
'display_id': '1599108751385972737',
'ext': 'mp4',
'title': '\u06ea - \U0001F48B',
'channel_id': '1347791436809441283',
'uploader_url': 'https://twitter.com/hlo_again',
'like_count': int,
'uploader_id': 'hlo_again',
@ -1014,6 +1040,7 @@ class TwitterIE(TwitterBaseIE):
'id': '1600009362759733248',
'display_id': '1600009574919962625',
'ext': 'mp4',
'channel_id': '211814412',
'uploader_url': 'https://twitter.com/MunTheShinobi',
'description': 'This is a genius ad by Apple. \U0001f525\U0001f525\U0001f525\U0001f525\U0001f525 https://t.co/cNsA0MoOml',
'thumbnail': 'https://pbs.twimg.com/ext_tw_video_thumb/1600009362759733248/pu/img/XVhFQivj75H_YxxV.jpg?name=orig',
@ -1061,6 +1088,7 @@ class TwitterIE(TwitterBaseIE):
'display_id': '1695424220702888009',
'title': 'md5:e8daa9527bc2b947121395494f786d9d',
'description': 'md5:004f2d37fd58737724ec75bc7e679938',
'channel_id': '15212187',
'uploader': 'Benny Johnson',
'uploader_id': 'bennyjohnson',
'uploader_url': 'https://twitter.com/bennyjohnson',
@ -1084,6 +1112,7 @@ class TwitterIE(TwitterBaseIE):
'display_id': '1695424220702888009',
'title': 'md5:e8daa9527bc2b947121395494f786d9d',
'description': 'md5:004f2d37fd58737724ec75bc7e679938',
'channel_id': '15212187',
'uploader': 'Benny Johnson',
'uploader_id': 'bennyjohnson',
'uploader_url': 'https://twitter.com/bennyjohnson',
@ -1117,7 +1146,7 @@ class TwitterIE(TwitterBaseIE):
},
'add_ie': ['TwitterBroadcast'],
}, {
# Animated gif and quote tweet video, with syndication API
# Animated gif and quote tweet video
'url': 'https://twitter.com/BAKKOOONN/status/1696256659889565950',
'playlist_mincount': 2,
'info_dict': {
@ -1125,6 +1154,7 @@ class TwitterIE(TwitterBaseIE):
'title': 'BAKOON - https://t.co/zom968d0a0',
'description': 'https://t.co/zom968d0a0',
'tags': [],
'channel_id': '1263540390',
'uploader': 'BAKOON',
'uploader_id': 'BAKKOOONN',
'uploader_url': 'https://twitter.com/BAKKOOONN',
@ -1132,19 +1162,21 @@ class TwitterIE(TwitterBaseIE):
'timestamp': 1693254077.0,
'upload_date': '20230828',
'like_count': int,
'comment_count': int,
'repost_count': int,
},
'params': {'extractor_args': {'twitter': {'api': ['syndication']}}},
'expected_warnings': ['Not all metadata'],
'skip': 'Requires authentication',
}, {
# "stale tweet" with typename "TweetWithVisibilityResults"
'url': 'https://twitter.com/RobertKennedyJr/status/1724884212803834154',
'md5': '62b1e11cdc2cdd0e527f83adb081f536',
'md5': '511377ff8dfa7545307084dca4dce319',
'info_dict': {
'id': '1724883339285544960',
'ext': 'mp4',
'title': 'md5:cc56716f9ed0b368de2ba54c478e493c',
'description': 'md5:9dc14f5b0f1311fc7caf591ae253a164',
'display_id': '1724884212803834154',
'channel_id': '337808606',
'uploader': 'Robert F. Kennedy Jr',
'uploader_id': 'RobertKennedyJr',
'uploader_url': 'https://twitter.com/RobertKennedyJr',
@ -1386,6 +1418,7 @@ def _real_extract(self, url):
'description': description,
'uploader': uploader,
'timestamp': unified_timestamp(status.get('created_at')),
'channel_id': str_or_none(status.get('user_id_str')) or str_or_none(user.get('id_str')),
'uploader_id': uploader_id,
'uploader_url': format_field(uploader_id, None, 'https://twitter.com/%s'),
'like_count': int_or_none(status.get('favorite_count')),

View file

@ -10,6 +10,7 @@
parse_duration,
traverse_obj,
try_call,
url_or_none,
urljoin,
variadic,
)
@ -83,6 +84,7 @@ class TxxxIE(InfoExtractor):
'like_count': int,
'dislike_count': int,
'age_limit': 18,
'thumbnail': 'https://tn.txxx.tube/contents/videos_sources/16574000/16574965/screenshots/1.jpg',
}
}, {
'url': 'https://txxx.tube/videos/16574965/digital-desire-malena-morgan/',
@ -98,6 +100,7 @@ class TxxxIE(InfoExtractor):
'like_count': int,
'dislike_count': int,
'age_limit': 18,
'thumbnail': 'https://tn.txxx.tube/contents/videos_sources/16574000/16574965/screenshots/1.jpg',
}
}, {
'url': 'https://vxxx.com/video-68925/',
@ -113,6 +116,7 @@ class TxxxIE(InfoExtractor):
'like_count': int,
'dislike_count': int,
'age_limit': 18,
'thumbnail': 'https://tn.vxxx.com/contents/videos_sources/68000/68925/screenshots/1.jpg',
}
}, {
'url': 'https://hclips.com/videos/6291073/malena-morgan-masturbates-her-sweet/',
@ -128,6 +132,7 @@ class TxxxIE(InfoExtractor):
'like_count': int,
'dislike_count': int,
'age_limit': 18,
'thumbnail': 'https://hctn.nv7s.com/contents/videos_sources/6291000/6291073/screenshots/1.jpg',
}
}, {
'url': 'https://hdzog.com/videos/67063/gorgeous-malena-morgan-will-seduce-you-at-the-first-glance/',
@ -143,6 +148,7 @@ class TxxxIE(InfoExtractor):
'like_count': int,
'dislike_count': int,
'age_limit': 18,
'thumbnail': 'https://tn.hdzog.com/contents/videos_sources/67000/67063/screenshots/1.jpg',
}
}, {
'url': 'https://hdzog.tube/videos/67063/gorgeous-malena-morgan-will-seduce-you-at-the-first-glance/',
@ -158,6 +164,7 @@ class TxxxIE(InfoExtractor):
'like_count': int,
'dislike_count': int,
'age_limit': 18,
'thumbnail': 'https://tn.hdzog.com/contents/videos_sources/67000/67063/screenshots/1.jpg',
}
}, {
'url': 'https://hotmovs.com/videos/8789287/unbelievable-malena-morgan-performing-in-incredible-masturantion/',
@ -173,6 +180,7 @@ class TxxxIE(InfoExtractor):
'like_count': int,
'dislike_count': int,
'age_limit': 18,
'thumbnail': 'https://tn.hotmovs.com/contents/videos_sources/8789000/8789287/screenshots/10.jpg',
}
}, {
'url': 'https://hotmovs.tube/videos/8789287/unbelievable-malena-morgan-performing-in-incredible-masturantion/',
@ -188,6 +196,7 @@ class TxxxIE(InfoExtractor):
'like_count': int,
'dislike_count': int,
'age_limit': 18,
'thumbnail': 'https://tn.hotmovs.com/contents/videos_sources/8789000/8789287/screenshots/10.jpg',
}
}, {
'url': 'https://inporn.com/video/517897/malena-morgan-solo/',
@ -203,6 +212,7 @@ class TxxxIE(InfoExtractor):
'like_count': int,
'dislike_count': int,
'age_limit': 18,
'thumbnail': 'https://iptn.m3pd.com/media/tn/sources/517897_1.jpg',
}
}, {
'url': 'https://privatehomeclips.com/videos/3630599/malena-morgan-cam-show/',
@ -218,6 +228,7 @@ class TxxxIE(InfoExtractor):
'like_count': int,
'dislike_count': int,
'age_limit': 18,
'thumbnail': 'https://hctn.nv7s.com/contents/videos_sources/3630000/3630599/screenshots/15.jpg',
}
}, {
'url': 'https://tubepornclassic.com/videos/1015455/mimi-rogers-full-body-massage-nude-compilation/',
@ -233,6 +244,7 @@ class TxxxIE(InfoExtractor):
'like_count': int,
'dislike_count': int,
'age_limit': 18,
'thumbnail': 'https://tn.tubepornclassic.com/contents/videos_sources/1015000/1015455/screenshots/6.jpg',
}
}, {
'url': 'https://upornia.com/videos/1498858/twistys-malena-morgan-starring-at-dr-morgan-baller/',
@ -248,6 +260,7 @@ class TxxxIE(InfoExtractor):
'like_count': int,
'dislike_count': int,
'age_limit': 18,
'thumbnail': 'https://tn.upornia.com/contents/videos_sources/1498000/1498858/screenshots/1.jpg',
}
}, {
'url': 'https://upornia.tube/videos/1498858/twistys-malena-morgan-starring-at-dr-morgan-baller/',
@ -263,6 +276,7 @@ class TxxxIE(InfoExtractor):
'like_count': int,
'dislike_count': int,
'age_limit': 18,
'thumbnail': 'https://tn.upornia.com/contents/videos_sources/1498000/1498858/screenshots/1.jpg',
}
}, {
'url': 'https://vjav.com/videos/11761/yui-hatano-in-if-yui-was-my-girlfriend2/',
@ -278,6 +292,7 @@ class TxxxIE(InfoExtractor):
'like_count': int,
'dislike_count': int,
'age_limit': 18,
'thumbnail': 'https://tn.vjav.com/contents/videos_sources/11000/11761/screenshots/23.jpg',
}
}, {
'url': 'https://vjav.tube/videos/11761/yui-hatano-in-if-yui-was-my-girlfriend2/',
@ -293,6 +308,7 @@ class TxxxIE(InfoExtractor):
'like_count': int,
'dislike_count': int,
'age_limit': 18,
'thumbnail': 'https://tn.vjav.com/contents/videos_sources/11000/11761/screenshots/23.jpg',
}
}, {
'url': 'https://voyeurhit.com/videos/332875/charlotte-stokely-elle-alexandra-malena-morgan-lingerie/',
@ -308,6 +324,7 @@ class TxxxIE(InfoExtractor):
'like_count': int,
'dislike_count': int,
'age_limit': 18,
'thumbnail': 'https://tn.voyeurhit.com/contents/videos_sources/332000/332875/screenshots/1.jpg',
}
}, {
'url': 'https://voyeurhit.tube/videos/332875/charlotte-stokely-elle-alexandra-malena-morgan-lingerie/',
@ -323,6 +340,7 @@ class TxxxIE(InfoExtractor):
'like_count': int,
'dislike_count': int,
'age_limit': 18,
'thumbnail': 'https://tn.voyeurhit.com/contents/videos_sources/332000/332875/screenshots/1.jpg',
}
}]
_WEBPAGE_TESTS = [{
@ -338,6 +356,7 @@ class TxxxIE(InfoExtractor):
'like_count': int,
'dislike_count': int,
'age_limit': 18,
'thumbnail': 'https://hctn.nv7s.com/contents/videos_sources/5119000/5119660/screenshots/1.jpg',
}
}]
@ -371,6 +390,7 @@ def _real_extract(self, url):
'like_count': int_or_none(traverse_obj(video_info, ('video', 'statistics', 'likes'))),
'dislike_count': int_or_none(traverse_obj(video_info, ('video', 'statistics', 'dislikes'))),
'age_limit': 18,
'thumbnail': traverse_obj(video_info, ('video', 'thumbsrc', {url_or_none})),
'formats': get_formats(host, video_file),
}

View file

@ -10,7 +10,8 @@
class UtreonIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?utreon\.com/v/(?P<id>[\w-]+)'
IE_NAME = 'playeur'
_VALID_URL = r'https?://(?:www\.)?(?:utreon|playeur)\.com/v/(?P<id>[\w-]+)'
_TESTS = [{
'url': 'https://utreon.com/v/z_I7ikQbuDw',
'info_dict': {
@ -19,8 +20,9 @@ class UtreonIE(InfoExtractor):
'title': 'Freedom Friday meditation - Rising in the wind',
'description': 'md5:a9bf15a42434a062fe313b938343ad1b',
'uploader': 'Heather Dawn Elemental Health',
'thumbnail': 'https://data-1.utreon.com/v/MG/M2/NT/z_I7ikQbuDw/z_I7ikQbuDw_preview.jpg',
'thumbnail': r're:^https?://.+\.jpg',
'release_date': '20210723',
'duration': 586,
}
}, {
'url': 'https://utreon.com/v/jerJw5EOOVU',
@ -28,10 +30,11 @@ class UtreonIE(InfoExtractor):
'id': 'jerJw5EOOVU',
'ext': 'mp4',
'title': 'When I\'m alone, I love to reflect in peace, to make my dreams come true... [Quotes and Poems]',
'description': 'md5:61ee6c2da98be51b04b969ca80273aaa',
'description': 'md5:4026aa3a2c10169c3649926ac8ef62b6',
'uploader': 'Frases e Poemas Quotes and Poems',
'thumbnail': 'https://data-1.utreon.com/v/Mz/Zh/ND/jerJw5EOOVU/jerJw5EOOVU_89af85470a4b16eededde7f8674c96d9_cover.jpg',
'thumbnail': r're:^https?://.+\.jpg',
'release_date': '20210723',
'duration': 60,
}
}, {
'url': 'https://utreon.com/v/C4ZxXhYBBmE',
@ -39,10 +42,11 @@ class UtreonIE(InfoExtractor):
'id': 'C4ZxXhYBBmE',
'ext': 'mp4',
'title': 'Bidens Capital Gains Tax Rate to Test Worlds Highest',
'description': 'md5:fb5a6c2e506f013cc76f133f673bc5c8',
'description': 'md5:995aa9ad0733c0e5863ebdeff954f40e',
'uploader': 'Nomad Capitalist',
'thumbnail': 'https://data-1.utreon.com/v/ZD/k1/Mj/C4ZxXhYBBmE/C4ZxXhYBBmE_628342076198c9c06dd6b2c665978584_cover.jpg',
'thumbnail': r're:^https?://.+\.jpg',
'release_date': '20210723',
'duration': 884,
}
}, {
'url': 'https://utreon.com/v/Y-stEH-FBm8',
@ -52,15 +56,28 @@ class UtreonIE(InfoExtractor):
'title': 'Creeper-Chan Pranks Steve! 💚 [MINECRAFT ANIME]',
'description': 'md5:7a48450b0d761b96dec194be0c5ecb5f',
'uploader': 'Merryweather Comics',
'thumbnail': 'https://data-1.utreon.com/v/MT/E4/Zj/Y-stEH-FBm8/Y-stEH-FBm8_5290676a41a4a1096db133b09f54f77b_cover.jpg',
'thumbnail': r're:^https?://.+\.jpg',
'release_date': '20210718',
}},
]
'duration': 151,
}
}, {
'url': 'https://playeur.com/v/Wzqp-UrxSeu',
'info_dict': {
'id': 'Wzqp-UrxSeu',
'ext': 'mp4',
'title': 'Update: Clockwork Basilisk Books on the Way!',
'description': 'md5:d9756b0b1884c904655b0e170d17cea5',
'uploader': 'Forgotten Weapons',
'release_date': '20240208',
'thumbnail': r're:^https?://.+\.jpg',
'duration': 262,
}
}]
def _real_extract(self, url):
video_id = self._match_id(url)
json_data = self._download_json(
'https://api.utreon.com/v1/videos/' + video_id,
'https://api.playeur.com/v1/videos/' + video_id,
video_id)
videos_json = json_data['videos']
formats = [{

View file

@ -1,5 +1,6 @@
from .common import InfoExtractor
from ..utils import ExtractorError
from ..utils import ExtractorError, base_url, int_or_none, url_basename
from ..utils.traversal import traverse_obj
class Vbox7IE(InfoExtractor):
@ -19,7 +20,7 @@ class Vbox7IE(InfoExtractor):
_GEO_COUNTRIES = ['BG']
_TESTS = [{
'url': 'http://vbox7.com/play:0946fff23c',
'md5': 'a60f9ab3a3a2f013ef9a967d5f7be5bf',
'md5': '50ca1f78345a9c15391af47d8062d074',
'info_dict': {
'id': '0946fff23c',
'ext': 'mp4',
@ -29,19 +30,25 @@ class Vbox7IE(InfoExtractor):
'timestamp': 1470982814,
'upload_date': '20160812',
'uploader': 'zdraveibulgaria',
},
'params': {
'proxy': '127.0.0.1:8118',
'view_count': int,
'duration': 2640,
},
}, {
'url': 'http://vbox7.com/play:249bb972c2',
'md5': '99f65c0c9ef9b682b97313e052734c3f',
'md5': 'da1dd2eb245200cb86e6d09d43232116',
'info_dict': {
'id': '249bb972c2',
'ext': 'mp4',
'title': 'Смях! Чудо - чист за секунди - Скрита камера',
'uploader': 'svideteliat_ot_varshava',
'view_count': int,
'timestamp': 1360215023,
'thumbnail': 'https://i49.vbox7.com/design/iconci/png/noimg6.png',
'description': 'Смях! Чудо - чист за секунди - Скрита камера',
'upload_date': '20130207',
'duration': 83,
},
'skip': 'georestricted',
'expected_warnings': ['Failed to download m3u8 information'],
}, {
'url': 'http://vbox7.com/emb/external.php?vid=a240d20f9c&autoplay=1',
'only_matching': True,
@ -53,41 +60,38 @@ class Vbox7IE(InfoExtractor):
def _real_extract(self, url):
video_id = self._match_id(url)
response = self._download_json(
'https://www.vbox7.com/ajax/video/nextvideo.php?vid=%s' % video_id,
video_id)
data = self._download_json(
'https://www.vbox7.com/aj/player/item/options', video_id,
query={'vid': video_id})['options']
if 'error' in response:
raise ExtractorError(
'%s said: %s' % (self.IE_NAME, response['error']), expected=True)
src_url = data.get('src')
if src_url in (None, '', 'blank'):
raise ExtractorError('Video is unavailable', expected=True)
video = response['options']
fmt_base = url_basename(src_url).rsplit('.', 1)[0].rsplit('_', 1)[0]
if fmt_base == 'vn':
self.raise_geo_restricted()
title = video['title']
video_url = video['src']
fmt_base = base_url(src_url) + fmt_base
if '/na.mp4' in video_url:
self.raise_geo_restricted(countries=self._GEO_COUNTRIES)
formats = self._extract_m3u8_formats(
f'{fmt_base}.m3u8', video_id, m3u8_id='hls', fatal=False)
# TODO: Add MPD formats, when dash range support is added
for res in traverse_obj(data, ('resolutions', lambda _, v: v != 0, {int})):
formats.append({
'url': f'{fmt_base}_{res}.mp4',
'format_id': f'http-{res}',
'height': res,
})
uploader = video.get('uploader')
webpage = self._download_webpage(
'http://vbox7.com/play:%s' % video_id, video_id, fatal=None)
info = {}
if webpage:
info = self._search_json_ld(
webpage.replace('"/*@context"', '"@context"'), video_id,
fatal=False)
info.update({
return {
'id': video_id,
'title': title,
'url': video_url,
'uploader': uploader,
'thumbnail': self._proto_relative_url(
info.get('thumbnail') or self._og_search_thumbnail(webpage),
'http:'),
})
return info
'formats': formats,
**self._search_json_ld(self._download_webpage(
f'https://www.vbox7.com/play:{video_id}', video_id, fatal=False) or '', video_id, fatal=False),
**traverse_obj(data, {
'title': ('title', {str}),
'uploader': ('uploader', {str}),
'duration': ('duration', {int_or_none}),
}),
}

View file

@ -12,7 +12,7 @@
class ViewLiftBaseIE(InfoExtractor):
_API_BASE = 'https://prod-api.viewlift.com/'
_DOMAINS_REGEX = r'(?:(?:main\.)?snagfilms|snagxtreme|funnyforfree|kiddovid|winnersview|(?:monumental|lax)sportsnetwork|vayafilm|failarmy|ftfnext|lnppass\.legapallacanestro|moviespree|app\.myoutdoortv|neoufitness|pflmma|theidentitytb)\.com|(?:hoichoi|app\.horseandcountry|kronon|marquee|supercrosslive)\.tv'
_DOMAINS_REGEX = r'(?:(?:main\.)?snagfilms|snagxtreme|funnyforfree|kiddovid|winnersview|(?:monumental|lax)sportsnetwork|vayafilm|failarmy|ftfnext|lnppass\.legapallacanestro|moviespree|app\.myoutdoortv|neoufitness|pflmma|theidentitytb|chorki)\.com|(?:hoichoi|app\.horseandcountry|kronon|marquee|supercrosslive)\.tv'
_SITE_MAP = {
'ftfnext': 'lax',
'funnyforfree': 'snagfilms',
@ -27,6 +27,7 @@ class ViewLiftBaseIE(InfoExtractor):
'snagxtreme': 'snagfilms',
'theidentitytb': 'tampabay',
'vayafilm': 'snagfilms',
'chorki': 'prothomalo',
}
_TOKENS = {}
@ -296,6 +297,33 @@ class ViewLiftIE(ViewLiftBaseIE):
}, { # Premium movie
'url': 'https://www.hoichoi.tv/movies/detective-2020',
'only_matching': True
}, { # Chorki Premium series
'url': 'https://www.chorki.com/bn/series/sinpaat',
'playlist_mincount': 7,
'info_dict': {
'id': 'bn/series/sinpaat',
},
}, { # Chorki free movie
'url': 'https://www.chorki.com/bn/videos/bangla-movie-bikkhov',
'info_dict': {
'id': '564e755b-f5c7-4515-aee6-8959bee18c93',
'title': 'Bikkhov',
'ext': 'mp4',
'upload_date': '20230824',
'timestamp': 1692860553,
'categories': ['Action Movies', 'Salman Special'],
'tags': 'count:14',
'thumbnail': 'https://snagfilms-a.akamaihd.net/dd078ff5-b16e-45e4-9723-501b56b9df0a/images/2023/08/24/1692860450729_1920x1080_16x9Images.jpg',
'display_id': 'bn/videos/bangla-movie-bikkhov',
'description': 'md5:71492b086450625f4374a3eb824f27dc',
'duration': 8002,
},
'params': {
'skip_download': True,
},
}, { # Chorki Premium movie
'url': 'https://www.chorki.com/bn/videos/something-like-an-autobiography',
'only_matching': True,
}]
@classmethod

View file

@ -48,17 +48,15 @@ def _unsmuggle_headers(self, url):
return url, data, headers
def _perform_login(self, username, password):
webpage = self._download_webpage(
self._LOGIN_URL, None, 'Downloading login page')
token, vuid = self._extract_xsrft_and_vuid(webpage)
viewer = self._download_json('https://vimeo.com/_next/viewer', None, 'Downloading login token')
data = {
'action': 'login',
'email': username,
'password': password,
'service': 'vimeo',
'token': token,
'token': viewer['xsrft'],
}
self._set_vimeo_cookie('vuid', vuid)
self._set_vimeo_cookie('vuid', viewer['vuid'])
try:
self._download_webpage(
self._LOGIN_URL, None, 'Logging in',
@ -269,7 +267,7 @@ def _extract_original_format(self, url, video_id, unlisted_hash=None):
'https://vimeo.com/_rv/viewer', video_id, note='Downloading jwt token', fatal=False) or {}
if not jwt_response.get('jwt'):
return
headers = {'Authorization': 'jwt %s' % jwt_response['jwt']}
headers = {'Authorization': 'jwt %s' % jwt_response['jwt'], 'Accept': 'application/json'}
original_response = self._download_json(
f'https://api.vimeo.com/videos/{video_id}', video_id,
headers=headers, fatal=False, expected_status=(403, 404)) or {}
@ -751,6 +749,7 @@ def _extract_from_api(self, video_id, unlisted_hash=None):
video = self._download_json(
api_url, video_id, headers={
'Authorization': 'jwt ' + token,
'Accept': 'application/json',
}, query={
'fields': 'config_url,created_time,description,license,metadata.connections.comments.total,metadata.connections.likes.total,release_time,stats.plays',
})
@ -785,7 +784,7 @@ def _try_album_password(self, url):
jwt = viewer['jwt']
album = self._download_json(
'https://api.vimeo.com/albums/' + album_id,
album_id, headers={'Authorization': 'jwt ' + jwt},
album_id, headers={'Authorization': 'jwt ' + jwt, 'Accept': 'application/json'},
query={'fields': 'description,name,privacy'})
if try_get(album, lambda x: x['privacy']['view']) == 'password':
password = self.get_param('videopassword')
@ -1147,10 +1146,12 @@ def _fetch_page(self, album_id, authorization, hashed_pass, page):
'https://api.vimeo.com/albums/%s/videos' % album_id,
album_id, 'Downloading page %d' % api_page, query=query, headers={
'Authorization': 'jwt ' + authorization,
'Accept': 'application/json',
})['data']
except ExtractorError as e:
if isinstance(e.cause, HTTPError) and e.cause.status == 400:
return
raise
for video in videos:
link = video.get('link')
if not link:
@ -1171,7 +1172,7 @@ def _real_extract(self, url):
jwt = viewer['jwt']
album = self._download_json(
'https://api.vimeo.com/albums/' + album_id,
album_id, headers={'Authorization': 'jwt ' + jwt},
album_id, headers={'Authorization': 'jwt ' + jwt, 'Accept': 'application/json'},
query={'fields': 'description,name,privacy'})
hashed_pass = None
if try_get(album, lambda x: x['privacy']['view']) == 'password':

View file

@ -1,159 +0,0 @@
from .common import InfoExtractor
from ..utils import (
ExtractorError,
int_or_none,
parse_iso8601,
traverse_obj,
try_get,
)
class WASDTVBaseIE(InfoExtractor):
def _fetch(self, path, video_id, description, query={}):
response = self._download_json(
f'https://wasd.tv/api/{path}', video_id, query=query,
note=f'Downloading {description} metadata',
errnote=f'Unable to download {description} metadata')
error = response.get('error')
if error:
raise ExtractorError(f'{self.IE_NAME} returned error: {error}', expected=True)
return response.get('result')
def _extract_thumbnails(self, thumbnails_dict):
return [{
'url': url,
'preference': index,
} for index, url in enumerate(
traverse_obj(thumbnails_dict, (('small', 'medium', 'large'),))) if url]
def _real_extract(self, url):
container = self._get_container(url)
stream = traverse_obj(container, ('media_container_streams', 0))
media = try_get(stream, lambda x: x['stream_media'][0])
if not media:
raise ExtractorError('Can not extract media data.', expected=True)
media_meta = media.get('media_meta')
media_url, is_live = self._get_media_url(media_meta)
video_id = media.get('media_id') or container.get('media_container_id')
formats, subtitles = self._extract_m3u8_formats_and_subtitles(media_url, video_id, 'mp4')
return {
'id': str(video_id),
'title': container.get('media_container_name') or self._og_search_title(self._download_webpage(url, video_id)),
'description': container.get('media_container_description'),
'thumbnails': self._extract_thumbnails(media_meta.get('media_preview_images')),
'timestamp': parse_iso8601(container.get('created_at')),
'view_count': int_or_none(stream.get('stream_current_viewers' if is_live else 'stream_total_viewers')),
'is_live': is_live,
'formats': formats,
'subtitles': subtitles,
}
def _get_container(self, url):
raise NotImplementedError('Subclass for get media container')
def _get_media_url(self, media_meta):
raise NotImplementedError('Subclass for get media url')
class WASDTVStreamIE(WASDTVBaseIE):
IE_NAME = 'wasdtv:stream'
_VALID_URL = r'https?://wasd\.tv/(?P<id>[^/#?]+)$'
_TESTS = [{
'url': 'https://wasd.tv/24_7',
'info_dict': {
'id': '559738',
'ext': 'mp4',
'title': 'Live 24/7 Music',
'description': '24&#x2F;7 Music',
'timestamp': int,
'upload_date': r're:^\d{8}$',
'is_live': True,
'view_count': int,
},
}]
def _get_container(self, url):
nickname = self._match_id(url)
channel = self._fetch(f'channels/nicknames/{nickname}', video_id=nickname, description='channel')
channel_id = channel.get('channel_id')
containers = self._fetch(
'v2/media-containers', channel_id, 'running media containers',
query={
'channel_id': channel_id,
'media_container_type': 'SINGLE',
'media_container_status': 'RUNNING',
})
if not containers:
raise ExtractorError(f'{nickname} is offline', expected=True)
return containers[0]
def _get_media_url(self, media_meta):
return media_meta['media_url'], True
class WASDTVRecordIE(WASDTVBaseIE):
IE_NAME = 'wasdtv:record'
_VALID_URL = r'https?://wasd\.tv/[^/#?]+(?:/videos)?\?record=(?P<id>\d+)$'
_TESTS = [{
'url': 'https://wasd.tv/spacemita/videos?record=907755',
'md5': 'c9899dd85be4cc997816ff9f9ca516ce',
'info_dict': {
'id': '906825',
'ext': 'mp4',
'title': 'Музыкальный',
'description': 'md5:f510388d929ff60ae61d4c3cab3137cc',
'timestamp': 1645812079,
'upload_date': '20220225',
'thumbnail': r're:^https?://.+\.jpg',
'is_live': False,
'view_count': int,
},
}, {
'url': 'https://wasd.tv/spacemita?record=907755',
'only_matching': True,
}]
def _get_container(self, url):
container_id = self._match_id(url)
return self._fetch(
f'v2/media-containers/{container_id}', container_id, 'media container')
def _get_media_url(self, media_meta):
media_archive_url = media_meta.get('media_archive_url')
if media_archive_url:
return media_archive_url, False
return media_meta['media_url'], True
class WASDTVClipIE(WASDTVBaseIE):
IE_NAME = 'wasdtv:clip'
_VALID_URL = r'https?://wasd\.tv/[^/#?]+/clips\?clip=(?P<id>\d+)$'
_TESTS = [{
'url': 'https://wasd.tv/spacemita/clips?clip=26804',
'md5': '818885e720143d7a4e776ff66fcff148',
'info_dict': {
'id': '26804',
'ext': 'mp4',
'title': 'Пуш флексит на голове стримера',
'timestamp': 1646682908,
'upload_date': '20220307',
'thumbnail': r're:^https?://.+\.jpg',
'view_count': int,
},
}]
def _real_extract(self, url):
clip_id = self._match_id(url)
clip = self._fetch(f'v2/clips/{clip_id}', video_id=clip_id, description='clip')
clip_data = clip.get('clip_data')
formats, subtitles = self._extract_m3u8_formats_and_subtitles(clip_data.get('url'), video_id=clip_id, ext='mp4')
return {
'id': clip_id,
'title': clip.get('clip_title') or self._og_search_title(self._download_webpage(url, clip_id, fatal=False)),
'thumbnails': self._extract_thumbnails(clip_data.get('preview')),
'timestamp': parse_iso8601(clip.get('created_at')),
'view_count': int_or_none(clip.get('clip_views_count')),
'formats': formats,
'subtitles': subtitles,
}

Some files were not shown because too many files have changed in this diff Show more