diff --git a/.github/ISSUE_TEMPLATE/1_broken_site.yml b/.github/ISSUE_TEMPLATE/1_broken_site.yml index dacb41758..5df13ad9b 100644 --- a/.github/ISSUE_TEMPLATE/1_broken_site.yml +++ b/.github/ISSUE_TEMPLATE/1_broken_site.yml @@ -18,7 +18,7 @@ body: options: - label: I'm reporting that yt-dlp is broken on a **supported** site required: true - - label: I've verified that I'm running yt-dlp version **2023.10.07** ([update instructions](https://github.com/yt-dlp/yt-dlp#update)) or later (specify commit) + - label: I've verified that I have **updated yt-dlp to nightly or master** ([update instructions](https://github.com/yt-dlp/yt-dlp#update-channels)) required: true - label: I've checked that all provided URLs are playable in a browser with the same IP and same login details required: true @@ -61,19 +61,18 @@ body: description: | It should start like this: placeholder: | - [debug] Command-line config: ['-vU', 'test:youtube'] - [debug] Portable config "yt-dlp.conf": ['-i'] + [debug] Command-line config: ['-vU', 'https://www.youtube.com/watch?v=BaW_jenozKc'] [debug] Encodings: locale cp65001, fs utf-8, pref cp65001, out utf-8, error utf-8, screen utf-8 - [debug] yt-dlp version 2023.10.07 [9d339c4] (win32_exe) + [debug] yt-dlp version nightly@... from yt-dlp/yt-dlp [b634ba742] (win_exe) [debug] Python 3.8.10 (CPython 64bit) - Windows-10-10.0.22000-SP0 - [debug] Checking exe version: ffmpeg -bsfs - [debug] Checking exe version: ffprobe -bsfs [debug] exe versions: ffmpeg N-106550-g072101bd52-20220410 (fdk,setts), ffprobe N-106624-g391ce570c8-20220415, phantomjs 2.1.1 [debug] Optional libraries: Cryptodome-3.15.0, brotli-1.0.9, certifi-2022.06.15, mutagen-1.45.1, sqlite3-2.6.0, websockets-10.3 [debug] Proxy map: {} - [debug] Fetching release info: https://api.github.com/repos/yt-dlp/yt-dlp/releases/latest - Latest version: 2023.10.07, Current version: 2023.10.07 - yt-dlp is up to date (2023.10.07) + [debug] Request Handlers: urllib, requests + [debug] Loaded 1893 extractors + [debug] Fetching release info: https://api.github.com/repos/yt-dlp/yt-dlp-nightly-builds/releases/latest + yt-dlp is up to date (nightly@... from yt-dlp/yt-dlp-nightly-builds) + [youtube] Extracting URL: https://www.youtube.com/watch?v=BaW_jenozKc render: shell validations: diff --git a/.github/ISSUE_TEMPLATE/2_site_support_request.yml b/.github/ISSUE_TEMPLATE/2_site_support_request.yml index ec6e298a1..644c87a7e 100644 --- a/.github/ISSUE_TEMPLATE/2_site_support_request.yml +++ b/.github/ISSUE_TEMPLATE/2_site_support_request.yml @@ -18,7 +18,7 @@ body: options: - label: I'm reporting a new site support request required: true - - label: I've verified that I'm running yt-dlp version **2023.10.07** ([update instructions](https://github.com/yt-dlp/yt-dlp#update)) or later (specify commit) + - label: I've verified that I have **updated yt-dlp to nightly or master** ([update instructions](https://github.com/yt-dlp/yt-dlp#update-channels)) required: true - label: I've checked that all provided URLs are playable in a browser with the same IP and same login details required: true @@ -73,19 +73,18 @@ body: description: | It should start like this: placeholder: | - [debug] Command-line config: ['-vU', 'test:youtube'] - [debug] Portable config "yt-dlp.conf": ['-i'] + [debug] Command-line config: ['-vU', 'https://www.youtube.com/watch?v=BaW_jenozKc'] [debug] Encodings: locale cp65001, fs utf-8, pref cp65001, out utf-8, error utf-8, screen utf-8 - [debug] yt-dlp version 2023.10.07 [9d339c4] (win32_exe) + [debug] yt-dlp version nightly@... 
from yt-dlp/yt-dlp [b634ba742] (win_exe) [debug] Python 3.8.10 (CPython 64bit) - Windows-10-10.0.22000-SP0 - [debug] Checking exe version: ffmpeg -bsfs - [debug] Checking exe version: ffprobe -bsfs [debug] exe versions: ffmpeg N-106550-g072101bd52-20220410 (fdk,setts), ffprobe N-106624-g391ce570c8-20220415, phantomjs 2.1.1 [debug] Optional libraries: Cryptodome-3.15.0, brotli-1.0.9, certifi-2022.06.15, mutagen-1.45.1, sqlite3-2.6.0, websockets-10.3 [debug] Proxy map: {} - [debug] Fetching release info: https://api.github.com/repos/yt-dlp/yt-dlp/releases/latest - Latest version: 2023.10.07, Current version: 2023.10.07 - yt-dlp is up to date (2023.10.07) + [debug] Request Handlers: urllib, requests + [debug] Loaded 1893 extractors + [debug] Fetching release info: https://api.github.com/repos/yt-dlp/yt-dlp-nightly-builds/releases/latest + yt-dlp is up to date (nightly@... from yt-dlp/yt-dlp-nightly-builds) + [youtube] Extracting URL: https://www.youtube.com/watch?v=BaW_jenozKc render: shell validations: diff --git a/.github/ISSUE_TEMPLATE/3_site_feature_request.yml b/.github/ISSUE_TEMPLATE/3_site_feature_request.yml index cf3cdd21f..59d0474c2 100644 --- a/.github/ISSUE_TEMPLATE/3_site_feature_request.yml +++ b/.github/ISSUE_TEMPLATE/3_site_feature_request.yml @@ -18,7 +18,7 @@ body: options: - label: I'm requesting a site-specific feature required: true - - label: I've verified that I'm running yt-dlp version **2023.10.07** ([update instructions](https://github.com/yt-dlp/yt-dlp#update)) or later (specify commit) + - label: I've verified that I have **updated yt-dlp to nightly or master** ([update instructions](https://github.com/yt-dlp/yt-dlp#update-channels)) required: true - label: I've checked that all provided URLs are playable in a browser with the same IP and same login details required: true @@ -69,19 +69,18 @@ body: description: | It should start like this: placeholder: | - [debug] Command-line config: ['-vU', 'test:youtube'] - [debug] Portable config "yt-dlp.conf": ['-i'] + [debug] Command-line config: ['-vU', 'https://www.youtube.com/watch?v=BaW_jenozKc'] [debug] Encodings: locale cp65001, fs utf-8, pref cp65001, out utf-8, error utf-8, screen utf-8 - [debug] yt-dlp version 2023.10.07 [9d339c4] (win32_exe) + [debug] yt-dlp version nightly@... from yt-dlp/yt-dlp [b634ba742] (win_exe) [debug] Python 3.8.10 (CPython 64bit) - Windows-10-10.0.22000-SP0 - [debug] Checking exe version: ffmpeg -bsfs - [debug] Checking exe version: ffprobe -bsfs [debug] exe versions: ffmpeg N-106550-g072101bd52-20220410 (fdk,setts), ffprobe N-106624-g391ce570c8-20220415, phantomjs 2.1.1 [debug] Optional libraries: Cryptodome-3.15.0, brotli-1.0.9, certifi-2022.06.15, mutagen-1.45.1, sqlite3-2.6.0, websockets-10.3 [debug] Proxy map: {} - [debug] Fetching release info: https://api.github.com/repos/yt-dlp/yt-dlp/releases/latest - Latest version: 2023.10.07, Current version: 2023.10.07 - yt-dlp is up to date (2023.10.07) + [debug] Request Handlers: urllib, requests + [debug] Loaded 1893 extractors + [debug] Fetching release info: https://api.github.com/repos/yt-dlp/yt-dlp-nightly-builds/releases/latest + yt-dlp is up to date (nightly@... 
from yt-dlp/yt-dlp-nightly-builds) + [youtube] Extracting URL: https://www.youtube.com/watch?v=BaW_jenozKc render: shell validations: diff --git a/.github/ISSUE_TEMPLATE/4_bug_report.yml b/.github/ISSUE_TEMPLATE/4_bug_report.yml index 1bbcf6895..e20739673 100644 --- a/.github/ISSUE_TEMPLATE/4_bug_report.yml +++ b/.github/ISSUE_TEMPLATE/4_bug_report.yml @@ -18,7 +18,7 @@ body: options: - label: I'm reporting a bug unrelated to a specific site required: true - - label: I've verified that I'm running yt-dlp version **2023.10.07** ([update instructions](https://github.com/yt-dlp/yt-dlp#update)) or later (specify commit) + - label: I've verified that I have **updated yt-dlp to nightly or master** ([update instructions](https://github.com/yt-dlp/yt-dlp#update-channels)) required: true - label: I've checked that all provided URLs are playable in a browser with the same IP and same login details required: true @@ -54,19 +54,18 @@ body: description: | It should start like this: placeholder: | - [debug] Command-line config: ['-vU', 'test:youtube'] - [debug] Portable config "yt-dlp.conf": ['-i'] + [debug] Command-line config: ['-vU', 'https://www.youtube.com/watch?v=BaW_jenozKc'] [debug] Encodings: locale cp65001, fs utf-8, pref cp65001, out utf-8, error utf-8, screen utf-8 - [debug] yt-dlp version 2023.10.07 [9d339c4] (win32_exe) + [debug] yt-dlp version nightly@... from yt-dlp/yt-dlp [b634ba742] (win_exe) [debug] Python 3.8.10 (CPython 64bit) - Windows-10-10.0.22000-SP0 - [debug] Checking exe version: ffmpeg -bsfs - [debug] Checking exe version: ffprobe -bsfs [debug] exe versions: ffmpeg N-106550-g072101bd52-20220410 (fdk,setts), ffprobe N-106624-g391ce570c8-20220415, phantomjs 2.1.1 [debug] Optional libraries: Cryptodome-3.15.0, brotli-1.0.9, certifi-2022.06.15, mutagen-1.45.1, sqlite3-2.6.0, websockets-10.3 [debug] Proxy map: {} - [debug] Fetching release info: https://api.github.com/repos/yt-dlp/yt-dlp/releases/latest - Latest version: 2023.10.07, Current version: 2023.10.07 - yt-dlp is up to date (2023.10.07) + [debug] Request Handlers: urllib, requests + [debug] Loaded 1893 extractors + [debug] Fetching release info: https://api.github.com/repos/yt-dlp/yt-dlp-nightly-builds/releases/latest + yt-dlp is up to date (nightly@... from yt-dlp/yt-dlp-nightly-builds) + [youtube] Extracting URL: https://www.youtube.com/watch?v=BaW_jenozKc render: shell validations: diff --git a/.github/ISSUE_TEMPLATE/5_feature_request.yml b/.github/ISSUE_TEMPLATE/5_feature_request.yml index d3bc06e80..e06db9ccf 100644 --- a/.github/ISSUE_TEMPLATE/5_feature_request.yml +++ b/.github/ISSUE_TEMPLATE/5_feature_request.yml @@ -20,7 +20,7 @@ body: required: true - label: I've looked through the [README](https://github.com/yt-dlp/yt-dlp#readme) required: true - - label: I've verified that I'm running yt-dlp version **2023.10.07** ([update instructions](https://github.com/yt-dlp/yt-dlp#update)) or later (specify commit) + - label: I've verified that I have **updated yt-dlp to nightly or master** ([update instructions](https://github.com/yt-dlp/yt-dlp#update-channels)) required: true - label: I've searched [known issues](https://github.com/yt-dlp/yt-dlp/issues/3766) and the [bugtracker](https://github.com/yt-dlp/yt-dlp/issues?q=) for similar issues **including closed ones**. 
DO NOT post duplicates required: true @@ -50,18 +50,17 @@ body: description: | It should start like this: placeholder: | - [debug] Command-line config: ['-vU', 'test:youtube'] - [debug] Portable config "yt-dlp.conf": ['-i'] + [debug] Command-line config: ['-vU', 'https://www.youtube.com/watch?v=BaW_jenozKc'] [debug] Encodings: locale cp65001, fs utf-8, pref cp65001, out utf-8, error utf-8, screen utf-8 - [debug] yt-dlp version 2023.10.07 [9d339c4] (win32_exe) + [debug] yt-dlp version nightly@... from yt-dlp/yt-dlp [b634ba742] (win_exe) [debug] Python 3.8.10 (CPython 64bit) - Windows-10-10.0.22000-SP0 - [debug] Checking exe version: ffmpeg -bsfs - [debug] Checking exe version: ffprobe -bsfs [debug] exe versions: ffmpeg N-106550-g072101bd52-20220410 (fdk,setts), ffprobe N-106624-g391ce570c8-20220415, phantomjs 2.1.1 [debug] Optional libraries: Cryptodome-3.15.0, brotli-1.0.9, certifi-2022.06.15, mutagen-1.45.1, sqlite3-2.6.0, websockets-10.3 [debug] Proxy map: {} - [debug] Fetching release info: https://api.github.com/repos/yt-dlp/yt-dlp/releases/latest - Latest version: 2023.10.07, Current version: 2023.10.07 - yt-dlp is up to date (2023.10.07) + [debug] Request Handlers: urllib, requests + [debug] Loaded 1893 extractors + [debug] Fetching release info: https://api.github.com/repos/yt-dlp/yt-dlp-nightly-builds/releases/latest + yt-dlp is up to date (nightly@... from yt-dlp/yt-dlp-nightly-builds) + [youtube] Extracting URL: https://www.youtube.com/watch?v=BaW_jenozKc render: shell diff --git a/.github/ISSUE_TEMPLATE/6_question.yml b/.github/ISSUE_TEMPLATE/6_question.yml index 30311d5b5..571223a9c 100644 --- a/.github/ISSUE_TEMPLATE/6_question.yml +++ b/.github/ISSUE_TEMPLATE/6_question.yml @@ -26,7 +26,7 @@ body: required: true - label: I've looked through the [README](https://github.com/yt-dlp/yt-dlp#readme) required: true - - label: I've verified that I'm running yt-dlp version **2023.10.07** ([update instructions](https://github.com/yt-dlp/yt-dlp#update)) or later (specify commit) + - label: I've verified that I have **updated yt-dlp to nightly or master** ([update instructions](https://github.com/yt-dlp/yt-dlp#update-channels)) required: true - label: I've searched [known issues](https://github.com/yt-dlp/yt-dlp/issues/3766) and the [bugtracker](https://github.com/yt-dlp/yt-dlp/issues?q=) for similar questions **including closed ones**. DO NOT post duplicates required: true @@ -56,18 +56,17 @@ body: description: | It should start like this: placeholder: | - [debug] Command-line config: ['-vU', 'test:youtube'] - [debug] Portable config "yt-dlp.conf": ['-i'] + [debug] Command-line config: ['-vU', 'https://www.youtube.com/watch?v=BaW_jenozKc'] [debug] Encodings: locale cp65001, fs utf-8, pref cp65001, out utf-8, error utf-8, screen utf-8 - [debug] yt-dlp version 2023.10.07 [9d339c4] (win32_exe) + [debug] yt-dlp version nightly@... 
from yt-dlp/yt-dlp [b634ba742] (win_exe) [debug] Python 3.8.10 (CPython 64bit) - Windows-10-10.0.22000-SP0 - [debug] Checking exe version: ffmpeg -bsfs - [debug] Checking exe version: ffprobe -bsfs [debug] exe versions: ffmpeg N-106550-g072101bd52-20220410 (fdk,setts), ffprobe N-106624-g391ce570c8-20220415, phantomjs 2.1.1 [debug] Optional libraries: Cryptodome-3.15.0, brotli-1.0.9, certifi-2022.06.15, mutagen-1.45.1, sqlite3-2.6.0, websockets-10.3 [debug] Proxy map: {} - [debug] Fetching release info: https://api.github.com/repos/yt-dlp/yt-dlp/releases/latest - Latest version: 2023.10.07, Current version: 2023.10.07 - yt-dlp is up to date (2023.10.07) + [debug] Request Handlers: urllib, requests + [debug] Loaded 1893 extractors + [debug] Fetching release info: https://api.github.com/repos/yt-dlp/yt-dlp-nightly-builds/releases/latest + yt-dlp is up to date (nightly@... from yt-dlp/yt-dlp-nightly-builds) + [youtube] Extracting URL: https://www.youtube.com/watch?v=BaW_jenozKc render: shell diff --git a/.github/ISSUE_TEMPLATE_tmpl/1_broken_site.yml b/.github/ISSUE_TEMPLATE_tmpl/1_broken_site.yml index a51db789f..bff28ae4e 100644 --- a/.github/ISSUE_TEMPLATE_tmpl/1_broken_site.yml +++ b/.github/ISSUE_TEMPLATE_tmpl/1_broken_site.yml @@ -12,7 +12,7 @@ body: options: - label: I'm reporting that yt-dlp is broken on a **supported** site required: true - - label: I've verified that I'm running yt-dlp version **%(version)s** ([update instructions](https://github.com/yt-dlp/yt-dlp#update)) or later (specify commit) + - label: I've verified that I have **updated yt-dlp to nightly or master** ([update instructions](https://github.com/yt-dlp/yt-dlp#update-channels)) required: true - label: I've checked that all provided URLs are playable in a browser with the same IP and same login details required: true diff --git a/.github/ISSUE_TEMPLATE_tmpl/2_site_support_request.yml b/.github/ISSUE_TEMPLATE_tmpl/2_site_support_request.yml index 75d62e7bb..2bffe738d 100644 --- a/.github/ISSUE_TEMPLATE_tmpl/2_site_support_request.yml +++ b/.github/ISSUE_TEMPLATE_tmpl/2_site_support_request.yml @@ -12,7 +12,7 @@ body: options: - label: I'm reporting a new site support request required: true - - label: I've verified that I'm running yt-dlp version **%(version)s** ([update instructions](https://github.com/yt-dlp/yt-dlp#update)) or later (specify commit) + - label: I've verified that I have **updated yt-dlp to nightly or master** ([update instructions](https://github.com/yt-dlp/yt-dlp#update-channels)) required: true - label: I've checked that all provided URLs are playable in a browser with the same IP and same login details required: true diff --git a/.github/ISSUE_TEMPLATE_tmpl/3_site_feature_request.yml b/.github/ISSUE_TEMPLATE_tmpl/3_site_feature_request.yml index 18b30f578..6c3127983 100644 --- a/.github/ISSUE_TEMPLATE_tmpl/3_site_feature_request.yml +++ b/.github/ISSUE_TEMPLATE_tmpl/3_site_feature_request.yml @@ -12,7 +12,7 @@ body: options: - label: I'm requesting a site-specific feature required: true - - label: I've verified that I'm running yt-dlp version **%(version)s** ([update instructions](https://github.com/yt-dlp/yt-dlp#update)) or later (specify commit) + - label: I've verified that I have **updated yt-dlp to nightly or master** ([update instructions](https://github.com/yt-dlp/yt-dlp#update-channels)) required: true - label: I've checked that all provided URLs are playable in a browser with the same IP and same login details required: true diff --git a/.github/ISSUE_TEMPLATE_tmpl/4_bug_report.yml 
b/.github/ISSUE_TEMPLATE_tmpl/4_bug_report.yml index 9ab490267..5f357d96e 100644 --- a/.github/ISSUE_TEMPLATE_tmpl/4_bug_report.yml +++ b/.github/ISSUE_TEMPLATE_tmpl/4_bug_report.yml @@ -12,7 +12,7 @@ body: options: - label: I'm reporting a bug unrelated to a specific site required: true - - label: I've verified that I'm running yt-dlp version **%(version)s** ([update instructions](https://github.com/yt-dlp/yt-dlp#update)) or later (specify commit) + - label: I've verified that I have **updated yt-dlp to nightly or master** ([update instructions](https://github.com/yt-dlp/yt-dlp#update-channels)) required: true - label: I've checked that all provided URLs are playable in a browser with the same IP and same login details required: true diff --git a/.github/ISSUE_TEMPLATE_tmpl/5_feature_request.yml b/.github/ISSUE_TEMPLATE_tmpl/5_feature_request.yml index ef3bb2269..99107ff58 100644 --- a/.github/ISSUE_TEMPLATE_tmpl/5_feature_request.yml +++ b/.github/ISSUE_TEMPLATE_tmpl/5_feature_request.yml @@ -14,7 +14,7 @@ body: required: true - label: I've looked through the [README](https://github.com/yt-dlp/yt-dlp#readme) required: true - - label: I've verified that I'm running yt-dlp version **%(version)s** ([update instructions](https://github.com/yt-dlp/yt-dlp#update)) or later (specify commit) + - label: I've verified that I have **updated yt-dlp to nightly or master** ([update instructions](https://github.com/yt-dlp/yt-dlp#update-channels)) required: true - label: I've searched [known issues](https://github.com/yt-dlp/yt-dlp/issues/3766) and the [bugtracker](https://github.com/yt-dlp/yt-dlp/issues?q=) for similar issues **including closed ones**. DO NOT post duplicates required: true diff --git a/.github/ISSUE_TEMPLATE_tmpl/6_question.yml b/.github/ISSUE_TEMPLATE_tmpl/6_question.yml index 4bef82d5a..bd742109a 100644 --- a/.github/ISSUE_TEMPLATE_tmpl/6_question.yml +++ b/.github/ISSUE_TEMPLATE_tmpl/6_question.yml @@ -20,7 +20,7 @@ body: required: true - label: I've looked through the [README](https://github.com/yt-dlp/yt-dlp#readme) required: true - - label: I've verified that I'm running yt-dlp version **%(version)s** ([update instructions](https://github.com/yt-dlp/yt-dlp#update)) or later (specify commit) + - label: I've verified that I have **updated yt-dlp to nightly or master** ([update instructions](https://github.com/yt-dlp/yt-dlp#update-channels)) required: true - label: I've searched [known issues](https://github.com/yt-dlp/yt-dlp/issues/3766) and the [bugtracker](https://github.com/yt-dlp/yt-dlp/issues?q=) for similar questions **including closed ones**. DO NOT post duplicates required: true diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md index cbed82173..c4d3e812e 100644 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -40,10 +40,4 @@ ### What is the purpose of your *pull request*? - [ ] Core bug fix/improvement - [ ] New feature (It is strongly [recommended to open an issue first](https://github.com/yt-dlp/yt-dlp/blob/master/CONTRIBUTING.md#adding-new-feature-or-making-overarching-changes)) - - -
<details open><summary>Copilot Summary</summary> - -copilot:all -</details>
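The verbose-log placeholders in the issue templates above are produced by running yt-dlp with `-vU`. As a rough sketch — assuming a POSIX shell, and using the same example URL as the placeholders — a reporter could capture such a log like this:

    # -v prints the [debug] lines the templates expect; -U checks for updates first.
    # yt-dlp emits debug output on stderr, so 2>&1 keeps it in the captured log.
    yt-dlp -vU "https://www.youtube.com/watch?v=BaW_jenozKc" 2>&1 | tee verbose.log

The `verbose.log` filename is arbitrary; the point is that the log starts with the `[debug] Command-line config` line shown in the placeholders.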
diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index ac0cfdf7c..036ce4348 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -30,6 +30,10 @@ on: meta_files: default: true type: boolean + origin: + required: false + default: '' + type: string secrets: GPG_SIGNING_KEY: required: false @@ -37,11 +41,13 @@ on: workflow_dispatch: inputs: version: - description: Version tag (YYYY.MM.DD[.REV]) + description: | + VERSION: yyyy.mm.dd[.rev] or rev required: true type: string channel: - description: Update channel (stable/nightly/...) + description: | + SOURCE of this build's updates: stable/nightly/master/ required: true default: stable type: string @@ -73,16 +79,34 @@ on: description: SHA2-256SUMS, SHA2-512SUMS, _update_spec default: true type: boolean + origin: + description: Origin + required: false + default: 'current repo' + type: choice + options: + - 'current repo' permissions: contents: read jobs: + process: + runs-on: ubuntu-latest + outputs: + origin: ${{ steps.process_origin.outputs.origin }} + steps: + - name: Process origin + id: process_origin + run: | + echo "origin=${{ inputs.origin == 'current repo' && github.repository || inputs.origin }}" | tee "$GITHUB_OUTPUT" + unix: + needs: process if: inputs.unix runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - uses: actions/setup-python@v4 with: python-version: "3.10" @@ -96,22 +120,21 @@ jobs: auto-activate-base: false - name: Install Requirements run: | - sudo apt-get -y install zip pandoc man sed - python -m pip install -U pip setuptools wheel - python -m pip install -U Pyinstaller -r requirements.txt + sudo apt -y install zip pandoc man sed reqs=$(mktemp) - cat > $reqs << EOF + cat > "$reqs" << EOF python=3.10.* pyinstaller cffi brotli-python + secretstorage EOF - sed '/^brotli.*/d' requirements.txt >> $reqs - mamba create -n build --file $reqs + sed -E '/^(brotli|secretstorage).*/d' requirements.txt >> "$reqs" + mamba create -n build --file "$reqs" - name: Prepare run: | - python devscripts/update-version.py -c ${{ inputs.channel }} ${{ inputs.version }} + python devscripts/update-version.py -c "${{ inputs.channel }}" -r "${{ needs.process.outputs.origin }}" "${{ inputs.version }}" python devscripts/make_lazy_extractors.py - name: Build Unix platform-independent binary run: | @@ -150,6 +173,7 @@ jobs: yt-dlp_linux.zip linux_arm: + needs: process if: inputs.linux_arm permissions: contents: read @@ -162,7 +186,7 @@ jobs: - aarch64 steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 with: path: ./repo - name: Virtualized Install, Prepare & Build @@ -180,12 +204,12 @@ jobs: apt -y install zlib1g-dev python3.8 python3.8-dev python3.8-distutils python3-pip python3.8 -m pip install -U pip setuptools wheel # Cannot access requirements.txt from the repo directory at this stage - python3.8 -m pip install -U Pyinstaller mutagen pycryptodomex websockets brotli certifi + python3.8 -m pip install -U Pyinstaller mutagen pycryptodomex websockets brotli certifi secretstorage run: | cd repo - python3.8 -m pip install -U Pyinstaller -r requirements.txt # Cached version may be out of date - python3.8 devscripts/update-version.py -c ${{ inputs.channel }} ${{ inputs.version }} + python3.8 -m pip install -U Pyinstaller secretstorage -r requirements.txt # Cached version may be out of date + python3.8 devscripts/update-version.py -c "${{ inputs.channel }}" -r "${{ needs.process.outputs.origin }}" "${{ inputs.version }}" python3.8 
devscripts/make_lazy_extractors.py python3.8 pyinst.py @@ -206,11 +230,12 @@ jobs: repo/dist/yt-dlp_linux_${{ (matrix.architecture == 'armv7' && 'armv7l') || matrix.architecture }} macos: + needs: process if: inputs.macos runs-on: macos-11 steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 # NB: Building universal2 does not work with python from actions/setup-python - name: Install Requirements run: | @@ -221,7 +246,7 @@ jobs: - name: Prepare run: | - python3 devscripts/update-version.py -c ${{ inputs.channel }} ${{ inputs.version }} + python3 devscripts/update-version.py -c "${{ inputs.channel }}" -r "${{ needs.process.outputs.origin }}" "${{ inputs.version }}" python3 devscripts/make_lazy_extractors.py - name: Build run: | @@ -247,11 +272,12 @@ jobs: dist/yt-dlp_macos.zip macos_legacy: + needs: process if: inputs.macos_legacy runs-on: macos-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Install Python # We need the official Python, because the GA ones only support newer macOS versions env: @@ -272,7 +298,7 @@ jobs: - name: Prepare run: | - python3 devscripts/update-version.py -c ${{ inputs.channel }} ${{ inputs.version }} + python3 devscripts/update-version.py -c "${{ inputs.channel }}" -r "${{ needs.process.outputs.origin }}" "${{ inputs.version }}" python3 devscripts/make_lazy_extractors.py - name: Build run: | @@ -296,11 +322,12 @@ jobs: dist/yt-dlp_macos_legacy windows: + needs: process if: inputs.windows runs-on: windows-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - uses: actions/setup-python@v4 with: # 3.8 is used for Win7 support python-version: "3.8" @@ -311,7 +338,7 @@ jobs: - name: Prepare run: | - python devscripts/update-version.py -c ${{ inputs.channel }} ${{ inputs.version }} + python devscripts/update-version.py -c "${{ inputs.channel }}" -r "${{ needs.process.outputs.origin }}" "${{ inputs.version }}" python devscripts/make_lazy_extractors.py - name: Build run: | @@ -343,14 +370,15 @@ jobs: dist/yt-dlp_win.zip windows32: + needs: process if: inputs.windows32 runs-on: windows-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - uses: actions/setup-python@v4 - with: # 3.7 is used for Vista support. 
See https://github.com/yt-dlp/yt-dlp/issues/390 - python-version: "3.7" + with: + python-version: "3.8" architecture: "x86" - name: Install Requirements run: | @@ -359,7 +387,7 @@ jobs: - name: Prepare run: | - python devscripts/update-version.py -c ${{ inputs.channel }} ${{ inputs.version }} + python devscripts/update-version.py -c "${{ inputs.channel }}" -r "${{ needs.process.outputs.origin }}" "${{ inputs.version }}" python devscripts/make_lazy_extractors.py - name: Build run: | @@ -387,6 +415,7 @@ jobs: meta_files: if: inputs.meta_files && always() && !cancelled() needs: + - process - unix - linux_arm - macos @@ -407,7 +436,16 @@ jobs: run: | cat >> _update_spec << EOF # This file is used for regulating self-update - lock 2022.08.18.36 .+ Python 3.6 + lock 2022.08.18.36 .+ Python 3\.6 + lock 2023.11.16 (?!win_x86_exe).+ Python 3\.7 + lock 2023.11.16 win_x86_exe .+ Windows-(?:Vista|2008Server) + lockV2 yt-dlp/yt-dlp 2022.08.18.36 .+ Python 3\.6 + lockV2 yt-dlp/yt-dlp 2023.11.16 (?!win_x86_exe).+ Python 3\.7 + lockV2 yt-dlp/yt-dlp 2023.11.16 win_x86_exe .+ Windows-(?:Vista|2008Server) + lockV2 yt-dlp/yt-dlp-nightly-builds 2023.11.15.232826 (?!win_x86_exe).+ Python 3\.7 + lockV2 yt-dlp/yt-dlp-nightly-builds 2023.11.15.232826 win_x86_exe .+ Windows-(?:Vista|2008Server) + lockV2 yt-dlp/yt-dlp-master-builds 2023.11.15.232812 (?!win_x86_exe).+ Python 3\.7 + lockV2 yt-dlp/yt-dlp-master-builds 2023.11.15.232812 win_x86_exe .+ Windows-(?:Vista|2008Server) EOF - name: Sign checksum files diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml index 2821d90d0..170a6ac19 100644 --- a/.github/workflows/codeql.yml +++ b/.github/workflows/codeql.yml @@ -29,7 +29,7 @@ jobs: steps: - name: Checkout repository - uses: actions/checkout@v3 + uses: actions/checkout@v4 # Initializes the CodeQL tools for scanning. 
- name: Initialize CodeQL diff --git a/.github/workflows/core.yml b/.github/workflows/core.yml index 7fcf11dfa..eaaf03dee 100644 --- a/.github/workflows/core.yml +++ b/.github/workflows/core.yml @@ -1,8 +1,32 @@ name: Core Tests -on: [push, pull_request] +on: + push: + paths: + - .github/** + - devscripts/** + - test/** + - yt_dlp/**.py + - '!yt_dlp/extractor/*.py' + - yt_dlp/extractor/__init__.py + - yt_dlp/extractor/common.py + - yt_dlp/extractor/extractors.py + pull_request: + paths: + - .github/** + - devscripts/** + - test/** + - yt_dlp/**.py + - '!yt_dlp/extractor/*.py' + - yt_dlp/extractor/__init__.py + - yt_dlp/extractor/common.py + - yt_dlp/extractor/extractors.py permissions: contents: read +concurrency: + group: core-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: ${{ github.event_name == 'pull_request' }} + jobs: tests: name: Core Tests @@ -12,30 +36,26 @@ jobs: fail-fast: false matrix: os: [ubuntu-latest] - # CPython 3.11 is in quick-test - python-version: ['3.8', '3.9', '3.10', '3.12', pypy-3.7, pypy-3.8, pypy-3.10] - run-tests-ext: [sh] + # CPython 3.8 is in quick-test + python-version: ['3.9', '3.10', '3.11', '3.12', pypy-3.8, pypy-3.10] include: # atleast one of each CPython/PyPy tests must be in windows - os: windows-latest - python-version: '3.7' - run-tests-ext: bat + python-version: '3.8' - os: windows-latest python-version: '3.12' - run-tests-ext: bat - os: windows-latest python-version: pypy-3.9 - run-tests-ext: bat steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Set up Python ${{ matrix.python-version }} uses: actions/setup-python@v4 with: python-version: ${{ matrix.python-version }} - - name: Install pytest - run: pip install pytest + - name: Install test requirements + run: pip install pytest -r requirements.txt - name: Run tests continue-on-error: False run: | python3 -m yt_dlp -v || true # Print debug head - ./devscripts/run_tests.${{ matrix.run-tests-ext }} core + python3 ./devscripts/run_tests.py core diff --git a/.github/workflows/download.yml b/.github/workflows/download.yml index c3478721c..9f47d6718 100644 --- a/.github/workflows/download.yml +++ b/.github/workflows/download.yml @@ -9,16 +9,16 @@ jobs: if: "contains(github.event.head_commit.message, 'ci run dl')" runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Set up Python uses: actions/setup-python@v4 with: python-version: 3.9 - name: Install test requirements - run: pip install pytest + run: pip install pytest -r requirements.txt - name: Run tests continue-on-error: true - run: ./devscripts/run_tests.sh download + run: python3 ./devscripts/run_tests.py download full: name: Full Download Tests @@ -28,24 +28,21 @@ jobs: fail-fast: true matrix: os: [ubuntu-latest] - python-version: ['3.7', '3.10', '3.12', pypy-3.7, pypy-3.8, pypy-3.10] - run-tests-ext: [sh] + python-version: ['3.10', '3.11', '3.12', pypy-3.8, pypy-3.10] include: # atleast one of each CPython/PyPy tests must be in windows - os: windows-latest python-version: '3.8' - run-tests-ext: bat - os: windows-latest python-version: pypy-3.9 - run-tests-ext: bat steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Set up Python ${{ matrix.python-version }} uses: actions/setup-python@v4 with: python-version: ${{ matrix.python-version }} - - name: Install pytest - run: pip install pytest + - name: Install test requirements + run: pip install pytest -r requirements.txt - name: Run tests continue-on-error: true - run: 
./devscripts/run_tests.${{ matrix.run-tests-ext }} download + run: python3 ./devscripts/run_tests.py download diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml deleted file mode 100644 index 9ebf54e7f..000000000 --- a/.github/workflows/publish.yml +++ /dev/null @@ -1,97 +0,0 @@ -name: Publish -on: - workflow_call: - inputs: - channel: - default: stable - required: true - type: string - version: - required: true - type: string - target_commitish: - required: true - type: string - prerelease: - default: false - required: true - type: boolean - secrets: - ARCHIVE_REPO_TOKEN: - required: false - -permissions: - contents: write - -jobs: - publish: - runs-on: ubuntu-latest - - steps: - - uses: actions/checkout@v3 - with: - fetch-depth: 0 - - uses: actions/download-artifact@v3 - - uses: actions/setup-python@v4 - with: - python-version: "3.10" - - - name: Generate release notes - run: | - printf '%s' \ - '[![Installation](https://img.shields.io/badge/-Which%20file%20should%20I%20download%3F-white.svg?style=for-the-badge)]' \ - '(https://github.com/yt-dlp/yt-dlp#installation "Installation instructions") ' \ - '[![Documentation](https://img.shields.io/badge/-Docs-brightgreen.svg?style=for-the-badge&logo=GitBook&labelColor=555555)]' \ - '(https://github.com/yt-dlp/yt-dlp/tree/2023.03.04#readme "Documentation") ' \ - '[![Donate](https://img.shields.io/badge/_-Donate-red.svg?logo=githubsponsors&labelColor=555555&style=for-the-badge)]' \ - '(https://github.com/yt-dlp/yt-dlp/blob/master/Collaborators.md#collaborators "Donate") ' \ - '[![Discord](https://img.shields.io/discord/807245652072857610?color=blue&labelColor=555555&label=&logo=discord&style=for-the-badge)]' \ - '(https://discord.gg/H5MNcFW63r "Discord") ' \ - ${{ inputs.channel != 'nightly' && '"[![Nightly](https://img.shields.io/badge/Get%20nightly%20builds-purple.svg?style=for-the-badge)]" \ - "(https://github.com/yt-dlp/yt-dlp-nightly-builds/releases/latest \"Nightly builds\")"' || '' }} \ - > ./RELEASE_NOTES - printf '\n\n' >> ./RELEASE_NOTES - cat >> ./RELEASE_NOTES << EOF - #### A description of the various files are in the [README](https://github.com/yt-dlp/yt-dlp#release-files) - --- - $(python ./devscripts/make_changelog.py -vv --collapsible) - EOF - printf '%s\n\n' '**This is an automated nightly pre-release build**' >> ./NIGHTLY_NOTES - cat ./RELEASE_NOTES >> ./NIGHTLY_NOTES - printf '%s\n\n' 'Generated from: https://github.com/${{ github.repository }}/commit/${{ inputs.target_commitish }}' >> ./ARCHIVE_NOTES - cat ./RELEASE_NOTES >> ./ARCHIVE_NOTES - - - name: Archive nightly release - env: - GH_TOKEN: ${{ secrets.ARCHIVE_REPO_TOKEN }} - GH_REPO: ${{ vars.ARCHIVE_REPO }} - if: | - inputs.channel == 'nightly' && env.GH_TOKEN != '' && env.GH_REPO != '' - run: | - gh release create \ - --notes-file ARCHIVE_NOTES \ - --title "yt-dlp nightly ${{ inputs.version }}" \ - ${{ inputs.version }} \ - artifact/* - - - name: Prune old nightly release - if: inputs.channel == 'nightly' && !vars.ARCHIVE_REPO - env: - GH_TOKEN: ${{ github.token }} - run: | - gh release delete --yes --cleanup-tag "nightly" || true - git tag --delete "nightly" || true - sleep 5 # Enough time to cover deletion race condition - - - name: Publish release${{ inputs.channel == 'nightly' && ' (nightly)' || '' }} - env: - GH_TOKEN: ${{ github.token }} - if: (inputs.channel == 'nightly' && !vars.ARCHIVE_REPO) || inputs.channel != 'nightly' - run: | - gh release create \ - --notes-file ${{ inputs.channel == 'nightly' && 'NIGHTLY_NOTES' || 'RELEASE_NOTES' 
}} \ - --target ${{ inputs.target_commitish }} \ - --title "yt-dlp ${{ inputs.channel == 'nightly' && 'nightly ' || '' }}${{ inputs.version }}" \ - ${{ inputs.prerelease && '--prerelease' || '' }} \ - ${{ inputs.channel == 'nightly' && '"nightly"' || inputs.version }} \ - artifact/* diff --git a/.github/workflows/quick-test.yml b/.github/workflows/quick-test.yml index 930e58152..84fca62d4 100644 --- a/.github/workflows/quick-test.yml +++ b/.github/workflows/quick-test.yml @@ -9,23 +9,23 @@ jobs: if: "!contains(github.event.head_commit.message, 'ci skip all')" runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 - - name: Set up Python 3.11 + - uses: actions/checkout@v4 + - name: Set up Python 3.8 uses: actions/setup-python@v4 with: - python-version: '3.11' + python-version: '3.8' - name: Install test requirements - run: pip install pytest pycryptodomex + run: pip install pytest -r requirements.txt - name: Run tests run: | python3 -m yt_dlp -v || true - ./devscripts/run_tests.sh core + python3 ./devscripts/run_tests.py core flake8: name: Linter if: "!contains(github.event.head_commit.message, 'ci skip all')" runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - uses: actions/setup-python@v4 - name: Install flake8 run: pip install flake8 diff --git a/.github/workflows/release-master.yml b/.github/workflows/release-master.yml new file mode 100644 index 000000000..0664137a9 --- /dev/null +++ b/.github/workflows/release-master.yml @@ -0,0 +1,27 @@ +name: Release (master) +on: + push: + branches: + - master + paths: + - "yt_dlp/**.py" + - "!yt_dlp/version.py" + - "setup.py" + - "pyinst.py" +concurrency: + group: release-master +permissions: + contents: read + +jobs: + release: + if: vars.BUILD_MASTER != '' + uses: ./.github/workflows/release.yml + with: + prerelease: true + source: master + permissions: + contents: write + packages: write + id-token: write # mandatory for trusted publishing + secrets: inherit diff --git a/.github/workflows/release-nightly.yml b/.github/workflows/release-nightly.yml index 543e2e6f7..2e623a67c 100644 --- a/.github/workflows/release-nightly.yml +++ b/.github/workflows/release-nightly.yml @@ -1,52 +1,35 @@ name: Release (nightly) on: - push: - branches: - - master - paths: - - "yt_dlp/**.py" - - "!yt_dlp/version.py" -concurrency: - group: release-nightly - cancel-in-progress: true + schedule: + - cron: '23 23 * * *' permissions: contents: read jobs: - prepare: + check_nightly: if: vars.BUILD_NIGHTLY != '' runs-on: ubuntu-latest outputs: - version: ${{ steps.get_version.outputs.version }} - + commit: ${{ steps.check_for_new_commits.outputs.commit }} steps: - - uses: actions/checkout@v3 - - name: Get version - id: get_version + - uses: actions/checkout@v4 + with: + fetch-depth: 0 + - name: Check for new commits + id: check_for_new_commits run: | - python devscripts/update-version.py "$(date -u +"%H%M%S")" | grep -Po "version=\d+(\.\d+){3}" >> "$GITHUB_OUTPUT" + relevant_files=("yt_dlp/*.py" ':!yt_dlp/version.py' "setup.py" "pyinst.py") + echo "commit=$(git log --format=%H -1 --since="24 hours ago" -- "${relevant_files[@]}")" | tee "$GITHUB_OUTPUT" - build: - needs: prepare - uses: ./.github/workflows/build.yml + release: + needs: [check_nightly] + if: ${{ needs.check_nightly.outputs.commit }} + uses: ./.github/workflows/release.yml with: - version: ${{ needs.prepare.outputs.version }} - channel: nightly - permissions: - contents: read - packages: write # For package cache - secrets: - GPG_SIGNING_KEY: ${{ 
secrets.GPG_SIGNING_KEY }} - - publish: - needs: [prepare, build] - uses: ./.github/workflows/publish.yml - secrets: - ARCHIVE_REPO_TOKEN: ${{ secrets.ARCHIVE_REPO_TOKEN }} + prerelease: true + source: nightly permissions: contents: write - with: - channel: nightly - prerelease: true - version: ${{ needs.prepare.outputs.version }} - target_commitish: ${{ github.sha }} + packages: write + id-token: write # mandatory for trusted publishing + secrets: inherit diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index ada508be8..69b5e3152 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -1,14 +1,45 @@ name: Release on: - workflow_dispatch: + workflow_call: inputs: - version: - description: Version tag (YYYY.MM.DD[.REV]) + prerelease: + required: false + default: true + type: boolean + source: required: false default: '' type: string - channel: - description: Update channel (stable/nightly/...) + target: + required: false + default: '' + type: string + version: + required: false + default: '' + type: string + workflow_dispatch: + inputs: + source: + description: | + SOURCE of this release's updates: + channel, repo, tag, or channel/repo@tag + (default: ) + required: false + default: '' + type: string + target: + description: | + TARGET to publish this release to: + channel, tag, or channel@tag + (default: if writable else [@source_tag]) + required: false + default: '' + type: string + version: + description: | + VERSION: yyyy.mm.dd[.rev] or rev + (default: auto-generated) required: false default: '' type: string @@ -26,12 +57,17 @@ jobs: contents: write runs-on: ubuntu-latest outputs: - channel: ${{ steps.set_channel.outputs.channel }} - version: ${{ steps.update_version.outputs.version }} + channel: ${{ steps.setup_variables.outputs.channel }} + version: ${{ steps.setup_variables.outputs.version }} + target_repo: ${{ steps.setup_variables.outputs.target_repo }} + target_repo_token: ${{ steps.setup_variables.outputs.target_repo_token }} + target_tag: ${{ steps.setup_variables.outputs.target_tag }} + pypi_project: ${{ steps.setup_variables.outputs.pypi_project }} + pypi_suffix: ${{ steps.setup_variables.outputs.pypi_suffix }} head_sha: ${{ steps.get_target.outputs.head_sha }} steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 with: fetch-depth: 0 @@ -39,25 +75,123 @@ jobs: with: python-version: "3.10" - - name: Set channel - id: set_channel + - name: Process inputs + id: process_inputs run: | - CHANNEL="${{ github.repository == 'yt-dlp/yt-dlp' && 'stable' || github.repository }}" - echo "channel=${{ inputs.channel || '$CHANNEL' }}" > "$GITHUB_OUTPUT" + cat << EOF + ::group::Inputs + prerelease=${{ inputs.prerelease }} + source=${{ inputs.source }} + target=${{ inputs.target }} + version=${{ inputs.version }} + ::endgroup:: + EOF + IFS='@' read -r source_repo source_tag <<<"${{ inputs.source }}" + IFS='@' read -r target_repo target_tag <<<"${{ inputs.target }}" + cat << EOF >> "$GITHUB_OUTPUT" + source_repo=${source_repo} + source_tag=${source_tag} + target_repo=${target_repo} + target_tag=${target_tag} + EOF - - name: Update version - id: update_version + - name: Setup variables + id: setup_variables + env: + source_repo: ${{ steps.process_inputs.outputs.source_repo }} + source_tag: ${{ steps.process_inputs.outputs.source_tag }} + target_repo: ${{ steps.process_inputs.outputs.target_repo }} + target_tag: ${{ steps.process_inputs.outputs.target_tag }} run: | - REVISION="${{ vars.PUSH_VERSION_COMMIT == '' && '$(date -u 
+"%H%M%S")' || '' }}" - REVISION="${{ inputs.prerelease && '$(date -u +"%H%M%S")' || '$REVISION' }}" - python devscripts/update-version.py ${{ inputs.version || '$REVISION' }} | \ - grep -Po "version=\d+\.\d+\.\d+(\.\d+)?" >> "$GITHUB_OUTPUT" + # unholy bash monstrosity (sincere apologies) + fallback_token () { + if ${{ !secrets.ARCHIVE_REPO_TOKEN }}; then + echo "::error::Repository access secret ${target_repo_token^^} not found" + exit 1 + fi + target_repo_token=ARCHIVE_REPO_TOKEN + return 0 + } + + source_is_channel=0 + [[ "${source_repo}" == 'stable' ]] && source_repo='yt-dlp/yt-dlp' + if [[ -z "${source_repo}" ]]; then + source_repo='${{ github.repository }}' + elif [[ '${{ vars[format('{0}_archive_repo', env.source_repo)] }}' ]]; then + source_is_channel=1 + source_channel='${{ vars[format('{0}_archive_repo', env.source_repo)] }}' + elif [[ -z "${source_tag}" && "${source_repo}" != */* ]]; then + source_tag="${source_repo}" + source_repo='${{ github.repository }}' + fi + resolved_source="${source_repo}" + if [[ "${source_tag}" ]]; then + resolved_source="${resolved_source}@${source_tag}" + elif [[ "${source_repo}" == 'yt-dlp/yt-dlp' ]]; then + resolved_source='stable' + fi + + revision="${{ (inputs.prerelease || !vars.PUSH_VERSION_COMMIT) && '$(date -u +"%H%M%S")' || '' }}" + version="$( + python devscripts/update-version.py \ + -c "${resolved_source}" -r "${{ github.repository }}" ${{ inputs.version || '$revision' }} | \ + grep -Po "version=\K\d+\.\d+\.\d+(\.\d+)?")" + + if [[ "${target_repo}" ]]; then + if [[ -z "${target_tag}" ]]; then + if [[ '${{ vars[format('{0}_archive_repo', env.target_repo)] }}' ]]; then + target_tag="${source_tag:-${version}}" + else + target_tag="${target_repo}" + target_repo='${{ github.repository }}' + fi + fi + if [[ "${target_repo}" != '${{ github.repository}}' ]]; then + target_repo='${{ vars[format('{0}_archive_repo', env.target_repo)] }}' + target_repo_token='${{ env.target_repo }}_archive_repo_token' + ${{ !!secrets[format('{0}_archive_repo_token', env.target_repo)] }} || fallback_token + pypi_project='${{ vars[format('{0}_pypi_project', env.target_repo)] }}' + pypi_suffix='${{ vars[format('{0}_pypi_suffix', env.target_repo)] }}' + fi + else + target_tag="${source_tag:-${version}}" + if ((source_is_channel)); then + target_repo="${source_channel}" + target_repo_token='${{ env.source_repo }}_archive_repo_token' + ${{ !!secrets[format('{0}_archive_repo_token', env.source_repo)] }} || fallback_token + pypi_project='${{ vars[format('{0}_pypi_project', env.source_repo)] }}' + pypi_suffix='${{ vars[format('{0}_pypi_suffix', env.source_repo)] }}' + else + target_repo='${{ github.repository }}' + fi + fi + + if [[ "${target_repo}" == '${{ github.repository }}' ]] && ${{ !inputs.prerelease }}; then + pypi_project='${{ vars.PYPI_PROJECT }}' + fi + + echo "::group::Output variables" + cat << EOF | tee -a "$GITHUB_OUTPUT" + channel=${resolved_source} + version=${version} + target_repo=${target_repo} + target_repo_token=${target_repo_token} + target_tag=${target_tag} + pypi_project=${pypi_project} + pypi_suffix=${pypi_suffix} + EOF + echo "::endgroup::" - name: Update documentation + env: + version: ${{ steps.setup_variables.outputs.version }} + target_repo: ${{ steps.setup_variables.outputs.target_repo }} + if: | + !inputs.prerelease && env.target_repo == github.repository run: | make doc sed '/### /Q' Changelog.md >> ./CHANGELOG - echo '### ${{ steps.update_version.outputs.version }}' >> ./CHANGELOG + echo '### ${{ env.version }}' >> ./CHANGELOG python 
./devscripts/make_changelog.py -vv -c >> ./CHANGELOG echo >> ./CHANGELOG grep -Poz '(?s)### \d+\.\d+\.\d+.+' 'Changelog.md' | head -n -1 >> ./CHANGELOG @@ -65,12 +199,16 @@ jobs: - name: Push to release id: push_release - if: ${{ !inputs.prerelease }} + env: + version: ${{ steps.setup_variables.outputs.version }} + target_repo: ${{ steps.setup_variables.outputs.target_repo }} + if: | + !inputs.prerelease && env.target_repo == github.repository run: | - git config --global user.name github-actions - git config --global user.email github-actions@example.com + git config --global user.name "github-actions[bot]" + git config --global user.email "41898282+github-actions[bot]@users.noreply.github.com" git add -u - git commit -m "Release ${{ steps.update_version.outputs.version }}" \ + git commit -m "Release ${{ env.version }}" \ -m "Created by: ${{ github.event.sender.login }}" -m ":ci skip all :ci run dl" git push origin --force ${{ github.event.ref }}:release @@ -80,7 +218,10 @@ jobs: echo "head_sha=$(git rev-parse HEAD)" >> "$GITHUB_OUTPUT" - name: Update master - if: vars.PUSH_VERSION_COMMIT != '' && !inputs.prerelease + env: + target_repo: ${{ steps.setup_variables.outputs.target_repo }} + if: | + vars.PUSH_VERSION_COMMIT != '' && !inputs.prerelease && env.target_repo == github.repository run: git push origin ${{ github.event.ref }} build: @@ -89,75 +230,148 @@ jobs: with: version: ${{ needs.prepare.outputs.version }} channel: ${{ needs.prepare.outputs.channel }} + origin: ${{ needs.prepare.outputs.target_repo }} permissions: contents: read packages: write # For package cache secrets: GPG_SIGNING_KEY: ${{ secrets.GPG_SIGNING_KEY }} - publish_pypi_homebrew: + publish_pypi: needs: [prepare, build] + if: ${{ needs.prepare.outputs.pypi_project }} runs-on: ubuntu-latest + permissions: + id-token: write # mandatory for trusted publishing steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - uses: actions/setup-python@v4 with: python-version: "3.10" - name: Install Requirements run: | - sudo apt-get -y install pandoc man + sudo apt -y install pandoc man python -m pip install -U pip setuptools wheel twine python -m pip install -U -r requirements.txt - name: Prepare - run: | - python devscripts/update-version.py ${{ needs.prepare.outputs.version }} - python devscripts/make_lazy_extractors.py - - - name: Build and publish on PyPI env: - TWINE_USERNAME: __token__ - TWINE_PASSWORD: ${{ secrets.PYPI_TOKEN }} - if: env.TWINE_PASSWORD != '' && !inputs.prerelease + version: ${{ needs.prepare.outputs.version }} + suffix: ${{ needs.prepare.outputs.pypi_suffix }} + channel: ${{ needs.prepare.outputs.channel }} + target_repo: ${{ needs.prepare.outputs.target_repo }} + pypi_project: ${{ needs.prepare.outputs.pypi_project }} + run: | + python devscripts/update-version.py -c "${{ env.channel }}" -r "${{ env.target_repo }}" -s "${{ env.suffix }}" "${{ env.version }}" + python devscripts/make_lazy_extractors.py + sed -i -E "s/(name=')[^']+(', # package name)/\1${{ env.pypi_project }}\2/" setup.py + + - name: Build run: | rm -rf dist/* make pypi-files python devscripts/set-variant.py pip -M "You installed yt-dlp with pip or using the wheel from PyPi; Use that to update" python setup.py sdist bdist_wheel - twine upload dist/* - - name: Checkout Homebrew repository - env: - BREW_TOKEN: ${{ secrets.BREW_TOKEN }} - PYPI_TOKEN: ${{ secrets.PYPI_TOKEN }} - if: env.BREW_TOKEN != '' && env.PYPI_TOKEN != '' && !inputs.prerelease - uses: actions/checkout@v3 + - name: Publish to PyPI + uses: 
pypa/gh-action-pypi-publish@release/v1 with: - repository: yt-dlp/homebrew-taps - path: taps - ssh-key: ${{ secrets.BREW_TOKEN }} - - - name: Update Homebrew Formulae - env: - BREW_TOKEN: ${{ secrets.BREW_TOKEN }} - PYPI_TOKEN: ${{ secrets.PYPI_TOKEN }} - if: env.BREW_TOKEN != '' && env.PYPI_TOKEN != '' && !inputs.prerelease - run: | - python devscripts/update-formulae.py taps/Formula/yt-dlp.rb "${{ needs.prepare.outputs.version }}" - git -C taps/ config user.name github-actions - git -C taps/ config user.email github-actions@example.com - git -C taps/ commit -am 'yt-dlp: ${{ needs.prepare.outputs.version }}' - git -C taps/ push + verbose: true publish: needs: [prepare, build] - uses: ./.github/workflows/publish.yml permissions: contents: write - with: - channel: ${{ needs.prepare.outputs.channel }} - prerelease: ${{ inputs.prerelease }} - version: ${{ needs.prepare.outputs.version }} - target_commitish: ${{ needs.prepare.outputs.head_sha }} + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v4 + with: + fetch-depth: 0 + - uses: actions/download-artifact@v3 + - uses: actions/setup-python@v4 + with: + python-version: "3.10" + + - name: Generate release notes + env: + head_sha: ${{ needs.prepare.outputs.head_sha }} + target_repo: ${{ needs.prepare.outputs.target_repo }} + target_tag: ${{ needs.prepare.outputs.target_tag }} + run: | + printf '%s' \ + '[![Installation](https://img.shields.io/badge/-Which%20file%20should%20I%20download%3F-white.svg?style=for-the-badge)]' \ + '(https://github.com/${{ github.repository }}#installation "Installation instructions") ' \ + '[![Documentation](https://img.shields.io/badge/-Docs-brightgreen.svg?style=for-the-badge&logo=GitBook&labelColor=555555)]' \ + '(https://github.com/${{ github.repository }}' \ + '${{ env.target_repo == github.repository && format('/tree/{0}', env.target_tag) || '' }}#readme "Documentation") ' \ + '[![Donate](https://img.shields.io/badge/_-Donate-red.svg?logo=githubsponsors&labelColor=555555&style=for-the-badge)]' \ + '(https://github.com/yt-dlp/yt-dlp/blob/master/Collaborators.md#collaborators "Donate") ' \ + '[![Discord](https://img.shields.io/discord/807245652072857610?color=blue&labelColor=555555&label=&logo=discord&style=for-the-badge)]' \ + '(https://discord.gg/H5MNcFW63r "Discord") ' \ + ${{ env.target_repo == 'yt-dlp/yt-dlp' && '\ + "[![Nightly](https://img.shields.io/badge/Get%20nightly%20builds-purple.svg?style=for-the-badge)]" \ + "(https://github.com/yt-dlp/yt-dlp-nightly-builds/releases/latest \"Nightly builds\") " \ + "[![Master](https://img.shields.io/badge/Get%20master%20builds-lightblue.svg?style=for-the-badge)]" \ + "(https://github.com/yt-dlp/yt-dlp-master-builds/releases/latest \"Master builds\")"' || '' }} > ./RELEASE_NOTES + printf '\n\n' >> ./RELEASE_NOTES + cat >> ./RELEASE_NOTES << EOF + #### A description of the various files are in the [README](https://github.com/${{ github.repository }}#release-files) + --- + $(python ./devscripts/make_changelog.py -vv --collapsible) + EOF + printf '%s\n\n' '**This is a pre-release build**' >> ./PRERELEASE_NOTES + cat ./RELEASE_NOTES >> ./PRERELEASE_NOTES + printf '%s\n\n' 'Generated from: https://github.com/${{ github.repository }}/commit/${{ env.head_sha }}' >> ./ARCHIVE_NOTES + cat ./RELEASE_NOTES >> ./ARCHIVE_NOTES + + - name: Publish to archive repo + env: + GH_TOKEN: ${{ secrets[needs.prepare.outputs.target_repo_token] }} + GH_REPO: ${{ needs.prepare.outputs.target_repo }} + version: ${{ needs.prepare.outputs.version }} + channel: ${{ 
needs.prepare.outputs.channel }} + if: | + inputs.prerelease && env.GH_TOKEN != '' && env.GH_REPO != '' && env.GH_REPO != github.repository + run: | + title="${{ startswith(env.GH_REPO, 'yt-dlp/') && 'yt-dlp ' || '' }}${{ env.channel }}" + gh release create \ + --notes-file ARCHIVE_NOTES \ + --title "${title} ${{ env.version }}" \ + ${{ env.version }} \ + artifact/* + + - name: Prune old release + env: + GH_TOKEN: ${{ github.token }} + version: ${{ needs.prepare.outputs.version }} + target_repo: ${{ needs.prepare.outputs.target_repo }} + target_tag: ${{ needs.prepare.outputs.target_tag }} + if: | + env.target_repo == github.repository && env.target_tag != env.version + run: | + gh release delete --yes --cleanup-tag "${{ env.target_tag }}" || true + git tag --delete "${{ env.target_tag }}" || true + sleep 5 # Enough time to cover deletion race condition + + - name: Publish release + env: + GH_TOKEN: ${{ github.token }} + version: ${{ needs.prepare.outputs.version }} + target_repo: ${{ needs.prepare.outputs.target_repo }} + target_tag: ${{ needs.prepare.outputs.target_tag }} + head_sha: ${{ needs.prepare.outputs.head_sha }} + if: | + env.target_repo == github.repository + run: | + title="${{ github.repository == 'yt-dlp/yt-dlp' && 'yt-dlp ' || '' }}" + title+="${{ env.target_tag != env.version && format('{0} ', env.target_tag) || '' }}" + gh release create \ + --notes-file ${{ inputs.prerelease && 'PRERELEASE_NOTES' || 'RELEASE_NOTES' }} \ + --target ${{ env.head_sha }} \ + --title "${title}${{ env.version }}" \ + ${{ inputs.prerelease && '--prerelease' || '' }} \ + ${{ env.target_tag }} \ + artifact/* diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 90e7faf7c..248917bf5 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -140,12 +140,9 @@ # DEVELOPER INSTRUCTIONS python -m yt_dlp -To run the test, simply invoke your favorite test runner, or execute a test file directly; any of the following work: +To run all the available core tests, use: - python -m unittest discover - python test/test_download.py - nosetests - pytest + python devscripts/run_tests.py See item 6 of [new extractor tutorial](#adding-support-for-a-new-site) for how to run extractor specific test cases. @@ -187,15 +184,21 @@ ## Adding support for a new site 'url': 'https://yourextractor.com/watch/42', 'md5': 'TODO: md5 sum of the first 10241 bytes of the video file (use --test)', 'info_dict': { + # For videos, only the 'id' and 'ext' fields are required to RUN the test: 'id': '42', 'ext': 'mp4', - 'title': 'Video title goes here', - 'thumbnail': r're:^https?://.*\.jpg$', - # TODO more properties, either as: - # * A value - # * MD5 checksum; start the string with md5: - # * A regular expression; start the string with re: - # * Any Python type, e.g. int or float + # Then if the test run fails, it will output the missing/incorrect fields. + # Properties can be added as: + # * A value, e.g. + # 'title': 'Video title goes here', + # * MD5 checksum; start the string with 'md5:', e.g. + # 'description': 'md5:098f6bcd4621d373cade4e832627b4f6', + # * A regular expression; start the string with 're:', e.g. + # 'thumbnail': r're:^https?://.*\.jpg$', + # * A count of elements in a list; start the string with 'count:', e.g. + # 'tags': 'count:10', + # * Any Python type, e.g. + # 'view_count': int, } }] @@ -215,14 +218,14 @@ ## Adding support for a new site } ``` 1. Add an import in [`yt_dlp/extractor/_extractors.py`](yt_dlp/extractor/_extractors.py). Note that the class name must end with `IE`. -1. 
Run `python test/test_download.py TestDownload.test_YourExtractor` (note that `YourExtractor` doesn't end with `IE`). This *should fail* at first, but you can continually re-run it until you're done. If you decide to add more than one test, the tests will then be named `TestDownload.test_YourExtractor`, `TestDownload.test_YourExtractor_1`, `TestDownload.test_YourExtractor_2`, etc. Note that tests with `only_matching` key in test's dict are not counted in. You can also run all the tests in one go with `TestDownload.test_YourExtractor_all` -1. Make sure you have atleast one test for your extractor. Even if all videos covered by the extractor are expected to be inaccessible for automated testing, tests should still be added with a `skip` parameter indicating why the particular test is disabled from running. +1. Run `python devscripts/run_tests.py YourExtractor`. This *may fail* at first, but you can continually re-run it until you're done. Upon failure, it will output the missing fields and/or correct values which you can copy. If you decide to add more than one test, the tests will then be named `YourExtractor`, `YourExtractor_1`, `YourExtractor_2`, etc. Note that tests with an `only_matching` key in the test's dict are not included in the count. You can also run all the tests in one go with `YourExtractor_all` +1. Make sure you have at least one test for your extractor. Even if all videos covered by the extractor are expected to be inaccessible for automated testing, tests should still be added with a `skip` parameter indicating why the particular test is disabled from running. 1. Have a look at [`yt_dlp/extractor/common.py`](yt_dlp/extractor/common.py) for possible helper methods and a [detailed description of what your extractor should and may return](yt_dlp/extractor/common.py#L119-L440). Add tests and code for as many as you want. 1. Make sure your code follows [yt-dlp coding conventions](#yt-dlp-coding-conventions) and check the code with [flake8](https://flake8.pycqa.org/en/latest/index.html#quickstart): $ flake8 yt_dlp/extractor/yourextractor.py -1. Make sure your code works under all [Python](https://www.python.org/) versions supported by yt-dlp, namely CPython and PyPy for Python 3.7 and above. Backward compatibility is not required for even older versions of Python. +1. Make sure your code works under all [Python](https://www.python.org/) versions supported by yt-dlp, namely CPython and PyPy for Python 3.8 and above. Backward compatibility is not required for even older versions of Python. 1. 
When the tests pass, [add](https://git-scm.com/docs/git-add) the new files, [commit](https://git-scm.com/docs/git-commit) them and [push](https://git-scm.com/docs/git-push) the result, like this: $ git add yt_dlp/extractor/_extractors.py diff --git a/CONTRIBUTORS b/CONTRIBUTORS index 8eda41307..adcc92144 100644 --- a/CONTRIBUTORS +++ b/CONTRIBUTORS @@ -509,3 +509,36 @@ handlerug jiru madewokherd xofe +awalgarg +midnightveil +naginatana +Riteo +1100101 +aniolpages +bartbroere +CrendKing +Esokrates +HitomaruKonpaku +LoserFox +peci1 +saintliao +shubhexists +SirElderling +almx +elivinsky +starius +TravisDupes +amir16yp +Fymyte +Ganesh910 +hashFactory +kclauhk +Kyraminol +lstrojny +middlingphys +NickCis +nicodato +prettykool +S-Aarab +sonmezberkay +TSRBerry diff --git a/Changelog.md b/Changelog.md index 48dcbf102..30de9072e 100644 --- a/Changelog.md +++ b/Changelog.md @@ -4,6 +4,202 @@ # Changelog # To create a release, dispatch the https://github.com/yt-dlp/yt-dlp/actions/workflows/release.yml workflow on master --> +### 2023.12.30 + +#### Core changes +- [Fix format selection parse error for CPython 3.12](https://github.com/yt-dlp/yt-dlp/commit/00cdda4f6fe18712ced13dbc64b7ea10f323e268) ([#8797](https://github.com/yt-dlp/yt-dlp/issues/8797)) by [Grub4K](https://github.com/Grub4K) +- [Let `read_stdin` obey `--quiet`](https://github.com/yt-dlp/yt-dlp/commit/a174c453ee1e853c584ceadeac17eef2bd433dc5) by [pukkandan](https://github.com/pukkandan) +- [Merged with youtube-dl be008e6](https://github.com/yt-dlp/yt-dlp/commit/65de7d204ce88c0225df1321060304baab85dbd8) by [bashonly](https://github.com/bashonly), [dirkf](https://github.com/dirkf), [Grub4K](https://github.com/Grub4K) +- [Parse `release_year` from `release_date`](https://github.com/yt-dlp/yt-dlp/commit/1732eccc0a40256e076bf0435a29f0f1d8419280) ([#8524](https://github.com/yt-dlp/yt-dlp/issues/8524)) by [seproDev](https://github.com/seproDev) +- [Release workflow and Updater cleanup](https://github.com/yt-dlp/yt-dlp/commit/632b8ee54eb2df8ac6e20746a0bd95b7ebb053aa) ([#8640](https://github.com/yt-dlp/yt-dlp/issues/8640)) by [bashonly](https://github.com/bashonly) +- [Remove Python 3.7 support](https://github.com/yt-dlp/yt-dlp/commit/f4b95acafcd69a50040730dfdf732e797278fdcc) ([#8361](https://github.com/yt-dlp/yt-dlp/issues/8361)) by [bashonly](https://github.com/bashonly) +- [Support `NO_COLOR` environment variable](https://github.com/yt-dlp/yt-dlp/commit/a0b19d319a6ce8b7059318fa17a34b144fde1785) ([#8385](https://github.com/yt-dlp/yt-dlp/issues/8385)) by [Grub4K](https://github.com/Grub4K), [prettykool](https://github.com/prettykool) +- **outtmpl**: [Support multiplication](https://github.com/yt-dlp/yt-dlp/commit/993edd3f6e17e966c763bc86dc34125445cec6b6) by [pukkandan](https://github.com/pukkandan) +- **utils**: `traverse_obj`: [Move `is_user_input` into output template](https://github.com/yt-dlp/yt-dlp/commit/0b6f829b1dfda15d3c1d7d1fbe4ea6102c26dd24) ([#8673](https://github.com/yt-dlp/yt-dlp/issues/8673)) by [Grub4K](https://github.com/Grub4K) +- **webvtt**: [Allow spaces before newlines for CueBlock](https://github.com/yt-dlp/yt-dlp/commit/15f22b4880b6b3f71f350c64d70976ae65b9f1ca) ([#7681](https://github.com/yt-dlp/yt-dlp/issues/7681)) by [TSRBerry](https://github.com/TSRBerry) (With fixes in [298230e](https://github.com/yt-dlp/yt-dlp/commit/298230e550886b746c266724dd701d842ca2696e) by [pukkandan](https://github.com/pukkandan)) + +#### Extractor changes +- [Add `media_type` 
field](https://github.com/yt-dlp/yt-dlp/commit/e370f9ec36972d06100a3db893b397bfc1b07b4d) by [trainman261](https://github.com/trainman261) +- [Extract from `media` elements in SMIL manifests](https://github.com/yt-dlp/yt-dlp/commit/ddb2d7588bea48bae965dbfabe6df6550c9d3d43) ([#8504](https://github.com/yt-dlp/yt-dlp/issues/8504)) by [seproDev](https://github.com/seproDev) +- **abematv**: [Fix season metadata](https://github.com/yt-dlp/yt-dlp/commit/cc07f5cc85d9e2a6cd0bedb9d961665eea0d6047) ([#8607](https://github.com/yt-dlp/yt-dlp/issues/8607)) by [middlingphys](https://github.com/middlingphys) +- **allstar**: [Add extractors](https://github.com/yt-dlp/yt-dlp/commit/3237f8ba29fe13bf95ff42b1e48b5b5109715feb) ([#8274](https://github.com/yt-dlp/yt-dlp/issues/8274)) by [S-Aarab](https://github.com/S-Aarab) +- **altcensored**: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/3f90813f0617e0d21302398010de7496c9ae36aa) ([#8291](https://github.com/yt-dlp/yt-dlp/issues/8291)) by [drzraf](https://github.com/drzraf) +- **ard**: [Overhaul extractors](https://github.com/yt-dlp/yt-dlp/commit/5f009a094f0e8450792b097c4c8273622778052d) ([#8878](https://github.com/yt-dlp/yt-dlp/issues/8878)) by [seproDev](https://github.com/seproDev) +- **ardbetamediathek**: [Fix series extraction](https://github.com/yt-dlp/yt-dlp/commit/1f8bd8eba82ba10ddb49ee7cc0be4540dab103d5) ([#8687](https://github.com/yt-dlp/yt-dlp/issues/8687)) by [lstrojny](https://github.com/lstrojny) +- **bbc** + - [Extract more formats](https://github.com/yt-dlp/yt-dlp/commit/c919b68f7e79ea5010f75f648d3c9e45405a8011) ([#8321](https://github.com/yt-dlp/yt-dlp/issues/8321)) by [barsnick](https://github.com/barsnick), [dirkf](https://github.com/dirkf) + - [Fix JSON parsing bug](https://github.com/yt-dlp/yt-dlp/commit/19741ab8a401ec64d5e84fdbfcfb141d105e7bc8) by [bashonly](https://github.com/bashonly) +- **bfmtv**: [Fix extractors](https://github.com/yt-dlp/yt-dlp/commit/4903f452b68efb62dadf22e81be8c7934fc743e7) ([#8651](https://github.com/yt-dlp/yt-dlp/issues/8651)) by [bashonly](https://github.com/bashonly) +- **bilibili**: [Support courses and interactive videos](https://github.com/yt-dlp/yt-dlp/commit/9f09bdcfcb8e2b4b2decdc30d35d34b993bc7a94) ([#8343](https://github.com/yt-dlp/yt-dlp/issues/8343)) by [c-basalt](https://github.com/c-basalt) +- **bitchute**: [Fix and improve metadata extraction](https://github.com/yt-dlp/yt-dlp/commit/b1a1ec1540605d2ea7abdb63336ffb1c56bf6316) ([#8507](https://github.com/yt-dlp/yt-dlp/issues/8507)) by [SirElderling](https://github.com/SirElderling) +- **box**: [Fix formats extraction](https://github.com/yt-dlp/yt-dlp/commit/5a230233d6fce06f4abd1fce0dc92b948e6f780b) ([#8649](https://github.com/yt-dlp/yt-dlp/issues/8649)) by [bashonly](https://github.com/bashonly) +- **bundestag**: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/00a3e47bf5440c96025a76e08337ff2a475ed83e) ([#8783](https://github.com/yt-dlp/yt-dlp/issues/8783)) by [Grub4K](https://github.com/Grub4K) +- **drtv**: [Set default ext for m3u8 formats](https://github.com/yt-dlp/yt-dlp/commit/f96ab86cd837b1b5823baa87d144e15322ee9298) ([#8590](https://github.com/yt-dlp/yt-dlp/issues/8590)) by [seproDev](https://github.com/seproDev) +- **duoplay**: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/66a0127d45033c698bdbedf162cddc55d9e7b906) ([#8542](https://github.com/yt-dlp/yt-dlp/issues/8542)) by [glensc](https://github.com/glensc) +- **eplus**: [Add login support and DRM 
detection](https://github.com/yt-dlp/yt-dlp/commit/d5d1517e7d838500800d193ac3234b06e89654cd) ([#8661](https://github.com/yt-dlp/yt-dlp/issues/8661)) by [pzhlkj6612](https://github.com/pzhlkj6612) +- **facebook** + - [Fix Memories extraction](https://github.com/yt-dlp/yt-dlp/commit/c39358a54bc6675ae0c50b81024e5a086e41656a) ([#8681](https://github.com/yt-dlp/yt-dlp/issues/8681)) by [kclauhk](https://github.com/kclauhk) + - [Improve subtitles extraction](https://github.com/yt-dlp/yt-dlp/commit/9cafb9ff17e14475a35c9a58b5bb010c86c9db4b) ([#8296](https://github.com/yt-dlp/yt-dlp/issues/8296)) by [kclauhk](https://github.com/kclauhk) +- **floatplane**: [Add extractors](https://github.com/yt-dlp/yt-dlp/commit/628fa244bbce2ad39775a5959e99588f30cac152) ([#8639](https://github.com/yt-dlp/yt-dlp/issues/8639)) by [seproDev](https://github.com/seproDev) +- **francetv**: [Improve metadata extraction](https://github.com/yt-dlp/yt-dlp/commit/71f28097fec1c9e029f74b68a4eadc8915399840) ([#8409](https://github.com/yt-dlp/yt-dlp/issues/8409)) by [Fymyte](https://github.com/Fymyte) +- **instagram**: [Fix stories extraction](https://github.com/yt-dlp/yt-dlp/commit/50eaea9fd7787546b53660e736325fa31c77765d) ([#8843](https://github.com/yt-dlp/yt-dlp/issues/8843)) by [bashonly](https://github.com/bashonly) +- **joqrag**: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/db8b4edc7d0bd27da462f6fe82ff6e13e3d68a04) ([#8384](https://github.com/yt-dlp/yt-dlp/issues/8384)) by [pzhlkj6612](https://github.com/pzhlkj6612) +- **litv**: [Fix premium content extraction](https://github.com/yt-dlp/yt-dlp/commit/f45c4efcd928a173e1300a8f1ce4258e70c969b1) ([#8842](https://github.com/yt-dlp/yt-dlp/issues/8842)) by [bashonly](https://github.com/bashonly) +- **maariv**: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/c5f01bf7d4b9426c87c3f8248de23934a56579e0) ([#8331](https://github.com/yt-dlp/yt-dlp/issues/8331)) by [amir16yp](https://github.com/amir16yp) +- **mediastream**: [Fix authenticated format extraction](https://github.com/yt-dlp/yt-dlp/commit/b03c89309eb141be1a1eceeeb7475dd3b7529ad9) ([#8657](https://github.com/yt-dlp/yt-dlp/issues/8657)) by [NickCis](https://github.com/NickCis) +- **nebula**: [Overhaul extractors](https://github.com/yt-dlp/yt-dlp/commit/45d82be65f71bb05506bd55376c6fdb36bc54142) ([#8566](https://github.com/yt-dlp/yt-dlp/issues/8566)) by [elyse0](https://github.com/elyse0), [pukkandan](https://github.com/pukkandan), [seproDev](https://github.com/seproDev) +- **nintendo**: [Fix Nintendo Direct extraction](https://github.com/yt-dlp/yt-dlp/commit/1d24da6c899ef280d8b0a48a5e280ecd5d39cdf4) ([#8609](https://github.com/yt-dlp/yt-dlp/issues/8609)) by [Grub4K](https://github.com/Grub4K) +- **ondemandkorea**: [Fix upgraded format extraction](https://github.com/yt-dlp/yt-dlp/commit/04a5e06350e3ef7c03f94f2f3f90dd96c6411152) ([#8677](https://github.com/yt-dlp/yt-dlp/issues/8677)) by [seproDev](https://github.com/seproDev) +- **pr0gramm**: [Support variant formats and subtitles](https://github.com/yt-dlp/yt-dlp/commit/f98a3305eb124a0c375d03209d5c5a64fe1766c8) ([#8674](https://github.com/yt-dlp/yt-dlp/issues/8674)) by [Grub4K](https://github.com/Grub4K) +- **rinsefm**: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/c91af948e43570025e4aa887e248fd025abae394) ([#8778](https://github.com/yt-dlp/yt-dlp/issues/8778)) by [hashFactory](https://github.com/hashFactory) +- **rudovideo**: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/0d531c35eca4c2eb36e160530a7a333edbc727cc) 
([#8664](https://github.com/yt-dlp/yt-dlp/issues/8664)) by [nicodato](https://github.com/nicodato) +- **theguardian**: [Add extractors](https://github.com/yt-dlp/yt-dlp/commit/1fa3f24d4b5d22176b11d78420f1f4b64a5af0a8) ([#8535](https://github.com/yt-dlp/yt-dlp/issues/8535)) by [SirElderling](https://github.com/SirElderling) +- **theplatform**: [Extract more metadata](https://github.com/yt-dlp/yt-dlp/commit/7e09c147fdccb44806bbf601573adc4b77210a89) ([#8635](https://github.com/yt-dlp/yt-dlp/issues/8635)) by [trainman261](https://github.com/trainman261) +- **twitcasting**: [Detect livestreams via API and `show` page](https://github.com/yt-dlp/yt-dlp/commit/585d0ed9abcfcb957f2b2684b8ad43c3af160383) ([#8601](https://github.com/yt-dlp/yt-dlp/issues/8601)) by [bashonly](https://github.com/bashonly), [JC-Chung](https://github.com/JC-Chung) +- **twitcastinguser**: [Fix extraction](https://github.com/yt-dlp/yt-dlp/commit/ff2fde1b8f922fd34bae6172602008cd67c07c93) ([#8650](https://github.com/yt-dlp/yt-dlp/issues/8650)) by [bashonly](https://github.com/bashonly) +- **twitter** + - [Extract stale tweets](https://github.com/yt-dlp/yt-dlp/commit/1c54a98e19d047e7c15184237b6ef8ad50af489c) ([#8724](https://github.com/yt-dlp/yt-dlp/issues/8724)) by [bashonly](https://github.com/bashonly) + - [Prioritize m3u8 formats](https://github.com/yt-dlp/yt-dlp/commit/e7d22348e77367740da78a3db27167ecf894b7c9) ([#8826](https://github.com/yt-dlp/yt-dlp/issues/8826)) by [bashonly](https://github.com/bashonly) + - [Work around API rate-limit](https://github.com/yt-dlp/yt-dlp/commit/116c268438ea4d3738f6fa502c169081ca8f0ee7) ([#8825](https://github.com/yt-dlp/yt-dlp/issues/8825)) by [bashonly](https://github.com/bashonly) + - broadcast: [Extract `concurrent_view_count`](https://github.com/yt-dlp/yt-dlp/commit/6fe82491ed622b948c512cf4aab46ac3a234ae0a) ([#8600](https://github.com/yt-dlp/yt-dlp/issues/8600)) by [sonmezberkay](https://github.com/sonmezberkay) +- **vidly**: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/34df1c1f60fa652c0a6a5c712b06c10e45daf6b7) ([#8612](https://github.com/yt-dlp/yt-dlp/issues/8612)) by [seproDev](https://github.com/seproDev) +- **vocaroo**: [Do not use deprecated `getheader`](https://github.com/yt-dlp/yt-dlp/commit/f223b1b0789f65e06619dcc9fc9e74f50d259379) ([#8606](https://github.com/yt-dlp/yt-dlp/issues/8606)) by [qbnu](https://github.com/qbnu) +- **vvvvid**: [Set user-agent to fix extraction](https://github.com/yt-dlp/yt-dlp/commit/1725e943b0e8a8b585305660d4611e684374409c) ([#8615](https://github.com/yt-dlp/yt-dlp/issues/8615)) by [Kyraminol](https://github.com/Kyraminol) +- **youtube** + - [Fix `like_count` extraction](https://github.com/yt-dlp/yt-dlp/commit/6b5d93b0b0240e287389d1d43b2d5293e18aa4cc) ([#8763](https://github.com/yt-dlp/yt-dlp/issues/8763)) by [Ganesh910](https://github.com/Ganesh910) + - [Improve detection of faulty HLS formats](https://github.com/yt-dlp/yt-dlp/commit/bb5a54e6db2422bbd155d93a0e105b6616c09467) ([#8646](https://github.com/yt-dlp/yt-dlp/issues/8646)) by [bashonly](https://github.com/bashonly) + - [Return empty playlist when channel/tab has no videos](https://github.com/yt-dlp/yt-dlp/commit/044886c220620a7679109e92352890e18b6079e3) by [pukkandan](https://github.com/pukkandan) + - [Support cf.piped.video](https://github.com/yt-dlp/yt-dlp/commit/6a9c7a2b52655bacfa7ab2da24fd0d14a6fff495) ([#8514](https://github.com/yt-dlp/yt-dlp/issues/8514)) by [OIRNOIR](https://github.com/OIRNOIR) +- **zingmp3**: [Add support for radio and 
podcasts](https://github.com/yt-dlp/yt-dlp/commit/64de1a4c25bada90374b88d7353754fe8fbfcc51) ([#7189](https://github.com/yt-dlp/yt-dlp/issues/7189)) by [hatienl0i261299](https://github.com/hatienl0i261299) + +#### Postprocessor changes +- **ffmpegmetadata**: [Embed stream metadata in single format downloads](https://github.com/yt-dlp/yt-dlp/commit/deeb13eae82e60f82a2c0c5861f460399a997528) ([#8647](https://github.com/yt-dlp/yt-dlp/issues/8647)) by [bashonly](https://github.com/bashonly) + +#### Networking changes +- [Strip whitespace around header values](https://github.com/yt-dlp/yt-dlp/commit/196eb0fe77b78e2e5ca02c506c3837c2b1a7964c) ([#8802](https://github.com/yt-dlp/yt-dlp/issues/8802)) by [coletdjnz](https://github.com/coletdjnz) +- **Request Handler**: websockets: [Migrate websockets to networking framework](https://github.com/yt-dlp/yt-dlp/commit/ccfd70f4c24b579c72123ca76ab50164f8f122b7) ([#7720](https://github.com/yt-dlp/yt-dlp/issues/7720)) by [coletdjnz](https://github.com/coletdjnz) + +#### Misc. changes +- **ci** + - [Concurrency optimizations](https://github.com/yt-dlp/yt-dlp/commit/f124fa458826308afc86cf364c509f857686ecfd) ([#8614](https://github.com/yt-dlp/yt-dlp/issues/8614)) by [Grub4K](https://github.com/Grub4K) + - [Run core tests only for core changes](https://github.com/yt-dlp/yt-dlp/commit/13b3cb3c2b7169a1e17d6fc62593bf744170521c) ([#8841](https://github.com/yt-dlp/yt-dlp/issues/8841)) by [Grub4K](https://github.com/Grub4K) +- **cleanup** + - [Fix spelling of `IE_NAME`](https://github.com/yt-dlp/yt-dlp/commit/bc4ab17b38f01000d99c5c2bedec89721fee65ec) ([#8810](https://github.com/yt-dlp/yt-dlp/issues/8810)) by [barsnick](https://github.com/barsnick) + - [Remove dead extractors](https://github.com/yt-dlp/yt-dlp/commit/9751a457cfdb18bf99d9ee0d10e4e6a594502bbf) ([#8604](https://github.com/yt-dlp/yt-dlp/issues/8604)) by [seproDev](https://github.com/seproDev) + - Miscellaneous: [f9fb3ce](https://github.com/yt-dlp/yt-dlp/commit/f9fb3ce86e3c6a0c3c33b45392b8d7288bceba76) by [bashonly](https://github.com/bashonly), [Grub4K](https://github.com/Grub4K), [pukkandan](https://github.com/pukkandan), [seproDev](https://github.com/seproDev) +- **devscripts**: `run_tests`: [Create Python script](https://github.com/yt-dlp/yt-dlp/commit/2d1d683a541d71f3d3bb999dfe8eeb1976fb91ce) ([#8720](https://github.com/yt-dlp/yt-dlp/issues/8720)) by [Grub4K](https://github.com/Grub4K) (With fixes in [225cf2b](https://github.com/yt-dlp/yt-dlp/commit/225cf2b830a1de2c5eacd257edd2a01aed1e1114)) +- **docs**: [Update youtube-dl merge commit in `README.md`](https://github.com/yt-dlp/yt-dlp/commit/f10589e3453009bb523f55849bba144c9b91cf2a) by [bashonly](https://github.com/bashonly) +- **test**: networking: [Update tests for OpenSSL 3.2](https://github.com/yt-dlp/yt-dlp/commit/37755a037e612bfc608c3d4722e8ef2ce6a022ee) ([#8814](https://github.com/yt-dlp/yt-dlp/issues/8814)) by [bashonly](https://github.com/bashonly) + +### 2023.11.16 + +#### Extractor changes +- **abc.net.au**: iview, showseries: [Fix extraction](https://github.com/yt-dlp/yt-dlp/commit/15cb3528cbda7b6198f49a6b5953c226d701696b) ([#8586](https://github.com/yt-dlp/yt-dlp/issues/8586)) by [bashonly](https://github.com/bashonly) +- **beatbump**: [Update `_VALID_URL`](https://github.com/yt-dlp/yt-dlp/commit/21dc069bea2d4d99345dd969e098f4535c751d45) ([#8576](https://github.com/yt-dlp/yt-dlp/issues/8576)) by [seproDev](https://github.com/seproDev) +- **dailymotion**: [Improve 
`_VALID_URL`](https://github.com/yt-dlp/yt-dlp/commit/a489f071508ec5caf5f32052d142afe86c28df7a) ([#7692](https://github.com/yt-dlp/yt-dlp/issues/7692)) by [TravisDupes](https://github.com/TravisDupes) +- **drtv**: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/0783fd558ed0d3a8bc754beb75a406256f8b97b2) ([#8484](https://github.com/yt-dlp/yt-dlp/issues/8484)) by [almx](https://github.com/almx), [seproDev](https://github.com/seproDev) +- **eltrecetv**: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/dcfad52812aa8ce007cefbfbe63f58b49f6b1046) ([#8216](https://github.com/yt-dlp/yt-dlp/issues/8216)) by [elivinsky](https://github.com/elivinsky) +- **jiosaavn**: [Add extractors](https://github.com/yt-dlp/yt-dlp/commit/b530118e7f48232cacf8050d79a6b20bdfcf5468) ([#8307](https://github.com/yt-dlp/yt-dlp/issues/8307)) by [awalgarg](https://github.com/awalgarg) +- **njpwworld**: [Remove](https://github.com/yt-dlp/yt-dlp/commit/e569c2d1f4b665795a2b64f0aaf7f76930664233) ([#8570](https://github.com/yt-dlp/yt-dlp/issues/8570)) by [aarubui](https://github.com/aarubui) +- **tv5mondeplus**: [Extract subtitles](https://github.com/yt-dlp/yt-dlp/commit/0f634dba3afdc429ece8839b02f6d56c27b7973a) ([#4209](https://github.com/yt-dlp/yt-dlp/issues/4209)) by [FrankZ85](https://github.com/FrankZ85) +- **twitcasting**: [Fix livestream detection](https://github.com/yt-dlp/yt-dlp/commit/2325d03aa7bb80f56ba52cd6992258e44727b424) ([#8574](https://github.com/yt-dlp/yt-dlp/issues/8574)) by [JC-Chung](https://github.com/JC-Chung) +- **zenyandex**: [Fix extraction](https://github.com/yt-dlp/yt-dlp/commit/5efe68b73cbf6e907c2e6a3aa338664385084184) ([#8454](https://github.com/yt-dlp/yt-dlp/issues/8454)) by [starius](https://github.com/starius) + +#### Misc. changes +- **build**: [Make `secretstorage` an optional dependency](https://github.com/yt-dlp/yt-dlp/commit/24f827875c6ba513f12ed09a3aef2bbed223760d) ([#8585](https://github.com/yt-dlp/yt-dlp/issues/8585)) by [bashonly](https://github.com/bashonly) + +### 2023.11.14 + +#### Important changes +- **The release channels have been adjusted!** + * [`master`](https://github.com/yt-dlp/yt-dlp-master-builds) builds are made after each push, containing the latest fixes (but also possibly bugs). This was previously the `nightly` channel. + * [`nightly`](https://github.com/yt-dlp/yt-dlp-nightly-builds) builds are now made once a day, if there were any changes. 
+- Security: [[CVE-2023-46121](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2023-46121)] Patch [Generic Extractor MITM Vulnerability via Arbitrary Proxy Injection](https://github.com/yt-dlp/yt-dlp/security/advisories/GHSA-3ch3-jhc6-5r8x) + - Disallow smuggling of arbitrary `http_headers`; extractors now only use specific headers + +#### Core changes +- [Add `--compat-option manifest-filesize-approx`](https://github.com/yt-dlp/yt-dlp/commit/10025b715ea01489557eb2c5a3cc04d361fcdb52) ([#8356](https://github.com/yt-dlp/yt-dlp/issues/8356)) by [bashonly](https://github.com/bashonly) +- [Fix format sorting with `--load-info-json`](https://github.com/yt-dlp/yt-dlp/commit/595ea4a99b726b8fe9463e7853b7053978d0544e) ([#8521](https://github.com/yt-dlp/yt-dlp/issues/8521)) by [bashonly](https://github.com/bashonly) +- [Include build origin in verbose output](https://github.com/yt-dlp/yt-dlp/commit/20314dd46f25e0e0a7e985a7804049aefa8b909f) by [bashonly](https://github.com/bashonly), [Grub4K](https://github.com/Grub4K) +- [Only ensure playlist thumbnail dir if writing thumbs](https://github.com/yt-dlp/yt-dlp/commit/a40e0b37dfc8c26916b0e01aa3f29f3bc42250b6) ([#8373](https://github.com/yt-dlp/yt-dlp/issues/8373)) by [bashonly](https://github.com/bashonly) +- **update**: [Overhaul self-updater](https://github.com/yt-dlp/yt-dlp/commit/0b6ad22e6a432006a75df968f0283e6c6b3cfae6) by [bashonly](https://github.com/bashonly), [Grub4K](https://github.com/Grub4K) + +#### Extractor changes +- [Do not smuggle `http_headers`](https://github.com/yt-dlp/yt-dlp/commit/f04b5bedad7b281bee9814686bba1762bae092eb) by [coletdjnz](https://github.com/coletdjnz) +- [Do not test truth value of `xml.etree.ElementTree.Element`](https://github.com/yt-dlp/yt-dlp/commit/d4f14a72dc1dd79396e0e80980268aee902b61e4) ([#8582](https://github.com/yt-dlp/yt-dlp/issues/8582)) by [bashonly](https://github.com/bashonly) +- **brilliantpala**: [Fix cookies support](https://github.com/yt-dlp/yt-dlp/commit/9b5bedf13a3323074daceb0ec6ebb3cc6e0b9684) ([#8352](https://github.com/yt-dlp/yt-dlp/issues/8352)) by [pzhlkj6612](https://github.com/pzhlkj6612) +- **generic**: [Improve direct video link ext detection](https://github.com/yt-dlp/yt-dlp/commit/4ce2f29a50fcfb9920e6f2ffe42192945a2bad7e) ([#8340](https://github.com/yt-dlp/yt-dlp/issues/8340)) by [bashonly](https://github.com/bashonly) +- **laxarxames**: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/312a2d1e8bc247264f9d85c5ec764e33aa0133b5) ([#8412](https://github.com/yt-dlp/yt-dlp/issues/8412)) by [aniolpages](https://github.com/aniolpages) +- **n-tv.de**: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/8afd9468b0c822843bc480d366d1c86698daabfb) ([#8414](https://github.com/yt-dlp/yt-dlp/issues/8414)) by [1100101](https://github.com/1100101) +- **neteasemusic**: [Improve metadata extraction](https://github.com/yt-dlp/yt-dlp/commit/46acc418a53470b7f32581b3309c3cb87aa8488d) ([#8531](https://github.com/yt-dlp/yt-dlp/issues/8531)) by [LoserFox](https://github.com/LoserFox) +- **nhk**: [Improve metadata extraction](https://github.com/yt-dlp/yt-dlp/commit/54579be4364e148277c32e20a5c3efc2c3f52f5b) ([#8388](https://github.com/yt-dlp/yt-dlp/issues/8388)) by [garret1317](https://github.com/garret1317) +- **novaembed**: [Improve `_VALID_URL`](https://github.com/yt-dlp/yt-dlp/commit/3ff494f6f41c27549420fa88be27555bd449ffdc) ([#8368](https://github.com/yt-dlp/yt-dlp/issues/8368)) by [peci1](https://github.com/peci1) +- **npo**: [Send `POST` request to streams API 
endpoint](https://github.com/yt-dlp/yt-dlp/commit/8e02a4dcc800f9444e9d461edc41edd7b662f435) ([#8413](https://github.com/yt-dlp/yt-dlp/issues/8413)) by [bartbroere](https://github.com/bartbroere) +- **ondemandkorea**: [Overhaul extractor](https://github.com/yt-dlp/yt-dlp/commit/05adfd883a4f2ecae0267e670a62a2e45c351aeb) ([#8386](https://github.com/yt-dlp/yt-dlp/issues/8386)) by [seproDev](https://github.com/seproDev) +- **orf**: podcast: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/6ba3085616652cbf05d1858efc321fdbfc4c6119) ([#8486](https://github.com/yt-dlp/yt-dlp/issues/8486)) by [Esokrates](https://github.com/Esokrates) +- **polskieradio**: audition: [Fix playlist extraction](https://github.com/yt-dlp/yt-dlp/commit/464327acdb353ceb91d2115163a5a9621b22fe0d) ([#8459](https://github.com/yt-dlp/yt-dlp/issues/8459)) by [shubhexists](https://github.com/shubhexists) +- **qdance**: [Update `_VALID_URL`](https://github.com/yt-dlp/yt-dlp/commit/177f0d963e4b9db749805c482e6f288354c8be84) ([#8426](https://github.com/yt-dlp/yt-dlp/issues/8426)) by [bashonly](https://github.com/bashonly) +- **radiocomercial**: [Add extractors](https://github.com/yt-dlp/yt-dlp/commit/ef12dbdcd3e7264bd3d744c1e3107597bd23ad35) ([#8508](https://github.com/yt-dlp/yt-dlp/issues/8508)) by [SirElderling](https://github.com/SirElderling) +- **sbs.co.kr**: [Add extractors](https://github.com/yt-dlp/yt-dlp/commit/25a4bd345a0dcfece6fef752d4537eb403da94d9) ([#8326](https://github.com/yt-dlp/yt-dlp/issues/8326)) by [seproDev](https://github.com/seproDev) +- **theatercomplextown**: [Add extractors](https://github.com/yt-dlp/yt-dlp/commit/2863fcf2b6876d0c7965ff7d6d9242eea653dc6b) ([#8560](https://github.com/yt-dlp/yt-dlp/issues/8560)) by [bashonly](https://github.com/bashonly) +- **thisav**: [Remove](https://github.com/yt-dlp/yt-dlp/commit/cb480e390d85fb3a598c1b6d5eef3438ce729fc9) ([#8346](https://github.com/yt-dlp/yt-dlp/issues/8346)) by [bashonly](https://github.com/bashonly) +- **thisoldhouse**: [Add login support](https://github.com/yt-dlp/yt-dlp/commit/c76c96677ff6a056f5844a568ef05ee22c46d6f4) ([#8561](https://github.com/yt-dlp/yt-dlp/issues/8561)) by [bashonly](https://github.com/bashonly) +- **twitcasting**: [Fix livestream extraction](https://github.com/yt-dlp/yt-dlp/commit/7b8b1cf5eb8bf44ce70bc24e1f56f0dba2737e98) ([#8427](https://github.com/yt-dlp/yt-dlp/issues/8427)) by [JC-Chung](https://github.com/JC-Chung), [saintliao](https://github.com/saintliao) +- **twitter** + - broadcast + - [Improve metadata extraction](https://github.com/yt-dlp/yt-dlp/commit/7d337ca977d73a0a6c07ab481ed8faa8f6ff8726) ([#8383](https://github.com/yt-dlp/yt-dlp/issues/8383)) by [HitomaruKonpaku](https://github.com/HitomaruKonpaku) + - [Support `--wait-for-video`](https://github.com/yt-dlp/yt-dlp/commit/f6e97090d2ed9e05441ab0f4bec3559b816d7a00) ([#8475](https://github.com/yt-dlp/yt-dlp/issues/8475)) by [bashonly](https://github.com/bashonly) +- **weibo**: [Fix extraction](https://github.com/yt-dlp/yt-dlp/commit/15b252dfd2c6807fe57afc5a95e59abadb32ccd2) ([#8463](https://github.com/yt-dlp/yt-dlp/issues/8463)) by [c-basalt](https://github.com/c-basalt) +- **weverse**: [Fix login error handling](https://github.com/yt-dlp/yt-dlp/commit/4a601c9eff9fb42e24a4c8da3fa03628e035b35b) ([#8458](https://github.com/yt-dlp/yt-dlp/issues/8458)) by [seproDev](https://github.com/seproDev) +- **youtube**: [Check newly uploaded iOS HLS formats](https://github.com/yt-dlp/yt-dlp/commit/ef79d20dc9d27ac002a7196f073b37f2f2721aed) 
([#8336](https://github.com/yt-dlp/yt-dlp/issues/8336)) by [bashonly](https://github.com/bashonly) +- **zoom**: [Extract combined view formats](https://github.com/yt-dlp/yt-dlp/commit/3906de07551fedb00b789345bf24cc27d6ddf128) ([#7847](https://github.com/yt-dlp/yt-dlp/issues/7847)) by [Mipsters](https://github.com/Mipsters) + +#### Downloader changes +- **aria2c**: [Remove duplicate `--file-allocation=none`](https://github.com/yt-dlp/yt-dlp/commit/21b25281c51523620706b11bfc1c4a889858e1f2) ([#8332](https://github.com/yt-dlp/yt-dlp/issues/8332)) by [CrendKing](https://github.com/CrendKing) +- **dash**: [Force native downloader for `--live-from-start`](https://github.com/yt-dlp/yt-dlp/commit/2622c804d1a5accc3045db398e0fc52074f4bdb3) ([#8339](https://github.com/yt-dlp/yt-dlp/issues/8339)) by [bashonly](https://github.com/bashonly) + +#### Networking changes +- **Request Handler**: requests: [Add handler for `requests` HTTP library (#3668)](https://github.com/yt-dlp/yt-dlp/commit/8a8b54523addf46dfd50ef599761a81bc22362e6) by [bashonly](https://github.com/bashonly), [coletdjnz](https://github.com/coletdjnz), [Grub4K](https://github.com/Grub4K) (With fixes in [4e38e2a](https://github.com/yt-dlp/yt-dlp/commit/4e38e2ae9d7380015349e6aee59c78bb3938befd)) + + Adds support for HTTPS proxies and persistent connections (keep-alive) + +#### Misc. changes +- **build** + - [Include secretstorage in Linux builds](https://github.com/yt-dlp/yt-dlp/commit/9970d74c8383432c6c8779aa47d3253dcf412b14) by [bashonly](https://github.com/bashonly) + - [Overhaul and unify release workflow](https://github.com/yt-dlp/yt-dlp/commit/1d03633c5a1621b9f3a756f0a4f9dc61fab3aeaa) by [bashonly](https://github.com/bashonly), [Grub4K](https://github.com/Grub4K) +- **ci** + - [Bump `actions/checkout` to v4](https://github.com/yt-dlp/yt-dlp/commit/5438593a35b7b042fc48fe29cad0b9039f07c9bb) by [bashonly](https://github.com/bashonly) + - [Run core tests with dependencies](https://github.com/yt-dlp/yt-dlp/commit/700444c23ddb65f618c2abd942acdc0c58c650b1) by [bashonly](https://github.com/bashonly), [coletdjnz](https://github.com/coletdjnz) +- **cleanup** + - [Fix changelog typo](https://github.com/yt-dlp/yt-dlp/commit/a9d3f4b20a3533d2a40104c85bc2cc6c2564c800) by [bashonly](https://github.com/bashonly) + - [Update documentation for master and nightly channels](https://github.com/yt-dlp/yt-dlp/commit/a00af29853b8c7350ce086f4cab8c2c9cf2fcf1d) by [bashonly](https://github.com/bashonly), [Grub4K](https://github.com/Grub4K) + - Miscellaneous: [b012271](https://github.com/yt-dlp/yt-dlp/commit/b012271d01b59759e4eefeab0308698cd9e7224c) by [bashonly](https://github.com/bashonly), [coletdjnz](https://github.com/coletdjnz), [dirkf](https://github.com/dirkf), [gamer191](https://github.com/gamer191), [Grub4K](https://github.com/Grub4K), [seproDev](https://github.com/seproDev) +- **test**: update: [Implement simple updater unit tests](https://github.com/yt-dlp/yt-dlp/commit/87264d4fdadcddd91289b968dd0e4bf58d449267) by [bashonly](https://github.com/bashonly) + +### 2023.10.13 + +#### Core changes +- [Ensure thumbnail output directory exists](https://github.com/yt-dlp/yt-dlp/commit/2acd1d555ef89851c73773776715d3de9a0e30b9) ([#7985](https://github.com/yt-dlp/yt-dlp/issues/7985)) by [Riteo](https://github.com/Riteo) +- **utils** + - `js_to_json`: [Fix `Date` constructor parsing](https://github.com/yt-dlp/yt-dlp/commit/9d7ded6419089c1bf252496073f73ad90ed71004) ([#8295](https://github.com/yt-dlp/yt-dlp/issues/8295)) by [awalgarg](https://github.com/awalgarg), 
[Grub4K](https://github.com/Grub4K) + - `write_xattr`: [Use `os.setxattr` if available](https://github.com/yt-dlp/yt-dlp/commit/84e26038d4002e763ea51ca1bdce4f7e63c540bf) ([#8205](https://github.com/yt-dlp/yt-dlp/issues/8205)) by [bashonly](https://github.com/bashonly), [Grub4K](https://github.com/Grub4K) + +#### Extractor changes +- **artetv**: [Support age-restricted content](https://github.com/yt-dlp/yt-dlp/commit/09f815ad52843219a7ee3f2a0dddf6c250c91f0c) ([#8301](https://github.com/yt-dlp/yt-dlp/issues/8301)) by [StefanLobbenmeier](https://github.com/StefanLobbenmeier) +- **jtbc**: [Add extractors](https://github.com/yt-dlp/yt-dlp/commit/b286ec68f1f28798b3e371f888a2ed97d399cf77) ([#8314](https://github.com/yt-dlp/yt-dlp/issues/8314)) by [seproDev](https://github.com/seproDev) +- **mbn**: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/e030b6b6fba7b2f4614ad2ab9f7649d40a2dd305) ([#8312](https://github.com/yt-dlp/yt-dlp/issues/8312)) by [seproDev](https://github.com/seproDev) +- **nhk**: [Fix Japanese-language VOD extraction](https://github.com/yt-dlp/yt-dlp/commit/4de94b9e165bfd6421a692f5f2eabcdb08edcb71) ([#8309](https://github.com/yt-dlp/yt-dlp/issues/8309)) by [garret1317](https://github.com/garret1317) +- **radiko**: [Fix bug with `downloader_options`](https://github.com/yt-dlp/yt-dlp/commit/b9316642313bbc9e209ac0d2276d37ba60bceb49) by [bashonly](https://github.com/bashonly) +- **tenplay**: [Add support for seasons](https://github.com/yt-dlp/yt-dlp/commit/88a99c87b680ae59002534a517e191f46c42cbd4) ([#7939](https://github.com/yt-dlp/yt-dlp/issues/7939)) by [midnightveil](https://github.com/midnightveil) +- **youku**: [Improve tudou.com support](https://github.com/yt-dlp/yt-dlp/commit/b7098d46b552a9322c6cea39ba80be5229f922de) ([#8160](https://github.com/yt-dlp/yt-dlp/issues/8160)) by [naginatana](https://github.com/naginatana) +- **youtube**: [Fix bug with `--extractor-retries inf`](https://github.com/yt-dlp/yt-dlp/commit/feebf6d02fc9651331eee2af5e08e6112288163b) ([#8328](https://github.com/yt-dlp/yt-dlp/issues/8328)) by [Grub4K](https://github.com/Grub4K) + +#### Downloader changes +- **fragment**: [Improve progress calculation](https://github.com/yt-dlp/yt-dlp/commit/1c51c520f7b511ebd9e4eb7322285a8c31eedbbd) ([#8241](https://github.com/yt-dlp/yt-dlp/issues/8241)) by [Grub4K](https://github.com/Grub4K) + +#### Misc. 
changes
+- **cleanup**: Miscellaneous: [b634ba7](https://github.com/yt-dlp/yt-dlp/commit/b634ba742d8f38ce9ecfa0546485728b0c6c59d1) by [bashonly](https://github.com/bashonly), [gamer191](https://github.com/gamer191)
+
### 2023.10.07

#### Extractor changes
diff --git a/Collaborators.md b/Collaborators.md
index 70ab616f1..894a853c9 100644
--- a/Collaborators.md
+++ b/Collaborators.md
@@ -29,6 +29,7 @@ ## [coletdjnz](https://github.com/coletdjnz)
[![gh-sponsor](https://img.shields.io/badge/_-Github-white.svg?logo=github&labelColor=555555&style=for-the-badge)](https://github.com/sponsors/coletdjnz)

* Improved plugin architecture
+* Rewrote the networking infrastructure, implemented support for `requests`
* YouTube improvements including: age-gate bypass, private playlists, multiple-clients (to avoid throttling) and a lot of under-the-hood improvements
* Added support for new websites YoutubeWebArchive, MainStreaming, PRX, nzherald, Mediaklikk, StarTV etc
* Improved/fixed support for Patreon, panopto, gfycat, itv, pbs, SouthParkDE etc
@@ -46,16 +47,17 @@ ## [Ashish0804](https://github.com/Ashish0804) [Inactive]

## [bashonly](https://github.com/bashonly)

-* `--update-to`, automated release, nightly builds
-* `--cookies-from-browser` support for Firefox containers
-* Added support for new websites Genius, Kick, NBCStations, Triller, VideoKen etc
-* Improved/fixed support for Anvato, Brightcove, Instagram, ParamountPlus, Reddit, SlidesLive, TikTok, Twitter, Vimeo etc
+* `--update-to`, self-updater rewrite, automated/nightly/master releases
+* `--cookies-from-browser` support for Firefox containers, external downloader cookie handling overhaul
+* Added support for new websites like Dacast, Kick, NBCStations, Triller, VideoKen, Weverse, WrestleUniverse etc
+* Improved/fixed support for Anvato, Brightcove, Reddit, SlidesLive, TikTok, Twitter, Vimeo etc

## [Grub4K](https://github.com/Grub4K)

-[![ko-fi](https://img.shields.io/badge/_-Ko--fi-red.svg?logo=kofi&labelColor=555555&style=for-the-badge)](https://ko-fi.com/Grub4K) [![gh-sponsor](https://img.shields.io/badge/_-Github-white.svg?logo=github&labelColor=555555&style=for-the-badge)](https://github.com/sponsors/Grub4K)
+[![gh-sponsor](https://img.shields.io/badge/_-Github-white.svg?logo=github&labelColor=555555&style=for-the-badge)](https://github.com/sponsors/Grub4K) [![ko-fi](https://img.shields.io/badge/_-Ko--fi-red.svg?logo=kofi&labelColor=555555&style=for-the-badge)](https://ko-fi.com/Grub4K)

-* `--update-to`, automated release, nightly builds
-* Rework internals like `traverse_obj`, various core refactors and bugs fixes
-* Helped fix crunchyroll, Twitter, wrestleuniverse, wistia, slideslive etc
+* `--update-to`, self-updater rewrite, automated/nightly/master releases
+* Reworked internals like `traverse_obj`, various core refactors and bug fixes
+* Implemented proper progress reporting for parallel downloads
+* Improved/fixed/added Bundestag, crunchyroll, pr0gramm, Twitter, WrestleUniverse etc
diff --git a/README.md b/README.md
index a0b69c9a1..b6a79667c 100644
--- a/README.md
+++ b/README.md
@@ -76,7 +76,7 @@ # NEW FEATURES

-* Forked from
[**yt-dlc@f9401f2**](https://github.com/blackjack4494/yt-dlc/commit/f9401f2a91987068139c5f757b12fc711d4c0cee) and merged with [**youtube-dl@be008e6**](https://github.com/ytdl-org/youtube-dl/commit/be008e657d79832642e2158557c899249c9e31cd) ([exceptions](https://github.com/yt-dlp/yt-dlp/issues/21)) * **[SponsorBlock Integration](#sponsorblock-options)**: You can mark/remove sponsor sections in YouTube videos by utilizing the [SponsorBlock](https://sponsor.ajay.app) API @@ -89,7 +89,6 @@ # NEW FEATURES * Fix for [n-sig based throttling](https://github.com/ytdl-org/youtube-dl/issues/29326) **\*** * Supports some (but not all) age-gated content without cookies * Download livestreams from the start using `--live-from-start` (*experimental*) - * `255kbps` audio is extracted (if available) from YouTube Music when premium cookies are given * Channel URLs download all uploads of the channel, including shorts and live * **Cookies from browser**: Cookies can be automatically extracted from all major web browsers using `--cookies-from-browser BROWSER[+KEYRING][:PROFILE][::CONTAINER]` @@ -122,7 +121,7 @@ # NEW FEATURES * **Self updater**: The releases can be updated using `yt-dlp -U`, and downgraded using `--update-to` if required -* **Nightly builds**: [Automated nightly builds](#update-channels) can be used with `--update-to nightly` +* **Automated builds**: [Nightly/master builds](#update-channels) can be used with `--update-to nightly` and `--update-to master` See [changelog](Changelog.md) or [commits](https://github.com/yt-dlp/yt-dlp/commits) for the full list of changes @@ -132,7 +131,7 @@ ### Differences in default behavior Some of yt-dlp's default options are different from that of youtube-dl and youtube-dlc: -* yt-dlp supports only [Python 3.7+](## "Windows 7"), and *may* remove support for more versions as they [become EOL](https://devguide.python.org/versions/#python-release-cycle); while [youtube-dl still supports Python 2.6+ and 3.2+](https://github.com/ytdl-org/youtube-dl/issues/30568#issue-1118238743) +* yt-dlp supports only [Python 3.8+](## "Windows 7"), and *may* remove support for more versions as they [become EOL](https://devguide.python.org/versions/#python-release-cycle); while [youtube-dl still supports Python 2.6+ and 3.2+](https://github.com/ytdl-org/youtube-dl/issues/30568#issue-1118238743) * The options `--auto-number` (`-A`), `--title` (`-t`) and `--literal` (`-l`), no longer work. See [removed options](#Removed) for details * `avconv` is not supported as an alternative to `ffmpeg` * yt-dlp stores config files in slightly different locations to youtube-dl. See [CONFIGURATION](#configuration) for a list of correct locations @@ -158,14 +157,17 @@ ### Differences in default behavior * yt-dlp's sanitization of invalid characters in filenames is different/smarter than in youtube-dl. You can use `--compat-options filename-sanitization` to revert to youtube-dl's behavior * yt-dlp tries to parse the external downloader outputs into the standard progress output if possible (Currently implemented: [~~aria2c~~](https://github.com/yt-dlp/yt-dlp/issues/5931)). You can use `--compat-options no-external-downloader-progress` to get the downloader output as-is * yt-dlp versions between 2021.09.01 and 2023.01.02 applies `--match-filter` to nested playlists. This was an unintentional side-effect of [8f18ac](https://github.com/yt-dlp/yt-dlp/commit/8f18aca8717bb0dd49054555af8d386e5eda3a88) and is fixed in [d7b460](https://github.com/yt-dlp/yt-dlp/commit/d7b460d0e5fc710950582baed2e3fc616ed98a80). 
Use `--compat-options playlist-match-filter` to revert this
+* yt-dlp versions between 2021.11.10 and 2023.06.21 estimated `filesize_approx` values for fragmented/manifest formats. This was added for convenience in [f2fe69](https://github.com/yt-dlp/yt-dlp/commit/f2fe69c7b0d208bdb1f6292b4ae92bc1e1a7444a), but was reverted in [0dff8e](https://github.com/yt-dlp/yt-dlp/commit/0dff8e4d1e6e9fb938f4256ea9af7d81f42fd54f) due to the potentially extreme inaccuracy of the estimated values. Use `--compat-options manifest-filesize-approx` to keep extracting the estimated values
+* yt-dlp uses modern http client backends such as `requests`. Use `--compat-options prefer-legacy-http-handler` to use the legacy http handler (`urllib`) for standard http requests instead.
+* The sub-module `swfinterp` is removed.

For ease of use, a few more compat options are available:

* `--compat-options all`: Use all compat options (Do NOT use)
-* `--compat-options youtube-dl`: Same as `--compat-options all,-multistreams,-playlist-match-filter`
-* `--compat-options youtube-dlc`: Same as `--compat-options all,-no-live-chat,-no-youtube-channel-redirect,-playlist-match-filter`
+* `--compat-options youtube-dl`: Same as `--compat-options all,-multistreams,-playlist-match-filter,-manifest-filesize-approx`
+* `--compat-options youtube-dlc`: Same as `--compat-options all,-no-live-chat,-no-youtube-channel-redirect,-playlist-match-filter,-manifest-filesize-approx`
* `--compat-options 2021`: Same as `--compat-options 2022,no-certifi,filename-sanitization,no-youtube-prefer-utc-upload-date`
-* `--compat-options 2022`: Same as `--compat-options playlist-match-filter,no-external-downloader-progress`. Use this to enable all future compat options
+* `--compat-options 2022`: Same as `--compat-options playlist-match-filter,no-external-downloader-progress,prefer-legacy-http-handler,manifest-filesize-approx`. Use this to enable all future compat options

# INSTALLATION
@@ -192,9 +194,11 @@ ## UPDATE
-There are currently two release channels for binaries, `stable` and `nightly`.
-`stable` is the default channel, and many of its changes have been tested by users of the nightly channel.
-The `nightly` channel has releases built after each push to the master branch, and will have the most recent fixes and additions, but also have more risk of regressions. They are available in [their own repo](https://github.com/yt-dlp/yt-dlp-nightly-builds/releases).
+There are currently three release channels for binaries: `stable`, `nightly` and `master`.
+
+* `stable` is the default channel, and many of its changes have been tested by users of the `nightly` and `master` channels.
+* The `nightly` channel has releases scheduled to build every day around midnight UTC, for a snapshot of the project's new patches and changes. This is the **recommended channel for regular users** of yt-dlp. The `nightly` releases are available from [yt-dlp/yt-dlp-nightly-builds](https://github.com/yt-dlp/yt-dlp-nightly-builds/releases) or as development releases of the `yt-dlp` PyPI package (which can be installed with pip's `--pre` flag).
+* The `master` channel features releases that are built after each push to the master branch, and these will have the very latest fixes and additions, but may also be more prone to regressions. They are available from [yt-dlp/yt-dlp-master-builds](https://github.com/yt-dlp/yt-dlp-master-builds/releases).

When using `--update`/`-U`, a release binary will only update to its current channel.
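(Editor's illustration, not part of the patched README: because `devscripts/update-version.py` — updated later in this diff — now writes `CHANNEL` and `ORIGIN` constants into `yt_dlp/version.py`, the channel and origin a given build will update against can be inspected directly. A minimal sketch, assuming a build generated from that template:)

```python
# Print the update channel/origin baked into an installed yt-dlp build.
# CHANNEL and ORIGIN are module-level constants written into
# yt_dlp/version.py by devscripts/update-version.py; values in the
# comments are examples only.
from yt_dlp import version

print(version.__version__)  # e.g. '2023.12.30'
print(version.CHANNEL)      # 'stable', 'nightly' or 'master'
print(version.ORIGIN)       # e.g. 'yt-dlp/yt-dlp' or 'yt-dlp/yt-dlp-nightly-builds'
```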
`--update-to CHANNEL` can be used to switch to a different channel when a newer version is available. `--update-to [CHANNEL@]TAG` can also be used to upgrade or downgrade to specific tags from a channel.
@@ -202,10 +206,19 @@ ## UPDATE
You may also use `--update-to <repository>` (`<owner>/<repository>`) to update to a channel on a completely different repository. Be careful with what repository you are updating to though, there is no verification done for binaries from different repositories.

Example usage:
-* `yt-dlp --update-to nightly` change to `nightly` channel and update to its latest release
-* `yt-dlp --update-to stable@2023.02.17` upgrade/downgrade to release to `stable` channel tag `2023.02.17`
-* `yt-dlp --update-to 2023.01.06` upgrade/downgrade to tag `2023.01.06` if it exists on the current channel
-* `yt-dlp --update-to example/yt-dlp@2023.03.01` upgrade/downgrade to the release from the `example/yt-dlp` repository, tag `2023.03.01`
+* `yt-dlp --update-to master` switch to the `master` channel and update to its latest release
+* `yt-dlp --update-to stable@2023.07.06` upgrade/downgrade to the `stable` channel release tagged `2023.07.06`
+* `yt-dlp --update-to 2023.10.07` upgrade/downgrade to tag `2023.10.07` if it exists on the current channel
+* `yt-dlp --update-to example/yt-dlp@2023.09.24` upgrade/downgrade to the release from the `example/yt-dlp` repository, tag `2023.09.24`
+
+**Important**: Any user experiencing an issue with the `stable` release should install or update to the `nightly` release before submitting a bug report:
+```
+# To update to nightly from stable executable/binary:
+yt-dlp --update-to nightly
+
+# To install nightly with pip:
+python -m pip install -U --pre yt-dlp
+```

## RELEASE FILES
@@ -254,7 +267,7 @@ #### Misc
**Note**: The manpages, shell completion (autocomplete) files etc. are available inside the [source tarball](https://github.com/yt-dlp/yt-dlp/releases/latest/download/yt-dlp.tar.gz)

## DEPENDENCIES
-Python versions 3.7+ (CPython and PyPy) are supported. Other versions and implementations may or may not work correctly.
+Python versions 3.8+ (CPython and PyPy) are supported. Other versions and implementations may or may not work correctly.
diff --git a/devscripts/changelog_override.json b/devscripts/changelog_override.json
index fe0c82c66..8c5286432 100644
--- a/devscripts/changelog_override.json
+++ b/devscripts/changelog_override.json
@@ -98,5 +98,27 @@
        "action": "add",
        "when": "61bdf15fc7400601c3da1aa7a43917310a5bf391",
        "short": "[priority] Security: [[CVE-2023-40581](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2023-40581)] [Prevent RCE when using `--exec` with `%q` on Windows](https://github.com/yt-dlp/yt-dlp/security/advisories/GHSA-42h4-v29r-42qg)\n    - The shell escape function is now using `\"\"` instead of `\\\"`.\n    - `utils.Popen` has been patched to properly quote commands."
+    },
+    {
+        "action": "change",
+        "when": "8a8b54523addf46dfd50ef599761a81bc22362e6",
+        "short": "[rh:requests] Add handler for `requests` HTTP library (#3668)\n\n\tAdds support for HTTPS proxies and persistent connections (keep-alive)",
+        "authors": ["bashonly", "coletdjnz", "Grub4K"]
+    },
+    {
+        "action": "add",
+        "when": "1d03633c5a1621b9f3a756f0a4f9dc61fab3aeaa",
+        "short": "[priority] **The release channels have been adjusted!**\n\t* [`master`](https://github.com/yt-dlp/yt-dlp-master-builds) builds are made after each push, containing the latest fixes (but also possibly bugs). 
This was previously the `nightly` channel.\n\t* [`nightly`](https://github.com/yt-dlp/yt-dlp-nightly-builds) builds are now made once a day, if there were any changes." + }, + { + "action": "add", + "when": "f04b5bedad7b281bee9814686bba1762bae092eb", + "short": "[priority] Security: [[CVE-2023-46121](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2023-46121)] Patch [Generic Extractor MITM Vulnerability via Arbitrary Proxy Injection](https://github.com/yt-dlp/yt-dlp/security/advisories/GHSA-3ch3-jhc6-5r8x)\n\t- Disallow smuggling of arbitrary `http_headers`; extractors now only use specific headers" + }, + { + "action": "change", + "when": "15f22b4880b6b3f71f350c64d70976ae65b9f1ca", + "short": "[webvtt] Allow spaces before newlines for CueBlock (#7681)", + "authors": ["TSRBerry"] } ] diff --git a/devscripts/make_changelog.py b/devscripts/make_changelog.py index 9ff65db14..123eebc2a 100644 --- a/devscripts/make_changelog.py +++ b/devscripts/make_changelog.py @@ -40,22 +40,9 @@ def subgroup_lookup(cls): return { name: group for group, names in { - cls.CORE: { - 'aes', - 'cache', - 'compat_utils', - 'compat', - 'cookies', - 'dependencies', - 'formats', - 'jsinterp', - 'outtmpl', - 'plugins', - 'update', - 'utils', - }, cls.MISC: { 'build', + 'ci', 'cleanup', 'devscripts', 'docs', @@ -403,9 +390,9 @@ def groups(self): if not group: if self.EXTRACTOR_INDICATOR_RE.search(commit.short): group = CommitGroup.EXTRACTOR + logger.error(f'Assuming [ie] group for {commit.short!r}') else: - group = CommitGroup.POSTPROCESSOR - logger.warning(f'Failed to map {commit.short!r}, selected {group.name.lower()}') + group = CommitGroup.CORE commit_info = CommitInfo( details, sub_details, message.strip(), diff --git a/devscripts/make_issue_template.py b/devscripts/make_issue_template.py index 39b95c8da..a5d59f3c0 100644 --- a/devscripts/make_issue_template.py +++ b/devscripts/make_issue_template.py @@ -9,12 +9,7 @@ import re -from devscripts.utils import ( - get_filename_args, - read_file, - read_version, - write_file, -) +from devscripts.utils import get_filename_args, read_file, write_file VERBOSE_TMPL = ''' - type: checkboxes @@ -35,19 +30,18 @@ description: | It should start like this: placeholder: | - [debug] Command-line config: ['-vU', 'test:youtube'] - [debug] Portable config "yt-dlp.conf": ['-i'] + [debug] Command-line config: ['-vU', 'https://www.youtube.com/watch?v=BaW_jenozKc'] [debug] Encodings: locale cp65001, fs utf-8, pref cp65001, out utf-8, error utf-8, screen utf-8 - [debug] yt-dlp version %(version)s [9d339c4] (win32_exe) + [debug] yt-dlp version nightly@... from yt-dlp/yt-dlp [b634ba742] (win_exe) [debug] Python 3.8.10 (CPython 64bit) - Windows-10-10.0.22000-SP0 - [debug] Checking exe version: ffmpeg -bsfs - [debug] Checking exe version: ffprobe -bsfs [debug] exe versions: ffmpeg N-106550-g072101bd52-20220410 (fdk,setts), ffprobe N-106624-g391ce570c8-20220415, phantomjs 2.1.1 [debug] Optional libraries: Cryptodome-3.15.0, brotli-1.0.9, certifi-2022.06.15, mutagen-1.45.1, sqlite3-2.6.0, websockets-10.3 [debug] Proxy map: {} - [debug] Fetching release info: https://api.github.com/repos/yt-dlp/yt-dlp/releases/latest - Latest version: %(version)s, Current version: %(version)s - yt-dlp is up to date (%(version)s) + [debug] Request Handlers: urllib, requests + [debug] Loaded 1893 extractors + [debug] Fetching release info: https://api.github.com/repos/yt-dlp/yt-dlp-nightly-builds/releases/latest + yt-dlp is up to date (nightly@... 
from yt-dlp/yt-dlp-nightly-builds)
+        [youtube] Extracting URL: https://www.youtube.com/watch?v=BaW_jenozKc
      render: shell
    validations:
@@ -66,7 +60,7 @@ def main():
-    fields = {'version': read_version(), 'no_skip': NO_SKIP}
+    fields = {'no_skip': NO_SKIP}
     fields['verbose'] = VERBOSE_TMPL % fields
     fields['verbose_optional'] = re.sub(r'(\n\s+validations:)?\n\s+required: true', '', fields['verbose'])
diff --git a/devscripts/run_tests.bat b/devscripts/run_tests.bat
index 190d23918..57b1f4bf4 100644
--- a/devscripts/run_tests.bat
+++ b/devscripts/run_tests.bat
@@ -1,17 +1,4 @@
-@setlocal
-@echo off
-cd /d %~dp0..
-
-if ["%~1"]==[""] (
-    set "test_set="test""
-) else if ["%~1"]==["core"] (
-    set "test_set="-m not download""
-) else if ["%~1"]==["download"] (
-    set "test_set="-m "download""
-) else (
-    echo.Invalid test type "%~1". Use "core" ^| "download"
-    exit /b 1
-)
-
-set PYTHONWARNINGS=error
-pytest %test_set%
+>&2 echo run_tests.bat is deprecated. Please use `devscripts/run_tests.py` instead
+python %~dp0run_tests.py %~1
diff --git a/devscripts/run_tests.py b/devscripts/run_tests.py
new file mode 100755
index 000000000..6d638a974
--- /dev/null
+++ b/devscripts/run_tests.py
@@ -0,0 +1,71 @@
+#!/usr/bin/env python3
+
+import argparse
+import functools
+import os
+import re
+import subprocess
+import sys
+from pathlib import Path
+
+
+fix_test_name = functools.partial(re.compile(r'IE(_all|_\d+)?$').sub, r'\1')
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(description='Run selected yt-dlp tests')
+    parser.add_argument(
+        'test', help='an extractor test, or one of "core" or "download"', nargs='*')
+    parser.add_argument(
+        '-k', help='run a test matching EXPRESSION. Same as "pytest -k"', metavar='EXPRESSION')
+    return parser.parse_args()
+
+
+def run_tests(*tests, pattern=None, ci=False):
+    run_core = 'core' in tests or (not pattern and not tests)
+    run_download = 'download' in tests
+    tests = list(map(fix_test_name, tests))
+
+    arguments = ['pytest', '-Werror', '--tb=short']
+    if ci:
+        arguments.append('--color=yes')
+    if run_core:
+        arguments.extend(['-m', 'not download'])
+    elif run_download:
+        arguments.extend(['-m', 'download'])
+    elif pattern:
+        arguments.extend(['-k', pattern])
+    else:
+        arguments.extend(
+            f'test/test_download.py::TestDownload::test_{test}' for test in tests)
+
+    print(f'Running {arguments}', flush=True)
+    try:
+        return subprocess.call(arguments)
+    except FileNotFoundError:
+        pass
+
+    arguments = [sys.executable, '-Werror', '-m', 'unittest']
+    if run_core:
+        print('"pytest" needs to be installed to run core tests', file=sys.stderr, flush=True)
+        return 1
+    elif run_download:
+        arguments.append('test.test_download')
+    elif pattern:
+        arguments.extend(['-k', pattern])
+    else:
+        arguments.extend(
+            f'test.test_download.TestDownload.test_{test}' for test in tests)
+
+    print(f'Running {arguments}', flush=True)
+    return subprocess.call(arguments)
+
+
+if __name__ == '__main__':
+    try:
+        args = parse_args()
+
+        os.chdir(Path(__file__).parent.parent)
+        sys.exit(run_tests(*args.test, pattern=args.k, ci=bool(os.getenv('CI'))))
+    except KeyboardInterrupt:
+        pass
diff --git a/devscripts/run_tests.sh b/devscripts/run_tests.sh
index faa642e96..123ceb1ee 100755
--- a/devscripts/run_tests.sh
+++ b/devscripts/run_tests.sh
@@ -1,14 +1,4 @@
#!/usr/bin/env sh
-if [ -z "$1" ]; then
-    test_set='test'
-elif [ "$1" = 'core' ]; then
-    test_set="-m not download"
-elif [ "$1" = 'download' ]; then
-    test_set="-m download"
-else
-    echo 'Invalid test type "'"$1"'". 
Use "core" | "download"'
-    exit 1
-fi
-
-python3 -bb -Werror -m pytest "$test_set"
+>&2 echo 'run_tests.sh is deprecated. Please use `devscripts/run_tests.py` instead'
+python3 devscripts/run_tests.py "$1"
diff --git a/devscripts/update-formulae.py b/devscripts/update-formulae.py
deleted file mode 100644
index e79297f53..000000000
--- a/devscripts/update-formulae.py
+++ /dev/null
@@ -1,39 +0,0 @@
-#!/usr/bin/env python3
-
-"""
-Usage: python3 ./devscripts/update-formulae.py <path-to-formulae-rb> <version>
-version can be either 0-aligned (yt-dlp version) or normalized (PyPi version)
-"""
-
-# Allow direct execution
-import os
-import sys
-
-sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
-
-
-import json
-import re
-import urllib.request
-
-from devscripts.utils import read_file, write_file
-
-filename, version = sys.argv[1:]
-
-normalized_version = '.'.join(str(int(x)) for x in version.split('.'))
-
-pypi_release = json.loads(urllib.request.urlopen(
-    'https://pypi.org/pypi/yt-dlp/%s/json' % normalized_version
-).read().decode())
-
-tarball_file = next(x for x in pypi_release['urls'] if x['filename'].endswith('.tar.gz'))
-
-sha256sum = tarball_file['digests']['sha256']
-url = tarball_file['url']
-
-formulae_text = read_file(filename)
-
-formulae_text = re.sub(r'sha256 "[0-9a-f]*?"', 'sha256 "%s"' % sha256sum, formulae_text, count=1)
-formulae_text = re.sub(r'url "[^"]*?"', 'url "%s"' % url, formulae_text, count=1)
-
-write_file(filename, formulae_text)
diff --git a/devscripts/update-version.py b/devscripts/update-version.py
index 0144bd284..da54a6a25 100644
--- a/devscripts/update-version.py
+++ b/devscripts/update-version.py
@@ -20,7 +20,7 @@ def get_new_version(version, revision):
        version = datetime.now(timezone.utc).strftime('%Y.%m.%d')

    if revision:
-        assert revision.isdigit(), 'Revision must be a number'
+        assert revision.isdecimal(), 'Revision must be a number'
    else:
        old_version = read_version().split('.')
        if version.split('.') == old_version[:3]:
@@ -46,6 +46,10 @@ def get_git_head():
UPDATE_HINT = None

CHANNEL = {channel!r}
+
+ORIGIN = {origin!r}
+
+_pkg_version = {package_version!r}
'''

if __name__ == '__main__':
@@ -53,6 +57,12 @@ def get_git_head():
    parser.add_argument(
        '-c', '--channel', default='stable',
        help='Select update channel (default: %(default)s)')
+    parser.add_argument(
+        '-r', '--origin', default='local',
+        help='Select origin/repository (default: %(default)s)')
+    parser.add_argument(
+        '-s', '--suffix', default='',
+        help='Add an alphanumeric suffix to the package version, e.g. "dev"')
    parser.add_argument(
        '-o', '--output', default='yt_dlp/version.py',
        help='The output file to write to (default: %(default)s)')
@@ -66,6 +76,7 @@ def get_git_head():
        args.version if args.version and '.' 
in args.version else get_new_version(None, args.version)) write_file(args.output, VERSION_TEMPLATE.format( - version=version, git_head=git_head, channel=args.channel)) + version=version, git_head=git_head, channel=args.channel, origin=args.origin, + package_version=f'{version}{args.suffix}')) print(f'version={version} ({args.channel}), head={git_head}') diff --git a/devscripts/utils.py b/devscripts/utils.py index f75a84da9..a952c9fae 100644 --- a/devscripts/utils.py +++ b/devscripts/utils.py @@ -13,10 +13,11 @@ def write_file(fname, content, mode='w'): return f.write(content) -def read_version(fname='yt_dlp/version.py'): +def read_version(fname='yt_dlp/version.py', varname='__version__'): """Get the version without importing the package""" - exec(compile(read_file(fname), fname, 'exec')) - return locals()['__version__'] + items = {} + exec(compile(read_file(fname), fname, 'exec'), items) + return items[varname] def get_filename_args(has_infile=False, default_outfile=None): diff --git a/requirements.txt b/requirements.txt index dde37120f..06ff82a80 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,8 @@ mutagen pycryptodomex -websockets -brotli; platform_python_implementation=='CPython' -brotlicffi; platform_python_implementation!='CPython' +brotli; implementation_name=='cpython' +brotlicffi; implementation_name!='cpython' certifi +requests>=2.31.0,<3 +urllib3>=1.26.17,<3 +websockets>=12.0 diff --git a/setup.cfg b/setup.cfg index 6deaa7971..a799f7293 100644 --- a/setup.cfg +++ b/setup.cfg @@ -26,7 +26,7 @@ markers = [tox:tox] skipsdist = true -envlist = py{36,37,38,39,310,311},pypy{36,37,38,39} +envlist = py{38,39,310,311,312},pypy{38,39,310} skip_missing_interpreters = true [testenv] # tox @@ -39,7 +39,7 @@ setenv = [isort] -py_version = 37 +py_version = 38 multi_line_output = VERTICAL_HANGING_INDENT line_length = 80 reverse_relative = true diff --git a/setup.py b/setup.py index a2f9f55c3..3d9a69d10 100644 --- a/setup.py +++ b/setup.py @@ -18,7 +18,7 @@ from devscripts.utils import read_file, read_version -VERSION = read_version() +VERSION = read_version(varname='_pkg_version') DESCRIPTION = 'A youtube-dl fork with additional features and patches' @@ -62,7 +62,14 @@ def py2exe_params(): 'compressed': 1, 'optimize': 2, 'dist_dir': './dist', - 'excludes': ['Crypto', 'Cryptodome'], # py2exe cannot import Crypto + 'excludes': [ + # py2exe cannot import Crypto + 'Crypto', + 'Cryptodome', + # py2exe appears to confuse this with our socks library. + # We don't use pysocks and urllib3.contrib.socks would fail to import if tried. 
+ 'urllib3.contrib.socks' + ], 'dll_excludes': ['w9xpopen.exe', 'crypt32.dll'], # Modules that are only imported dynamically must be added here 'includes': ['yt_dlp.compat._legacy', 'yt_dlp.compat._deprecated', @@ -135,7 +142,7 @@ def main(): params = build_params() setup( - name='yt-dlp', + name='yt-dlp', # package name (do not change/remove comment) version=VERSION, maintainer='pukkandan', maintainer_email='pukkandan.ytdlp@gmail.com', @@ -145,7 +152,7 @@ def main(): url='https://github.com/yt-dlp/yt-dlp', packages=packages(), install_requires=REQUIREMENTS, - python_requires='>=3.7', + python_requires='>=3.8', project_urls={ 'Documentation': 'https://github.com/yt-dlp/yt-dlp#readme', 'Source': 'https://github.com/yt-dlp/yt-dlp', @@ -157,11 +164,11 @@ def main(): 'Development Status :: 5 - Production/Stable', 'Environment :: Console', 'Programming Language :: Python', - 'Programming Language :: Python :: 3.7', 'Programming Language :: Python :: 3.8', 'Programming Language :: Python :: 3.9', 'Programming Language :: Python :: 3.10', 'Programming Language :: Python :: 3.11', + 'Programming Language :: Python :: 3.12', 'Programming Language :: Python :: Implementation', 'Programming Language :: Python :: Implementation :: CPython', 'Programming Language :: Python :: Implementation :: PyPy', diff --git a/supportedsites.md b/supportedsites.md index ecef4dc2d..96681c16b 100644 --- a/supportedsites.md +++ b/supportedsites.md @@ -1,6 +1,4 @@ # Supported sites - - **0000studio:archive** - - **0000studio:clip** - **17live** - **17live:clip** - **1News**: 1news.co.nz article videos @@ -9,7 +7,6 @@ # Supported sites - **23video** - **247sports** - **24tv.ua** - - **24video** - **3qsdn**: 3Q SDN - **3sat** - **4tube** @@ -50,15 +47,18 @@ # Supported sites - **afreecatv**: [*afreecatv*](## "netrc machine") afreecatv.com - **afreecatv:live**: [*afreecatv*](## "netrc machine") afreecatv.com - **afreecatv:user** - - **AirMozilla** - **AirTV** - **AitubeKZVideo** - **AliExpressLive** - **AlJazeera** - **Allocine** + - **Allstar** + - **AllstarProfile** - **AlphaPorno** - **Alsace20TV** - **Alsace20TVEmbed** + - **altcensored** + - **altcensored:channel** - **Alura**: [*alura*](## "netrc machine") - **AluraCourse**: [*aluracourse*](## "netrc machine") - **Amara** @@ -79,7 +79,7 @@ # Supported sites - **ant1newsgr:embed**: ant1news.gr embedded videos - **antenna:watch**: antenna.gr and ant1news.gr videos - **Anvato** - - **aol.com**: Yahoo screen and movies + - **aol.com**: Yahoo screen and movies (**Currently broken**) - **APA** - **Aparat** - **AppleConnect** @@ -90,8 +90,8 @@ # Supported sites - **archive.org**: archive.org video and audio - **ArcPublishing** - **ARD** - - **ARD:mediathek** - - **ARDBetaMediathek** + - **ARDMediathek** + - **ARDMediathekCollection** - **Arkena** - **arte.sky.it** - **ArteTV** @@ -100,7 +100,6 @@ # Supported sites - **ArteTVPlaylist** - **AtresPlayer**: [*atresplayer*](## "netrc machine") - **AtScaleConfEvent** - - **ATTTechChannel** - **ATVAt** - **AudiMedia** - **AudioBoom** @@ -140,12 +139,12 @@ # Supported sites - **BeatBumpVideo** - **Beatport** - **Beeg** - - **BehindKink** + - **BehindKink**: (**Currently broken**) - **Bellator** - **BellMedia** - **BerufeTV** - - **Bet** - - **bfi:player** + - **Bet**: (**Currently broken**) + - **bfi:player**: (**Currently broken**) - **bfmtv** - **bfmtv:article** - **bfmtv:live** @@ -162,6 +161,8 @@ # Supported sites - **BiliBiliBangumi** - **BiliBiliBangumiMedia** - **BiliBiliBangumiSeason** + - **BilibiliCheese** + - 
**BilibiliCheeseSeason** - **BilibiliCollectionList** - **BilibiliFavoritesList** - **BiliBiliPlayer** @@ -176,11 +177,8 @@ # Supported sites - **BiliLive** - **BioBioChileTV** - **Biography** - - **BIQLE** - **BitChute** - **BitChuteChannel** - - **bitwave:replay** - - **bitwave:stream** - **BlackboardCollaborate** - **BleacherReport** - **BleacherReportCMS** @@ -193,7 +191,7 @@ # Supported sites - **Box** - **BoxCastVideo** - **Bpb**: Bundeszentrale für politische Bildung - - **BR**: Bayerischer Rundfunk + - **BR**: Bayerischer Rundfunk (**Currently broken**) - **BrainPOP**: [*brainpop*](## "netrc machine") - **BrainPOPELL**: [*brainpop*](## "netrc machine") - **BrainPOPEsp**: [*brainpop*](## "netrc machine") BrainPOP Español @@ -201,19 +199,18 @@ # Supported sites - **BrainPOPIl**: [*brainpop*](## "netrc machine") BrainPOP Hebrew - **BrainPOPJr**: [*brainpop*](## "netrc machine") - **BravoTV** - - **Break** - **BreitBart** - **brightcove:legacy** - **brightcove:new** - **Brilliantpala:Classes**: [*brilliantpala*](## "netrc machine") VoD on classes.brilliantpala.org - **Brilliantpala:Elearn**: [*brilliantpala*](## "netrc machine") VoD on elearn.brilliantpala.org - - **BRMediathek**: Bayerischer Rundfunk Mediathek - **bt:article**: Bergens Tidende Articles - **bt:vestlendingen**: Bergens Tidende - Vestlendingen - **Bundesliga** + - **Bundestag** - **BusinessInsider** - **BuzzFeed** - - **BYUtv** + - **BYUtv**: (**Currently broken**) - **CableAV** - **Callin** - **Caltrans** @@ -225,14 +222,11 @@ # Supported sites - **CamModels** - **Camsoda** - **CamtasiaEmbed** - - **CamWithHer** - **Canal1** - **CanalAlpha** - **canalc2.tv** - **Canalplus**: mycanal.fr and piwiplus.fr - **CaracolTvPlay**: [*caracoltv-play*](## "netrc machine") - - **CarambaTV** - - **CarambaTVPage** - **CartoonNetwork** - **cbc.ca** - **cbc.ca:player** @@ -254,16 +248,12 @@ # Supported sites - **Cellebrite** - **CeskaTelevize** - **CGTN** - - **channel9**: Channel 9 - **CharlieRose** - **Chaturbate** - **Chilloutzone** - **Chingari** - **ChingariUser** - - **chirbit** - - **chirbit:profile** - **cielotv.it** - - **Cinchcast** - **Cinemax** - **CinetecaMilano** - **Cineverse** @@ -276,14 +266,12 @@ # Supported sites - **cliphunter** - **Clippit** - **ClipRs** - - **Clipsyndicate** - **ClipYouEmbed** - **CloserToTruth** - **CloudflareStream** - - **Cloudy** - - **Clubic** + - **Clubic**: (**Currently broken**) - **Clyp** - - **cmt.com** + - **cmt.com**: (**Currently broken**) - **CNBC** - **CNBCVideo** - **CNN** @@ -328,7 +316,6 @@ # Supported sites - **CybraryCourse**: [*cybrary*](## "netrc machine") - **DacastPlaylist** - **DacastVOD** - - **Daftsex** - **DagelijkseKost**: dagelijksekost.een.be - **DailyMail** - **dailymotion**: [*dailymotion*](## "netrc machine") @@ -347,13 +334,12 @@ # Supported sites - **DctpTv** - **DeezerAlbum** - **DeezerPlaylist** - - **defense.gouv.fr** - **democracynow** - **DestinationAmerica** - **DetikEmbed** - **DeuxM** - **DeuxMNews** - - **DHM**: Filmarchiv - Deutsches Historisches Museum + - **DHM**: Filmarchiv - Deutsches Historisches Museum (**Currently broken**) - **Digg** - **DigitalConcertHall**: [*digitalconcerthall*](## "netrc machine") DigitalConcertHall extractor - **DigitallySpeaking** @@ -373,7 +359,6 @@ # Supported sites - **dlf:corpus**: DLF Multi-feed Archives - **dlive:stream** - **dlive:vod** - - **Dotsub** - **Douyin** - **DouyuShow** - **DouyuTV**: 斗鱼直播 @@ -392,34 +377,29 @@ # Supported sites - **duboku**: www.duboku.io - **duboku:list**: www.duboku.io entire series - 
**Dumpert** + - **Duoplay** - **dvtv**: http://video.aktualne.cz/ - **dw** - **dw:article** - **EaglePlatform** - **EbaumsWorld** - **Ebay** - - **EchoMsk** - **egghead:course**: egghead.io course - **egghead:lesson**: egghead.io lesson - - **ehftv** - - **eHow** - **EinsUndEinsTV**: [*1und1tv*](## "netrc machine") - **EinsUndEinsTVLive**: [*1und1tv*](## "netrc machine") - **EinsUndEinsTVRecordings**: [*1und1tv*](## "netrc machine") - **Einthusan** - **eitb.tv** - - **ElevenSports** - - **EllenTube** - - **EllenTubePlaylist** - - **EllenTubeVideo** - **Elonet** - **ElPais**: El País + - **ElTreceTV**: El Trece TV (Argentina) - **Embedly** - **EMPFlix** - - **Engadget** - **Epicon** - **EpiconSeries** - - **eplus:inbound**: e+ (イープラス) overseas + - **EpidemicSound** + - **eplus**: [*eplus*](## "netrc machine") e+ (イープラス) - **Epoch** - **Eporner** - **Erocast** @@ -428,11 +408,9 @@ # Supported sites - **ertflix**: ERTFLIX videos - **ertflix:codename**: ERTFLIX videos by codename - **ertwebtv:embed**: ert.gr webtv embedded videos - - **Escapist** - **ESPN** - **ESPNArticle** - **ESPNCricInfo** - - **EsriVideo** - **EttuTv** - **Europa** - **EuroParlWebstream** @@ -442,9 +420,7 @@ # Supported sites - **EWETV**: [*ewetv*](## "netrc machine") - **EWETVLive**: [*ewetv*](## "netrc machine") - **EWETVRecordings**: [*ewetv*](## "netrc machine") - - **ExpoTV** - **Expressen** - - **ExtremeTube** - **EyedoTV** - **facebook**: [*facebook*](## "netrc machine") - **facebook:reel** @@ -464,6 +440,8 @@ # Supported sites - **FiveThirtyEight** - **FiveTV** - **Flickr** + - **Floatplane** + - **FloatplaneChannel** - **Folketinget**: Folketinget (ft.dk; Danish parliament) - **FoodNetwork** - **FootyRoom** @@ -471,7 +449,6 @@ # Supported sites - **FOX** - **FOX9** - **FOX9News** - - **Foxgay** - **foxnews**: Fox News and Fox Business Video - **foxnews:article** - **FoxNewsVideo** @@ -495,7 +472,6 @@ # Supported sites - **funimation:show**: [*funimation*](## "netrc machine") - **Funk** - **Funker530** - - **Fusion** - **Fux** - **FuyinTV** - **Gab** @@ -521,7 +497,6 @@ # Supported sites - **GeniusLyrics** - **Gettr** - **GettrStreaming** - - **Gfycat** - **GiantBomb** - **Giga** - **GlattvisionTV**: [*glattvisiontv*](## "netrc machine") @@ -563,7 +538,6 @@ # Supported sites - **HearThisAt** - **Heise** - **HellPorno** - - **Helsinki**: helsinki.fi - **hetklokhuis** - **hgtv.com:show** - **HGTVDe** @@ -572,8 +546,6 @@ # Supported sites - **HistoricFilms** - **history:player** - **history:topic**: History.com Topic - - **hitbox** - - **hitbox:live** - **HitRecord** - **hketv**: 香港教育局教育電視 (HKETV) Educational Television, Hong Kong Educational Bureau - **HollywoodReporter** @@ -584,8 +556,6 @@ # Supported sites - **hotstar:playlist** - **hotstar:season** - **hotstar:series** - - **Howcast** - - **HowStuffWorks** - **hrfernsehen** - **HRTi**: [*hrti*](## "netrc machine") - **HRTiPlaylist**: [*hrti*](## "netrc machine") @@ -607,7 +577,7 @@ # Supported sites - **ign.com** - **IGNArticle** - **IGNVideo** - - **IHeartRadio** + - **iheartradio** - **iheartradio:podcast** - **Iltalehti** - **imdb**: Internet Movie Database trailers @@ -637,7 +607,6 @@ # Supported sites - **IsraelNationalNews** - **ITProTV** - **ITProTVCourse** - - **ITTF** - **ITV** - **ITVBTCC** - **ivi**: ivi.ru @@ -654,9 +623,14 @@ # Supported sites - **Jamendo** - **JamendoAlbum** - **JeuxVideo** + - **JioSaavnAlbum** + - **JioSaavnSong** - **Joj** + - **JoqrAg**: 超!A&G+ 文化放送 (f.k.a. AGQR) Nippon Cultural Broadcasting, Inc. 
(JOQR) - **Jove** - **JStream** + - **JTBC**: jtbc.co.kr + - **JTBC:program** - **JWPlatform** - **Kakao** - **Kaltura** @@ -665,7 +639,6 @@ # Supported sites - **Karaoketv** - **KarriereVideos** - **Katsomo** - - **KeezMovies** - **KelbyOne** - **Ketnet** - **khanacademy** @@ -674,7 +647,7 @@ # Supported sites - **Kicker** - **KickStarter** - **KickVOD** - - **KinjaEmbed** + - **kinja:embed** - **KinoPoisk** - **Kommunetv** - **KompasVideo** @@ -693,11 +666,10 @@ # Supported sites - **la7.it** - **la7.it:​pod:episode** - **la7.it:podcast** - - **laola1tv** - - **laola1tv:embed** - **LastFM** - **LastFMPlaylist** - **LastFMUser** + - **LaXarxaMes**: [*laxarxames*](## "netrc machine") - **lbry** - **lbry:channel** - **lbry:playlist** @@ -727,7 +699,6 @@ # Supported sites - **LinkedIn**: [*linkedin*](## "netrc machine") - **linkedin:learning**: [*linkedin*](## "netrc machine") - **linkedin:​learning:course**: [*linkedin*](## "netrc machine") - - **LinuxAcademy**: [*linuxacademy*](## "netrc machine") - **Liputan6** - **ListenNotes** - **LiTV** @@ -745,7 +716,7 @@ # Supported sites - **Lumni** - **lynda**: [*lynda*](## "netrc machine") lynda.com videos - **lynda:course**: [*lynda*](## "netrc machine") lynda.com online courses - - **m6** + - **maariv.co.il** - **MagellanTV** - **MagentaMusik360** - **mailru**: Видео@Mail.Ru @@ -766,6 +737,7 @@ # Supported sites - **massengeschmack.tv** - **Masters** - **MatchTV** + - **MBN**: mbn.co.kr (매일방송) - **MDR**: MDR.DE and KiKA - **MedalTV** - **media.ccc.de** @@ -786,11 +758,8 @@ # Supported sites - **megatvcom:embed**: megatv.com embedded videos - **Meipai**: 美拍 - **MelonVOD** - - **META** - - **metacafe** - **Metacritic** - **mewatch** - - **Mgoon** - **MiaoPai** - **MicrosoftEmbed** - **microsoftstream**: Microsoft Stream @@ -803,7 +772,6 @@ # Supported sites - **minds:group** - **MinistryGrid** - **Minoto** - - **miomio.tv** - **mirrativ** - **mirrativ:user** - **MirrorCoUK** @@ -818,14 +786,10 @@ # Supported sites - **MLBTV**: [*mlb*](## "netrc machine") - **MLBVideo** - **MLSSoccer** - - **Mnet** - **MNetTV**: [*mnettv*](## "netrc machine") - **MNetTVLive**: [*mnettv*](## "netrc machine") - **MNetTVRecordings**: [*mnettv*](## "netrc machine") - **MochaVideo** - - **MoeVideo**: LetitBit video services: moevideo.net, playreplay.net and videochart.net - - **Mofosex** - - **MofosexEmbed** - **Mojvideo** - **Monstercat** - **MonsterSirenHypergryphMusic** @@ -836,13 +800,12 @@ # Supported sites - **Motorsport**: motorsport.com - **MotorTrend** - **MotorTrendOnDemand** - - **MovieClips** - **MovieFap** - **Moviepilot** - **MoviewPlay** - **Moviezine** - **MovingImage** - - **MSN** + - **MSN**: (**Currently broken**) - **mtg**: MTG services - **mtv** - **mtv.de** @@ -864,18 +827,13 @@ # Supported sites - **MusicdexSong** - **mva**: Microsoft Virtual Academy videos - **mva:course**: Microsoft Virtual Academy courses - - **Mwave** - - **MwaveMeetGreet** - **Mxplayer** - **MxplayerShow** - - **MyChannels** - **MySpace** - **MySpace:album** - **MySpass** - - **Myvi** - **MyVideoGe** - **MyVidster** - - **MyviEmbed** - **Mzaalo** - **n-tv.de** - **N1Info:article** @@ -887,12 +845,12 @@ # Supported sites - **Naver** - **Naver:live** - **navernow** - - **NBA** + - **nba** + - **nba:channel** + - **nba:embed** - **nba:watch** - **nba:​watch:collection** - - **NBAChannel** - - **NBAEmbed** - - **NBAWatchEmbed** + - **nba:​watch:embed** - **NBC** - **NBCNews** - **nbcolympics** @@ -907,6 +865,7 @@ # Supported sites - **NDTV** - **Nebula**: [*watchnebula*](## 
"netrc machine") - **nebula:channel**: [*watchnebula*](## "netrc machine") + - **nebula:class**: [*watchnebula*](## "netrc machine") - **nebula:subscriptions**: [*watchnebula*](## "netrc machine") - **NekoHacker** - **NerdCubedFeed** @@ -928,7 +887,6 @@ # Supported sites - **Newgrounds:playlist** - **Newgrounds:user** - **NewsPicks** - - **Newstube** - **Newsy** - **NextMedia**: 蘋果日報 - **NextMediaActionNews**: 蘋果日報 - 動新聞 @@ -954,7 +912,6 @@ # Supported sites - **nick.de** - **nickelodeon:br** - **nickelodeonru** - - **nicknight** - **niconico**: [*niconico*](## "netrc machine") ニコニコ動画 - **niconico:history**: NicoNico user history or likes. Requires cookies. - **niconico:live**: ニコニコ生放送 @@ -972,15 +929,12 @@ # Supported sites - **Nitter** - **njoy**: N-JOY - **njoy:embed** - - **NJPWWorld**: [*njpwworld*](## "netrc machine") 新日本プロレスワールド - **NobelPrize** - **NoicePodcast** - **NonkTube** - **NoodleMagazine** - **Noovo** - - **Normalboots** - **NOSNLArticle** - - **NosVideo** - **Nova**: TN.cz, Prásk.tv, Nova.cz, Novaplus.cz, FANDA.tv, Krásná.cz and Doma.cz - **NovaEmbed** - **NovaPlay** @@ -1003,7 +957,7 @@ # Supported sites - **NRKTVEpisodes** - **NRKTVSeason** - **NRKTVSeries** - - **NRLTV** + - **NRLTV**: (**Currently broken**) - **ntv.ru** - **NubilesPorn**: [*nubiles-porn*](## "netrc machine") - **Nuvid** @@ -1023,6 +977,7 @@ # Supported sites - **on24**: ON24 - **OnDemandChinaEpisode** - **OnDemandKorea** + - **OnDemandKoreaProgram** - **OneFootball** - **OnePlacePodcast** - **onet.pl** @@ -1030,8 +985,6 @@ # Supported sites - **onet.tv:channel** - **OnetMVP** - **OnionStudios** - - **Ooyala** - - **OoyalaExternal** - **Opencast** - **OpencastPlaylist** - **openrec** @@ -1040,6 +993,7 @@ # Supported sites - **OraTV** - **orf:​fm4:story**: fm4.orf.at stories - **orf:iptv**: iptv.ORF.at + - **orf:podcast** - **orf:radio** - **orf:tvthek**: ORF TVthek - **OsnatelTV**: [*osnateltv*](## "netrc machine") @@ -1052,7 +1006,6 @@ # Supported sites - **PalcoMP3:artist** - **PalcoMP3:song** - **PalcoMP3:video** - - **pandora.tv**: 판도라TV - **Panopto** - **PanoptoList** - **PanoptoPlaylist** @@ -1074,7 +1027,6 @@ # Supported sites - **PeerTube:Playlist** - **peloton**: [*peloton*](## "netrc machine") - **peloton:live**: Peloton Live - - **People** - **PerformGroup** - **periscope**: Periscope - **periscope:user**: Periscope user videos @@ -1096,14 +1048,11 @@ # Supported sites - **PlanetMarathi** - **Platzi**: [*platzi*](## "netrc machine") - **PlatziCourse**: [*platzi*](## "netrc machine") - - **play.fm** - **player.sky.it** - **PlayPlusTV**: [*playplustv*](## "netrc machine") - **PlayStuff** - - **PlaysTV** - **PlaySuisse** - **Playtvak**: Playtvak.cz, iDNES.cz and Lidovky.cz - - **Playvid** - **PlayVids** - **Playwire** - **pluralsight**: [*pluralsight*](## "netrc machine") @@ -1128,11 +1077,8 @@ # Supported sites - **Popcorntimes** - **PopcornTV** - **Pornbox** - - **PornCom** - **PornerBros** - - **Pornez** - **PornFlip** - - **PornHd** - **PornHub**: [*pornhub*](## "netrc machine") PornHub and Thumbzilla - **PornHubPagedVideoList**: [*pornhub*](## "netrc machine") - **PornHubPlaylist**: [*pornhub*](## "netrc machine") @@ -1174,9 +1120,10 @@ # Supported sites - **Radiko** - **RadikoRadio** - **radio.de** - - **radiobremen** - **radiocanada** - **radiocanada:audiovideo** + - **RadioComercial** + - **RadioComercialPlaylist** - **radiofrance** - **RadioFranceLive** - **RadioFrancePodcast** @@ -1212,7 +1159,6 @@ # Supported sites - **RCTIPlusSeries** - **RCTIPlusTV** - **RDS**: RDS.ca - - 
**Recurbate** - **RedBull** - **RedBullEmbed** - **RedBullTV** @@ -1229,7 +1175,7 @@ # Supported sites - **Reuters** - **ReverbNation** - **RheinMainTV** - - **RICE** + - **RinseFM** - **RMCDecouverte** - **RockstarGames** - **Rokfin**: [*rokfin*](## "netrc machine") @@ -1250,8 +1196,6 @@ # Supported sites - **rtl.lu:tele-vod** - **rtl.nl**: rtl.nl and rtlxl.nl - **rtl2** - - **rtl2:you** - - **rtl2:​you:series** - **RTLLuLive** - **RTLLuRadio** - **RTNews** @@ -1266,10 +1210,9 @@ # Supported sites - **rtve.es:infantil**: RTVE infantil - **rtve.es:live**: RTVE.es live streams - **rtve.es:television** - - **RTVNH** - **RTVS** - **rtvslo.si** - - **RUHD** + - **RudoVideo** - **Rule34Video** - **Rumble** - **RumbleChannel** @@ -1303,6 +1246,9 @@ # Supported sites - **Sapo**: SAPO Vídeos - **savefrom.net** - **SBS**: sbs.com.au + - **sbs.co.kr** + - **sbs.co.kr:allvod_program** + - **sbs.co.kr:programs_vod** - **schooltv** - **ScienceChannel** - **screen.yahoo:search**: Yahoo screen search; "yvsearch:" prefix @@ -1313,8 +1259,8 @@ # Supported sites - **ScrippsNetworks** - **scrippsnetworks:watch** - **Scrolller** - - **SCTE**: [*scte*](## "netrc machine") - - **SCTECourse**: [*scte*](## "netrc machine") + - **SCTE**: [*scte*](## "netrc machine") (**Currently broken**) + - **SCTECourse**: [*scte*](## "netrc machine") (**Currently broken**) - **Seeker** - **SenalColombiaLive** - **SenateGov** @@ -1326,7 +1272,6 @@ # Supported sites - **SeznamZpravyArticle** - **Shahid**: [*shahid*](## "netrc machine") - **ShahidShow** - - **Shared**: shared.sx - **ShareVideosEmbed** - **ShemarooMe** - **ShowRoomLive** @@ -1378,7 +1323,6 @@ # Supported sites - **SovietsClosetPlaylist** - **SpankBang** - **SpankBangPlaylist** - - **Spankwire** - **Spiegel** - **Sport5** - **SportBox** @@ -1391,7 +1335,7 @@ # Supported sites - **SpreakerShowPage** - **SpringboardPlatform** - **Sprout** - - **sr:mediathek**: Saarländischer Rundfunk + - **sr:mediathek**: Saarländischer Rundfunk (**Currently broken**) - **SRGSSR** - **SRGSSRPlay**: srf.ch, rts.ch, rsi.ch, rtr.ch and swissinfo.ch play sites - **StacommuLive**: [*stacommu*](## "netrc machine") @@ -1408,7 +1352,6 @@ # Supported sites - **StoryFireSeries** - **StoryFireUser** - **Streamable** - - **streamcloud.eu** - **StreamCZ** - **StreamFF** - **StreetVoice** @@ -1424,7 +1367,6 @@ # Supported sites - **SVTPlay**: SVT Play and Öppet arkiv - **SVTSeries** - **SwearnetEpisode** - - **SWRMediathek** - **Syfy** - **SYVDK** - **SztvHu** @@ -1443,7 +1385,6 @@ # Supported sites - **TeachingChannel** - **Teamcoco** - **TeamTreeHouse**: [*teamtreehouse*](## "netrc machine") - - **TechTalks** - **techtv.mit.edu** - **TedEmbed** - **TedPlaylist** @@ -1468,8 +1409,13 @@ # Supported sites - **Tempo** - **TennisTV**: [*tennistv*](## "netrc machine") - **TenPlay**: [*10play*](## "netrc machine") + - **TenPlaySeason** - **TF1** - **TFO** + - **theatercomplextown:ppv**: [*theatercomplextown*](## "netrc machine") + - **theatercomplextown:vod**: [*theatercomplextown*](## "netrc machine") + - **TheGuardianPodcast** + - **TheGuardianPodcastPlaylist** - **TheHoleTv** - **TheIntercept** - **ThePlatform** @@ -1478,8 +1424,7 @@ # Supported sites - **TheSun** - **TheWeatherChannel** - **ThisAmericanLife** - - **ThisAV** - - **ThisOldHouse** + - **ThisOldHouse**: [*thisoldhouse*](## "netrc machine") - **ThisVid** - **ThisVidMember** - **ThisVidPlaylist** @@ -1491,27 +1436,23 @@ # Supported sites - **tiktok:sound**: (**Currently broken**) - **tiktok:tag**: (**Currently broken**) - **tiktok:user**: 
(**Currently broken**) - - **tinypic**: tinypic.com videos - **TLC** - **TMZ** - **TNAFlix** - **TNAFlixNetworkEmbed** - **toggle** - **toggo** - - **Tokentube** - - **Tokentube:channel** - **tokfm:audition** - **tokfm:podcast** - **ToonGoggles** - **tou.tv**: [*toutv*](## "netrc machine") - - **Toypics**: Toypics video - - **ToypicsUser**: Toypics user profile + - **Toypics**: Toypics video (**Currently broken**) + - **ToypicsUser**: Toypics user profile (**Currently broken**) - **TrailerAddict**: (**Currently broken**) - **TravelChannel** - **Triller**: [*triller*](## "netrc machine") - **TrillerShort** - **TrillerUser**: [*triller*](## "netrc machine") - - **Trilulilu** - **Trovo** - **TrovoChannelClip**: All Clips of a trovo.live channel; "trovoclip:" prefix - **TrovoChannelVod**: All VODs of a trovo.live channel; "trovovod:" prefix @@ -1521,7 +1462,7 @@ # Supported sites - **TruNews** - **Truth** - **TruTV** - - **Tube8** + - **Tube8**: (**Currently broken**) - **TubeTuGraz**: [*tubetugraz*](## "netrc machine") tube.tugraz.at - **TubeTuGrazSeries**: [*tubetugraz*](## "netrc machine") - **TubiTv**: [*tubitv*](## "netrc machine") @@ -1530,7 +1471,6 @@ # Supported sites - **TuneInPodcast** - **TuneInPodcastEpisode** - **TuneInStation** - - **TunePk** - **Turbo** - **tv.dfb.de** - **TV2** @@ -1554,14 +1494,7 @@ # Supported sites - **TVIPlayer** - **tvland.com** - **TVN24** - - **TVNet** - **TVNoe** - - **TVNow** - - **TVNowAnnual** - - **TVNowFilm** - - **TVNowNew** - - **TVNowSeason** - - **TVNowShow** - **tvopengr:embed**: tvopen.gr embedded videos - **tvopengr:watch**: tvopen.gr (and ethnos.gr) videos - **tvp**: Telewizja Polska @@ -1599,7 +1532,6 @@ # Supported sites - **umg:de**: Universal Music Deutschland - **Unistra** - **Unity** - - **UnscriptedNewsVideo** - **uol.com.br** - **uplynk** - **uplynk:preplay** @@ -1614,7 +1546,6 @@ # Supported sites - **Utreon** - **Varzesh3** - **Vbox7** - - **VeeHD** - **Veo** - **Veoh** - **veoh:user** @@ -1627,7 +1558,6 @@ # Supported sites - **vice** - **vice:article** - **vice:show** - - **Vidbit** - **Viddler** - **Videa** - **video.arnes.si**: Arnes Video @@ -1649,6 +1579,7 @@ # Supported sites - **VidioLive**: [*vidio*](## "netrc machine") - **VidioPremier**: [*vidio*](## "netrc machine") - **VidLii** + - **Vidly** - **viewlift** - **viewlift:embed** - **Viidea** @@ -1668,7 +1599,6 @@ # Supported sites - **Vimm:stream** - **ViMP** - **ViMP:Playlist** - - **Vimple**: Vimple - one-click video hosting - **Vine** - **vine:user** - **Viqeo** @@ -1676,7 +1606,6 @@ # Supported sites - **viu:ott**: [*viu*](## "netrc machine") - **viu:playlist** - **ViuOTTIndonesia** - - **Vivo**: vivo.sx - **vk**: [*vk*](## "netrc machine") VK - **vk:uservideos**: [*vk*](## "netrc machine") VK - User's Videos - **vk:wallpost**: [*vk*](## "netrc machine") @@ -1684,37 +1613,27 @@ # Supported sites - **VKPlayLive** - **vm.tiktok** - **Vocaroo** - - **Vodlocker** - **VODPl** - **VODPlatform** - - **VoiceRepublic** - **voicy** - **voicy:channel** - **VolejTV** - - **Voot**: [*voot*](## "netrc machine") - - **VootSeries**: [*voot*](## "netrc machine") + - **Voot**: [*voot*](## "netrc machine") (**Currently broken**) + - **VootSeries**: [*voot*](## "netrc machine") (**Currently broken**) - **VoxMedia** - **VoxMediaVolume** - **vpro**: npo.nl, ntr.nl, omroepwnl.nl, zapp.nl and npo3.nl - **vqq:series** - **vqq:video** - - **Vrak** - **VRT**: VRT NWS, Flanders News, Flandern Info and Sporza - **VrtNU**: [*vrtnu*](## "netrc machine") VRT MAX - - **vrv**: [*vrv*](## "netrc 
machine") - - **vrv:series** - - **VShare** - **VTM** - **VTXTV**: [*vtxtv*](## "netrc machine") - **VTXTVLive**: [*vtxtv*](## "netrc machine") - **VTXTVRecordings**: [*vtxtv*](## "netrc machine") - **VuClip** - - **Vupload** - **VVVVID** - **VVVVIDShow** - - **VyboryMos** - - **Vzaar** - - **Wakanim** - **Walla** - **WalyTV**: [*walytv*](## "netrc machine") - **WalyTVLive**: [*walytv*](## "netrc machine") @@ -1725,9 +1644,7 @@ # Supported sites - **washingtonpost** - **washingtonpost:article** - **wat.tv** - - **WatchBox** - **WatchESPN** - - **WatchIndianPorn**: Watch Indian Porn - **WDR** - **wdr:mobile**: (**Currently broken**) - **WDRElefant** @@ -1755,7 +1672,6 @@ # Supported sites - **whowatch** - **Whyp** - **wikimedia.org** - - **Willow** - **Wimbledon** - **WimTV** - **WinSportsVideo** @@ -1780,7 +1696,6 @@ # Supported sites - **wykop:post** - **wykop:​post:comment** - **Xanimu** - - **XBef** - **XboxClips** - **XFileShare**: XFileShare based sites: Aparat, ClipWatching, GoUnlimited, GoVid, HolaVid, Streamty, TheVideoBee, Uqload, VidBom, vidlo, VidLocker, VidShare, VUp, WolfStream, XVideoSharing - **XHamster** @@ -1792,9 +1707,6 @@ # Supported sites - **XMinus** - **XNXX** - **Xstream** - - **XTube** - - **XTubeUser**: XTube user profile - - **Xuite**: 隨意窩Xuite影音 - **XVideos** - **xvideos:quickies** - **XXXYMovies** @@ -1811,10 +1723,7 @@ # Supported sites - **YapFiles** - **Yappy** - **YappyProfile** - - **YesJapan** - - **yinyuetai:video**: 音悦Tai - **YleAreena** - - **Ynet** - **YouJizz** - **youku**: 优酷 - **youku:show** @@ -1862,6 +1771,9 @@ # Supported sites - **zingmp3:chart-home** - **zingmp3:chart-music-video** - **zingmp3:hub** + - **zingmp3:liveradio** + - **zingmp3:podcast** + - **zingmp3:podcast-episode** - **zingmp3:user** - **zingmp3:week-chart** - **zoom** diff --git a/test/conftest.py b/test/conftest.py index 15549d30b..2fbc269e1 100644 --- a/test/conftest.py +++ b/test/conftest.py @@ -19,3 +19,8 @@ def handler(request): pytest.skip(f'{RH_KEY} request handler is not available') return functools.partial(handler, logger=FakeLogger) + + +def validate_and_send(rh, req): + rh.validate(req) + return rh.send(req) diff --git a/test/helper.py b/test/helper.py index 539b2f618..4aca47025 100644 --- a/test/helper.py +++ b/test/helper.py @@ -10,7 +10,7 @@ import yt_dlp.extractor from yt_dlp import YoutubeDL from yt_dlp.compat import compat_os_name -from yt_dlp.utils import preferredencoding, write_string +from yt_dlp.utils import preferredencoding, try_call, write_string, find_available_port if 'pytest' in sys.modules: import pytest @@ -214,14 +214,19 @@ def sanitize(key, value): test_info_dict = { key: sanitize(key, value) for key, value in got_dict.items() - if value is not None and key not in IGNORED_FIELDS and not any( - key.startswith(f'{prefix}_') for prefix in IGNORED_PREFIXES) + if value is not None and key not in IGNORED_FIELDS and ( + not any(key.startswith(f'{prefix}_') for prefix in IGNORED_PREFIXES) + or key == '_old_archive_ids') } # display_id may be generated from id if test_info_dict.get('display_id') == test_info_dict.get('id'): test_info_dict.pop('display_id') + # release_year may be generated from release_date + if try_call(lambda: test_info_dict['release_year'] == int(test_info_dict['release_date'][:4])): + test_info_dict.pop('release_year') + # Check url for flat entries if got_dict.get('_type', 'video') != 'video' and got_dict.get('url'): test_info_dict['url'] = got_dict['url'] @@ -324,3 +329,8 @@ def http_server_port(httpd): else: sock = httpd.socket 
return sock.getsockname()[1] + + +def verify_address_availability(address): + if find_available_port(address) is None: + pytest.skip(f'Unable to bind to source address {address} (address may not exist)') diff --git a/test/test_YoutubeDL.py b/test/test_YoutubeDL.py index 0cf130db0..0087cbc94 100644 --- a/test/test_YoutubeDL.py +++ b/test/test_YoutubeDL.py @@ -140,6 +140,8 @@ def test(inp, *expected, multi=False): test('example-with-dashes', 'example-with-dashes') test('all', '2', '47', '45', 'example-with-dashes', '35') test('mergeall', '2+47+45+example-with-dashes+35', multi=True) + # See: https://github.com/yt-dlp/yt-dlp/pulls/8797 + test('7_a/worst', '35') def test_format_selection_audio(self): formats = [ @@ -728,7 +730,7 @@ def expect_same_infodict(out): self.assertEqual(got_dict.get(info_field), expected, info_field) return True - test('%()j', (expect_same_infodict, str)) + test('%()j', (expect_same_infodict, None)) # NA placeholder NA_TEST_OUTTMPL = '%(uploader_date)s-%(width)d-%(x|def)s-%(id)s.%(ext)s' @@ -797,6 +799,7 @@ def expect_same_infodict(out): test('%(title|%)s %(title|%%)s', '% %%') test('%(id+1-height+3)05d', '00158') test('%(width+100)05d', 'NA') + test('%(filesize*8)d', '8192') test('%(formats.0) 15s', ('% 15s' % FORMATS[0], None)) test('%(formats.0)r', (repr(FORMATS[0]), None)) test('%(height.0)03d', '001') diff --git a/test/test_networking.py b/test/test_networking.py index 5308c8d6f..62325aa8e 100644 --- a/test/test_networking.py +++ b/test/test_networking.py @@ -26,9 +26,9 @@ from email.message import Message from http.cookiejar import CookieJar -from test.helper import FakeYDL, http_server_port +from test.helper import FakeYDL, http_server_port, verify_address_availability from yt_dlp.cookies import YoutubeDLCookieJar -from yt_dlp.dependencies import brotli +from yt_dlp.dependencies import brotli, requests, urllib3 from yt_dlp.networking import ( HEADRequest, PUTRequest, @@ -43,6 +43,7 @@ HTTPError, IncompleteRead, NoSupportingHandlers, + ProxyError, RequestError, SSLError, TransportError, @@ -51,6 +52,8 @@ from yt_dlp.utils._utils import _YDLLogger as FakeLogger from yt_dlp.utils.networking import HTTPHeaderDict +from test.conftest import validate_and_send + TEST_DIR = os.path.dirname(os.path.abspath(__file__)) @@ -274,11 +277,6 @@ def send_header(self, keyword, value): self._headers_buffer.append(f'{keyword}: {value}\r\n'.encode()) -def validate_and_send(rh, req): - rh.validate(req) - return rh.send(req) - - class TestRequestHandlerBase: @classmethod def setup_class(cls): @@ -305,7 +303,7 @@ def setup_class(cls): class TestHTTPRequestHandler(TestRequestHandlerBase): - @pytest.mark.parametrize('handler', ['Urllib'], indirect=True) + @pytest.mark.parametrize('handler', ['Urllib', 'Requests'], indirect=True) def test_verify_cert(self, handler): with handler() as rh: with pytest.raises(CertificateVerifyError): @@ -316,7 +314,7 @@ def test_verify_cert(self, handler): assert r.status == 200 r.close() - @pytest.mark.parametrize('handler', ['Urllib'], indirect=True) + @pytest.mark.parametrize('handler', ['Urllib', 'Requests'], indirect=True) def test_ssl_error(self, handler): # HTTPS server with too old TLS version # XXX: is there a better way to test this than to create a new server? 
@@ -330,11 +328,11 @@ def test_ssl_error(self, handler): https_server_thread.start() with handler(verify=False) as rh: - with pytest.raises(SSLError, match='sslv3 alert handshake failure') as exc_info: + with pytest.raises(SSLError, match=r'ssl(?:v3|/tls) alert handshake failure') as exc_info: validate_and_send(rh, Request(f'https://127.0.0.1:{https_port}/headers')) assert not issubclass(exc_info.type, CertificateVerifyError) - @pytest.mark.parametrize('handler', ['Urllib'], indirect=True) + @pytest.mark.parametrize('handler', ['Urllib', 'Requests'], indirect=True) def test_percent_encode(self, handler): with handler() as rh: # Unicode characters should be encoded with uppercase percent-encoding @@ -346,7 +344,7 @@ def test_percent_encode(self, handler): assert res.status == 200 res.close() - @pytest.mark.parametrize('handler', ['Urllib'], indirect=True) + @pytest.mark.parametrize('handler', ['Urllib', 'Requests'], indirect=True) def test_remove_dot_segments(self, handler): with handler() as rh: # This isn't a comprehensive test, @@ -361,14 +359,14 @@ def test_remove_dot_segments(self, handler): assert res.url == f'http://127.0.0.1:{self.http_port}/headers' res.close() - @pytest.mark.parametrize('handler', ['Urllib'], indirect=True) + @pytest.mark.parametrize('handler', ['Urllib', 'Requests'], indirect=True) def test_unicode_path_redirection(self, handler): with handler() as rh: r = validate_and_send(rh, Request(f'http://127.0.0.1:{self.http_port}/302-non-ascii-redirect')) assert r.url == f'http://127.0.0.1:{self.http_port}/%E4%B8%AD%E6%96%87.html' r.close() - @pytest.mark.parametrize('handler', ['Urllib'], indirect=True) + @pytest.mark.parametrize('handler', ['Urllib', 'Requests'], indirect=True) def test_raise_http_error(self, handler): with handler() as rh: for bad_status in (400, 500, 599, 302): @@ -378,7 +376,7 @@ def test_raise_http_error(self, handler): # Should not raise an error validate_and_send(rh, Request('http://127.0.0.1:%d/gen_200' % self.http_port)).close() - @pytest.mark.parametrize('handler', ['Urllib'], indirect=True) + @pytest.mark.parametrize('handler', ['Urllib', 'Requests'], indirect=True) def test_response_url(self, handler): with handler() as rh: # Response url should be that of the last url in redirect chain @@ -389,7 +387,7 @@ def test_response_url(self, handler): assert res2.url == f'http://127.0.0.1:{self.http_port}/gen_200' res2.close() - @pytest.mark.parametrize('handler', ['Urllib'], indirect=True) + @pytest.mark.parametrize('handler', ['Urllib', 'Requests'], indirect=True) def test_redirect(self, handler): with handler() as rh: def do_req(redirect_status, method, assert_no_content=False): @@ -444,7 +442,7 @@ def do_req(redirect_status, method, assert_no_content=False): with pytest.raises(HTTPError): do_req(code, 'GET') - @pytest.mark.parametrize('handler', ['Urllib'], indirect=True) + @pytest.mark.parametrize('handler', ['Urllib', 'Requests'], indirect=True) def test_request_cookie_header(self, handler): # We should accept a Cookie header being passed as in normal headers and handle it appropriately. 
with handler() as rh: @@ -476,19 +474,19 @@ def test_request_cookie_header(self, handler): assert b'Cookie: test=ytdlp' not in data assert b'Cookie: test=test' in data - @pytest.mark.parametrize('handler', ['Urllib'], indirect=True) + @pytest.mark.parametrize('handler', ['Urllib', 'Requests'], indirect=True) def test_redirect_loop(self, handler): with handler() as rh: with pytest.raises(HTTPError, match='redirect loop'): validate_and_send(rh, Request(f'http://127.0.0.1:{self.http_port}/redirect_loop')) - @pytest.mark.parametrize('handler', ['Urllib'], indirect=True) + @pytest.mark.parametrize('handler', ['Urllib', 'Requests'], indirect=True) def test_incompleteread(self, handler): with handler(timeout=2) as rh: with pytest.raises(IncompleteRead): validate_and_send(rh, Request('http://127.0.0.1:%d/incompleteread' % self.http_port)).read() - @pytest.mark.parametrize('handler', ['Urllib'], indirect=True) + @pytest.mark.parametrize('handler', ['Urllib', 'Requests'], indirect=True) def test_cookies(self, handler): cookiejar = YoutubeDLCookieJar() cookiejar.set_cookie(http.cookiejar.Cookie( @@ -505,7 +503,7 @@ def test_cookies(self, handler): rh, Request(f'http://127.0.0.1:{self.http_port}/headers', extensions={'cookiejar': cookiejar})).read() assert b'Cookie: test=ytdlp' in data - @pytest.mark.parametrize('handler', ['Urllib'], indirect=True) + @pytest.mark.parametrize('handler', ['Urllib', 'Requests'], indirect=True) def test_headers(self, handler): with handler(headers=HTTPHeaderDict({'test1': 'test', 'test2': 'test2'})) as rh: @@ -521,7 +519,7 @@ def test_headers(self, handler): assert b'Test2: test2' not in data assert b'Test3: test3' in data - @pytest.mark.parametrize('handler', ['Urllib'], indirect=True) + @pytest.mark.parametrize('handler', ['Urllib', 'Requests'], indirect=True) def test_timeout(self, handler): with handler() as rh: # Default timeout is 20 seconds, so this should go through @@ -537,21 +535,24 @@ def test_timeout(self, handler): validate_and_send( rh, Request(f'http://127.0.0.1:{self.http_port}/timeout_1', extensions={'timeout': 4})) - @pytest.mark.parametrize('handler', ['Urllib'], indirect=True) + @pytest.mark.parametrize('handler', ['Urllib', 'Requests'], indirect=True) def test_source_address(self, handler): source_address = f'127.0.0.{random.randint(5, 255)}' + # on some systems these loopback addresses we need for testing may not be available + # see: https://github.com/yt-dlp/yt-dlp/issues/8890 + verify_address_availability(source_address) with handler(source_address=source_address) as rh: data = validate_and_send( rh, Request(f'http://127.0.0.1:{self.http_port}/source_address')).read().decode() assert source_address == data - @pytest.mark.parametrize('handler', ['Urllib'], indirect=True) + @pytest.mark.parametrize('handler', ['Urllib', 'Requests'], indirect=True) def test_gzip_trailing_garbage(self, handler): with handler() as rh: data = validate_and_send(rh, Request(f'http://localhost:{self.http_port}/trailing_garbage')).read().decode() assert data == '<html><video src="/vid.mp4" /></html>' diff --git a/yt_dlp/extractor/allstar.py b/yt_dlp/extractor/allstar.py new file mode 100644 index 000000000..87219f2f8 --- /dev/null +++ b/yt_dlp/extractor/allstar.py @@ -0,0 +1,253 @@ +import functools +import json + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + OnDemandPagedList, + int_or_none, + join_nonempty, + parse_qs, + urljoin, +) +from ..utils.traversal import traverse_obj + + +_FIELDS = ''' + _id + clipImageSource + clipImageThumb
clipLink + clipTitle + createdDate + shareId + user { _id } + username + views''' + +_EXTRA_FIELDS = ''' + clipLength + clipSizeBytes''' + +_QUERIES = { + 'clip': '''query ($id: String!) { + video: getClip(clipIdentifier: $id) { + %s %s + } + }''' % (_FIELDS, _EXTRA_FIELDS), + 'montage': '''query ($id: String!) { + video: getMontage(clipIdentifier: $id) { + %s + } + }''' % _FIELDS, + 'Clips': '''query ($page: Int!, $user: String!, $game: Int) { + videos: clips(search: createdDate, page: $page, user: $user, mobile: false, game: $game) { + data { %s %s } + } + }''' % (_FIELDS, _EXTRA_FIELDS), + 'Montages': '''query ($page: Int!, $user: String!) { + videos: montages(search: createdDate, page: $page, user: $user) { + data { %s } + } + }''' % _FIELDS, + 'Mobile Clips': '''query ($page: Int!, $user: String!) { + videos: clips(search: createdDate, page: $page, user: $user, mobile: true) { + data { %s %s } + } + }''' % (_FIELDS, _EXTRA_FIELDS), +} + + +class AllstarBaseIE(InfoExtractor): + @staticmethod + def _parse_video_data(video_data): + def media_url_or_none(path): + return urljoin('https://media.allstar.gg/', path) + + info = traverse_obj(video_data, { + 'id': ('_id', {str}), + 'display_id': ('shareId', {str}), + 'title': ('clipTitle', {str}), + 'url': ('clipLink', {media_url_or_none}), + 'thumbnails': (('clipImageThumb', 'clipImageSource'), {'url': {media_url_or_none}}), + 'duration': ('clipLength', {int_or_none}), + 'filesize': ('clipSizeBytes', {int_or_none}), + 'timestamp': ('createdDate', {functools.partial(int_or_none, scale=1000)}), + 'uploader': ('username', {str}), + 'uploader_id': ('user', '_id', {str}), + 'view_count': ('views', {int_or_none}), + }) + + if info.get('id') and info.get('url'): + basename = 'clip' if '/clips/' in info['url'] else 'montage' + info['webpage_url'] = f'https://allstar.gg/{basename}?{basename}={info["id"]}' + + info.update({ + 'extractor_key': AllstarIE.ie_key(), + 'extractor': AllstarIE.IE_NAME, + 'uploader_url': urljoin('https://allstar.gg/u/', info.get('uploader_id')), + }) + + return info + + def _call_api(self, query, variables, path, video_id=None, note=None): + response = self._download_json( + 'https://a1.allstar.gg/graphql', video_id, note=note, + headers={'content-type': 'application/json'}, + data=json.dumps({'variables': variables, 'query': query}).encode()) + + errors = traverse_obj(response, ('errors', ..., 'message', {str})) + if errors: + raise ExtractorError('; '.join(errors)) + + return traverse_obj(response, path) + + +class AllstarIE(AllstarBaseIE): + _VALID_URL = r'https?://(?:www\.)?allstar\.gg/(?P<type>(?:clip|montage))\?(?P=type)=(?P<id>[^/?#&]+)' + + _TESTS = [{ + 'url': 'https://allstar.gg/clip?clip=64482c2da9eec30008a67d1b', + 'info_dict': { + 'id': '64482c2da9eec30008a67d1b', + 'title': '4K on Inferno', + 'url': 'md5:66befb5381eef0c9456026386c25fa55', + 'thumbnail': r're:https://media\.allstar\.gg/.+\.(?:png|jpg)$', + 'uploader': 'chrk.', + 'ext': 'mp4', + 'duration': 20, + 'filesize': 21199257, + 'timestamp': 1682451501, + 'uploader_id': '62b8bdfc9021052f7905882d', + 'uploader_url': 'https://allstar.gg/u/62b8bdfc9021052f7905882d', + 'upload_date': '20230425', + 'view_count': int, + } + }, { + 'url': 'https://allstar.gg/clip?clip=8LJLY4JKB', + 'info_dict': { + 'id': '64a1ec6b887f4c0008dc50b8', + 'display_id': '8LJLY4JKB', + 'title': 'AK-47 3K on Mirage', + 'url': 'md5:dde224fd12f035c0e2529a4ae34c4283', + 'ext': 'mp4', + 'thumbnail': r're:https://media\.allstar\.gg/.+\.(?:png|jpg)$', + 'duration': 16, + 'filesize': 30175859,
'timestamp': 1688333419, + 'uploader': 'cherokee', + 'uploader_id': '62b8bdfc9021052f7905882d', + 'uploader_url': 'https://allstar.gg/u/62b8bdfc9021052f7905882d', + 'upload_date': '20230702', + 'view_count': int, + } + }, { + 'url': 'https://allstar.gg/montage?montage=643e64089da7e9363e1fa66c', + 'info_dict': { + 'id': '643e64089da7e9363e1fa66c', + 'display_id': 'APQLGM2IMXW', + 'title': 'cherokee Rapid Fire Snipers Montage', + 'url': 'md5:a3ee356022115db2b27c81321d195945', + 'thumbnail': r're:https://media\.allstar\.gg/.+\.(?:png|jpg)$', + 'ext': 'mp4', + 'timestamp': 1681810448, + 'uploader': 'cherokee', + 'uploader_id': '62b8bdfc9021052f7905882d', + 'uploader_url': 'https://allstar.gg/u/62b8bdfc9021052f7905882d', + 'upload_date': '20230418', + 'view_count': int, + } + }, { + 'url': 'https://allstar.gg/montage?montage=RILJMH6QOS', + 'info_dict': { + 'id': '64a2697372ce3703de29e868', + 'display_id': 'RILJMH6QOS', + 'title': 'cherokee Rapid Fire Snipers Montage', + 'url': 'md5:d5672e6f88579730c2310a80fdbc4030', + 'thumbnail': r're:https://media\.allstar\.gg/.+\.(?:png|jpg)$', + 'ext': 'mp4', + 'timestamp': 1688365434, + 'uploader': 'cherokee', + 'uploader_id': '62b8bdfc9021052f7905882d', + 'uploader_url': 'https://allstar.gg/u/62b8bdfc9021052f7905882d', + 'upload_date': '20230703', + 'view_count': int, + } + }] + + def _real_extract(self, url): + query_id, video_id = self._match_valid_url(url).group('type', 'id') + + return self._parse_video_data( + self._call_api( + _QUERIES.get(query_id), {'id': video_id}, ('data', 'video'), video_id)) + + +class AllstarProfileIE(AllstarBaseIE): + _VALID_URL = r'https?://(?:www\.)?allstar\.gg/(?:profile\?user=|u/)(?P<id>[^/?#&]+)' + + _TESTS = [{ + 'url': 'https://allstar.gg/profile?user=62b8bdfc9021052f7905882d', + 'info_dict': { + 'id': '62b8bdfc9021052f7905882d-clips', + 'title': 'cherokee - Clips', + }, + 'playlist_mincount': 15 + }, { + 'url': 'https://allstar.gg/u/cherokee?game=730&view=Clips', + 'info_dict': { + 'id': '62b8bdfc9021052f7905882d-clips-730', + 'title': 'cherokee - Clips - 730', + }, + 'playlist_mincount': 15 + }, { + 'url': 'https://allstar.gg/u/62b8bdfc9021052f7905882d?view=Montages', + 'info_dict': { + 'id': '62b8bdfc9021052f7905882d-montages', + 'title': 'cherokee - Montages', + }, + 'playlist_mincount': 4 + }, { + 'url': 'https://allstar.gg/profile?user=cherokee&view=Mobile Clips', + 'info_dict': { + 'id': '62b8bdfc9021052f7905882d-mobile', + 'title': 'cherokee - Mobile Clips', + }, + 'playlist_mincount': 1 + }] + + _PAGE_SIZE = 10 + + def _get_page(self, user_id, display_id, game, query, page_num): + page_num += 1 + + for video_data in self._call_api( + query, { + 'user': user_id, + 'page': page_num, + 'game': game, + }, ('data', 'videos', 'data'), display_id, f'Downloading page {page_num}'): + yield self._parse_video_data(video_data) + + def _real_extract(self, url): + display_id = self._match_id(url) + profile_data = self._download_json( + urljoin('https://api.allstar.gg/v1/users/profile/', display_id), display_id) + user_id = traverse_obj(profile_data, ('data', ('_id'), {str})) + if not user_id: + raise ExtractorError('Unable to extract the user id') + + username = traverse_obj(profile_data, ('data', 'profile', ('username'), {str})) + url_query = parse_qs(url) + game = traverse_obj(url_query, ('game', 0, {int_or_none})) + query_id = traverse_obj(url_query, ('view', 0), default='Clips') + + if query_id not in ('Clips', 'Montages', 'Mobile Clips'): + raise ExtractorError(f'Unsupported playlist URL type {query_id!r}') + + return
self.playlist_result( + OnDemandPagedList( + functools.partial( + self._get_page, user_id, display_id, game, _QUERIES.get(query_id)), self._PAGE_SIZE), + playlist_id=join_nonempty(user_id, query_id.lower().split()[0], game), + playlist_title=join_nonempty((username or display_id), query_id, game, delim=' - ')) diff --git a/yt_dlp/extractor/altcensored.py b/yt_dlp/extractor/altcensored.py new file mode 100644 index 000000000..0e1627bfd --- /dev/null +++ b/yt_dlp/extractor/altcensored.py @@ -0,0 +1,96 @@ +import re + +from .archiveorg import ArchiveOrgIE +from .common import InfoExtractor +from ..utils import ( + InAdvancePagedList, + int_or_none, + orderedSet, + str_to_int, + urljoin, +) + + +class AltCensoredIE(InfoExtractor): + IE_NAME = 'altcensored' + _VALID_URL = r'https?://(?:www\.)?altcensored\.com/(?:watch\?v=|embed/)(?P<id>[^/?#]+)' + _TESTS = [{ + 'url': 'https://www.altcensored.com/watch?v=k0srjLSkga8', + 'info_dict': { + 'id': 'youtube-k0srjLSkga8', + 'ext': 'webm', + 'title': "QUELLES SONT LES CONSÉQUENCES DE L'HYPERSEXUALISATION DE LA SOCIÉTÉ ?", + 'display_id': 'k0srjLSkga8.webm', + 'release_date': '20180403', + 'creator': 'Virginie Vota', + 'release_year': 2018, + 'upload_date': '20230318', + 'uploader': 'admin@altcensored.com', + 'description': 'md5:0b38a8fc04103579d5c1db10a247dc30', + 'timestamp': 1679161343, + 'track': 'k0srjLSkga8', + 'duration': 926.09, + 'thumbnail': 'https://archive.org/download/youtube-k0srjLSkga8/youtube-k0srjLSkga8.thumbs/k0srjLSkga8_000925.jpg', + 'view_count': int, + 'categories': ['News & Politics'], + } + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + return { + '_type': 'url_transparent', + 'url': f'https://archive.org/details/youtube-{video_id}', + 'ie_key': ArchiveOrgIE.ie_key(), + 'view_count': str_to_int(self._html_search_regex( + r'YouTube Views:(?:\s|&nbsp;)*([\d,]+)', webpage, 'view count', default=None)), + 'categories': self._html_search_regex( + r'<a href="/category/\d+">\s*\n?\s*([^<]+)</a>', + webpage, 'category', default='').split() or None, + } + + +class AltCensoredChannelIE(InfoExtractor): + IE_NAME = 'altcensored:channel' + _VALID_URL = r'https?://(?:www\.)?altcensored\.com/channel/(?!page|table)(?P<id>[^/?#]+)' + _PAGE_SIZE = 24 + _TESTS = [{ + 'url': 'https://www.altcensored.com/channel/UCFPTO55xxHqFqkzRZHu4kcw', + 'info_dict': { + 'title': 'Virginie Vota', + 'id': 'UCFPTO55xxHqFqkzRZHu4kcw', + }, + 'playlist_count': 91 + }, { + 'url': 'https://altcensored.com/channel/UC9CcJ96HKMWn0LZlcxlpFTw', + 'info_dict': { + 'title': 'yukikaze775', + 'id': 'UC9CcJ96HKMWn0LZlcxlpFTw', + }, + 'playlist_count': 4 + }] + + def _real_extract(self, url): + channel_id = self._match_id(url) + webpage = self._download_webpage( + url, channel_id, 'Download channel webpage', 'Unable to get channel webpage') + title = self._html_search_meta('altcen_title', webpage, 'title', fatal=False) + page_count = int_or_none(self._html_search_regex( + r'<a[^>]+href="/channel/\w+/page/(\d+)">(?:\1)</a>', + webpage, 'page count', default='1')) + + def page_func(page_num): + page_num += 1 + webpage = self._download_webpage( + f'https://altcensored.com/channel/{channel_id}/page/{page_num}', + channel_id, note=f'Downloading page {page_num}') + + items = re.findall(r'<a[^>]+href="(/watch\?v=[^"]+)', webpage) + return [self.url_result(urljoin('https://www.altcensored.com', path), AltCensoredIE) + for path in orderedSet(items)] + + return self.playlist_result( + InAdvancePagedList(page_func, page_count, self._PAGE_SIZE), + playlist_id=channel_id,
playlist_title=title) diff --git a/yt_dlp/extractor/amadeustv.py b/yt_dlp/extractor/amadeustv.py new file mode 100644 index 000000000..2f5ca9137 --- /dev/null +++ b/yt_dlp/extractor/amadeustv.py @@ -0,0 +1,77 @@ +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + float_or_none, + int_or_none, + parse_iso8601, + url_or_none, +) +from ..utils.traversal import traverse_obj + + +class AmadeusTVIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?amadeus\.tv/library/(?P<id>[\da-f]+)' + _TESTS = [{ + 'url': 'http://www.amadeus.tv/library/65091a87ff85af59d9fc54c3', + 'info_dict': { + 'id': '5576678021301411311', + 'ext': 'mp4', + 'title': 'Jieon Park - 第五届珠海莫扎特国际青少年音乐周小提琴C组第三轮', + 'thumbnail': 'http://1253584441.vod2.myqcloud.com/a0046a27vodtransbj1253584441/7db4af535576678021301411311/coverBySnapshot_10_0.jpg', + 'duration': 1264.8, + 'upload_date': '20230918', + 'timestamp': 1695034800, + 'display_id': '65091a87ff85af59d9fc54c3', + 'view_count': int, + 'description': 'md5:a0357b9c215489e2067cbae0b777bb95', + } + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + + nuxt_data = self._search_nuxt_data(webpage, display_id, traverse=('fetch', '0')) + video_id = traverse_obj(nuxt_data, ('item', 'video', {str})) + + if not video_id: + raise ExtractorError('Unable to extract actual video ID') + + video_data = self._download_json( + f'http://playvideo.qcloud.com/getplayinfo/v2/1253584441/{video_id}', + video_id, headers={'Referer': 'http://www.amadeus.tv/'}) + + formats = [] + for video in traverse_obj(video_data, ('videoInfo', ('sourceVideo', ('transcodeList', ...)), {dict})): + if not url_or_none(video.get('url')): + continue + formats.append({ + **traverse_obj(video, { + 'url': 'url', + 'format_id': ('definition', {lambda x: f'http-{x or "0"}'}), + 'width': ('width', {int_or_none}), + 'height': ('height', {int_or_none}), + 'filesize': (('totalSize', 'size'), {int_or_none}), + 'vcodec': ('videoStreamList', 0, 'codec'), + 'acodec': ('audioStreamList', 0, 'codec'), + 'fps': ('videoStreamList', 0, 'fps', {float_or_none}), + }, get_all=False), + 'http_headers': {'Referer': 'http://www.amadeus.tv/'}, + }) + + return { + 'id': video_id, + 'display_id': display_id, + 'formats': formats, + **traverse_obj(video_data, { + 'title': ('videoInfo', 'basicInfo', 'name', {str}), + 'thumbnail': ('coverInfo', 'coverUrl', {url_or_none}), + 'duration': ('videoInfo', 'sourceVideo', ('floatDuration', 'duration'), {float_or_none}), + }, get_all=False), + **traverse_obj(nuxt_data, ('item', { + 'title': (('title', 'title_en', 'title_cn'), {str}), + 'description': (('description', 'description_en', 'description_cn'), {str}), + 'timestamp': ('date', {parse_iso8601}), + 'view_count': ('view', {int_or_none}), + }), get_all=False), + } diff --git a/yt_dlp/extractor/aol.py b/yt_dlp/extractor/aol.py index 6949ca974..455f66795 100644 --- a/yt_dlp/extractor/aol.py +++ b/yt_dlp/extractor/aol.py @@ -10,6 +10,7 @@ class AolIE(YahooIE): # XXX: Do not subclass from concrete IE + _WORKING = False IE_NAME = 'aol.com' _VALID_URL = r'(?:aol-video:|https?://(?:www\.)?aol\.(?:com|ca|co\.uk|de|jp)/video/(?:[^/]+/)*)(?P<id>\d{9}|[0-9a-f]{24}|[0-9a-f]{8}-(?:[0-9a-f]{4}-){3}[0-9a-f]{12})' diff --git a/yt_dlp/extractor/archiveorg.py b/yt_dlp/extractor/archiveorg.py index a0b26ac5a..3bb6f2e31 100644 --- a/yt_dlp/extractor/archiveorg.py +++ b/yt_dlp/extractor/archiveorg.py @@ -52,7 +52,6 @@ class ArchiveOrgIE(InfoExtractor): 'creator': 'SRI International',
'uploader': 'laura@archive.org', 'thumbnail': r're:https://archive\.org/download/.*\.jpg', - 'release_year': 1968, 'display_id': 'XD300-23_68HighlightsAResearchCntAugHumanIntellect.cdr', 'track': 'XD300-23 68HighlightsAResearchCntAugHumanIntellect', @@ -134,7 +133,6 @@ class ArchiveOrgIE(InfoExtractor): 'album': '1977-05-08 - Barton Hall - Cornell University', 'release_date': '19770508', 'display_id': 'gd1977-05-08d01t07.flac', - 'release_year': 1977, 'track_number': 7, }, }, { diff --git a/yt_dlp/extractor/ard.py b/yt_dlp/extractor/ard.py index ca1faa7d0..f4b1cd075 100644 --- a/yt_dlp/extractor/ard.py +++ b/yt_dlp/extractor/ard.py @@ -1,24 +1,24 @@ -import json import re +from functools import partial from .common import InfoExtractor -from .generic import GenericIE from ..utils import ( + OnDemandPagedList, + bug_reports_message, determine_ext, - ExtractorError, int_or_none, + join_nonempty, + make_archive_id, parse_duration, - qualities, + parse_iso8601, + remove_start, str_or_none, - try_get, unified_strdate, - unified_timestamp, - update_url, update_url_query, url_or_none, xpath_text, ) -from ..compat import compat_etree_fromstring +from ..utils.traversal import traverse_obj class ARDMediathekBaseIE(InfoExtractor): @@ -61,45 +61,6 @@ def _parse_media_info(self, media_info, video_id, fsk): 'subtitles': subtitles, } - def _ARD_extract_episode_info(self, title): - """Try to extract season/episode data from the title.""" - res = {} - if not title: - return res - - for pattern in [ - # Pattern for title like "Homo sapiens (S06/E07) - Originalversion" - # from: https://www.ardmediathek.de/one/sendung/doctor-who/Y3JpZDovL3dkci5kZS9vbmUvZG9jdG9yIHdobw - r'.*(?P<ep_info> \(S(?P<season_number>\d+)/E(?P<episode_number>\d+)\)).*', - # E.g.: title="Fritjof aus Norwegen (2) (AD)" - # from: https://www.ardmediathek.de/ard/sammlung/der-krieg-und-ich/68cMkqJdllm639Skj4c7sS/ - r'.*(?P<ep_info> \((?:Folge |Teil )?(?P<episode_number>\d+)(?:/\d+)?\)).*', - r'.*(?P<ep_info>Folge (?P<episode_number>\d+)(?:\:| -|) )\"(?P<episode>.+)\".*', - # E.g.: title="Folge 25/42: Symmetrie" - # from: https://www.ardmediathek.de/ard/video/grips-mathe/folge-25-42-symmetrie/ard-alpha/Y3JpZDovL2JyLmRlL3ZpZGVvLzMyYzI0ZjczLWQ1N2MtNDAxNC05ZmZhLTFjYzRkZDA5NDU5OQ/ - # E.g.: title="Folge 1063 - Vertrauen" - # from: https://www.ardmediathek.de/ard/sendung/die-fallers/Y3JpZDovL3N3ci5kZS8yMzAyMDQ4/ - r'.*(?P<ep_info>Folge (?P<episode_number>\d+)(?:/\d+)?(?:\:| -|) ).*', - ]: - m = re.match(pattern, title) - if m: - groupdict = m.groupdict() - res['season_number'] = int_or_none(groupdict.get('season_number')) - res['episode_number'] = int_or_none(groupdict.get('episode_number')) - res['episode'] = str_or_none(groupdict.get('episode')) - # Build the episode title by removing numeric episode information: - if groupdict.get('ep_info') and not res['episode']: - res['episode'] = str_or_none( - title.replace(groupdict.get('ep_info'), '')) - if res['episode']: - res['episode'] = res['episode'].strip() - break - - # As a fallback use the whole title as the episode name: - if not res.get('episode'): - res['episode'] = title.strip() - return res - def _extract_formats(self, media_info, video_id): type_ = media_info.get('_type') media_array = media_info.get('_mediaArray', []) @@ -155,144 +116,12 @@ def _extract_formats(self, media_info, video_id): return formats -class ARDMediathekIE(ARDMediathekBaseIE): - IE_NAME = 'ARD:mediathek' - _VALID_URL = r'^https?://(?:(?:(?:www|classic)\.)?ardmediathek\.de|mediathek\.(?:daserste|rbb-online)\.de|one\.ard\.de)/(?:.*/)(?P<video_id>[0-9]+|[^0-9][^/\?]+)[^/\?]*(?:\?.*)?'
- - _TESTS = [{ - # available till 26.07.2022 - 'url': 'http://www.ardmediathek.de/tv/S%C3%9CDLICHT/Was-ist-die-Kunst-der-Zukunft-liebe-Ann/BR-Fernsehen/Video?bcastId=34633636&documentId=44726822', - 'info_dict': { - 'id': '44726822', - 'ext': 'mp4', - 'title': 'Was ist die Kunst der Zukunft, liebe Anna McCarthy?', - 'description': 'md5:4ada28b3e3b5df01647310e41f3a62f5', - 'duration': 1740, - }, - 'params': { - # m3u8 download - 'skip_download': True, - } - }, { - 'url': 'https://one.ard.de/tv/Mord-mit-Aussicht/Mord-mit-Aussicht-6-39-T%C3%B6dliche-Nach/ONE/Video?bcastId=46384294&documentId=55586872', - 'only_matching': True, - }, { - # audio - 'url': 'http://www.ardmediathek.de/tv/WDR-H%C3%B6rspiel-Speicher/Tod-eines-Fu%C3%9Fballers/WDR-3/Audio-Podcast?documentId=28488308&bcastId=23074086', - 'only_matching': True, - }, { - 'url': 'http://mediathek.daserste.de/sendungen_a-z/328454_anne-will/22429276_vertrauen-ist-gut-spionieren-ist-besser-geht', - 'only_matching': True, - }, { - # audio - 'url': 'http://mediathek.rbb-online.de/radio/Hörspiel/Vor-dem-Fest/kulturradio/Audio?documentId=30796318&topRessort=radio&bcastId=9839158', - 'only_matching': True, - }, { - 'url': 'https://classic.ardmediathek.de/tv/Panda-Gorilla-Co/Panda-Gorilla-Co-Folge-274/Das-Erste/Video?bcastId=16355486&documentId=58234698', - 'only_matching': True, - }] - - @classmethod - def suitable(cls, url): - return False if ARDBetaMediathekIE.suitable(url) else super(ARDMediathekIE, cls).suitable(url) - - def _real_extract(self, url): - # determine video id from url - m = self._match_valid_url(url) - - document_id = None - - numid = re.search(r'documentId=([0-9]+)', url) - if numid: - document_id = video_id = numid.group(1) - else: - video_id = m.group('video_id') - - webpage = self._download_webpage(url, video_id) - - ERRORS = ( - ('>Leider liegt eine Störung vor.', 'Video %s is unavailable'), - ('>Der gewünschte Beitrag ist nicht mehr verfügbar.<', - 'Video %s is no longer available'), - ) - - for pattern, message in ERRORS: - if pattern in webpage: - raise ExtractorError(message % video_id, expected=True) - - if re.search(r'[\?&]rss($|[=&])', url): - doc = compat_etree_fromstring(webpage.encode('utf-8')) - if doc.tag == 'rss': - return GenericIE()._extract_rss(url, video_id, doc) - - title = self._og_search_title(webpage, default=None) or self._html_search_regex( - [r'<h1(?:\s+class="boxTopHeadline")?>(.*?)</h1>', - r'<meta name="dcterms\.title" content="(.*?)"/>', - r'

(.*?)

', - r']*>(.*?)'], - webpage, 'title') - description = self._og_search_description(webpage, default=None) or self._html_search_meta( - 'dcterms.abstract', webpage, 'description', default=None) - if description is None: - description = self._html_search_meta( - 'description', webpage, 'meta description', default=None) - if description is None: - description = self._html_search_regex( - r'(.+?)

', - webpage, 'teaser text', default=None) - - # Thumbnail is sometimes not present. - # It is in the mobile version, but that seems to use a different URL - # structure altogether. - thumbnail = self._og_search_thumbnail(webpage, default=None) - - media_streams = re.findall(r'''(?x) - mediaCollection\.addMediaStream\([0-9]+,\s*[0-9]+,\s*"[^"]*",\s* - "([^"]+)"''', webpage) - - if media_streams: - QUALITIES = qualities(['lo', 'hi', 'hq']) - formats = [] - for furl in set(media_streams): - if furl.endswith('.f4m'): - fid = 'f4m' - else: - fid_m = re.match(r'.*\.([^.]+)\.[^.]+$', furl) - fid = fid_m.group(1) if fid_m else None - formats.append({ - 'quality': QUALITIES(fid), - 'format_id': fid, - 'url': furl, - }) - info = { - 'formats': formats, - } - else: # request JSON file - if not document_id: - video_id = self._search_regex( - (r'/play/(?:config|media|sola)/(\d+)', r'contentId["\']\s*:\s*(\d+)'), - webpage, 'media id', default=None) - info = self._extract_media_info( - 'http://www.ardmediathek.de/play/media/%s' % video_id, - webpage, video_id) - - info.update({ - 'id': video_id, - 'title': title, - 'description': description, - 'thumbnail': thumbnail, - }) - info.update(self._ARD_extract_episode_info(info['title'])) - - return info - - class ARDIE(InfoExtractor): _VALID_URL = r'(?Phttps?://(?:www\.)?daserste\.de/(?:[^/?#&]+/)+(?P[^/?#&]+))\.html' _TESTS = [{ # available till 7.12.2023 'url': 'https://www.daserste.de/information/talk/maischberger/videos/maischberger-video-424.html', - 'md5': 'a438f671e87a7eba04000336a119ccc4', + 'md5': '94812e6438488fb923c361a44469614b', 'info_dict': { 'id': 'maischberger-video-424', 'display_id': 'maischberger-video-424', @@ -399,31 +228,35 @@ def _real_extract(self, url): } -class ARDBetaMediathekIE(ARDMediathekBaseIE): +class ARDBetaMediathekIE(InfoExtractor): + IE_NAME = 'ARDMediathek' _VALID_URL = r'''(?x)https:// (?:(?:beta|www)\.)?ardmediathek\.de/ - (?:(?P[^/]+)/)? - (?:player|live|video|(?Psendung|sammlung))/ - (?:(?P(?(playlist)[^?#]+?|[^?#]+))/)? - (?P(?(playlist)|Y3JpZDovL)[a-zA-Z0-9]+) - (?(playlist)/(?P\d+)?/?(?:[?#]|$))''' + (?:[^/]+/)? + (?:player|live|video)/ + (?:[^?#]+/)? 
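The rewritten `_VALID_URL` above intentionally stops matching the old `sendung`/`sammlung` playlist paths, which move to the new ARDMediathekCollectionIE further down. A quick sanity check with plain `re` (a sketch, not part of the patch; the sample URLs are taken from the tests below):

    import re

    # Same pattern as above; (?x) verbose mode ignores the whitespace.
    VALID_URL = r'''(?x)https://
        (?:(?:beta|www)\.)?ardmediathek\.de/
        (?:[^/]+/)?
        (?:player|live|video)/
        (?:[^?#]+/)?
        (?P<id>[a-zA-Z0-9]+)
        /?(?:[?#]|$)'''

    video = ('https://www.ardmediathek.de/video/filme-im-mdr/liebe-auf-vier-pfoten/'
             'mdr-fernsehen/Y3JpZDovL21kci5kZS9zZW5kdW5nLzI4MjA0MC80MjIwOTEtNDAyNTM0')
    assert re.match(VALID_URL, video).group('id').startswith('Y3JpZDovL')  # base64 of 'crid://'

    # Playlist URLs no longer match this extractor:
    assert not re.match(VALID_URL, 'https://www.ardmediathek.de/sendung/tatort/Y3JpZDovL2Rhc2Vyc3RlLmRlL3RhdG9ydA')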
     _TESTS = [{
-        'url': 'https://www.ardmediathek.de/video/filme-im-mdr/wolfsland-die-traurigen-schwestern/mdr-fernsehen/Y3JpZDovL21kci5kZS9iZWl0cmFnL2Ntcy8xZGY0ZGJmZS00ZWQwLTRmMGItYjhhYy0wOGQ4ZmYxNjVhZDI',
-        'md5': '3fd5fead7a370a819341129c8d713136',
+        'url': 'https://www.ardmediathek.de/video/filme-im-mdr/liebe-auf-vier-pfoten/mdr-fernsehen/Y3JpZDovL21kci5kZS9zZW5kdW5nLzI4MjA0MC80MjIwOTEtNDAyNTM0',
+        'md5': 'b6e8ab03f2bcc6e1f9e6cef25fcc03c4',
         'info_dict': {
-            'display_id': 'filme-im-mdr/wolfsland-die-traurigen-schwestern/mdr-fernsehen',
-            'id': '12172961',
-            'title': 'Wolfsland - Die traurigen Schwestern',
-            'description': r're:^Als der Polizeiobermeister Raaben',
-            'duration': 5241,
-            'thumbnail': 'https://api.ardmediathek.de/image-service/images/urn:ard:image:efa186f7b0054957',
-            'timestamp': 1670710500,
-            'upload_date': '20221210',
+            'display_id': 'Y3JpZDovL21kci5kZS9zZW5kdW5nLzI4MjA0MC80MjIwOTEtNDAyNTM0',
+            'id': '12939099',
+            'title': 'Liebe auf vier Pfoten',
+            'description': r're:^Claudia Schmitt, Anwältin in Salzburg',
+            'duration': 5222,
+            'thumbnail': 'https://api.ardmediathek.de/image-service/images/urn:ard:image:aee7cbf8f06de976?w=960&ch=ae4d0f2ee47d8b9b',
+            'timestamp': 1701343800,
+            'upload_date': '20231130',
             'ext': 'mp4',
-            'age_limit': 12,
-            'episode': 'Wolfsland - Die traurigen Schwestern',
-            'series': 'Filme im MDR'
+            'episode': 'Liebe auf vier Pfoten',
+            'series': 'Filme im MDR',
+            'age_limit': 0,
+            'channel': 'MDR',
+            '_old_archive_ids': ['ardbetamediathek Y3JpZDovL21kci5kZS9zZW5kdW5nLzI4MjA0MC80MjIwOTEtNDAyNTM0'],
         },
     }, {
         'url': 'https://www.ardmediathek.de/mdr/video/die-robuste-roswita/Y3JpZDovL21kci5kZS9iZWl0cmFnL2Ntcy84MWMxN2MzZC0wMjkxLTRmMzUtODk4ZS0wYzhlOWQxODE2NGI/',
@@ -450,11 +283,31 @@ class ARDBetaMediathekIE(ARDMediathekBaseIE):
             'timestamp': 1636398000,
             'description': 'md5:39578c7b96c9fe50afdf5674ad985e6b',
             'upload_date': '20211108',
-            'display_id': 'tagesschau-oder-tagesschau-20-00-uhr/das-erste',
+            'display_id': 'Y3JpZDovL2Rhc2Vyc3RlLmRlL3RhZ2Vzc2NoYXUvZmM4ZDUxMjgtOTE0ZC00Y2MzLTgzNzAtNDZkNGNiZWJkOTll',
             'duration': 915,
             'episode': 'tagesschau, 20:00 Uhr',
             'series': 'tagesschau',
-            'thumbnail': 'https://api.ardmediathek.de/image-service/images/urn:ard:image:fbb21142783b0a49',
+            'thumbnail': 'https://api.ardmediathek.de/image-service/images/urn:ard:image:fbb21142783b0a49?w=960&ch=ee69108ae344f678',
+            'channel': 'ARD-Aktuell',
+            '_old_archive_ids': ['ardbetamediathek Y3JpZDovL2Rhc2Vyc3RlLmRlL3RhZ2Vzc2NoYXUvZmM4ZDUxMjgtOTE0ZC00Y2MzLTgzNzAtNDZkNGNiZWJkOTll'],
+        },
+    }, {
+        'url': 'https://www.ardmediathek.de/video/7-tage/7-tage-unter-harten-jungs/hr-fernsehen/N2I2YmM5MzgtNWFlOS00ZGFlLTg2NzMtYzNjM2JlNjk4MDg3',
+        'md5': 'c428b9effff18ff624d4f903bda26315',
+        'info_dict': {
+            'id': '94834686',
+            'ext': 'mp4',
+            'duration': 2700,
+            'episode': '7 Tage ... unter harten Jungs',
+            'description': 'md5:0f215470dcd2b02f59f4bd10c963f072',
+            'upload_date': '20231005',
+            'timestamp': 1696491171,
+            'display_id': 'N2I2YmM5MzgtNWFlOS00ZGFlLTg2NzMtYzNjM2JlNjk4MDg3',
+            'series': '7 Tage ...',
+            'channel': 'HR',
+            'thumbnail': 'https://api.ardmediathek.de/image-service/images/urn:ard:image:f6e6d5ffac41925c?w=960&ch=fa32ba69bc87989a',
+            'title': '7 Tage ... unter harten Jungs',
+            '_old_archive_ids': ['ardbetamediathek N2I2YmM5MzgtNWFlOS00ZGFlLTg2NzMtYzNjM2JlNjk4MDg3'],
+        },
     }, {
         'url': 'https://beta.ardmediathek.de/ard/video/Y3JpZDovL2Rhc2Vyc3RlLmRlL3RhdG9ydC9mYmM4NGM1NC0xNzU4LTRmZGYtYWFhZS0wYzcyZTIxNGEyMDE',
@@ -471,203 +324,239 @@ class ARDBetaMediathekIE(ARDMediathekBaseIE):
     }, {
         'url': 'https://www.ardmediathek.de/swr/live/Y3JpZDovL3N3ci5kZS8xMzQ4MTA0Mg',
         'only_matching': True,
+    }, {
+        'url': 'https://www.ardmediathek.de/video/coronavirus-update-ndr-info/astrazeneca-kurz-lockdown-und-pims-syndrom-81/ndr/Y3JpZDovL25kci5kZS84NzE0M2FjNi0wMWEwLTQ5ODEtOTE5NS1mOGZhNzdhOTFmOTI/',
+        'only_matching': True,
+    }]
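The `_extract_episode_info` helper below folds the old imperative loop into a single `traverse_obj` call: each pattern is piped through `functools.partial(re.match, string=title)`, non-matches are discarded, and `get_all=False` keeps only the first match. Roughly equivalent to this plain-Python sketch (an approximation; `traverse_obj` additionally prunes `None` values from the result dict):

    import re

    def extract_episode_info(title, patterns):
        for pattern in patterns:  # first match wins, mirroring get_all=False
            m = re.match(pattern, title)
            if not m:
                continue
            groups = m.groupdict()
            # 'episode' falls back from the named group to the title with the
            # numeric episode info stripped, then to the whole title
            episode = (groups.get('episode')
                       or (title.replace(groups['ep_info'], '') if groups.get('ep_info') else None)
                       or groups.get('title'))
            return {
                'season_number': int(groups['season_number']) if groups.get('season_number') else None,
                'episode_number': int(groups['episode_number']) if groups.get('episode_number') else None,
                'episode': episode.strip() if episode else None,
            }
        return {}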
+
+    def _extract_episode_info(self, title):
+        patterns = [
+            # Pattern for title like "Homo sapiens (S06/E07) - Originalversion"
+            # from: https://www.ardmediathek.de/one/sendung/doctor-who/Y3JpZDovL3dkci5kZS9vbmUvZG9jdG9yIHdobw
+            r'.*(?P<ep_info> \(S(?P<season_number>\d+)/E(?P<episode_number>\d+)\)).*',
+            # E.g.: title="Fritjof aus Norwegen (2) (AD)"
+            # from: https://www.ardmediathek.de/ard/sammlung/der-krieg-und-ich/68cMkqJdllm639Skj4c7sS/
+            r'.*(?P<ep_info> \((?:Folge |Teil )?(?P<episode_number>\d+)(?:/\d+)?\)).*',
+            r'.*(?P<ep_info>Folge (?P<episode_number>\d+)(?:\:| -|) )\"(?P<episode>.+)\".*',
+            # E.g.: title="Folge 25/42: Symmetrie"
+            # from: https://www.ardmediathek.de/ard/video/grips-mathe/folge-25-42-symmetrie/ard-alpha/Y3JpZDovL2JyLmRlL3ZpZGVvLzMyYzI0ZjczLWQ1N2MtNDAxNC05ZmZhLTFjYzRkZDA5NDU5OQ/
+            # E.g.: title="Folge 1063 - Vertrauen"
+            # from: https://www.ardmediathek.de/ard/sendung/die-fallers/Y3JpZDovL3N3ci5kZS8yMzAyMDQ4/
+            r'.*(?P<ep_info>Folge (?P<episode_number>\d+)(?:/\d+)?(?:\:| -|) ).*',
+            # As a fallback use the full title
+            r'(?P<title>.*)',
+        ]
+
+        return traverse_obj(patterns, (..., {partial(re.match, string=title)}, {
+            'season_number': ('season_number', {int_or_none}),
+            'episode_number': ('episode_number', {int_or_none}),
+            'episode': ((
+                ('episode', {str_or_none}),
+                ('ep_info', {lambda x: title.replace(x, '')}),
+                ('title', {str}),
+            ), {str.strip}),
+        }), get_all=False)
+
+    def _real_extract(self, url):
+        display_id = self._match_id(url)
+
+        page_data = self._download_json(
+            f'https://api.ardmediathek.de/page-gateway/pages/ard/item/{display_id}', display_id, query={
+                'embedded': 'false',
+                'mcV6': 'true',
+            })
+
+        # For user convenience we use the old contentId instead of the longer crid
+        # Ref: https://github.com/yt-dlp/yt-dlp/issues/8731#issuecomment-1874398283
+        old_id = traverse_obj(page_data, ('tracking', 'atiCustomVars', 'contentId', {int}))
+        if old_id is not None:
+            video_id = str(old_id)
+            archive_ids = [make_archive_id(ARDBetaMediathekIE, display_id)]
+        else:
+            self.report_warning(f'Could not extract contentId{bug_reports_message()}')
+            video_id = display_id
+            archive_ids = None
+
+        player_data = traverse_obj(
+            page_data, ('widgets', lambda _, v: v['type'] in ('player_ondemand', 'player_live'), {dict}), get_all=False)
+        is_live = player_data.get('type') == 'player_live'
+        media_data = traverse_obj(player_data, ('mediaCollection', 'embedded', {dict}))
+
+        if player_data.get('blockedByFsk'):
+            self.raise_no_formats('This video is only available after 22:00', expected=True)
+
+        formats = []
+        subtitles = {}
+        for stream in traverse_obj(media_data, ('streams', ..., {dict})):
+            kind = stream.get('kind')
+            # Prioritize main stream over sign language and others
+            preference = 1 if kind == 'main' else None
+            for media in traverse_obj(stream, ('media', lambda _, v: url_or_none(v['url']))):
+                media_url = media['url']
+
+                audio_kind = traverse_obj(media, (
+                    'audios', 0, 'kind', {str}), default='').replace('standard', '')
+                lang_code = 
traverse_obj(media, ('audios', 0, 'languageCode', {str})) or 'deu' + lang = join_nonempty(lang_code, audio_kind) + language_preference = 10 if lang == 'deu' else -10 + + if determine_ext(media_url) == 'm3u8': + fmts, subs = self._extract_m3u8_formats_and_subtitles( + media_url, video_id, m3u8_id=f'hls-{kind}', preference=preference, fatal=False, live=is_live) + for f in fmts: + f['language'] = lang + f['language_preference'] = language_preference + formats.extend(fmts) + self._merge_subtitles(subs, target=subtitles) + else: + formats.append({ + 'url': media_url, + 'format_id': f'http-{kind}', + 'preference': preference, + 'language': lang, + 'language_preference': language_preference, + **traverse_obj(media, { + 'format_note': ('forcedLabel', {str}), + 'width': ('maxHResolutionPx', {int_or_none}), + 'height': ('maxVResolutionPx', {int_or_none}), + 'vcodec': ('videoCodec', {str}), + }), + }) + + for sub in traverse_obj(media_data, ('subtitles', ..., {dict})): + for sources in traverse_obj(sub, ('sources', lambda _, v: url_or_none(v['url']))): + subtitles.setdefault(sub.get('languageCode') or 'deu', []).append({ + 'url': sources['url'], + 'ext': {'webvtt': 'vtt', 'ebutt': 'ttml'}.get(sources.get('kind')), + }) + + age_limit = traverse_obj(page_data, ('fskRating', {lambda x: remove_start(x, 'FSK')}, {int_or_none})) + return { + 'id': video_id, + 'display_id': display_id, + 'formats': formats, + 'subtitles': subtitles, + 'is_live': is_live, + 'age_limit': age_limit, + **traverse_obj(media_data, ('meta', { + 'title': 'title', + 'description': 'synopsis', + 'timestamp': ('broadcastedOnDateTime', {parse_iso8601}), + 'series': 'seriesTitle', + 'thumbnail': ('images', 0, 'url', {url_or_none}), + 'duration': ('durationSeconds', {int_or_none}), + 'channel': 'clipSourceName', + })), + **self._extract_episode_info(page_data.get('title')), + '_old_archive_ids': archive_ids, + } + + +class ARDMediathekCollectionIE(InfoExtractor): + _VALID_URL = r'''(?x)https:// + (?:(?:beta|www)\.)?ardmediathek\.de/ + (?:[^/?#]+/)? + (?P<playlist>sendung|serie|sammlung)/ + (?:(?P<display_id>[^?#]+?)/)? 
+ (?P<id>[a-zA-Z0-9]+) + (?:/(?P<season>\d+)(?:/(?P<version>OV|AD))?)?/?(?:[?#]|$)''' + _GEO_COUNTRIES = ['DE'] + + _TESTS = [{ + 'url': 'https://www.ardmediathek.de/serie/quiz/staffel-1-originalversion/Y3JpZDovL3dkci5kZS9vbmUvcXVpeg/1/OV', + 'info_dict': { + 'id': 'Y3JpZDovL3dkci5kZS9vbmUvcXVpeg_1_OV', + 'display_id': 'quiz/staffel-1-originalversion', + 'title': 'Staffel 1 Originalversion', + }, + 'playlist_count': 3, + }, { + 'url': 'https://www.ardmediathek.de/serie/babylon-berlin/staffel-4-mit-audiodeskription/Y3JpZDovL2Rhc2Vyc3RlLmRlL2JhYnlsb24tYmVybGlu/4/AD', + 'info_dict': { + 'id': 'Y3JpZDovL2Rhc2Vyc3RlLmRlL2JhYnlsb24tYmVybGlu_4_AD', + 'display_id': 'babylon-berlin/staffel-4-mit-audiodeskription', + 'title': 'Staffel 4 mit Audiodeskription', + }, + 'playlist_count': 12, + }, { + 'url': 'https://www.ardmediathek.de/serie/babylon-berlin/staffel-1/Y3JpZDovL2Rhc2Vyc3RlLmRlL2JhYnlsb24tYmVybGlu/1/', + 'info_dict': { + 'id': 'Y3JpZDovL2Rhc2Vyc3RlLmRlL2JhYnlsb24tYmVybGlu_1', + 'display_id': 'babylon-berlin/staffel-1', + 'title': 'Staffel 1', + }, + 'playlist_count': 8, + }, { + 'url': 'https://www.ardmediathek.de/sendung/tatort/Y3JpZDovL2Rhc2Vyc3RlLmRlL3RhdG9ydA', + 'info_dict': { + 'id': 'Y3JpZDovL2Rhc2Vyc3RlLmRlL3RhdG9ydA', + 'display_id': 'tatort', + 'title': 'Tatort', + }, + 'playlist_mincount': 500, + }, { + 'url': 'https://www.ardmediathek.de/sammlung/die-kirche-bleibt-im-dorf/5eOHzt8XB2sqeFXbIoJlg2', + 'info_dict': { + 'id': '5eOHzt8XB2sqeFXbIoJlg2', + 'display_id': 'die-kirche-bleibt-im-dorf', + 'title': 'Die Kirche bleibt im Dorf', + 'description': 'Die Kirche bleibt im Dorf', + }, + 'playlist_count': 4, }, { # playlist of type 'sendung' 'url': 'https://www.ardmediathek.de/ard/sendung/doctor-who/Y3JpZDovL3dkci5kZS9vbmUvZG9jdG9yIHdobw/', 'only_matching': True, + }, { + # playlist of type 'serie' + 'url': 'https://www.ardmediathek.de/serie/nachtstreife/staffel-1/Y3JpZDovL3N3ci5kZS9zZGIvc3RJZC8xMjQy/1', + 'only_matching': True, }, { # playlist of type 'sammlung' 'url': 'https://www.ardmediathek.de/ard/sammlung/team-muenster/5JpTzLSbWUAK8184IOvEir/', 'only_matching': True, - }, { - 'url': 'https://www.ardmediathek.de/video/coronavirus-update-ndr-info/astrazeneca-kurz-lockdown-und-pims-syndrom-81/ndr/Y3JpZDovL25kci5kZS84NzE0M2FjNi0wMWEwLTQ5ODEtOTE5NS1mOGZhNzdhOTFmOTI/', - 'only_matching': True, - }, { - 'url': 'https://www.ardmediathek.de/ard/player/Y3JpZDovL3dkci5kZS9CZWl0cmFnLWQ2NDJjYWEzLTMwZWYtNGI4NS1iMTI2LTU1N2UxYTcxOGIzOQ/tatort-duo-koeln-leipzig-ihr-kinderlein-kommet', - 'only_matching': True, }] - def _ARD_load_playlist_snipped(self, playlist_id, display_id, client, mode, pageNumber): - """ Query the ARD server for playlist information - and returns the data in "raw" format """ - if mode == 'sendung': - graphQL = json.dumps({ - 'query': '''{ - showPage( - client: "%s" - showId: "%s" - pageNumber: %d - ) { - pagination { - pageSize - totalElements - } - teasers { # Array - mediumTitle - links { target { id href title } } - type - } - }}''' % (client, playlist_id, pageNumber), - }).encode() - else: # mode == 'sammlung' - graphQL = json.dumps({ - 'query': '''{ - morePage( - client: "%s" - compilationId: "%s" - pageNumber: %d - ) { - widget { - pagination { - pageSize - totalElements - } - teasers { # Array - mediumTitle - links { target { id href title } } - type - } - } - }}''' % (client, playlist_id, pageNumber), - }).encode() - # Ressources for ARD graphQL debugging: - # https://api-test.ardmediathek.de/public-gateway - show_page = self._download_json( - 
'https://api.ardmediathek.de/public-gateway', - '[Playlist] %s' % display_id, - data=graphQL, - headers={'Content-Type': 'application/json'})['data'] - # align the structure of the returned data: - if mode == 'sendung': - show_page = show_page['showPage'] - else: # mode == 'sammlung' - show_page = show_page['morePage']['widget'] - return show_page - - def _ARD_extract_playlist(self, url, playlist_id, display_id, client, mode): - """ Collects all playlist entries and returns them as info dict. - Supports playlists of mode 'sendung' and 'sammlung', and also nested - playlists. """ - entries = [] - pageNumber = 0 - while True: # iterate by pageNumber - show_page = self._ARD_load_playlist_snipped( - playlist_id, display_id, client, mode, pageNumber) - for teaser in show_page['teasers']: # process playlist items - if '/compilation/' in teaser['links']['target']['href']: - # alternativ cond.: teaser['type'] == "compilation" - # => This is an nested compilation, e.g. like: - # https://www.ardmediathek.de/ard/sammlung/die-kirche-bleibt-im-dorf/5eOHzt8XB2sqeFXbIoJlg2/ - link_mode = 'sammlung' - else: - link_mode = 'video' - - item_url = 'https://www.ardmediathek.de/%s/%s/%s/%s/%s' % ( - client, link_mode, display_id, - # perform HTLM quoting of episode title similar to ARD: - re.sub('^-|-$', '', # remove '-' from begin/end - re.sub('[^a-zA-Z0-9]+', '-', # replace special chars by - - teaser['links']['target']['title'].lower() - .replace('ä', 'ae').replace('ö', 'oe') - .replace('ü', 'ue').replace('ß', 'ss'))), - teaser['links']['target']['id']) - entries.append(self.url_result( - item_url, - ie=ARDBetaMediathekIE.ie_key())) - - if (show_page['pagination']['pageSize'] * (pageNumber + 1) - >= show_page['pagination']['totalElements']): - # we've processed enough pages to get all playlist entries - break - pageNumber = pageNumber + 1 - - return self.playlist_result(entries, playlist_id, playlist_title=display_id) + _PAGE_SIZE = 100 def _real_extract(self, url): - video_id, display_id, playlist_type, client, season_number = self._match_valid_url(url).group( - 'id', 'display_id', 'playlist', 'client', 'season') - display_id, client = display_id or video_id, client or 'ard' + playlist_id, display_id, playlist_type, season_number, version = self._match_valid_url(url).group( + 'id', 'display_id', 'playlist', 'season', 'version') - if playlist_type: - # TODO: Extract only specified season - return self._ARD_extract_playlist(url, video_id, display_id, client, playlist_type) + def call_api(page_num): + api_path = 'compilations/ard' if playlist_type == 'sammlung' else 'widgets/ard/asset' + return self._download_json( + f'https://api.ardmediathek.de/page-gateway/{api_path}/{playlist_id}', playlist_id, + f'Downloading playlist page {page_num}', query={ + 'pageNumber': page_num, + 'pageSize': self._PAGE_SIZE, + **({ + 'seasoned': 'true', + 'seasonNumber': season_number, + 'withOriginalversion': 'true' if version == 'OV' else 'false', + 'withAudiodescription': 'true' if version == 'AD' else 'false', + } if season_number else {}), + }) - player_page = self._download_json( - 'https://api.ardmediathek.de/public-gateway', - display_id, data=json.dumps({ - 'query': '''{ - playerPage(client:"%s", clipId: "%s") { - blockedByFsk - broadcastedOn - maturityContentRating - mediaCollection { - _duration - _geoblocked - _isLive - _mediaArray { - _mediaStreamArray { - _quality - _server - _stream - } - } - _previewImage - _subtitleUrl - _type - } - show { - title - } - image { - src - } - synopsis - title - tracking { - 
atiCustomVars { - contentId - } - } - } -}''' % (client, video_id), - }).encode(), headers={ - 'Content-Type': 'application/json' - })['data']['playerPage'] - title = player_page['title'] - content_id = str_or_none(try_get( - player_page, lambda x: x['tracking']['atiCustomVars']['contentId'])) - media_collection = player_page.get('mediaCollection') or {} - if not media_collection and content_id: - media_collection = self._download_json( - 'https://www.ardmediathek.de/play/media/' + content_id, - content_id, fatal=False) or {} - info = self._parse_media_info( - media_collection, content_id or video_id, - player_page.get('blockedByFsk')) - age_limit = None - description = player_page.get('synopsis') - maturity_content_rating = player_page.get('maturityContentRating') - if maturity_content_rating: - age_limit = int_or_none(maturity_content_rating.lstrip('FSK')) - if not age_limit and description: - age_limit = int_or_none(self._search_regex( - r'\(FSK\s*(\d+)\)\s*$', description, 'age limit', default=None)) - info.update({ - 'age_limit': age_limit, - 'display_id': display_id, - 'title': title, - 'description': description, - 'timestamp': unified_timestamp(player_page.get('broadcastedOn')), - 'series': try_get(player_page, lambda x: x['show']['title']), - 'thumbnail': (media_collection.get('_previewImage') - or try_get(player_page, lambda x: update_url(x['image']['src'], query=None, fragment=None)) - or self.get_thumbnail_from_html(display_id, url)), - }) - info.update(self._ARD_extract_episode_info(info['title'])) - return info + def fetch_page(page_num): + for item in traverse_obj(call_api(page_num), ('teasers', ..., {dict})): + item_id = traverse_obj(item, ('links', 'target', ('urlId', 'id')), 'id', get_all=False) + if not item_id or item_id == playlist_id: + continue + item_mode = 'sammlung' if item.get('type') == 'compilation' else 'video' + yield self.url_result( + f'https://www.ardmediathek.de/{item_mode}/{item_id}', + ie=(ARDMediathekCollectionIE if item_mode == 'sammlung' else ARDBetaMediathekIE), + **traverse_obj(item, { + 'id': ('id', {str}), + 'title': ('longTitle', {str}), + 'duration': ('duration', {int_or_none}), + 'timestamp': ('broadcastedOn', {parse_iso8601}), + })) - def get_thumbnail_from_html(self, display_id, url): - webpage = self._download_webpage(url, display_id, fatal=False) or '' - return ( - self._og_search_thumbnail(webpage, default=None) - or self._html_search_meta('thumbnailUrl', webpage, default=None)) + page_data = call_api(0) + full_id = join_nonempty(playlist_id, season_number, version, delim='_') + + return self.playlist_result( + OnDemandPagedList(fetch_page, self._PAGE_SIZE), full_id, display_id=display_id, + title=page_data.get('title'), description=page_data.get('synopsis')) diff --git a/yt_dlp/extractor/arte.py b/yt_dlp/extractor/arte.py index a19cd2a3a..92b4900f9 100644 --- a/yt_dlp/extractor/arte.py +++ b/yt_dlp/extractor/arte.py @@ -48,17 +48,7 @@ class ArteTVIE(ArteTVBaseIE): }, { 'note': 'No alt_title', 'url': 'https://www.arte.tv/fr/videos/110371-000-A/la-chaleur-supplice-des-arbres-de-rue/', - 'info_dict': { - 'id': '110371-000-A', - 'ext': 'mp4', - 'upload_date': '20220718', - 'duration': 154, - 'timestamp': 1658162460, - 'description': 'md5:5890f36fe7dccfadb8b7c0891de54786', - 'title': 'La chaleur, supplice des arbres de rue', - 'thumbnail': 'https://api-cdn.arte.tv/img/v2/image/CPE2sQDtD8GLQgt8DuYHLf/940x530', - }, - 'params': {'skip_download': 'm3u8'} + 'only_matching': True, }, { 'url': 
'https://api.arte.tv/api/player/v2/config/de/100605-013-A', 'only_matching': True, @@ -67,19 +57,37 @@ class ArteTVIE(ArteTVBaseIE): 'only_matching': True, }, { 'url': 'https://www.arte.tv/de/videos/110203-006-A/zaz/', + 'only_matching': True, + }, { + 'note': 'age-restricted', + 'url': 'https://www.arte.tv/de/videos/006785-000-A/the-element-of-crime/', 'info_dict': { - 'id': '110203-006-A', - 'chapters': 'count:16', - 'description': 'md5:cf592f1df52fe52007e3f8eac813c084', - 'alt_title': 'Zaz', - 'title': 'Baloise Session 2022', - 'timestamp': 1668445200, - 'duration': 4054, - 'thumbnail': 'https://api-cdn.arte.tv/img/v2/image/ubQjmVCGyRx3hmBuZEK9QZ/940x530', - 'upload_date': '20221114', + 'id': '006785-000-A', + 'description': 'md5:c2f94fdfefc8a280e4dab68ab96ab0ba', + 'title': 'The Element of Crime', + 'timestamp': 1696111200, + 'duration': 5849, + 'thumbnail': 'https://api-cdn.arte.tv/img/v2/image/q82dTTfyuCXupPsGxXsd7B/940x530', + 'upload_date': '20230930', 'ext': 'mp4', }, - 'expected_warnings': ['geo restricted'] + }, { + 'url': 'https://www.arte.tv/de/videos/085374-003-A/im-hohen-norden-geboren/', + 'info_dict': { + 'id': '085374-003-A', + 'ext': 'mp4', + 'description': 'md5:ab79ec7cc472a93164415b4e4916abf9', + 'timestamp': 1702872000, + 'thumbnail': 'https://api-cdn.arte.tv/img/v2/image/TnyHBfPxv3v2GEY3suXGZP/940x530', + 'duration': 2594, + 'title': 'Die kurze Zeit der Jugend', + 'alt_title': 'Im hohen Norden geboren', + 'upload_date': '20231218', + 'subtitles': { + 'fr': 'mincount:1', + 'fr-acc': 'mincount:1', + }, + }, }] _GEO_BYPASS = True @@ -130,13 +138,25 @@ class ArteTVIE(ArteTVBaseIE): ), } + @staticmethod + def _fix_accessible_subs_locale(subs): + updated_subs = {} + for lang, sub_formats in subs.items(): + for format in sub_formats: + if format.get('url', '').endswith('-MAL.m3u8'): + lang += '-acc' + updated_subs.setdefault(lang, []).append(format) + return updated_subs + def _real_extract(self, url): mobj = self._match_valid_url(url) video_id = mobj.group('id') lang = mobj.group('lang') or mobj.group('lang_2') langauge_code = self._LANG_MAP.get(lang) - config = self._download_json(f'{self._API_BASE}/config/{lang}/{video_id}', video_id) + config = self._download_json(f'{self._API_BASE}/config/{lang}/{video_id}', video_id, headers={ + 'x-validated-age': '18' + }) geoblocking = traverse_obj(config, ('data', 'attributes', 'restriction', 'geoblocking')) or {} if geoblocking.get('restrictedArea'): @@ -181,6 +201,7 @@ def _real_extract(self, url): secondary_formats.extend(fmts) else: formats.extend(fmts) + subs = self._fix_accessible_subs_locale(subs) self._merge_subtitles(subs, target=subtitles) elif stream['protocol'] in ('HTTPS', 'RTMP'): diff --git a/yt_dlp/extractor/asobichannel.py b/yt_dlp/extractor/asobichannel.py new file mode 100644 index 000000000..e3479ede9 --- /dev/null +++ b/yt_dlp/extractor/asobichannel.py @@ -0,0 +1,168 @@ +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + clean_html, + merge_dicts, + parse_iso8601, + url_or_none, +) +from ..utils.traversal import traverse_obj + + +class AsobiChannelBaseIE(InfoExtractor): + _MICROCMS_HEADER = {'X-MICROCMS-API-KEY': 'qRaKehul9AHU8KtL0dnq1OCLKnFec6yrbcz3'} + + def _extract_info(self, metadata): + return traverse_obj(metadata, { + 'id': ('id', {str}), + 'title': ('title', {str}), + 'description': ('body', {clean_html}), + 'thumbnail': ('contents', 'video_thumb', 'url', {url_or_none}), + 'timestamp': ('publishedAt', {parse_iso8601}), + 'modified_timestamp': ('updatedAt', {parse_iso8601}), + 
+            'channel': ('channel', 'name', {str}),
+            'channel_id': ('channel', 'id', {str}),
+        })
+
+
+class AsobiChannelIE(AsobiChannelBaseIE):
+    IE_NAME = 'asobichannel'
+    IE_DESC = 'ASOBI CHANNEL'
+
+    _VALID_URL = r'https?://asobichannel\.asobistore\.jp/watch/(?P<id>[\w-]+)'
+    _TESTS = [{
+        'url': 'https://asobichannel.asobistore.jp/watch/1ypp48qd32p',
+        'md5': '39df74e872afe032c4eb27b89144fc92',
+        'info_dict': {
+            'id': '1ypp48qd32p',
+            'ext': 'mp4',
+            'title': 'アイドルマスター ミリオンライブ! 765プロch 原っぱ通信 #1',
+            'description': 'md5:b930bd2199c9b2fd75951ce4aaa7efd2',
+            'thumbnail': 'https://images.microcms-assets.io/assets/d2420de4b9194e11beb164f99edb1f95/a8e6f84119f54eb9ab4ce16729239905/%E3%82%B5%E3%83%A0%E3%83%8D%20(1).png',
+            'timestamp': 1697098247,
+            'upload_date': '20231012',
+            'modified_timestamp': 1698381162,
+            'modified_date': '20231027',
+            'channel': 'アイドルマスター',
+            'channel_id': 'idolmaster',
+        },
+    }, {
+        'url': 'https://asobichannel.asobistore.jp/watch/redigiwnjzqj',
+        'md5': '229fa8fb5c591c75ce8c37a497f113f6',
+        'info_dict': {
+            'id': 'redigiwnjzqj',
+            'ext': 'mp4',
+            'title': '【おまけ放送】アイドルマスター ミリオンライブ! 765プロch 原っぱ通信 #1',
+            'description': 'md5:7d9cd35fb54425a6967822bd564ea2d9',
+            'thumbnail': 'https://images.microcms-assets.io/assets/d2420de4b9194e11beb164f99edb1f95/20e5c1d6184242eebc2512a5dec59bf0/P1_%E5%8E%9F%E3%81%A3%E3%81%B1%E3%82%B5%E3%83%A0%E3%83%8D.png',
+            'modified_timestamp': 1697797125,
+            'modified_date': '20231020',
+            'timestamp': 1697261769,
+            'upload_date': '20231014',
+            'channel': 'アイドルマスター',
+            'channel_id': 'idolmaster',
+        },
+    }]
+
+    _survapi_header = None
+
+    def _real_initialize(self):
+        token = self._download_json(
+            'https://asobichannel-api.asobistore.jp/api/v1/vspf/token', None,
+            note='Retrieving API token')
+        self._survapi_header = {'Authorization': f'Bearer {token}'}
+
+    def _process_vod(self, video_id, metadata):
+        content_id = metadata['contents']['video_id']
+
+        vod_data = self._download_json(
+            f'https://survapi.channel.or.jp/proxy/v1/contents/{content_id}/get_by_cuid', video_id,
+            headers=self._survapi_header, note='Downloading vod data')
+
+        return {
+            'formats': self._extract_m3u8_formats(vod_data['ex_content']['streaming_url'], video_id),
+        }
+
+    def _process_live(self, video_id, metadata):
+        content_id = metadata['contents']['video_id']
+        event_data = self._download_json(
+            f'https://survapi.channel.or.jp/ex/events/{content_id}?embed=channel', video_id,
+            headers=self._survapi_header, note='Downloading event data')
+
+        player_type = traverse_obj(event_data, ('data', 'Player_type', {str}))
+        if player_type == 'poster':
+            self.raise_no_formats('Live event has not yet started', expected=True)
+            live_status = 'is_upcoming'
+            formats = []
+        elif player_type == 'player':
+            live_status = 'is_live'
+            formats = self._extract_m3u8_formats(
+                event_data['data']['Channel']['Custom_live_url'], video_id, live=True)
+        else:
+            raise ExtractorError(f'Unsupported player type {player_type!r}')
+
+        return {
+            'release_timestamp': traverse_obj(metadata, ('period', 'start', {parse_iso8601})),
+            'live_status': live_status,
+            'formats': formats,
+        }
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        metadata = self._download_json(
+            f'https://channel.microcms.io/api/v1/media/{video_id}', video_id,
+            headers=self._MICROCMS_HEADER)
+
+        info = self._extract_info(metadata)
+
+        video_type = traverse_obj(metadata, ('contents', 'video_type', 0, {str}))
+        if video_type == 'VOD':
+            return merge_dicts(info, self._process_vod(video_id, metadata))
+        if video_type == 'LIVE':
+            return 
merge_dicts(info, self._process_live(video_id, metadata)) + + raise ExtractorError(f'Unexpected video type {video_type!r}') + + +class AsobiChannelTagURLIE(AsobiChannelBaseIE): + IE_NAME = 'asobichannel:tag' + IE_DESC = 'ASOBI CHANNEL' + + _VALID_URL = r'https?://asobichannel\.asobistore\.jp/tag/(?P<id>[a-z0-9-_]+)' + _TESTS = [{ + 'url': 'https://asobichannel.asobistore.jp/tag/bjhh-nbcja', + 'info_dict': { + 'id': 'bjhh-nbcja', + 'title': 'アイドルマスター ミリオンライブ! 765プロch 原っぱ通信', + }, + 'playlist_mincount': 16, + }, { + 'url': 'https://asobichannel.asobistore.jp/tag/hvm5qw3c6od', + 'info_dict': { + 'id': 'hvm5qw3c6od', + 'title': 'アイマスMOIW2023ラジオ', + }, + 'playlist_mincount': 13, + }] + + def _real_extract(self, url): + tag_id = self._match_id(url) + webpage = self._download_webpage(url, tag_id) + title = traverse_obj(self._search_nextjs_data( + webpage, tag_id, fatal=False), ('props', 'pageProps', 'data', 'name', {str})) + + media = self._download_json( + f'https://channel.microcms.io/api/v1/media?limit=999&filters=(tag[contains]{tag_id})', + tag_id, headers=self._MICROCMS_HEADER) + + def entries(): + for metadata in traverse_obj(media, ('contents', lambda _, v: v['id'])): + yield { + '_type': 'url', + 'url': f'https://asobichannel.asobistore.jp/watch/{metadata["id"]}', + 'ie_key': AsobiChannelIE.ie_key(), + **self._extract_info(metadata), + } + + return self.playlist_result(entries(), tag_id, title) diff --git a/yt_dlp/extractor/atttechchannel.py b/yt_dlp/extractor/atttechchannel.py deleted file mode 100644 index 6ff4ec0ad..000000000 --- a/yt_dlp/extractor/atttechchannel.py +++ /dev/null @@ -1,53 +0,0 @@ -from .common import InfoExtractor -from ..utils import unified_strdate - - -class ATTTechChannelIE(InfoExtractor): - _VALID_URL = r'https?://techchannel\.att\.com/play-video\.cfm/([^/]+/)*(?P<id>.+)' - _TEST = { - 'url': 'http://techchannel.att.com/play-video.cfm/2014/1/27/ATT-Archives-The-UNIX-System-Making-Computers-Easier-to-Use', - 'info_dict': { - 'id': '11316', - 'display_id': 'ATT-Archives-The-UNIX-System-Making-Computers-Easier-to-Use', - 'ext': 'flv', - 'title': 'AT&T Archives : The UNIX System: Making Computers Easier to Use', - 'description': 'A 1982 film about UNIX is the foundation for software in use around Bell Labs and AT&T.', - 'thumbnail': r're:^https?://.*\.jpg$', - 'upload_date': '20140127', - }, - 'params': { - # rtmp download - 'skip_download': True, - }, - } - - def _real_extract(self, url): - display_id = self._match_id(url) - - webpage = self._download_webpage(url, display_id) - - video_url = self._search_regex( - r"url\s*:\s*'(rtmp://[^']+)'", - webpage, 'video URL') - - video_id = self._search_regex( - r'mediaid\s*=\s*(\d+)', - webpage, 'video id', fatal=False) - - title = self._og_search_title(webpage) - description = self._og_search_description(webpage) - thumbnail = self._og_search_thumbnail(webpage) - upload_date = unified_strdate(self._search_regex( - r'[Rr]elease\s+date:\s*(\d{1,2}/\d{1,2}/\d{4})', - webpage, 'upload date', fatal=False), False) - - return { - 'id': video_id, - 'display_id': display_id, - 'url': video_url, - 'ext': 'flv', - 'title': title, - 'description': description, - 'thumbnail': thumbnail, - 'upload_date': upload_date, - } diff --git a/yt_dlp/extractor/banbye.py b/yt_dlp/extractor/banbye.py index e0fc93b97..67af29a96 100644 --- a/yt_dlp/extractor/banbye.py +++ b/yt_dlp/extractor/banbye.py @@ -31,7 +31,7 @@ def _extract_playlist(self, playlist_id): class BanByeIE(BanByeBaseIE): - _VALID_URL = 
r'https?://(?:www\.)?banbye.com/(?:en/)?watch/(?P<id>[\w-]+)' + _VALID_URL = r'https?://(?:www\.)?banbye\.com/(?:en/)?watch/(?P<id>[\w-]+)' _TESTS = [{ 'url': 'https://banbye.com/watch/v_ytfmvkVYLE8T', 'md5': '2f4ea15c5ca259a73d909b2cfd558eb5', @@ -120,7 +120,7 @@ def _real_extract(self, url): class BanByeChannelIE(BanByeBaseIE): - _VALID_URL = r'https?://(?:www\.)?banbye.com/(?:en/)?channel/(?P<id>\w+)' + _VALID_URL = r'https?://(?:www\.)?banbye\.com/(?:en/)?channel/(?P<id>\w+)' _TESTS = [{ 'url': 'https://banbye.com/channel/ch_wrealu24', 'info_dict': { @@ -152,7 +152,7 @@ def page_func(page_num): 'sort': 'new', 'limit': self._PAGE_SIZE, 'offset': page_num * self._PAGE_SIZE, - }, note=f'Downloading page {page_num+1}') + }, note=f'Downloading page {page_num + 1}') return [ self.url_result(f"{self._VIDEO_BASE}/{video['_id']}", BanByeIE) for video in data['items'] diff --git a/yt_dlp/extractor/bbc.py b/yt_dlp/extractor/bbc.py index d1d6e04fa..015af9e1d 100644 --- a/yt_dlp/extractor/bbc.py +++ b/yt_dlp/extractor/bbc.py @@ -317,16 +317,25 @@ def _raise_extractor_error(self, media_selection_error): def _download_media_selector(self, programme_id): last_exception = None + formats, subtitles = [], {} for media_set in self._MEDIA_SETS: try: - return self._download_media_selector_url( + fmts, subs = self._download_media_selector_url( self._MEDIA_SELECTOR_URL_TEMPL % (media_set, programme_id), programme_id) + formats.extend(fmts) + if subs: + self._merge_subtitles(subs, target=subtitles) except BBCCoUkIE.MediaSelectionError as e: if e.id in ('notukerror', 'geolocation', 'selectionunavailable'): last_exception = e continue self._raise_extractor_error(e) - self._raise_extractor_error(last_exception) + if last_exception: + if formats or subtitles: + self.report_warning(f'{self.IE_NAME} returned error: {last_exception.id}') + else: + self._raise_extractor_error(last_exception) + return formats, subtitles def _download_media_selector_url(self, url, programme_id=None): media_selection = self._download_json( @@ -1188,7 +1197,7 @@ def _real_extract(self, url): if initial_data is None: initial_data = self._search_regex( r'window\.__INITIAL_DATA__\s*=\s*({.+?})\s*;', webpage, - 'preload state', default={}) + 'preload state', default='{}') else: initial_data = self._parse_json(initial_data or '"{}"', playlist_id, fatal=False) initial_data = self._parse_json(initial_data, playlist_id, fatal=False) diff --git a/yt_dlp/extractor/beatbump.py b/yt_dlp/extractor/beatbump.py index 0f40ebe7a..f48566b2d 100644 --- a/yt_dlp/extractor/beatbump.py +++ b/yt_dlp/extractor/beatbump.py @@ -3,14 +3,13 @@ class BeatBumpVideoIE(InfoExtractor): - _VALID_URL = r'https://beatbump\.ml/listen\?id=(?P<id>[\w-]+)' + _VALID_URL = r'https://beatbump\.(?:ml|io)/listen\?id=(?P<id>[\w-]+)' _TESTS = [{ 'url': 'https://beatbump.ml/listen?id=MgNrAu2pzNs', 'md5': '5ff3fff41d3935b9810a9731e485fe66', 'info_dict': { 'id': 'MgNrAu2pzNs', 'ext': 'mp4', - 'uploader_url': 'http://www.youtube.com/channel/UC-pWHpBjdGG69N9mM2auIAA', 'artist': 'Stephen', 'thumbnail': 'https://i.ytimg.com/vi_webp/MgNrAu2pzNs/maxresdefault.webp', 'channel_url': 'https://www.youtube.com/channel/UC-pWHpBjdGG69N9mM2auIAA', @@ -22,10 +21,9 @@ class BeatBumpVideoIE(InfoExtractor): 'alt_title': 'Voyeur Girl', 'view_count': int, 'track': 'Voyeur Girl', - 'uploader': 'Stephen - Topic', + 'uploader': 'Stephen', 'title': 'Voyeur Girl', 'channel_follower_count': int, - 'uploader_id': 'UC-pWHpBjdGG69N9mM2auIAA', 'age_limit': 0, 'availability': 'public', 'live_status': 'not_live', @@ 
-36,7 +34,12 @@ class BeatBumpVideoIE(InfoExtractor): 'tags': 'count:11', 'creator': 'Stephen', 'channel_id': 'UC-pWHpBjdGG69N9mM2auIAA', - } + 'channel_is_verified': True, + 'heatmap': 'count:100', + }, + }, { + 'url': 'https://beatbump.io/listen?id=LDGZAprNGWo', + 'only_matching': True, }] def _real_extract(self, url): @@ -45,7 +48,7 @@ def _real_extract(self, url): class BeatBumpPlaylistIE(InfoExtractor): - _VALID_URL = r'https://beatbump\.ml/(?:release\?id=|artist/|playlist/)(?P<id>[\w-]+)' + _VALID_URL = r'https://beatbump\.(?:ml|io)/(?:release\?id=|artist/|playlist/)(?P<id>[\w-]+)' _TESTS = [{ 'url': 'https://beatbump.ml/release?id=MPREb_gTAcphH99wE', 'playlist_count': 50, @@ -56,25 +59,28 @@ class BeatBumpPlaylistIE(InfoExtractor): 'title': 'Album - Royalty Free Music Library V2 (50 Songs)', 'description': '', 'tags': [], - 'modified_date': '20221223', - } + 'modified_date': '20231110', + }, + 'expected_warnings': ['YouTube Music is not directly supported'], }, { 'url': 'https://beatbump.ml/artist/UC_aEa8K-EOJ3D6gOs7HcyNg', 'playlist_mincount': 1, 'params': {'flatplaylist': True}, 'info_dict': { 'id': 'UC_aEa8K-EOJ3D6gOs7HcyNg', - 'uploader_url': 'https://www.youtube.com/channel/UC_aEa8K-EOJ3D6gOs7HcyNg', + 'uploader_url': 'https://www.youtube.com/@NoCopyrightSounds', 'channel_url': 'https://www.youtube.com/channel/UC_aEa8K-EOJ3D6gOs7HcyNg', - 'uploader_id': 'UC_aEa8K-EOJ3D6gOs7HcyNg', + 'uploader_id': '@NoCopyrightSounds', 'channel_follower_count': int, - 'title': 'NoCopyrightSounds - Videos', + 'title': 'NoCopyrightSounds', 'uploader': 'NoCopyrightSounds', 'description': 'md5:cd4fd53d81d363d05eee6c1b478b491a', 'channel': 'NoCopyrightSounds', - 'tags': 'count:12', + 'tags': 'count:65', 'channel_id': 'UC_aEa8K-EOJ3D6gOs7HcyNg', + 'channel_is_verified': True, }, + 'expected_warnings': ['YouTube Music is not directly supported'], }, { 'url': 'https://beatbump.ml/playlist/VLPLRBp0Fe2GpgmgoscNFLxNyBVSFVdYmFkq', 'playlist_mincount': 1, @@ -84,16 +90,20 @@ class BeatBumpPlaylistIE(InfoExtractor): 'uploader_url': 'https://www.youtube.com/@NoCopyrightSounds', 'description': 'Providing you with copyright free / safe music for gaming, live streaming, studying and more!', 'view_count': int, - 'channel_url': 'https://www.youtube.com/@NoCopyrightSounds', - 'uploader_id': 'UC_aEa8K-EOJ3D6gOs7HcyNg', + 'channel_url': 'https://www.youtube.com/channel/UC_aEa8K-EOJ3D6gOs7HcyNg', + 'uploader_id': '@NoCopyrightSounds', 'title': 'NCS : All Releases 💿', 'uploader': 'NoCopyrightSounds', 'availability': 'public', 'channel': 'NoCopyrightSounds', 'tags': [], - 'modified_date': '20221225', + 'modified_date': '20231112', 'channel_id': 'UC_aEa8K-EOJ3D6gOs7HcyNg', - } + }, + 'expected_warnings': ['YouTube Music is not directly supported'], + }, { + 'url': 'https://beatbump.io/playlist/VLPLFCHGavqRG-q_2ZhmgU2XB2--ZY6irT1c', + 'only_matching': True, }] def _real_extract(self, url): diff --git a/yt_dlp/extractor/behindkink.py b/yt_dlp/extractor/behindkink.py index ca4498150..9d2324f4f 100644 --- a/yt_dlp/extractor/behindkink.py +++ b/yt_dlp/extractor/behindkink.py @@ -3,6 +3,7 @@ class BehindKinkIE(InfoExtractor): + _WORKING = False _VALID_URL = r'https?://(?:www\.)?behindkink\.com/(?P<year>[0-9]{4})/(?P<month>[0-9]{2})/(?P<day>[0-9]{2})/(?P<id>[^/#?_]+)' _TEST = { 'url': 'http://www.behindkink.com/2014/12/05/what-are-you-passionate-about-marley-blaze/', diff --git a/yt_dlp/extractor/bet.py b/yt_dlp/extractor/bet.py index 6b867d135..cbf3dd082 100644 --- a/yt_dlp/extractor/bet.py +++ b/yt_dlp/extractor/bet.py @@ 
-1,10 +1,9 @@ from .mtv import MTVServicesInfoExtractor from ..utils import unified_strdate -# TODO Remove - Reason: Outdated Site - class BetIE(MTVServicesInfoExtractor): + _WORKING = False _VALID_URL = r'https?://(?:www\.)?bet\.com/(?:[^/]+/)+(?P<id>.+?)\.html' _TESTS = [ { diff --git a/yt_dlp/extractor/bfi.py b/yt_dlp/extractor/bfi.py index 76f0516a4..a6ebfedff 100644 --- a/yt_dlp/extractor/bfi.py +++ b/yt_dlp/extractor/bfi.py @@ -5,6 +5,7 @@ class BFIPlayerIE(InfoExtractor): + _WORKING = False IE_NAME = 'bfi:player' _VALID_URL = r'https?://player\.bfi\.org\.uk/[^/]+/film/watch-(?P<id>[\w-]+)-online' _TEST = { diff --git a/yt_dlp/extractor/bfmtv.py b/yt_dlp/extractor/bfmtv.py index a7be0e67d..5d0c73ff3 100644 --- a/yt_dlp/extractor/bfmtv.py +++ b/yt_dlp/extractor/bfmtv.py @@ -7,7 +7,7 @@ class BFMTVBaseIE(InfoExtractor): _VALID_URL_BASE = r'https?://(?:www\.|rmc\.)?bfmtv\.com/' _VALID_URL_TMPL = _VALID_URL_BASE + r'(?:[^/]+/)*[^/?&#]+_%s[A-Z]-(?P<id>\d{12})\.html' - _VIDEO_BLOCK_REGEX = r'(<div[^>]+class="video_block"[^>]*>)' + _VIDEO_BLOCK_REGEX = r'(<div[^>]+class="video_block[^"]*"[^>]*>)' BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/%s/%s_default/index.html?videoId=%s' def _brightcove_url_result(self, video_id, video_block): @@ -55,8 +55,11 @@ class BFMTVLiveIE(BFMTVIE): # XXX: Do not subclass from concrete IE 'ext': 'mp4', 'title': r're:^le direct BFMTV WEB \d{4}-\d{2}-\d{2} \d{2}:\d{2}$', 'uploader_id': '876450610001', - 'upload_date': '20171018', - 'timestamp': 1508329950, + 'upload_date': '20220926', + 'timestamp': 1664207191, + 'live_status': 'is_live', + 'thumbnail': r're:https://.+/image\.jpg', + 'tags': [], }, 'params': { 'skip_download': True, diff --git a/yt_dlp/extractor/bigo.py b/yt_dlp/extractor/bigo.py index 1cb6e58be..acf78e49a 100644 --- a/yt_dlp/extractor/bigo.py +++ b/yt_dlp/extractor/bigo.py @@ -29,7 +29,8 @@ def _real_extract(self, url): info_raw = self._download_json( 'https://ta.bigo.tv/official_website/studio/getInternalStudioInfo', - user_id, data=urlencode_postdata({'siteId': user_id})) + user_id, data=urlencode_postdata({'siteId': user_id}), + headers={'Accept': 'application/json'}) if not isinstance(info_raw, dict): raise ExtractorError('Received invalid JSON data') diff --git a/yt_dlp/extractor/bilibili.py b/yt_dlp/extractor/bilibili.py index 9119f396b..cd7df69ef 100644 --- a/yt_dlp/extractor/bilibili.py +++ b/yt_dlp/extractor/bilibili.py @@ -2,6 +2,7 @@ import functools import hashlib import itertools +import json import math import re import time @@ -16,9 +17,12 @@ InAdvancePagedList, OnDemandPagedList, bool_or_none, + clean_html, + determine_ext, filter_dict, float_or_none, format_field, + get_element_by_class, int_or_none, join_nonempty, make_archive_id, @@ -88,6 +92,12 @@ def extract_formats(self, play_info): return formats + def _download_playinfo(self, video_id, cid): + return self._download_json( + 'https://api.bilibili.com/x/player/playurl', video_id, + query={'bvid': video_id, 'cid': cid, 'fnval': 4048}, + note=f'Downloading video formats for cid {cid}')['data'] + def json2srt(self, json_data): srt_data = '' for idx, line in enumerate(json_data.get('body') or []): @@ -96,7 +106,7 @@ def json2srt(self, json_data): f'{line["content"]}\n\n') return srt_data - def _get_subtitles(self, video_id, aid, cid): + def _get_subtitles(self, video_id, cid, aid=None): subtitles = { 'danmaku': [{ 'ext': 'xml', @@ -104,8 +114,15 @@ def _get_subtitles(self, video_id, aid, cid): }] } - video_info_json = 
self._download_json(f'https://api.bilibili.com/x/player/v2?aid={aid}&cid={cid}', video_id) - for s in traverse_obj(video_info_json, ('data', 'subtitle', 'subtitles', ...)): + subtitle_info = traverse_obj(self._download_json( + 'https://api.bilibili.com/x/player/v2', video_id, + query={'aid': aid, 'cid': cid} if aid else {'bvid': video_id, 'cid': cid}, + note=f'Extracting subtitle info {cid}'), ('data', 'subtitle')) + subs_list = traverse_obj(subtitle_info, ('subtitles', lambda _, v: v['subtitle_url'] and v['lan'])) + if not subs_list and traverse_obj(subtitle_info, 'allow_submit'): + if not self._get_cookies('https://api.bilibili.com').get('SESSDATA'): # no login session cookie + self.report_warning(f'CC subtitles (if any) are only visible when logged in. {self._login_hint()}', only_once=True) + for s in subs_list: subtitles.setdefault(s['lan'], []).append({ 'ext': 'srt', 'data': self.json2srt(self._download_json(s['subtitle_url'], video_id)) @@ -155,7 +172,54 @@ def _get_episodes_from_season(self, ss_id, url): for entry in traverse_obj(season_info, ( 'result', 'main_section', 'episodes', lambda _, v: url_or_none(v['share_url']) and v['id'])): - yield self.url_result(entry['share_url'], BiliBiliBangumiIE, f'ep{entry["id"]}') + yield self.url_result(entry['share_url'], BiliBiliBangumiIE, str_or_none(entry.get('id'))) + + def _get_divisions(self, video_id, graph_version, edges, edge_id, cid_edges=None): + cid_edges = cid_edges or {} + division_data = self._download_json( + 'https://api.bilibili.com/x/stein/edgeinfo_v2', video_id, + query={'graph_version': graph_version, 'edge_id': edge_id, 'bvid': video_id}, + note=f'Extracting divisions from edge {edge_id}') + edges.setdefault(edge_id, {}).update( + traverse_obj(division_data, ('data', 'story_list', lambda _, v: v['edge_id'] == edge_id, { + 'title': ('title', {str}), + 'cid': ('cid', {int_or_none}), + }), get_all=False)) + + edges[edge_id].update(traverse_obj(division_data, ('data', { + 'title': ('title', {str}), + 'choices': ('edges', 'questions', ..., 'choices', ..., { + 'edge_id': ('id', {int_or_none}), + 'cid': ('cid', {int_or_none}), + 'text': ('option', {str}), + }), + }))) + # use dict to combine edges that use the same video section (same cid) + cid_edges.setdefault(edges[edge_id]['cid'], {})[edge_id] = edges[edge_id] + for choice in traverse_obj(edges, (edge_id, 'choices', ...)): + if choice['edge_id'] not in edges: + edges[choice['edge_id']] = {'cid': choice['cid']} + self._get_divisions(video_id, graph_version, edges, choice['edge_id'], cid_edges=cid_edges) + return cid_edges + + def _get_interactive_entries(self, video_id, cid, metainfo): + graph_version = traverse_obj( + self._download_json( + 'https://api.bilibili.com/x/player/wbi/v2', video_id, + 'Extracting graph version', query={'bvid': video_id, 'cid': cid}), + ('data', 'interaction', 'graph_version', {int_or_none})) + cid_edges = self._get_divisions(video_id, graph_version, {1: {'cid': cid}}, 1) + for cid, edges in cid_edges.items(): + play_info = self._download_playinfo(video_id, cid) + yield { + **metainfo, + 'id': f'{video_id}_{cid}', + 'title': f'{metainfo.get("title")} - {list(edges.values())[0].get("title")}', + 'formats': self.extract_formats(play_info), + 'description': f'{json.dumps(edges, ensure_ascii=False)}\n{metainfo.get("description", "")}', + 'duration': float_or_none(play_info.get('timelength'), scale=1000), + 'subtitles': self.extract_subtitles(video_id, cid), + } class BiliBiliIE(BilibiliBaseIE): @@ -180,7 +244,7 @@ class BiliBiliIE(BilibiliBaseIE): 
'view_count': int, }, }, { - # old av URL version + 'note': 'old av URL version', 'url': 'http://www.bilibili.com/video/av1074402/', 'info_dict': { 'thumbnail': r're:^https?://.*\.(jpg|jpeg)$', @@ -212,7 +276,7 @@ class BiliBiliIE(BilibiliBaseIE): 'id': 'BV1bK411W797_p1', 'ext': 'mp4', 'title': '物语中的人物是如何吐槽自己的OP的 p01 Staple Stable/战场原+羽川', - 'tags': 'count:11', + 'tags': 'count:10', 'timestamp': 1589601697, 'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)$', 'uploader': '打牌还是打桩', @@ -232,7 +296,7 @@ class BiliBiliIE(BilibiliBaseIE): 'id': 'BV1bK411W797_p1', 'ext': 'mp4', 'title': '物语中的人物是如何吐槽自己的OP的 p01 Staple Stable/战场原+羽川', - 'tags': 'count:11', + 'tags': 'count:10', 'timestamp': 1589601697, 'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)$', 'uploader': '打牌还是打桩', @@ -343,18 +407,120 @@ class BiliBiliIE(BilibiliBaseIE): 'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)$', }, 'params': {'skip_download': True}, + }, { + 'note': 'interactive/split-path video', + 'url': 'https://www.bilibili.com/video/BV1af4y1H7ga/', + 'info_dict': { + 'id': 'BV1af4y1H7ga', + 'title': '【互动游戏】花了大半年时间做的自我介绍~请查收!!', + 'timestamp': 1630500414, + 'upload_date': '20210901', + 'description': 'md5:01113e39ab06e28042d74ac356a08786', + 'tags': list, + 'uploader': '钉宫妮妮Ninico', + 'duration': 1503, + 'uploader_id': '8881297', + 'comment_count': int, + 'view_count': int, + 'like_count': int, + 'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)$', + }, + 'playlist_count': 33, + 'playlist': [{ + 'info_dict': { + 'id': 'BV1af4y1H7ga_400950101', + 'ext': 'mp4', + 'title': '【互动游戏】花了大半年时间做的自我介绍~请查收!! - 听见猫猫叫~', + 'timestamp': 1630500414, + 'upload_date': '20210901', + 'description': 'md5:db66ac7a2813a94b8291dbce990cc5b2', + 'tags': list, + 'uploader': '钉宫妮妮Ninico', + 'duration': 11.605, + 'uploader_id': '8881297', + 'comment_count': int, + 'view_count': int, + 'like_count': int, + 'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)$', + }, + }], + }, { + 'note': '301 redirect to bangumi link', + 'url': 'https://www.bilibili.com/video/BV1TE411f7f1', + 'info_dict': { + 'id': '288525', + 'title': '李永乐老师 钱学森弹道和乘波体飞行器是什么?', + 'ext': 'mp4', + 'series': '我和我的祖国', + 'series_id': '4780', + 'season': '幕后纪实', + 'season_id': '28609', + 'season_number': 1, + 'episode': '钱学森弹道和乘波体飞行器是什么?', + 'episode_id': '288525', + 'episode_number': 105, + 'duration': 1183.957, + 'timestamp': 1571648124, + 'upload_date': '20191021', + 'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)$', + }, + }, { + 'url': 'https://www.bilibili.com/video/BV1jL41167ZG/', + 'info_dict': { + 'id': 'BV1jL41167ZG', + 'title': '一场大火引发的离奇死亡!古典推理经典短篇集《不可能犯罪诊断书》!', + 'ext': 'mp4', + }, + 'skip': 'supporter-only video', + }, { + 'url': 'https://www.bilibili.com/video/BV1Ks411f7aQ/', + 'info_dict': { + 'id': 'BV1Ks411f7aQ', + 'title': '【BD1080P】狼与香辛料I【华盟】', + 'ext': 'mp4', + }, + 'skip': 'login required', + }, { + 'url': 'https://www.bilibili.com/video/BV1GJ411x7h7/', + 'info_dict': { + 'id': 'BV1GJ411x7h7', + 'title': '【官方 MV】Never Gonna Give You Up - Rick Astley', + 'ext': 'mp4', + }, + 'skip': 'geo-restricted', }] def _real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) + webpage, urlh = self._download_webpage_handle(url, video_id) + if not self._match_valid_url(urlh.url): + return self.url_result(urlh.url) + initial_state = self._search_json(r'window\.__INITIAL_STATE__\s*=', webpage, 'initial state', video_id) is_festival = 'videoData' not in initial_state if is_festival: video_data = initial_state['videoInfo'] else: - play_info = 
self._search_json(r'window\.__playinfo__\s*=', webpage, 'play info', video_id)['data'] + play_info_obj = self._search_json( + r'window\.__playinfo__\s*=', webpage, 'play info', video_id, fatal=False) + if not play_info_obj: + if traverse_obj(initial_state, ('error', 'trueCode')) == -403: + self.raise_login_required() + if traverse_obj(initial_state, ('error', 'trueCode')) == -404: + raise ExtractorError( + 'This video may be deleted or geo-restricted. ' + 'You might want to try a VPN or a proxy server (with --proxy)', expected=True) + play_info = traverse_obj(play_info_obj, ('data', {dict})) + if not play_info: + if traverse_obj(play_info_obj, 'code') == 87007: + toast = get_element_by_class('tips-toast', webpage) or '' + msg = clean_html( + f'{get_element_by_class("belongs-to", toast) or ""},' + + (get_element_by_class('level', toast) or '')) + raise ExtractorError( + f'This is a supporter-only video: {msg}. {self._login_hint()}', expected=True) + raise ExtractorError('Failed to extract play info') video_data = initial_state['videoData'] video_id, title = video_data['bvid'], video_data.get('title') @@ -385,10 +551,7 @@ def _real_extract(self, url): festival_info = {} if is_festival: - play_info = self._download_json( - 'https://api.bilibili.com/x/player/playurl', video_id, - query={'bvid': video_id, 'cid': cid, 'fnval': 4048}, - note='Extracting festival video formats')['data'] + play_info = self._download_playinfo(video_id, cid) festival_info = traverse_obj(initial_state, { 'uploader': ('videoInfo', 'upName'), @@ -397,7 +560,7 @@ def _real_extract(self, url): 'thumbnail': ('sectionEpisodes', lambda _, v: v['bvid'] == video_id, 'cover'), }, get_all=False) - return { + metainfo = { **traverse_obj(initial_state, { 'uploader': ('upData', 'name'), 'uploader_id': ('upData', 'mid', {str_or_none}), @@ -413,28 +576,59 @@ def _real_extract(self, url): 'comment_count': ('stat', 'reply', {int_or_none}), }, get_all=False), 'id': f'{video_id}{format_field(part_id, None, "_p%d")}', - 'formats': self.extract_formats(play_info), '_old_archive_ids': [make_archive_id(self, old_video_id)] if old_video_id else None, 'title': title, - 'duration': float_or_none(play_info.get('timelength'), scale=1000), - 'chapters': self._get_chapters(aid, cid), - 'subtitles': self.extract_subtitles(video_id, aid, cid), - '__post_extractor': self.extract_comments(aid), 'http_headers': {'Referer': url}, } + is_interactive = traverse_obj(video_data, ('rights', 'is_stein_gate')) + if is_interactive: + return self.playlist_result( + self._get_interactive_entries(video_id, cid, metainfo), **metainfo, **{ + 'duration': traverse_obj(initial_state, ('videoData', 'duration', {int_or_none})), + '__post_extractor': self.extract_comments(aid), + }) + else: + return { + **metainfo, + 'duration': float_or_none(play_info.get('timelength'), scale=1000), + 'chapters': self._get_chapters(aid, cid), + 'subtitles': self.extract_subtitles(video_id, cid), + 'formats': self.extract_formats(play_info), + '__post_extractor': self.extract_comments(aid), + } + class BiliBiliBangumiIE(BilibiliBaseIE): - _VALID_URL = r'https?://(?:www\.)?bilibili\.com/bangumi/play/(?P<id>ep\d+)' + _VALID_URL = r'https?://(?:www\.)?bilibili\.com/bangumi/play/ep(?P<id>\d+)' _TESTS = [{ + 'url': 'https://www.bilibili.com/bangumi/play/ep21495/', + 'info_dict': { + 'id': '21495', + 'ext': 'mp4', + 'series': '悠久之翼', + 'series_id': '774', + 'season': '第二季', + 'season_id': '1182', + 'season_number': 2, + 'episode': 'forever/ef', + 'episode_id': '21495', + 'episode_number': 12, + 
'title': '12 forever/ef', + 'duration': 1420.791, + 'timestamp': 1320412200, + 'upload_date': '20111104', + 'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)$', + }, + }, { 'url': 'https://www.bilibili.com/bangumi/play/ep267851', 'info_dict': { 'id': '267851', 'ext': 'mp4', 'series': '鬼灭之刃', 'series_id': '4358', - 'season': '鬼灭之刃', + 'season': '立志篇', 'season_id': '26801', 'season_number': 1, 'episode': '残酷', @@ -446,13 +640,32 @@ class BiliBiliBangumiIE(BilibiliBaseIE): 'upload_date': '20190406', 'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)$' }, - 'skip': 'According to the copyright owner\'s request, you may only watch the video after you are premium member.' + 'skip': 'Geo-restricted', + }, { + 'note': 'a making-of which falls outside main section', + 'url': 'https://www.bilibili.com/bangumi/play/ep345120', + 'info_dict': { + 'id': '345120', + 'ext': 'mp4', + 'series': '鬼灭之刃', + 'series_id': '4358', + 'season': '立志篇', + 'season_id': '26801', + 'season_number': 1, + 'episode': '炭治郎篇', + 'episode_id': '345120', + 'episode_number': 27, + 'title': '#1 炭治郎篇', + 'duration': 1922.129, + 'timestamp': 1602853860, + 'upload_date': '20201016', + 'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)$' + }, }] def _real_extract(self, url): - video_id = self._match_id(url) - episode_id = video_id[2:] - webpage = self._download_webpage(url, video_id) + episode_id = self._match_id(url) + webpage = self._download_webpage(url, episode_id) if '您所在的地区无法观看本片' in webpage: raise GeoRestrictedError('This video is restricted') @@ -461,7 +674,7 @@ def _real_extract(self, url): headers = {'Referer': url, **self.geo_verification_headers()} play_info = self._download_json( - 'https://api.bilibili.com/pgc/player/web/v2/playurl', video_id, + 'https://api.bilibili.com/pgc/player/web/v2/playurl', episode_id, 'Extracting episode', query={'fnval': '4048', 'ep_id': episode_id}, headers=headers) premium_only = play_info.get('code') == -10403 @@ -472,40 +685,43 @@ def _real_extract(self, url): self.raise_login_required('This video is for premium members only') bangumi_info = self._download_json( - 'https://api.bilibili.com/pgc/view/web/season', video_id, 'Get episode details', + 'https://api.bilibili.com/pgc/view/web/season', episode_id, 'Get episode details', query={'ep_id': episode_id}, headers=headers)['result'] episode_number, episode_info = next(( (idx, ep) for idx, ep in enumerate(traverse_obj( - bangumi_info, ('episodes', ..., {dict})), 1) + bangumi_info, (('episodes', ('section', ..., 'episodes')), ..., {dict})), 1) if str_or_none(ep.get('id')) == episode_id), (1, {})) season_id = bangumi_info.get('season_id') - season_number = season_id and next(( - idx + 1 for idx, e in enumerate( + season_number, season_title = season_id and next(( + (idx + 1, e.get('season_title')) for idx, e in enumerate( traverse_obj(bangumi_info, ('seasons', ...))) if e.get('season_id') == season_id - ), None) + ), (None, None)) aid = episode_info.get('aid') return { - 'id': video_id, + 'id': episode_id, 'formats': formats, **traverse_obj(bangumi_info, { 'series': ('series', 'series_title', {str}), 'series_id': ('series', 'series_id', {str_or_none}), 'thumbnail': ('square_cover', {url_or_none}), }), - 'title': join_nonempty('title', 'long_title', delim=' ', from_dict=episode_info), - 'episode': episode_info.get('long_title'), + **traverse_obj(episode_info, { + 'episode': ('long_title', {str}), + 'episode_number': ('title', {int_or_none}, {lambda x: x or episode_number}), + 'timestamp': ('pub_time', {int_or_none}), + 'title': {lambda v: v and 
join_nonempty('title', 'long_title', delim=' ', from_dict=v)}, + }), 'episode_id': episode_id, - 'episode_number': int_or_none(episode_info.get('title')) or episode_number, + 'season': str_or_none(season_title), 'season_id': str_or_none(season_id), 'season_number': season_number, - 'timestamp': int_or_none(episode_info.get('pub_time')), 'duration': float_or_none(play_info.get('timelength'), scale=1000), - 'subtitles': self.extract_subtitles(video_id, aid, episode_info.get('cid')), + 'subtitles': self.extract_subtitles(episode_id, episode_info.get('cid'), aid=aid), '__post_extractor': self.extract_comments(aid), 'http_headers': headers, } @@ -517,17 +733,53 @@ class BiliBiliBangumiMediaIE(BilibiliBaseIE): 'url': 'https://www.bilibili.com/bangumi/media/md24097891', 'info_dict': { 'id': '24097891', + 'title': 'CAROLE & TUESDAY', + 'description': 'md5:42417ad33d1eaa1c93bfd2dd1626b829', }, 'playlist_mincount': 25, + }, { + 'url': 'https://www.bilibili.com/bangumi/media/md1565/', + 'info_dict': { + 'id': '1565', + 'title': '攻壳机动队 S.A.C. 2nd GIG', + 'description': 'md5:46cac00bafd645b97f4d6df616fc576d', + }, + 'playlist_count': 26, + 'playlist': [{ + 'info_dict': { + 'id': '68540', + 'ext': 'mp4', + 'series': '攻壳机动队', + 'series_id': '1077', + 'season': '第二季', + 'season_id': '1565', + 'season_number': 2, + 'episode': '再启动 REEMBODY', + 'episode_id': '68540', + 'episode_number': 1, + 'title': '1 再启动 REEMBODY', + 'duration': 1525.777, + 'timestamp': 1425074413, + 'upload_date': '20150227', + 'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)$' + }, + }], }] def _real_extract(self, url): media_id = self._match_id(url) webpage = self._download_webpage(url, media_id) - ss_id = self._search_json( - r'window\.__INITIAL_STATE__\s*=', webpage, 'initial_state', media_id)['mediaInfo']['season_id'] - return self.playlist_result(self._get_episodes_from_season(ss_id, url), media_id) + initial_state = self._search_json( + r'window\.__INITIAL_STATE__\s*=', webpage, 'initial_state', media_id) + ss_id = initial_state['mediaInfo']['season_id'] + + return self.playlist_result( + self._get_episodes_from_season(ss_id, url), media_id, + **traverse_obj(initial_state, ('mediaInfo', { + 'title': ('title', {str}), + 'description': ('evaluate', {str}), + }))) class BiliBiliBangumiSeasonIE(BilibiliBaseIE): @@ -535,15 +787,183 @@ class BiliBiliBangumiSeasonIE(BilibiliBaseIE): _TESTS = [{ 'url': 'https://www.bilibili.com/bangumi/play/ss26801', 'info_dict': { - 'id': '26801' + 'id': '26801', + 'title': '鬼灭之刃', + 'description': 'md5:e2cc9848b6f69be6db79fc2a82d9661b', }, 'playlist_mincount': 26 + }, { + 'url': 'https://www.bilibili.com/bangumi/play/ss2251', + 'info_dict': { + 'id': '2251', + 'title': '玲音', + 'description': 'md5:1fd40e3df4c08d4d9d89a6a34844bdc4', + }, + 'playlist_count': 13, + 'playlist': [{ + 'info_dict': { + 'id': '50188', + 'ext': 'mp4', + 'series': '玲音', + 'series_id': '1526', + 'season': 'TV', + 'season_id': '2251', + 'season_number': 1, + 'episode': 'WEIRD', + 'episode_id': '50188', + 'episode_number': 1, + 'title': '1 WEIRD', + 'duration': 1436.992, + 'timestamp': 1343185080, + 'upload_date': '20120725', + 'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)$' + }, + }], }] def _real_extract(self, url): ss_id = self._match_id(url) + webpage = self._download_webpage(url, ss_id) + metainfo = traverse_obj( + self._search_json(r'<script[^>]+type="application/ld\+json"[^>]*>', webpage, 'info', ss_id), + ('itemListElement', ..., { + 'title': ('name', {str}), + 'description': ('description', {str}), + }), get_all=False) - 
return self.playlist_result(self._get_episodes_from_season(ss_id, url), ss_id) + return self.playlist_result(self._get_episodes_from_season(ss_id, url), ss_id, **metainfo) + + +class BilibiliCheeseBaseIE(BilibiliBaseIE): + _HEADERS = {'Referer': 'https://www.bilibili.com/'} + + def _extract_episode(self, season_info, ep_id): + episode_info = traverse_obj(season_info, ( + 'episodes', lambda _, v: v['id'] == int(ep_id)), get_all=False) + aid, cid = episode_info['aid'], episode_info['cid'] + + if traverse_obj(episode_info, 'ep_status') == -1: + raise ExtractorError('This course episode is not yet available.', expected=True) + if not traverse_obj(episode_info, 'playable'): + self.raise_login_required('You need to purchase the course to download this episode') + + play_info = self._download_json( + 'https://api.bilibili.com/pugv/player/web/playurl', ep_id, + query={'avid': aid, 'cid': cid, 'ep_id': ep_id, 'fnval': 16, 'fourk': 1}, + headers=self._HEADERS, note='Downloading playinfo')['data'] + + return { + 'id': str_or_none(ep_id), + 'episode_id': str_or_none(ep_id), + 'formats': self.extract_formats(play_info), + 'extractor_key': BilibiliCheeseIE.ie_key(), + 'extractor': BilibiliCheeseIE.IE_NAME, + 'webpage_url': f'https://www.bilibili.com/cheese/play/ep{ep_id}', + **traverse_obj(episode_info, { + 'episode': ('title', {str}), + 'title': {lambda v: v and join_nonempty('index', 'title', delim=' - ', from_dict=v)}, + 'alt_title': ('subtitle', {str}), + 'duration': ('duration', {int_or_none}), + 'episode_number': ('index', {int_or_none}), + 'thumbnail': ('cover', {url_or_none}), + 'timestamp': ('release_date', {int_or_none}), + 'view_count': ('play', {int_or_none}), + }), + **traverse_obj(season_info, { + 'uploader': ('up_info', 'uname', {str}), + 'uploader_id': ('up_info', 'mid', {str_or_none}), + }), + 'subtitles': self.extract_subtitles(ep_id, cid, aid=aid), + '__post_extractor': self.extract_comments(aid), + 'http_headers': self._HEADERS, + } + + def _download_season_info(self, query_key, video_id): + return self._download_json( + f'https://api.bilibili.com/pugv/view/web/season?{query_key}={video_id}', video_id, + headers=self._HEADERS, note='Downloading season info')['data'] + + +class BilibiliCheeseIE(BilibiliCheeseBaseIE): + _VALID_URL = r'https?://(?:www\.)?bilibili\.com/cheese/play/ep(?P<id>\d+)' + _TESTS = [{ + 'url': 'https://www.bilibili.com/cheese/play/ep229832', + 'info_dict': { + 'id': '229832', + 'ext': 'mp4', + 'title': '1 - 课程先导片', + 'alt_title': '视频课 · 3分41秒', + 'uploader': '马督工', + 'uploader_id': '316568752', + 'episode': '课程先导片', + 'episode_id': '229832', + 'episode_number': 1, + 'duration': 221, + 'timestamp': 1695549606, + 'upload_date': '20230924', + 'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)$', + 'view_count': int, + } + }] + + def _real_extract(self, url): + ep_id = self._match_id(url) + return self._extract_episode(self._download_season_info('ep_id', ep_id), ep_id) + + +class BilibiliCheeseSeasonIE(BilibiliCheeseBaseIE): + _VALID_URL = r'https?://(?:www\.)?bilibili\.com/cheese/play/ss(?P<id>\d+)' + _TESTS = [{ + 'url': 'https://www.bilibili.com/cheese/play/ss5918', + 'info_dict': { + 'id': '5918', + 'title': '【限时五折】新闻系学不到:马督工教你做自媒体', + 'description': '帮普通人建立世界模型,降低人与人的沟通门槛', + }, + 'playlist': [{ + 'info_dict': { + 'id': '229832', + 'ext': 'mp4', + 'title': '1 - 课程先导片', + 'alt_title': '视频课 · 3分41秒', + 'uploader': '马督工', + 'uploader_id': '316568752', + 'episode': '课程先导片', + 'episode_id': '229832', + 'episode_number': 1, + 'duration': 221, + 'timestamp': 1695549606, + 
'upload_date': '20230924', + 'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)$', + 'view_count': int, + } + }], + 'params': {'playlist_items': '1'}, + }, { + 'url': 'https://www.bilibili.com/cheese/play/ss5918', + 'info_dict': { + 'id': '5918', + 'title': '【限时五折】新闻系学不到:马督工教你做自媒体', + 'description': '帮普通人建立世界模型,降低人与人的沟通门槛', + }, + 'playlist_mincount': 5, + 'skip': 'paid video in list', + }] + + def _get_cheese_entries(self, season_info): + for ep_id in traverse_obj(season_info, ('episodes', lambda _, v: v['episode_can_view'], 'id')): + yield self._extract_episode(season_info, ep_id) + + def _real_extract(self, url): + season_id = self._match_id(url) + season_info = self._download_season_info('season_id', season_id) + + return self.playlist_result( + self._get_cheese_entries(season_info), season_id, + **traverse_obj(season_info, { + 'title': ('title', {str}), + 'description': ('subtitle', {str}), + })) class BilibiliSpaceBaseIE(InfoExtractor): @@ -1202,6 +1622,7 @@ def _real_extract(self, url): class BiliIntlBaseIE(InfoExtractor): _API_URL = 'https://api.bilibili.tv/intl/gateway' _NETRC_MACHINE = 'biliintl' + _HEADERS = {'Referer': 'https://www.bilibili.com/'} def _call_api(self, endpoint, *args, **kwargs): json = self._download_json(self._API_URL + endpoint, *args, **kwargs) @@ -1239,19 +1660,34 @@ def _get_subtitles(self, *, ep_id=None, aid=None): 'aid': aid, })) or {} subtitles = {} - for sub in sub_json.get('subtitles') or []: - sub_url = sub.get('url') - if not sub_url: - continue - sub_data = self._download_json( - sub_url, ep_id or aid, errnote='Unable to download subtitles', fatal=False, - note='Downloading subtitles%s' % f' for {sub["lang"]}' if sub.get('lang') else '') - if not sub_data: - continue - subtitles.setdefault(sub.get('lang_key', 'en'), []).append({ - 'ext': 'srt', - 'data': self.json2srt(sub_data) - }) + fetched_urls = set() + for sub in traverse_obj(sub_json, (('subtitles', 'video_subtitle'), ..., {dict})): + for url in traverse_obj(sub, ((None, 'ass', 'srt'), 'url', {url_or_none})): + if url in fetched_urls: + continue + fetched_urls.add(url) + sub_ext = determine_ext(url) + sub_lang = sub.get('lang_key') or 'en' + + if sub_ext == 'ass': + subtitles.setdefault(sub_lang, []).append({ + 'ext': 'ass', + 'url': url, + }) + elif sub_ext == 'json': + sub_data = self._download_json( + url, ep_id or aid, fatal=False, + note=f'Downloading subtitles{format_field(sub, "lang", " for %s")} ({sub_lang})', + errnote='Unable to download subtitles') + + if sub_data: + subtitles.setdefault(sub_lang, []).append({ + 'ext': 'srt', + 'data': self.json2srt(sub_data), + }) + else: + self.report_warning('Unexpected subtitle extension', ep_id or aid) + return subtitles def _get_formats(self, *, ep_id=None, aid=None): @@ -1297,7 +1733,9 @@ def _get_formats(self, *, ep_id=None, aid=None): def _parse_video_metadata(self, video_data): return { 'title': video_data.get('title_display') or video_data.get('title'), + 'description': video_data.get('desc'), 'thumbnail': video_data.get('cover'), + 'timestamp': unified_timestamp(video_data.get('formatted_pub_date')), 'episode_number': int_or_none(self._search_regex( r'^E(\d+)(?:$| - )', video_data.get('title_display') or '', 'episode number', default=None)), } @@ -1394,17 +1832,6 @@ class BiliIntlIE(BiliIntlBaseIE): 'episode_number': 140, }, 'skip': 'According to the copyright owner\'s request, you may only watch the video after you log in.' 
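# A minimal, self-contained sketch of the subtitle handling introduced in
# BiliIntlBaseIE._get_subtitles above: candidate URLs are de-duplicated across
# payload keys, .ass tracks are kept as direct URLs, and JSON tracks are
# fetched and converted to SRT. The helpers fetch_json() and json_to_srt() are
# hypothetical stand-ins for the extractor's own methods, not yt-dlp APIs.
def collect_subtitles(tracks, fetch_json, json_to_srt):
    subtitles, seen = {}, set()
    for track in tracks:
        url = track.get('url')
        if not url or url in seen:
            continue
        seen.add(url)
        lang = track.get('lang_key') or 'en'
        if url.endswith('.ass'):
            # players can consume the ASS file directly, no conversion needed
            subtitles.setdefault(lang, []).append({'ext': 'ass', 'url': url})
        elif url.endswith('.json'):
            data = fetch_json(url)  # may return None on failure
            if data:
                subtitles.setdefault(lang, []).append(
                    {'ext': 'srt', 'data': json_to_srt(data)})
    return subtitles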
- }, { - 'url': 'https://www.bilibili.tv/en/video/2041863208', - 'info_dict': { - 'id': '2041863208', - 'ext': 'mp4', - 'timestamp': 1670874843, - 'description': 'Scheduled for April 2023.\nStudio: ufotable', - 'thumbnail': r're:https?://pic[-\.]bstarstatic.+/ugc/.+\.jpg$', - 'upload_date': '20221212', - 'title': 'Kimetsu no Yaiba Season 3 Official Trailer - Bstation', - }, }, { # episode comment extraction 'url': 'https://www.bilibili.tv/en/play/34580/340317', @@ -1445,9 +1872,9 @@ class BiliIntlIE(BiliIntlBaseIE): 'description': 'md5:693b6f3967fb4e7e7764ea817857c33a', 'timestamp': 1667891924, 'upload_date': '20221108', - 'title': 'That Time I Got Reincarnated as a Slime: Scarlet Bond - Official Trailer 3| AnimeStan - Bstation', + 'title': 'That Time I Got Reincarnated as a Slime: Scarlet Bond - Official Trailer 3| AnimeStan', 'comment_count': int, - 'thumbnail': 'https://pic.bstarstatic.com/ugc/f6c363659efd2eabe5683fbb906b1582.jpg', + 'thumbnail': r're:https://pic\.bstarstatic\.(?:com|net)/ugc/f6c363659efd2eabe5683fbb906b1582\.jpg', }, 'params': { 'getcomments': True @@ -1510,10 +1937,12 @@ def _extract_video_metadata(self, url, video_id, season_id): # XXX: webpage metadata may not accurate, it just used to not crash when video_data not found return merge_dicts( - self._parse_video_metadata(video_data), self._search_json_ld(webpage, video_id, fatal=False), { - 'title': self._html_search_meta('og:title', webpage), - 'description': self._html_search_meta('og:description', webpage) - }) + self._parse_video_metadata(video_data), { + 'title': get_element_by_class( + 'bstar-meta__title', webpage) or self._html_search_meta('og:title', webpage), + 'description': get_element_by_class( + 'bstar-meta__desc', webpage) or self._html_search_meta('og:description'), + }, self._search_json_ld(webpage, video_id, default={})) def _get_comments_reply(self, root_id, next_id=0, display_id=None): comment_api_raw_data = self._download_json( @@ -1601,7 +2030,8 @@ def _real_extract(self, url): 'formats': self._get_formats(ep_id=ep_id, aid=aid), 'subtitles': self.extract_subtitles(ep_id=ep_id, aid=aid), 'chapters': chapters, - '__post_extractor': self.extract_comments(video_id, ep_id) + '__post_extractor': self.extract_comments(video_id, ep_id), + 'http_headers': self._HEADERS, } diff --git a/yt_dlp/extractor/biqle.py b/yt_dlp/extractor/biqle.py deleted file mode 100644 index 027753503..000000000 --- a/yt_dlp/extractor/biqle.py +++ /dev/null @@ -1,110 +0,0 @@ -from .common import InfoExtractor -from .vk import VKIE -from ..compat import compat_b64decode -from ..utils import ( - int_or_none, - js_to_json, - traverse_obj, - unified_timestamp, -) - - -class BIQLEIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?biqle\.(?:com|org|ru)/watch/(?P<id>-?\d+_\d+)' - _TESTS = [{ - 'url': 'https://biqle.ru/watch/-2000421746_85421746', - 'md5': 'ae6ef4f04d19ac84e4658046d02c151c', - 'info_dict': { - 'id': '-2000421746_85421746', - 'ext': 'mp4', - 'title': 'Forsaken By Hope Studio Clip', - 'description': 'Forsaken By Hope Studio Clip — Смотреть онлайн', - 'upload_date': '19700101', - 'thumbnail': r're:https://[^/]+/impf/7vN3ACwSTgChP96OdOfzFjUCzFR6ZglDQgWsIw/KPaACiVJJxM\.jpg\?size=800x450&quality=96&keep_aspect_ratio=1&background=000000&sign=b48ea459c4d33dbcba5e26d63574b1cb&type=video_thumb', - 'timestamp': 0, - }, - }, { - 'url': 'http://biqle.org/watch/-44781847_168547604', - 'md5': '7f24e72af1db0edf7c1aaba513174f97', - 'info_dict': { - 'id': '-44781847_168547604', - 'ext': 'mp4', - 'title': 'Ребенок в шоке от 
автоматической мойки', - 'description': 'Ребенок в шоке от автоматической мойки — Смотреть онлайн', - 'timestamp': 1396633454, - 'upload_date': '20140404', - 'thumbnail': r're:https://[^/]+/c535507/u190034692/video/l_b84df002\.jpg', - }, - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - - title = self._html_search_meta('name', webpage, 'Title', fatal=False) - timestamp = unified_timestamp(self._html_search_meta('uploadDate', webpage, 'Upload Date', default=None)) - description = self._html_search_meta('description', webpage, 'Description', default=None) - - global_embed_url = self._search_regex( - r'<script[^<]+?window.globEmbedUrl\s*=\s*\'((?:https?:)?//(?:daxab\.com|dxb\.to|[^/]+/player)/[^\']+)\'', - webpage, 'global Embed url') - hash = self._search_regex( - r'<script id="data-embed-video[^<]+?hash: "([^"]+)"[^<]*</script>', webpage, 'Hash') - - embed_url = global_embed_url + hash - - if VKIE.suitable(embed_url): - return self.url_result(embed_url, VKIE.ie_key(), video_id) - - embed_page = self._download_webpage( - embed_url, video_id, 'Downloading embed webpage', headers={'Referer': url}) - - glob_params = self._parse_json(self._search_regex( - r'<script id="globParams">[^<]*window.globParams = ([^;]+);[^<]+</script>', - embed_page, 'Global Parameters'), video_id, transform_source=js_to_json) - host_name = compat_b64decode(glob_params['server'][::-1]).decode() - - item = self._download_json( - f'https://{host_name}/method/video.get/{video_id}', video_id, - headers={'Referer': url}, query={ - 'token': glob_params['video']['access_token'], - 'videos': video_id, - 'ckey': glob_params['c_key'], - 'credentials': glob_params['video']['credentials'], - })['response']['items'][0] - - formats = [] - for f_id, f_url in item.get('files', {}).items(): - if f_id == 'external': - return self.url_result(f_url) - ext, height = f_id.split('_') - height_extra_key = traverse_obj(glob_params, ('video', 'partial', 'quality', height)) - if height_extra_key: - formats.append({ - 'format_id': f'{height}p', - 'url': f'https://{host_name}/{f_url[8:]}&videos={video_id}&extra_key={height_extra_key}', - 'height': int_or_none(height), - 'ext': ext, - }) - - thumbnails = [] - for k, v in item.items(): - if k.startswith('photo_') and v: - width = k.replace('photo_', '') - thumbnails.append({ - 'id': width, - 'url': v, - 'width': int_or_none(width), - }) - - return { - 'id': video_id, - 'title': title, - 'formats': formats, - 'comment_count': int_or_none(item.get('comments')), - 'description': description, - 'duration': int_or_none(item.get('duration')), - 'thumbnails': thumbnails, - 'timestamp': timestamp, - 'view_count': int_or_none(item.get('views')), - } diff --git a/yt_dlp/extractor/bitchute.py b/yt_dlp/extractor/bitchute.py index 0805b8b46..41367c5b9 100644 --- a/yt_dlp/extractor/bitchute.py +++ b/yt_dlp/extractor/bitchute.py @@ -7,8 +7,10 @@ ExtractorError, OnDemandPagedList, clean_html, + extract_attributes, get_element_by_class, get_element_by_id, + get_element_html_by_class, get_elements_html_by_class, int_or_none, orderedSet, @@ -17,6 +19,7 @@ traverse_obj, unified_strdate, urlencode_postdata, + urljoin, ) @@ -34,6 +37,25 @@ class BitChuteIE(InfoExtractor): 'thumbnail': r're:^https?://.*\.jpg$', 'uploader': 'BitChute', 'upload_date': '20170103', + 'uploader_url': 'https://www.bitchute.com/profile/I5NgtHZn9vPj/', + 'channel': 'BitChute', + 'channel_url': 'https://www.bitchute.com/channel/bitchute/' + }, + }, { + # test case: video 
with different channel and uploader + 'url': 'https://www.bitchute.com/video/Yti_j9A-UZ4/', + 'md5': 'f10e6a8e787766235946d0868703f1d0', + 'info_dict': { + 'id': 'Yti_j9A-UZ4', + 'ext': 'mp4', + 'title': 'Israel at War | Full Measure', + 'description': 'md5:38cf7bc6f42da1a877835539111c69ef', + 'thumbnail': r're:^https?://.*\.jpg$', + 'uploader': 'sharylattkisson', + 'upload_date': '20231106', + 'uploader_url': 'https://www.bitchute.com/profile/9K0kUWA9zmd9/', + 'channel': 'Full Measure with Sharyl Attkisson', + 'channel_url': 'https://www.bitchute.com/channel/sharylattkisson/' }, }, { # video not downloadable in browser, but we can recover it @@ -48,6 +70,9 @@ class BitChuteIE(InfoExtractor): 'thumbnail': r're:^https?://.*\.jpg$', 'uploader': 'BitChute', 'upload_date': '20181113', + 'uploader_url': 'https://www.bitchute.com/profile/I5NgtHZn9vPj/', + 'channel': 'BitChute', + 'channel_url': 'https://www.bitchute.com/channel/bitchute/' }, 'params': {'check_formats': None}, }, { @@ -99,6 +124,11 @@ def _raise_if_restricted(self, webpage): reason = clean_html(get_element_by_id('page-detail', webpage)) or page_title self.raise_geo_restricted(reason) + @staticmethod + def _make_url(html): + path = extract_attributes(get_element_html_by_class('spa', html) or '').get('href') + return urljoin('https://www.bitchute.com', path) + def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage( @@ -121,12 +151,19 @@ def _real_extract(self, url): 'Video is unavailable. Please make sure this video is playable in the browser ' 'before reporting this issue.', expected=True, video_id=video_id) + details = get_element_by_class('details', webpage) or '' + uploader_html = get_element_html_by_class('creator', details) or '' + channel_html = get_element_html_by_class('name', details) or '' + return { 'id': video_id, 'title': self._html_extract_title(webpage) or self._og_search_title(webpage), 'description': self._og_search_description(webpage, default=None), 'thumbnail': self._og_search_thumbnail(webpage), - 'uploader': clean_html(get_element_by_class('owner', webpage)), + 'uploader': clean_html(uploader_html), + 'uploader_url': self._make_url(uploader_html), + 'channel': clean_html(channel_html), + 'channel_url': self._make_url(channel_html), 'upload_date': unified_strdate(self._search_regex( r'at \d+:\d+ UTC on (.+?)\.', publish_date, 'upload date', fatal=False)), 'formats': formats, @@ -154,6 +191,9 @@ class BitChuteChannelIE(InfoExtractor): 'thumbnail': r're:^https?://.*\.jpg$', 'uploader': 'BitChute', 'upload_date': '20170103', + 'uploader_url': 'https://www.bitchute.com/profile/I5NgtHZn9vPj/', + 'channel': 'BitChute', + 'channel_url': 'https://www.bitchute.com/channel/bitchute/', 'duration': 16, 'view_count': int, }, @@ -169,7 +209,7 @@ class BitChuteChannelIE(InfoExtractor): 'info_dict': { 'id': 'wV9Imujxasw9', 'title': 'Bruce MacDonald and "The Light of Darkness"', - 'description': 'md5:04913227d2714af1d36d804aa2ab6b1e', + 'description': 'md5:747724ef404eebdfc04277714f81863e', } }] diff --git a/yt_dlp/extractor/bitwave.py b/yt_dlp/extractor/bitwave.py deleted file mode 100644 index a82cd263a..000000000 --- a/yt_dlp/extractor/bitwave.py +++ /dev/null @@ -1,58 +0,0 @@ -from .common import InfoExtractor - - -class BitwaveReplayIE(InfoExtractor): - IE_NAME = 'bitwave:replay' - _VALID_URL = r'https?://(?:www\.)?bitwave\.tv/(?P<user>\w+)/replay/(?P<id>\w+)/?$' - _TEST = { - 'url': 'https://bitwave.tv/RhythmicCarnage/replay/z4P6eq5L7WDrM85UCrVr', - 'only_matching': True - } - - def 
_real_extract(self, url): - replay_id = self._match_id(url) - replay = self._download_json( - 'https://api.bitwave.tv/v1/replays/' + replay_id, - replay_id - ) - - return { - 'id': replay_id, - 'title': replay['data']['title'], - 'uploader': replay['data']['name'], - 'uploader_id': replay['data']['name'], - 'url': replay['data']['url'], - 'thumbnails': [ - {'url': x} for x in replay['data']['thumbnails'] - ], - } - - -class BitwaveStreamIE(InfoExtractor): - IE_NAME = 'bitwave:stream' - _VALID_URL = r'https?://(?:www\.)?bitwave\.tv/(?P<id>\w+)/?$' - _TEST = { - 'url': 'https://bitwave.tv/doomtube', - 'only_matching': True - } - - def _real_extract(self, url): - username = self._match_id(url) - channel = self._download_json( - 'https://api.bitwave.tv/v1/channels/' + username, - username) - - formats = self._extract_m3u8_formats( - channel['data']['url'], username, - 'mp4') - - return { - 'id': username, - 'title': channel['data']['title'], - 'uploader': username, - 'uploader_id': username, - 'formats': formats, - 'thumbnail': channel['data']['thumbnail'], - 'is_live': True, - 'view_count': channel['data']['viewCount'] - } diff --git a/yt_dlp/extractor/bleacherreport.py b/yt_dlp/extractor/bleacherreport.py index 8d8fabe33..5e5155af2 100644 --- a/yt_dlp/extractor/bleacherreport.py +++ b/yt_dlp/extractor/bleacherreport.py @@ -22,7 +22,7 @@ class BleacherReportIE(InfoExtractor): 'upload_date': '20150615', 'uploader': 'Team Stream Now ', }, - 'add_ie': ['Ooyala'], + 'skip': 'Video removed', }, { 'url': 'http://bleacherreport.com/articles/2586817-aussie-golfers-get-fright-of-their-lives-after-being-chased-by-angry-kangaroo', 'md5': '6a5cd403418c7b01719248ca97fb0692', @@ -70,8 +70,6 @@ def _real_extract(self, url): video_type = video['type'] if video_type in ('cms.bleacherreport.com', 'vid.bleacherreport.com'): info['url'] = 'http://bleacherreport.com/video_embed?id=%s' % video['id'] - elif video_type == 'ooyala.com': - info['url'] = 'ooyala:%s' % video['id'] elif video_type == 'youtube.com': info['url'] = video['id'] elif video_type == 'vine.co': diff --git a/yt_dlp/extractor/box.py b/yt_dlp/extractor/box.py index 8ab149626..7281b3c6a 100644 --- a/yt_dlp/extractor/box.py +++ b/yt_dlp/extractor/box.py @@ -1,16 +1,17 @@ import json +import urllib.parse from .common import InfoExtractor from ..utils import ( - determine_ext, parse_iso8601, - # try_get, update_url_query, + url_or_none, ) +from ..utils.traversal import traverse_obj class BoxIE(InfoExtractor): - _VALID_URL = r'https?://(?:[^.]+\.)?app\.box\.com/s/(?P<shared_name>[^/]+)/file/(?P<id>\d+)' + _VALID_URL = r'https?://(?:[^.]+\.)?app\.box\.com/s/(?P<shared_name>[^/?#]+)/file/(?P<id>\d+)' _TEST = { 'url': 'https://mlssoccer.app.box.com/s/0evd2o3e08l60lr4ygukepvnkord1o1x/file/510727257538', 'md5': '1f81b2fd3960f38a40a3b8823e5fcd43', @@ -18,11 +19,12 @@ class BoxIE(InfoExtractor): 'id': '510727257538', 'ext': 'mp4', 'title': 'Garber St. 
Louis will be 28th MLS team +scarving.mp4', - 'uploader': 'MLS Video', + 'uploader': '', 'timestamp': 1566320259, 'upload_date': '20190820', 'uploader_id': '235196876', - } + }, + 'params': {'skip_download': 'dash fragment too small'}, } def _real_extract(self, url): @@ -58,26 +60,15 @@ def _real_extract(self, url): formats = [] - # for entry in (try_get(f, lambda x: x['representations']['entries'], list) or []): - # entry_url_template = try_get( - # entry, lambda x: x['content']['url_template']) - # if not entry_url_template: - # continue - # representation = entry.get('representation') - # if representation == 'dash': - # TODO: append query to every fragment URL - # formats.extend(self._extract_mpd_formats( - # entry_url_template.replace('{+asset_path}', 'manifest.mpd'), - # file_id, query=query)) - - authenticated_download_url = f.get('authenticated_download_url') - if authenticated_download_url and f.get('is_download_available'): - formats.append({ - 'ext': f.get('extension') or determine_ext(title), - 'filesize': f.get('size'), - 'format_id': 'download', - 'url': update_url_query(authenticated_download_url, query), - }) + for url_tmpl in traverse_obj(f, ( + 'representations', 'entries', lambda _, v: v['representation'] == 'dash', + 'content', 'url_template', {url_or_none} + )): + manifest_url = update_url_query(url_tmpl.replace('{+asset_path}', 'manifest.mpd'), query) + fmts = self._extract_mpd_formats(manifest_url, file_id) + for fmt in fmts: + fmt['extra_param_to_segment_url'] = urllib.parse.urlparse(manifest_url).query + formats.extend(fmts) creator = f.get('created_by') or {} diff --git a/yt_dlp/extractor/br.py b/yt_dlp/extractor/br.py index 309452d23..6e1c63e2b 100644 --- a/yt_dlp/extractor/br.py +++ b/yt_dlp/extractor/br.py @@ -1,18 +1,15 @@ -import json - from .common import InfoExtractor from ..utils import ( - determine_ext, ExtractorError, int_or_none, parse_duration, - parse_iso8601, xpath_element, xpath_text, ) class BRIE(InfoExtractor): + _WORKING = False IE_DESC = 'Bayerischer Rundfunk' _VALID_URL = r'(?P<base_url>https?://(?:www\.)?br(?:-klassik)?\.de)/(?:[a-z0-9\-_]+/)+(?P<id>[a-z0-9\-_]+)\.html' @@ -167,142 +164,3 @@ def _extract_thumbnails(self, variants, base_url): } for variant in variants.findall('variant') if xpath_text(variant, 'url')] thumbnails.sort(key=lambda x: x['width'] * x['height'], reverse=True) return thumbnails - - -class BRMediathekIE(InfoExtractor): - IE_DESC = 'Bayerischer Rundfunk Mediathek' - _VALID_URL = r'https?://(?:www\.)?br\.de/mediathek//?video/(?:[^/?&#]+?-)?(?P<id>av:[0-9a-f]{24})' - - _TESTS = [{ - 'url': 'https://www.br.de/mediathek/video/gesundheit-die-sendung-vom-28112017-av:5a1e6a6e8fce6d001871cc8e', - 'md5': 'fdc3d485835966d1622587d08ba632ec', - 'info_dict': { - 'id': 'av:5a1e6a6e8fce6d001871cc8e', - 'ext': 'mp4', - 'title': 'Die Sendung vom 28.11.2017', - 'description': 'md5:6000cdca5912ab2277e5b7339f201ccc', - 'timestamp': 1511942766, - 'upload_date': '20171129', - } - }, { - 'url': 'https://www.br.de/mediathek//video/av:61b0db581aed360007558c12', - 'only_matching': True, - }] - - def _real_extract(self, url): - clip_id = self._match_id(url) - - clip = self._download_json( - 'https://proxy-base.master.mango.express/graphql', - clip_id, data=json.dumps({ - "query": """{ - viewer { - clip(id: "%s") { - title - description - duration - createdAt - ageRestriction - videoFiles { - edges { - node { - publicLocation - fileSize - videoProfile { - width - height - bitrate - encoding - } - } - } - } - captionFiles { - edges { - node { - 
publicLocation - } - } - } - teaserImages { - edges { - node { - imageFiles { - edges { - node { - publicLocation - width - height - } - } - } - } - } - } - } - } -}""" % clip_id}).encode(), headers={ - 'Content-Type': 'application/json', - })['data']['viewer']['clip'] - title = clip['title'] - - formats = [] - for edge in clip.get('videoFiles', {}).get('edges', []): - node = edge.get('node', {}) - n_url = node.get('publicLocation') - if not n_url: - continue - ext = determine_ext(n_url) - if ext == 'm3u8': - formats.extend(self._extract_m3u8_formats( - n_url, clip_id, 'mp4', 'm3u8_native', - m3u8_id='hls', fatal=False)) - else: - video_profile = node.get('videoProfile', {}) - tbr = int_or_none(video_profile.get('bitrate')) - format_id = 'http' - if tbr: - format_id += '-%d' % tbr - formats.append({ - 'format_id': format_id, - 'url': n_url, - 'width': int_or_none(video_profile.get('width')), - 'height': int_or_none(video_profile.get('height')), - 'tbr': tbr, - 'filesize': int_or_none(node.get('fileSize')), - }) - - subtitles = {} - for edge in clip.get('captionFiles', {}).get('edges', []): - node = edge.get('node', {}) - n_url = node.get('publicLocation') - if not n_url: - continue - subtitles.setdefault('de', []).append({ - 'url': n_url, - }) - - thumbnails = [] - for edge in clip.get('teaserImages', {}).get('edges', []): - for image_edge in edge.get('node', {}).get('imageFiles', {}).get('edges', []): - node = image_edge.get('node', {}) - n_url = node.get('publicLocation') - if not n_url: - continue - thumbnails.append({ - 'url': n_url, - 'width': int_or_none(node.get('width')), - 'height': int_or_none(node.get('height')), - }) - - return { - 'id': clip_id, - 'title': title, - 'description': clip.get('description'), - 'duration': int_or_none(clip.get('duration')), - 'timestamp': parse_iso8601(clip.get('createdAt')), - 'age_limit': int_or_none(clip.get('ageRestriction')), - 'formats': formats, - 'subtitles': subtitles, - 'thumbnails': thumbnails, - } diff --git a/yt_dlp/extractor/breakcom.py b/yt_dlp/extractor/breakcom.py deleted file mode 100644 index 00cf308c7..000000000 --- a/yt_dlp/extractor/breakcom.py +++ /dev/null @@ -1,86 +0,0 @@ -from .common import InfoExtractor -from .youtube import YoutubeIE -from ..utils import ( - int_or_none, - url_or_none, -) - - -class BreakIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?break\.com/video/(?P<display_id>[^/]+?)(?:-(?P<id>\d+))?(?:[/?#&]|$)' - _TESTS = [{ - 'url': 'http://www.break.com/video/when-girls-act-like-guys-2468056', - 'info_dict': { - 'id': '2468056', - 'ext': 'mp4', - 'title': 'When Girls Act Like D-Bags', - 'age_limit': 13, - }, - }, { - # youtube embed - 'url': 'http://www.break.com/video/someone-forgot-boat-brakes-work', - 'info_dict': { - 'id': 'RrrDLdeL2HQ', - 'ext': 'mp4', - 'title': 'Whale Watching Boat Crashing Into San Diego Dock', - 'description': 'md5:afc1b2772f0a8468be51dd80eb021069', - 'upload_date': '20160331', - 'uploader': 'Steve Holden', - 'uploader_id': 'sdholden07', - }, - 'params': { - 'skip_download': True, - } - }, { - 'url': 'http://www.break.com/video/ugc/baby-flex-2773063', - 'only_matching': True, - }] - - def _real_extract(self, url): - display_id, video_id = self._match_valid_url(url).groups() - - webpage = self._download_webpage(url, display_id) - - youtube_url = YoutubeIE._extract_url(webpage) - if youtube_url: - return self.url_result(youtube_url, ie=YoutubeIE.ie_key()) - - content = self._parse_json( - self._search_regex( - r'(?s)content["\']\s*:\s*(\[.+?\])\s*[,\n]', webpage, - 'content'), - 
display_id) - - formats = [] - for video in content: - video_url = url_or_none(video.get('url')) - if not video_url: - continue - bitrate = int_or_none(self._search_regex( - r'(\d+)_kbps', video_url, 'tbr', default=None)) - formats.append({ - 'url': video_url, - 'format_id': 'http-%d' % bitrate if bitrate else 'http', - 'tbr': bitrate, - }) - - title = self._search_regex( - (r'title["\']\s*:\s*(["\'])(?P<value>(?:(?!\1).)+)\1', - r'<h1[^>]*>(?P<value>[^<]+)'), webpage, 'title', group='value') - - def get(key, name): - return int_or_none(self._search_regex( - r'%s["\']\s*:\s*["\'](\d+)' % key, webpage, name, - default=None)) - - age_limit = get('ratings', 'age limit') - video_id = video_id or get('pid', 'video id') or display_id - - return { - 'id': video_id, - 'display_id': display_id, - 'title': title, - 'thumbnail': self._og_search_thumbnail(webpage), - 'age_limit': age_limit, - 'formats': formats, - } diff --git a/yt_dlp/extractor/breitbart.py b/yt_dlp/extractor/breitbart.py index ea0a59c86..b5abb7f19 100644 --- a/yt_dlp/extractor/breitbart.py +++ b/yt_dlp/extractor/breitbart.py @@ -2,7 +2,7 @@ class BreitBartIE(InfoExtractor): - _VALID_URL = r'https?:\/\/(?:www\.)breitbart.com/videos/v/(?P<id>[^/]+)' + _VALID_URL = r'https?://(?:www\.)?breitbart\.com/videos/v/(?P<id>[^/?#]+)' _TESTS = [{ 'url': 'https://www.breitbart.com/videos/v/5cOz1yup/?pl=Ij6NDOji', 'md5': '0aa6d1d6e183ac5ca09207fe49f17ade', diff --git a/yt_dlp/extractor/brilliantpala.py b/yt_dlp/extractor/brilliantpala.py index 6fd5b8148..0bf8622c1 100644 --- a/yt_dlp/extractor/brilliantpala.py +++ b/yt_dlp/extractor/brilliantpala.py @@ -21,10 +21,10 @@ def _initialize_pre_login(self): def _get_logged_in_username(self, url, video_id): webpage, urlh = self._download_webpage_handle(url, video_id) - if self._LOGIN_API == urlh.url: + if urlh.url.startswith(self._LOGIN_API): self.raise_login_required() return self._html_search_regex( - r'"username"\s*:\s*"(?P<username>[^"]+)"', webpage, 'stream page info', 'username') + r'"username"\s*:\s*"(?P<username>[^"]+)"', webpage, 'logged-in username') def _perform_login(self, username, password): login_form = self._hidden_inputs(self._download_webpage( diff --git a/yt_dlp/extractor/bundestag.py b/yt_dlp/extractor/bundestag.py new file mode 100644 index 000000000..9fd7c7de1 --- /dev/null +++ b/yt_dlp/extractor/bundestag.py @@ -0,0 +1,123 @@ +import re +from functools import partial + +from .common import InfoExtractor +from ..networking.exceptions import HTTPError +from ..utils import ( + ExtractorError, + bug_reports_message, + clean_html, + format_field, + get_element_text_and_html_by_tag, + int_or_none, + url_or_none, +) +from ..utils.traversal import traverse_obj + + +class BundestagIE(InfoExtractor): + _VALID_URL = [ + r'https?://dbtg\.tv/[cf]vid/(?P<id>\d+)', + r'https?://www\.bundestag\.de/mediathek/?\?(?:[^#]+&)?videoid=(?P<id>\d+)', + ] + _TESTS = [{ + 'url': 'https://dbtg.tv/cvid/7605304', + 'info_dict': { + 'id': '7605304', + 'ext': 'mp4', + 'title': '145. Sitzung vom 15.12.2023, TOP 24 Barrierefreiheit', + 'description': 'md5:321a9dc6bdad201264c0045efc371561', + }, + }, { + 'url': 'https://www.bundestag.de/mediathek?videoid=7602120&url=L21lZGlhdGhla292ZXJsYXk=&mod=mediathek', + 'info_dict': { + 'id': '7602120', + 'ext': 'mp4', + 'title': '130. 
Sitzung vom 18.10.2023, TOP 1 Befragung der Bundesregierung', + 'description': 'Befragung der Bundesregierung', + }, + }, { + 'url': 'https://www.bundestag.de/mediathek?videoid=7604941#url=L21lZGlhdGhla292ZXJsYXk/dmlkZW9pZD03NjA0OTQx&mod=mediathek', + 'only_matching': True, + }, { + 'url': 'http://dbtg.tv/fvid/3594346', + 'only_matching': True, + }] + + _OVERLAY_URL = 'https://www.bundestag.de/mediathekoverlay' + _INSTANCE_FORMAT = 'https://cldf-wzw-od.r53.cdn.tv1.eu/13014bundestagod/_definst_/13014bundestag/ondemand/3777parlamentsfernsehen/archiv/app144277506/145293313/{0}/{0}_playlist.smil/playlist.m3u8' + + _SHARE_URL = 'https://webtv.bundestag.de/player/macros/_x_s-144277506/shareData.json?contentId=' + _SHARE_AUDIO_REGEX = r'/\d+_(?P<codec>\w+)_(?P<bitrate>\d+)kb_(?P<channels>\w+)_\w+_\d+\.(?P<ext>\w+)' + _SHARE_VIDEO_REGEX = r'/\d+_(?P<codec>\w+)_(?P<width>\w+)_(?P<height>\w+)_(?P<bitrate>\d+)kb_\w+_\w+_\d+\.(?P<ext>\w+)' + + def _bt_extract_share_formats(self, video_id): + share_data = self._download_json( + f'{self._SHARE_URL}{video_id}', video_id, note='Downloading share format JSON') + if traverse_obj(share_data, ('status', 'code', {int})) != 1: + self.report_warning(format_field( + share_data, [('status', 'message', {str})], + 'Share API response: %s', default='Unknown Share API Error') + + bug_reports_message()) + return + + for name, url in share_data.items(): + if not isinstance(name, str) or not url_or_none(url): + continue + + elif name.startswith('audio'): + match = re.search(self._SHARE_AUDIO_REGEX, url) + yield { + 'format_id': name, + 'url': url, + 'vcodec': 'none', + **traverse_obj(match, { + 'acodec': 'codec', + 'audio_channels': ('channels', {{'mono': 1, 'stereo': 2}.get}), + 'abr': ('bitrate', {int_or_none}), + 'ext': 'ext', + }), + } + + elif name.startswith('download'): + match = re.search(self._SHARE_VIDEO_REGEX, url) + yield { + 'format_id': name, + 'url': url, + **traverse_obj(match, { + 'vcodec': 'codec', + 'tbr': ('bitrate', {int_or_none}), + 'width': ('width', {int_or_none}), + 'height': ('height', {int_or_none}), + 'ext': 'ext', + }), + } + + def _real_extract(self, url): + video_id = self._match_id(url) + formats = [] + result = {'id': video_id, 'formats': formats} + + try: + formats.extend(self._extract_m3u8_formats( + self._INSTANCE_FORMAT.format(video_id), video_id, m3u8_id='instance')) + except ExtractorError as error: + if isinstance(error.cause, HTTPError) and error.cause.status == 404: + raise ExtractorError('Could not find video id', expected=True) + self.report_warning(f'Error extracting hls formats: {error}', video_id) + formats.extend(self._bt_extract_share_formats(video_id)) + if not formats: + self.raise_no_formats('Could not find suitable formats', video_id=video_id) + + result.update(traverse_obj(self._download_webpage( + self._OVERLAY_URL, video_id, + query={'videoid': video_id, 'view': 'main'}, + note='Downloading metadata overlay', fatal=False, + ), { + 'title': ( + {partial(get_element_text_and_html_by_tag, 'h3')}, 0, + {partial(re.sub, r'<span[^>]*>[^<]+</span>', '')}, {clean_html}), + 'description': ({partial(get_element_text_and_html_by_tag, 'p')}, 0, {clean_html}), + })) + + return result diff --git a/yt_dlp/extractor/byutv.py b/yt_dlp/extractor/byutv.py index 9ed6efe79..ad35427ed 100644 --- a/yt_dlp/extractor/byutv.py +++ b/yt_dlp/extractor/byutv.py @@ -8,9 +8,9 @@ class BYUtvIE(InfoExtractor): + _WORKING = False _VALID_URL = r'https?://(?:www\.)?byutv\.org/(?:watch|player)/(?!event/)(?P<id>[0-9a-f-]+)(?:/(?P<display_id>[^/?#&]+))?' 
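# A small sketch of the share-URL parsing idea used by BundestagIE above:
# codec, size and bitrate are encoded in the file name and recovered with a
# named-group regex. The pattern and the sample URL below are illustrative
# stand-ins, not the exact patterns or URLs from the patch.
import re

_VIDEO_RE = r'/\d+_(?P<codec>\w+)_(?P<width>\d+)_(?P<height>\d+)_(?P<bitrate>\d+)kb_[^/]+\.(?P<ext>\w+)'

def parse_share_video(url):
    match = re.search(_VIDEO_RE, url)
    if not match:
        return {}
    info = match.groupdict()
    return {
        'vcodec': info['codec'],
        'width': int(info['width']),
        'height': int(info['height']),
        'tbr': int(info['bitrate']),
        'ext': info['ext'],
    }

# e.g. parse_share_video('.../7605304_h264_1920_1080_5000kb_baseline_de_5000.mp4')
# -> {'vcodec': 'h264', 'width': 1920, 'height': 1080, 'tbr': 5000, 'ext': 'mp4'}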
_TESTS = [{ - # ooyalaVOD 'url': 'http://www.byutv.org/watch/6587b9a3-89d2-42a6-a7f7-fd2f81840a7d/studio-c-season-5-episode-5', 'info_dict': { 'id': 'ZvanRocTpW-G5_yZFeltTAMv6jxOU9KH', @@ -24,7 +24,6 @@ class BYUtvIE(InfoExtractor): 'params': { 'skip_download': True, }, - 'add_ie': ['Ooyala'], }, { # dvr 'url': 'https://www.byutv.org/player/8f1dab9b-b243-47c8-b525-3e2d021a3451/byu-softball-pacific-vs-byu-41219---game-2', @@ -63,19 +62,6 @@ def _real_extract(self, url): 'x-byutv-platformkey': 'xsaaw9c7y5', }) - ep = video.get('ooyalaVOD') - if ep: - return { - '_type': 'url_transparent', - 'ie_key': 'Ooyala', - 'url': 'ooyala:%s' % ep['providerId'], - 'id': video_id, - 'display_id': display_id, - 'title': ep.get('title'), - 'description': ep.get('description'), - 'thumbnail': ep.get('imageThumbnail'), - } - info = {} formats = [] subtitles = {} diff --git a/yt_dlp/extractor/camwithher.py b/yt_dlp/extractor/camwithher.py deleted file mode 100644 index a0b3749ed..000000000 --- a/yt_dlp/extractor/camwithher.py +++ /dev/null @@ -1,87 +0,0 @@ -import re - -from .common import InfoExtractor -from ..utils import ( - int_or_none, - parse_duration, - unified_strdate, -) - - -class CamWithHerIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?camwithher\.tv/view_video\.php\?.*\bviewkey=(?P<id>\w+)' - - _TESTS = [{ - 'url': 'http://camwithher.tv/view_video.php?viewkey=6e9a24e2c0e842e1f177&page=&viewtype=&category=', - 'info_dict': { - 'id': '5644', - 'ext': 'flv', - 'title': 'Periscope Tease', - 'description': 'In the clouds teasing on periscope to my favorite song', - 'duration': 240, - 'view_count': int, - 'comment_count': int, - 'uploader': 'MileenaK', - 'upload_date': '20160322', - 'age_limit': 18, - }, - 'params': { - 'skip_download': True, - } - }, { - 'url': 'http://camwithher.tv/view_video.php?viewkey=6dfd8b7c97531a459937', - 'only_matching': True, - }, { - 'url': 'http://camwithher.tv/view_video.php?page=&viewkey=6e9a24e2c0e842e1f177&viewtype=&category=', - 'only_matching': True, - }, { - 'url': 'http://camwithher.tv/view_video.php?viewkey=b6c3b5bea9515d1a1fc4&page=&viewtype=&category=mv', - 'only_matching': True, - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - - webpage = self._download_webpage(url, video_id) - - flv_id = self._html_search_regex( - r'<a[^>]+href=["\']/download/\?v=(\d+)', webpage, 'video id') - - # Video URL construction algorithm is reverse-engineered from cwhplayer.swf - rtmp_url = 'rtmp://camwithher.tv/clipshare/%s' % ( - ('mp4:%s.mp4' % flv_id) if int(flv_id) > 2010 else flv_id) - - title = self._html_search_regex( - r'<div[^>]+style="float:left"[^>]*>\s*<h2>(.+?)</h2>', webpage, 'title') - description = self._html_search_regex( - r'>Description:</span>(.+?)</div>', webpage, 'description', default=None) - - runtime = self._search_regex( - r'Runtime\s*:\s*(.+?) 
\|', webpage, 'duration', default=None) - if runtime: - runtime = re.sub(r'[\s-]', '', runtime) - duration = parse_duration(runtime) - view_count = int_or_none(self._search_regex( - r'Views\s*:\s*(\d+)', webpage, 'view count', default=None)) - comment_count = int_or_none(self._search_regex( - r'Comments\s*:\s*(\d+)', webpage, 'comment count', default=None)) - - uploader = self._search_regex( - r'Added by\s*:\s*<a[^>]+>([^<]+)</a>', webpage, 'uploader', default=None) - upload_date = unified_strdate(self._search_regex( - r'Added on\s*:\s*([\d-]+)', webpage, 'upload date', default=None)) - - return { - 'id': flv_id, - 'url': rtmp_url, - 'ext': 'flv', - 'no_resume': True, - 'title': title, - 'description': description, - 'duration': duration, - 'view_count': view_count, - 'comment_count': comment_count, - 'uploader': uploader, - 'upload_date': upload_date, - 'age_limit': 18 - } diff --git a/yt_dlp/extractor/carambatv.py b/yt_dlp/extractor/carambatv.py deleted file mode 100644 index d6044a319..000000000 --- a/yt_dlp/extractor/carambatv.py +++ /dev/null @@ -1,105 +0,0 @@ -from .common import InfoExtractor -from ..compat import compat_str -from ..utils import ( - format_field, - float_or_none, - int_or_none, - try_get, -) - -from .videomore import VideomoreIE - - -class CarambaTVIE(InfoExtractor): - _VALID_URL = r'(?:carambatv:|https?://video1\.carambatv\.ru/v/)(?P<id>\d+)' - _TESTS = [{ - 'url': 'http://video1.carambatv.ru/v/191910501', - 'md5': '2f4a81b7cfd5ab866ee2d7270cb34a2a', - 'info_dict': { - 'id': '191910501', - 'ext': 'mp4', - 'title': '[BadComedian] - Разборка в Маниле (Абсолютный обзор)', - 'thumbnail': r're:^https?://.*\.jpg', - 'duration': 2678.31, - }, - }, { - 'url': 'carambatv:191910501', - 'only_matching': True, - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - - video = self._download_json( - 'http://video1.carambatv.ru/v/%s/videoinfo.js' % video_id, - video_id) - - title = video['title'] - - base_url = video.get('video') or 'http://video1.carambatv.ru/v/%s/' % video_id - - formats = [{ - 'url': base_url + f['fn'], - 'height': int_or_none(f.get('height')), - 'format_id': format_field(f, 'height', '%sp'), - } for f in video['qualities'] if f.get('fn')] - - thumbnail = video.get('splash') - duration = float_or_none(try_get( - video, lambda x: x['annotations'][0]['end_time'], compat_str)) - - return { - 'id': video_id, - 'title': title, - 'thumbnail': thumbnail, - 'duration': duration, - 'formats': formats, - } - - -class CarambaTVPageIE(InfoExtractor): - _VALID_URL = r'https?://carambatv\.ru/(?:[^/]+/)+(?P<id>[^/?#&]+)' - _TEST = { - 'url': 'http://carambatv.ru/movie/bad-comedian/razborka-v-manile/', - 'md5': 'a49fb0ec2ad66503eeb46aac237d3c86', - 'info_dict': { - 'id': '475222', - 'ext': 'flv', - 'title': '[BadComedian] - Разборка в Маниле (Абсолютный обзор)', - 'thumbnail': r're:^https?://.*\.jpg', - # duration reported by videomore is incorrect - 'duration': int, - }, - 'add_ie': [VideomoreIE.ie_key()], - } - - def _real_extract(self, url): - video_id = self._match_id(url) - - webpage = self._download_webpage(url, video_id) - - videomore_url = VideomoreIE._extract_url(webpage) - if not videomore_url: - videomore_id = self._search_regex( - r'getVMCode\s*\(\s*["\']?(\d+)', webpage, 'videomore id', - default=None) - if videomore_id: - videomore_url = 'videomore:%s' % videomore_id - if videomore_url: - title = self._og_search_title(webpage) - return { - '_type': 'url_transparent', - 'url': videomore_url, - 'ie_key': VideomoreIE.ie_key(), - 'title': title, - 
} - - video_url = self._og_search_property('video:iframe', webpage, default=None) - - if not video_url: - video_id = self._search_regex( - r'(?:video_id|crmb_vuid)\s*[:=]\s*["\']?(\d+)', - webpage, 'video id') - video_url = 'carambatv:%s' % video_id - - return self.url_result(video_url, CarambaTVIE.ie_key()) diff --git a/yt_dlp/extractor/cbc.py b/yt_dlp/extractor/cbc.py index be2d13e44..b5beb1ec8 100644 --- a/yt_dlp/extractor/cbc.py +++ b/yt_dlp/extractor/cbc.py @@ -1,8 +1,9 @@ -import re -import json import base64 +import json +import re import time import urllib.parse +import xml.etree.ElementTree from .common import InfoExtractor from ..compat import ( @@ -179,6 +180,13 @@ class CBCPlayerIE(InfoExtractor): 'thumbnail': 'http://thumbnails.cbc.ca/maven_legacy/thumbnails/sonali-karnick-220.jpg', 'chapters': [], 'duration': 494.811, + 'categories': ['AudioMobile/All in a Weekend Montreal'], + 'tags': 'count:8', + 'location': 'Quebec', + 'series': 'All in a Weekend Montreal', + 'season': 'Season 2015', + 'season_number': 2015, + 'media_type': 'Excerpt', }, }, { 'url': 'http://www.cbc.ca/player/play/2164402062', @@ -194,25 +202,37 @@ class CBCPlayerIE(InfoExtractor): 'thumbnail': 'https://thumbnails.cbc.ca/maven_legacy/thumbnails/277/67/cancer_852x480_2164412612.jpg', 'chapters': [], 'duration': 186.867, + 'series': 'CBC News: Windsor at 6:00', + 'categories': ['News/Canada/Windsor'], + 'location': 'Windsor', + 'tags': ['cancer'], + 'creator': 'Allison Johnson', + 'media_type': 'Excerpt', }, }, { # Has subtitles # These broadcasts expire after ~1 month, can find new test URL here: # https://www.cbc.ca/player/news/TV%20Shows/The%20National/Latest%20Broadcast - 'url': 'http://www.cbc.ca/player/play/2249992771553', - 'md5': '2f2fb675dd4f0f8a5bb7588d1b13bacd', + 'url': 'http://www.cbc.ca/player/play/2284799043667', + 'md5': '9b49f0839e88b6ec0b01d840cf3d42b5', 'info_dict': { - 'id': '2249992771553', + 'id': '2284799043667', 'ext': 'mp4', - 'title': 'The National | Women’s soccer pay, Florida seawater, Swift quake', - 'description': 'md5:adba28011a56cfa47a080ff198dad27a', - 'timestamp': 1690596000, - 'duration': 2716.333, + 'title': 'The National | Hockey coach charged, Green grants, Safer drugs', + 'description': 'md5:84ef46321c94bcf7d0159bb565d26bfa', + 'timestamp': 1700272800, + 'duration': 2718.833, 'subtitles': {'eng': [{'ext': 'vtt', 'protocol': 'm3u8_native'}]}, - 'thumbnail': 'https://thumbnails.cbc.ca/maven_legacy/thumbnails/481/326/thumbnail.jpeg', + 'thumbnail': 'https://thumbnails.cbc.ca/maven_legacy/thumbnails/907/171/thumbnail.jpeg', 'uploader': 'CBCC-NEW', 'chapters': 'count:5', - 'upload_date': '20230729', + 'upload_date': '20231118', + 'categories': 'count:4', + 'series': 'The National - Full Show', + 'tags': 'count:1', + 'creator': 'News', + 'location': 'Canada', + 'media_type': 'Full Program', }, }] @@ -387,7 +407,7 @@ def _find_secret_formats(self, formats, video_id): url = re.sub(r'(Manifest\(.*?),format=[\w-]+(.*?\))', r'\1\2', base_url) secret_xml = self._download_xml(url, video_id, note='Downloading secret XML', fatal=False) - if not secret_xml: + if not isinstance(secret_xml, xml.etree.ElementTree.Element): return for child in secret_xml: diff --git a/yt_dlp/extractor/channel9.py b/yt_dlp/extractor/channel9.py deleted file mode 100644 index a88474060..000000000 --- a/yt_dlp/extractor/channel9.py +++ /dev/null @@ -1,252 +0,0 @@ -import re - -from .common import InfoExtractor -from ..utils import ( - clean_html, - int_or_none, - parse_iso8601, - qualities, - unescapeHTML, -) 
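# Why the cbc.py hunk above replaces `if not secret_xml` with an isinstance()
# check: xml.etree Elements test as falsy when they have no children, so a
# syntactically valid but empty manifest would be treated like a failed
# download. Checking the type distinguishes "download failed" (None) from
# "valid but empty element". A minimal sketch of the guard:
import xml.etree.ElementTree as ET

def iter_secret_children(secret_xml):
    if not isinstance(secret_xml, ET.Element):  # None/False => download failed
        return
    for child in secret_xml:  # safe even when the element has no children
        yield child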
- - -class Channel9IE(InfoExtractor): - IE_DESC = 'Channel 9' - IE_NAME = 'channel9' - _VALID_URL = r'https?://(?:www\.)?(?:channel9\.msdn\.com|s\.ch9\.ms)/(?P<contentpath>.+?)(?P<rss>/RSS)?/?(?:[?#&]|$)' - _EMBED_REGEX = [r'<iframe[^>]+src=["\'](?P<url>https?://channel9\.msdn\.com/(?:[^/]+/)+)player\b'] - - _TESTS = [{ - 'url': 'http://channel9.msdn.com/Events/TechEd/Australia/2013/KOS002', - 'md5': '32083d4eaf1946db6d454313f44510ca', - 'info_dict': { - 'id': '6c413323-383a-49dc-88f9-a22800cab024', - 'ext': 'wmv', - 'title': 'Developer Kick-Off Session: Stuff We Love', - 'description': 'md5:b80bf9355a503c193aff7ec6cd5a7731', - 'duration': 4576, - 'thumbnail': r're:https?://.*\.jpg', - 'timestamp': 1377717420, - 'upload_date': '20130828', - 'session_code': 'KOS002', - 'session_room': 'Arena 1A', - 'session_speakers': 'count:5', - }, - }, { - 'url': 'http://channel9.msdn.com/posts/Self-service-BI-with-Power-BI-nuclear-testing', - 'md5': 'dcf983ee6acd2088e7188c3cf79b46bc', - 'info_dict': { - 'id': 'fe8e435f-bb93-4e01-8e97-a28c01887024', - 'ext': 'wmv', - 'title': 'Self-service BI with Power BI - nuclear testing', - 'description': 'md5:2d17fec927fc91e9e17783b3ecc88f54', - 'duration': 1540, - 'thumbnail': r're:https?://.*\.jpg', - 'timestamp': 1386381991, - 'upload_date': '20131207', - 'authors': ['Mike Wilmot'], - }, - }, { - # low quality mp4 is best - 'url': 'https://channel9.msdn.com/Events/CPP/CppCon-2015/Ranges-for-the-Standard-Library', - 'info_dict': { - 'id': '33ad69d2-6a4e-4172-83a1-a523013dec76', - 'ext': 'mp4', - 'title': 'Ranges for the Standard Library', - 'description': 'md5:9895e0a9fd80822d2f01c454b8f4a372', - 'duration': 5646, - 'thumbnail': r're:https?://.*\.jpg', - 'upload_date': '20150930', - 'timestamp': 1443640735, - }, - 'params': { - 'skip_download': True, - }, - }, { - 'url': 'https://channel9.msdn.com/Events/DEVintersection/DEVintersection-2016/RSS', - 'info_dict': { - 'id': 'Events/DEVintersection/DEVintersection-2016', - 'title': 'DEVintersection 2016 Orlando Sessions', - }, - 'playlist_mincount': 14, - }, { - 'url': 'https://channel9.msdn.com/Niners/Splendid22/Queue/76acff796e8f411184b008028e0d492b/RSS', - 'only_matching': True, - }, { - 'url': 'https://channel9.msdn.com/Events/Speakers/scott-hanselman/RSS?UrlSafeName=scott-hanselman', - 'only_matching': True, - }] - - _RSS_URL = 'http://channel9.msdn.com/%s/RSS' - - def _extract_list(self, video_id, rss_url=None): - if not rss_url: - rss_url = self._RSS_URL % video_id - rss = self._download_xml(rss_url, video_id, 'Downloading RSS') - entries = [self.url_result(session_url.text, 'Channel9') - for session_url in rss.findall('./channel/item/link')] - title_text = rss.find('./channel/title').text - return self.playlist_result(entries, video_id, title_text) - - def _real_extract(self, url): - content_path, rss = self._match_valid_url(url).groups() - - if rss: - return self._extract_list(content_path, url) - - webpage = self._download_webpage( - url, content_path, 'Downloading web page') - - episode_data = self._search_regex( - r"data-episode='([^']+)'", webpage, 'episode data', default=None) - if episode_data: - episode_data = self._parse_json(unescapeHTML( - episode_data), content_path) - content_id = episode_data['contentId'] - is_session = '/Sessions(' in episode_data['api'] - content_url = 'https://channel9.msdn.com/odata' + episode_data['api'] + 
'?$select=Captions,CommentCount,MediaLengthInSeconds,PublishedDate,Rating,RatingCount,Title,VideoMP4High,VideoMP4Low,VideoMP4Medium,VideoPlayerPreviewImage,VideoWMV,VideoWMVHQ,Views,' - if is_session: - content_url += 'Code,Description,Room,Slides,Speakers,ZipFile&$expand=Speakers' - else: - content_url += 'Authors,Body&$expand=Authors' - content_data = self._download_json(content_url, content_id) - title = content_data['Title'] - - QUALITIES = ( - 'mp3', - 'wmv', 'mp4', - 'wmv-low', 'mp4-low', - 'wmv-mid', 'mp4-mid', - 'wmv-high', 'mp4-high', - ) - - quality_key = qualities(QUALITIES) - - def quality(quality_id, format_url): - return (len(QUALITIES) if '_Source.' in format_url - else quality_key(quality_id)) - - formats = [] - urls = set() - - SITE_QUALITIES = { - 'MP3': 'mp3', - 'MP4': 'mp4', - 'Low Quality WMV': 'wmv-low', - 'Low Quality MP4': 'mp4-low', - 'Mid Quality WMV': 'wmv-mid', - 'Mid Quality MP4': 'mp4-mid', - 'High Quality WMV': 'wmv-high', - 'High Quality MP4': 'mp4-high', - } - - formats_select = self._search_regex( - r'(?s)<select[^>]+name=["\']format[^>]+>(.+?)</select', webpage, - 'formats select', default=None) - if formats_select: - for mobj in re.finditer( - r'<option\b[^>]+\bvalue=(["\'])(?P<url>(?:(?!\1).)+)\1[^>]*>\s*(?P<format>[^<]+?)\s*<', - formats_select): - format_url = mobj.group('url') - if format_url in urls: - continue - urls.add(format_url) - format_id = mobj.group('format') - quality_id = SITE_QUALITIES.get(format_id, format_id) - formats.append({ - 'url': format_url, - 'format_id': quality_id, - 'quality': quality(quality_id, format_url), - 'vcodec': 'none' if quality_id == 'mp3' else None, - }) - - API_QUALITIES = { - 'VideoMP4Low': 'mp4-low', - 'VideoWMV': 'wmv-mid', - 'VideoMP4Medium': 'mp4-mid', - 'VideoMP4High': 'mp4-high', - 'VideoWMVHQ': 'wmv-hq', - } - - for format_id, q in API_QUALITIES.items(): - q_url = content_data.get(format_id) - if not q_url or q_url in urls: - continue - urls.add(q_url) - formats.append({ - 'url': q_url, - 'format_id': q, - 'quality': quality(q, q_url), - }) - - slides = content_data.get('Slides') - zip_file = content_data.get('ZipFile') - - if not formats and not slides and not zip_file: - self.raise_no_formats( - 'None of recording, slides or zip are available for %s' % content_path) - - subtitles = {} - for caption in content_data.get('Captions', []): - caption_url = caption.get('Url') - if not caption_url: - continue - subtitles.setdefault(caption.get('Language', 'en'), []).append({ - 'url': caption_url, - 'ext': 'vtt', - }) - - common = { - 'id': content_id, - 'title': title, - 'description': clean_html(content_data.get('Description') or content_data.get('Body')), - 'thumbnail': content_data.get('VideoPlayerPreviewImage'), - 'duration': int_or_none(content_data.get('MediaLengthInSeconds')), - 'timestamp': parse_iso8601(content_data.get('PublishedDate')), - 'avg_rating': int_or_none(content_data.get('Rating')), - 'rating_count': int_or_none(content_data.get('RatingCount')), - 'view_count': int_or_none(content_data.get('Views')), - 'comment_count': int_or_none(content_data.get('CommentCount')), - 'subtitles': subtitles, - } - if is_session: - speakers = [] - for s in content_data.get('Speakers', []): - speaker_name = s.get('FullName') - if not speaker_name: - continue - speakers.append(speaker_name) - - common.update({ - 'session_code': content_data.get('Code'), - 'session_room': content_data.get('Room'), - 'session_speakers': speakers, - }) - else: - authors = [] - for a in content_data.get('Authors', []): - 
author_name = a.get('DisplayName') - if not author_name: - continue - authors.append(author_name) - common['authors'] = authors - - contents = [] - - if slides: - d = common.copy() - d.update({'title': title + '-Slides', 'url': slides}) - contents.append(d) - - if zip_file: - d = common.copy() - d.update({'title': title + '-Zip', 'url': zip_file}) - contents.append(d) - - if formats: - d = common.copy() - d.update({'title': title, 'formats': formats}) - contents.append(d) - return self.playlist_result(contents) - else: - return self._extract_list(content_path) diff --git a/yt_dlp/extractor/chirbit.py b/yt_dlp/extractor/chirbit.py deleted file mode 100644 index 452711d97..000000000 --- a/yt_dlp/extractor/chirbit.py +++ /dev/null @@ -1,88 +0,0 @@ -import re - -from .common import InfoExtractor -from ..compat import compat_b64decode -from ..utils import parse_duration - - -class ChirbitIE(InfoExtractor): - IE_NAME = 'chirbit' - _VALID_URL = r'https?://(?:www\.)?chirb\.it/(?:(?:wp|pl)/|fb_chirbit_player\.swf\?key=)?(?P<id>[\da-zA-Z]+)' - _TESTS = [{ - 'url': 'http://chirb.it/be2abG', - 'info_dict': { - 'id': 'be2abG', - 'ext': 'mp3', - 'title': 'md5:f542ea253f5255240be4da375c6a5d7e', - 'description': 'md5:f24a4e22a71763e32da5fed59e47c770', - 'duration': 306, - 'uploader': 'Gerryaudio', - }, - 'params': { - 'skip_download': True, - } - }, { - 'url': 'https://chirb.it/fb_chirbit_player.swf?key=PrIPv5', - 'only_matching': True, - }, { - 'url': 'https://chirb.it/wp/MN58c2', - 'only_matching': True, - }] - - def _real_extract(self, url): - audio_id = self._match_id(url) - - webpage = self._download_webpage( - 'http://chirb.it/%s' % audio_id, audio_id) - - data_fd = self._search_regex( - r'data-fd=(["\'])(?P<url>(?:(?!\1).)+)\1', - webpage, 'data fd', group='url') - - # Reverse engineered from https://chirb.it/js/chirbit.player.js (look - # for soundURL) - audio_url = compat_b64decode(data_fd[::-1]).decode('utf-8') - - title = self._search_regex( - r'class=["\']chirbit-title["\'][^>]*>([^<]+)', webpage, 'title') - description = self._search_regex( - r'<h3>Description</h3>\s*<pre[^>]*>([^<]+)</pre>', - webpage, 'description', default=None) - duration = parse_duration(self._search_regex( - r'class=["\']c-length["\'][^>]*>([^<]+)', - webpage, 'duration', fatal=False)) - uploader = self._search_regex( - r'id=["\']chirbit-username["\'][^>]*>([^<]+)', - webpage, 'uploader', fatal=False) - - return { - 'id': audio_id, - 'url': audio_url, - 'title': title, - 'description': description, - 'duration': duration, - 'uploader': uploader, - } - - -class ChirbitProfileIE(InfoExtractor): - IE_NAME = 'chirbit:profile' - _VALID_URL = r'https?://(?:www\.)?chirbit\.com/(?:rss/)?(?P<id>[^/]+)' - _TEST = { - 'url': 'http://chirbit.com/ScarletBeauty', - 'info_dict': { - 'id': 'ScarletBeauty', - }, - 'playlist_mincount': 3, - } - - def _real_extract(self, url): - profile_id = self._match_id(url) - - webpage = self._download_webpage(url, profile_id) - - entries = [ - self.url_result(self._proto_relative_url('//chirb.it/' + video_id)) - for _, video_id in re.findall(r'<input[^>]+id=([\'"])copy-btn-(?P<id>[0-9a-zA-Z]+)\1', webpage)] - - return self.playlist_result(entries, profile_id) diff --git a/yt_dlp/extractor/chzzk.py b/yt_dlp/extractor/chzzk.py new file mode 100644 index 000000000..6894baea5 --- /dev/null +++ b/yt_dlp/extractor/chzzk.py @@ -0,0 +1,139 @@ +import functools + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + float_or_none, + int_or_none, + parse_iso8601, + url_or_none, +) +from 
..utils.traversal import traverse_obj + + +class CHZZKLiveIE(InfoExtractor): + IE_NAME = 'chzzk:live' + _VALID_URL = r'https?://chzzk\.naver\.com/live/(?P<id>[\da-f]+)' + _TESTS = [{ + 'url': 'https://chzzk.naver.com/live/c68b8ef525fb3d2fa146344d84991753', + 'info_dict': { + 'id': 'c68b8ef525fb3d2fa146344d84991753', + 'ext': 'mp4', + 'title': str, + 'channel': '진짜도현', + 'channel_id': 'c68b8ef525fb3d2fa146344d84991753', + 'channel_is_verified': False, + 'thumbnail': r're:^https?://.*\.jpg$', + 'timestamp': 1705510344, + 'upload_date': '20240117', + 'live_status': 'is_live', + 'view_count': int, + 'concurrent_view_count': int, + }, + 'skip': 'The channel is not currently live', + }] + + def _real_extract(self, url): + channel_id = self._match_id(url) + live_detail = self._download_json( + f'https://api.chzzk.naver.com/service/v2/channels/{channel_id}/live-detail', channel_id, + note='Downloading channel info', errnote='Unable to download channel info')['content'] + + if live_detail.get('status') == 'CLOSE': + raise ExtractorError('The channel is not currently live', expected=True) + + live_playback = self._parse_json(live_detail['livePlaybackJson'], channel_id) + + thumbnails = [] + thumbnail_template = traverse_obj( + live_playback, ('thumbnail', 'snapshotThumbnailTemplate', {url_or_none})) + if thumbnail_template and '{type}' in thumbnail_template: + for width in traverse_obj(live_playback, ('thumbnail', 'types', ..., {str})): + thumbnails.append({ + 'id': width, + 'url': thumbnail_template.replace('{type}', width), + 'width': int_or_none(width), + }) + + formats, subtitles = [], {} + for media in traverse_obj(live_playback, ('media', lambda _, v: url_or_none(v['path']))): + is_low_latency = media.get('mediaId') == 'LLHLS' + fmts, subs = self._extract_m3u8_formats_and_subtitles( + media['path'], channel_id, 'mp4', fatal=False, live=True, + m3u8_id='hls-ll' if is_low_latency else 'hls') + for f in fmts: + if is_low_latency: + f['source_preference'] = -2 + if '-afragalow.stream-audio.stream' in f['format_id']: + f['quality'] = -2 + formats.extend(fmts) + self._merge_subtitles(subs, target=subtitles) + + return { + 'id': channel_id, + 'is_live': True, + 'formats': formats, + 'subtitles': subtitles, + 'thumbnails': thumbnails, + **traverse_obj(live_detail, { + 'title': ('liveTitle', {str}), + 'timestamp': ('openDate', {functools.partial(parse_iso8601, delimiter=' ')}), + 'concurrent_view_count': ('concurrentUserCount', {int_or_none}), + 'view_count': ('accumulateCount', {int_or_none}), + 'channel': ('channel', 'channelName', {str}), + 'channel_id': ('channel', 'channelId', {str}), + 'channel_is_verified': ('channel', 'verifiedMark', {bool}), + }), + } + + +class CHZZKVideoIE(InfoExtractor): + IE_NAME = 'chzzk:video' + _VALID_URL = r'https?://chzzk\.naver\.com/video/(?P<id>\d+)' + _TESTS = [{ + 'url': 'https://chzzk.naver.com/video/1754', + 'md5': 'b0c0c1bb888d913b93d702b1512c7f06', + 'info_dict': { + 'id': '1754', + 'ext': 'mp4', + 'title': '치지직 테스트 방송', + 'channel': '침착맨', + 'channel_id': 'bb382c2c0cc9fa7c86ab3b037fb5799c', + 'channel_is_verified': False, + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 15577, + 'timestamp': 1702970505.417, + 'upload_date': '20231219', + 'view_count': int, + }, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + video_meta = self._download_json( + f'https://api.chzzk.naver.com/service/v2/videos/{video_id}', video_id, + note='Downloading video info', errnote='Unable to download video info')['content'] + formats, subtitles = 
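# Note: CHZZKLiveIE above expands a snapshot thumbnail template whose URL
# contains a literal '{type}' placeholder, emitting one thumbnail entry per
# advertised width. A self-contained sketch of that expansion (template and
# widths are invented, not real CHZZK values):
def expand_thumbnails(template, widths):
    # substitute the literal '{type}' placeholder with each width string
    return [{
        'id': w,
        'url': template.replace('{type}', w),
        'width': int(w) if w.isdigit() else None,  # stands in for int_or_none
    } for w in widths]

# expand_thumbnails('https://img.example/snapshot_{type}.jpg', ['480', '720'])
# -> entries for snapshot_480.jpg (width 480) and snapshot_720.jpg (width 720)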
self._extract_mpd_formats_and_subtitles( + f'https://apis.naver.com/neonplayer/vodplay/v1/playback/{video_meta["videoId"]}', video_id, + query={ + 'key': video_meta['inKey'], + 'env': 'real', + 'lc': 'en_US', + 'cpl': 'en_US', + }, note='Downloading video playback', errnote='Unable to download video playback') + + return { + 'id': video_id, + 'formats': formats, + 'subtitles': subtitles, + **traverse_obj(video_meta, { + 'title': ('videoTitle', {str}), + 'thumbnail': ('thumbnailImageUrl', {url_or_none}), + 'timestamp': ('publishDateAt', {functools.partial(float_or_none, scale=1000)}), + 'view_count': ('readCount', {int_or_none}), + 'duration': ('duration', {int_or_none}), + 'channel': ('channel', 'channelName', {str}), + 'channel_id': ('channel', 'channelId', {str}), + 'channel_is_verified': ('channel', 'verifiedMark', {bool}), + }), + } diff --git a/yt_dlp/extractor/cinchcast.py b/yt_dlp/extractor/cinchcast.py deleted file mode 100644 index 7a7ea8b22..000000000 --- a/yt_dlp/extractor/cinchcast.py +++ /dev/null @@ -1,56 +0,0 @@ -from .common import InfoExtractor -from ..utils import ( - unified_strdate, - xpath_text, -) - - -class CinchcastIE(InfoExtractor): - _VALID_URL = r'https?://player\.cinchcast\.com/.*?(?:assetId|show_id)=(?P<id>[0-9]+)' - _EMBED_REGEX = [r'<iframe[^>]+?src=(["\'])(?P<url>https?://player\.cinchcast\.com/.+?)\1'] - - _TESTS = [{ - 'url': 'http://player.cinchcast.com/?show_id=5258197&platformId=1&assetType=single', - 'info_dict': { - 'id': '5258197', - 'ext': 'mp3', - 'title': 'Train Your Brain to Up Your Game with Coach Mandy', - 'upload_date': '20130816', - }, - }, { - # Actual test is run in generic, look for undergroundwellness - 'url': 'http://player.cinchcast.com/?platformId=1&assetType=single&assetId=7141703', - 'only_matching': True, - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - doc = self._download_xml( - 'http://www.blogtalkradio.com/playerasset/mrss?assetType=single&assetId=%s' % video_id, - video_id) - - item = doc.find('.//item') - title = xpath_text(item, './title', fatal=True) - date_str = xpath_text( - item, './{http://developer.longtailvideo.com/trac/}date') - upload_date = unified_strdate(date_str, day_first=False) - # duration is present but wrong - formats = [{ - 'format_id': 'main', - 'url': item.find('./{http://search.yahoo.com/mrss/}content').attrib['url'], - }] - backup_url = xpath_text( - item, './{http://developer.longtailvideo.com/trac/}backupContent') - if backup_url: - formats.append({ - 'preference': 2, # seems to be more reliable - 'format_id': 'backup', - 'url': backup_url, - }) - - return { - 'id': video_id, - 'title': title, - 'upload_date': upload_date, - 'formats': formats, - } diff --git a/yt_dlp/extractor/clipsyndicate.py b/yt_dlp/extractor/clipsyndicate.py deleted file mode 100644 index 606444321..000000000 --- a/yt_dlp/extractor/clipsyndicate.py +++ /dev/null @@ -1,52 +0,0 @@ -from .common import InfoExtractor -from ..utils import ( - find_xpath_attr, - fix_xml_ampersands -) - - -class ClipsyndicateIE(InfoExtractor): - _VALID_URL = r'https?://(?:chic|www)\.clipsyndicate\.com/video/play(list/\d+)?/(?P<id>\d+)' - - _TESTS = [{ - 'url': 'http://www.clipsyndicate.com/video/play/4629301/brick_briscoe', - 'md5': '4d7d549451bad625e0ff3d7bd56d776c', - 'info_dict': { - 'id': '4629301', - 'ext': 'mp4', - 'title': 'Brick Briscoe', - 'duration': 612, - 'thumbnail': r're:^https?://.+\.jpg', - }, - }, { - 'url': 'http://chic.clipsyndicate.com/video/play/5844117/shark_attack', - 'only_matching': True, - }] - - def 
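# Note: CHZZKVideoIE below wraps float_or_none in functools.partial inside
# its traverse_obj template because the API reports 'publishDateAt' in epoch
# milliseconds. The same conversion in isolation (assumes yt-dlp is
# importable as a library):
import functools

from yt_dlp.utils import float_or_none

ms_to_seconds = functools.partial(float_or_none, scale=1000)
# ms_to_seconds(1702970505417) -> 1702970505.417, matching the test's 'timestamp'
# ms_to_seconds(None) -> None, so a missing field simply stays unset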
_real_extract(self, url): - video_id = self._match_id(url) - js_player = self._download_webpage( - 'http://eplayer.clipsyndicate.com/embed/player.js?va_id=%s' % video_id, - video_id, 'Downlaoding player') - # it includes a required token - flvars = self._search_regex(r'flvars: "(.*?)"', js_player, 'flvars') - - pdoc = self._download_xml( - 'http://eplayer.clipsyndicate.com/osmf/playlist?%s' % flvars, - video_id, 'Downloading video info', - transform_source=fix_xml_ampersands) - - track_doc = pdoc.find('trackList/track') - - def find_param(name): - node = find_xpath_attr(track_doc, './/param', 'name', name) - if node is not None: - return node.attrib['value'] - - return { - 'id': video_id, - 'title': find_param('title'), - 'url': track_doc.find('location').text, - 'thumbnail': find_param('thumbnail'), - 'duration': int(find_param('duration')), - } diff --git a/yt_dlp/extractor/cloudflarestream.py b/yt_dlp/extractor/cloudflarestream.py index 748e8e908..c4c7d66a5 100644 --- a/yt_dlp/extractor/cloudflarestream.py +++ b/yt_dlp/extractor/cloudflarestream.py @@ -46,15 +46,18 @@ def _real_extract(self, url): video_id.split('.')[1] + '==='), video_id)['sub'] manifest_base_url = base_url + 'manifest/video.' - formats = self._extract_m3u8_formats( + formats, subtitles = self._extract_m3u8_formats_and_subtitles( manifest_base_url + 'm3u8', video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False) - formats.extend(self._extract_mpd_formats( - manifest_base_url + 'mpd', video_id, mpd_id='dash', fatal=False)) + fmts, subs = self._extract_mpd_formats_and_subtitles( + manifest_base_url + 'mpd', video_id, mpd_id='dash', fatal=False) + formats.extend(fmts) + self._merge_subtitles(subs, target=subtitles) return { 'id': video_id, 'title': video_id, 'thumbnail': base_url + 'thumbnails/thumbnail.jpg', 'formats': formats, + 'subtitles': subtitles, } diff --git a/yt_dlp/extractor/cloudy.py b/yt_dlp/extractor/cloudy.py deleted file mode 100644 index 848643e26..000000000 --- a/yt_dlp/extractor/cloudy.py +++ /dev/null @@ -1,57 +0,0 @@ -from .common import InfoExtractor -from ..utils import ( - str_to_int, - unified_strdate, -) - - -class CloudyIE(InfoExtractor): - _IE_DESC = 'cloudy.ec' - _VALID_URL = r'https?://(?:www\.)?cloudy\.ec/(?:v/|embed\.php\?.*?\bid=)(?P<id>[A-Za-z0-9]+)' - _TESTS = [{ - 'url': 'https://www.cloudy.ec/v/af511e2527aac', - 'md5': '29832b05028ead1b58be86bf319397ca', - 'info_dict': { - 'id': 'af511e2527aac', - 'ext': 'mp4', - 'title': 'Funny Cats and Animals Compilation june 2013', - 'upload_date': '20130913', - 'view_count': int, - } - }, { - 'url': 'http://www.cloudy.ec/embed.php?autoplay=1&id=af511e2527aac', - 'only_matching': True, - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - - webpage = self._download_webpage( - 'https://www.cloudy.ec/embed.php', video_id, query={ - 'id': video_id, - 'playerPage': 1, - 'autoplay': 1, - }) - - info = self._parse_html5_media_entries(url, webpage, video_id)[0] - - webpage = self._download_webpage( - 'https://www.cloudy.ec/v/%s' % video_id, video_id, fatal=False) - - if webpage: - info.update({ - 'title': self._search_regex( - r'<h\d[^>]*>([^<]+)<', webpage, 'title'), - 'upload_date': unified_strdate(self._search_regex( - r'>Published at (\d{4}-\d{1,2}-\d{1,2})', webpage, - 'upload date', fatal=False)), - 'view_count': str_to_int(self._search_regex( - r'([\d,.]+) views<', webpage, 'view count', fatal=False)), - }) - - if not info.get('title'): - info['title'] = video_id - - info['id'] = video_id - - return info diff --git 
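# Note: the CloudflareStream hunk above moves both manifests to the
# *_and_subtitles helpers and folds the DASH subtitle tracks into the HLS
# ones with self._merge_subtitles. A rough standalone equivalent of that
# merge, using plain dicts instead of the InfoExtractor helper:
def merge_subtitles(target, *others):
    # append each language's track list, skipping URLs already present
    for subs in others:
        for lang, tracks in subs.items():
            seen = {t.get('url') for t in target.get(lang, [])}
            target.setdefault(lang, []).extend(
                t for t in tracks if t.get('url') not in seen)
    return target

hls_subs = {'en': [{'url': 'https://cdn.example/en.vtt'}]}
dash_subs = {'en': [{'url': 'https://cdn.example/en.vtt'}],
             'de': [{'url': 'https://cdn.example/de.vtt'}]}
merged = merge_subtitles(hls_subs, dash_subs)  # keeps one 'en' track, adds 'de'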
a/yt_dlp/extractor/clubic.py b/yt_dlp/extractor/clubic.py index 403e44aaf..716f25969 100644 --- a/yt_dlp/extractor/clubic.py +++ b/yt_dlp/extractor/clubic.py @@ -6,6 +6,7 @@ class ClubicIE(InfoExtractor): + _WORKING = False _VALID_URL = r'https?://(?:www\.)?clubic\.com/video/(?:[^/]+/)*video.*-(?P<id>[0-9]+)\.html' _TESTS = [{ diff --git a/yt_dlp/extractor/cmt.py b/yt_dlp/extractor/cmt.py index 8aed7708b..6359102aa 100644 --- a/yt_dlp/extractor/cmt.py +++ b/yt_dlp/extractor/cmt.py @@ -4,6 +4,7 @@ class CMTIE(MTVIE): # XXX: Do not subclass from concrete IE + _WORKING = False IE_NAME = 'cmt.com' _VALID_URL = r'https?://(?:www\.)?cmt\.com/(?:videos|shows|(?:full-)?episodes|video-clips)/(?P<id>[^/]+)' diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py index 4e8049004..84bdaf25f 100644 --- a/yt_dlp/extractor/common.py +++ b/yt_dlp/extractor/common.py @@ -286,6 +286,9 @@ class InfoExtractor: If it is not clear whether to use timestamp or this, use the former release_date: The date (YYYYMMDD) when the video was released in UTC. If not explicitly set, calculated from release_timestamp + release_year: Year (YYYY) as integer when the video or album was released. + To be used if no exact release date is known. + If not explicitly set, calculated from release_date. modified_timestamp: UNIX timestamp of the moment the video was last modified. modified_date: The date (YYYYMMDD) when the video was last modified in UTC. If not explicitly set, calculated from modified_timestamp @@ -379,6 +382,7 @@ class InfoExtractor: 'private', 'premium_only', 'subscriber_only', 'needs_auth', 'unlisted' or 'public'. Use 'InfoExtractor._availability' to set it + media_type: The type of media as classified by the site, e.g. "episode", "clip", "trailer" _old_archive_ids: A list of old archive ids needed for backward compatibility _format_sort_fields: A list of fields to use for sorting formats __post_extractor: A function to be called just before the metadata is @@ -427,7 +431,6 @@ class InfoExtractor: and compilations). disc_number: Number of the disc or other physical medium the track belongs to, as an integer. - release_year: Year (YYYY) when the album was released. 
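# Note: the common.py docstring hunk below promotes release_year from the
# track-only section to the general metadata fields and documents the new
# media_type key. A hypothetical extractor result using both might look
# like this (all values invented for illustration):
info_dict = {
    'id': 'abc123',
    'title': 'Example trailer',
    'media_type': 'trailer',  # the site's own classification, e.g. "episode", "clip", "trailer"
    'release_year': 2019,     # integer year, used when no exact release date is known
}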
composer: Composer of the piece The following fields should only be set for clips that should be cut from the original video: @@ -2225,7 +2228,9 @@ def _extract_mpd_vod_duration( mpd_url, video_id, note='Downloading MPD VOD manifest' if note is None else note, errnote='Failed to download VOD manifest' if errnote is None else errnote, - fatal=False, data=data, headers=headers, query=query) or {} + fatal=False, data=data, headers=headers, query=query) + if not isinstance(mpd_doc, xml.etree.ElementTree.Element): + return None return int_or_none(parse_duration(mpd_doc.get('mediaPresentationDuration'))) @staticmethod @@ -2339,7 +2344,9 @@ def _parse_smil_formats_and_subtitles( imgs_count = 0 srcs = set() - media = smil.findall(self._xpath_ns('.//video', namespace)) + smil.findall(self._xpath_ns('.//audio', namespace)) + media = itertools.chain.from_iterable( + smil.findall(self._xpath_ns(arg, namespace)) + for arg in ['.//video', './/audio', './/media']) for medium in media: src = medium.get('src') if not src or src in srcs: diff --git a/yt_dlp/extractor/craftsy.py b/yt_dlp/extractor/craftsy.py index 307bfb946..5d3733143 100644 --- a/yt_dlp/extractor/craftsy.py +++ b/yt_dlp/extractor/craftsy.py @@ -10,7 +10,7 @@ class CraftsyIE(InfoExtractor): - _VALID_URL = r'https?://www.craftsy.com/class/(?P<id>[a-z0-9_-]+)/' + _VALID_URL = r'https?://www\.craftsy\.com/class/(?P<id>[\w-]+)' _TESTS = [{ 'url': 'https://www.craftsy.com/class/the-midnight-quilt-show-season-5/', 'info_dict': { diff --git a/yt_dlp/extractor/cwtv.py b/yt_dlp/extractor/cwtv.py index 9b83264ee..69d50daf6 100644 --- a/yt_dlp/extractor/cwtv.py +++ b/yt_dlp/extractor/cwtv.py @@ -46,6 +46,10 @@ class CWTVIE(InfoExtractor): 'timestamp': 1444107300, 'age_limit': 14, 'uploader': 'CWTV', + 'thumbnail': r're:^https?://.*\.jpe?g$', + 'chapters': 'count:4', + 'episode': 'Episode 20', + 'season': 'Season 11', }, 'params': { # m3u8 download diff --git a/yt_dlp/extractor/cybrary.py b/yt_dlp/extractor/cybrary.py index 73f2439b3..c4c78ee1b 100644 --- a/yt_dlp/extractor/cybrary.py +++ b/yt_dlp/extractor/cybrary.py @@ -45,7 +45,7 @@ def _get_vimeo_id(self, activity_id): class CybraryIE(CybraryBaseIE): - _VALID_URL = r'https?://app.cybrary.it/immersive/(?P<enrollment>[0-9]+)/activity/(?P<id>[0-9]+)' + _VALID_URL = r'https?://app\.cybrary\.it/immersive/(?P<enrollment>[0-9]+)/activity/(?P<id>[0-9]+)' _TESTS = [{ 'url': 'https://app.cybrary.it/immersive/12487950/activity/63102', 'md5': '9ae12d37e555cb2ed554223a71a701d0', @@ -105,12 +105,12 @@ def _real_extract(self, url): 'chapter': module.get('title'), 'chapter_id': str_or_none(module.get('id')), 'title': activity.get('title'), - 'url': smuggle_url(f'https://player.vimeo.com/video/{vimeo_id}', {'http_headers': {'Referer': 'https://api.cybrary.it'}}) + 'url': smuggle_url(f'https://player.vimeo.com/video/{vimeo_id}', {'referer': 'https://api.cybrary.it'}) } class CybraryCourseIE(CybraryBaseIE): - _VALID_URL = r'https://app.cybrary.it/browse/course/(?P<id>[\w-]+)/?(?:$|[#?])' + _VALID_URL = r'https://app\.cybrary\.it/browse/course/(?P<id>[\w-]+)/?(?:$|[#?])' _TESTS = [{ 'url': 'https://app.cybrary.it/browse/course/az-500-microsoft-azure-security-technologies', 'info_dict': { diff --git a/yt_dlp/extractor/daftsex.py b/yt_dlp/extractor/daftsex.py deleted file mode 100644 index 92510c767..000000000 --- a/yt_dlp/extractor/daftsex.py +++ /dev/null @@ -1,150 +0,0 @@ -from .common import InfoExtractor -from ..compat import compat_b64decode -from ..utils import ( - ExtractorError, - int_or_none, - js_to_json, - 
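# Note: several of the _VALID_URL fixes in this patch (Craftsy and Cybrary
# above) escape the dots in the hostname: an unescaped '.' matches any
# character, so the old patterns also accepted look-alike hosts. A quick
# demonstration:
import re

# unescaped dot: 'appxcybrary.it' slips through
assert re.match(r'https?://app.cybrary\.it/', 'https://appxcybrary.it/')
# escaped dot: only the real hostname matches
assert not re.match(r'https?://app\.cybrary\.it/', 'https://appxcybrary.it/')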
parse_count, - parse_duration, - traverse_obj, - try_get, - unified_timestamp, -) - - -class DaftsexIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?daft\.sex/watch/(?P<id>-?\d+_\d+)' - _TESTS = [{ - 'url': 'https://daft.sex/watch/-35370899_456246186', - 'md5': '64c04ef7b4c7b04b308f3b0c78efe7cd', - 'info_dict': { - 'id': '-35370899_456246186', - 'ext': 'mp4', - 'title': 'just relaxing', - 'description': 'just relaxing – Watch video Watch video in high quality', - 'upload_date': '20201113', - 'timestamp': 1605261911, - 'thumbnail': r're:^https?://.*\.jpg$', - 'age_limit': 18, - 'duration': 15.0, - 'view_count': int - }, - }, { - 'url': 'https://daft.sex/watch/-156601359_456242791', - 'info_dict': { - 'id': '-156601359_456242791', - 'ext': 'mp4', - 'title': 'Skye Blue - Dinner And A Show', - 'description': 'Skye Blue - Dinner And A Show - Watch video Watch video in high quality', - 'upload_date': '20200916', - 'timestamp': 1600250735, - 'thumbnail': 'https://psv153-1.crazycloud.ru/videos/-156601359/456242791/thumb.jpg?extra=i3D32KaBbBFf9TqDRMAVmQ', - }, - 'skip': 'deleted / private' - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - title = self._html_search_meta('name', webpage, 'title') - timestamp = unified_timestamp(self._html_search_meta('uploadDate', webpage, 'Upload Date', default=None)) - description = self._html_search_meta('description', webpage, 'Description', default=None) - - duration = parse_duration(self._search_regex( - r'Duration: ((?:[0-9]{2}:){0,2}[0-9]{2})', - webpage, 'duration', fatal=False)) - views = parse_count(self._search_regex( - r'Views: ([0-9 ]+)', - webpage, 'views', fatal=False)) - - player_hash = self._search_regex( - r'DaxabPlayer\.Init\({[\s\S]*hash:\s*"([0-9a-zA-Z_\-]+)"[\s\S]*}', - webpage, 'player hash') - player_color = self._search_regex( - r'DaxabPlayer\.Init\({[\s\S]*color:\s*"([0-9a-z]+)"[\s\S]*}', - webpage, 'player color', fatal=False) or '' - - embed_page = self._download_webpage( - 'https://dxb.to/player/%s?color=%s' % (player_hash, player_color), - video_id, headers={'Referer': url}) - video_params = self._parse_json( - self._search_regex( - r'window\.globParams\s*=\s*({[\S\s]+})\s*;\s*<\/script>', - embed_page, 'video parameters'), - video_id, transform_source=js_to_json) - - server_domain = 'https://%s' % compat_b64decode(video_params['server'][::-1]).decode('utf-8') - - cdn_files = traverse_obj(video_params, ('video', 'cdn_files')) or {} - if cdn_files: - formats = [] - for format_id, format_data in cdn_files.items(): - ext, height = format_id.split('_') - formats.append({ - 'format_id': format_id, - 'url': f'{server_domain}/videos/{video_id.replace("_", "/")}/{height}.mp4?extra={format_data.split(".")[-1]}', - 'height': int_or_none(height), - 'ext': ext, - }) - - return { - 'id': video_id, - 'title': title, - 'formats': formats, - 'description': description, - 'duration': duration, - 'thumbnail': try_get(video_params, lambda vi: 'https:' + compat_b64decode(vi['video']['thumb']).decode('utf-8')), - 'timestamp': timestamp, - 'view_count': views, - 'age_limit': 18, - } - - items = self._download_json( - f'{server_domain}/method/video.get/{video_id}', video_id, - headers={'Referer': url}, query={ - 'token': video_params['video']['access_token'], - 'videos': video_id, - 'ckey': video_params['c_key'], - 'credentials': video_params['video']['credentials'], - })['response']['items'] - - if not items: - raise ExtractorError('Video is not available', video_id=video_id, 
expected=True) - - item = items[0] - formats = [] - for f_id, f_url in item.get('files', {}).items(): - if f_id == 'external': - return self.url_result(f_url) - ext, height = f_id.split('_') - height_extra_key = traverse_obj(video_params, ('video', 'partial', 'quality', height)) - if height_extra_key: - formats.append({ - 'format_id': f'{height}p', - 'url': f'{server_domain}/{f_url[8:]}&videos={video_id}&extra_key={height_extra_key}', - 'height': int_or_none(height), - 'ext': ext, - }) - - thumbnails = [] - for k, v in item.items(): - if k.startswith('photo_') and v: - width = k.replace('photo_', '') - thumbnails.append({ - 'id': width, - 'url': v, - 'width': int_or_none(width), - }) - - return { - 'id': video_id, - 'title': title, - 'formats': formats, - 'comment_count': int_or_none(item.get('comments')), - 'description': description, - 'duration': duration, - 'thumbnails': thumbnails, - 'timestamp': timestamp, - 'view_count': views, - 'age_limit': 18, - } diff --git a/yt_dlp/extractor/dailymotion.py b/yt_dlp/extractor/dailymotion.py index 21263d41b..708d6fed2 100644 --- a/yt_dlp/extractor/dailymotion.py +++ b/yt_dlp/extractor/dailymotion.py @@ -93,7 +93,7 @@ class DailymotionIE(DailymotionBaseInfoExtractor): _VALID_URL = r'''(?ix) https?:// (?: - (?:(?:www|touch|geo)\.)?dailymotion\.[a-z]{2,3}/(?:(?:(?:(?:embed|swf|\#)/)|player\.html\?)?video|swf)| + (?:(?:www|touch|geo)\.)?dailymotion\.[a-z]{2,3}/(?:(?:(?:(?:embed|swf|\#)/)|player(?:/\w+)?\.html\?)?video|swf)| (?:www\.)?lequipe\.fr/video ) [/=](?P<id>[^/?_&]+)(?:.+?\bplaylist=(?P<playlist_id>x[0-9a-z]+))? @@ -107,13 +107,17 @@ class DailymotionIE(DailymotionBaseInfoExtractor): 'id': 'x5kesuj', 'ext': 'mp4', 'title': 'Office Christmas Party Review – Jason Bateman, Olivia Munn, T.J. Miller', - 'description': 'Office Christmas Party Review - Jason Bateman, Olivia Munn, T.J. Miller', + 'description': 'Office Christmas Party Review - Jason Bateman, Olivia Munn, T.J. 
Miller', 'duration': 187, 'timestamp': 1493651285, 'upload_date': '20170501', 'uploader': 'Deadline', 'uploader_id': 'x1xm8ri', 'age_limit': 0, + 'view_count': int, + 'like_count': int, + 'tags': ['hollywood', 'celeb', 'celebrity', 'movies', 'red carpet'], + 'thumbnail': r're:https://(?:s[12]\.)dmcdn\.net/v/K456B1aXqIx58LKWQ/x1080', }, }, { 'url': 'https://geo.dailymotion.com/player.html?video=x89eyek&mute=true', @@ -132,7 +136,7 @@ class DailymotionIE(DailymotionBaseInfoExtractor): 'view_count': int, 'like_count': int, 'tags': ['en_quete_d_esprit'], - 'thumbnail': 'https://s2.dmcdn.net/v/Tncwi1YGKdvFbDuDY/x1080', + 'thumbnail': r're:https://(?:s[12]\.)dmcdn\.net/v/Tncwi1YNg_RUl7ueu/x1080', } }, { 'url': 'https://www.dailymotion.com/video/x2iuewm_steam-machine-models-pricing-listed-on-steam-store-ign-news_videogames', @@ -201,6 +205,12 @@ class DailymotionIE(DailymotionBaseInfoExtractor): }, { 'url': 'https://www.dailymotion.com/video/x3z49k?playlist=xv4bw', 'only_matching': True, + }, { + 'url': 'https://geo.dailymotion.com/player/x86gw.html?video=k46oCapRs4iikoz9DWy', + 'only_matching': True, + }, { + 'url': 'https://geo.dailymotion.com/player/xakln.html?video=x8mjju4&customConfig%5BcustomParams%5D=%2Ffr-fr%2Ftennis%2Fwimbledon-mens-singles%2Farticles-video', + 'only_matching': True, }] _GEO_BYPASS = False _COMMON_MEDIA_FIELDS = '''description diff --git a/yt_dlp/extractor/defense.py b/yt_dlp/extractor/defense.py deleted file mode 100644 index 7d73ea862..000000000 --- a/yt_dlp/extractor/defense.py +++ /dev/null @@ -1,37 +0,0 @@ -from .common import InfoExtractor - - -class DefenseGouvFrIE(InfoExtractor): - IE_NAME = 'defense.gouv.fr' - _VALID_URL = r'https?://.*?\.defense\.gouv\.fr/layout/set/ligthboxvideo/base-de-medias/webtv/(?P<id>[^/?#]*)' - - _TEST = { - 'url': 'http://www.defense.gouv.fr/layout/set/ligthboxvideo/base-de-medias/webtv/attaque-chimique-syrienne-du-21-aout-2013-1', - 'md5': '75bba6124da7e63d2d60b5244ec9430c', - 'info_dict': { - 'id': '11213', - 'ext': 'mp4', - 'title': 'attaque-chimique-syrienne-du-21-aout-2013-1' - } - } - - def _real_extract(self, url): - title = self._match_id(url) - webpage = self._download_webpage(url, title) - - video_id = self._search_regex( - r"flashvars.pvg_id=\"(\d+)\";", - webpage, 'ID') - - json_url = ( - 'http://static.videos.gouv.fr/brightcovehub/export/json/%s' % - video_id) - info = self._download_json(json_url, title, 'Downloading JSON config') - video_url = info['renditions'][0]['url'] - - return { - 'id': video_id, - 'ext': 'mp4', - 'url': video_url, - 'title': title, - } diff --git a/yt_dlp/extractor/dhm.py b/yt_dlp/extractor/dhm.py index 3d42fc2b0..a5f5f794c 100644 --- a/yt_dlp/extractor/dhm.py +++ b/yt_dlp/extractor/dhm.py @@ -3,6 +3,7 @@ class DHMIE(InfoExtractor): + _WORKING = False IE_DESC = 'Filmarchiv - Deutsches Historisches Museum' _VALID_URL = r'https?://(?:www\.)?dhm\.de/filmarchiv/(?:[^/]+/)+(?P<id>[^/]+)' diff --git a/yt_dlp/extractor/dotsub.py b/yt_dlp/extractor/dotsub.py deleted file mode 100644 index 079f83750..000000000 --- a/yt_dlp/extractor/dotsub.py +++ /dev/null @@ -1,81 +0,0 @@ -from .common import InfoExtractor -from ..utils import ( - float_or_none, - int_or_none, -) - - -class DotsubIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?dotsub\.com/view/(?P<id>[^/]+)' - _TESTS = [{ - 'url': 'https://dotsub.com/view/9c63db2a-fa95-4838-8e6e-13deafe47f09', - 'md5': '21c7ff600f545358134fea762a6d42b6', - 'info_dict': { - 'id': '9c63db2a-fa95-4838-8e6e-13deafe47f09', - 'ext': 'flv', - 'title': 'MOTIVATION - "It\'s 
Possible" Best Inspirational Video Ever', - 'description': 'md5:41af1e273edbbdfe4e216a78b9d34ac6', - 'thumbnail': 're:^https?://dotsub.com/media/9c63db2a-fa95-4838-8e6e-13deafe47f09/p', - 'duration': 198, - 'uploader': 'liuxt', - 'timestamp': 1385778501.104, - 'upload_date': '20131130', - 'view_count': int, - } - }, { - 'url': 'https://dotsub.com/view/747bcf58-bd59-45b7-8c8c-ac312d084ee6', - 'md5': '2bb4a83896434d5c26be868c609429a3', - 'info_dict': { - 'id': '168006778', - 'ext': 'mp4', - 'title': 'Apartments and flats in Raipur the white symphony', - 'description': 'md5:784d0639e6b7d1bc29530878508e38fe', - 'thumbnail': 're:^https?://dotsub.com/media/747bcf58-bd59-45b7-8c8c-ac312d084ee6/p', - 'duration': 290, - 'timestamp': 1476767794.2809999, - 'upload_date': '20161018', - 'uploader': 'parthivi001', - 'uploader_id': 'user52596202', - 'view_count': int, - }, - 'add_ie': ['Vimeo'], - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - - info = self._download_json( - 'https://dotsub.com/api/media/%s/metadata' % video_id, video_id) - video_url = info.get('mediaURI') - - if not video_url: - webpage = self._download_webpage(url, video_id) - video_url = self._search_regex( - [r'<source[^>]+src="([^"]+)"', r'"file"\s*:\s*\'([^\']+)'], - webpage, 'video url', default=None) - info_dict = { - 'id': video_id, - 'url': video_url, - 'ext': 'flv', - } - - if not video_url: - setup_data = self._parse_json(self._html_search_regex( - r'(?s)data-setup=([\'"])(?P<content>(?!\1).+?)\1', - webpage, 'setup data', group='content'), video_id) - info_dict = { - '_type': 'url_transparent', - 'url': setup_data['src'], - } - - info_dict.update({ - 'title': info['title'], - 'description': info.get('description'), - 'thumbnail': info.get('screenshotURI'), - 'duration': int_or_none(info.get('duration'), 1000), - 'uploader': info.get('user'), - 'timestamp': float_or_none(info.get('dateCreated'), 1000), - 'view_count': int_or_none(info.get('numberOfViews')), - }) - - return info_dict diff --git a/yt_dlp/extractor/drtv.py b/yt_dlp/extractor/drtv.py index 6c381aa14..2a6e337bf 100644 --- a/yt_dlp/extractor/drtv.py +++ b/yt_dlp/extractor/drtv.py @@ -1,21 +1,17 @@ -import binascii -import hashlib -import re +import json +import uuid from .common import InfoExtractor -from ..aes import aes_cbc_decrypt_bytes, unpad_pkcs7 -from ..compat import compat_urllib_parse_unquote from ..utils import ( ExtractorError, - float_or_none, int_or_none, mimetype2ext, - str_or_none, - traverse_obj, - unified_timestamp, + parse_iso8601, + try_call, update_url_query, url_or_none, ) +from ..utils.traversal import traverse_obj SERIES_API = 'https://production-cdn.dr-massive.com/api/page?device=web_browser&item_detail_expand=all&lang=da&max_list_prefetch=3&path=%s' @@ -24,7 +20,7 @@ class DRTVIE(InfoExtractor): _VALID_URL = r'''(?x) https?:// (?: - (?:www\.)?dr\.dk/(?:tv/se|nyheder|(?P<radio>radio|lyd)(?:/ondemand)?)/(?:[^/]+/)*| + (?:www\.)?dr\.dk/tv/se(?:/ondemand)?/(?:[^/?#]+/)*| (?:www\.)?(?:dr\.dk|dr-massive\.com)/drtv/(?:se|episode|program)/ ) (?P<id>[\da-z_-]+) @@ -53,22 +49,6 @@ class DRTVIE(InfoExtractor): }, 'expected_warnings': ['Unable to download f4m manifest'], 'skip': 'this video has been removed', - }, { - # embed - 'url': 'https://www.dr.dk/nyheder/indland/live-christianias-rydning-af-pusher-street-er-i-gang', - 'info_dict': { - 'id': 'urn:dr:mu:programcard:57c926176187a50a9c6e83c6', - 'ext': 'mp4', - 'title': 'christiania pusher street ryddes drdkrjpo', - 'description': 'md5:2a71898b15057e9b97334f61d04e6eb5', - 
'timestamp': 1472800279, - 'upload_date': '20160902', - 'duration': 131.4, - }, - 'params': { - 'skip_download': True, - }, - 'expected_warnings': ['Unable to download f4m manifest'], }, { # with SignLanguage formats 'url': 'https://www.dr.dk/tv/se/historien-om-danmark/-/historien-om-danmark-stenalder', @@ -87,33 +67,54 @@ class DRTVIE(InfoExtractor): 'season': 'Historien om Danmark', 'series': 'Historien om Danmark', }, - 'params': { - 'skip_download': True, - }, + 'skip': 'this video has been removed', }, { - 'url': 'https://www.dr.dk/lyd/p4kbh/regionale-nyheder-kh4/p4-nyheder-2019-06-26-17-30-9', - 'only_matching': True, - }, { - 'url': 'https://www.dr.dk/drtv/se/bonderoeven_71769', + 'url': 'https://www.dr.dk/drtv/se/frank-and-kastaniegaarden_71769', 'info_dict': { 'id': '00951930010', 'ext': 'mp4', - 'title': 'Bonderøven 2019 (1:8)', - 'description': 'md5:b6dcfe9b6f0bea6703e9a0092739a5bd', - 'timestamp': 1654856100, - 'upload_date': '20220610', - 'duration': 2576.6, - 'season': 'Bonderøven 2019', - 'season_id': 'urn:dr:mu:bundle:5c201667a11fa01ca4528ce5', + 'title': 'Frank & Kastaniegaarden', + 'description': 'md5:974e1780934cf3275ef10280204bccb0', + 'release_timestamp': 1546545600, + 'release_date': '20190103', + 'duration': 2576, + 'season': 'Frank & Kastaniegaarden', + 'season_id': '67125', 'release_year': 2019, 'season_number': 2019, 'series': 'Frank & Kastaniegaarden', 'episode_number': 1, - 'episode': 'Episode 1', + 'episode': 'Frank & Kastaniegaarden', + 'thumbnail': r're:https?://.+', }, 'params': { 'skip_download': True, }, + }, { + # Foreign and Regular subtitle track + 'url': 'https://www.dr.dk/drtv/se/spise-med-price_-pasta-selv_397445', + 'info_dict': { + 'id': '00212301010', + 'ext': 'mp4', + 'episode_number': 1, + 'title': 'Spise med Price: Pasta Selv', + 'alt_title': '1. 
Pasta Selv', + 'release_date': '20230807', + 'description': 'md5:2da9060524fed707810d71080b3d0cd8', + 'duration': 1750, + 'season': 'Spise med Price', + 'release_timestamp': 1691438400, + 'season_id': '397440', + 'episode': 'Spise med Price: Pasta Selv', + 'thumbnail': r're:https?://.+', + 'season_number': 15, + 'series': 'Spise med Price', + 'release_year': 2022, + 'subtitles': 'mincount:2', + }, + 'params': { + 'skip_download': 'm3u8', + }, }, { 'url': 'https://www.dr.dk/drtv/episode/bonderoeven_71769', 'only_matching': True, @@ -123,226 +124,127 @@ class DRTVIE(InfoExtractor): }, { 'url': 'https://www.dr.dk/drtv/program/jagten_220924', 'only_matching': True, - }, { - 'url': 'https://www.dr.dk/lyd/p4aarhus/regionale-nyheder-ar4/regionale-nyheder-2022-05-05-12-30-3', - 'info_dict': { - 'id': 'urn:dr:mu:programcard:6265cb2571401424d0360113', - 'title': "Regionale nyheder", - 'ext': 'mp4', - 'duration': 120.043, - 'series': 'P4 Østjylland regionale nyheder', - 'timestamp': 1651746600, - 'season': 'Regionale nyheder', - 'release_year': 0, - 'season_id': 'urn:dr:mu:bundle:61c26889539f0201586b73c5', - 'description': '', - 'upload_date': '20220505', - }, - 'params': { - 'skip_download': True, - }, - 'skip': 'this video has been removed', - }, { - 'url': 'https://www.dr.dk/lyd/p4kbh/regionale-nyheder-kh4/regionale-nyheder-2023-03-14-10-30-9', - 'info_dict': { - 'ext': 'mp4', - 'id': '14802310112', - 'timestamp': 1678786200, - 'duration': 120.043, - 'season_id': 'urn:dr:mu:bundle:63a4f7c87140143504b6710f', - 'series': 'P4 København regionale nyheder', - 'upload_date': '20230314', - 'release_year': 0, - 'description': 'Hør seneste regionale nyheder fra P4 København.', - 'season': 'Regionale nyheder', - 'title': 'Regionale nyheder', - }, }] + SUBTITLE_LANGS = { + 'DanishLanguageSubtitles': 'da', + 'ForeignLanguageSubtitles': 'da_foreign', + 'CombinedLanguageSubtitles': 'da_combined', + } + + _TOKEN = None + + def _real_initialize(self): + if self._TOKEN: + return + + token_response = self._download_json( + 'https://production.dr-massive.com/api/authorization/anonymous-sso', None, + note='Downloading anonymous token', headers={ + 'content-type': 'application/json', + }, query={ + 'device': 'web_browser', + 'ff': 'idp,ldp,rpt', + 'lang': 'da', + 'supportFallbackToken': 'true', + }, data=json.dumps({ + 'deviceId': str(uuid.uuid4()), + 'scopes': ['Catalog'], + 'optout': True, + }).encode()) + + self._TOKEN = traverse_obj( + token_response, (lambda _, x: x['type'] == 'UserAccount', 'value', {str}), get_all=False) + if not self._TOKEN: + raise ExtractorError('Unable to get anonymous token') + def _real_extract(self, url): - raw_video_id, is_radio_url = self._match_valid_url(url).group('id', 'radio') + url_slug = self._match_id(url) + webpage = self._download_webpage(url, url_slug) - webpage = self._download_webpage(url, raw_video_id) - - if '>Programmet er ikke længere tilgængeligt' in webpage: - raise ExtractorError( - 'Video %s is not available' % raw_video_id, expected=True) - - video_id = self._search_regex( - (r'data-(?:material-identifier|episode-slug)="([^"]+)"', - r'data-resource="[^>"]+mu/programcard/expanded/([^"]+)"'), - webpage, 'video id', default=None) - - if not video_id: - video_id = self._search_regex( - r'(urn(?:%3A|:)dr(?:%3A|:)mu(?:%3A|:)programcard(?:%3A|:)[\da-f]+)', - webpage, 'urn', default=None) - if video_id: - video_id = compat_urllib_parse_unquote(video_id) - - _PROGRAMCARD_BASE = 'https://www.dr.dk/mu-online/api/1.4/programcard' - query = {'expanded': 'true'} - - if video_id: 
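# Note: DRTV's new _real_initialize above obtains an anonymous bearer token
# by POSTing a random device id to the dr-massive SSO endpoint and keeping
# the 'UserAccount' entry. Roughly the same bootstrap with only the standard
# library; that the response is a list of {type, value} objects is inferred
# from the traverse_obj call above, not documented:
import json
import urllib.request
import uuid

req = urllib.request.Request(
    'https://production.dr-massive.com/api/authorization/anonymous-sso'
    '?device=web_browser&ff=idp,ldp,rpt&lang=da&supportFallbackToken=true',
    data=json.dumps({
        'deviceId': str(uuid.uuid4()),
        'scopes': ['Catalog'],
        'optout': True,
    }).encode(),
    headers={'content-type': 'application/json'})
with urllib.request.urlopen(req) as resp:
    entries = json.load(resp)  # assumed: a list of {'type': ..., 'value': ...}
token = next((e.get('value') for e in entries if e.get('type') == 'UserAccount'), None)
# later requests authenticate with: {'authorization': f'Bearer {token}'}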
- programcard_url = '%s/%s' % (_PROGRAMCARD_BASE, video_id) + json_data = self._search_json( + r'window\.__data\s*=', webpage, 'data', url_slug, fatal=False) or {} + item = traverse_obj( + json_data, ('cache', 'page', ..., (None, ('entries', 0)), 'item', {dict}), get_all=False) + if item: + item_id = item.get('id') else: - programcard_url = _PROGRAMCARD_BASE - if is_radio_url: - video_id = self._search_nextjs_data( - webpage, raw_video_id)['props']['pageProps']['episode']['productionNumber'] - else: - json_data = self._search_json( - r'window\.__data\s*=', webpage, 'data', raw_video_id) - video_id = traverse_obj(json_data, ( - 'cache', 'page', ..., (None, ('entries', 0)), 'item', 'customId', - {lambda x: x.split(':')[-1]}), get_all=False) - if not video_id: - raise ExtractorError('Unable to extract video id') - query['productionnumber'] = video_id + item_id = url_slug.rsplit('_', 1)[-1] + item = self._download_json( + f'https://production-cdn.dr-massive.com/api/items/{item_id}', item_id, + note='Attempting to download backup item data', query={ + 'device': 'web_browser', + 'expand': 'all', + 'ff': 'idp,ldp,rpt', + 'geoLocation': 'dk', + 'isDeviceAbroad': 'false', + 'lang': 'da', + 'segments': 'drtv,optedout', + 'sub': 'Anonymous', + }) - data = self._download_json( - programcard_url, video_id, 'Downloading video JSON', query=query) - - supplementary_data = {} - if re.search(r'_\d+$', raw_video_id): - supplementary_data = self._download_json( - SERIES_API % f'/episode/{raw_video_id}', raw_video_id, fatal=False) or {} - - title = str_or_none(data.get('Title')) or re.sub( - r'\s*\|\s*(?:TV\s*\|\s*DR|DRTV)$', '', - self._og_search_title(webpage)) - description = self._og_search_description( - webpage, default=None) or data.get('Description') - - timestamp = unified_timestamp( - data.get('PrimaryBroadcastStartTime') or data.get('SortDateTime')) - - thumbnail = None - duration = None - - restricted_to_denmark = False + video_id = try_call(lambda: item['customId'].rsplit(':', 1)[-1]) or item_id + stream_data = self._download_json( + f'https://production.dr-massive.com/api/account/items/{item_id}/videos', video_id, + note='Downloading stream data', query={ + 'delivery': 'stream', + 'device': 'web_browser', + 'ff': 'idp,ldp,rpt', + 'lang': 'da', + 'resolution': 'HD-1080', + 'sub': 'Anonymous', + }, headers={'authorization': f'Bearer {self._TOKEN}'}) formats = [] subtitles = {} + for stream in traverse_obj(stream_data, (lambda _, x: x['url'])): + format_id = stream.get('format', 'na') + access_service = stream.get('accessService') + preference = None + subtitle_suffix = '' + if access_service in ('SpokenSubtitles', 'SignLanguage', 'VisuallyInterpreted'): + preference = -1 + format_id += f'-{access_service}' + subtitle_suffix = f'-{access_service}' + elif access_service == 'StandardVideo': + preference = 1 + fmts, subs = self._extract_m3u8_formats_and_subtitles( + stream.get('url'), video_id, ext='mp4', preference=preference, m3u8_id=format_id, fatal=False) + formats.extend(fmts) - assets = [] - primary_asset = data.get('PrimaryAsset') - if isinstance(primary_asset, dict): - assets.append(primary_asset) - secondary_assets = data.get('SecondaryAssets') - if isinstance(secondary_assets, list): - for secondary_asset in secondary_assets: - if isinstance(secondary_asset, dict): - assets.append(secondary_asset) + api_subtitles = traverse_obj(stream, ('subtitles', lambda _, v: url_or_none(v['link']), {dict})) + if not api_subtitles: + self._merge_subtitles(subs, target=subtitles) - def hex_to_bytes(hex): - 
return binascii.a2b_hex(hex.encode('ascii')) + for sub_track in api_subtitles: + lang = sub_track.get('language') or 'da' + subtitles.setdefault(self.SUBTITLE_LANGS.get(lang, lang) + subtitle_suffix, []).append({ + 'url': sub_track['link'], + 'ext': mimetype2ext(sub_track.get('format')) or 'vtt' + }) - def decrypt_uri(e): - n = int(e[2:10], 16) - a = e[10 + n:] - data = hex_to_bytes(e[10:10 + n]) - key = hashlib.sha256(('%s:sRBzYNXBzkKgnjj8pGtkACch' % a).encode('utf-8')).digest() - iv = hex_to_bytes(a) - decrypted = unpad_pkcs7(aes_cbc_decrypt_bytes(data, key, iv)) - return decrypted.decode('utf-8').split('?')[0] - - for asset in assets: - kind = asset.get('Kind') - if kind == 'Image': - thumbnail = url_or_none(asset.get('Uri')) - elif kind in ('VideoResource', 'AudioResource'): - duration = float_or_none(asset.get('DurationInMilliseconds'), 1000) - restricted_to_denmark = asset.get('RestrictedToDenmark') - asset_target = asset.get('Target') - for link in asset.get('Links', []): - uri = link.get('Uri') - if not uri: - encrypted_uri = link.get('EncryptedUri') - if not encrypted_uri: - continue - try: - uri = decrypt_uri(encrypted_uri) - except Exception: - self.report_warning( - 'Unable to decrypt EncryptedUri', video_id) - continue - uri = url_or_none(uri) - if not uri: - continue - target = link.get('Target') - format_id = target or '' - if asset_target in ('SpokenSubtitles', 'SignLanguage', 'VisuallyInterpreted'): - preference = -1 - format_id += '-%s' % asset_target - elif asset_target == 'Default': - preference = 1 - else: - preference = None - if target == 'HDS': - f4m_formats = self._extract_f4m_formats( - uri + '?hdcore=3.3.0&plugin=aasp-3.3.0.99.43', - video_id, preference, f4m_id=format_id, fatal=False) - if kind == 'AudioResource': - for f in f4m_formats: - f['vcodec'] = 'none' - formats.extend(f4m_formats) - elif target == 'HLS': - fmts, subs = self._extract_m3u8_formats_and_subtitles( - uri, video_id, 'mp4', entry_protocol='m3u8_native', - quality=preference, m3u8_id=format_id, fatal=False) - formats.extend(fmts) - self._merge_subtitles(subs, target=subtitles) - else: - bitrate = link.get('Bitrate') - if bitrate: - format_id += '-%s' % bitrate - formats.append({ - 'url': uri, - 'format_id': format_id, - 'tbr': int_or_none(bitrate), - 'ext': link.get('FileFormat'), - 'vcodec': 'none' if kind == 'AudioResource' else None, - 'quality': preference, - }) - subtitles_list = asset.get('SubtitlesList') or asset.get('Subtitleslist') - if isinstance(subtitles_list, list): - LANGS = { - 'Danish': 'da', - } - for subs in subtitles_list: - if not isinstance(subs, dict): - continue - sub_uri = url_or_none(subs.get('Uri')) - if not sub_uri: - continue - lang = subs.get('Language') or 'da' - subtitles.setdefault(LANGS.get(lang, lang), []).append({ - 'url': sub_uri, - 'ext': mimetype2ext(subs.get('MimeType')) or 'vtt' - }) - - if not formats and restricted_to_denmark: - self.raise_geo_restricted( - 'Unfortunately, DR is not allowed to show this program outside Denmark.', - countries=self._GEO_COUNTRIES) + if not formats and traverse_obj(item, ('season', 'customFields', 'IsGeoRestricted')): + self.raise_geo_restricted(countries=self._GEO_COUNTRIES) return { 'id': video_id, - 'title': title, - 'description': description, - 'thumbnail': thumbnail, - 'timestamp': timestamp, - 'duration': duration, 'formats': formats, 'subtitles': subtitles, - 'series': str_or_none(data.get('SeriesTitle')), - 'season': str_or_none(data.get('SeasonTitle')), - 'season_number': int_or_none(data.get('SeasonNumber')), - 
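# Note: the rewritten stream loop above derives each subtitle key from two
# parts: DR's track language mapped through SUBTITLE_LANGS, plus an
# accessibility suffix when the stream is a SpokenSubtitles, SignLanguage or
# VisuallyInterpreted variant. That key construction in isolation:
SUBTITLE_LANGS = {
    'DanishLanguageSubtitles': 'da',
    'ForeignLanguageSubtitles': 'da_foreign',
    'CombinedLanguageSubtitles': 'da_combined',
}

def subtitle_key(language, access_service=None):
    suffix = (f'-{access_service}'
              if access_service in ('SpokenSubtitles', 'SignLanguage', 'VisuallyInterpreted')
              else '')
    # a missing language falls back to 'da', as in the extractor
    return SUBTITLE_LANGS.get(language, language or 'da') + suffix

# subtitle_key('ForeignLanguageSubtitles') -> 'da_foreign'
# subtitle_key('DanishLanguageSubtitles', 'SignLanguage') -> 'da-SignLanguage'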
'season_id': str_or_none(data.get('SeasonUrn')), - 'episode': traverse_obj(supplementary_data, ('entries', 0, 'item', 'contextualTitle')) or str_or_none(data.get('EpisodeTitle')), - 'episode_number': traverse_obj(supplementary_data, ('entries', 0, 'item', 'episodeNumber')) or int_or_none(data.get('EpisodeNumber')), - 'release_year': int_or_none(data.get('ProductionYear')), + **traverse_obj(item, { + 'title': 'title', + 'alt_title': 'contextualTitle', + 'description': 'description', + 'thumbnail': ('images', 'wallpaper'), + 'release_timestamp': ('customFields', 'BroadcastTimeDK', {parse_iso8601}), + 'duration': ('duration', {int_or_none}), + 'series': ('season', 'show', 'title'), + 'season': ('season', 'title'), + 'season_number': ('season', 'seasonNumber', {int_or_none}), + 'season_id': 'seasonId', + 'episode': 'episodeName', + 'episode_number': ('episodeNumber', {int_or_none}), + 'release_year': ('releaseYear', {int_or_none}), + }), } @@ -412,6 +314,8 @@ class DRTVSeasonIE(InfoExtractor): 'display_id': 'frank-and-kastaniegaarden', 'title': 'Frank & Kastaniegaarden', 'series': 'Frank & Kastaniegaarden', + 'season_number': 2008, + 'alt_title': 'Season 2008', }, 'playlist_mincount': 8 }, { @@ -421,6 +325,8 @@ class DRTVSeasonIE(InfoExtractor): 'display_id': 'frank-and-kastaniegaarden', 'title': 'Frank & Kastaniegaarden', 'series': 'Frank & Kastaniegaarden', + 'season_number': 2009, + 'alt_title': 'Season 2009', }, 'playlist_mincount': 19 }] @@ -434,6 +340,7 @@ def _real_extract(self, url): 'url': f'https://www.dr.dk/drtv{episode["path"]}', 'ie_key': DRTVIE.ie_key(), 'title': episode.get('title'), + 'alt_title': episode.get('contextualTitle'), 'episode': episode.get('episodeName'), 'description': episode.get('shortDescription'), 'series': traverse_obj(data, ('entries', 0, 'item', 'title')), @@ -446,6 +353,7 @@ def _real_extract(self, url): 'id': season_id, 'display_id': display_id, 'title': traverse_obj(data, ('entries', 0, 'item', 'title')), + 'alt_title': traverse_obj(data, ('entries', 0, 'item', 'contextualTitle')), 'series': traverse_obj(data, ('entries', 0, 'item', 'title')), 'entries': entries, 'season_number': traverse_obj(data, ('entries', 0, 'item', 'seasonNumber')) @@ -463,6 +371,7 @@ class DRTVSeriesIE(InfoExtractor): 'display_id': 'frank-and-kastaniegaarden', 'title': 'Frank & Kastaniegaarden', 'series': 'Frank & Kastaniegaarden', + 'alt_title': '', }, 'playlist_mincount': 15 }] @@ -476,6 +385,7 @@ def _real_extract(self, url): 'url': f'https://www.dr.dk/drtv{season.get("path")}', 'ie_key': DRTVSeasonIE.ie_key(), 'title': season.get('title'), + 'alt_title': season.get('contextualTitle'), 'series': traverse_obj(data, ('entries', 0, 'item', 'title')), 'season_number': traverse_obj(data, ('entries', 0, 'item', 'seasonNumber')) } for season in traverse_obj(data, ('entries', 0, 'item', 'show', 'seasons', 'items'))] @@ -485,6 +395,7 @@ def _real_extract(self, url): 'id': series_id, 'display_id': display_id, 'title': traverse_obj(data, ('entries', 0, 'item', 'title')), + 'alt_title': traverse_obj(data, ('entries', 0, 'item', 'contextualTitle')), 'series': traverse_obj(data, ('entries', 0, 'item', 'title')), 'entries': entries } diff --git a/yt_dlp/extractor/duboku.py b/yt_dlp/extractor/duboku.py index fb0546cae..fc9564cef 100644 --- a/yt_dlp/extractor/duboku.py +++ b/yt_dlp/extractor/duboku.py @@ -138,7 +138,7 @@ def _real_extract(self, url): # of the video. 
return { '_type': 'url_transparent', - 'url': smuggle_url(data_url, {'http_headers': headers}), + 'url': smuggle_url(data_url, {'referer': webpage_url}), 'id': video_id, 'title': title, 'series': series_title, diff --git a/yt_dlp/extractor/duoplay.py b/yt_dlp/extractor/duoplay.py new file mode 100644 index 000000000..7d3f39942 --- /dev/null +++ b/yt_dlp/extractor/duoplay.py @@ -0,0 +1,104 @@ +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + extract_attributes, + get_element_text_and_html_by_tag, + int_or_none, + join_nonempty, + str_or_none, + try_call, + unified_timestamp, +) +from ..utils.traversal import traverse_obj + + +class DuoplayIE(InfoExtractor): + _VALID_URL = r'https://duoplay\.ee/(?P<id>\d+)/[\w-]+/?(?:\?(?:[^#]+&)?ep=(?P<ep>\d+))?' + _TESTS = [{ + 'note': 'Siberi võmm S02E12', + 'url': 'https://duoplay.ee/4312/siberi-vomm?ep=24', + 'md5': '1ff59d535310ac9c5cf5f287d8f91b2d', + 'info_dict': { + 'id': '4312_24', + 'ext': 'mp4', + 'title': 'Operatsioon "Öö"', + 'thumbnail': r're:https://.+\.jpg(?:\?c=\d+)?$', + 'description': 'md5:8ef98f38569d6b8b78f3d350ccc6ade8', + 'upload_date': '20170523', + 'timestamp': 1495567800, + 'series': 'Siberi võmm', + 'series_id': '4312', + 'season': 'Season 2', + 'season_number': 2, + 'episode': 'Operatsioon "Öö"', + 'episode_number': 12, + 'episode_id': 24, + }, + }, { + 'note': 'Empty title', + 'url': 'https://duoplay.ee/17/uhikarotid?ep=14', + 'md5': '6aca68be71112314738dd17cced7f8bf', + 'info_dict': { + 'id': '17_14', + 'ext': 'mp4', + 'title': 'Ühikarotid', + 'thumbnail': r're:https://.+\.jpg(?:\?c=\d+)?$', + 'description': 'md5:4719b418e058c209def41d48b601276e', + 'upload_date': '20100916', + 'timestamp': 1284661800, + 'series': 'Ühikarotid', + 'series_id': '17', + 'season': 'Season 2', + 'season_number': 2, + 'episode_id': 14, + 'release_year': 2010, + }, + }, { + 'note': 'Movie without expiry', + 'url': 'https://duoplay.ee/5501/pilvede-all.-neljas-ode', + 'md5': '7abf63d773a49ef7c39f2c127842b8fd', + 'info_dict': { + 'id': '5501', + 'ext': 'mp4', + 'title': 'Pilvede all. 
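# Note: the Duboku hunk above, like the Cybrary and Embedly hunks elsewhere
# in this patch, replaces the smuggled {'http_headers': {'Referer': ...}}
# payload with the flatter {'referer': ...} key. The smuggling itself just
# rides JSON along in the URL fragment; a small round-trip sketch (assumes
# yt-dlp is importable; URLs are invented):
from yt_dlp.utils import smuggle_url, unsmuggle_url

url = smuggle_url('https://player.example.com/video/123',
                  {'referer': 'https://example.com/page'})
clean_url, data = unsmuggle_url(url)
assert clean_url == 'https://player.example.com/video/123'
assert data == {'referer': 'https://example.com/page'}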
Neljas õde', + 'thumbnail': r're:https://.+\.jpg(?:\?c=\d+)?$', + 'description': 'md5:d86a70f8f31e82c369d4d4f4c79b1279', + 'cast': 'count:9', + 'upload_date': '20221214', + 'timestamp': 1671054000, + 'release_year': 2018, + }, + }] + + def _real_extract(self, url): + telecast_id, episode = self._match_valid_url(url).group('id', 'ep') + video_id = join_nonempty(telecast_id, episode, delim='_') + webpage = self._download_webpage(url, video_id) + video_player = try_call(lambda: extract_attributes( + get_element_text_and_html_by_tag('video-player', webpage)[1])) + if not video_player or not video_player.get('manifest-url'): + raise ExtractorError('No video found', expected=True) + + episode_attr = self._parse_json(video_player.get(':episode') or '', video_id, fatal=False) or {} + + return { + 'id': video_id, + 'formats': self._extract_m3u8_formats(video_player['manifest-url'], video_id, 'mp4'), + **traverse_obj(episode_attr, { + 'title': 'title', + 'description': 'synopsis', + 'thumbnail': ('images', 'original'), + 'timestamp': ('airtime', {lambda x: unified_timestamp(x + ' +0200')}), + 'cast': ('cast', {lambda x: x.split(', ')}), + 'release_year': ('year', {int_or_none}), + }), + **(traverse_obj(episode_attr, { + 'title': (None, ('subtitle', ('episode_nr', {lambda x: f'Episode {x}' if x else None}))), + 'series': 'title', + 'series_id': ('telecast_id', {str_or_none}), + 'season_number': ('season_id', {int_or_none}), + 'episode': 'subtitle', + 'episode_number': ('episode_nr', {int_or_none}), + 'episode_id': ('episode_id', {int_or_none}), + }, get_all=False) if episode_attr.get('category') != 'movies' else {}), + } diff --git a/yt_dlp/extractor/echomsk.py b/yt_dlp/extractor/echomsk.py deleted file mode 100644 index 850eabbff..000000000 --- a/yt_dlp/extractor/echomsk.py +++ /dev/null @@ -1,43 +0,0 @@ -import re - -from .common import InfoExtractor - - -class EchoMskIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?echo\.msk\.ru/sounds/(?P<id>\d+)' - _TEST = { - 'url': 'http://www.echo.msk.ru/sounds/1464134.html', - 'md5': '2e44b3b78daff5b458e4dbc37f191f7c', - 'info_dict': { - 'id': '1464134', - 'ext': 'mp3', - 'title': 'Особое мнение - 29 декабря 2014, 19:08', - }, - } - - def _real_extract(self, url): - video_id = self._match_id(url) - - webpage = self._download_webpage(url, video_id) - - audio_url = self._search_regex( - r'<a rel="mp3" href="([^"]+)">', webpage, 'audio URL') - - title = self._html_search_regex( - r'<a href="/programs/[^"]+" target="_blank">([^<]+)</a>', - webpage, 'title') - - air_date = self._html_search_regex( - r'(?s)<div class="date">(.+?)</div>', - webpage, 'date', fatal=False, default=None) - - if air_date: - air_date = re.sub(r'(\s)\1+', r'\1', air_date) - if air_date: - title = '%s - %s' % (title, air_date) - - return { - 'id': video_id, - 'url': audio_url, - 'title': title, - } diff --git a/yt_dlp/extractor/ehow.py b/yt_dlp/extractor/ehow.py deleted file mode 100644 index 74469ce36..000000000 --- a/yt_dlp/extractor/ehow.py +++ /dev/null @@ -1,36 +0,0 @@ -from .common import InfoExtractor -from ..compat import compat_urllib_parse_unquote - - -class EHowIE(InfoExtractor): - IE_NAME = 'eHow' - _VALID_URL = r'https?://(?:www\.)?ehow\.com/[^/_?]*_(?P<id>[0-9]+)' - _TEST = { - 'url': 'http://www.ehow.com/video_12245069_hardwood-flooring-basics.html', - 'md5': '9809b4e3f115ae2088440bcb4efbf371', - 'info_dict': { - 'id': '12245069', - 'ext': 'flv', - 'title': 'Hardwood Flooring Basics', - 'description': 'Hardwood flooring may be time consuming, but its ultimately a 
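# Note: DuoplayIE above digs its stream and metadata out of a <video-player>
# custom element: the tag is located in the page, its attributes parsed, and
# the ':episode' attribute is itself a JSON blob. The same chain on a
# fabricated snippet (assumes yt-dlp is importable):
import json

from yt_dlp.utils import extract_attributes, get_element_text_and_html_by_tag

webpage = ('<video-player manifest-url="https://example.com/master.m3u8" '
           ':episode=\'{"title": "Pilot", "season_id": 2}\'></video-player>')
_, tag_html = get_element_text_and_html_by_tag('video-player', webpage)
attrs = extract_attributes(tag_html)
manifest_url = attrs['manifest-url']                 # fed to _extract_m3u8_formats
episode = json.loads(attrs.get(':episode') or '{}')  # -> {'title': 'Pilot', 'season_id': 2}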
pretty straightforward concept. Learn about hardwood flooring basics with help from a hardware flooring business owner in this free video...', - 'uploader': 'Erick Nathan', - } - } - - def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - video_url = self._search_regex( - r'(?:file|source)=(http[^\'"&]*)', webpage, 'video URL') - final_url = compat_urllib_parse_unquote(video_url) - uploader = self._html_search_meta('uploader', webpage) - title = self._og_search_title(webpage).replace(' | eHow', '') - - return { - 'id': video_id, - 'url': final_url, - 'title': title, - 'thumbnail': self._og_search_thumbnail(webpage), - 'description': self._og_search_description(webpage), - 'uploader': uploader, - } diff --git a/yt_dlp/extractor/elementorembed.py b/yt_dlp/extractor/elementorembed.py new file mode 100644 index 000000000..638893f6f --- /dev/null +++ b/yt_dlp/extractor/elementorembed.py @@ -0,0 +1,72 @@ +import re + +from .common import InfoExtractor +from .vimeo import VimeoIE +from .youtube import YoutubeIE +from ..utils import unescapeHTML, url_or_none +from ..utils.traversal import traverse_obj + + +class ElementorEmbedIE(InfoExtractor): + _VALID_URL = False + _WEBPAGE_TESTS = [{ + 'url': 'https://capitaltv.cy/2023/12/14/υγεια-και-ζωη-14-12-2023-δρ-ξενια-κωσταντινιδο/', + 'info_dict': { + 'id': 'KgzuxwuQwM4', + 'ext': 'mp4', + 'title': 'ΥΓΕΙΑ ΚΑΙ ΖΩΗ 14 12 2023 ΔΡ ΞΕΝΙΑ ΚΩΣΤΑΝΤΙΝΙΔΟΥ', + 'thumbnail': 'https://i.ytimg.com/vi/KgzuxwuQwM4/maxresdefault.jpg', + 'playable_in_embed': True, + 'tags': 'count:16', + 'like_count': int, + 'channel': 'Capital TV Cyprus', + 'channel_id': 'UCR8LwVKTLGEXt4ZAErpCMrg', + 'availability': 'public', + 'description': 'md5:7a3308a22881aea4612358c4ba121f77', + 'duration': 2891, + 'upload_date': '20231214', + 'uploader_id': '@capitaltvcyprus6389', + 'live_status': 'not_live', + 'channel_url': 'https://www.youtube.com/channel/UCR8LwVKTLGEXt4ZAErpCMrg', + 'uploader_url': 'https://www.youtube.com/@capitaltvcyprus6389', + 'uploader': 'Capital TV Cyprus', + 'age_limit': 0, + 'categories': ['News & Politics'], + 'view_count': int, + 'channel_follower_count': int, + }, + }, { + 'url': 'https://elementor.com/academy/theme-builder-collection/?playlist=76011151&video=9e59909', + 'info_dict': { + 'id': '?playlist=76011151&video=9e59909', + 'title': 'Theme Builder Collection - Academy', + 'age_limit': 0, + 'timestamp': 1702196984.0, + 'upload_date': '20231210', + 'description': 'md5:7f52c52715ee9e54fd7f82210511673d', + 'thumbnail': 'https://elementor.com/academy/wp-content/uploads/2021/07/Theme-Builder-1.png', + }, + 'playlist_count': 11, + 'params': { + 'skip_download': True, + }, + }] + _WIDGET_REGEX = r'<div[^>]+class="[^"]*elementor-widget-video(?:-playlist)?[^"]*"[^>]*data-settings="([^"]*)"' + + def _extract_from_webpage(self, url, webpage): + for data_settings in re.findall(self._WIDGET_REGEX, webpage): + data = self._parse_json(data_settings, None, fatal=False, transform_source=unescapeHTML) + if youtube_url := traverse_obj(data, ('youtube_url', {url_or_none})): + yield self.url_result(youtube_url, ie=YoutubeIE) + + for video in traverse_obj(data, ('tabs', lambda _, v: v['_id'], {dict})): + if youtube_url := traverse_obj(video, ('youtube_url', {url_or_none})): + yield self.url_result(youtube_url, ie=YoutubeIE) + if vimeo_url := traverse_obj(video, ('vimeo_url', {url_or_none})): + yield self.url_result(vimeo_url, ie=VimeoIE) + for direct_url in traverse_obj(video, (('hosted_url', 'external_url'), 'url', 
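# Note: the new ElementorEmbedIE above leans on traverse_obj filter paths
# such as ('tabs', lambda _, v: v['_id'], {dict}): the lambda branches over
# the tab list, and entries for which it raises or returns falsy are
# silently dropped. Illustrated on plain data (assumes yt-dlp is importable):
from yt_dlp.utils.traversal import traverse_obj

data = {'tabs': [
    {'_id': 'a1', 'youtube_url': 'https://www.youtube.com/watch?v=xxxxxxxxxxx'},
    {'note': 'no _id here'},
]}
videos = traverse_obj(data, ('tabs', lambda _, v: v['_id'], {dict}))
# -> [{'_id': 'a1', ...}]; the KeyError raised for the second entry is
#    swallowed by the traversal and treated as "filtered out"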
{url_or_none})): + yield { + 'id': video['_id'], + 'url': direct_url, + 'title': video.get('title'), + } diff --git a/yt_dlp/extractor/elevensports.py b/yt_dlp/extractor/elevensports.py deleted file mode 100644 index 99c52b3a9..000000000 --- a/yt_dlp/extractor/elevensports.py +++ /dev/null @@ -1,59 +0,0 @@ -from .common import InfoExtractor -from ..utils import ( - parse_iso8601, - traverse_obj, - url_or_none, -) - - -class ElevenSportsIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?elevensports\.com/view/event/(?P<id>\w+)' - _TESTS = [{ - 'url': 'https://elevensports.com/view/event/clf46yr3kenn80jgrqsjmwefk', - 'md5': 'c0958d9ff90e4503a75544358758921d', - 'info_dict': { - 'id': 'clf46yr3kenn80jgrqsjmwefk', - 'title': 'Cleveland SC vs Lionsbridge FC', - 'ext': 'mp4', - 'description': 'md5:03b5238d6549f4ea1fddadf69b5e0b58', - 'upload_date': '20230323', - 'timestamp': 1679612400, - 'thumbnail': r're:^https?://.*\.(?:jpg|png)', - }, - 'params': {'skip_download': 'm3u8'} - }, { - 'url': 'https://elevensports.com/view/event/clhpyd53b06160jez74qhgkmf', - 'md5': 'c0958d9ff90e4503a75544358758921d', - 'info_dict': { - 'id': 'clhpyd53b06160jez74qhgkmf', - 'title': 'AJNLF vs ARRAF', - 'ext': 'mp4', - 'description': 'md5:c8c5e75c78f37c6d15cd6c475e43a8c1', - 'upload_date': '20230521', - 'timestamp': 1684684800, - 'thumbnail': r're:^https?://.*\.(?:jpg|png)', - }, - 'params': {'skip_download': 'm3u8'} - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - event_id = self._search_nextjs_data(webpage, video_id)['props']['pageProps']['event']['mclsEventId'] - event_data = self._download_json( - f'https://mcls-api.mycujoo.tv/bff/events/v1beta1/{event_id}', video_id, - headers={'Authorization': 'Bearer FBVKACGN37JQC5SFA0OVK8KKSIOP153G'}) - formats, subtitles = self._extract_m3u8_formats_and_subtitles( - event_data['streams'][0]['full_url'], video_id, 'mp4', m3u8_id='hls') - - return { - 'id': video_id, - 'formats': formats, - 'subtitles': subtitles, - **traverse_obj(event_data, { - 'title': ('title', {str}), - 'description': ('description', {str}), - 'timestamp': ('start_time', {parse_iso8601}), - 'thumbnail': ('thumbnail_url', {url_or_none}), - }), - } diff --git a/yt_dlp/extractor/ellentube.py b/yt_dlp/extractor/ellentube.py deleted file mode 100644 index 6eb00f9c9..000000000 --- a/yt_dlp/extractor/ellentube.py +++ /dev/null @@ -1,130 +0,0 @@ -from .common import InfoExtractor -from ..utils import ( - clean_html, - extract_attributes, - float_or_none, - int_or_none, - try_get, -) - - -class EllenTubeBaseIE(InfoExtractor): - def _extract_data_config(self, webpage, video_id): - details = self._search_regex( - r'(<[^>]+\bdata-component=(["\'])[Dd]etails.+?></div>)', webpage, - 'details') - return self._parse_json( - extract_attributes(details)['data-config'], video_id) - - def _extract_video(self, data, video_id): - title = data['title'] - - formats = [] - duration = None - for entry in data.get('media'): - if entry.get('id') == 'm3u8': - formats, subtitles = self._extract_m3u8_formats_and_subtitles( - entry['url'], video_id, 'mp4', - entry_protocol='m3u8_native', m3u8_id='hls') - duration = int_or_none(entry.get('duration')) - break - - def get_insight(kind): - return int_or_none(try_get( - data, lambda x: x['insight']['%ss' % kind])) - - return { - 'extractor_key': EllenTubeIE.ie_key(), - 'id': video_id, - 'title': title, - 'description': data.get('description'), - 'duration': duration, - 'thumbnail': data.get('thumbnail'), - 
'timestamp': float_or_none(data.get('publishTime'), scale=1000), - 'view_count': get_insight('view'), - 'like_count': get_insight('like'), - 'formats': formats, - 'subtitles': subtitles, - } - - -class EllenTubeIE(EllenTubeBaseIE): - _VALID_URL = r'''(?x) - (?: - ellentube:| - https://api-prod\.ellentube\.com/ellenapi/api/item/ - ) - (?P<id>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12}) - ''' - _TESTS = [{ - 'url': 'https://api-prod.ellentube.com/ellenapi/api/item/0822171c-3829-43bf-b99f-d77358ae75e3', - 'md5': '2fabc277131bddafdd120e0fc0f974c9', - 'info_dict': { - 'id': '0822171c-3829-43bf-b99f-d77358ae75e3', - 'ext': 'mp4', - 'title': 'Ellen Meets Las Vegas Survivors Jesus Campos and Stephen Schuck', - 'description': 'md5:76e3355e2242a78ad9e3858e5616923f', - 'thumbnail': r're:^https?://.+?', - 'duration': 514, - 'timestamp': 1508505120, - 'upload_date': '20171020', - 'view_count': int, - 'like_count': int, - } - }, { - 'url': 'ellentube:734a3353-f697-4e79-9ca9-bfc3002dc1e0', - 'only_matching': True, - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - data = self._download_json( - 'https://api-prod.ellentube.com/ellenapi/api/item/%s' % video_id, - video_id) - return self._extract_video(data, video_id) - - -class EllenTubeVideoIE(EllenTubeBaseIE): - _VALID_URL = r'https?://(?:www\.)?ellentube\.com/video/(?P<id>.+?)\.html' - _TEST = { - 'url': 'https://www.ellentube.com/video/ellen-meets-las-vegas-survivors-jesus-campos-and-stephen-schuck.html', - 'only_matching': True, - } - - def _real_extract(self, url): - display_id = self._match_id(url) - webpage = self._download_webpage(url, display_id) - video_id = self._extract_data_config(webpage, display_id)['id'] - return self.url_result( - 'ellentube:%s' % video_id, ie=EllenTubeIE.ie_key(), - video_id=video_id) - - -class EllenTubePlaylistIE(EllenTubeBaseIE): - _VALID_URL = r'https?://(?:www\.)?ellentube\.com/(?:episode|studios)/(?P<id>.+?)\.html' - _TESTS = [{ - 'url': 'https://www.ellentube.com/episode/dax-shepard-jordan-fisher-haim.html', - 'info_dict': { - 'id': 'dax-shepard-jordan-fisher-haim', - 'title': "Dax Shepard, 'DWTS' Team Jordan Fisher & Lindsay Arnold, HAIM", - 'description': 'md5:bfc982194dabb3f4e325e43aa6b2e21c', - }, - 'playlist_count': 6, - }, { - 'url': 'https://www.ellentube.com/studios/macey-goes-rving0.html', - 'only_matching': True, - }] - - def _real_extract(self, url): - display_id = self._match_id(url) - webpage = self._download_webpage(url, display_id) - data = self._extract_data_config(webpage, display_id)['data'] - feed = self._download_json( - 'https://api-prod.ellentube.com/ellenapi/api/feed/?%s' - % data['filter'], display_id) - entries = [ - self._extract_video(elem, elem['id']) - for elem in feed if elem.get('type') == 'VIDEO' and elem.get('id')] - return self.playlist_result( - entries, display_id, data.get('title'), - clean_html(data.get('description'))) diff --git a/yt_dlp/extractor/eltrecetv.py b/yt_dlp/extractor/eltrecetv.py new file mode 100644 index 000000000..f64023af7 --- /dev/null +++ b/yt_dlp/extractor/eltrecetv.py @@ -0,0 +1,62 @@ +from .common import InfoExtractor + + +class ElTreceTVIE(InfoExtractor): + IE_DESC = 'El Trece TV (Argentina)' + _VALID_URL = r'https?://(?:www\.)?eltrecetv\.com\.ar/[\w-]+/capitulos/temporada-\d+/(?P<id>[\w-]+)' + _TESTS = [ + { + 'url': 'https://www.eltrecetv.com.ar/ahora-caigo/capitulos/temporada-2023/programa-del-061023/', + 'md5': '71a66673dc63f9a5939d97bfe4b311ba', + 'info_dict': { + 'id': 'AHCA05102023145553329621094', + 'ext': 
'mp4', + 'title': 'AHORA CAIGO - Programa 06/10/23', + 'thumbnail': 'https://thumbs.vodgc.net/AHCA05102023145553329621094.JPG?649339', + } + }, + { + 'url': 'https://www.eltrecetv.com.ar/poco-correctos/capitulos/temporada-2023/programa-del-250923-invitada-dalia-gutmann/', + 'only_matching': True, + }, + { + 'url': 'https://www.eltrecetv.com.ar/argentina-tierra-de-amor-y-venganza/capitulos/temporada-2023/atav-2-capitulo-121-del-250923/', + 'only_matching': True, + }, + { + 'url': 'https://www.eltrecetv.com.ar/ahora-caigo/capitulos/temporada-2023/programa-del-250923/', + 'only_matching': True, + }, + { + 'url': 'https://www.eltrecetv.com.ar/pasaplatos/capitulos/temporada-2023/pasaplatos-el-restaurante-del-250923/', + 'only_matching': True, + }, + { + 'url': 'https://www.eltrecetv.com.ar/el-galpon/capitulos/temporada-2023/programa-del-160923-invitado-raul-lavie/', + 'only_matching': True, + } + ] + + def _real_extract(self, url): + slug = self._match_id(url) + webpage = self._download_webpage(url, slug) + config = self._search_json( + r'Fusion.globalContent\s*=', webpage, 'content', slug)['promo_items']['basic']['embed']['config'] + video_url = config['m3u8'] + video_id = self._search_regex(r'/(\w+)\.m3u8', video_url, 'video id', default=slug) + + formats, subtitles = self._extract_m3u8_formats_and_subtitles(video_url, video_id, 'mp4', m3u8_id='hls') + formats.extend([{ + 'url': f['url'][:-23], + 'format_id': f['format_id'].replace('hls', 'http'), + 'width': f.get('width'), + 'height': f.get('height'), + } for f in formats if f['url'].endswith('/tracks-v1a1/index.m3u8') and f.get('height') != 1080]) + + return { + 'id': video_id, + 'title': config.get('title'), + 'thumbnail': config.get('thumbnail'), + 'formats': formats, + 'subtitles': subtitles, + } diff --git a/yt_dlp/extractor/embedly.py b/yt_dlp/extractor/embedly.py index 458aaa0a0..a424b49df 100644 --- a/yt_dlp/extractor/embedly.py +++ b/yt_dlp/extractor/embedly.py @@ -106,4 +106,4 @@ def _real_extract(self, url): return self.url_result(src, YoutubeTabIE) return self.url_result(smuggle_url( urllib.parse.unquote(traverse_obj(qs, ('src', 0), ('url', 0))), - {'http_headers': {'Referer': url}})) + {'referer': url})) diff --git a/yt_dlp/extractor/engadget.py b/yt_dlp/extractor/engadget.py deleted file mode 100644 index e7c5d7bf1..000000000 --- a/yt_dlp/extractor/engadget.py +++ /dev/null @@ -1,15 +0,0 @@ -from .common import InfoExtractor - - -class EngadgetIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?engadget\.com/video/(?P<id>[^/?#]+)' - - _TESTS = [{ - # video with vidible ID - 'url': 'https://www.engadget.com/video/57a28462134aa15a39f0421a/', - 'only_matching': True, - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - return self.url_result('aol-video:%s' % video_id) diff --git a/yt_dlp/extractor/epidemicsound.py b/yt_dlp/extractor/epidemicsound.py new file mode 100644 index 000000000..0d81b11c8 --- /dev/null +++ b/yt_dlp/extractor/epidemicsound.py @@ -0,0 +1,107 @@ +from .common import InfoExtractor +from ..utils import ( + float_or_none, + int_or_none, + orderedSet, + parse_iso8601, + parse_qs, + parse_resolution, + str_or_none, + traverse_obj, + url_or_none, +) + + +class EpidemicSoundIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?epidemicsound\.com/track/(?P<id>[0-9a-zA-Z]+)' + _TESTS = [{ + 'url': 'https://www.epidemicsound.com/track/yFfQVRpSPz/', + 'md5': 'd98ff2ddb49e8acab9716541cbc9dfac', + 'info_dict': { + 'id': '45014', + 'display_id': 'yFfQVRpSPz', + 'ext': 'mp3', + 'title': 'Door Knock 
Door 1', + 'alt_title': 'Door Knock Door 1', + 'tags': ['foley', 'door', 'knock', 'glass', 'window', 'glass door knock'], + 'categories': ['Misc. Door'], + 'duration': 1, + 'thumbnail': 'https://cdn.epidemicsound.com/curation-assets/commercial-release-cover-images/default-sfx/3000x3000.jpg', + 'timestamp': 1415320353, + 'upload_date': '20141107', + }, + }, { + 'url': 'https://www.epidemicsound.com/track/mj8GTTwsZd/', + 'md5': 'c82b745890f9baf18dc2f8d568ee3830', + 'info_dict': { + 'id': '148700', + 'display_id': 'mj8GTTwsZd', + 'ext': 'mp3', + 'title': 'Noplace', + 'tags': ['liquid drum n bass', 'energetic'], + 'categories': ['drum and bass'], + 'duration': 237, + 'timestamp': 1694426482, + 'thumbnail': 'https://cdn.epidemicsound.com/curation-assets/commercial-release-cover-images/11138/3000x3000.jpg', + 'upload_date': '20230911', + 'release_timestamp': 1700535606, + 'release_date': '20231121', + }, + }] + + @staticmethod + def _epidemic_parse_thumbnail(url: str): + if not url_or_none(url): + return None + + return { + 'url': url, + **(traverse_obj(url, ({parse_qs}, { + 'width': ('width', 0, {int_or_none}), + 'height': ('height', 0, {int_or_none}), + })) or parse_resolution(url)), + } + + @staticmethod + def _epidemic_fmt_or_none(f): + if not f.get('format'): + f['format'] = f.get('format_id') + elif not f.get('format_id'): + f['format_id'] = f['format'] + if not f['url'] or not f['format']: + return None + if f.get('format_note'): + f['format_note'] = f'track ID {f["format_note"]}' + if f['format'] != 'full': + f['preference'] = -2 + return f + + def _real_extract(self, url): + video_id = self._match_id(url) + json_data = self._download_json(f'https://www.epidemicsound.com/json/track/{video_id}', video_id) + + thumbnails = traverse_obj(json_data, [('imageUrl', 'cover')]) + thumb_base_url = traverse_obj(json_data, ('coverArt', 'baseUrl', {url_or_none})) + if thumb_base_url: + thumbnails.extend(traverse_obj(json_data, ( + 'coverArt', 'sizes', ..., {thumb_base_url.__add__}))) + + return traverse_obj(json_data, { + 'id': ('id', {str_or_none}), + 'display_id': ('publicSlug', {str}), + 'title': ('title', {str}), + 'alt_title': ('oldTitle', {str}), + 'duration': ('length', {float_or_none}), + 'timestamp': ('added', {parse_iso8601}), + 'release_timestamp': ('releaseDate', {parse_iso8601}), + 'categories': ('genres', ..., 'tag', {str}), + 'tags': ('metadataTags', ..., {str}), + 'age_limit': ('isExplicit', {lambda b: 18 if b else None}), + 'thumbnails': ({lambda _: thumbnails}, {orderedSet}, ..., {self._epidemic_parse_thumbnail}), + 'formats': ('stems', {dict.items}, ..., { + 'format': (0, {str_or_none}), + 'format_note': (1, 's3TrackId', {str_or_none}), + 'format_id': (1, 'stemType', {str}), + 'url': (1, 'lqMp3Url', {url_or_none}), + }, {self._epidemic_fmt_or_none}), + }) diff --git a/yt_dlp/extractor/eplus.py b/yt_dlp/extractor/eplus.py index 3ebdcf5fb..6383691a1 100644 --- a/yt_dlp/extractor/eplus.py +++ b/yt_dlp/extractor/eplus.py @@ -1,15 +1,20 @@ +import json + from .common import InfoExtractor from ..utils import ( ExtractorError, try_call, unified_timestamp, + urlencode_postdata, ) class EplusIbIE(InfoExtractor): - IE_NAME = 'eplus:inbound' - IE_DESC = 'e+ (イープラス) overseas' - _VALID_URL = r'https?://live\.eplus\.jp/ex/player\?ib=(?P<id>(?:\w|%2B|%2F){86}%3D%3D)' + _NETRC_MACHINE = 'eplus' + IE_NAME = 'eplus' + IE_DESC = 'e+ (イープラス)' + _VALID_URL = [r'https?://live\.eplus\.jp/ex/player\?ib=(?P<id>(?:\w|%2B|%2F){86}%3D%3D)', + r'https?://live\.eplus\.jp/(?P<id>sample|\d+)'] _TESTS = [{ 'url': 
'https://live.eplus.jp/ex/player?ib=YEFxb3Vyc2Dombnjg7blkrLlrablnJLjgrnjgq%2Fjg7zjg6vjgqLjgqTjg4njg6vlkIzlpb3kvJpgTGllbGxhIQ%3D%3D', 'info_dict': { @@ -29,14 +34,97 @@ class EplusIbIE(InfoExtractor): 'No video formats found!', 'Requested format is not available', ], + }, { + 'url': 'https://live.eplus.jp/sample', + 'info_dict': { + 'id': 'stream1ng20210719-test-005', + 'title': 'Online streaming test for DRM', + 'live_status': 'was_live', + 'release_date': '20210719', + 'release_timestamp': 1626703200, + 'description': None, + }, + 'params': { + 'skip_download': True, + 'ignore_no_formats_error': True, + }, + 'expected_warnings': [ + 'Could not find the playlist URL. This event may not be accessible', + 'No video formats found!', + 'Requested format is not available', + 'This video is DRM protected', + ], + }, { + 'url': 'https://live.eplus.jp/2053935', + 'info_dict': { + 'id': '331320-0001-001', + 'title': '丘みどり2020配信LIVE Vol.2 ~秋麗~ 【Streaming+(配信チケット)】', + 'live_status': 'was_live', + 'release_date': '20200920', + 'release_timestamp': 1600596000, + }, + 'params': { + 'skip_download': True, + 'ignore_no_formats_error': True, + }, + 'expected_warnings': [ + 'Could not find the playlist URL. This event may not be accessible', + 'No video formats found!', + 'Requested format is not available', + ], }] + _USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0' + + def _login(self, username, password, urlh): + if not self._get_cookies('https://live.eplus.jp/').get('ci_session'): + raise ExtractorError('Unable to get ci_session cookie') + + cltft_token = urlh.headers.get('X-CLTFT-Token') + if not cltft_token: + raise ExtractorError('Unable to get X-CLTFT-Token') + self._set_cookie('live.eplus.jp', 'X-CLTFT-Token', cltft_token) + + login_json = self._download_json( + 'https://live.eplus.jp/member/api/v1/FTAuth/idpw', None, + note='Sending pre-login info', errnote='Unable to send pre-login info', headers={ + 'Content-Type': 'application/json; charset=UTF-8', + 'Referer': urlh.url, + 'X-Cltft-Token': cltft_token, + 'Accept': '*/*', + }, data=json.dumps({ + 'loginId': username, + 'loginPassword': password, + }).encode()) + if not login_json.get('isSuccess'): + raise ExtractorError('Login failed: Invalid id or password', expected=True) + + self._request_webpage( + urlh.url, None, note='Logging in', errnote='Unable to log in', + data=urlencode_postdata({ + 'loginId': username, + 'loginPassword': password, + 'Token.Default': cltft_token, + 'op': 'nextPage', + }), headers={'Referer': urlh.url}) + def _real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) + webpage, urlh = self._download_webpage_handle( + url, video_id, headers={'User-Agent': self._USER_AGENT}) + if urlh.url.startswith('https://live.eplus.jp/member/auth'): + username, password = self._get_login_info() + if not username: + self.raise_login_required() + self._login(username, password, urlh) + webpage = self._download_webpage( + url, video_id, headers={'User-Agent': self._USER_AGENT}) data_json = self._search_json(r'<script>\s*var app\s*=', webpage, 'data json', video_id) + if data_json.get('drm_mode') == 'ON': + self.report_drm(video_id) + delivery_status = data_json.get('delivery_status') archive_mode = data_json.get('archive_mode') release_timestamp = try_call(lambda: unified_timestamp(data_json['event_datetime']) - 32400) @@ -64,7 +152,7 @@ def _real_extract(self, url): formats = [] m3u8_playlist_urls 
= self._search_json( - r'var listChannels\s*=', webpage, 'hls URLs', video_id, contains_pattern=r'\[.+\]', default=[]) + r'var\s+listChannels\s*=', webpage, 'hls URLs', video_id, contains_pattern=r'\[.+\]', default=[]) if not m3u8_playlist_urls: if live_status == 'is_upcoming': self.raise_no_formats( diff --git a/yt_dlp/extractor/escapist.py b/yt_dlp/extractor/escapist.py deleted file mode 100644 index 85a1cbf40..000000000 --- a/yt_dlp/extractor/escapist.py +++ /dev/null @@ -1,108 +0,0 @@ -from .common import InfoExtractor -from ..utils import ( - determine_ext, - clean_html, - int_or_none, - float_or_none, -) - - -def _decrypt_config(key, string): - a = '' - i = '' - r = '' - - while len(a) < (len(string) / 2): - a += key - - a = a[0:int(len(string) / 2)] - - t = 0 - while t < len(string): - i += chr(int(string[t] + string[t + 1], 16)) - t += 2 - - icko = [s for s in i] - - for t, c in enumerate(a): - r += chr(ord(c) ^ ord(icko[t])) - - return r - - -class EscapistIE(InfoExtractor): - _VALID_URL = r'https?://?(?:(?:www|v1)\.)?escapistmagazine\.com/videos/view/[^/]+/(?P<id>[0-9]+)' - _TESTS = [{ - 'url': 'http://www.escapistmagazine.com/videos/view/the-escapist-presents/6618-Breaking-Down-Baldurs-Gate', - 'md5': 'ab3a706c681efca53f0a35f1415cf0d1', - 'info_dict': { - 'id': '6618', - 'ext': 'mp4', - 'description': "Baldur's Gate: Original, Modded or Enhanced Edition? I'll break down what you can expect from the new Baldur's Gate: Enhanced Edition.", - 'title': "Breaking Down Baldur's Gate", - 'thumbnail': r're:^https?://.*\.jpg$', - 'duration': 264, - 'uploader': 'The Escapist', - } - }, { - 'url': 'http://www.escapistmagazine.com/videos/view/zero-punctuation/10044-Evolve-One-vs-Multiplayer', - 'md5': '9e8c437b0dbb0387d3bd3255ca77f6bf', - 'info_dict': { - 'id': '10044', - 'ext': 'mp4', - 'description': 'This week, Zero Punctuation reviews Evolve.', - 'title': 'Evolve - One vs Multiplayer', - 'thumbnail': r're:^https?://.*\.jpg$', - 'duration': 304, - 'uploader': 'The Escapist', - } - }, { - 'url': 'http://escapistmagazine.com/videos/view/the-escapist-presents/6618', - 'only_matching': True, - }, { - 'url': 'https://v1.escapistmagazine.com/videos/view/the-escapist-presents/6618-Breaking-Down-Baldurs-Gate', - 'only_matching': True, - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - - ims_video = self._parse_json( - self._search_regex( - r'imsVideo\.play\(({.+?})\);', webpage, 'imsVideo'), - video_id) - video_id = ims_video['videoID'] - key = ims_video['hash'] - - config = self._download_webpage( - 'http://www.escapistmagazine.com/videos/vidconfig.php', - video_id, 'Downloading video config', headers={ - 'Referer': url, - }, query={ - 'videoID': video_id, - 'hash': key, - }) - - data = self._parse_json(_decrypt_config(key, config), video_id) - - video_data = data['videoData'] - - title = clean_html(video_data['title']) - - formats = [{ - 'url': video['src'], - 'format_id': '%s-%sp' % (determine_ext(video['src']), video['res']), - 'height': int_or_none(video.get('res')), - } for video in data['files']['videos']] - - return { - 'id': video_id, - 'formats': formats, - 'title': title, - 'thumbnail': self._og_search_thumbnail(webpage) or data.get('poster'), - 'description': self._og_search_description(webpage), - 'duration': float_or_none(video_data.get('duration'), 1000), - 'uploader': video_data.get('publisher'), - 'series': video_data.get('show'), - } diff --git a/yt_dlp/extractor/esri.py b/yt_dlp/extractor/esri.py deleted 
file mode 100644 index 02e7efaf0..000000000 --- a/yt_dlp/extractor/esri.py +++ /dev/null @@ -1,70 +0,0 @@ -import re - -from .common import InfoExtractor -from ..compat import compat_urlparse -from ..utils import ( - int_or_none, - parse_filesize, - unified_strdate, -) - - -class EsriVideoIE(InfoExtractor): - _VALID_URL = r'https?://video\.esri\.com/watch/(?P<id>[0-9]+)' - _TEST = { - 'url': 'https://video.esri.com/watch/1124/arcgis-online-_dash_-developing-applications', - 'md5': 'd4aaf1408b221f1b38227a9bbaeb95bc', - 'info_dict': { - 'id': '1124', - 'ext': 'mp4', - 'title': 'ArcGIS Online - Developing Applications', - 'description': 'Jeremy Bartley demonstrates how to develop applications with ArcGIS Online.', - 'thumbnail': r're:^https?://.*\.jpg$', - 'duration': 185, - 'upload_date': '20120419', - } - } - - def _real_extract(self, url): - video_id = self._match_id(url) - - webpage = self._download_webpage(url, video_id) - - formats = [] - for width, height, content in re.findall( - r'(?s)<li><strong>(\d+)x(\d+):</strong>(.+?)</li>', webpage): - for video_url, ext, filesize in re.findall( - r'<a[^>]+href="([^"]+)">([^<]+) \(([^<]+)\)</a>', content): - formats.append({ - 'url': compat_urlparse.urljoin(url, video_url), - 'ext': ext.lower(), - 'format_id': '%s-%s' % (ext.lower(), height), - 'width': int(width), - 'height': int(height), - 'filesize_approx': parse_filesize(filesize), - }) - - title = self._html_search_meta('title', webpage, 'title') - description = self._html_search_meta( - 'description', webpage, 'description', fatal=False) - - thumbnail = self._html_search_meta('thumbnail', webpage, 'thumbnail', fatal=False) - if thumbnail: - thumbnail = re.sub(r'_[st]\.jpg$', '_x.jpg', thumbnail) - - duration = int_or_none(self._search_regex( - [r'var\s+videoSeconds\s*=\s*(\d+)', r"'duration'\s*:\s*(\d+)"], - webpage, 'duration', fatal=False)) - - upload_date = unified_strdate(self._html_search_meta( - 'last-modified', webpage, 'upload date', fatal=False)) - - return { - 'id': video_id, - 'title': title, - 'description': description, - 'thumbnail': thumbnail, - 'duration': duration, - 'upload_date': upload_date, - 'formats': formats - } diff --git a/yt_dlp/extractor/expotv.py b/yt_dlp/extractor/expotv.py deleted file mode 100644 index bda6e3cb2..000000000 --- a/yt_dlp/extractor/expotv.py +++ /dev/null @@ -1,74 +0,0 @@ -from .common import InfoExtractor -from ..utils import ( - int_or_none, - unified_strdate, -) - - -class ExpoTVIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?expotv\.com/videos/[^?#]*/(?P<id>[0-9]+)($|[?#])' - _TEST = { - 'url': 'http://www.expotv.com/videos/reviews/3/40/NYX-Butter-lipstick/667916', - 'md5': 'fe1d728c3a813ff78f595bc8b7a707a8', - 'info_dict': { - 'id': '667916', - 'ext': 'mp4', - 'title': 'NYX Butter Lipstick Little Susie', - 'description': 'Goes on like butter, but looks better!', - 'thumbnail': r're:^https?://.*\.jpg$', - 'uploader': 'Stephanie S.', - 'upload_date': '20150520', - 'view_count': int, - } - } - - def _real_extract(self, url): - video_id = self._match_id(url) - - webpage = self._download_webpage(url, video_id) - player_key = self._search_regex( - r'<param name="playerKey" value="([^"]+)"', webpage, 'player key') - config = self._download_json( - 'http://client.expotv.com/video/config/%s/%s' % (video_id, player_key), - video_id, 'Downloading video configuration') - - formats = [] - for fcfg in config['sources']: - media_url = fcfg.get('file') - if not media_url: - continue - if fcfg.get('type') == 'm3u8': - 
formats.extend(self._extract_m3u8_formats( - media_url, video_id, 'mp4', entry_protocol='m3u8_native', m3u8_id='hls')) - else: - formats.append({ - 'url': media_url, - 'height': int_or_none(fcfg.get('height')), - 'format_id': fcfg.get('label'), - 'ext': self._search_regex( - r'filename=.*\.([a-z0-9_A-Z]+)&', media_url, - 'file extension', default=None) or fcfg.get('type'), - }) - - title = self._og_search_title(webpage) - description = self._og_search_description(webpage) - thumbnail = config.get('image') - view_count = int_or_none(self._search_regex( - r'<h5>Plays: ([0-9]+)</h5>', webpage, 'view counts')) - uploader = self._search_regex( - r'<div class="reviewer">\s*<img alt="([^"]+)"', webpage, 'uploader', - fatal=False) - upload_date = unified_strdate(self._search_regex( - r'<h5>Reviewed on ([0-9/.]+)</h5>', webpage, 'upload date', - fatal=False), day_first=False) - - return { - 'id': video_id, - 'formats': formats, - 'title': title, - 'description': description, - 'view_count': view_count, - 'thumbnail': thumbnail, - 'uploader': uploader, - 'upload_date': upload_date, - } diff --git a/yt_dlp/extractor/extremetube.py b/yt_dlp/extractor/extremetube.py deleted file mode 100644 index 2c1969899..000000000 --- a/yt_dlp/extractor/extremetube.py +++ /dev/null @@ -1,48 +0,0 @@ -from ..utils import str_to_int -from .keezmovies import KeezMoviesIE - - -class ExtremeTubeIE(KeezMoviesIE): # XXX: Do not subclass from concrete IE - _VALID_URL = r'https?://(?:www\.)?extremetube\.com/(?:[^/]+/)?video/(?P<id>[^/#?&]+)' - _TESTS = [{ - 'url': 'http://www.extremetube.com/video/music-video-14-british-euro-brit-european-cumshots-swallow-652431', - 'md5': '92feaafa4b58e82f261e5419f39c60cb', - 'info_dict': { - 'id': 'music-video-14-british-euro-brit-european-cumshots-swallow-652431', - 'ext': 'mp4', - 'title': 'Music Video 14 british euro brit european cumshots swallow', - 'uploader': 'anonim', - 'view_count': int, - 'age_limit': 18, - } - }, { - 'url': 'http://www.extremetube.com/gay/video/abcde-1234', - 'only_matching': True, - }, { - 'url': 'http://www.extremetube.com/video/latina-slut-fucked-by-fat-black-dick', - 'only_matching': True, - }, { - 'url': 'http://www.extremetube.com/video/652431', - 'only_matching': True, - }] - - def _real_extract(self, url): - webpage, info = self._extract_info(url) - - if not info['title']: - info['title'] = self._search_regex( - r'<h1[^>]+title="([^"]+)"[^>]*>', webpage, 'title') - - uploader = self._html_search_regex( - r'Uploaded by:\s*</[^>]+>\s*<a[^>]+>(.+?)</a>', - webpage, 'uploader', fatal=False) - view_count = str_to_int(self._search_regex( - r'Views:\s*</[^>]+>\s*<[^>]+>([\d,\.]+)</', - webpage, 'view count', fatal=False)) - - info.update({ - 'uploader': uploader, - 'view_count': view_count, - }) - - return info diff --git a/yt_dlp/extractor/facebook.py b/yt_dlp/extractor/facebook.py index 50a750d3b..a16a067ab 100644 --- a/yt_dlp/extractor/facebook.py +++ b/yt_dlp/extractor/facebook.py @@ -16,6 +16,7 @@ determine_ext, error_to_compat_str, float_or_none, + format_field, get_element_by_id, get_first, int_or_none, @@ -51,12 +52,12 @@ class FacebookIE(InfoExtractor): )\?(?:.*?)(?:v|video_id|story_fbid)=| [^/]+/videos/(?:[^/]+/)?| [^/]+/posts/| - groups/[^/]+/permalink/| + groups/[^/]+/(?:permalink|posts)/| watchparty/ )| facebook: ) - (?P<id>[0-9]+) + (?P<id>pfbid[A-Za-z0-9]+|\d+) ''' _EMBED_REGEX = [ r'<iframe[^>]+?src=(["\'])(?P<url>https?://www\.facebook\.com/(?:video/embed|plugins/video\.php).+?)\1', @@ -231,6 +232,39 @@ class FacebookIE(InfoExtractor): 
'uploader_id': '100013949973717', }, 'skip': 'Requires logging in', + }, { + # data.node.comet_sections.content.story.attachments[].throwbackStyles.attachment_target_renderer.attachment.target.attachments[].styles.attachment.media + 'url': 'https://www.facebook.com/groups/1645456212344334/posts/3737828833107051/', + 'info_dict': { + 'id': '1569199726448814', + 'ext': 'mp4', + 'title': 'Pence MUST GO!', + 'description': 'Vickie Gentry shared a memory.', + 'timestamp': 1511548260, + 'upload_date': '20171124', + 'uploader': 'Vickie Gentry', + 'uploader_id': 'pfbid0FuZhHCeWDAxWxEbr3yKPFaRstXvRxgsp9uCPG6GjD4J2AitB35NUAuJ4Q75KcjiDl', + 'thumbnail': r're:^https?://.*', + 'duration': 148.435, + }, + }, { + 'url': 'https://www.facebook.com/attn/posts/pfbid0j1Czf2gGDVqeQ8KiMLFm3pWN8GxsQmeRrVhimWDzMuKQoR8r4b1knNsejELmUgyhl', + 'info_dict': { + 'id': '6968553779868435', + 'ext': 'mp4', + 'description': 'md5:2f2fcf93e97ac00244fe64521bbdb0cb', + 'uploader': 'ATTN:', + 'upload_date': '20231207', + 'title': 'ATTN:', + 'duration': 132.675, + 'uploader_id': '100064451419378', + 'view_count': int, + 'thumbnail': r're:^https?://.*', + 'timestamp': 1701975646, + }, + }, { + 'url': 'https://www.facebook.com/story.php?story_fbid=pfbid0Fnzhm8UuzjBYpPMNFzaSpFE9UmLdU4fJN8qTANi1Dmtj5q7DNrL5NERXfsAzDEV7l&id=100073071055552', + 'only_matching': True, }, { 'url': 'https://www.facebook.com/video.php?v=10204634152394104', 'only_matching': True, @@ -420,6 +454,29 @@ def extract_metadata(webpage): r'data-sjs>({.*?ScheduledServerJS.*?})</script>', webpage)] post = traverse_obj(post_data, ( ..., 'require', ..., ..., ..., '__bbox', 'require', ..., ..., ..., '__bbox', 'result', 'data'), expected_type=dict) or [] + + automatic_captions, subtitles = {}, {} + subs_data = traverse_obj(post, (..., 'video', ..., 'attachments', ..., lambda k, v: ( + k == 'media' and str(v['id']) == video_id and v['__typename'] == 'Video'))) + is_video_broadcast = get_first(subs_data, 'is_video_broadcast', expected_type=bool) + captions = get_first(subs_data, 'video_available_captions_locales', 'captions_url') + if url_or_none(captions): # if subs_data only had a 'captions_url' + locale = self._html_search_meta(['og:locale', 'twitter:locale'], webpage, 'locale', default='en_US') + subtitles[locale] = [{'url': captions}] + # or else subs_data had 'video_available_captions_locales', a list of dicts + for caption in traverse_obj(captions, ( + {lambda x: sorted(x, key=lambda c: c['locale'])}, lambda _, v: v['captions_url']) + ): + lang = caption.get('localized_language') or '' + subs = { + 'url': caption['captions_url'], + 'name': format_field(caption, 'localized_country', f'{lang} (%s)', default=lang), + } + if caption.get('localized_creation_method') or is_video_broadcast: + automatic_captions.setdefault(caption['locale'], []).append(subs) + else: + subtitles.setdefault(caption['locale'], []).append(subs) + media = traverse_obj(post, (..., 'attachments', ..., lambda k, v: ( k == 'media' and str(v['id']) == video_id and v['__typename'] == 'Video')), expected_type=dict) title = get_first(media, ('title', 'text')) @@ -463,6 +520,8 @@ def extract_metadata(webpage): webpage, 'view count', default=None)), 'concurrent_view_count': get_first(post, ( ('video', (..., ..., 'attachments', ..., 'media')), 'liveViewerCount', {int_or_none})), + 'automatic_captions': automatic_captions, + 'subtitles': subtitles, } info_json_ld = self._search_json_ld(webpage, video_id, default={}) @@ -586,9 +645,11 @@ def parse_attachment(attachment, key='media'): nodes = 
variadic(traverse_obj(data, 'nodes', 'node') or []) attachments = traverse_obj(nodes, ( ..., 'comet_sections', 'content', 'story', (None, 'attached_story'), 'attachments', - ..., ('styles', 'style_type_renderer'), 'attachment'), expected_type=dict) or [] + ..., ('styles', 'style_type_renderer', ('throwbackStyles', 'attachment_target_renderer')), + 'attachment', {dict})) for attachment in attachments: - ns = try_get(attachment, lambda x: x['all_subattachments']['nodes'], list) or [] + ns = traverse_obj(attachment, ('all_subattachments', 'nodes', ..., {dict}), + ('target', 'attachments', ..., 'styles', 'attachment', {dict})) for n in ns: parse_attachment(n) parse_attachment(attachment) @@ -611,7 +672,7 @@ def parse_attachment(attachment, key='media'): if len(entries) > 1: return self.playlist_result(entries, video_id) - video_info = entries[0] + video_info = entries[0] if entries else {'id': video_id} webpage_info = extract_metadata(webpage) # honor precise duration in video info if video_info.get('duration'): diff --git a/yt_dlp/extractor/fc2.py b/yt_dlp/extractor/fc2.py index ba19b6cab..bbc4b5693 100644 --- a/yt_dlp/extractor/fc2.py +++ b/yt_dlp/extractor/fc2.py @@ -2,11 +2,9 @@ from .common import InfoExtractor from ..compat import compat_parse_qs -from ..dependencies import websockets from ..networking import Request from ..utils import ( ExtractorError, - WebSocketsWrapper, js_to_json, traverse_obj, update_url_query, @@ -167,8 +165,6 @@ class FC2LiveIE(InfoExtractor): }] def _real_extract(self, url): - if not websockets: - raise ExtractorError('websockets library is not available. Please install it.', expected=True) video_id = self._match_id(url) webpage = self._download_webpage('https://live.fc2.com/%s/' % video_id, video_id) @@ -199,13 +195,9 @@ def _real_extract(self, url): ws_url = update_url_query(control_server['url'], {'control_token': control_server['control_token']}) playlist_data = None - self.to_screen('%s: Fetching HLS playlist info via WebSocket' % video_id) - ws = WebSocketsWrapper(ws_url, { - 'Cookie': str(self._get_cookies('https://live.fc2.com/'))[12:], + ws = self._request_webpage(Request(ws_url, headers={ 'Origin': 'https://live.fc2.com', - 'Accept': '*/*', - 'User-Agent': self.get_param('http_headers')['User-Agent'], - }) + }), video_id, note='Fetching HLS playlist info via WebSocket') self.write_debug('Sending HLS server request') diff --git a/yt_dlp/extractor/fifa.py b/yt_dlp/extractor/fifa.py index 8b4db3a8a..f604cbd40 100644 --- a/yt_dlp/extractor/fifa.py +++ b/yt_dlp/extractor/fifa.py @@ -8,7 +8,7 @@ class FifaIE(InfoExtractor): - _VALID_URL = r'https?://www.fifa.com/fifaplus/(?P<locale>\w{2})/watch/([^#?]+/)?(?P<id>\w+)' + _VALID_URL = r'https?://www\.fifa\.com/fifaplus/(?P<locale>\w{2})/watch/([^#?]+/)?(?P<id>\w+)' _TESTS = [{ 'url': 'https://www.fifa.com/fifaplus/en/watch/7on10qPcnyLajDDU3ntg6y', 'info_dict': { diff --git a/yt_dlp/extractor/filmmodu.py b/yt_dlp/extractor/filmmodu.py index 9eb550eed..1e793560d 100644 --- a/yt_dlp/extractor/filmmodu.py +++ b/yt_dlp/extractor/filmmodu.py @@ -3,7 +3,7 @@ class FilmmoduIE(InfoExtractor): - _VALID_URL = r'https?://(?:www.)?filmmodu.org/(?P<id>[^/]+-(?:turkce-dublaj-izle|altyazili-izle))' + _VALID_URL = r'https?://(?:www\.)?filmmodu\.org/(?P<id>[^/]+-(?:turkce-dublaj-izle|altyazili-izle))' _TESTS = [{ 'url': 'https://www.filmmodu.org/f9-altyazili-izle', 'md5': 'aeefd955c2a508a5bdaa3bcec8eeb0d4', diff --git a/yt_dlp/extractor/floatplane.py b/yt_dlp/extractor/floatplane.py new file mode 100644 index 
000000000..2cf4d4e64 --- /dev/null +++ b/yt_dlp/extractor/floatplane.py @@ -0,0 +1,268 @@ +import functools + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + OnDemandPagedList, + clean_html, + determine_ext, + format_field, + int_or_none, + join_nonempty, + parse_codecs, + parse_iso8601, + urljoin, +) +from ..utils.traversal import traverse_obj + + +class FloatplaneIE(InfoExtractor): + _VALID_URL = r'https?://(?:(?:www|beta)\.)?floatplane\.com/post/(?P<id>\w+)' + _TESTS = [{ + 'url': 'https://www.floatplane.com/post/2Yf3UedF7C', + 'info_dict': { + 'id': 'yuleLogLTT', + 'ext': 'mp4', + 'display_id': '2Yf3UedF7C', + 'title': '8K Yule Log Fireplace with Crackling Fire Sounds - 10 Hours', + 'description': 'md5:adf2970e0de1c5e3df447818bb0309f6', + 'thumbnail': r're:^https?://.*\.jpe?g$', + 'duration': 36035, + 'comment_count': int, + 'like_count': int, + 'dislike_count': int, + 'release_date': '20191206', + 'release_timestamp': 1575657000, + 'uploader': 'LinusTechTips', + 'uploader_id': '59f94c0bdd241b70349eb72b', + 'uploader_url': 'https://www.floatplane.com/channel/linustechtips/home', + 'channel': 'Linus Tech Tips', + 'channel_id': '63fe42c309e691e4e36de93d', + 'channel_url': 'https://www.floatplane.com/channel/linustechtips/home/main', + 'availability': 'subscriber_only', + }, + 'params': {'skip_download': 'm3u8'}, + }, { + 'url': 'https://www.floatplane.com/post/j2jqG3JmgJ', + 'info_dict': { + 'id': 'j2jqG3JmgJ', + 'title': 'TJM: Does Anyone Care About Avatar: The Way of Water?', + 'description': 'md5:00bf17dc5733e4031e99b7fd6489f274', + 'thumbnail': r're:^https?://.*\.jpe?g$', + 'comment_count': int, + 'like_count': int, + 'dislike_count': int, + 'release_timestamp': 1671915900, + 'release_date': '20221224', + 'uploader': 'LinusTechTips', + 'uploader_id': '59f94c0bdd241b70349eb72b', + 'uploader_url': 'https://www.floatplane.com/channel/linustechtips/home', + 'channel': "They're Just Movies", + 'channel_id': '64135f82fc76ab7f9fbdc876', + 'channel_url': 'https://www.floatplane.com/channel/linustechtips/home/tajm', + 'availability': 'subscriber_only', + }, + 'playlist_count': 2, + }, { + 'url': 'https://www.floatplane.com/post/3tK2tInhoN', + 'info_dict': { + 'id': '3tK2tInhoN', + 'title': 'Extras - How Linus Communicates with Editors (Compensator 4)', + 'description': 'md5:83cd40aae1ce124df33769600c80ca5b', + 'thumbnail': r're:^https?://.*\.jpe?g$', + 'comment_count': int, + 'like_count': int, + 'dislike_count': int, + 'release_timestamp': 1700529120, + 'release_date': '20231121', + 'uploader': 'LinusTechTips', + 'uploader_id': '59f94c0bdd241b70349eb72b', + 'uploader_url': 'https://www.floatplane.com/channel/linustechtips/home', + 'channel': 'FP Exclusives', + 'channel_id': '6413623f5b12cca228a28e78', + 'channel_url': 'https://www.floatplane.com/channel/linustechtips/home/fpexclusive', + 'availability': 'subscriber_only', + }, + 'playlist_count': 2, + }, { + 'url': 'https://beta.floatplane.com/post/d870PEFXS1', + 'info_dict': { + 'id': 'bg9SuYKEww', + 'ext': 'mp4', + 'display_id': 'd870PEFXS1', + 'title': 'LCS Drama, TLOU 2 Remaster, Destiny 2 Player Count Drops, + More!', + 'description': 'md5:80d612dcabf41b17487afcbe303ec57d', + 'thumbnail': r're:^https?://.*\.jpe?g$', + 'release_timestamp': 1700622000, + 'release_date': '20231122', + 'duration': 513, + 'like_count': int, + 'dislike_count': int, + 'comment_count': int, + 'uploader': 'LinusTechTips', + 'uploader_id': '59f94c0bdd241b70349eb72b', + 'uploader_url': 
'https://www.floatplane.com/channel/linustechtips/home', + 'channel': 'GameLinked', + 'channel_id': '649dbade3540dbc3945eeda7', + 'channel_url': 'https://www.floatplane.com/channel/linustechtips/home/gamelinked', + 'availability': 'subscriber_only', + }, + 'params': {'skip_download': 'm3u8'}, + }] + + def _real_initialize(self): + if not self._get_cookies('https://www.floatplane.com').get('sails.sid'): + self.raise_login_required() + + def _real_extract(self, url): + post_id = self._match_id(url) + + post_data = self._download_json( + 'https://www.floatplane.com/api/v3/content/post', post_id, query={'id': post_id}, + note='Downloading post data', errnote='Unable to download post data') + + if not any(traverse_obj(post_data, ('metadata', ('hasVideo', 'hasAudio')))): + raise ExtractorError('Post does not contain a video or audio track', expected=True) + + items = [] + for media in traverse_obj(post_data, (('videoAttachments', 'audioAttachments'), ...)): + media_id = media['id'] + media_typ = media.get('type') or 'video' + + metadata = self._download_json( + f'https://www.floatplane.com/api/v3/content/{media_typ}', media_id, query={'id': media_id}, + note=f'Downloading {media_typ} metadata') + + stream = self._download_json( + 'https://www.floatplane.com/api/v2/cdn/delivery', media_id, query={ + 'type': 'vod' if media_typ == 'video' else 'aod', + 'guid': metadata['guid'] + }, note=f'Downloading {media_typ} stream data') + + path_template = traverse_obj(stream, ('resource', 'uri', {str})) + + def format_path(params): + path = path_template + for i, val in (params or {}).items(): + path = path.replace(f'{{qualityLevelParams.{i}}}', val) + return path + + formats = [] + for quality in traverse_obj(stream, ('resource', 'data', 'qualityLevels', ...)): + url = urljoin(stream['cdn'], format_path(traverse_obj( + stream, ('resource', 'data', 'qualityLevelParams', quality['name'])))) + formats.append({ + **traverse_obj(quality, { + 'format_id': 'name', + 'format_note': 'label', + 'width': ('width', {int}), + 'height': ('height', {int}), + }), + **parse_codecs(quality.get('codecs')), + 'url': url, + 'ext': determine_ext(url.partition('/chunk.m3u8')[0], 'mp4'), + }) + + items.append({ + 'id': media_id, + **traverse_obj(metadata, { + 'title': 'title', + 'duration': ('duration', {int_or_none}), + 'thumbnail': ('thumbnail', 'path'), + }), + 'formats': formats, + }) + + uploader_url = format_field( + post_data, [('creator', 'urlname')], 'https://www.floatplane.com/channel/%s/home') or None + channel_url = urljoin(f'{uploader_url}/', traverse_obj(post_data, ('channel', 'urlname'))) + + post_info = { + 'id': post_id, + 'display_id': post_id, + **traverse_obj(post_data, { + 'title': 'title', + 'description': ('text', {clean_html}), + 'uploader': ('creator', 'title'), + 'uploader_id': ('creator', 'id'), + 'channel': ('channel', 'title'), + 'channel_id': ('channel', 'id'), + 'like_count': ('likes', {int_or_none}), + 'dislike_count': ('dislikes', {int_or_none}), + 'comment_count': ('comments', {int_or_none}), + 'release_timestamp': ('releaseDate', {parse_iso8601}), + 'thumbnail': ('thumbnail', 'path'), + }), + 'uploader_url': uploader_url, + 'channel_url': channel_url, + 'availability': self._availability(needs_subscription=True), + } + + if len(items) > 1: + return self.playlist_result(items, **post_info) + + post_info.update(items[0]) + return post_info + + +class FloatplaneChannelIE(InfoExtractor): + _VALID_URL = r'https?://(?:(?:www|beta)\.)?floatplane\.com/channel/(?P<id>[\w-]+)/home(?:/(?P<channel>[\w-]+))?' 
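# Illustrative sketch (not part of this patch): FloatplaneChannelIE pages
# through posts with OnDemandPagedList, which takes a 0-indexed page callback
# plus a page size and only calls the callback for pages actually consumed
# (e.g. via --playlist-items); see _fetch_page/_real_extract below. Minimal
# demo of that contract, with fake_posts standing in for the creator API:
#
#     import functools
#     from yt_dlp.utils import OnDemandPagedList
#
#     PAGE_SIZE = 20
#     fake_posts = [{'id': f'post{i}'} for i in range(55)]
#
#     def fetch_page(posts, page):  # page is 0, 1, 2, ...
#         for post in posts[page * PAGE_SIZE:(page + 1) * PAGE_SIZE]:
#             yield {'id': post['id']}
#
#     entries = OnDemandPagedList(functools.partial(fetch_page, fake_posts), PAGE_SIZE)
#     assert len(list(entries)) == 55  # three pages, fetched lazily on demand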
+ _PAGE_SIZE = 20 + _TESTS = [{ + 'url': 'https://www.floatplane.com/channel/linustechtips/home/ltxexpo', + 'info_dict': { + 'id': 'linustechtips/ltxexpo', + 'title': 'LTX Expo', + 'description': 'md5:9819002f9ebe7fd7c75a3a1d38a59149', + }, + 'playlist_mincount': 51, + }, { + 'url': 'https://www.floatplane.com/channel/ShankMods/home', + 'info_dict': { + 'id': 'ShankMods', + 'title': 'Shank Mods', + 'description': 'md5:6dff1bb07cad8e5448e04daad9be1b30', + }, + 'playlist_mincount': 14, + }, { + 'url': 'https://beta.floatplane.com/channel/bitwit_ultra/home', + 'info_dict': { + 'id': 'bitwit_ultra', + 'title': 'Bitwit Ultra', + 'description': 'md5:1452f280bb45962976d4789200f676dd', + }, + 'playlist_mincount': 200, + }] + + def _fetch_page(self, display_id, creator_id, channel_id, page): + query = { + 'id': creator_id, + 'limit': self._PAGE_SIZE, + 'fetchAfter': page * self._PAGE_SIZE, + } + if channel_id: + query['channel'] = channel_id + page_data = self._download_json( + 'https://www.floatplane.com/api/v3/content/creator', display_id, + query=query, note=f'Downloading page {page + 1}') + for post in page_data or []: + yield self.url_result( + f'https://www.floatplane.com/post/{post["id"]}', + FloatplaneIE, id=post['id'], title=post.get('title'), + release_timestamp=parse_iso8601(post.get('releaseDate'))) + + def _real_extract(self, url): + creator, channel = self._match_valid_url(url).group('id', 'channel') + display_id = join_nonempty(creator, channel, delim='/') + + creator_data = self._download_json( + 'https://www.floatplane.com/api/v3/creator/named', + display_id, query={'creatorURL[0]': creator})[0] + + channel_data = traverse_obj( + creator_data, ('channels', lambda _, v: v['urlname'] == channel), get_all=False) or {} + + return self.playlist_result(OnDemandPagedList(functools.partial( + self._fetch_page, display_id, creator_data['id'], channel_data.get('id')), self._PAGE_SIZE), + display_id, title=channel_data.get('title') or creator_data.get('title'), + description=channel_data.get('about') or creator_data.get('about')) diff --git a/yt_dlp/extractor/fourzerostudio.py b/yt_dlp/extractor/fourzerostudio.py deleted file mode 100644 index c388a3a07..000000000 --- a/yt_dlp/extractor/fourzerostudio.py +++ /dev/null @@ -1,106 +0,0 @@ -from .common import InfoExtractor -from ..utils import traverse_obj, unified_timestamp - - -class FourZeroStudioArchiveIE(InfoExtractor): - _VALID_URL = r'https?://0000\.studio/(?P<uploader_id>[^/]+)/broadcasts/(?P<id>[^/]+)/archive' - IE_NAME = '0000studio:archive' - _TESTS = [{ - 'url': 'https://0000.studio/mumeijiten/broadcasts/1290f433-fce0-4909-a24a-5f7df09665dc/archive', - 'info_dict': { - 'id': '1290f433-fce0-4909-a24a-5f7df09665dc', - 'title': 'noteで『canape』様へのファンレターを執筆します。(数秘術その2)', - 'timestamp': 1653802534, - 'release_timestamp': 1653796604, - 'thumbnails': 'count:1', - 'comments': 'count:7', - 'uploader': '『中崎雄心』の執務室。', - 'uploader_id': 'mumeijiten', - } - }] - - def _real_extract(self, url): - video_id, uploader_id = self._match_valid_url(url).group('id', 'uploader_id') - webpage = self._download_webpage(url, video_id) - nuxt_data = self._search_nuxt_data(webpage, video_id, traverse=None) - - pcb = traverse_obj(nuxt_data, ('ssrRefs', lambda _, v: v['__typename'] == 'PublicCreatorBroadcast'), get_all=False) - uploader_internal_id = traverse_obj(nuxt_data, ( - 'ssrRefs', lambda _, v: v['__typename'] == 'PublicUser', 'id'), get_all=False) - - formats, subs = self._extract_m3u8_formats_and_subtitles(pcb['archiveUrl'], video_id, ext='mp4') - - return { 
- 'id': video_id, - 'title': pcb.get('title'), - 'age_limit': 18 if pcb.get('isAdult') else None, - 'timestamp': unified_timestamp(pcb.get('finishTime')), - 'release_timestamp': unified_timestamp(pcb.get('createdAt')), - 'thumbnails': [{ - 'url': pcb['thumbnailUrl'], - 'ext': 'png', - }] if pcb.get('thumbnailUrl') else None, - 'formats': formats, - 'subtitles': subs, - 'comments': [{ - 'author': c.get('username'), - 'author_id': c.get('postedUserId'), - 'author_thumbnail': c.get('userThumbnailUrl'), - 'id': c.get('id'), - 'text': c.get('body'), - 'timestamp': unified_timestamp(c.get('createdAt')), - 'like_count': c.get('likeCount'), - 'is_favorited': c.get('isLikedByOwner'), - 'author_is_uploader': c.get('postedUserId') == uploader_internal_id, - } for c in traverse_obj(nuxt_data, ( - 'ssrRefs', ..., lambda _, v: v['__typename'] == 'PublicCreatorBroadcastComment')) or []], - 'uploader_id': uploader_id, - 'uploader': traverse_obj(nuxt_data, ( - 'ssrRefs', lambda _, v: v['__typename'] == 'PublicUser', 'username'), get_all=False), - } - - -class FourZeroStudioClipIE(InfoExtractor): - _VALID_URL = r'https?://0000\.studio/(?P<uploader_id>[^/]+)/archive-clip/(?P<id>[^/]+)' - IE_NAME = '0000studio:clip' - _TESTS = [{ - 'url': 'https://0000.studio/soeji/archive-clip/e46b0278-24cd-40a8-92e1-b8fc2b21f34f', - 'info_dict': { - 'id': 'e46b0278-24cd-40a8-92e1-b8fc2b21f34f', - 'title': 'わたベーさんからイラスト差し入れいただきました。ありがとうございました!', - 'timestamp': 1652109105, - 'like_count': 1, - 'uploader': 'ソエジマケイタ', - 'uploader_id': 'soeji', - } - }] - - def _real_extract(self, url): - video_id, uploader_id = self._match_valid_url(url).group('id', 'uploader_id') - webpage = self._download_webpage(url, video_id) - nuxt_data = self._search_nuxt_data(webpage, video_id, traverse=None) - - clip_info = traverse_obj(nuxt_data, ('ssrRefs', lambda _, v: v['__typename'] == 'PublicCreatorArchivedClip'), get_all=False) - - info = next(( - m for m in self._parse_html5_media_entries(url, webpage, video_id) - if 'mp4' in traverse_obj(m, ('formats', ..., 'ext')) - ), None) - if not info: - self.report_warning('Failed to find a desired media element. 
Falling back to using NUXT data.') - info = { - 'formats': [{ - 'ext': 'mp4', - 'url': url, - } for url in clip_info.get('mediaFiles') or [] if url], - } - return { - **info, - 'id': video_id, - 'title': clip_info.get('clipComment'), - 'timestamp': unified_timestamp(clip_info.get('createdAt')), - 'like_count': clip_info.get('likeCount'), - 'uploader_id': uploader_id, - 'uploader': traverse_obj(nuxt_data, ( - 'ssrRefs', lambda _, v: v['__typename'] == 'PublicUser', 'username'), get_all=False), - } diff --git a/yt_dlp/extractor/foxgay.py b/yt_dlp/extractor/foxgay.py deleted file mode 100644 index f4f29c65d..000000000 --- a/yt_dlp/extractor/foxgay.py +++ /dev/null @@ -1,58 +0,0 @@ -import itertools - -from .common import InfoExtractor -from ..utils import ( - get_element_by_id, - int_or_none, - remove_end, -) - - -class FoxgayIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?foxgay\.com/videos/(?:\S+-)?(?P<id>\d+)\.shtml' - _TEST = { - 'url': 'http://foxgay.com/videos/fuck-turkish-style-2582.shtml', - 'md5': '344558ccfea74d33b7adbce22e577f54', - 'info_dict': { - 'id': '2582', - 'ext': 'mp4', - 'title': 'Fuck Turkish-style', - 'description': 'md5:6ae2d9486921891efe89231ace13ffdf', - 'age_limit': 18, - 'thumbnail': r're:https?://.*\.jpg$', - }, - } - - def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - - title = remove_end(self._html_extract_title(webpage), ' - Foxgay.com') - description = get_element_by_id('inf_tit', webpage) - - # The default user-agent with foxgay cookies leads to pages without videos - self.cookiejar.clear('.foxgay.com') - # Find the URL for the iFrame which contains the actual video. - iframe_url = self._html_search_regex( - r'<iframe[^>]+src=([\'"])(?P<url>[^\'"]+)\1', webpage, - 'video frame', group='url') - iframe = self._download_webpage( - iframe_url, video_id, headers={'User-Agent': 'curl/7.50.1'}, - note='Downloading video frame') - video_data = self._parse_json(self._search_regex( - r'video_data\s*=\s*([^;]+);', iframe, 'video data'), video_id) - - formats = [{ - 'url': source, - 'height': int_or_none(resolution), - } for source, resolution in zip( - video_data['sources'], video_data.get('resolutions', itertools.repeat(None)))] - - return { - 'id': video_id, - 'title': title, - 'formats': formats, - 'description': description, - 'thumbnail': video_data.get('act_vid', {}).get('thumb'), - 'age_limit': 18, - } diff --git a/yt_dlp/extractor/francetv.py b/yt_dlp/extractor/francetv.py index 052317204..0ceecde74 100644 --- a/yt_dlp/extractor/francetv.py +++ b/yt_dlp/extractor/francetv.py @@ -1,12 +1,14 @@ from .common import InfoExtractor +from .dailymotion import DailymotionIE from ..utils import ( - determine_ext, ExtractorError, + determine_ext, format_field, + int_or_none, + join_nonempty, parse_iso8601, parse_qs, ) -from .dailymotion import DailymotionIE class FranceTVBaseInfoExtractor(InfoExtractor): @@ -82,6 +84,8 @@ def _extract_video(self, video_id, catalogue=None): videos = [] title = None subtitle = None + episode_number = None + season_number = None image = None duration = None timestamp = None @@ -112,7 +116,9 @@ def _extract_video(self, video_id, catalogue=None): if meta: if title is None: title = meta.get('title') - # XXX: what is meta['pre_title']? 
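# Illustrative sketch (not part of this patch): the replacement hunk below
# resolves the old XXX comment: meta['pre_title'] carries the series marker
# in the form 'S<ID> E<ID>', and both numbers are pulled in a single
# _search_regex call via a tuple `group`. Plain-re equivalent, with an
# assumed sample value rather than real API output:
#
#     import re
#     pre_title = 'S1 E1'
#     m = re.search(r'S(\d+)\s*E(\d+)', pre_title or '')
#     season_number, episode_number = m.groups() if m else (None, None)
#     assert (season_number, episode_number) == ('1', '1')
#     # int_or_none() in the return dict later coerces these strings to ints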
+ # meta['pre_title'] contains season and episode number for series in format "S<ID> E<ID>" + season_number, episode_number = self._search_regex( + r'S(\d+)\s*E(\d+)', meta.get('pre_title'), 'episode info', group=(1, 2), default=(None, None)) if subtitle is None: subtitle = meta.get('additional_title') if image is None: @@ -191,19 +197,19 @@ def _extract_video(self, video_id, catalogue=None): } for sheet in spritesheets] }) - if subtitle: - title += ' - %s' % subtitle - title = title.strip() - return { 'id': video_id, - 'title': title, + 'title': join_nonempty(title, subtitle, delim=' - ').strip(), 'thumbnail': image, 'duration': duration, 'timestamp': timestamp, 'is_live': is_live, 'formats': formats, 'subtitles': subtitles, + 'episode': subtitle if episode_number else None, + 'series': title if episode_number else None, + 'episode_number': int_or_none(episode_number), + 'season_number': int_or_none(season_number), } def _real_extract(self, url): @@ -230,14 +236,31 @@ class FranceTVSiteIE(FranceTVBaseInfoExtractor): 'id': 'ec217ecc-0733-48cf-ac06-af1347b849d1', 'ext': 'mp4', 'title': '13h15, le dimanche... - Les mystères de Jésus', - 'description': 'md5:75efe8d4c0a8205e5904498ffe1e1a42', 'timestamp': 1502623500, + 'duration': 2580, + 'thumbnail': r're:^https?://.*\.jpg$', 'upload_date': '20170813', }, 'params': { 'skip_download': True, }, 'add_ie': [FranceTVIE.ie_key()], + }, { + 'url': 'https://www.france.tv/enfants/six-huit-ans/foot2rue/saison-1/3066387-duel-au-vieux-port.html', + 'info_dict': { + 'id': 'a9050959-eedd-4b4a-9b0d-de6eeaa73e44', + 'ext': 'mp4', + 'title': 'Foot2Rue - Duel au vieux port', + 'episode': 'Duel au vieux port', + 'series': 'Foot2Rue', + 'episode_number': 1, + 'season_number': 1, + 'timestamp': 1642761360, + 'upload_date': '20220121', + 'season': 'Season 1', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 1441, + }, }, { # france3 'url': 'https://www.france.tv/france-3/des-chiffres-et-des-lettres/139063-emission-du-mardi-9-mai-2017.html', diff --git a/yt_dlp/extractor/fusion.py b/yt_dlp/extractor/fusion.py deleted file mode 100644 index 689422fca..000000000 --- a/yt_dlp/extractor/fusion.py +++ /dev/null @@ -1,81 +0,0 @@ -from .common import InfoExtractor -from ..utils import ( - determine_ext, - int_or_none, - mimetype2ext, - parse_iso8601, -) - - -class FusionIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?fusion\.(?:net|tv)/(?:video/|show/.+?\bvideo=)(?P<id>\d+)' - _TESTS = [{ - 'url': 'http://fusion.tv/video/201781/u-s-and-panamanian-forces-work-together-to-stop-a-vessel-smuggling-drugs/', - 'info_dict': { - 'id': '3145868', - 'ext': 'mp4', - 'title': 'U.S. 
and Panamanian forces work together to stop a vessel smuggling drugs', - 'description': 'md5:0cc84a9943c064c0f46b128b41b1b0d7', - 'duration': 140.0, - 'timestamp': 1442589635, - 'uploader': 'UNIVISON', - 'upload_date': '20150918', - }, - 'params': { - 'skip_download': True, - }, - 'add_ie': ['Anvato'], - }, { - 'url': 'http://fusion.tv/video/201781', - 'only_matching': True, - }, { - 'url': 'https://fusion.tv/show/food-exposed-with-nelufar-hedayat/?ancla=full-episodes&video=588644', - 'only_matching': True, - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - video = self._download_json( - 'https://platform.fusion.net/wp-json/fusiondotnet/v1/video/' + video_id, video_id) - - info = { - 'id': video_id, - 'title': video['title'], - 'description': video.get('excerpt'), - 'timestamp': parse_iso8601(video.get('published')), - 'series': video.get('show'), - } - - formats = [] - src = video.get('src') or {} - for f_id, f in src.items(): - for q_id, q in f.items(): - q_url = q.get('url') - if not q_url: - continue - ext = determine_ext(q_url, mimetype2ext(q.get('type'))) - if ext == 'smil': - formats.extend(self._extract_smil_formats(q_url, video_id, fatal=False)) - elif f_id == 'm3u8-variant' or (ext == 'm3u8' and q_id == 'Variant'): - formats.extend(self._extract_m3u8_formats( - q_url, video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False)) - else: - formats.append({ - 'format_id': '-'.join([f_id, q_id]), - 'url': q_url, - 'width': int_or_none(q.get('width')), - 'height': int_or_none(q.get('height')), - 'tbr': int_or_none(self._search_regex(r'_(\d+)\.m(?:p4|3u8)', q_url, 'bitrate')), - 'ext': 'mp4' if ext == 'm3u8' else ext, - 'protocol': 'm3u8_native' if ext == 'm3u8' else 'https', - }) - if formats: - info['formats'] = formats - else: - info.update({ - '_type': 'url', - 'url': 'anvato:uni:' + video['video_ids']['anvato'], - 'ie_key': 'Anvato', - }) - - return info diff --git a/yt_dlp/extractor/generic.py b/yt_dlp/extractor/generic.py index 5e1240c13..1f0011c09 100644 --- a/yt_dlp/extractor/generic.py +++ b/yt_dlp/extractor/generic.py @@ -17,6 +17,7 @@ determine_protocol, dict_get, extract_basic_auth, + filter_dict, format_field, int_or_none, is_html, @@ -35,6 +36,7 @@ unsmuggle_url, update_url_query, url_or_none, + urlhandle_detect_ext, urljoin, variadic, xpath_attr, @@ -372,46 +374,6 @@ class GenericIE(InfoExtractor): }, 'skip': 'There is a limit of 200 free downloads / month for the test song', }, - # ooyala video - { - 'url': 'http://www.rollingstone.com/music/videos/norwegian-dj-cashmere-cat-goes-spartan-on-with-me-premiere-20131219', - 'md5': '166dd577b433b4d4ebfee10b0824d8ff', - 'info_dict': { - 'id': 'BwY2RxaTrTkslxOfcan0UCf0YqyvWysJ', - 'ext': 'mp4', - 'title': '2cc213299525360.mov', # that's what we get - 'duration': 238.231, - }, - 'add_ie': ['Ooyala'], - }, - { - # ooyala video embedded with http://player.ooyala.com/iframe.js - 'url': 'http://www.macrumors.com/2015/07/24/steve-jobs-the-man-in-the-machine-first-trailer/', - 'info_dict': { - 'id': 'p0MGJndjoG5SOKqO_hZJuZFPB-Tr5VgB', - 'ext': 'mp4', - 'title': '"Steve Jobs: Man in the Machine" trailer', - 'description': 'The first trailer for the Alex Gibney documentary "Steve Jobs: Man in the Machine."', - 'duration': 135.427, - }, - 'params': { - 'skip_download': True, - }, - 'skip': 'movie expired', - }, - # ooyala video embedded with http://player.ooyala.com/static/v4/production/latest/core.min.js - { - 'url': 'http://wnep.com/2017/07/22/steampunk-fest-comes-to-honesdale/', - 'info_dict': { - 'id': 
'lwYWYxYzE6V5uJMjNGyKtwwiw9ZJD7t2', - 'ext': 'mp4', - 'title': 'Steampunk Fest Comes to Honesdale', - 'duration': 43.276, - }, - 'params': { - 'skip_download': True, - } - }, # embed.ly video { 'url': 'http://www.tested.com/science/weird/460206-tested-grinding-coffee-2000-frames-second/', @@ -504,7 +466,8 @@ class GenericIE(InfoExtractor): 'title': 'Ужастики, русский трейлер (2015)', 'thumbnail': r're:^https?://.*\.jpg$', 'duration': 153, - } + }, + 'skip': 'Site dead', }, # XHamster embed { @@ -776,14 +739,16 @@ class GenericIE(InfoExtractor): 'playlist_mincount': 1, 'add_ie': ['Youtube'], }, - # Cinchcast embed + # Libsyn embed { 'url': 'http://undergroundwellness.com/podcasts/306-5-steps-to-permanent-gut-healing/', 'info_dict': { - 'id': '7141703', + 'id': '3793998', 'ext': 'mp3', 'upload_date': '20141126', - 'title': 'Jack Tips: 5 Steps to Permanent Gut Healing', + 'title': 'Underground Wellness Radio - Jack Tips: 5 Steps to Permanent Gut Healing', + 'thumbnail': 'https://assets.libsyn.com/secure/item/3793998/?height=90&width=90', + 'duration': 3989.0, } }, # Cinerama player @@ -1565,16 +1530,6 @@ class GenericIE(InfoExtractor): 'title': 'Стас Намин: «Мы нарушили девственность Кремля»', }, }, - { - # vzaar embed - 'url': 'http://help.vzaar.com/article/165-embedding-video', - 'md5': '7e3919d9d2620b89e3e00bec7fe8c9d4', - 'info_dict': { - 'id': '8707641', - 'ext': 'mp4', - 'title': 'Building A Business Online: Principal Chairs Q & A', - }, - }, { # multiple HTML5 videos on one page 'url': 'https://www.paragon-software.com/home/rk-free/keyscenarios.html', @@ -2434,10 +2389,10 @@ def _real_extract(self, url): # to accept raw bytes and being able to download only a chunk. # It may probably better to solve this by checking Content-Type for application/octet-stream # after a HEAD request, but not sure if we can rely on this. 
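# Illustrative sketch (not part of this patch): the hunk below keeps relying
# on this Content-Type sniff, so the direct-link branch only fires for
# audio/*, video/* and the application/* types the regex admits (ogg and the
# *mpegurl HLS variants). Standalone probe of that gate, using the same
# pattern as the code that follows:
#
#     import re
#     CT_RE = (r'^(?P<type>audio|video|application(?=/(?:ogg$'
#              r'|(?:vnd\.apple\.|x-)?mpegurl)))/(?P<format_id>[^;\s]+)')
#     for ct in ('video/mp4', 'application/vnd.apple.mpegurl', 'application/json'):
#         m = re.match(CT_RE, ct)
#         print(ct, '->', m and m.group('format_id'))
#     # video/mp4 -> mp4
#     # application/vnd.apple.mpegurl -> vnd.apple.mpegurl (handled as HLS)
#     # application/json -> None (not a direct link; normal extraction continues)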
-        full_response = self._request_webpage(url, video_id, headers={
+        full_response = self._request_webpage(url, video_id, headers=filter_dict({
             'Accept-Encoding': 'identity',
-            **smuggled_data.get('http_headers', {})
-        })
+            'Referer': smuggled_data.get('referer'),
+        }))
         new_url = full_response.url
         url = urllib.parse.urlparse(url)._replace(scheme=urllib.parse.urlparse(new_url).scheme).geturl()
         if new_url != extract_basic_auth(url)[0]:
@@ -2457,9 +2412,9 @@ def _real_extract(self, url):
         m = re.match(r'^(?P<type>audio|video|application(?=/(?:ogg$|(?:vnd\.apple\.|x-)?mpegurl)))/(?P<format_id>[^;\s]+)', content_type)
         if m:
             self.report_detected('direct video link')
-            headers = smuggled_data.get('http_headers', {})
+            headers = filter_dict({'Referer': smuggled_data.get('referer')})
             format_id = str(m.group('format_id'))
-            ext = determine_ext(url)
+            ext = determine_ext(url, default_ext=None) or urlhandle_detect_ext(full_response)
             subtitles = {}
             if format_id.endswith('mpegurl') or ext == 'm3u8':
                 formats, subtitles = self._extract_m3u8_formats_and_subtitles(url, video_id, 'mp4', headers=headers)
@@ -2471,6 +2426,7 @@ def _real_extract(self, url):
                 formats = [{
                     'format_id': format_id,
                     'url': url,
+                    'ext': ext,
                     'vcodec': 'none' if m.group('type') == 'audio' else None
                 }]
             info_dict['direct'] = True
@@ -2708,7 +2664,7 @@ def _extract_embeds(self, url, webpage, *, urlh=None, info_dict={}):
                 'url': smuggle_url(json_ld['url'], {
                     'force_videoid': video_id,
                     'to_generic': True,
-                    'http_headers': {'Referer': url},
+                    'referer': url,
                 }),
             }, json_ld)]
diff --git a/yt_dlp/extractor/getcourseru.py b/yt_dlp/extractor/getcourseru.py
new file mode 100644
index 000000000..6fdbcd736
--- /dev/null
+++ b/yt_dlp/extractor/getcourseru.py
@@ -0,0 +1,179 @@
+import re
+import time
+import urllib.parse
+
+from .common import InfoExtractor
+from ..utils import ExtractorError, int_or_none, url_or_none, urlencode_postdata
+from ..utils.traversal import traverse_obj
+
+
+class GetCourseRuPlayerIE(InfoExtractor):
+    _VALID_URL = r'https?://player02\.getcourse\.ru/sign-player/?\?(?:[^#]+&)?json=[^#&]+'
+    _EMBED_REGEX = [rf'<iframe[^>]+\bsrc=[\'"](?P<url>{_VALID_URL}[^\'"]*)']
+    _TESTS = [{
+        'url': 'http://player02.getcourse.ru/sign-player/?json=eyJ2aWRlb19oYXNoIjoiMTkwYmRmOTNmMWIyOTczNTMwOTg1M2E3YTE5ZTI0YjMiLCJ1c2VyX2lkIjozNTk1MjUxODMsInN1Yl9sb2dpbl91c2VyX2lkIjpudWxsLCJsZXNzb25faWQiOm51bGwsImlwIjoiNDYuMTQyLjE4Mi4yNDciLCJnY19ob3N0IjoiYWNhZGVteW1lbC5vbmxpbmUiLCJ0aW1lIjoxNzA1NDQ5NjQyLCJwYXlsb2FkIjoidV8zNTk1MjUxODMiLCJ1aV9sYW5ndWFnZSI6InJ1IiwiaXNfaGF2ZV9jdXN0b21fc3R5bGUiOnRydWV9&s=354ad2c993d95d5ac629e3133d6cefea&vh-static-feature=zigzag',
+        'info_dict': {
+            'id': '513573381',
+            'title': '190bdf93f1b29735309853a7a19e24b3',
+            'ext': 'mp4',
+            'thumbnail': 'https://preview-htz.kinescopecdn.net/preview/190bdf93f1b29735309853a7a19e24b3/preview.jpg?version=1702370546&host=vh-80',
+            'duration': 1693
+        },
+        'skip': 'JWT expired',
+    }]
+
+    def _real_extract(self, url):
+        webpage = self._download_webpage(url, None, 'Downloading player page')
+        window_configs = self._search_json(
+            r'window\.configs\s*=', webpage, 'config', None)
+        video_id = str(window_configs['gcFileId'])
+        formats, subtitles = self._extract_m3u8_formats_and_subtitles(
+            window_configs['masterPlaylistUrl'], video_id)
+
+        return {
+            **traverse_obj(window_configs, {
+                'title': ('videoHash', {str}),
+                'thumbnail': ('previewUrl', {url_or_none}),
+                'duration': ('videoDuration', {int_or_none}),
+            }),
+            'id': video_id,
+            'formats': formats,
+            'subtitles': subtitles
+        }
+
+
+class GetCourseRuIE(InfoExtractor):
+    _NETRC_MACHINE = 'getcourseru'
+    _DOMAINS = [
+        'academymel.online',
+        'marafon.mani-beauty.com',
+        'on.psbook.ru'
+    ]
+    _BASE_URL_RE = rf'https?://(?:(?!player02\.)[^.]+\.getcourse\.(?:ru|io)|{"|".join(map(re.escape, _DOMAINS))})'
+    _VALID_URL = [
+        rf'{_BASE_URL_RE}/(?!pl/|teach/)(?P<id>[^?#]+)',
+        rf'{_BASE_URL_RE}/(?:pl/)?teach/control/lesson/view\?(?:[^#]+&)?id=(?P<id>\d+)',
+    ]
+    _TESTS = [{
+        'url': 'http://academymel.online/3video_1',
+        'info_dict': {
+            'id': '3059742',
+            'display_id': '3video_1',
+            'title': 'Промоуроки Академии МЕЛ',
+        },
+        'playlist_count': 1,
+        'playlist': [{
+            'info_dict': {
+                'id': '513573381',
+                'ext': 'mp4',
+                'title': 'Промоуроки Академии МЕЛ',
+                'thumbnail': 'https://preview-htz.kinescopecdn.net/preview/190bdf93f1b29735309853a7a19e24b3/preview.jpg?version=1702370546&host=vh-80',
+                'duration': 1693
+            },
+        }]
+    }, {
+        'url': 'https://academymel.getcourse.ru/3video_1',
+        'info_dict': {
+            'id': '3059742',
+            'display_id': '3video_1',
+            'title': 'Промоуроки Академии МЕЛ',
+        },
+        'playlist_count': 1,
+        'playlist': [{
+            'info_dict': {
+                'id': '513573381',
+                'ext': 'mp4',
+                'title': 'Промоуроки Академии МЕЛ',
+                'thumbnail': 'https://preview-htz.kinescopecdn.net/preview/190bdf93f1b29735309853a7a19e24b3/preview.jpg?version=1702370546&host=vh-80',
+                'duration': 1693
+            },
+        }]
+    }, {
+        'url': 'https://academymel.getcourse.ru/pl/teach/control/lesson/view?id=319141781&editMode=0',
+        'info_dict': {
+            'id': '319141781',
+            'title': '1. Разминка у стены',
+        },
+        'playlist_count': 1,
+        'playlist': [{
+            'info_dict': {
+                'id': '4919601',
+                'ext': 'mp4',
+                'title': '1. Разминка у стены',
+                'thumbnail': 'https://preview-htz.vhcdn.com/preview/5a521788e7dc25b4f70c3dff6512d90e/preview.jpg?version=1703223532&host=vh-81',
+                'duration': 704
+            },
+        }],
+        'skip': 'paid lesson'
+    }, {
+        'url': 'https://manibeauty.getcourse.ru/pl/teach/control/lesson/view?id=272499894',
+        'info_dict': {
+            'id': '272499894',
+            'title': 'Мотивация к тренировкам',
+        },
+        'playlist_count': 1,
+        'playlist': [{
+            'info_dict': {
+                'id': '447479687',
+                'ext': 'mp4',
+                'title': 'Мотивация к тренировкам',
+                'thumbnail': 'https://preview-htz.vhcdn.com/preview/70ed5b9f489dd03b4aff55bfdff71a26/preview.jpg?version=1685115787&host=vh-71',
+                'duration': 30
+            },
+        }],
+        'skip': 'paid lesson'
+    }, {
+        'url': 'https://gaismasmandalas.getcourse.io/ATLAUTSEVBUT',
+        'only_matching': True,
+    }]
+
+    _LOGIN_URL_PATH = '/cms/system/login'
+
+    def _login(self, hostname, username, password):
+        if self._get_cookies(f'https://{hostname}').get('PHPSESSID5'):
+            return
+        login_url = f'https://{hostname}{self._LOGIN_URL_PATH}'
+        webpage = self._download_webpage(login_url, None)
+
+        self._request_webpage(
+            login_url, None, 'Logging in', 'Failed to log in',
+            data=urlencode_postdata({
+                'action': 'processXdget',
+                'xdgetId': self._html_search_regex(
+                    r'<form[^>]+\bclass="[^"]*\bstate-login[^"]*"[^>]+\bdata-xdget-id="([^"]+)"',
+                    webpage, 'xdgetId'),
+                'params[action]': 'login',
+                'params[url]': login_url,
+                'params[object_type]': 'cms_page',
+                'params[object_id]': -1,
+                'params[email]': username,
+                'params[password]': password,
+                'requestTime': int(time.time()),
+                'requestSimpleSign': self._html_search_regex(
+                    r'window\.requestSimpleSign\s*=\s*"([\da-f]+)"', webpage, 'simple sign'),
+            }))
+
+    def _real_extract(self, url):
+        hostname = urllib.parse.urlparse(url).hostname
+        username, password = self._get_login_info(netrc_machine=hostname)
+        if username:
+            self._login(hostname, username, password)
+
+        display_id = self._match_id(url)
+        # NB: 404 is returned due to yt-dlp not properly following redirects #9020
+        webpage, urlh = self._download_webpage_handle(url, display_id, expected_status=404)
+        if self._LOGIN_URL_PATH in urlh.url or urlh.status == 404:
+            raise ExtractorError(
+                f'This video is only available for registered users. {self._login_hint("any", netrc=hostname)}',
+                expected=True)
+
+        playlist_id = self._search_regex(
+            r'window\.(?:lessonId|gcsObjectId)\s*=\s*(\d+)', webpage, 'playlist id', default=display_id)
+        title = self._og_search_title(webpage) or self._html_extract_title(webpage)
+
+        return self.playlist_from_matches(
+            re.findall(GetCourseRuPlayerIE._EMBED_REGEX[0], webpage),
+            playlist_id, title, display_id=display_id, ie=GetCourseRuPlayerIE, video_kwargs={
+                'url_transparent': True,
+                'title': title,
+            })
diff --git a/yt_dlp/extractor/gfycat.py b/yt_dlp/extractor/gfycat.py
deleted file mode 100644
index edc2e56e4..000000000
--- a/yt_dlp/extractor/gfycat.py
+++ /dev/null
@@ -1,145 +0,0 @@
-from .common import InfoExtractor
-from ..utils import (
-    int_or_none,
-    float_or_none,
-    qualities,
-    ExtractorError,
-)
-
-
-class GfycatIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:(?:www|giant|thumbs)\.)?gfycat\.com/(?i:ru/|ifr/|gifs/detail/)?(?P<id>[^-/?#\."\']+)'
-    _EMBED_REGEX = [rf'<(?:iframe|source)[^>]+\bsrc=["\'](?P<url>{_VALID_URL})']
-    _TESTS = [{
-        'url': 'http://gfycat.com/DeadlyDecisiveGermanpinscher',
-        'info_dict': {
-            'id': 'DeadlyDecisiveGermanpinscher',
-            'ext': 'mp4',
-            'title': 'Ghost in the Shell',
-            'timestamp': 1410656006,
-            'upload_date': '20140914',
-            'uploader': 'anonymous',
-            'duration': 10.4,
-            'view_count': int,
-            'like_count': int,
-            'categories': list,
-            'age_limit': 0,
-            'uploader_id': 'anonymous',
-            'description': '',
-        }
-    }, {
-        'url': 'http://gfycat.com/ifr/JauntyTimelyAmazontreeboa',
-        'info_dict': {
-            'id': 'JauntyTimelyAmazontreeboa',
-            'ext': 'mp4',
-            'title': 'JauntyTimelyAmazontreeboa',
-            'timestamp': 1411720126,
-            'upload_date': '20140926',
-            'uploader': 'anonymous',
-            'duration': 3.52,
-            'view_count': int,
-            'like_count': int,
-            'categories': list,
-            'age_limit': 0,
-            'uploader_id': 'anonymous',
-            'description': '',
-        }
-    }, {
-        'url': 'https://gfycat.com/alienatedsolidgreathornedowl',
-        'info_dict': {
-            'id': 'alienatedsolidgreathornedowl',
-            'ext': 'mp4',
-            'upload_date': '20211226',
-            'uploader_id': 'reactions',
-            'timestamp': 1640536930,
-            'like_count': int,
-            'description': '',
-            'title': 'Ingrid Michaelson, Zooey Deschanel - Merry Christmas Happy New Year',
-            'categories': list,
-            'age_limit': 0,
-            'duration': 2.9583333333333335,
-            'uploader': 'Reaction GIFs',
-            'view_count': int,
-        }
-    }, {
-        'url': 'https://gfycat.com/ru/RemarkableDrearyAmurstarfish',
-        'only_matching': True
-    }, {
-        'url': 'https://gfycat.com/gifs/detail/UnconsciousLankyIvorygull',
-        'only_matching': True
-    }, {
-        'url': 'https://gfycat.com/acceptablehappygoluckyharborporpoise-baseball',
-        'only_matching': True
-    }, {
-        'url': 'https://thumbs.gfycat.com/acceptablehappygoluckyharborporpoise-size_restricted.gif',
-        'only_matching': True
-    }, {
-        'url': 'https://giant.gfycat.com/acceptablehappygoluckyharborporpoise.mp4',
-        'only_matching': True
-    }, {
-        'url': 'http://gfycat.com/IFR/JauntyTimelyAmazontreeboa',
-        'only_matching': True
-    }]
-
-    def _real_extract(self, url):
-        video_id = self._match_id(url)
-
-        gfy = self._download_json(
-            'https://api.gfycat.com/v1/gfycats/%s' % video_id,
-            video_id, 'Downloading video info')
-        if 'error' in gfy:
-            raise ExtractorError('Gfycat said: ' + gfy['error'], expected=True)
-        gfy = gfy['gfyItem']
-
-        title = gfy.get('title') or gfy['gfyName']
-        description = gfy.get('description')
-        timestamp = int_or_none(gfy.get('createDate'))
-        uploader = gfy.get('userName') or gfy.get('username')
-        view_count = int_or_none(gfy.get('views'))
-        like_count = int_or_none(gfy.get('likes'))
-        dislike_count = int_or_none(gfy.get('dislikes'))
-        age_limit = 18 if gfy.get('nsfw') == '1' else 0
-
-        width = int_or_none(gfy.get('width'))
-        height = int_or_none(gfy.get('height'))
-        fps = int_or_none(gfy.get('frameRate'))
-        num_frames = int_or_none(gfy.get('numFrames'))
-
-        duration = float_or_none(num_frames, fps) if num_frames and fps else None
-
-        categories = gfy.get('tags') or gfy.get('extraLemmas') or []
-
-        FORMATS = ('gif', 'webm', 'mp4')
-        quality = qualities(FORMATS)
-
-        formats = []
-        for format_id in FORMATS:
-            video_url = gfy.get('%sUrl' % format_id)
-            if not video_url:
-                continue
-            filesize = int_or_none(gfy.get('%sSize' % format_id))
-            formats.append({
-                'url': video_url,
-                'format_id': format_id,
-                'width': width,
-                'height': height,
-                'fps': fps,
-                'filesize': filesize,
-                'quality': quality(format_id),
-            })
-
-        return {
-            'id': video_id,
-            'title': title,
-            'description': description,
-            'timestamp': timestamp,
-            'uploader': gfy.get('userDisplayName') or uploader,
-            'uploader_id': uploader,
-            'duration': duration,
-            'view_count': view_count,
-            'like_count': like_count,
-            'dislike_count': dislike_count,
-            'categories': categories,
-            'age_limit': age_limit,
-            'formats': formats,
-        }
diff --git a/yt_dlp/extractor/googledrive.py b/yt_dlp/extractor/googledrive.py
index 2fdec20f6..06658dd47 100644
--- a/yt_dlp/extractor/googledrive.py
+++ b/yt_dlp/extractor/googledrive.py
@@ -19,9 +19,9 @@ class GoogleDriveIE(InfoExtractor):
     _VALID_URL = r'''(?x)
                     https?://
                         (?:
-                            (?:docs|drive)\.google\.com/
+                            (?:docs|drive|drive\.usercontent)\.google\.com/
                             (?:
-                                (?:uc|open)\?.*?id=|
+                                (?:uc|open|download)\?.*?id=|
                                 file/d/
                             )|
                             video\.google\.com/get_player\?.*?docid=
@@ -53,6 +53,9 @@ class GoogleDriveIE(InfoExtractor):
     }, {
         'url': 'https://drive.google.com/uc?id=0B2fjwgkl1A_CX083Tkowdmt6d28',
         'only_matching': True,
+    }, {
+        'url': 'https://drive.usercontent.google.com/download?id=0ByeS4oOUV-49Zzh4R1J6R09zazQ',
+        'only_matching': True,
     }]
     _FORMATS_EXT = {
         '5': 'flv',
@@ -205,9 +208,10 @@ def get_value(key):
            formats.append(f)

         source_url = update_url_query(
-            'https://drive.google.com/uc', {
+            'https://drive.usercontent.google.com/download', {
                 'id': video_id,
                 'export': 'download',
+                'confirm': 't',
             })

         def request_source_file(source_url, kind, data=None):
diff --git a/yt_dlp/extractor/gopro.py b/yt_dlp/extractor/gopro.py
index ae965374c..ec1595bc5 100644
--- a/yt_dlp/extractor/gopro.py
+++ b/yt_dlp/extractor/gopro.py
@@ -57,8 +57,8 @@ def _real_extract(self, url):
         video_id = self._match_id(url)
         webpage = self._download_webpage(url, video_id)

-        metadata = self._parse_json(
-            self._html_search_regex(r'window\.__reflectData\s*=\s*([^;]+)', webpage, 'metadata'), video_id)
+        metadata = self._search_json(
+            r'window\.__reflectData\s*=', webpage, 'metadata', video_id)

         video_info = metadata['collectionMedia'][0]
         media_data = self._download_json(
@@ -99,7 +99,7 @@ def _real_extract(self, url):
             'duration': int_or_none(
                 video_info.get('source_duration')),
             'artist': str_or_none(
-                video_info.get('music_track_artist')),
+                video_info.get('music_track_artist')) or None,
             'track': str_or_none(
-                video_info.get('music_track_name')),
+                video_info.get('music_track_name')) or None,
         }
diff --git a/yt_dlp/extractor/groupon.py b/yt_dlp/extractor/groupon.py
index 362d3ff83..c1cbda35f 100644
--- a/yt_dlp/extractor/groupon.py
+++ b/yt_dlp/extractor/groupon.py
@@ -31,7 +31,6 @@ class GrouponIE(InfoExtractor):
     }

     _PROVIDERS = {
-        'ooyala': ('ooyala:%s', 'Ooyala'),
         'youtube': ('%s', 'Youtube'),
     }
diff --git a/yt_dlp/extractor/harpodeon.py b/yt_dlp/extractor/harpodeon.py
index 0aa47337f..46eaddb32 100644
--- a/yt_dlp/extractor/harpodeon.py
+++ b/yt_dlp/extractor/harpodeon.py
@@ -1,5 +1,5 @@
 from .common import InfoExtractor
-from ..utils import unified_strdate
+from ..utils import int_or_none


 class HarpodeonIE(InfoExtractor):
@@ -14,7 +14,7 @@ class HarpodeonIE(InfoExtractor):
             'title': 'The Smoking Out of Bella Butts',
             'description': 'md5:47e16bdb41fc8a79c83ab83af11c8b77',
             'creator': 'Vitagraph Company of America',
-            'release_date': '19150101'
+            'release_year': 1915,
         }
     }, {
         'url': 'https://www.harpodeon.com/preview/The_Smoking_Out_of_Bella_Butts/268068288',
@@ -25,7 +25,7 @@ class HarpodeonIE(InfoExtractor):
             'title': 'The Smoking Out of Bella Butts',
             'description': 'md5:47e16bdb41fc8a79c83ab83af11c8b77',
             'creator': 'Vitagraph Company of America',
-            'release_date': '19150101'
+            'release_year': 1915,
         }
     }, {
         'url': 'https://www.harpodeon.com/preview/Behind_the_Screen/421838710',
@@ -36,7 +36,7 @@ class HarpodeonIE(InfoExtractor):
             'title': 'Behind the Screen',
             'description': 'md5:008972a3dc51fba3965ee517d2ba9155',
             'creator': 'Lone Star Corporation',
-            'release_date': '19160101'
+            'release_year': 1916,
         }
     }]
@@ -66,5 +66,5 @@ def _real_extract(self, url):
             'http_headers': {'Referer': url},
             'description': self._html_search_meta('description', webpage, fatal=False),
             'creator': creator,
-            'release_date': unified_strdate(f'{release_year}0101')
+            'release_year': int_or_none(release_year),
         }
diff --git a/yt_dlp/extractor/helsinki.py b/yt_dlp/extractor/helsinki.py
deleted file mode 100644
index e518cae1a..000000000
--- a/yt_dlp/extractor/helsinki.py
+++ /dev/null
@@ -1,38 +0,0 @@
-from .common import InfoExtractor
-from ..utils import js_to_json
-
-
-class HelsinkiIE(InfoExtractor):
-    IE_DESC = 'helsinki.fi'
-    _VALID_URL = r'https?://video\.helsinki\.fi/Arkisto/flash\.php\?id=(?P<id>\d+)'
-    _TEST = {
-        'url': 'http://video.helsinki.fi/Arkisto/flash.php?id=20258',
-        'info_dict': {
-            'id': '20258',
-            'ext': 'mp4',
-            'title': 'Tietotekniikkafoorumi-iltapäivä',
-            'description': 'md5:f5c904224d43c133225130fe156a5ee0',
-        },
-        'params': {
-            'skip_download': True,  # RTMP
-        }
-    }
-
-    def _real_extract(self, url):
-        video_id = self._match_id(url)
-        webpage = self._download_webpage(url, video_id)
-
-        params = self._parse_json(self._html_search_regex(
-            r'(?s)jwplayer\("player"\).setup\((\{.*?\})\);',
-            webpage, 'player code'), video_id, transform_source=js_to_json)
-        formats = [{
-            'url': s['file'],
-            'ext': 'mp4',
-        } for s in params['sources']]
-
-        return {
-            'id': video_id,
-            'title': self._og_search_title(webpage).replace('Video: ', ''),
-            'description': self._og_search_description(webpage),
-            'formats': formats,
-        }
diff --git a/yt_dlp/extractor/hitbox.py b/yt_dlp/extractor/hitbox.py
deleted file mode 100644
index f0c689883..000000000
--- a/yt_dlp/extractor/hitbox.py
+++ /dev/null
@@ -1,209 +0,0 @@
-import re
-
-from .common import InfoExtractor
-from ..compat import compat_str
-from ..utils import (
-    clean_html,
-    determine_ext,
-    float_or_none,
-    int_or_none,
-    parse_iso8601,
-)
-
-
-class HitboxIE(InfoExtractor):
-    IE_NAME = 'hitbox'
-    _VALID_URL = r'https?://(?:www\.)?(?:hitbox|smashcast)\.tv/(?:[^/]+/)*videos?/(?P<id>[0-9]+)'
-    _TESTS = [{
-        'url': 'http://www.hitbox.tv/video/203213',
-        'info_dict': {
-            'id': '203213',
-            'title': 'hitbox @ gamescom, Sub Button Hype extended, Giveaway - hitbox News Update with Oxy',
-            'alt_title': 'hitboxlive - Aug 9th #6',
-            'description': '',
-            'ext': 'mp4',
-            'thumbnail': r're:^https?://.*\.jpg$',
-            'duration': 215.1666,
-            'resolution': 'HD 720p',
-            'uploader': 'hitboxlive',
-            'view_count': int,
-            'timestamp': 1407576133,
-            'upload_date': '20140809',
-            'categories': ['Live Show'],
-        },
-        'params': {
-            # m3u8 download
-            'skip_download': True,
-        },
-    }, {
-        'url': 'https://www.smashcast.tv/hitboxlive/videos/203213',
-        'only_matching': True,
-    }]
-
-    def _extract_metadata(self, url, video_id):
-        thumb_base = 'https://edge.sf.hitbox.tv'
-        metadata = self._download_json(
-            '%s/%s' % (url, video_id), video_id, 'Downloading metadata JSON')
-
-        date = 'media_live_since'
-        media_type = 'livestream'
-        if metadata.get('media_type') == 'video':
-            media_type = 'video'
-            date = 'media_date_added'
-
-        video_meta = metadata.get(media_type, [])[0]
-        title = video_meta.get('media_status')
-        alt_title = video_meta.get('media_title')
-        description = clean_html(
-            video_meta.get('media_description')
-            or video_meta.get('media_description_md'))
-        duration = float_or_none(video_meta.get('media_duration'))
-        uploader = video_meta.get('media_user_name')
-        views = int_or_none(video_meta.get('media_views'))
-        timestamp = parse_iso8601(video_meta.get(date), ' ')
-        categories = [video_meta.get('category_name')]
-        thumbs = [{
-            'url': thumb_base + video_meta.get('media_thumbnail'),
-            'width': 320,
-            'height': 180
-        }, {
-            'url': thumb_base + video_meta.get('media_thumbnail_large'),
-            'width': 768,
-            'height': 432
-        }]
-
-        return {
-            'id': video_id,
-            'title': title,
-            'alt_title': alt_title,
-            'description': description,
-            'ext': 'mp4',
-            'thumbnails': thumbs,
-            'duration': duration,
-            'uploader': uploader,
-            'view_count': views,
-            'timestamp': timestamp,
-            'categories': categories,
-        }
-
-    def _real_extract(self, url):
-        video_id = self._match_id(url)
-
-        player_config = self._download_json(
-            'https://www.smashcast.tv/api/player/config/video/%s' % video_id,
-            video_id, 'Downloading video JSON')
-
-        formats = []
-        for video in player_config['clip']['bitrates']:
-            label = video.get('label')
-            if label == 'Auto':
-                continue
-            video_url = video.get('url')
-            if not video_url:
-                continue
-            bitrate = int_or_none(video.get('bitrate'))
-            if determine_ext(video_url) == 'm3u8':
-                if not video_url.startswith('http'):
-                    continue
-                formats.append({
-                    'url': video_url,
-                    'ext': 'mp4',
-                    'tbr': bitrate,
-                    'format_note': label,
-                    'protocol': 'm3u8_native',
-                })
-            else:
-                formats.append({
-                    'url': video_url,
-                    'tbr': bitrate,
-                    'format_note': label,
-                })
-
-        metadata = self._extract_metadata(
-            'https://www.smashcast.tv/api/media/video', video_id)
-        metadata['formats'] = formats
-
-        return metadata
-
-
-class HitboxLiveIE(HitboxIE):  # XXX: Do not subclass from concrete IE
-    IE_NAME = 'hitbox:live'
-    _VALID_URL = r'https?://(?:www\.)?(?:hitbox|smashcast)\.tv/(?P<id>[^/?#&]+)'
-    _TESTS = [{
-        'url': 'http://www.hitbox.tv/dimak',
-        'info_dict': {
-            'id': 'dimak',
-            'ext': 'mp4',
-            'description': 'md5:c9f80fa4410bc588d7faa40003fc7d0e',
-            'timestamp': int,
-            'upload_date': compat_str,
-            'title': compat_str,
-            'uploader': 'Dimak',
-        },
-        'params': {
-            # live
-            'skip_download': True,
-        },
-    }, {
-        'url': 'https://www.smashcast.tv/dimak',
-        'only_matching': True,
-    }]
-
-    @classmethod
-    def suitable(cls, url):
-        return False if HitboxIE.suitable(url) else super(HitboxLiveIE, cls).suitable(url)
-
-    def _real_extract(self, url):
-        video_id = self._match_id(url)
-
-        player_config = self._download_json(
-            'https://www.smashcast.tv/api/player/config/live/%s' % video_id,
-            video_id)
-
-        formats = []
-        cdns = player_config.get('cdns')
-        servers = []
-        for cdn in cdns:
-            # Subscribe URLs are not playable
-            if cdn.get('rtmpSubscribe') is True:
-                continue
-            base_url = cdn.get('netConnectionUrl')
-            host = re.search(r'.+\.([^\.]+\.[^\./]+)/.+', base_url).group(1)
-            if base_url not in servers:
-                servers.append(base_url)
-                for stream in cdn.get('bitrates'):
-                    label = stream.get('label')
-                    if label == 'Auto':
-                        continue
-                    stream_url = stream.get('url')
-                    if not stream_url:
-                        continue
-                    bitrate = int_or_none(stream.get('bitrate'))
-                    if stream.get('provider') == 'hls' or determine_ext(stream_url) == 'm3u8':
-                        if not stream_url.startswith('http'):
-                            continue
-                        formats.append({
-                            'url': stream_url,
-                            'ext': 'mp4',
-                            'tbr': bitrate,
-                            'format_note': label,
-                            'rtmp_live': True,
-                        })
-                    else:
-                        formats.append({
-                            'url': '%s/%s' % (base_url, stream_url),
-                            'ext': 'mp4',
-                            'tbr': bitrate,
-                            'rtmp_live': True,
-                            'format_note': host,
-                            'page_url': url,
-                            'player_url': 'http://www.hitbox.tv/static/player/flowplayer/flowplayer.commercial-3.2.16.swf',
-                        })
-
-        metadata = self._extract_metadata(
-            'https://www.smashcast.tv/api/media/live', video_id)
-        metadata['formats'] = formats
-        metadata['is_live'] = True
-        metadata['title'] = metadata.get('title')
-
-        return metadata
diff --git a/yt_dlp/extractor/howcast.py b/yt_dlp/extractor/howcast.py
deleted file mode 100644
index 59cf80f1a..000000000
--- a/yt_dlp/extractor/howcast.py
+++ /dev/null
@@ -1,41 +0,0 @@
-from .common import InfoExtractor
-from ..utils import parse_iso8601
-
-
-class HowcastIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:www\.)?howcast\.com/videos/(?P<id>\d+)'
-    _TEST = {
-        'url': 'http://www.howcast.com/videos/390161-How-to-Tie-a-Square-Knot-Properly',
-        'md5': '7d45932269a288149483144f01b99789',
-        'info_dict': {
-            'id': '390161',
-            'ext': 'mp4',
-            'title': 'How to Tie a Square Knot Properly',
-            'description': 'md5:dbe792e5f6f1489027027bf2eba188a3',
-            'timestamp': 1276081287,
-            'upload_date': '20100609',
-            'duration': 56.823,
-        },
-        'params': {
-            'skip_download': True,
-        },
-        'add_ie': ['Ooyala'],
-    }
-
-    def _real_extract(self, url):
-        video_id = self._match_id(url)
-
-        webpage = self._download_webpage(url, video_id)
-
-        embed_code = self._search_regex(
-            r'<iframe[^>]+src="[^"]+\bembed_code=([^\b]+)\b',
-            webpage, 'ooyala embed code')
-
-        return {
-            '_type': 'url_transparent',
-            'ie_key': 'Ooyala',
-            'url': 'ooyala:%s' % embed_code,
-            'id': video_id,
-            'timestamp': parse_iso8601(self._html_search_meta(
-                'article:published_time', webpage, 'timestamp')),
-        }
diff --git a/yt_dlp/extractor/howstuffworks.py b/yt_dlp/extractor/howstuffworks.py
deleted file mode 100644
index 238fc0b42..000000000
--- a/yt_dlp/extractor/howstuffworks.py
+++ /dev/null
@@ -1,86 +0,0 @@
-from .common import InfoExtractor
-from ..utils import (
-    find_xpath_attr,
-    int_or_none,
-    js_to_json,
-    unescapeHTML,
-    determine_ext,
-)
-
-
-class HowStuffWorksIE(InfoExtractor):
-    _VALID_URL = r'https?://[\da-z-]+\.(?:howstuffworks|stuff(?:(?:youshould|theydontwantyouto)know|toblowyourmind|momnevertoldyou)|(?:brain|car)stuffshow|fwthinking|geniusstuff)\.com/(?:[^/]+/)*(?:\d+-)?(?P<id>.+?)-video\.htm'
-    _TESTS = [
-        {
-            'url': 'http://www.stufftoblowyourmind.com/videos/optical-illusions-video.htm',
-            'md5': '76646a5acc0c92bf7cd66751ca5db94d',
-            'info_dict': {
-                'id': '855410',
-                'ext': 'mp4',
-                'title': 'Your Trickster Brain: Optical Illusions -- Science on the Web',
-                'description': 'md5:e374ff9561f6833ad076a8cc0a5ab2fb',
-            },
-        },
-        {
-            'url': 'http://shows.howstuffworks.com/more-shows/why-does-balloon-stick-to-hair-video.htm',
-            'only_matching': True,
-        }
-    ]
-
-    def _real_extract(self, url):
-        display_id = self._match_id(url)
-        webpage = self._download_webpage(url, display_id)
-        clip_js = self._search_regex(
-            r'(?s)var clip = ({.*?});', webpage, 'clip info')
-        clip_info = self._parse_json(
-            clip_js, display_id, transform_source=js_to_json)
-
-        video_id = clip_info['content_id']
-        formats = []
-        m3u8_url = clip_info.get('m3u8')
-        if m3u8_url and determine_ext(m3u8_url) == 'm3u8':
-            formats.extend(self._extract_m3u8_formats(m3u8_url, video_id, 'mp4', format_id='hls', fatal=True))
-        flv_url = clip_info.get('flv_url')
-        if flv_url:
-            formats.append({
-                'url': flv_url,
-                'format_id': 'flv',
-            })
-        for video in clip_info.get('mp4', []):
-            formats.append({
-                'url': video['src'],
-                'format_id': 'mp4-%s' % video['bitrate'],
-                'vbr': int_or_none(video['bitrate'].rstrip('k')),
-            })
-
-        if not formats:
-            smil = self._download_xml(
-                'http://services.media.howstuffworks.com/videos/%s/smil-service.smil' % video_id,
-                video_id, 'Downloading video SMIL')
-
-            http_base = find_xpath_attr(
-                smil,
-                './{0}head/{0}meta'.format('{http://www.w3.org/2001/SMIL20/Language}'),
-                'name',
-                'httpBase').get('content')
-
-            URL_SUFFIX = '?v=2.11.3&fp=LNX 11,2,202,356&r=A&g=A'
-
-            for video in smil.findall(
-                    './{0}body/{0}switch/{0}video'.format('{http://www.w3.org/2001/SMIL20/Language}')):
-                vbr = int_or_none(video.attrib['system-bitrate'], scale=1000)
-                formats.append({
-                    'url': '%s/%s%s' % (http_base, video.attrib['src'], URL_SUFFIX),
-                    'format_id': '%dk' % vbr,
-                    'vbr': vbr,
-                })
-
-        return {
-            'id': '%s' % video_id,
-            'display_id': display_id,
-            'title': unescapeHTML(clip_info['clip_title']),
-            'description': unescapeHTML(clip_info.get('caption')),
-            'thumbnail': clip_info.get('video_still_url'),
-            'duration': int_or_none(clip_info.get('duration')),
-            'formats': formats,
-        }
diff --git a/yt_dlp/extractor/iheart.py b/yt_dlp/extractor/iheart.py
index 2c6a5b6a1..fb6f51e2c 100644
--- a/yt_dlp/extractor/iheart.py
+++ b/yt_dlp/extractor/iheart.py
@@ -23,7 +23,7 @@ def _extract_episode(self, episode):


 class IHeartRadioIE(IHeartRadioBaseIE):
-    IENAME = 'iheartradio'
+    IE_NAME = 'iheartradio'
     _VALID_URL = r'(?:https?://(?:www\.)?iheart\.com/podcast/[^/]+/episode/(?P<display_id>[^/?&#]+)-|iheartradio:)(?P<id>\d+)'
     _TEST = {
         'url': 'https://www.iheart.com/podcast/105-behind-the-bastards-29236323/episode/part-one-alexander-lukashenko-the-dictator-70346499/?embed=true',
diff --git a/yt_dlp/extractor/ilpost.py b/yt_dlp/extractor/ilpost.py
new file mode 100644
index 000000000..ae98399ee
--- /dev/null
+++ b/yt_dlp/extractor/ilpost.py
@@ -0,0 +1,69 @@
+import functools
+
+from .common import InfoExtractor
+from ..utils import (
+    ExtractorError,
+    float_or_none,
+    int_or_none,
+    url_or_none,
+    urlencode_postdata,
+)
+from ..utils.traversal import traverse_obj
+
+
+class IlPostIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?ilpost\.it/episodes/(?P<id>[^/?#]+)'
+    _TESTS = [{
+        'url': 'https://www.ilpost.it/episodes/1-avis-akvasas-ka/',
+        'md5': '43649f002d85e1c2f319bb478d479c40',
+        'info_dict': {
+            'id': '2972047',
+            'ext': 'mp3',
+            'display_id': '1-avis-akvasas-ka',
+            'title': '1. Avis akvasas ka',
+            'url': 'https://www.ilpost.it/wp-content/uploads/2023/12/28/1703781217-l-invasione-pt1-v6.mp3',
+            'timestamp': 1703835014,
+            'upload_date': '20231229',
+            'duration': 2495.0,
+            'availability': 'public',
+            'series_id': '235598',
+            'description': '',
+        }
+    }]
+
+    def _real_extract(self, url):
+        display_id = self._match_id(url)
+        webpage = self._download_webpage(url, display_id)
+
+        endpoint_metadata = self._search_json(
+            r'var\s+ilpostpodcast\s*=', webpage, 'metadata', display_id)
+        episode_id = endpoint_metadata['post_id']
+        podcast_id = endpoint_metadata['podcast_id']
+        podcast_metadata = self._download_json(
+            endpoint_metadata['ajax_url'], display_id, data=urlencode_postdata({
+                'action': 'checkpodcast',
+                'cookie': endpoint_metadata['cookie'],
+                'post_id': episode_id,
+                'podcast_id': podcast_id,
+            }))
+
+        episode = traverse_obj(podcast_metadata, (
+            'data', 'postcastList', lambda _, v: str(v['id']) == episode_id, {dict}), get_all=False)
+        if not episode:
+            raise ExtractorError('Episode could not be extracted')
+
+        return {
+            'id': episode_id,
+            'display_id': display_id,
+            'series_id': podcast_id,
+            'vcodec': 'none',
+            **traverse_obj(episode, {
+                'title': ('title', {str}),
+                'description': ('description', {str}),
+                'url': ('podcast_raw_url', {url_or_none}),
+                'thumbnail': ('image', {url_or_none}),
+                'timestamp': ('timestamp', {int_or_none}),
+                'duration': ('milliseconds', {functools.partial(float_or_none, scale=1000)}),
+                'availability': ('free', {lambda v: 'public' if v else 'subscriber_only'}),
+            }),
+        }
diff --git a/yt_dlp/extractor/imgur.py b/yt_dlp/extractor/imgur.py
index bff6ed57f..1fa0a2a79 100644
--- a/yt_dlp/extractor/imgur.py
+++ b/yt_dlp/extractor/imgur.py
@@ -1,99 +1,243 @@
+import functools
 import re

 from .common import InfoExtractor
 from ..utils import (
+    ExtractorError,
+    determine_ext,
+    float_or_none,
     int_or_none,
     js_to_json,
     mimetype2ext,
-    ExtractorError,
+    parse_iso8601,
+    str_or_none,
+    strip_or_none,
+    traverse_obj,
+    url_or_none,
 )


-class ImgurIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:i\.)?imgur\.com/(?!(?:a|gallery|(?:t(?:opic)?|r)/[^/]+)/)(?P<id>[a-zA-Z0-9]+)'
+class ImgurBaseIE(InfoExtractor):
+    _CLIENT_ID = '546c25a59c58ad7'
+
+    @classmethod
+    def _imgur_result(cls, item_id):
+        return cls.url_result(f'https://imgur.com/{item_id}', ImgurIE, item_id)
+
+    def _call_api(self, endpoint, video_id, **kwargs):
+        return self._download_json(
+            f'https://api.imgur.com/post/v1/{endpoint}/{video_id}?client_id={self._CLIENT_ID}&include=media,account',
+            video_id, **kwargs)
+
+    @staticmethod
+    def get_description(s):
+        if 'Discover the magic of the internet at Imgur' in s:
+            return None
+        return s or None
+
+
+class ImgurIE(ImgurBaseIE):
+    _VALID_URL = r'https?://(?:i\.)?imgur\.com/(?!(?:a|gallery|t|topic|r)/)(?P<id>[a-zA-Z0-9]+)'

     _TESTS = [{
-        'url': 'https://i.imgur.com/A61SaA1.gifv',
+        'url': 'https://imgur.com/A61SaA1',
         'info_dict': {
             'id': 'A61SaA1',
             'ext': 'mp4',
-            'title': 're:Imgur GIF$|MRW gifv is up and running without any bugs$',
+            'title': 'MRW gifv is up and running without any bugs',
+            'timestamp': 1416446068,
+            'upload_date': '20141120',
+            'dislike_count': int,
+            'comment_count': int,
+            'release_timestamp': 1416446068,
+            'release_date': '20141120',
+            'like_count': int,
+            'thumbnail': 'https://i.imgur.com/A61SaA1h.jpg',
         },
     }, {
-        'url': 'https://imgur.com/A61SaA1',
+        'url': 'https://i.imgur.com/A61SaA1.gifv',
         'only_matching': True,
     }, {
         'url': 'https://i.imgur.com/crGpqCV.mp4',
         'only_matching': True,
     }, {
-        # no title
         'url': 'https://i.imgur.com/jxBXAMC.gifv',
-        'only_matching': True,
+        'info_dict': {
+            'id': 'jxBXAMC',
+            'ext': 'mp4',
+            'title': 'Fahaka puffer feeding',
+            'timestamp': 1533835503,
+            'upload_date': '20180809',
+            'release_date': '20180809',
+            'like_count': int,
+            'duration': 30.0,
+            'comment_count': int,
+            'release_timestamp': 1533835503,
+            'thumbnail': 'https://i.imgur.com/jxBXAMCh.jpg',
+            'dislike_count': int,
+        },
     }]

     def _real_extract(self, url):
         video_id = self._match_id(url)
+        data = self._call_api('media', video_id)
+        if not traverse_obj(data, ('media', 0, (
+                ('type', {lambda t: t == 'video' or None}),
+                ('metadata', 'is_animated'))), get_all=False):
+            raise ExtractorError(f'{video_id} is not a video or animated image', expected=True)
         webpage = self._download_webpage(
-            'https://i.imgur.com/{id}.gifv'.format(id=video_id), video_id)
+            f'https://i.imgur.com/{video_id}.gifv', video_id, fatal=False) or ''
+        formats = []

-        width = int_or_none(self._og_search_property(
-            'video:width', webpage, default=None))
-        height = int_or_none(self._og_search_property(
-            'video:height', webpage, default=None))
+        media_fmt = traverse_obj(data, ('media', 0, {
+            'url': ('url', {url_or_none}),
+            'ext': ('ext', {str}),
+            'width': ('width', {int_or_none}),
+            'height': ('height', {int_or_none}),
+            'filesize': ('size', {int_or_none}),
+            'acodec': ('metadata', 'has_sound', {lambda b: None if b else 'none'}),
+        }))
+        media_url = media_fmt.get('url')
+        if media_url:
+            if not media_fmt.get('ext'):
+                media_fmt['ext'] = mimetype2ext(traverse_obj(
+                    data, ('media', 0, 'mime_type'))) or determine_ext(media_url)
+            if traverse_obj(data, ('media', 0, 'type')) == 'image':
+                media_fmt['acodec'] = 'none'
+                media_fmt.setdefault('preference', -10)
+            formats.append(media_fmt)

         video_elements = self._search_regex(
             r'(?s)<div class="video-elements">(.*?)</div>',
             webpage, 'video elements', default=None)
-        if not video_elements:
-            raise ExtractorError(
-                'No sources found for video %s. Maybe an image?' % video_id,
-                expected=True)

-        formats = []
-        for m in re.finditer(r'<source\s+src="(?P<src>[^"]+)"\s+type="(?P<type>[^"]+)"', video_elements):
-            formats.append({
-                'format_id': m.group('type').partition('/')[2],
-                'url': self._proto_relative_url(m.group('src')),
-                'ext': mimetype2ext(m.group('type')),
-                'width': width,
-                'height': height,
-                'http_headers': {
-                    'User-Agent': 'yt-dlp (like wget)',
-                },
-            })
+        if video_elements:
+            def og_get_size(media_type):
+                return {
+                    p: int_or_none(self._og_search_property(f'{media_type}:{p}', webpage, default=None))
+                    for p in ('width', 'height')
+                }

-        gif_json = self._search_regex(
-            r'(?s)var\s+videoItem\s*=\s*(\{.*?\})',
-            webpage, 'GIF code', fatal=False)
-        if gif_json:
-            gifd = self._parse_json(
-                gif_json, video_id, transform_source=js_to_json)
-            formats.append({
-                'format_id': 'gif',
-                'preference': -10,  # gifs are worse than videos
-                'width': width,
-                'height': height,
-                'ext': 'gif',
-                'acodec': 'none',
-                'vcodec': 'gif',
-                'container': 'gif',
-                'url': self._proto_relative_url(gifd['gifUrl']),
-                'filesize': gifd.get('size'),
-                'http_headers': {
-                    'User-Agent': 'yt-dlp (like wget)',
-                },
+            size = og_get_size('video')
+            if not any(size.values()):
+                size = og_get_size('image')
+
+            formats = traverse_obj(
+                re.finditer(r'<source\s+src="(?P<src>[^"]+)"\s+type="(?P<type>[^"]+)"', video_elements),
+                (..., {
+                    'format_id': ('type', {lambda s: s.partition('/')[2]}),
+                    'url': ('src', {self._proto_relative_url}),
+                    'ext': ('type', {mimetype2ext}),
+                }))
+            for f in formats:
+                f.update(size)
+
+            # We can get the original gif format from the webpage as well
+            gif_json = traverse_obj(self._search_json(
+                r'var\s+videoItem\s*=', webpage, 'GIF info', video_id,
+                transform_source=js_to_json, fatal=False), {
+                    'url': ('gifUrl', {self._proto_relative_url}),
+                    'filesize': ('size', {int_or_none}),
             })
+            if gif_json:
+                gif_json.update(size)
+                gif_json.update({
+                    'format_id': 'gif',
+                    'preference': -10,  # gifs < videos
+                    'ext': 'gif',
+                    'acodec': 'none',
+                    'vcodec': 'gif',
+                    'container': 'gif',
+                })
+                formats.append(gif_json)
+
+        search = functools.partial(self._html_search_meta, html=webpage, default=None)
+
+        twitter_fmt = {
+            'format_id': 'twitter',
+            'url': url_or_none(search('twitter:player:stream')),
+            'ext': mimetype2ext(search('twitter:player:stream:content_type')),
+            'width': int_or_none(search('twitter:width')),
+            'height': int_or_none(search('twitter:height')),
+        }
+        if twitter_fmt['url']:
+            formats.append(twitter_fmt)
+
+        if not formats:
+            self.raise_no_formats(
+                f'No sources found for video {video_id}. Maybe a plain image?', expected=True)
+        self._remove_duplicate_formats(formats)

         return {
+            'title': self._og_search_title(webpage, default=None),
+            'description': self.get_description(self._og_search_description(webpage, default='')),
+            **traverse_obj(data, {
+                'uploader_id': ('account_id', {lambda a: str(a) if int_or_none(a) else None}),
+                'uploader': ('account', 'username', {lambda x: strip_or_none(x) or None}),
+                'uploader_url': ('account', 'avatar_url', {url_or_none}),
+                'like_count': ('upvote_count', {int_or_none}),
+                'dislike_count': ('downvote_count', {int_or_none}),
+                'comment_count': ('comment_count', {int_or_none}),
+                'age_limit': ('is_mature', {lambda x: 18 if x else None}),
+                'timestamp': (('updated_at', 'created_at'), {parse_iso8601}),
+                'release_timestamp': ('created_at', {parse_iso8601}),
+            }, get_all=False),
+            **traverse_obj(data, ('media', 0, 'metadata', {
+                'title': ('title', {lambda x: strip_or_none(x) or None}),
+                'description': ('description', {self.get_description}),
+                'duration': ('duration', {float_or_none}),
+                'timestamp': (('updated_at', 'created_at'), {parse_iso8601}),
+                'release_timestamp': ('created_at', {parse_iso8601}),
+            }), get_all=False),
             'id': video_id,
             'formats': formats,
-            'title': self._og_search_title(webpage, default=video_id),
+            'thumbnail': url_or_none(search('thumbnailUrl')),
         }


-class ImgurGalleryIE(InfoExtractor):
+class ImgurGalleryBaseIE(ImgurBaseIE):
+    _GALLERY = True
+
+    def _real_extract(self, url):
+        gallery_id = self._match_id(url)
+
+        data = self._call_api('albums', gallery_id, fatal=False, expected_status=404)
+
+        info = traverse_obj(data, {
+            'title': ('title', {lambda x: strip_or_none(x) or None}),
+            'description': ('description', {self.get_description}),
+        })
+
+        if traverse_obj(data, 'is_album'):
+
+            def yield_media_ids():
+                for m_id in traverse_obj(data, (
+                        'media', lambda _, v: v.get('type') == 'video' or v['metadata']['is_animated'],
+                        'id', {lambda x: str_or_none(x) or None})):
+                    yield m_id
+
+            # if a gallery with exactly one video, apply album metadata to video
+            media_id = (
+                self._GALLERY
+                and traverse_obj(data, ('image_count', {lambda c: c == 1}))
+                and next(yield_media_ids(), None))
+
+            if not media_id:
+                result = self.playlist_result(
+                    map(self._imgur_result, yield_media_ids()), gallery_id)
+                result.update(info)
+                return result
+            gallery_id = media_id
+
+        result = self._imgur_result(gallery_id)
+        info['_type'] = 'url_transparent'
+        result.update(info)
+        return result
+
+
+class ImgurGalleryIE(ImgurGalleryBaseIE):
     IE_NAME = 'imgur:gallery'
-    _VALID_URL = r'https?://(?:i\.)?imgur\.com/(?:gallery|(?:t(?:opic)?|r)/[^/]+)/(?P<id>[a-zA-Z0-9]+)'
+    _VALID_URL = r'https?://(?:i\.)?imgur\.com/(?:gallery|(?:t(?:opic)?|r)/[^/?#]+)/(?P<id>[a-zA-Z0-9]+)'

     _TESTS = [{
         'url': 'http://imgur.com/gallery/Q95ko',
@@ -102,49 +246,121 @@ class ImgurGalleryIE(InfoExtractor):
             'title': 'Adding faces make every GIF better',
         },
         'playlist_count': 25,
+        'skip': 'Zoinks! You\'ve taken a wrong turn.',
     }, {
+        # TODO: static images - replace with animated/video gallery
         'url': 'http://imgur.com/topic/Aww/ll5Vk',
         'only_matching': True,
     }, {
         'url': 'https://imgur.com/gallery/YcAQlkx',
+        'add_ies': ['Imgur'],
         'info_dict': {
             'id': 'YcAQlkx',
             'ext': 'mp4',
             'title': 'Classic Steve Carell gif...cracks me up everytime....damn the repost downvotes....',
-        }
+            'timestamp': 1358554297,
+            'upload_date': '20130119',
+            'uploader_id': '1648642',
+            'uploader': 'wittyusernamehere',
+            'release_timestamp': 1358554297,
+            'thumbnail': 'https://i.imgur.com/YcAQlkxh.jpg',
+            'release_date': '20130119',
+            'uploader_url': 'https://i.imgur.com/u3R4I2S_d.png?maxwidth=290&fidelity=grand',
+            'comment_count': int,
+            'dislike_count': int,
+            'like_count': int,
+        },
     }, {
+        # TODO: static image - replace with animated/video gallery
         'url': 'http://imgur.com/topic/Funny/N8rOudd',
         'only_matching': True,
     }, {
         'url': 'http://imgur.com/r/aww/VQcQPhM',
-        'only_matching': True,
+        'add_ies': ['Imgur'],
+        'info_dict': {
+            'id': 'VQcQPhM',
+            'ext': 'mp4',
+            'title': 'The boss is here',
+            'timestamp': 1476494751,
+            'upload_date': '20161015',
+            'uploader_id': '19138530',
+            'uploader': 'thematrixcam',
+            'comment_count': int,
+            'dislike_count': int,
+            'uploader_url': 'https://i.imgur.com/qCjr5Pi_d.png?maxwidth=290&fidelity=grand',
+            'release_timestamp': 1476494751,
+            'like_count': int,
+            'release_date': '20161015',
+            'thumbnail': 'https://i.imgur.com/VQcQPhMh.jpg',
+        },
+    },
+    # from https://github.com/ytdl-org/youtube-dl/pull/16674
+    {
+        'url': 'https://imgur.com/t/unmuted/6lAn9VQ',
+        'info_dict': {
+            'id': '6lAn9VQ',
+            'title': 'Penguins !',
+        },
+        'playlist_count': 3,
+    }, {
+        'url': 'https://imgur.com/t/unmuted/kx2uD3C',
+        'add_ies': ['Imgur'],
+        'info_dict': {
+            'id': 'ZVMv45i',
+            'ext': 'mp4',
+            'title': 'Intruder',
+            'timestamp': 1528129683,
+            'upload_date': '20180604',
+            'release_timestamp': 1528129683,
+            'release_date': '20180604',
+            'like_count': int,
+            'dislike_count': int,
+            'comment_count': int,
+            'duration': 30.03,
+            'thumbnail': 'https://i.imgur.com/ZVMv45ih.jpg',
+        },
+    }, {
+        'url': 'https://imgur.com/t/unmuted/wXSK0YH',
+        'add_ies': ['Imgur'],
+        'info_dict': {
+            'id': 'JCAP4io',
+            'ext': 'mp4',
+            'title': 're:I got the blues$',
+            'description': 'Luka’s vocal stylings.\n\nFP edit: don’t encourage me. I’ll never stop posting Luka and friends.',
+            'timestamp': 1527809525,
+            'upload_date': '20180531',
+            'like_count': int,
+            'dislike_count': int,
+            'duration': 30.03,
+            'comment_count': int,
+            'release_timestamp': 1527809525,
+            'thumbnail': 'https://i.imgur.com/JCAP4ioh.jpg',
+            'release_date': '20180531',
+        },
     }]

-    def _real_extract(self, url):
-        gallery_id = self._match_id(url)
-
-        data = self._download_json(
-            'https://imgur.com/gallery/%s.json' % gallery_id,
-            gallery_id)['data']['image']
-
-        if data.get('is_album'):
-            entries = [
-                self.url_result('http://imgur.com/%s' % image['hash'], ImgurIE.ie_key(), image['hash'])
-                for image in data['album_images']['images'] if image.get('hash')]
-            return self.playlist_result(entries, gallery_id, data.get('title'), data.get('description'))
-
-        return self.url_result('http://imgur.com/%s' % gallery_id, ImgurIE.ie_key(), gallery_id)
-
-
-class ImgurAlbumIE(ImgurGalleryIE):  # XXX: Do not subclass from concrete IE
+class ImgurAlbumIE(ImgurGalleryBaseIE):
     IE_NAME = 'imgur:album'
     _VALID_URL = r'https?://(?:i\.)?imgur\.com/a/(?P<id>[a-zA-Z0-9]+)'
-
+    _GALLERY = False
     _TESTS = [{
+        # TODO: only static images - replace with animated/video gallery
         'url': 'http://imgur.com/a/j6Orj',
+        'only_matching': True,
+    },
+    # from https://github.com/ytdl-org/youtube-dl/pull/21693
+    {
+        'url': 'https://imgur.com/a/iX265HX',
         'info_dict': {
-            'id': 'j6Orj',
-            'title': 'A Literary Analysis of "Star Wars: The Force Awakens"',
+            'id': 'iX265HX',
+            'title': 'enen-no-shouboutai'
         },
-        'playlist_count': 12,
+        'playlist_count': 2,
+    }, {
+        'url': 'https://imgur.com/a/8pih2Ed',
+        'info_dict': {
+            'id': '8pih2Ed'
+        },
+        'playlist_mincount': 1,
     }]
diff --git a/yt_dlp/extractor/instagram.py b/yt_dlp/extractor/instagram.py
index bfc4b7b88..dbaa332c2 100644
--- a/yt_dlp/extractor/instagram.py
+++ b/yt_dlp/extractor/instagram.py
@@ -10,6 +10,7 @@
     ExtractorError,
     decode_base_n,
     encode_base_n,
+    filter_dict,
     float_or_none,
     format_field,
     get_element_by_attribute,
@@ -703,28 +704,31 @@ def _real_extract(self, url):
         user_info = self._search_json(r'"user":', story_info, 'user info', story_id, fatal=False)
         if not user_info:
             self.raise_login_required('This content is unreachable')
-        user_id = user_info.get('id')
+        user_id = traverse_obj(user_info, 'pk', 'id', expected_type=str)

         story_info_url = user_id if username != 'highlights' else f'highlight:{story_id}'
+        if not story_info_url:  # user id is only mandatory for non-highlights
+            raise ExtractorError('Unable to extract user id')
+
         videos = traverse_obj(self._download_json(
             f'{self._API_BASE_URL}/feed/reels_media/?reel_ids={story_info_url}',
             story_id, errnote=False, fatal=False, headers=self._API_HEADERS), 'reels')
         if not videos:
             self.raise_login_required('You need to log in to access this content')

-        full_name = traverse_obj(videos, (f'highlight:{story_id}', 'user', 'full_name'), (str(user_id), 'user', 'full_name'))
+        full_name = traverse_obj(videos, (f'highlight:{story_id}', 'user', 'full_name'), (user_id, 'user', 'full_name'))
         story_title = traverse_obj(videos, (f'highlight:{story_id}', 'title'))
         if not story_title:
             story_title = f'Story by {username}'

-        highlights = traverse_obj(videos, (f'highlight:{story_id}', 'items'), (str(user_id), 'items'))
+        highlights = traverse_obj(videos, (f'highlight:{story_id}', 'items'), (user_id, 'items'))
         info_data = []
         for highlight in highlights:
             highlight_data = self._extract_product(highlight)
             if highlight_data.get('formats'):
                 info_data.append({
-                    **highlight_data,
                     'uploader': full_name,
                     'uploader_id': user_id,
+                    **filter_dict(highlight_data),
                 })

         return self.playlist_result(info_data, playlist_id=story_id, playlist_title=story_title)
diff --git a/yt_dlp/extractor/itprotv.py b/yt_dlp/extractor/itprotv.py
index 4ac12603a..b9d5c196d 100644
--- a/yt_dlp/extractor/itprotv.py
+++ b/yt_dlp/extractor/itprotv.py
@@ -31,7 +31,7 @@ def _check_if_logged_in(self, webpage):


 class ITProTVIE(ITProTVBaseIE):
-    _VALID_URL = r'https://app.itpro.tv/course/(?P<course>[\w-]+)/(?P<id>[\w-]+)'
+    _VALID_URL = r'https://app\.itpro\.tv/course/(?P<course>[\w-]+)/(?P<id>[\w-]+)'
     _TESTS = [{
         'url': 'https://app.itpro.tv/course/guided-tour/introductionitprotv',
         'md5': 'bca4a28c2667fd1a63052e71a94bb88c',
@@ -102,7 +102,7 @@ def _real_extract(self, url):


 class ITProTVCourseIE(ITProTVBaseIE):
-    _VALID_URL = r'https?://app.itpro.tv/course/(?P<id>[\w-]+)/?(?:$|[#?])'
+    _VALID_URL = r'https?://app\.itpro\.tv/course/(?P<id>[\w-]+)/?(?:$|[#?])'
     _TESTS = [
         {
             'url': 'https://app.itpro.tv/course/guided-tour',
diff --git a/yt_dlp/extractor/jable.py b/yt_dlp/extractor/jable.py
index 84c3225e4..71fed49ea 100644
--- a/yt_dlp/extractor/jable.py
+++ b/yt_dlp/extractor/jable.py
@@ -10,7 +10,7 @@


 class JableIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:www\.)?jable.tv/videos/(?P<id>[\w-]+)'
+    _VALID_URL = r'https?://(?:www\.)?jable\.tv/videos/(?P<id>[\w-]+)'
     _TESTS = [{
         'url': 'https://jable.tv/videos/pppd-812/',
         'md5': 'f1537283a9bc073c31ff86ca35d9b2a6',
@@ -64,7 +64,7 @@ def _real_extract(self, url):


 class JablePlaylistIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:www\.)?jable.tv/(?:categories|models|tags)/(?P<id>[\w-]+)'
+    _VALID_URL = r'https?://(?:www\.)?jable\.tv/(?:categories|models|tags)/(?P<id>[\w-]+)'
     _TESTS = [{
         'url': 'https://jable.tv/models/kaede-karen/',
         'info_dict': {
diff --git a/yt_dlp/extractor/jiosaavn.py b/yt_dlp/extractor/jiosaavn.py
new file mode 100644
index 000000000..a59209835
--- /dev/null
+++ b/yt_dlp/extractor/jiosaavn.py
@@ -0,0 +1,105 @@
+from .common import InfoExtractor
+from ..utils import (
+    int_or_none,
+    js_to_json,
+    url_or_none,
+    urlencode_postdata,
+    urljoin,
+)
+from ..utils.traversal import traverse_obj
+
+
+class JioSaavnBaseIE(InfoExtractor):
+    def _extract_initial_data(self, url, audio_id):
+        webpage = self._download_webpage(url, audio_id)
+        return self._search_json(
+            r'window\.__INITIAL_DATA__\s*=', webpage,
+            'init json', audio_id, transform_source=js_to_json)
+
+
+class JioSaavnSongIE(JioSaavnBaseIE):
+    _VALID_URL = r'https?://(?:www\.)?(?:jiosaavn\.com/song/[^/?#]+/|saavn\.com/s/song/(?:[^/?#]+/){3})(?P<id>[^/?#]+)'
+    _TESTS = [{
+        'url': 'https://www.jiosaavn.com/song/leja-re/OQsEfQFVUXk',
+        'md5': '3b84396d15ed9e083c3106f1fa589c04',
+        'info_dict': {
+            'id': 'OQsEfQFVUXk',
+            'ext': 'mp4',
+            'title': 'Leja Re',
+            'album': 'Leja Re',
+            'thumbnail': 'https://c.saavncdn.com/258/Leja-Re-Hindi-2018-20181124024539-500x500.jpg',
+            'duration': 205,
+            'view_count': int,
+            'release_year': 2018,
+        },
+    }, {
+        'url': 'https://www.saavn.com/s/song/hindi/Saathiya/O-Humdum-Suniyo-Re/KAMiazoCblU',
+        'only_matching': True,
+    }]
+
+    _VALID_BITRATES = ('16', '32', '64', '128', '320')
+
+    def _real_extract(self, url):
+        audio_id = self._match_id(url)
+        extract_bitrates = self._configuration_arg('bitrate', ['128', '320'], ie_key='JioSaavn')
+        if invalid_bitrates := [br for br in extract_bitrates if br not in self._VALID_BITRATES]:
+            raise ValueError(
+                f'Invalid bitrate(s): {", ".join(invalid_bitrates)}. '
+                + f'Valid bitrates are: {", ".join(self._VALID_BITRATES)}')
+
+        song_data = self._extract_initial_data(url, audio_id)['song']['song']
+        formats = []
+        for bitrate in extract_bitrates:
+            media_data = self._download_json(
+                'https://www.jiosaavn.com/api.php', audio_id, f'Downloading format info for {bitrate}',
+                fatal=False, data=urlencode_postdata({
+                    '__call': 'song.generateAuthToken',
+                    '_format': 'json',
+                    'bitrate': bitrate,
+                    'url': song_data['encrypted_media_url'],
+                }))
+            if not media_data.get('auth_url'):
+                self.report_warning(f'Unable to extract format info for {bitrate}')
+                continue
+            formats.append({
+                'url': media_data['auth_url'],
+                'ext': media_data.get('type'),
+                'format_id': bitrate,
+                'abr': int(bitrate),
+                'vcodec': 'none',
+            })
+
+        return {
+            'id': audio_id,
+            'formats': formats,
+            **traverse_obj(song_data, {
+                'title': ('title', 'text'),
+                'album': ('album', 'text'),
+                'thumbnail': ('image', 0, {url_or_none}),
+                'duration': ('duration', {int_or_none}),
+                'view_count': ('play_count', {int_or_none}),
+                'release_year': ('year', {int_or_none}),
+            }),
+        }
+
+
+class JioSaavnAlbumIE(JioSaavnBaseIE):
+    _VALID_URL = r'https?://(?:www\.)?(?:jio)?saavn\.com/album/[^/?#]+/(?P<id>[^/?#]+)'
+    _TESTS = [{
+        'url': 'https://www.jiosaavn.com/album/96/buIOjYZDrNA_',
+        'info_dict': {
+            'id': 'buIOjYZDrNA_',
+            'title': '96',
+        },
+        'playlist_count': 10,
+    }]
+
+    def _real_extract(self, url):
+        album_id = self._match_id(url)
+        album_view = self._extract_initial_data(url, album_id)['albumView']
+
+        return self.playlist_from_matches(
+            traverse_obj(album_view, (
+                'modules', lambda _, x: x['key'] == 'list', 'data', ..., 'title', 'action', {str})),
+            album_id, traverse_obj(album_view, ('album', 'title', 'text', {str})), ie=JioSaavnSongIE,
+            getter=lambda x: urljoin('https://www.jiosaavn.com/', x))
diff --git a/yt_dlp/extractor/joqrag.py b/yt_dlp/extractor/joqrag.py
new file mode 100644
index 000000000..3bb28af94
--- /dev/null
+++ b/yt_dlp/extractor/joqrag.py
@@ -0,0 +1,112 @@
+import datetime
+import urllib.parse
+
+from .common import InfoExtractor
+from ..utils import (
+    clean_html,
+    datetime_from_str,
+    unified_timestamp,
+    urljoin,
+)
+
+
+class JoqrAgIE(InfoExtractor):
+    IE_DESC = '超!A&G+ 文化放送 (f.k.a. AGQR) Nippon Cultural Broadcasting, Inc. (JOQR)'
+    _VALID_URL = [r'https?://www\.uniqueradio\.jp/agplayer5/(?:player|inc-player-hls)\.php',
+                  r'https?://(?:www\.)?joqr\.co\.jp/ag/',
+                  r'https?://(?:www\.)?joqr\.co\.jp/qr/ag(?:daily|regular)program/?(?:$|[#?])']
+    _TESTS = [{
+        'url': 'https://www.uniqueradio.jp/agplayer5/player.php',
+        'info_dict': {
+            'id': 'live',
+            'title': str,
+            'channel': '超!A&G+',
+            'description': str,
+            'live_status': 'is_live',
+            'release_timestamp': int,
+        },
+        'params': {
+            'skip_download': True,
+            'ignore_no_formats_error': True,
+        },
+    }, {
+        'url': 'https://www.uniqueradio.jp/agplayer5/inc-player-hls.php',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.joqr.co.jp/ag/article/103760/',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.joqr.co.jp/qr/agdailyprogram/',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.joqr.co.jp/qr/agregularprogram/',
+        'only_matching': True,
+    }]
+
+    def _extract_metadata(self, variable, html):
+        return clean_html(urllib.parse.unquote_plus(self._search_regex(
+            rf'var\s+{variable}\s*=\s*(["\'])(?P<value>(?:(?!\1).)+)\1',
+            html, 'metadata', group='value', default=''))) or None
+
+    def _extract_start_timestamp(self, video_id, is_live):
+        def extract_start_time_from(date_str):
+            dt = datetime_from_str(date_str) + datetime.timedelta(hours=9)
+            date = dt.strftime('%Y%m%d')
+            start_time = self._search_regex(
+                r'<h3[^>]+\bclass="dailyProgram-itemHeaderTime"[^>]*>[\s\d:]+–\s*(\d{1,2}:\d{1,2})',
+                self._download_webpage(
+                    f'https://www.joqr.co.jp/qr/agdailyprogram/?date={date}', video_id,
+                    note=f'Downloading program list of {date}', fatal=False,
+                    errnote=f'Failed to download program list of {date}') or '',
+                'start time', default=None)
+            if start_time:
+                return unified_timestamp(f'{dt.strftime("%Y/%m/%d")} {start_time} +09:00')
+            return None
+
+        start_timestamp = extract_start_time_from('today')
+        if not start_timestamp:
+            return None
+
+        if not is_live or start_timestamp < datetime_from_str('now').timestamp():
+            return start_timestamp
+        else:
+            return extract_start_time_from('yesterday')
+
+    def _real_extract(self, url):
+        video_id = 'live'
+
+        metadata = self._download_webpage(
+            'https://www.uniqueradio.jp/aandg', video_id,
+            note='Downloading metadata', errnote='Failed to download metadata')
+        title = self._extract_metadata('Program_name', metadata)
+
+        if title == '放送休止':
+            formats = []
+            live_status = 'is_upcoming'
+            release_timestamp = self._extract_start_timestamp(video_id, False)
+            msg = 'This stream is not currently live'
+            if release_timestamp:
+                msg += (' and will start at '
+                        + datetime.datetime.fromtimestamp(release_timestamp).strftime('%Y-%m-%d %H:%M:%S'))
+            self.raise_no_formats(msg, expected=True)
+        else:
+            m3u8_path = self._search_regex(
+                r'<source\s[^>]*\bsrc="([^"]+)"',
+                self._download_webpage(
+                    'https://www.uniqueradio.jp/agplayer5/inc-player-hls.php', video_id,
+                    note='Downloading player data', errnote='Failed to download player data'),
+                'm3u8 url')
+            formats = self._extract_m3u8_formats(
+                urljoin('https://www.uniqueradio.jp/', m3u8_path), video_id)
+            live_status = 'is_live'
+            release_timestamp = self._extract_start_timestamp(video_id, True)
+
+        return {
+            'id': video_id,
+            'title': title,
+            'channel': '超!A&G+',
+            'description': self._extract_metadata('Program_text', metadata),
+            'formats': formats,
+            'live_status': live_status,
+            'release_timestamp': release_timestamp,
+        }
diff --git a/yt_dlp/extractor/jtbc.py b/yt_dlp/extractor/jtbc.py
new file mode 100644
index 000000000..573f7492f
--- /dev/null
+++ b/yt_dlp/extractor/jtbc.py
@@ -0,0 +1,156 @@
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    int_or_none,
+    parse_duration,
+    url_or_none,
+)
+from ..utils.traversal import traverse_obj
+
+
+class JTBCIE(InfoExtractor):
+    IE_DESC = 'jtbc.co.kr'
+    _VALID_URL = r'''(?x)
+        https?://(?:
+            vod\.jtbc\.co\.kr/player/(?:program|clip)
+            |tv\.jtbc\.co\.kr/(?:replay|trailer|clip)/pr\d+/pm\d+
+        )/(?P<id>(?:ep|vo)\d+)'''
+    _GEO_COUNTRIES = ['KR']
+
+    _TESTS = [{
+        'url': 'https://tv.jtbc.co.kr/replay/pr10011629/pm10067930/ep20216321/view',
+        'md5': 'e6ade71d8c8685bbfd6e6ce4167c6a6c',
+        'info_dict': {
+            'id': 'VO10721192',
+            'display_id': 'ep20216321',
+            'ext': 'mp4',
+            'title': '힘쎈여자 강남순 2회 다시보기',
+            'description': 'md5:043c1d9019100ce271dba09995dbd1e2',
+            'duration': 3770.0,
+            'release_date': '20231008',
+            'age_limit': 15,
+            'thumbnail': 'https://fs.jtbc.co.kr//joydata/CP00000001/prog/drama/stronggirlnamsoon/img/20231008_163541_522_1.jpg',
+            'series': '힘쎈여자 강남순',
+        },
+    }, {
+        'url': 'https://vod.jtbc.co.kr/player/program/ep20216733',
+        'md5': '217a6d190f115a75e4bda0ceaa4cd7f4',
+        'info_dict': {
+            'id': 'VO10721429',
+            'display_id': 'ep20216733',
+            'ext': 'mp4',
+            'title': '헬로 마이 닥터 친절한 진료실 149회 다시보기',
+            'description': 'md5:1d70788a982dd5de26874a92fcffddb8',
+            'duration': 2720.0,
+            'release_date': '20231009',
+            'age_limit': 15,
+            'thumbnail': 'https://fs.jtbc.co.kr//joydata/CP00000001/prog/culture/hellomydoctor/img/20231009_095002_528_1.jpg',
+            'series': '헬로 마이 닥터 친절한 진료실',
+        },
+    }, {
+        'url': 'https://vod.jtbc.co.kr/player/clip/vo10721270',
+        'md5': '05782e2dc22a9c548aebefe62ae4328a',
+        'info_dict': {
+            'id': 'VO10721270',
+            'display_id': 'vo10721270',
+            'ext': 'mp4',
+            'title': '뭉쳐야 찬다3 2회 예고편 - A매치로 향하는 마지막 관문💥',
+            'description': 'md5:d48b51a8655c84843b4ed8d0c39aae68',
+            'duration': 46.0,
+            'release_date': '20231015',
+            'age_limit': 15,
+            'thumbnail': 'https://fs.jtbc.co.kr//joydata/CP00000001/prog/enter/soccer3/img/20231008_210957_775_1.jpg',
+            'series': '뭉쳐야 찬다3',
+        },
+    }, {
+        'url': 'https://tv.jtbc.co.kr/trailer/pr10010392/pm10032526/vo10720912/view',
+        'md5': '367d480eb3ef54a9cd7a4b4d69c4b32d',
+        'info_dict': {
+            'id': 'VO10720912',
+            'display_id': 'vo10720912',
+            'ext': 'mp4',
+            'title': '아는 형님 404회 예고편 | 10월 14일(토) 저녁 8시 50분 방송!',
+            'description': 'md5:2743bb1079ceb85bb00060f2ad8f0280',
+            'duration': 148.0,
+            'release_date': '20231014',
+            'age_limit': 15,
+            'thumbnail': 'https://fs.jtbc.co.kr//joydata/CP00000001/prog/enter/jtbcbros/img/20231006_230023_802_1.jpg',
+            'series': '아는 형님',
+        },
+    }]
+
+    def _real_extract(self, url):
+        display_id = self._match_id(url)
+
+        if display_id.startswith('vo'):
+            video_id = display_id.upper()
+        else:
+            webpage = self._download_webpage(url, display_id)
+            video_id = self._search_regex(r'data-vod="(VO\d+)"', webpage, 'vod id')
+
+        playback_data = self._download_json(
+            f'https://api.jtbc.co.kr/vod/{video_id}', video_id, note='Downloading VOD playback data')
+
+        subtitles = {}
+        for sub in traverse_obj(playback_data, ('tracks', lambda _, v: v['file'])):
+            subtitles.setdefault(sub.get('label', 'und'), []).append({'url': sub['file']})
+
+        formats = []
+        for stream_url in traverse_obj(playback_data, ('sources', 'HLS', ..., 'file', {url_or_none})):
+            stream_url = re.sub(r'/playlist(?:_pd\d+)?\.m3u8', '/index.m3u8', stream_url)
+            formats.extend(self._extract_m3u8_formats(stream_url, video_id, fatal=False))
+
+        metadata = self._download_json(
+            'https://now-api.jtbc.co.kr/v1/vod/detail', video_id,
+            note='Downloading mobile details', fatal=False, query={'vodFileId': video_id})
+        return {
+            'id': video_id,
+            'display_id': display_id,
+            **traverse_obj(metadata, ('vodDetail', {
+                'title': 'vodTitleView',
+                'series': 'programTitle',
+                'age_limit': ('watchAge', {int_or_none}),
+                'release_date': ('broadcastDate', {lambda x: re.match(r'\d{8}', x.replace('.', ''))}, 0),
+                'description': 'episodeContents',
+                'thumbnail': ('imgFileUrl', {url_or_none}),
+            })),
+            'duration': parse_duration(playback_data.get('playTime')),
+            'formats': formats,
+            'subtitles': subtitles,
+        }
+
+
+class JTBCProgramIE(InfoExtractor):
+    IE_NAME = 'JTBC:program'
+    _VALID_URL = r'https?://(?:vod\.jtbc\.co\.kr/program|tv\.jtbc\.co\.kr/replay)/(?P<id>pr\d+)/(?:replay|pm\d+)/?(?:$|[?#])'
+
+    _TESTS = [{
+        'url': 'https://tv.jtbc.co.kr/replay/pr10010392/pm10032710',
+        'info_dict': {
+            '_type': 'playlist',
+            'id': 'pr10010392',
+        },
+        'playlist_count': 398,
+    }, {
+        'url': 'https://vod.jtbc.co.kr/program/pr10011491/replay',
+        'info_dict': {
+            '_type': 'playlist',
+            'id': 'pr10011491',
+        },
+        'playlist_count': 59,
+    }]
+
+    def _real_extract(self, url):
+        program_id = self._match_id(url)
+
+        vod_list = self._download_json(
+            'https://now-api.jtbc.co.kr/v1/vodClip/programHome/programReplayVodList', program_id,
+            note='Downloading program replay list', query={
+                'programId': program_id,
+                'rowCount': '10000',
+            })
+
+        entries = [self.url_result(f'https://vod.jtbc.co.kr/player/program/{video_id}', JTBCIE, video_id)
+                   for video_id in traverse_obj(vod_list, ('programReplayVodList', ..., 'episodeId'))]
+        return self.playlist_result(entries, program_id)
diff --git a/yt_dlp/extractor/keezmovies.py b/yt_dlp/extractor/keezmovies.py
deleted file mode 100644
index b50da420c..000000000
--- a/yt_dlp/extractor/keezmovies.py
+++ /dev/null
@@ -1,125 +0,0 @@
-import re
-
-from .common import InfoExtractor
-from ..aes import aes_decrypt_text
-from ..compat import compat_urllib_parse_unquote
-from ..utils import (
-    determine_ext,
-    format_field,
-    int_or_none,
-    str_to_int,
-    strip_or_none,
-    url_or_none,
-)
-
-
-class KeezMoviesIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:www\.)?keezmovies\.com/video/(?:(?P<display_id>[^/]+)-)?(?P<id>\d+)'
-    _TESTS = [{
-        'url': 'https://www.keezmovies.com/video/arab-wife-want-it-so-bad-i-see-she-thirsty-and-has-tiny-money-18070681',
-        'md5': '2ac69cdb882055f71d82db4311732a1a',
-        'info_dict': {
-            'id': '18070681',
-            'display_id': 'arab-wife-want-it-so-bad-i-see-she-thirsty-and-has-tiny-money',
-            'ext': 'mp4',
-            'title': 'Arab wife want it so bad I see she thirsty and has tiny money.',
-            'thumbnail': None,
-            'view_count': int,
-            'age_limit': 18,
-        }
-    }, {
-        'url': 'http://www.keezmovies.com/video/18070681',
-        'only_matching': True,
-    }]
-
-    def _extract_info(self, url, fatal=True):
-        mobj = self._match_valid_url(url)
-        video_id = mobj.group('id')
-        display_id = (mobj.group('display_id')
-                      if 'display_id' in mobj.groupdict()
-                      else None) or mobj.group('id')
-
-        webpage = self._download_webpage(
-            url, display_id, headers={'Cookie': 'age_verified=1'})
-
-        formats = []
-        format_urls = set()
-
-        title = None
-        thumbnail = None
-        duration = None
-        encrypted = False
-
-        def extract_format(format_url, height=None):
-            format_url = url_or_none(format_url)
-            if not format_url or not format_url.startswith(('http', '//')):
-                return
-            if format_url in format_urls:
-                return
-            format_urls.add(format_url)
-            tbr = int_or_none(self._search_regex(
-                r'[/_](\d+)[kK][/_]', format_url, 'tbr', default=None))
-            if not height:
-                height = int_or_none(self._search_regex(
-                    r'[/_](\d+)[pP][/_]', format_url, 'height', default=None))
-            if encrypted:
-                format_url = aes_decrypt_text(
-                    video_url, title, 32).decode('utf-8')
-            formats.append({
-                'url': format_url,
-                'format_id': format_field(height, None, '%dp'),
-                'height': height,
-                'tbr': tbr,
-            })
-
-        flashvars = self._parse_json(
-            self._search_regex(
-                r'flashvars\s*=\s*({.+?});', webpage,
-                'flashvars', default='{}'),
-            display_id, fatal=False)
-
-        if flashvars:
-            title = flashvars.get('video_title')
-            thumbnail = flashvars.get('image_url')
-            duration = int_or_none(flashvars.get('video_duration'))
-            encrypted = flashvars.get('encrypted') is True
-            for key, value in flashvars.items():
-                mobj = re.search(r'quality_(\d+)[pP]', key)
-                if mobj:
-                    extract_format(value, int(mobj.group(1)))
-            video_url = flashvars.get('video_url')
-            if video_url and determine_ext(video_url, None):
-                extract_format(video_url)
-
-        video_url = self._html_search_regex(
-            r'flashvars\.video_url\s*=\s*(["\'])(?P<url>http.+?)\1',
-            webpage, 'video url', default=None, group='url')
-        if video_url:
-            extract_format(compat_urllib_parse_unquote(video_url))
-
-        if not formats:
-            if 'title="This video is no longer available"' in webpage:
-                self.raise_no_formats(
-                    'Video %s is no longer available' % video_id, expected=True)
-
-        if not title:
-            title = self._html_search_regex(
-                r'<h1[^>]*>([^<]+)', webpage, 'title')
-
-        return webpage, {
-            'id': video_id,
-            'display_id': display_id,
-            'title': strip_or_none(title),
-            'thumbnail': thumbnail,
-            'duration': duration,
-            'age_limit': 18,
-            'formats': formats,
-        }
-
-    def _real_extract(self, url):
-        webpage, info = self._extract_info(url, fatal=False)
-        if not info['formats']:
-            return self.url_result(url, 'Generic')
-        info['view_count'] = str_to_int(self._search_regex(
-            r'<b>([\d,.]+)</b> Views?', webpage, 'view count', fatal=False))
-        return info
diff --git a/yt_dlp/extractor/kinja.py b/yt_dlp/extractor/kinja.py
index df1386fb8..f4e5c4c47 100644
--- a/yt_dlp/extractor/kinja.py
+++ b/yt_dlp/extractor/kinja.py
@@ -12,7 +12,7 @@


 class KinjaEmbedIE(InfoExtractor):
-    IENAME = 'kinja:embed'
+    IE_NAME = 'kinja:embed'
     _DOMAIN_REGEX = r'''(?:[^.]+\.)?
         (?:
             avclub|
@@ -41,7 +41,6 @@ class KinjaEmbedIE(InfoExtractor):
             kinjavideo|
             mcp|
             megaphone|
-            ooyala|
             soundcloud(?:-playlist)?|
             tumblr-post|
             twitch-stream|
@@ -61,9 +60,6 @@ class KinjaEmbedIE(InfoExtractor):
     }, {
         'url': 'https://kinja.com/ajax/inset/iframe?id=megaphone-PPY1300931075',
         'only_matching': True,
-    }, {
-        'url': 'https://kinja.com/ajax/inset/iframe?id=ooyala-xzMXhleDpopuT0u1ijt_qZj3Va-34pEX%2FZTIxYmJjZDM2NWYzZDViZGRiOWJjYzc5',
-        'only_matching': True,
     }, {
         'url': 'https://kinja.com/ajax/inset/iframe?id=soundcloud-128574047',
         'only_matching': True,
@@ -103,7 +99,6 @@ class KinjaEmbedIE(InfoExtractor):
         'jwplayer-video': _JWPLATFORM_PROVIDER,
         'jwp-video': _JWPLATFORM_PROVIDER,
         'megaphone': ('player.megaphone.fm/', 'Generic'),
-        'ooyala': ('player.ooyala.com/player.js?embedCode=', 'Ooyala'),
         'soundcloud': ('api.soundcloud.com/tracks/', 'Soundcloud'),
         'soundcloud-playlist': ('api.soundcloud.com/playlists/', 'SoundcloudPlaylist'),
         'tumblr-post': ('%s.tumblr.com/post/%s', 'Tumblr'),
@@ -129,8 +124,6 @@ def _real_extract(self, url):
             video_id, playlist_id = video_id.split('/')
             result_url = provider[0] % (video_id, playlist_id)
         else:
-            if video_type == 'ooyala':
-                video_id = video_id.split('/')[0]
             result_url = provider[0] + video_id

         return self.url_result('http://' + result_url, provider[1])
diff --git a/yt_dlp/extractor/kommunetv.py b/yt_dlp/extractor/kommunetv.py
index e21e556be..a30905b57 100644
--- a/yt_dlp/extractor/kommunetv.py
+++ b/yt_dlp/extractor/kommunetv.py
@@ -3,7 +3,7 @@


 class KommunetvIE(InfoExtractor):
-    _VALID_URL = r'https://(\w+).kommunetv.no/archive/(?P<id>\w+)'
+    _VALID_URL = r'https://\w+\.kommunetv\.no/archive/(?P<id>\w+)'
     _TEST = {
         'url': 'https://oslo.kommunetv.no/archive/921',
         'md5': '5f102be308ee759be1e12b63d5da4bbc',
diff --git a/yt_dlp/extractor/kukululive.py b/yt_dlp/extractor/kukululive.py
new file mode 100644
index 000000000..86ab5d40e
--- /dev/null
+++ b/yt_dlp/extractor/kukululive.py
@@ -0,0 +1,140 @@
+import urllib.parse
+
+from .common import InfoExtractor
+from ..utils import (
+    ExtractorError,
+    clean_html,
+    filter_dict,
+    get_element_by_id,
+    int_or_none,
+    join_nonempty,
+    js_to_json,
+    qualities,
+    url_or_none,
+    urljoin,
+)
+from ..utils.traversal import traverse_obj
+
+
+class KukuluLiveIE(InfoExtractor):
+    _VALID_URL = r'https?://live\.erinn\.biz/live\.php\?h(?P<id>\d+)'
+    _TESTS = [{
+        'url': 'https://live.erinn.biz/live.php?h675134569',
+        'md5': 'e380fa6a47fc703d91cea913ab44ec2e',
+        'info_dict': {
+            'id': '675134569',
+            'ext': 'mp4',
+            'title': 'プロセカ',
+            'description': 'テストも兼ねたプロセカ配信。',
+            'timestamp': 1702689148,
+            'upload_date': '20231216',
+            'thumbnail': r're:^https?://.*',
+        },
+    }, {
+        'url': 'https://live.erinn.biz/live.php?h102338092',
+        'md5': 'dcf5167a934b1c60333461e13a81a6e2',
+        'info_dict': {
+            'id': '102338092',
+            'ext': 'mp4',
+            'title': 'Among Usで遊びます!!',
+            'description': 'VTuberになりましたねんねこ㌨ですよろしくお願いします',
+            'timestamp': 1704603118,
+            'upload_date': '20240107',
+            'thumbnail': r're:^https?://.*',
+        },
+    }, {
+        'url': 'https://live.erinn.biz/live.php?h878049531',
+        'only_matching': True,
+    }]
+
+    def _get_quality_meta(self, video_id, desc, code, force_h264=None):
+        desc += ' (force_h264)' if force_h264 else ''
+        qs = self._download_webpage(
+            'https://live.erinn.biz/live.player.fplayer.php', video_id,
+            f'Downloading {desc} quality metadata', f'Unable to download {desc} quality metadata',
+            query=filter_dict({
+                'hash': video_id,
+                'action': f'get{code}liveByAjax',
+                'force_h264': force_h264,
+            }))
+        return
urllib.parse.parse_qs(qs) + + def _add_quality_formats(self, formats, quality_meta): + vcodec = traverse_obj(quality_meta, ('vcodec', 0, {str})) + quality = traverse_obj(quality_meta, ('now_quality', 0, {str})) + quality_priority = qualities(('low', 'h264', 'high'))(quality) + if traverse_obj(quality_meta, ('hlsaddr', 0, {url_or_none})): + formats.append({ + 'format_id': quality, + 'url': quality_meta['hlsaddr'][0], + 'ext': 'mp4', + 'vcodec': vcodec, + 'quality': quality_priority, + }) + if traverse_obj(quality_meta, ('hlsaddr_audioonly', 0, {url_or_none})): + formats.append({ + 'format_id': join_nonempty(quality, 'audioonly'), + 'url': quality_meta['hlsaddr_audioonly'][0], + 'ext': 'm4a', + 'vcodec': 'none', + 'quality': quality_priority, + }) + + def _real_extract(self, url): + video_id = self._match_id(url) + html = self._download_webpage(url, video_id) + + if '>タイムシフトが見つかりませんでした。<' in html: + raise ExtractorError('This stream has expired', expected=True) + + title = clean_html( + get_element_by_id('livetitle', html.replace('<SPAN', '<span').replace('SPAN>', 'span>'))) + description = self._html_search_meta('Description', html) + thumbnail = self._html_search_meta(['og:image', 'twitter:image'], html) + + if self._search_regex(r'(var\s+timeshift\s*=\s*false)', html, 'is livestream', default=False): + formats = [] + for (desc, code) in [('high', 'Z'), ('low', 'ForceLow')]: + quality_meta = self._get_quality_meta(video_id, desc, code) + self._add_quality_formats(formats, quality_meta) + if desc == 'high' and traverse_obj(quality_meta, ('vcodec', 0)) == 'HEVC': + self._add_quality_formats( + formats, self._get_quality_meta(video_id, desc, code, force_h264='1')) + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'is_live': True, + 'formats': formats, + } + + # VOD extraction + player_html = self._download_webpage( + 'https://live.erinn.biz/live.timeshift.fplayer.php', video_id, + 'Downloading player html', 'Unable to download player html', query={'hash': video_id}) + + sources = traverse_obj(self._search_json( + r'var\s+fplayer_source\s*=', player_html, 'stream data', video_id, + contains_pattern=r'\[(?s:.+)\]', transform_source=js_to_json), lambda _, v: v['file']) + + def entries(segments, playlist=True): + for i, segment in enumerate(segments, 1): + yield { + 'id': f'{video_id}_{i}' if playlist else video_id, + 'title': f'{title} (Part {i})' if playlist else title, + 'description': description, + 'timestamp': traverse_obj(segment, ('time_start', {int_or_none})), + 'thumbnail': thumbnail, + 'formats': [{ + 'url': urljoin('https://live.erinn.biz', segment['file']), + 'ext': 'mp4', + 'protocol': 'm3u8_native', + }], + } + + if len(sources) == 1: + return next(entries(sources, playlist=False)) + + return self.playlist_result(entries(sources), video_id, title, description, multi_video=True) diff --git a/yt_dlp/extractor/la7.py b/yt_dlp/extractor/la7.py index a3cd12b00..f5fd24134 100644 --- a/yt_dlp/extractor/la7.py +++ b/yt_dlp/extractor/la7.py @@ -208,9 +208,9 @@ class LA7PodcastIE(LA7PodcastEpisodeIE): # XXX: Do not subclass from concrete I 'url': 'https://www.la7.it/propagandalive/podcast', 'info_dict': { 'id': 'propagandalive', - 'title': "Propaganda Live", + 'title': 'Propaganda Live', }, - 'playlist_count_min': 10, + 'playlist_mincount': 10, }] def _real_extract(self, url): diff --git a/yt_dlp/extractor/laola1tv.py b/yt_dlp/extractor/laola1tv.py deleted file mode 100644 index 416dd7eb4..000000000 --- a/yt_dlp/extractor/laola1tv.py 
+++ /dev/null @@ -1,261 +0,0 @@ -import json -import re - -from .common import InfoExtractor -from ..utils import ( - ExtractorError, - unified_strdate, - urlencode_postdata, - xpath_element, - xpath_text, - update_url_query, - js_to_json, -) - - -class Laola1TvEmbedIE(InfoExtractor): - IE_NAME = 'laola1tv:embed' - _VALID_URL = r'https?://(?:www\.)?laola1\.tv/titanplayer\.php\?.*?\bvideoid=(?P<id>\d+)' - _TESTS = [{ - # flashvars.premium = "false"; - 'url': 'https://www.laola1.tv/titanplayer.php?videoid=708065&type=V&lang=en&portal=int&customer=1024', - 'info_dict': { - 'id': '708065', - 'ext': 'mp4', - 'title': 'MA Long CHN - FAN Zhendong CHN', - 'uploader': 'ITTF - International Table Tennis Federation', - 'upload_date': '20161211', - }, - }] - - def _extract_token_url(self, stream_access_url, video_id, data): - return self._download_json( - self._proto_relative_url(stream_access_url, 'https:'), video_id, - headers={ - 'Content-Type': 'application/json', - }, data=json.dumps(data).encode())['data']['stream-access'][0] - - def _extract_formats(self, token_url, video_id): - token_doc = self._download_xml( - token_url, video_id, 'Downloading token', - headers=self.geo_verification_headers()) - - token_attrib = xpath_element(token_doc, './/token').attrib - - if token_attrib['status'] != '0': - raise ExtractorError( - 'Token error: %s' % token_attrib['comment'], expected=True) - - formats = self._extract_akamai_formats( - '%s?hdnea=%s' % (token_attrib['url'], token_attrib['auth']), - video_id) - return formats - - def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - flash_vars = self._search_regex( - r'(?s)flashvars\s*=\s*({.+?});', webpage, 'flash vars') - - def get_flashvar(x, *args, **kwargs): - flash_var = self._search_regex( - r'%s\s*:\s*"([^"]+)"' % x, - flash_vars, x, default=None) - if not flash_var: - flash_var = self._search_regex([ - r'flashvars\.%s\s*=\s*"([^"]+)"' % x, - r'%s\s*=\s*"([^"]+)"' % x], - webpage, x, *args, **kwargs) - return flash_var - - hd_doc = self._download_xml( - 'http://www.laola1.tv/server/hd_video.php', video_id, query={ - 'play': get_flashvar('streamid'), - 'partner': get_flashvar('partnerid'), - 'portal': get_flashvar('portalid'), - 'lang': get_flashvar('sprache'), - 'v5ident': '', - }) - - _v = lambda x, **k: xpath_text(hd_doc, './/video/' + x, **k) - title = _v('title', fatal=True) - - token_url = None - premium = get_flashvar('premium', default=None) - if premium: - token_url = update_url_query( - _v('url', fatal=True), { - 'timestamp': get_flashvar('timestamp'), - 'auth': get_flashvar('auth'), - }) - else: - data_abo = urlencode_postdata( - dict((i, v) for i, v in enumerate(_v('req_liga_abos').split(',')))) - stream_access_url = update_url_query( - 'https://club.laola1.tv/sp/laola1/api/v3/user/session/premium/player/stream-access', { - 'videoId': _v('id'), - 'target': self._search_regex(r'vs_target = (\d+);', webpage, 'vs target'), - 'label': _v('label'), - 'area': _v('area'), - }) - token_url = self._extract_token_url(stream_access_url, video_id, data_abo) - - formats = self._extract_formats(token_url, video_id) - - categories_str = _v('meta_sports') - categories = categories_str.split(',') if categories_str else [] - is_live = _v('islive') == 'true' - - return { - 'id': video_id, - 'title': title, - 'upload_date': unified_strdate(_v('time_date')), - 'uploader': _v('meta_organisation'), - 'categories': categories, - 'is_live': is_live, - 'formats': formats, - } - - -class 
Laola1TvBaseIE(Laola1TvEmbedIE): # XXX: Do not subclass from concrete IE - def _extract_video(self, url): - display_id = self._match_id(url) - webpage = self._download_webpage(url, display_id) - - if 'Dieser Livestream ist bereits beendet.' in webpage: - raise ExtractorError('This live stream has already finished.', expected=True) - - conf = self._parse_json(self._search_regex( - r'(?s)conf\s*=\s*({.+?});', webpage, 'conf'), - display_id, - transform_source=lambda s: js_to_json(re.sub(r'shareurl:.+,', '', s))) - video_id = conf['videoid'] - - config = self._download_json(conf['configUrl'], video_id, query={ - 'videoid': video_id, - 'partnerid': conf['partnerid'], - 'language': conf.get('language', ''), - 'portal': conf.get('portalid', ''), - }) - error = config.get('error') - if error: - raise ExtractorError('%s said: %s' % (self.IE_NAME, error), expected=True) - - video_data = config['video'] - title = video_data['title'] - is_live = video_data.get('isLivestream') and video_data.get('isLive') - meta = video_data.get('metaInformation') - sports = meta.get('sports') - categories = sports.split(',') if sports else [] - - token_url = self._extract_token_url( - video_data['streamAccess'], video_id, - video_data['abo']['required']) - - formats = self._extract_formats(token_url, video_id) - - return { - 'id': video_id, - 'display_id': display_id, - 'title': title, - 'description': video_data.get('description'), - 'thumbnail': video_data.get('image'), - 'categories': categories, - 'formats': formats, - 'is_live': is_live, - } - - -class Laola1TvIE(Laola1TvBaseIE): - IE_NAME = 'laola1tv' - _VALID_URL = r'https?://(?:www\.)?laola1\.tv/[a-z]+-[a-z]+/[^/]+/(?P<id>[^/?#&]+)' - - _TESTS = [{ - 'url': 'http://www.laola1.tv/de-de/video/straubing-tigers-koelner-haie/227883.html', - 'info_dict': { - 'id': '227883', - 'display_id': 'straubing-tigers-koelner-haie', - 'ext': 'flv', - 'title': 'Straubing Tigers - Kölner Haie', - 'upload_date': '20140912', - 'is_live': False, - 'categories': ['Eishockey'], - }, - 'params': { - 'skip_download': True, - }, - }, { - 'url': 'http://www.laola1.tv/de-de/video/straubing-tigers-koelner-haie', - 'info_dict': { - 'id': '464602', - 'display_id': 'straubing-tigers-koelner-haie', - 'ext': 'flv', - 'title': 'Straubing Tigers - Kölner Haie', - 'upload_date': '20160129', - 'is_live': False, - 'categories': ['Eishockey'], - }, - 'params': { - 'skip_download': True, - }, - }, { - 'url': 'http://www.laola1.tv/de-de/livestream/2016-03-22-belogorie-belgorod-trentino-diatec-lde', - 'info_dict': { - 'id': '487850', - 'display_id': '2016-03-22-belogorie-belgorod-trentino-diatec-lde', - 'ext': 'flv', - 'title': 'Belogorie BELGOROD - TRENTINO Diatec', - 'upload_date': '20160322', - 'uploader': 'CEV - Europäischer Volleyball Verband', - 'is_live': True, - 'categories': ['Volleyball'], - }, - 'params': { - 'skip_download': True, - }, - 'skip': 'This live stream has already finished.', - }] - - def _real_extract(self, url): - return self._extract_video(url) - - -class EHFTVIE(Laola1TvBaseIE): - IE_NAME = 'ehftv' - _VALID_URL = r'https?://(?:www\.)?ehftv\.com/[a-z]+(?:-[a-z]+)?/[^/]+/(?P<id>[^/?#&]+)' - - _TESTS = [{ - 'url': 'https://www.ehftv.com/int/video/paris-saint-germain-handball-pge-vive-kielce/1166761', - 'info_dict': { - 'id': '1166761', - 'display_id': 'paris-saint-germain-handball-pge-vive-kielce', - 'ext': 'mp4', - 'title': 'Paris Saint-Germain Handball - PGE Vive Kielce', - 'is_live': False, - 'categories': ['Handball'], - }, - 'params': { - 'skip_download': True, - }, - }] - 
- def _real_extract(self, url): - return self._extract_video(url) - - -class ITTFIE(InfoExtractor): - _VALID_URL = r'https?://tv\.ittf\.com/video/[^/]+/(?P<id>\d+)' - _TEST = { - 'url': 'https://tv.ittf.com/video/peng-wang-wei-matsudaira-kenta/951802', - 'only_matching': True, - } - - def _real_extract(self, url): - return self.url_result( - update_url_query('https://www.laola1.tv/titanplayer.php', { - 'videoid': self._match_id(url), - 'type': 'V', - 'lang': 'en', - 'portal': 'int', - 'customer': 1024, - }), Laola1TvEmbedIE.ie_key()) diff --git a/yt_dlp/extractor/laxarxames.py b/yt_dlp/extractor/laxarxames.py new file mode 100644 index 000000000..e157f7c08 --- /dev/null +++ b/yt_dlp/extractor/laxarxames.py @@ -0,0 +1,73 @@ +import json + +from .brightcove import BrightcoveNewIE +from .common import InfoExtractor +from ..utils import ExtractorError +from ..utils.traversal import traverse_obj + + +class LaXarxaMesIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?laxarxames\.cat/(?:[^/?#]+/)*?(player|movie-details)/(?P<id>\d+)' + _NETRC_MACHINE = 'laxarxames' + _TOKEN = None + _TESTS = [{ + 'url': 'https://www.laxarxames.cat/player/3459421', + 'md5': '0966f46c34275934c19af78f3df6e2bc', + 'info_dict': { + 'id': '6339612436112', + 'ext': 'mp4', + 'title': 'Resum | UA Horta — UD Viladecans', + 'timestamp': 1697905186, + 'thumbnail': r're:https?://.*\.jpg', + 'description': '', + 'upload_date': '20231021', + 'duration': 129.44, + 'tags': ['ott', 'esports', '23-24', ' futbol', ' futbol-partits', 'elit', 'resum'], + 'uploader_id': '5779379807001', + }, + 'skip': 'Requires login', + }] + + def _perform_login(self, username, password): + if self._TOKEN: + return + + login = self._download_json( + 'https://api.laxarxames.cat/Authorization/SignIn', None, note='Logging in', headers={ + 'X-Tenantorigin': 'https://laxarxames.cat', + 'Content-Type': 'application/json', + }, data=json.dumps({ + 'Username': username, + 'Password': password, + 'Device': { + 'PlatformCode': 'WEB', + 'Name': 'Mac OS ()', + }, + }).encode(), expected_status=401) + + self._TOKEN = traverse_obj(login, ('AuthorizationToken', 'Token', {str})) + if not self._TOKEN: + raise ExtractorError('Login failed', expected=True) + + def _real_extract(self, url): + video_id = self._match_id(url) + if not self._TOKEN: + self.raise_login_required() + + media_play_info = self._download_json( + 'https://api.laxarxames.cat/Media/GetMediaPlayInfo', video_id, + data=json.dumps({ + 'MediaId': int(video_id), + 'StreamType': 'MAIN' + }).encode(), headers={ + 'Authorization': f'Bearer {self._TOKEN}', + 'X-Tenantorigin': 'https://laxarxames.cat', + 'Content-Type': 'application/json', + }) + + if not traverse_obj(media_play_info, ('ContentUrl', {str})): + self.raise_no_formats('No video found', expected=True) + + return self.url_result( + f'https://players.brightcove.net/5779379807001/default_default/index.html?videoId={media_play_info["ContentUrl"]}', + BrightcoveNewIE, video_id, media_play_info.get('Title')) diff --git a/yt_dlp/extractor/linuxacademy.py b/yt_dlp/extractor/linuxacademy.py deleted file mode 100644 index 0b1644293..000000000 --- a/yt_dlp/extractor/linuxacademy.py +++ /dev/null @@ -1,238 +0,0 @@ -import json -import random - -from .common import InfoExtractor -from ..compat import compat_b64decode, compat_str -from ..networking.exceptions import HTTPError -from ..utils import ( - clean_html, - ExtractorError, - js_to_json, - parse_duration, - try_get, - unified_timestamp, - urlencode_postdata, - urljoin, -) - - -class 
LinuxAcademyIE(InfoExtractor): - _VALID_URL = r'''(?x) - https?:// - (?:www\.)?linuxacademy\.com/cp/ - (?: - courses/lesson/course/(?P<chapter_id>\d+)/lesson/(?P<lesson_id>\d+)| - modules/view/id/(?P<course_id>\d+) - ) - ''' - _TESTS = [{ - 'url': 'https://linuxacademy.com/cp/courses/lesson/course/7971/lesson/2/module/675', - 'info_dict': { - 'id': '7971-2', - 'ext': 'mp4', - 'title': 'What Is Data Science', - 'description': 'md5:c574a3c20607144fb36cb65bdde76c99', - 'timestamp': int, # The timestamp and upload date changes - 'upload_date': r're:\d+', - 'duration': 304, - }, - 'params': { - 'skip_download': True, - }, - 'skip': 'Requires Linux Academy account credentials', - }, { - 'url': 'https://linuxacademy.com/cp/courses/lesson/course/1498/lesson/2', - 'only_matching': True, - }, { - 'url': 'https://linuxacademy.com/cp/modules/view/id/154', - 'info_dict': { - 'id': '154', - 'title': 'AWS Certified Cloud Practitioner', - 'description': 'md5:a68a299ca9bb98d41cca5abc4d4ce22c', - 'duration': 28835, - }, - 'playlist_count': 41, - 'skip': 'Requires Linux Academy account credentials', - }, { - 'url': 'https://linuxacademy.com/cp/modules/view/id/39', - 'info_dict': { - 'id': '39', - 'title': 'Red Hat Certified Systems Administrator - RHCSA (EX200) Exam Prep (legacy)', - 'description': 'md5:0f1d3369e90c3fb14a79813b863c902f', - 'duration': 89280, - }, - 'playlist_count': 73, - 'skip': 'Requires Linux Academy account credentials', - }] - - _AUTHORIZE_URL = 'https://login.linuxacademy.com/authorize' - _ORIGIN_URL = 'https://linuxacademy.com' - _CLIENT_ID = 'KaWxNn1C2Gc7n83W9OFeXltd8Utb5vvx' - _NETRC_MACHINE = 'linuxacademy' - - def _perform_login(self, username, password): - def random_string(): - return ''.join(random.choices( - '0123456789ABCDEFGHIJKLMNOPQRSTUVXYZabcdefghijklmnopqrstuvwxyz-._~', k=32)) - - webpage, urlh = self._download_webpage_handle( - self._AUTHORIZE_URL, None, 'Downloading authorize page', query={ - 'client_id': self._CLIENT_ID, - 'response_type': 'token id_token', - 'response_mode': 'web_message', - 'redirect_uri': self._ORIGIN_URL, - 'scope': 'openid email user_impersonation profile', - 'audience': self._ORIGIN_URL, - 'state': random_string(), - 'nonce': random_string(), - }) - - login_data = self._parse_json( - self._search_regex( - r'atob\(\s*(["\'])(?P<value>(?:(?!\1).)+)\1', webpage, - 'login info', group='value'), None, - transform_source=lambda x: compat_b64decode(x).decode('utf-8') - )['extraParams'] - - login_data.update({ - 'client_id': self._CLIENT_ID, - 'redirect_uri': self._ORIGIN_URL, - 'tenant': 'lacausers', - 'connection': 'Username-Password-ACG-Proxy', - 'username': username, - 'password': password, - 'sso': 'true', - }) - - login_state_url = urlh.url - - try: - login_page = self._download_webpage( - 'https://login.linuxacademy.com/usernamepassword/login', None, - 'Downloading login page', data=json.dumps(login_data).encode(), - headers={ - 'Content-Type': 'application/json', - 'Origin': 'https://login.linuxacademy.com', - 'Referer': login_state_url, - }) - except ExtractorError as e: - if isinstance(e.cause, HTTPError) and e.cause.status == 401: - error = self._parse_json(e.cause.response.read(), None) - message = error.get('description') or error['code'] - raise ExtractorError( - '%s said: %s' % (self.IE_NAME, message), expected=True) - raise - - callback_page, urlh = self._download_webpage_handle( - 'https://login.linuxacademy.com/login/callback', None, - 'Downloading callback page', - data=urlencode_postdata(self._hidden_inputs(login_page)), - headers={ 
- 'Content-Type': 'application/x-www-form-urlencoded', - 'Origin': 'https://login.linuxacademy.com', - 'Referer': login_state_url, - }) - - access_token = self._search_regex( - r'access_token=([^=&]+)', urlh.url, - 'access token', default=None) - if not access_token: - access_token = self._parse_json( - self._search_regex( - r'authorizationResponse\s*=\s*({.+?})\s*;', callback_page, - 'authorization response'), None, - transform_source=js_to_json)['response']['access_token'] - - self._download_webpage( - 'https://linuxacademy.com/cp/login/tokenValidateLogin/token/%s' - % access_token, None, 'Downloading token validation page') - - def _real_extract(self, url): - mobj = self._match_valid_url(url) - chapter_id, lecture_id, course_id = mobj.group('chapter_id', 'lesson_id', 'course_id') - item_id = course_id if course_id else '%s-%s' % (chapter_id, lecture_id) - - webpage = self._download_webpage(url, item_id) - - # course path - if course_id: - module = self._parse_json( - self._search_regex( - r'window\.module\s*=\s*({(?:(?!};)[^"]|"([^"]|\\")*")+})\s*;', webpage, 'module'), - item_id) - entries = [] - chapter_number = None - chapter = None - chapter_id = None - for item in module['items']: - if not isinstance(item, dict): - continue - - def type_field(key): - return (try_get(item, lambda x: x['type'][key], compat_str) or '').lower() - type_fields = (type_field('name'), type_field('slug')) - # Move to next module section - if 'section' in type_fields: - chapter = item.get('course_name') - chapter_id = item.get('course_module') - chapter_number = 1 if not chapter_number else chapter_number + 1 - continue - # Skip non-lessons - if 'lesson' not in type_fields: - continue - lesson_url = urljoin(url, item.get('url')) - if not lesson_url: - continue - title = item.get('title') or item.get('lesson_name') - description = item.get('md_desc') or clean_html(item.get('description')) or clean_html(item.get('text')) - entries.append({ - '_type': 'url_transparent', - 'url': lesson_url, - 'ie_key': LinuxAcademyIE.ie_key(), - 'title': title, - 'description': description, - 'timestamp': unified_timestamp(item.get('date')) or unified_timestamp(item.get('created_on')), - 'duration': parse_duration(item.get('duration')), - 'chapter': chapter, - 'chapter_id': chapter_id, - 'chapter_number': chapter_number, - }) - return { - '_type': 'playlist', - 'entries': entries, - 'id': course_id, - 'title': module.get('title'), - 'description': module.get('md_desc') or clean_html(module.get('desc')), - 'duration': parse_duration(module.get('duration')), - } - - # single video path - m3u8_url = self._parse_json( - self._search_regex( - r'player\.playlist\s*=\s*(\[.+?\])\s*;', webpage, 'playlist'), - item_id)[0]['file'] - formats = self._extract_m3u8_formats( - m3u8_url, item_id, 'mp4', entry_protocol='m3u8_native', - m3u8_id='hls') - info = { - 'id': item_id, - 'formats': formats, - } - lesson = self._parse_json( - self._search_regex( - (r'window\.lesson\s*=\s*({.+?})\s*;', - r'player\.lesson\s*=\s*({.+?})\s*;'), - webpage, 'lesson', default='{}'), item_id, fatal=False) - if lesson: - info.update({ - 'title': lesson.get('lesson_name'), - 'description': lesson.get('md_desc') or clean_html(lesson.get('desc')), - 'timestamp': unified_timestamp(lesson.get('date')) or unified_timestamp(lesson.get('created_on')), - 'duration': parse_duration(lesson.get('duration')), - }) - if not info.get('title'): - info['title'] = self._search_regex( - (r'>Lecture\s*:\s*(?P<value>[^<]+)', - r'lessonName\s*=\s*(["\'])(?P<value>(?:(?!\1).)+)\1'), 
webpage, - 'title', group='value') - return info diff --git a/yt_dlp/extractor/litv.py b/yt_dlp/extractor/litv.py index 2c7c7175e..1003fb2fd 100644 --- a/yt_dlp/extractor/litv.py +++ b/yt_dlp/extractor/litv.py @@ -6,6 +6,7 @@ int_or_none, smuggle_url, traverse_obj, + try_call, unsmuggle_url, ) @@ -96,13 +97,22 @@ def _real_extract(self, url): r'uiHlsUrl\s*=\s*testBackendData\(([^;]+)\);', webpage, 'video data', default='{}'), video_id) if not video_data: - payload = { - 'assetId': program_info['assetId'], - 'watchDevices': program_info['watchDevices'], - 'contentType': program_info['contentType'], - } + payload = {'assetId': program_info['assetId']} + puid = try_call(lambda: self._get_cookies('https://www.litv.tv/')['PUID'].value) + if puid: + payload.update({ + 'type': 'auth', + 'puid': puid, + }) + endpoint = 'getUrl' + else: + payload.update({ + 'watchDevices': program_info['watchDevices'], + 'contentType': program_info['contentType'], + }) + endpoint = 'getMainUrlNoAuth' video_data = self._download_json( - 'https://www.litv.tv/vod/ajax/getMainUrlNoAuth', video_id, + f'https://www.litv.tv/vod/ajax/{endpoint}', video_id, data=json.dumps(payload).encode('utf-8'), headers={'Content-Type': 'application/json'}) diff --git a/yt_dlp/extractor/m6.py b/yt_dlp/extractor/m6.py deleted file mode 100644 index 9dcc60164..000000000 --- a/yt_dlp/extractor/m6.py +++ /dev/null @@ -1,22 +0,0 @@ -from .common import InfoExtractor - - -class M6IE(InfoExtractor): - IE_NAME = 'm6' - _VALID_URL = r'https?://(?:www\.)?m6\.fr/[^/]+/videos/(?P<id>\d+)-[^\.]+\.html' - - _TEST = { - 'url': 'http://www.m6.fr/emission-les_reines_du_shopping/videos/11323908-emeline_est_la_reine_du_shopping_sur_le_theme_ma_fete_d_8217_anniversaire.html', - 'md5': '242994a87de2c316891428e0176bcb77', - 'info_dict': { - 'id': '11323908', - 'ext': 'mp4', - 'title': 'Emeline est la Reine du Shopping sur le thème « Ma fête d’anniversaire ! 
»', - 'description': 'md5:1212ae8fb4b7baa4dc3886c5676007c2', - 'duration': 100, - } - } - - def _real_extract(self, url): - video_id = self._match_id(url) - return self.url_result('6play:%s' % video_id, 'SixPlay', video_id) diff --git a/yt_dlp/extractor/maariv.py b/yt_dlp/extractor/maariv.py new file mode 100644 index 000000000..425a8b3b4 --- /dev/null +++ b/yt_dlp/extractor/maariv.py @@ -0,0 +1,62 @@ +from .common import InfoExtractor +from ..utils import ( + int_or_none, + parse_resolution, + unified_timestamp, + url_or_none, +) +from ..utils.traversal import traverse_obj + + +class MaarivIE(InfoExtractor): + IE_NAME = 'maariv.co.il' + _VALID_URL = r'https?://player\.maariv\.co\.il/public/player\.html\?(?:[^#]+&)?media=(?P<id>\d+)' + _EMBED_REGEX = [rf'<iframe[^>]+\bsrc=[\'"](?P<url>{_VALID_URL})'] + _TESTS = [{ + 'url': 'https://player.maariv.co.il/public/player.html?player=maariv-desktop&media=3611585', + 'info_dict': { + 'id': '3611585', + 'duration': 75, + 'ext': 'mp4', + 'upload_date': '20231009', + 'title': 'מבצע חרבות ברזל', + 'timestamp': 1696851301, + }, + }] + _WEBPAGE_TESTS = [{ + 'url': 'https://www.maariv.co.il/news/law/Article-1044008', + 'info_dict': { + 'id': '3611585', + 'duration': 75, + 'ext': 'mp4', + 'upload_date': '20231009', + 'title': 'מבצע חרבות ברזל', + 'timestamp': 1696851301, + }, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + data = self._download_json( + f'https://dal.walla.co.il/media/{video_id}?origin=player.maariv.co.il', video_id)['data'] + + formats = [] + if hls_url := traverse_obj(data, ('video', 'url', {url_or_none})): + formats.extend(self._extract_m3u8_formats(hls_url, video_id, m3u8_id='hls', fatal=False)) + + for http_format in traverse_obj(data, ('video', 'stream_urls', ..., 'stream_url', {url_or_none})): + formats.append({ + 'url': http_format, + 'format_id': 'http', + **parse_resolution(http_format), + }) + + return { + 'id': video_id, + **traverse_obj(data, { + 'title': 'title', + 'duration': ('video', 'duration', {int_or_none}), + 'timestamp': ('upload_date', {unified_timestamp}), + }), + 'formats': formats, + } diff --git a/yt_dlp/extractor/magentamusik.py b/yt_dlp/extractor/magentamusik.py new file mode 100644 index 000000000..9d86a1b21 --- /dev/null +++ b/yt_dlp/extractor/magentamusik.py @@ -0,0 +1,62 @@ +from .common import InfoExtractor +from ..utils import ExtractorError, int_or_none, join_nonempty, url_or_none +from ..utils.traversal import traverse_obj + + +class MagentaMusikIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?magentamusik\.de/(?P<id>[^/?#]+)' + + _TESTS = [{ + 'url': 'https://www.magentamusik.de/marty-friedman-woa-2023-9208205928595409235', + 'md5': 'd82dd4748f55fc91957094546aaf8584', + 'info_dict': { + 'id': '9208205928595409235', + 'display_id': 'marty-friedman-woa-2023-9208205928595409235', + 'ext': 'mp4', + 'title': 'Marty Friedman: W:O:A 2023', + 'alt_title': 'Konzert vom: 05.08.2023 13:00', + 'duration': 2760, + 'categories': ['Musikkonzert'], + 'release_year': 2023, + 'location': 'Deutschland', + } + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + player_config = self._search_json( + r'data-js-element="o-video-player__config">', webpage, 'player config', display_id, fatal=False) + if not player_config: + raise ExtractorError('No video found', expected=True) + + asset_id = player_config['assetId'] + asset_details = self._download_json( + 
f'https://wcps.t-online.de/cvss/magentamusic/vodclient/v2/assetdetails/58938/{asset_id}', + display_id, note='Downloading asset details') + + video_id = traverse_obj( + asset_details, ('content', 'partnerInformation', ..., 'reference', {str}), get_all=False) + if not video_id: + raise ExtractorError('Unable to extract video id') + + vod_data = self._download_json( + f'https://wcps.t-online.de/cvss/magentamusic/vodclient/v2/player/58935/{video_id}/Main%20Movie', video_id) + smil_url = traverse_obj( + vod_data, ('content', 'feature', 'representations', ..., + 'contentPackages', ..., 'media', 'href', {url_or_none}), get_all=False) + + return { + 'id': video_id, + 'display_id': display_id, + 'formats': self._extract_smil_formats(smil_url, video_id), + **traverse_obj(vod_data, ('content', 'feature', 'metadata', { + 'title': 'title', + 'alt_title': 'originalTitle', + 'description': 'longDescription', + 'duration': ('runtimeInSeconds', {int_or_none}), + 'location': ('countriesOfProduction', {list}, {lambda x: join_nonempty(*x, delim=', ')}), + 'release_year': ('yearOfProduction', {int_or_none}), + 'categories': ('mainGenre', {str}, {lambda x: x and [x]}), + })), + } diff --git a/yt_dlp/extractor/magentamusik360.py b/yt_dlp/extractor/magentamusik360.py deleted file mode 100644 index 5d0cb3bfb..000000000 --- a/yt_dlp/extractor/magentamusik360.py +++ /dev/null @@ -1,58 +0,0 @@ -from .common import InfoExtractor - - -class MagentaMusik360IE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?magenta-musik-360\.de/([a-z0-9-]+-(?P<id>[0-9]+)|festivals/.+)' - _TESTS = [{ - 'url': 'https://www.magenta-musik-360.de/within-temptation-wacken-2019-1-9208205928595185932', - 'md5': '65b6f060b40d90276ec6fb9b992c1216', - 'info_dict': { - 'id': '9208205928595185932', - 'ext': 'm3u8', - 'title': 'WITHIN TEMPTATION', - 'description': 'Robert Westerholt und Sharon Janny den Adel gründeten die Symphonic Metal-Band. Privat sind die Niederländer ein Paar und haben zwei Kinder. Die Single Ice Queen brachte ihnen Platin und Gold und verhalf 2002 zum internationalen Durchbruch. Charakteristisch für die Band war Anfangs der hohe Gesang von Frontfrau Sharon. Stilistisch fing die Band im Gothic Metal an. Mit neuem Sound, schnellen Gitarrenriffs und Gitarrensoli, avancierte Within Temptation zur erfolgreichen Rockband. Auch dieses Jahr wird die Band ihre Fangemeinde wieder mitreißen.', - } - }, { - 'url': 'https://www.magenta-musik-360.de/festivals/wacken-world-wide-2020-body-count-feat-ice-t', - 'md5': '81010d27d7cab3f7da0b0f681b983b7e', - 'info_dict': { - 'id': '9208205928595231363', - 'ext': 'm3u8', - 'title': 'Body Count feat. Ice-T', - 'description': 'Body Count feat. Ice-T konnten bereits im vergangenen Jahr auf dem „Holy Ground“ in Wacken überzeugen. 2020 gehen die Crossover-Metaller aus einem Club in Los Angeles auf Sendung und bringen mit ihrer Mischung aus Metal und Hip-Hop Abwechslung und ordentlich Alarm zum WWW. Bereits seit 1990 stehen die beiden Gründer Ice-T (Gesang) und Ernie C (Gitarre) auf der Bühne. 
Sieben Studioalben hat die Gruppe bis jetzt veröffentlicht, darunter das Debüt „Body Count“ (1992) mit dem kontroversen Track „Cop Killer“.', - } - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - # _match_id casts to string, but since "None" is not a valid video_id for magenta - # there is no risk for confusion - if video_id == "None": - webpage = self._download_webpage(url, video_id) - video_id = self._html_search_regex(r'data-asset-id="([^"]+)"', webpage, 'video_id') - json = self._download_json("https://wcps.t-online.de/cvss/magentamusic/vodplayer/v3/player/58935/%s/Main%%20Movie" % video_id, video_id) - xml_url = json['content']['feature']['representations'][0]['contentPackages'][0]['media']['href'] - metadata = json['content']['feature'].get('metadata') - title = None - description = None - duration = None - thumbnails = [] - if metadata: - title = metadata.get('title') - description = metadata.get('fullDescription') - duration = metadata.get('runtimeInSeconds') - for img_key in ('teaserImageWide', 'smallCoverImage'): - if img_key in metadata: - thumbnails.append({'url': metadata[img_key].get('href')}) - - xml = self._download_xml(xml_url, video_id) - final_url = xml[0][0][0].attrib['src'] - - return { - 'id': video_id, - 'title': title, - 'description': description, - 'url': final_url, - 'duration': duration, - 'thumbnails': thumbnails - } diff --git a/yt_dlp/extractor/mainstreaming.py b/yt_dlp/extractor/mainstreaming.py index fe5589d59..fd9bba8bc 100644 --- a/yt_dlp/extractor/mainstreaming.py +++ b/yt_dlp/extractor/mainstreaming.py @@ -13,7 +13,7 @@ class MainStreamingIE(InfoExtractor): - _VALID_URL = r'https?://(?:webtools-?)?(?P<host>[A-Za-z0-9-]*\.msvdn.net)/(?:embed|amp_embed|content)/(?P<id>\w+)' + _VALID_URL = r'https?://(?:webtools-?)?(?P<host>[A-Za-z0-9-]*\.msvdn\.net)/(?:embed|amp_embed|content)/(?P<id>\w+)' _EMBED_REGEX = [rf'<iframe[^>]+?src=["\']?(?P<url>{_VALID_URL})["\']?'] IE_DESC = 'MainStreaming Player' diff --git a/yt_dlp/extractor/mbn.py b/yt_dlp/extractor/mbn.py new file mode 100644 index 000000000..4917c4698 --- /dev/null +++ b/yt_dlp/extractor/mbn.py @@ -0,0 +1,89 @@ +import re + +from .common import InfoExtractor +from ..utils import ( + int_or_none, + unified_strdate, + url_or_none, +) +from ..utils.traversal import traverse_obj + + +class MBNIE(InfoExtractor): + IE_DESC = 'mbn.co.kr (매일방송)' + _VALID_URL = r'https?://(?:www\.)?mbn\.co\.kr/vod/programContents/preview(?:list)?/\d+/\d+/(?P<id>\d+)' + _TESTS = [{ + 'url': 'https://mbn.co.kr/vod/programContents/previewlist/861/5433/1276155', + 'md5': '85e1694e5b247c04d1386b7e3c90fd76', + 'info_dict': { + 'id': '1276155', + 'ext': 'mp4', + 'title': '결국 사로잡힌 권유리, 그녀를 목숨 걸고 구하려는 정일우!', + 'duration': 3891, + 'release_date': '20210703', + 'thumbnail': 'http://img.vod.mbn.co.kr/mbnvod2img/861/2021/07/03/20210703230811_20_861_1276155_360_7_0.jpg', + 'series': '보쌈 - 운명을 훔치다', + 'episode': 'Episode 19', + 'episode_number': 19, + }, + }, { + 'url': 'https://www.mbn.co.kr/vod/programContents/previewlist/835/5294/1084744', + 'md5': 'fc65d3aac85e85e0b5056f4ef99cde4a', + 'info_dict': { + 'id': '1084744', + 'ext': 'mp4', + 'title': '김정은♥최원영, 제자리를 찾은 위험한 부부! 
"결혼은 투쟁이면서, 어려운 방식이야.."', + 'duration': 93, + 'release_date': '20201124', + 'thumbnail': 'http://img.vod.mbn.co.kr/mbnvod2img/835/2020/11/25/20201125000221_21_835_1084744_360_7_0.jpg', + 'series': '나의 위험한 아내', + }, + }, { + 'url': 'https://www.mbn.co.kr/vod/programContents/preview/952/6088/1054797?next=1', + 'md5': 'c711103c72aeac8323a5cf1751f10097', + 'info_dict': { + 'id': '1054797', + 'ext': 'mp4', + 'title': '[2차 티저] MBN 주말 미니시리즈 <완벽한 결혼의 정석> l 그녀에게 주어진 두 번째 인생', + 'duration': 65, + 'release_date': '20231028', + 'thumbnail': 'http://img.vod.mbn.co.kr/vod2/952/2023/09/11/20230911130223_22_952_1054797_1080_7.jpg', + 'series': '완벽한 결혼의 정석', + }, + }] + + def _real_extract(self, url): + content_id = self._match_id(url) + webpage = self._download_webpage(url, content_id) + + content_cls_cd = self._search_regex( + r'"\?content_cls_cd=(\d+)&', webpage, 'content cls cd', fatal=False) or '20' + media_info = self._download_json( + 'https://www.mbn.co.kr/player/mbnVodPlayer_2020.mbn', content_id, + note='Fetching playback data', query={ + 'content_cls_cd': content_cls_cd, + 'content_id': content_id, + 'relay_type': '1', + }) + + formats = [] + for stream_url in traverse_obj(media_info, ('movie_list', ..., 'url', {url_or_none})): + stream_url = re.sub(r'/(?:chunk|play)list(?:_pd\d+)?\.m3u8', '/manifest.m3u8', stream_url) + final_url = url_or_none(self._download_webpage( + f'https://www.mbn.co.kr/player/mbnStreamAuth_new_vod.mbn?vod_url={stream_url}', + content_id, note='Fetching authenticated m3u8 url')) + + formats.extend(self._extract_m3u8_formats(final_url, content_id, fatal=False)) + + return { + 'id': content_id, + **traverse_obj(media_info, { + 'title': ('movie_title', {str}), + 'duration': ('play_sec', {int_or_none}), + 'release_date': ('bcast_date', {lambda x: x.replace('.', '')}, {unified_strdate}), + 'thumbnail': ('movie_start_Img', {url_or_none}), + 'series': ('prog_nm', {str}), + 'episode_number': ('ad_contentnumber', {int_or_none}), + }), + 'formats': formats, + } diff --git a/yt_dlp/extractor/mediaite.py b/yt_dlp/extractor/mediaite.py index ab253920b..32887cbde 100644 --- a/yt_dlp/extractor/mediaite.py +++ b/yt_dlp/extractor/mediaite.py @@ -2,7 +2,7 @@ class MediaiteIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?mediaite.com(?!/category)(?:/[\w-]+){2}' + _VALID_URL = r'https?://(?:www\.)?mediaite\.com(?!/category)(?:/[\w-]+){2}' _TESTS = [{ 'url': 'https://www.mediaite.com/sports/bill-burr-roasts-nfl-for-promoting-black-lives-matter-while-scheduling-more-games-after-all-the-sht-they-know-about-cte/', 'info_dict': { diff --git a/yt_dlp/extractor/mediaset.py b/yt_dlp/extractor/mediaset.py index 2d6204298..e04a1ce90 100644 --- a/yt_dlp/extractor/mediaset.py +++ b/yt_dlp/extractor/mediaset.py @@ -73,6 +73,7 @@ class MediasetIE(ThePlatformBaseIE): 'season_number': 5, 'episode_number': 5, 'chapters': [{'start_time': 0.0, 'end_time': 3409.08}, {'start_time': 3409.08, 'end_time': 6565.008}], + 'categories': ['Informazione'], }, }, { # DRM @@ -149,6 +150,7 @@ class MediasetIE(ThePlatformBaseIE): 'season_number': 12, 'episode': 'Episode 8', 'episode_number': 8, + 'categories': ['Intrattenimento'], }, 'params': { 'skip_download': True, diff --git a/yt_dlp/extractor/mediastream.py b/yt_dlp/extractor/mediastream.py index b8cb5a691..ae0fb2aed 100644 --- a/yt_dlp/extractor/mediastream.py +++ b/yt_dlp/extractor/mediastream.py @@ -3,8 +3,11 @@ from .common import InfoExtractor from ..utils import ( clean_html, + filter_dict, + parse_qs, remove_end, traverse_obj, + update_url_query, urljoin, ) 
@@ -108,7 +111,9 @@ def _real_extract(self, url): for message in [ 'Debido a tu ubicación no puedes ver el contenido', - 'You are not allowed to watch this video: Geo Fencing Restriction' + 'You are not allowed to watch this video: Geo Fencing Restriction', + 'Este contenido no está disponible en tu zona geográfica.', + 'El contenido sólo está disponible dentro de', ]: if message in webpage: self.raise_geo_restricted() @@ -118,7 +123,16 @@ def _real_extract(self, url): formats, subtitles = [], {} for video_format in player_config['src']: if video_format == 'hls': - fmts, subs = self._extract_m3u8_formats_and_subtitles(player_config['src'][video_format], video_id) + params = { + 'at': 'web-app', + 'access_token': traverse_obj(parse_qs(url), ('access_token', 0)), + } + for name, key in (('MDSTRMUID', 'uid'), ('MDSTRMSID', 'sid'), ('MDSTRMPID', 'pid'), ('VERSION', 'av')): + params[key] = self._search_regex( + rf'window\.{name}\s*=\s*["\']([^"\']+)["\'];', webpage, key, default=None) + + fmts, subs = self._extract_m3u8_formats_and_subtitles( + update_url_query(player_config['src'][video_format], filter_dict(params)), video_id) formats.extend(fmts) self._merge_subtitles(subs, target=subtitles) elif video_format == 'mpd': diff --git a/yt_dlp/extractor/meta.py b/yt_dlp/extractor/meta.py deleted file mode 100644 index 7c11e6017..000000000 --- a/yt_dlp/extractor/meta.py +++ /dev/null @@ -1,70 +0,0 @@ -from .common import InfoExtractor -from .pladform import PladformIE -from ..utils import ( - unescapeHTML, - int_or_none, - ExtractorError, -) - - -class METAIE(InfoExtractor): - _VALID_URL = r'https?://video\.meta\.ua/(?:iframe/)?(?P<id>[0-9]+)' - _TESTS = [{ - 'url': 'http://video.meta.ua/5502115.video', - 'md5': '71b6f3ee274bef16f1ab410f7f56b476', - 'info_dict': { - 'id': '5502115', - 'ext': 'mp4', - 'title': 'Sony Xperia Z camera test [HQ]', - 'description': 'Xperia Z shoots video in FullHD HDR.', - 'uploader_id': 'nomobile', - 'uploader': 'CHЁZA.TV', - 'upload_date': '20130211', - }, - 'add_ie': ['Youtube'], - }, { - 'url': 'http://video.meta.ua/iframe/5502115', - 'only_matching': True, - }, { - # pladform embed - 'url': 'http://video.meta.ua/7121015.video', - 'only_matching': True, - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - - st_html5 = self._search_regex( - r"st_html5\s*=\s*'#([^']+)'", webpage, 'uppod html5 st', default=None) - - if st_html5: - # uppod st decryption algorithm is reverse engineered from function un(s) at uppod.js - json_str = '' - for i in range(0, len(st_html5), 3): - json_str += '&#%s;' % st_html5[i:i + 3] - uppod_data = self._parse_json(unescapeHTML(json_str), video_id) - error = uppod_data.get('customnotfound') - if error: - raise ExtractorError('%s said: %s' % (self.IE_NAME, error), expected=True) - - video_url = uppod_data['file'] - info = { - 'id': video_id, - 'url': video_url, - 'title': uppod_data.get('comment') or self._og_search_title(webpage), - 'description': self._og_search_description(webpage, default=None), - 'thumbnail': uppod_data.get('poster') or self._og_search_thumbnail(webpage), - 'duration': int_or_none(self._og_search_property( - 'video:duration', webpage, default=None)), - } - if 'youtube.com/' in video_url: - info.update({ - '_type': 'url_transparent', - 'ie_key': 'Youtube', - }) - return info - - pladform_url = PladformIE._extract_url(webpage) - if pladform_url: - return self.url_result(pladform_url) diff --git a/yt_dlp/extractor/metacafe.py b/yt_dlp/extractor/metacafe.py
deleted file mode 100644 index d7f5def0e..000000000 --- a/yt_dlp/extractor/metacafe.py +++ /dev/null @@ -1,281 +0,0 @@ -import json -import re -import urllib.parse - -from .common import InfoExtractor -from ..compat import compat_parse_qs, compat_urllib_parse_unquote -from ..utils import ( - ExtractorError, - determine_ext, - get_element_by_attribute, - int_or_none, - mimetype2ext, -) - - -class MetacafeIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?metacafe\.com/watch/(?P<id>[^/]+)/(?P<display_id>[^/?#]+)' - _DISCLAIMER = 'http://www.metacafe.com/family_filter/' - _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user' - IE_NAME = 'metacafe' - _TESTS = [ - # Youtube video - { - 'add_ie': ['Youtube'], - 'url': 'http://metacafe.com/watch/yt-_aUehQsCQtM/the_electric_company_short_i_pbs_kids_go/', - 'info_dict': { - 'id': '_aUehQsCQtM', - 'ext': 'mp4', - 'upload_date': '20090102', - 'title': 'The Electric Company | "Short I" | PBS KIDS GO!', - 'description': 'md5:2439a8ef6d5a70e380c22f5ad323e5a8', - 'uploader': 'PBS', - 'uploader_id': 'PBS' - } - }, - # Normal metacafe video - { - 'url': 'http://www.metacafe.com/watch/11121940/news_stuff_you_wont_do_with_your_playstation_4/', - 'md5': '6e0bca200eaad2552e6915ed6fd4d9ad', - 'info_dict': { - 'id': '11121940', - 'ext': 'mp4', - 'title': 'News: Stuff You Won\'t Do with Your PlayStation 4', - 'uploader': 'ign', - 'description': 'Sony released a massive FAQ on the PlayStation Blog detailing the PS4\'s capabilities and limitations.', - }, - 'skip': 'Page is temporarily unavailable.', - }, - # metacafe video with family filter - { - 'url': 'http://www.metacafe.com/watch/2155630/adult_art_by_david_hart_156/', - 'md5': 'b06082c5079bbdcde677a6291fbdf376', - 'info_dict': { - 'id': '2155630', - 'ext': 'mp4', - 'title': 'Adult Art By David Hart 156', - 'uploader': '63346', - 'description': 'md5:9afac8fc885252201ad14563694040fc', - }, - 'params': { - 'skip_download': True, - }, - }, - # AnyClip video - { - 'url': 'http://www.metacafe.com/watch/an-dVVXnuY7Jh77J/the_andromeda_strain_1971_stop_the_bomb_part_3/', - 'info_dict': { - 'id': 'an-dVVXnuY7Jh77J', - 'ext': 'mp4', - 'title': 'The Andromeda Strain (1971): Stop the Bomb Part 3', - 'uploader': 'AnyClip', - 'description': 'md5:cbef0460d31e3807f6feb4e7a5952e5b', - }, - }, - # age-restricted video - { - 'url': 'http://www.metacafe.com/watch/5186653/bbc_internal_christmas_tape_79_uncensored_outtakes_etc/', - 'md5': '98dde7c1a35d02178e8ab7560fe8bd09', - 'info_dict': { - 'id': '5186653', - 'ext': 'mp4', - 'title': 'BBC INTERNAL Christmas Tape \'79 - UNCENSORED Outtakes, Etc.', - 'uploader': 'Dwayne Pipe', - 'description': 'md5:950bf4c581e2c059911fa3ffbe377e4b', - 'age_limit': 18, - }, - }, - # cbs video - { - 'url': 'http://www.metacafe.com/watch/cb-8VD4r_Zws8VP/open_this_is_face_the_nation_february_9/', - 'info_dict': { - 'id': '8VD4r_Zws8VP', - 'ext': 'flv', - 'title': 'Open: This is Face the Nation, February 9', - 'description': 'md5:8a9ceec26d1f7ed6eab610834cc1a476', - 'duration': 96, - 'uploader': 'CBSI-NEW', - 'upload_date': '20140209', - 'timestamp': 1391959800, - }, - 'params': { - # rtmp download - 'skip_download': True, - }, - }, - # Movieclips.com video - { - 'url': 'http://www.metacafe.com/watch/mv-Wy7ZU/my_week_with_marilyn_do_you_love_me/', - 'info_dict': { - 'id': 'mv-Wy7ZU', - 'ext': 'mp4', - 'title': 'My Week with Marilyn - Do You Love Me?', - 'description': 'From the movie My Week with Marilyn - Colin (Eddie Redmayne) professes his love to Marilyn 
(Michelle Williams) and gets her to promise to return to set and finish the movie.', - 'uploader': 'movie_trailers', - 'duration': 176, - }, - 'params': { - 'skip_download': 'requires rtmpdump', - } - } - ] - - def report_disclaimer(self): - self.to_screen('Retrieving disclaimer') - - def _real_extract(self, url): - # Extract id and simplified title from URL - video_id, display_id = self._match_valid_url(url).groups() - - # the video may come from an external site - m_external = re.match(r'^(\w{2})-(.*)$', video_id) - if m_external is not None: - prefix, ext_id = m_external.groups() - # Check if video comes from YouTube - if prefix == 'yt': - return self.url_result('http://www.youtube.com/watch?v=%s' % ext_id, 'Youtube') - # CBS videos use theplatform.com - if prefix == 'cb': - return self.url_result('theplatform:%s' % ext_id, 'ThePlatform') - - headers = { - # Disable family filter - 'Cookie': 'user=%s; ' % urllib.parse.quote(json.dumps({'ffilter': False})) - } - - # AnyClip videos require the flashversion cookie so that we get the link - # to the mp4 file - if video_id.startswith('an-'): - headers['Cookie'] += 'flashVersion=0; ' - - # Retrieve video webpage to extract further information - webpage = self._download_webpage(url, video_id, headers=headers) - - error = get_element_by_attribute( - 'class', 'notfound-page-title', webpage) - if error: - raise ExtractorError(error, expected=True) - - video_title = self._html_search_meta( - ['og:title', 'twitter:title'], webpage, 'title', default=None) or self._search_regex(r'<h1>(.*?)</h1>', webpage, 'title') - - # Extract URL, uploader and title from webpage - self.report_extraction(video_id) - video_url = None - mobj = re.search(r'(?m)&(?:media|video)URL=([^&]+)', webpage) - if mobj is not None: - mediaURL = compat_urllib_parse_unquote(mobj.group(1)) - video_ext = determine_ext(mediaURL) - - # Extract gdaKey if available - mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage) - if mobj is None: - video_url = mediaURL - else: - gdaKey = mobj.group(1) - video_url = '%s?__gda__=%s' % (mediaURL, gdaKey) - if video_url is None: - mobj = re.search(r'<video src="([^"]+)"', webpage) - if mobj: - video_url = mobj.group(1) - video_ext = 'mp4' - if video_url is None: - flashvars = self._search_regex( - r' name="flashvars" value="(.*?)"', webpage, 'flashvars', - default=None) - if flashvars: - vardict = compat_parse_qs(flashvars) - if 'mediaData' not in vardict: - raise ExtractorError('Unable to extract media URL') - mobj = re.search( - r'"mediaURL":"(?P<mediaURL>http.*?)",(.*?)"key":"(?P<key>.*?)"', vardict['mediaData'][0]) - if mobj is None: - raise ExtractorError('Unable to extract media URL') - mediaURL = mobj.group('mediaURL').replace('\\/', '/') - video_url = '%s?__gda__=%s' % (mediaURL, mobj.group('key')) - video_ext = determine_ext(video_url) - if video_url is None: - player_url = self._search_regex( - r"swfobject\.embedSWF\('([^']+)'", - webpage, 'config URL', default=None) - if player_url: - config_url = self._search_regex( - r'config=(.+)$', player_url, 'config URL') - config_doc = self._download_xml( - config_url, video_id, - note='Downloading video config') - smil_url = config_doc.find('.//properties').attrib['smil_file'] - smil_doc = self._download_xml( - smil_url, video_id, - note='Downloading SMIL document') - base_url = smil_doc.find('./head/meta').attrib['base'] - video_url = [] - for vn in smil_doc.findall('.//video'): - br = int(vn.attrib['system-bitrate']) - play_path = vn.attrib['src'] - video_url.append({ - 'format_id': 'smil-%d' % br, - 
'url': base_url, - 'play_path': play_path, - 'page_url': url, - 'player_url': player_url, - 'ext': play_path.partition(':')[0], - }) - if video_url is None: - flashvars = self._parse_json(self._search_regex( - r'flashvars\s*=\s*({.*});', webpage, 'flashvars', - default=None), video_id, fatal=False) - if flashvars: - video_url = [] - for source in flashvars.get('sources'): - source_url = source.get('src') - if not source_url: - continue - ext = mimetype2ext(source.get('type')) or determine_ext(source_url) - if ext == 'm3u8': - video_url.extend(self._extract_m3u8_formats( - source_url, video_id, 'mp4', - 'm3u8_native', m3u8_id='hls', fatal=False)) - else: - video_url.append({ - 'url': source_url, - 'ext': ext, - }) - - if video_url is None: - raise ExtractorError('Unsupported video type') - - description = self._html_search_meta( - ['og:description', 'twitter:description', 'description'], - webpage, 'title', fatal=False) - thumbnail = self._html_search_meta( - ['og:image', 'twitter:image'], webpage, 'title', fatal=False) - video_uploader = self._html_search_regex( - r'submitter=(.*?);|googletag\.pubads\(\)\.setTargeting\("(?:channel|submiter)","([^"]+)"\);', - webpage, 'uploader nickname', fatal=False) - duration = int_or_none( - self._html_search_meta('video:duration', webpage, default=None)) - age_limit = ( - 18 - if re.search(r'(?:"contentRating":|"rating",)"restricted"', webpage) - else 0) - - if isinstance(video_url, list): - formats = video_url - else: - formats = [{ - 'url': video_url, - 'ext': video_ext, - }] - - return { - 'id': video_id, - 'display_id': display_id, - 'description': description, - 'uploader': video_uploader, - 'title': video_title, - 'thumbnail': thumbnail, - 'age_limit': age_limit, - 'formats': formats, - 'duration': duration, - } diff --git a/yt_dlp/extractor/mgoon.py b/yt_dlp/extractor/mgoon.py deleted file mode 100644 index 2388a7192..000000000 --- a/yt_dlp/extractor/mgoon.py +++ /dev/null @@ -1,81 +0,0 @@ -from .common import InfoExtractor -from ..utils import ( - ExtractorError, - qualities, - unified_strdate, -) - - -class MgoonIE(InfoExtractor): - _VALID_URL = r'''(?x)https?://(?:www\.)? 
- (?:(:?m\.)?mgoon\.com/(?:ch/(?:.+)/v|play/view)| - video\.mgoon\.com)/(?P<id>[0-9]+)''' - _API_URL = 'http://mpos.mgoon.com/player/video?id={0:}' - _TESTS = [ - { - 'url': 'http://m.mgoon.com/ch/hi6618/v/5582148', - 'md5': 'dd46bb66ab35cf6d51cc812fd82da79d', - 'info_dict': { - 'id': '5582148', - 'uploader_id': 'hi6618', - 'duration': 240.419, - 'upload_date': '20131220', - 'ext': 'mp4', - 'title': 'md5:543aa4c27a4931d371c3f433e8cebebc', - 'thumbnail': r're:^https?://.*\.jpg$', - } - }, - { - 'url': 'http://www.mgoon.com/play/view/5582148', - 'only_matching': True, - }, - { - 'url': 'http://video.mgoon.com/5582148', - 'only_matching': True, - }, - ] - - def _real_extract(self, url): - mobj = self._match_valid_url(url) - video_id = mobj.group('id') - - data = self._download_json(self._API_URL.format(video_id), video_id) - - if data.get('errorInfo', {}).get('code') != 'NONE': - raise ExtractorError('%s encountered an error: %s' % ( - self.IE_NAME, data['errorInfo']['message']), expected=True) - - v_info = data['videoInfo'] - title = v_info.get('v_title') - thumbnail = v_info.get('v_thumbnail') - duration = v_info.get('v_duration') - upload_date = unified_strdate(v_info.get('v_reg_date')) - uploader_id = data.get('userInfo', {}).get('u_alias') - if duration: - duration /= 1000.0 - - age_limit = None - if data.get('accessInfo', {}).get('code') == 'VIDEO_STATUS_ADULT': - age_limit = 18 - - formats = [] - get_quality = qualities(['360p', '480p', '720p', '1080p']) - for fmt in data['videoFiles']: - formats.append({ - 'format_id': fmt['label'], - 'quality': get_quality(fmt['label']), - 'url': fmt['url'], - 'ext': fmt['format'], - - }) - - return { - 'id': video_id, - 'title': title, - 'formats': formats, - 'thumbnail': thumbnail, - 'duration': duration, - 'upload_date': upload_date, - 'uploader_id': uploader_id, - 'age_limit': age_limit, - } diff --git a/yt_dlp/extractor/miomio.py b/yt_dlp/extractor/miomio.py deleted file mode 100644 index 8df8cba19..000000000 --- a/yt_dlp/extractor/miomio.py +++ /dev/null @@ -1,134 +0,0 @@ -import random - -from .common import InfoExtractor -from ..compat import compat_urlparse -from ..networking import Request -from ..utils import ExtractorError, int_or_none, xpath_text - - -class MioMioIE(InfoExtractor): - IE_NAME = 'miomio.tv' - _VALID_URL = r'https?://(?:www\.)?miomio\.tv/watch/cc(?P<id>[0-9]+)' - _TESTS = [{ - # "type=video" in flashvars - 'url': 'http://www.miomio.tv/watch/cc88912/', - 'info_dict': { - 'id': '88912', - 'ext': 'flv', - 'title': '【SKY】字幕 铠武昭和VS平成 假面骑士大战FEAT战队 魔星字幕组 字幕', - 'duration': 5923, - }, - 'skip': 'Unable to load videos', - }, { - 'url': 'http://www.miomio.tv/watch/cc184024/', - 'info_dict': { - 'id': '43729', - 'title': '《动漫同人插画绘制》', - }, - 'playlist_mincount': 86, - 'skip': 'Unable to load videos', - }, { - 'url': 'http://www.miomio.tv/watch/cc173113/', - 'info_dict': { - 'id': '173113', - 'title': 'The New Macbook 2015 上手试玩与简评' - }, - 'playlist_mincount': 2, - 'skip': 'Unable to load videos', - }, { - # new 'h5' player - 'url': 'http://www.miomio.tv/watch/cc273997/', - 'md5': '0b27a4b4495055d826813f8c3a6b2070', - 'info_dict': { - 'id': '273997', - 'ext': 'mp4', - 'title': 'マツコの知らない世界【劇的進化SP!ビニール傘&冷凍食品2016】 1_2 - 16 05 31', - }, - 'skip': 'Unable to load videos', - }] - - def _extract_mioplayer(self, webpage, video_id, title, http_headers): - xml_config = self._search_regex( - r'flashvars="type=(?:sina|video)&(.+?)&', - webpage, 'xml config') - - # skipping the following page causes lags and eventually connection drop-outs - 
self._request_webpage( - 'http://www.miomio.tv/mioplayer/mioplayerconfigfiles/xml.php?id=%s&r=%s' % (id, random.randint(100, 999)), - video_id) - - vid_config_request = Request( - 'http://www.miomio.tv/mioplayer/mioplayerconfigfiles/sina.php?{0}'.format(xml_config), - headers=http_headers) - - # the following xml contains the actual configuration information on the video file(s) - vid_config = self._download_xml(vid_config_request, video_id) - - if not int_or_none(xpath_text(vid_config, 'timelength')): - raise ExtractorError('Unable to load videos!', expected=True) - - entries = [] - for f in vid_config.findall('./durl'): - segment_url = xpath_text(f, 'url', 'video url') - if not segment_url: - continue - order = xpath_text(f, 'order', 'order') - segment_id = video_id - segment_title = title - if order: - segment_id += '-%s' % order - segment_title += ' part %s' % order - entries.append({ - 'id': segment_id, - 'url': segment_url, - 'title': segment_title, - 'duration': int_or_none(xpath_text(f, 'length', 'duration'), 1000), - 'http_headers': http_headers, - }) - - return entries - - def _download_chinese_webpage(self, *args, **kwargs): - # Requests with English locales return garbage - headers = { - 'Accept-Language': 'zh-TW,en-US;q=0.7,en;q=0.3', - } - kwargs.setdefault('headers', {}).update(headers) - return self._download_webpage(*args, **kwargs) - - def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_chinese_webpage( - url, video_id) - - title = self._html_search_meta( - 'description', webpage, 'title', fatal=True) - - mioplayer_path = self._search_regex( - r'src="(/mioplayer(?:_h5)?/[^"]+)"', webpage, 'ref_path') - - if '_h5' in mioplayer_path: - player_url = compat_urlparse.urljoin(url, mioplayer_path) - player_webpage = self._download_chinese_webpage( - player_url, video_id, - note='Downloading player webpage', headers={'Referer': url}) - entries = self._parse_html5_media_entries(player_url, player_webpage, video_id) - http_headers = {'Referer': player_url} - else: - http_headers = {'Referer': 'http://www.miomio.tv%s' % mioplayer_path} - entries = self._extract_mioplayer(webpage, video_id, title, http_headers) - - if len(entries) == 1: - segment = entries[0] - segment['id'] = video_id - segment['title'] = title - segment['http_headers'] = http_headers - return segment - - return { - '_type': 'multi_video', - 'id': video_id, - 'entries': entries, - 'title': title, - 'http_headers': http_headers, - } diff --git a/yt_dlp/extractor/mlb.py b/yt_dlp/extractor/mlb.py index 72057dc97..d715b9789 100644 --- a/yt_dlp/extractor/mlb.py +++ b/yt_dlp/extractor/mlb.py @@ -355,11 +355,11 @@ class MLBArticleIE(InfoExtractor): 'info_dict': { 'id': '36db7394-343c-4ea3-b8ca-ead2e61bca9a', 'title': 'Machado\'s grab draws hilarious irate reaction', - 'modified_timestamp': 1650130737, + 'modified_timestamp': 1675888370, 'description': 'md5:a19d4eb0487b2cb304e9a176f6b67676', - 'modified_date': '20220416', + 'modified_date': '20230208', }, - 'playlist_count': 2, + 'playlist_mincount': 2, }] def _real_extract(self, url): @@ -367,15 +367,13 @@ def _real_extract(self, url): webpage = self._download_webpage(url, display_id) apollo_cache_json = self._search_json(r'window\.initState\s*=', webpage, 'window.initState', display_id)['apolloCache'] - content_data_id = traverse_obj( - apollo_cache_json, ('ROOT_QUERY', lambda k, _: k.startswith('getForgeContent'), 'id'), get_all=False) - - content_real_info = apollo_cache_json[content_data_id] + content_real_info = traverse_obj( + 
apollo_cache_json, ('ROOT_QUERY', lambda k, _: k.startswith('getArticle')), get_all=False) return self.playlist_from_matches( - traverse_obj(content_real_info, ('parts', lambda _, v: v['typename'] == 'Video', 'id')), - getter=lambda x: f'https://www.mlb.com/video/{apollo_cache_json[x]["slug"]}', - ie=MLBVideoIE, playlist_id=content_real_info.get('_translationId'), + traverse_obj(content_real_info, ('parts', lambda _, v: v['__typename'] == 'Video' or v['type'] == 'video')), + getter=lambda x: f'https://www.mlb.com/video/{x["slug"]}', + ie=MLBVideoIE, playlist_id=content_real_info.get('translationId'), title=self._html_search_meta('og:title', webpage), description=content_real_info.get('summary'), modified_timestamp=parse_iso8601(content_real_info.get('lastUpdatedDate'))) diff --git a/yt_dlp/extractor/mnet.py b/yt_dlp/extractor/mnet.py deleted file mode 100644 index 98bab2e10..000000000 --- a/yt_dlp/extractor/mnet.py +++ /dev/null @@ -1,85 +0,0 @@ -from .common import InfoExtractor -from ..utils import ( - int_or_none, - parse_duration, - parse_iso8601, -) - - -class MnetIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?mnet\.(?:com|interest\.me)/tv/vod/(?:.*?\bclip_id=)?(?P<id>[0-9]+)' - _TESTS = [{ - 'url': 'http://www.mnet.com/tv/vod/171008', - 'info_dict': { - 'id': '171008', - 'title': 'SS_이해인@히든박스', - 'description': 'md5:b9efa592c3918b615ba69fe9f8a05c55', - 'duration': 88, - 'upload_date': '20151231', - 'timestamp': 1451564040, - 'age_limit': 0, - 'thumbnails': 'mincount:5', - 'thumbnail': r're:^https?://.*\.jpg$', - 'ext': 'flv', - }, - 'params': { - # rtmp download - 'skip_download': True, - }, - }, { - 'url': 'http://mnet.interest.me/tv/vod/172790', - 'only_matching': True, - }, { - 'url': 'http://www.mnet.com/tv/vod/vod_view.asp?clip_id=172790&tabMenu=', - 'only_matching': True, - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - - # TODO: extract rtmp formats - # no stype -> rtmp url - # stype=H -> m3u8 url - # stype=M -> mpd url - info = self._download_json( - 'http://content.api.mnet.com/player/vodConfig', - video_id, 'Downloading vod config JSON', query={ - 'id': video_id, - 'ctype': 'CLIP', - 'stype': 'H', - })['data']['info'] - - title = info['title'] - - cdn_data = self._download_json( - info['cdn'], video_id, 'Downloading vod cdn JSON')['data'][0] - m3u8_url = cdn_data['url'] - token = cdn_data.get('token') - if token and token != '-': - m3u8_url += '?' 
+ token - formats = self._extract_wowza_formats( - m3u8_url, video_id, skip_protocols=['rtmp', 'rtsp', 'f4m']) - - description = info.get('ment') - duration = parse_duration(info.get('time')) - timestamp = parse_iso8601(info.get('date'), delimiter=' ') - age_limit = info.get('adult') - if age_limit is not None: - age_limit = 0 if age_limit == 'N' else 18 - thumbnails = [{ - 'id': thumb_format, - 'url': thumb['url'], - 'width': int_or_none(thumb.get('width')), - 'height': int_or_none(thumb.get('height')), - } for thumb_format, thumb in info.get('cover', {}).items() if thumb.get('url')] - - return { - 'id': video_id, - 'title': title, - 'description': description, - 'duration': duration, - 'timestamp': timestamp, - 'age_limit': age_limit, - 'thumbnails': thumbnails, - 'formats': formats, - } diff --git a/yt_dlp/extractor/mocha.py b/yt_dlp/extractor/mocha.py index 5f72b810b..2fbc0e911 100644 --- a/yt_dlp/extractor/mocha.py +++ b/yt_dlp/extractor/mocha.py @@ -3,7 +3,7 @@ class MochaVideoIE(InfoExtractor): - _VALID_URL = r'https?://video.mocha.com.vn/(?P<video_slug>[\w-]+)' + _VALID_URL = r'https?://video\.mocha\.com\.vn/(?P<video_slug>[\w-]+)' _TESTS = [{ 'url': 'http://video.mocha.com.vn/chuyen-meo-gia-su-tu-thong-diep-cuoc-song-v18694039', 'info_dict': { diff --git a/yt_dlp/extractor/moevideo.py b/yt_dlp/extractor/moevideo.py deleted file mode 100644 index fda08cae9..000000000 --- a/yt_dlp/extractor/moevideo.py +++ /dev/null @@ -1,74 +0,0 @@ -from .common import InfoExtractor -from ..utils import ( - clean_html, - int_or_none, -) - - -class MoeVideoIE(InfoExtractor): - IE_DESC = 'LetitBit video services: moevideo.net, playreplay.net and videochart.net' - _VALID_URL = r'''(?x) - https?://(?P<host>(?:www\.)? - (?:(?:moevideo|playreplay|videochart)\.net|thesame\.tv))/ - (?:video|framevideo|embed)/(?P<id>[0-9a-z]+\.[0-9A-Za-z]+)''' - _API_URL = 'http://api.letitbit.net/' - _API_KEY = 'tVL0gjqo5' - _TESTS = [ - { - 'url': 'http://moevideo.net/video/00297.0036103fe3d513ef27915216fd29', - 'md5': '129f5ae1f6585d0e9bb4f38e774ffb3a', - 'info_dict': { - 'id': '00297.0036103fe3d513ef27915216fd29', - 'ext': 'flv', - 'title': 'Sink cut out machine', - 'description': 'md5:f29ff97b663aefa760bf7ca63c8ca8a8', - 'thumbnail': r're:^https?://.*\.jpg$', - 'width': 540, - 'height': 360, - 'duration': 179, - 'filesize': 17822500, - }, - 'skip': 'Video has been removed', - }, - { - 'url': 'http://playreplay.net/video/77107.7f325710a627383d40540d8e991a', - 'md5': '74f0a014d5b661f0f0e2361300d1620e', - 'info_dict': { - 'id': '77107.7f325710a627383d40540d8e991a', - 'ext': 'flv', - 'title': 'Operacion Condor.', - 'description': 'md5:7e68cb2fcda66833d5081c542491a9a3', - 'thumbnail': r're:^https?://.*\.jpg$', - 'width': 480, - 'height': 296, - 'duration': 6027, - 'filesize': 588257923, - }, - 'skip': 'Video has been removed', - }, - ] - - def _real_extract(self, url): - host, video_id = self._match_valid_url(url).groups() - - webpage = self._download_webpage( - 'http://%s/video/%s' % (host, video_id), - video_id, 'Downloading webpage') - - title = self._og_search_title(webpage) - - embed_webpage = self._download_webpage( - 'http://%s/embed/%s' % (host, video_id), - video_id, 'Downloading embed webpage') - video = self._parse_json(self._search_regex( - r'mvplayer\("#player"\s*,\s*({.+})', - embed_webpage, 'mvplayer'), video_id)['video'] - - return { - 'id': video_id, - 'title': title, - 'thumbnail': video.get('poster') or self._og_search_thumbnail(webpage), - 'description': clean_html(self._og_search_description(webpage)), 
- 'duration': int_or_none(self._og_search_property('video:duration', webpage)), - 'url': video['ourUrl'], - } diff --git a/yt_dlp/extractor/mofosex.py b/yt_dlp/extractor/mofosex.py deleted file mode 100644 index 9cb6980c1..000000000 --- a/yt_dlp/extractor/mofosex.py +++ /dev/null @@ -1,70 +0,0 @@ -from .common import InfoExtractor -from ..utils import ( - int_or_none, - str_to_int, - unified_strdate, -) -from .keezmovies import KeezMoviesIE - - -class MofosexIE(KeezMoviesIE): # XXX: Do not subclass from concrete IE - _VALID_URL = r'https?://(?:www\.)?mofosex\.com/videos/(?P<id>\d+)/(?P<display_id>[^/?#&.]+)\.html' - _TESTS = [{ - 'url': 'http://www.mofosex.com/videos/318131/amateur-teen-playing-and-masturbating-318131.html', - 'md5': '558fcdafbb63a87c019218d6e49daf8a', - 'info_dict': { - 'id': '318131', - 'display_id': 'amateur-teen-playing-and-masturbating-318131', - 'ext': 'mp4', - 'title': 'amateur teen playing and masturbating', - 'thumbnail': r're:^https?://.*\.jpg$', - 'upload_date': '20121114', - 'view_count': int, - 'like_count': int, - 'dislike_count': int, - 'age_limit': 18, - } - }, { - # This video is no longer available - 'url': 'http://www.mofosex.com/videos/5018/japanese-teen-music-video.html', - 'only_matching': True, - }] - - def _real_extract(self, url): - webpage, info = self._extract_info(url) - - view_count = str_to_int(self._search_regex( - r'VIEWS:</span>\s*([\d,.]+)', webpage, 'view count', fatal=False)) - like_count = int_or_none(self._search_regex( - r'id=["\']amountLikes["\'][^>]*>(\d+)', webpage, - 'like count', fatal=False)) - dislike_count = int_or_none(self._search_regex( - r'id=["\']amountDislikes["\'][^>]*>(\d+)', webpage, - 'like count', fatal=False)) - upload_date = unified_strdate(self._html_search_regex( - r'Added:</span>([^<]+)', webpage, 'upload date', fatal=False)) - - info.update({ - 'view_count': view_count, - 'like_count': like_count, - 'dislike_count': dislike_count, - 'upload_date': upload_date, - 'thumbnail': self._og_search_thumbnail(webpage), - }) - - return info - - -class MofosexEmbedIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?mofosex\.com/embed/?\?.*?\bvideoid=(?P<id>\d+)' - _EMBED_REGEX = [r'<iframe[^>]+\bsrc=["\'](?P<url>(?:https?:)?//(?:www\.)?mofosex\.com/embed/?\?.*?\bvideoid=\d+)'] - _TESTS = [{ - 'url': 'https://www.mofosex.com/embed/?videoid=318131&referrer=KM', - 'only_matching': True, - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - return self.url_result( - 'http://www.mofosex.com/videos/{0}/{0}.html'.format(video_id), - ie=MofosexIE.ie_key(), video_id=video_id) diff --git a/yt_dlp/extractor/monstercat.py b/yt_dlp/extractor/monstercat.py index 7f04825fc..cf5e09969 100644 --- a/yt_dlp/extractor/monstercat.py +++ b/yt_dlp/extractor/monstercat.py @@ -24,7 +24,6 @@ class MonstercatIE(InfoExtractor): 'title': 'The Secret Language of Trees', 'id': '742779548009', 'thumbnail': 'https://www.monstercat.com/release/742779548009/cover', - 'release_year': 2023, 'release_date': '20230711', 'album': 'The Secret Language of Trees', 'album_artist': 'BT', @@ -71,7 +70,6 @@ def _real_extract(self, url): 'thumbnail': f'https://www.monstercat.com/release/{url_id}/cover', 'album_artist': try_call( lambda: get_element_by_class('h-normal text-uppercase mb-desktop-medium mb-smallish', html)), - 'release_year': int_or_none(date[:4]) if date else None, 'release_date': date, } diff --git a/yt_dlp/extractor/movieclips.py b/yt_dlp/extractor/movieclips.py deleted file mode 100644 index f7f2921fd..000000000 --- 
a/yt_dlp/extractor/movieclips.py +++ /dev/null @@ -1,47 +0,0 @@ -from .common import InfoExtractor -from ..utils import ( - smuggle_url, - float_or_none, - parse_iso8601, - update_url_query, -) - - -class MovieClipsIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?movieclips\.com/videos/.+-(?P<id>\d+)(?:\?|$)' - _TEST = { - 'url': 'http://www.movieclips.com/videos/warcraft-trailer-1-561180739597', - 'md5': '42b5a0352d4933a7bd54f2104f481244', - 'info_dict': { - 'id': 'pKIGmG83AqD9', - 'ext': 'mp4', - 'title': 'Warcraft Trailer 1', - 'description': 'Watch Trailer 1 from Warcraft (2016). Legendary’s WARCRAFT is a 3D epic adventure of world-colliding conflict based.', - 'thumbnail': r're:^https?://.*\.jpg$', - 'timestamp': 1446843055, - 'upload_date': '20151106', - 'uploader': 'Movieclips', - }, - 'add_ie': ['ThePlatform'], - 'skip': 'redirects to YouTube', - } - - def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - video = next(v for v in self._parse_json(self._search_regex( - r'var\s+__REACT_ENGINE__\s*=\s*({.+});', - webpage, 'react engine'), video_id)['playlist']['videos'] if v['id'] == video_id) - - return { - '_type': 'url_transparent', - 'ie_key': 'ThePlatform', - 'url': smuggle_url(update_url_query( - video['contentUrl'], {'mbr': 'true'}), {'force_smil_url': True}), - 'title': self._og_search_title(webpage), - 'description': self._html_search_meta('description', webpage), - 'duration': float_or_none(video.get('duration')), - 'timestamp': parse_iso8601(video.get('dateCreated')), - 'thumbnail': video.get('defaultImage'), - 'uploader': video.get('provider'), - } diff --git a/yt_dlp/extractor/msn.py b/yt_dlp/extractor/msn.py index f91c53eba..77d1806a3 100644 --- a/yt_dlp/extractor/msn.py +++ b/yt_dlp/extractor/msn.py @@ -11,6 +11,7 @@ class MSNIE(InfoExtractor): + _WORKING = False _VALID_URL = r'https?://(?:(?:www|preview)\.)?msn\.com/(?:[^/]+/)+(?P<display_id>[^/]+)/[a-z]{2}-(?P<id>[\da-zA-Z]+)' _TESTS = [{ 'url': 'https://www.msn.com/en-in/money/video/7-ways-to-get-rid-of-chest-congestion/vi-BBPxU6d', diff --git a/yt_dlp/extractor/mtv.py b/yt_dlp/extractor/mtv.py index 0d700b9a8..e192453c7 100644 --- a/yt_dlp/extractor/mtv.py +++ b/yt_dlp/extractor/mtv.py @@ -1,4 +1,5 @@ import re +import xml.etree.ElementTree from .common import InfoExtractor from ..compat import compat_str @@ -137,7 +138,7 @@ def _get_video_info(self, itemdoc, use_hls=True): mediagen_doc = self._download_xml( mediagen_url, video_id, 'Downloading video urls', fatal=False) - if mediagen_doc is False: + if not isinstance(mediagen_doc, xml.etree.ElementTree.Element): return None item = mediagen_doc.find('./video/item') diff --git a/yt_dlp/extractor/mwave.py b/yt_dlp/extractor/mwave.py deleted file mode 100644 index efbfd9d43..000000000 --- a/yt_dlp/extractor/mwave.py +++ /dev/null @@ -1,87 +0,0 @@ -from .common import InfoExtractor -from ..compat import compat_str -from ..utils import ( - int_or_none, - parse_duration, -) - - -class MwaveIE(InfoExtractor): - _VALID_URL = r'https?://mwave\.interest\.me/(?:[^/]+/)?mnettv/videodetail\.m\?searchVideoDetailVO\.clip_id=(?P<id>[0-9]+)' - _URL_TEMPLATE = 'http://mwave.interest.me/mnettv/videodetail.m?searchVideoDetailVO.clip_id=%s' - _TESTS = [{ - 'url': 'http://mwave.interest.me/mnettv/videodetail.m?searchVideoDetailVO.clip_id=168859', - # md5 is unstable - 'info_dict': { - 'id': '168859', - 'ext': 'flv', - 'title': '[M COUNTDOWN] SISTAR - SHAKE IT', - 'thumbnail': r're:^https?://.*\.jpg$', - 'uploader': 'M 
COUNTDOWN', - 'duration': 206, - 'view_count': int, - } - }, { - 'url': 'http://mwave.interest.me/en/mnettv/videodetail.m?searchVideoDetailVO.clip_id=176199', - 'only_matching': True, - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - - vod_info = self._download_json( - 'http://mwave.interest.me/onair/vod_info.m?vodtype=CL&sectorid=&endinfo=Y&id=%s' % video_id, - video_id, 'Download vod JSON') - - formats = [] - for num, cdn_info in enumerate(vod_info['cdn']): - stream_url = cdn_info.get('url') - if not stream_url: - continue - stream_name = cdn_info.get('name') or compat_str(num) - f4m_stream = self._download_json( - stream_url, video_id, - 'Download %s stream JSON' % stream_name) - f4m_url = f4m_stream.get('fileurl') - if not f4m_url: - continue - formats.extend( - self._extract_f4m_formats(f4m_url + '&hdcore=3.0.3', video_id, f4m_id=stream_name)) - - return { - 'id': video_id, - 'title': vod_info['title'], - 'thumbnail': vod_info.get('cover'), - 'uploader': vod_info.get('program_title'), - 'duration': parse_duration(vod_info.get('time')), - 'view_count': int_or_none(vod_info.get('hit')), - 'formats': formats, - } - - -class MwaveMeetGreetIE(InfoExtractor): - _VALID_URL = r'https?://mwave\.interest\.me/(?:[^/]+/)?meetgreet/view/(?P<id>\d+)' - _TESTS = [{ - 'url': 'http://mwave.interest.me/meetgreet/view/256', - 'info_dict': { - 'id': '173294', - 'ext': 'flv', - 'title': '[MEET&GREET] Park BoRam', - 'thumbnail': r're:^https?://.*\.jpg$', - 'uploader': 'Mwave', - 'duration': 3634, - 'view_count': int, - } - }, { - 'url': 'http://mwave.interest.me/en/meetgreet/view/256', - 'only_matching': True, - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - clip_id = self._html_search_regex( - r'<iframe[^>]+src="/mnettv/ifr_clip\.m\?searchVideoDetailVO\.clip_id=(\d+)', - webpage, 'clip ID') - clip_url = MwaveIE._URL_TEMPLATE % clip_id - return self.url_result(clip_url, 'Mwave', clip_id) diff --git a/yt_dlp/extractor/mychannels.py b/yt_dlp/extractor/mychannels.py deleted file mode 100644 index 8a70c1f7b..000000000 --- a/yt_dlp/extractor/mychannels.py +++ /dev/null @@ -1,35 +0,0 @@ -from .common import InfoExtractor - - -class MyChannelsIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?mychannels\.com/.*(?P<id_type>video|production)_id=(?P<id>[0-9]+)' - _TEST = { - 'url': 'https://mychannels.com/missholland/miss-holland?production_id=3416', - 'md5': 'b8993daad4262dd68d89d651c0c52c45', - 'info_dict': { - 'id': 'wUUDZZep6vQD', - 'ext': 'mp4', - 'title': 'Miss Holland joins VOTE LEAVE', - 'description': 'Miss Holland | #13 Not a potato', - 'uploader': 'Miss Holland', - } - } - - def _real_extract(self, url): - id_type, url_id = self._match_valid_url(url).groups() - webpage = self._download_webpage(url, url_id) - video_data = self._html_search_regex(r'<div([^>]+data-%s-id="%s"[^>]+)>' % (id_type, url_id), webpage, 'video data') - - def extract_data_val(attr, fatal=False): - return self._html_search_regex(r'data-%s\s*=\s*"([^"]+)"' % attr, video_data, attr, fatal=fatal) - minoto_id = extract_data_val('minoto-id') or self._search_regex(r'/id/([a-zA-Z0-9]+)', extract_data_val('video-src', True), 'minoto id') - - return { - '_type': 'url_transparent', - 'url': 'minoto:%s' % minoto_id, - 'id': url_id, - 'title': extract_data_val('title', True), - 'description': extract_data_val('description'), - 'thumbnail': extract_data_val('image'), - 'uploader': extract_data_val('channel'), - } diff --git a/yt_dlp/extractor/myvi.py
b/yt_dlp/extractor/myvi.py deleted file mode 100644 index df7200be2..000000000 --- a/yt_dlp/extractor/myvi.py +++ /dev/null @@ -1,100 +0,0 @@ -from .common import InfoExtractor -from .vimple import SprutoBaseIE - - -class MyviIE(SprutoBaseIE): - _VALID_URL = r'''(?x) - (?: - https?:// - (?:www\.)? - myvi\. - (?: - (?:ru/player|tv)/ - (?: - (?: - embed/html| - flash| - api/Video/Get - )/| - content/preloader\.swf\?.*\bid= - )| - ru/watch/ - )| - myvi: - ) - (?P<id>[\da-zA-Z_-]+) - ''' - _EMBED_REGEX = [r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//myvi\.(?:ru/player|tv)/(?:embed/html|flash)/[^"]+)\1'] - _TESTS = [{ - 'url': 'http://myvi.ru/player/embed/html/oOy4euHA6LVwNNAjhD9_Jq5Ha2Qf0rtVMVFMAZav8wObeRTZaCATzucDQIDph8hQU0', - 'md5': '571bbdfba9f9ed229dc6d34cc0f335bf', - 'info_dict': { - 'id': 'f16b2bbd-cde8-481c-a981-7cd48605df43', - 'ext': 'mp4', - 'title': 'хозяин жизни', - 'thumbnail': r're:^https?://.*\.jpg$', - 'duration': 25, - }, - }, { - 'url': 'http://myvi.ru/player/content/preloader.swf?id=oOy4euHA6LVwNNAjhD9_Jq5Ha2Qf0rtVMVFMAZav8wOYf1WFpPfc_bWTKGVf_Zafr0', - 'only_matching': True, - }, { - 'url': 'http://myvi.ru/player/api/Video/Get/oOy4euHA6LVwNNAjhD9_Jq5Ha2Qf0rtVMVFMAZav8wObeRTZaCATzucDQIDph8hQU0', - 'only_matching': True, - }, { - 'url': 'http://myvi.tv/embed/html/oTGTNWdyz4Zwy_u1nraolwZ1odenTd9WkTnRfIL9y8VOgHYqOHApE575x4_xxS9Vn0?ap=0', - 'only_matching': True, - }, { - 'url': 'http://myvi.ru/player/flash/ocp2qZrHI-eZnHKQBK4cZV60hslH8LALnk0uBfKsB-Q4WnY26SeGoYPi8HWHxu0O30', - 'only_matching': True, - }, { - 'url': 'https://www.myvi.ru/watch/YwbqszQynUaHPn_s82sx0Q2', - 'only_matching': True, - }, { - 'url': 'myvi:YwbqszQynUaHPn_s82sx0Q2', - 'only_matching': True, - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - - spruto = self._download_json( - 'http://myvi.ru/player/api/Video/Get/%s?sig' % video_id, video_id)['sprutoData'] - - return self._extract_spruto(spruto, video_id) - - -class MyviEmbedIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?myvi\.tv/(?:[^?]+\?.*?\bv=|embed/)(?P<id>[\da-z]+)' - _TESTS = [{ - 'url': 'https://www.myvi.tv/embed/ccdqic3wgkqwpb36x9sxg43t4r', - 'info_dict': { - 'id': 'b3ea0663-3234-469d-873e-7fecf36b31d1', - 'ext': 'mp4', - 'title': 'Твоя (original song).mp4', - 'thumbnail': r're:^https?://.*\.jpg$', - 'duration': 277, - }, - 'params': { - 'skip_download': True, - }, - }, { - 'url': 'https://www.myvi.tv/idmi6o?v=ccdqic3wgkqwpb36x9sxg43t4r#watch', - 'only_matching': True, - }] - - @classmethod - def suitable(cls, url): - return False if MyviIE.suitable(url) else super(MyviEmbedIE, cls).suitable(url) - - def _real_extract(self, url): - video_id = self._match_id(url) - - webpage = self._download_webpage( - 'https://www.myvi.tv/embed/%s' % video_id, video_id) - - myvi_id = self._search_regex( - r'CreatePlayer\s*\(\s*["\'].*?\bv=([\da-zA-Z_]+)', - webpage, 'video id') - - return self.url_result('myvi:%s' % myvi_id, ie=MyviIE.ie_key()) diff --git a/yt_dlp/extractor/naver.py b/yt_dlp/extractor/naver.py index 2d8459b02..806b79082 100644 --- a/yt_dlp/extractor/naver.py +++ b/yt_dlp/extractor/naver.py @@ -1,20 +1,25 @@ +import base64 +import hashlib +import hmac import itertools +import json import re -from urllib.parse import urlparse, parse_qs +import time +from urllib.parse import parse_qs, urlparse from .common import InfoExtractor from ..utils import ( ExtractorError, - clean_html, dict_get, int_or_none, join_nonempty, merge_dicts, - parse_duration, + parse_iso8601, traverse_obj, try_get, unified_timestamp, 
update_url_query, + url_or_none, ) @@ -110,6 +115,18 @@ def get_subs(caption_url): **self.process_subtitles(video_data, get_subs), } + def _call_api(self, path, video_id): + api_endpoint = f'https://apis.naver.com/now_web2/now_web_api/v1{path}' + key = b'nbxvs5nwNG9QKEWK0ADjYA4JZoujF4gHcIwvoCxFTPAeamq5eemvt5IWAYXxrbYM' + msgpad = int(time.time() * 1000) + md = base64.b64encode(hmac.HMAC( + key, f'{api_endpoint[:255]}{msgpad}'.encode(), digestmod=hashlib.sha1).digest()).decode() + + return self._download_json(api_endpoint, video_id=video_id, headers=self.geo_verification_headers(), query={ + 'msgpad': msgpad, + 'md': md, + })['result'] + class NaverIE(NaverBaseIE): _VALID_URL = r'https?://(?:m\.)?tv(?:cast)?\.naver\.com/(?:v|embed)/(?P<id>\d+)' @@ -125,21 +142,32 @@ class NaverIE(NaverBaseIE): 'upload_date': '20130903', 'uploader': '메가스터디, 합격불변의 법칙', 'uploader_id': 'megastudy', + 'uploader_url': 'https://tv.naver.com/megastudy', + 'view_count': int, + 'like_count': int, + 'comment_count': int, + 'duration': 2118, + 'thumbnail': r're:^https?://.*\.jpg', }, }, { 'url': 'http://tv.naver.com/v/395837', - 'md5': '8a38e35354d26a17f73f4e90094febd3', + 'md5': '7791205fa89dbed2f5e3eb16d287ff05', 'info_dict': { 'id': '395837', 'ext': 'mp4', 'title': '9년이 지나도 아픈 기억, 전효성의 아버지', - 'description': 'md5:eb6aca9d457b922e43860a2a2b1984d3', + 'description': 'md5:c76be23e21403a6473d8119678cdb5cb', 'timestamp': 1432030253, 'upload_date': '20150519', - 'uploader': '4가지쇼 시즌2', - 'uploader_id': 'wrappinguser29', + 'uploader': '4가지쇼', + 'uploader_id': '4show', + 'uploader_url': 'https://tv.naver.com/4show', + 'view_count': int, + 'like_count': int, + 'comment_count': int, + 'duration': 277, + 'thumbnail': r're:^https?://.*\.jpg', }, - 'skip': 'Georestricted', }, { 'url': 'http://tvcast.naver.com/v/81652', 'only_matching': True, @@ -147,56 +175,63 @@ class NaverIE(NaverBaseIE): def _real_extract(self, url): video_id = self._match_id(url) - content = self._download_json( - 'https://tv.naver.com/api/json/v/' + video_id, - video_id, headers=self.geo_verification_headers()) - player_info_json = content.get('playerInfoJson') or {} - current_clip = player_info_json.get('currentClip') or {} + data = self._call_api(f'/clips/{video_id}/play-info', video_id) - vid = current_clip.get('videoId') - in_key = current_clip.get('inKey') + vid = traverse_obj(data, ('clip', 'videoId', {str})) + in_key = traverse_obj(data, ('play', 'inKey', {str})) if not vid or not in_key: - player_auth = try_get(player_info_json, lambda x: x['playerOption']['auth']) - if player_auth == 'notCountry': - self.raise_geo_restricted(countries=['KR']) - elif player_auth == 'notLogin': - self.raise_login_required() - raise ExtractorError('couldn\'t extract vid and key') + raise ExtractorError('Unable to extract video info') + info = self._extract_video_info(video_id, vid, in_key) - info.update({ - 'description': clean_html(current_clip.get('description')), - 'timestamp': int_or_none(current_clip.get('firstExposureTime'), 1000), - 'duration': parse_duration(current_clip.get('displayPlayTime')), - 'like_count': int_or_none(current_clip.get('recommendPoint')), - 'age_limit': 19 if current_clip.get('adult') else None, - }) + info.update(traverse_obj(data, ('clip', { + 'title': 'title', + 'description': 'description', + 'timestamp': ('firstExposureDatetime', {parse_iso8601}), + 'duration': ('playTime', {int_or_none}), + 'like_count': ('likeItCount', {int_or_none}), + 'view_count': ('playCount', {int_or_none}), + 'comment_count': ('commentCount', {int_or_none}), + 
'thumbnail': ('thumbnailImageUrl', {url_or_none}), + 'uploader': 'channelName', + 'uploader_id': 'channelId', + 'uploader_url': ('channelUrl', {url_or_none}), + 'age_limit': ('adultVideo', {lambda x: 19 if x else None}), + }))) return info -class NaverLiveIE(InfoExtractor): +class NaverLiveIE(NaverBaseIE): IE_NAME = 'Naver:live' _VALID_URL = r'https?://(?:m\.)?tv(?:cast)?\.naver\.com/l/(?P<id>\d+)' _GEO_BYPASS = False _TESTS = [{ - 'url': 'https://tv.naver.com/l/52010', + 'url': 'https://tv.naver.com/l/127062', 'info_dict': { - 'id': '52010', + 'id': '127062', 'ext': 'mp4', - 'title': '[LIVE] 뉴스특보 : "수도권 거리두기, 2주간 2단계로 조정"', - 'description': 'md5:df7f0c237a5ed5e786ce5c91efbeaab3', - 'channel_id': 'NTV-ytnnews24-0', - 'start_time': 1597026780000, + 'live_status': 'is_live', + 'channel': '뉴스는 YTN', + 'channel_id': 'ytnnews24', + 'title': 're:^대한민국 24시간 뉴스 채널 [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', + 'description': 'md5:f938b5956711beab6f882314ffadf4d5', + 'start_time': 1677752280, + 'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)', + 'like_count': int, }, }, { - 'url': 'https://tv.naver.com/l/51549', + 'url': 'https://tv.naver.com/l/140535', 'info_dict': { - 'id': '51549', + 'id': '140535', 'ext': 'mp4', - 'title': '연합뉴스TV - 코로나19 뉴스특보', - 'description': 'md5:c655e82091bc21e413f549c0eaccc481', - 'channel_id': 'NTV-yonhapnewstv-0', - 'start_time': 1596406380000, + 'live_status': 'is_live', + 'channel': 'KBS뉴스', + 'channel_id': 'kbsnews', + 'start_time': 1696867320, + 'title': 're:^언제 어디서나! KBS 뉴스 24 [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', + 'description': 'md5:6ad419c0bf2f332829bda3f79c295284', + 'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)', + 'like_count': int, }, }, { 'url': 'https://tv.naver.com/l/54887', @@ -205,55 +240,27 @@ class NaverLiveIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - page = self._download_webpage(url, video_id, 'Downloading Page', 'Unable to download Page') - secure_url = self._search_regex(r'sApiF:\s+(?:"|\')([^"\']+)', page, 'secureurl') - - info = self._extract_video_info(video_id, secure_url) - info.update({ - 'description': self._og_search_description(page) - }) - - return info - - def _extract_video_info(self, video_id, url): - video_data = self._download_json(url, video_id, headers=self.geo_verification_headers()) - meta = video_data.get('meta') - status = meta.get('status') + data = self._call_api(f'/live-end/normal/{video_id}/play-info?renewLastPlayDate=true', video_id) + status = traverse_obj(data, ('live', 'liveStatus')) if status == 'CLOSED': raise ExtractorError('Stream is offline.', expected=True) elif status != 'OPENED': - raise ExtractorError('Unknown status %s' % status) - - title = meta.get('title') - stream_list = video_data.get('streams') - - if stream_list is None: - raise ExtractorError('Could not get stream data.', expected=True) - - formats = [] - for quality in stream_list: - if not quality.get('url'): - continue - - prop = quality.get('property') - if prop.get('abr'): # This abr doesn't mean Average audio bitrate. 
- continue - - formats.extend(self._extract_m3u8_formats( - quality.get('url'), video_id, 'mp4', - m3u8_id=quality.get('qualityId'), live=True - )) + raise ExtractorError(f'Unknown status {status!r}') return { 'id': video_id, - 'title': title, - 'formats': formats, - 'channel_id': meta.get('channelId'), - 'channel_url': meta.get('channelUrl'), - 'thumbnail': meta.get('imgUrl'), - 'start_time': meta.get('startTime'), - 'categories': [meta.get('categoryId')], + 'formats': self._extract_m3u8_formats( + traverse_obj(data, ('playbackBody', {json.loads}, 'media', 0, 'path')), video_id, live=True), + **traverse_obj(data, ('live', { + 'title': 'title', + 'channel': 'channelName', + 'channel_id': 'channelId', + 'description': 'description', + 'like_count': (('likeCount', 'likeItCount'), {int_or_none}), + 'thumbnail': ('thumbnailImageUrl', {url_or_none}), + 'start_time': (('startTime', 'startDateTime', 'startYmdt'), {parse_iso8601}), + }), get_all=False), 'is_live': True } diff --git a/yt_dlp/extractor/nba.py b/yt_dlp/extractor/nba.py index d8fc82488..81d11e3a5 100644 --- a/yt_dlp/extractor/nba.py +++ b/yt_dlp/extractor/nba.py @@ -97,7 +97,7 @@ def _extract_video(self, filter_key, filter_value): class NBAWatchEmbedIE(NBAWatchBaseIE): - IENAME = 'nba:watch:embed' + IE_NAME = 'nba:watch:embed' _VALID_URL = NBAWatchBaseIE._VALID_URL_BASE + r'embed\?.*?\bid=(?P<id>\d+)' _TESTS = [{ 'url': 'http://watch.nba.com/embed?id=659395', @@ -339,7 +339,7 @@ def _real_extract(self, url): class NBAEmbedIE(NBABaseIE): - IENAME = 'nba:embed' + IE_NAME = 'nba:embed' _VALID_URL = r'https?://secure\.nba\.com/assets/amp/include/video/(?:topI|i)frame\.html\?.*?\bcontentId=(?P<id>[^?#&]+)' _TESTS = [{ 'url': 'https://secure.nba.com/assets/amp/include/video/topIframe.html?contentId=teams/bulls/2020/12/04/3478774/1607105587854-20201204_SCHEDULE_RELEASE_FINAL_DRUPAL-3478774&team=bulls&adFree=false&profile=71&videoPlayerName=TAMPCVP&baseUrl=&videoAdsection=nba.com_mobile_web_teamsites_chicagobulls&Env=', @@ -361,7 +361,7 @@ def _real_extract(self, url): class NBAIE(NBABaseIE): - IENAME = 'nba' + IE_NAME = 'nba' _VALID_URL = NBABaseIE._VALID_URL_BASE + '(?!%s)video/(?P<id>(?:[^/]+/)*[^/?#&]+)' % NBABaseIE._CHANNEL_PATH_REGEX _TESTS = [{ 'url': 'https://www.nba.com/bulls/video/teams/bulls/2020/12/04/3478774/1607105587854-20201204schedulereleasefinaldrupal-3478774', @@ -388,7 +388,7 @@ def _extract_url_results(self, team, content_id): class NBAChannelIE(NBABaseIE): - IENAME = 'nba:channel' + IE_NAME = 'nba:channel' _VALID_URL = NBABaseIE._VALID_URL_BASE + '(?:%s)/(?P<id>[^/?#&]+)' % NBABaseIE._CHANNEL_PATH_REGEX _TESTS = [{ 'url': 'https://www.nba.com/blazers/video/channel/summer_league', diff --git a/yt_dlp/extractor/nbc.py b/yt_dlp/extractor/nbc.py index 666550a49..267fa8353 100644 --- a/yt_dlp/extractor/nbc.py +++ b/yt_dlp/extractor/nbc.py @@ -1,6 +1,7 @@ import base64 import json import re +import xml.etree.ElementTree from .common import InfoExtractor from .theplatform import ThePlatformIE, default_ns @@ -52,6 +53,8 @@ class NBCIE(ThePlatformIE): # XXX: Do not subclass from concrete IE 'chapters': 'count:1', 'tags': 'count:4', 'thumbnail': r're:https?://.+\.jpg', + 'categories': ['Series/The Tonight Show Starring Jimmy Fallon'], + 'media_type': 'Full Episode', }, 'params': { 'skip_download': 'm3u8', @@ -130,6 +133,8 @@ class NBCIE(ThePlatformIE): # XXX: Do not subclass from concrete IE 'tags': 'count:10', 'age_limit': 0, 'thumbnail': r're:https?://.+\.jpg', + 'categories': ['Series/Quantum Leap 2022'], + 'media_type': 
'Highlight', }, 'params': { 'skip_download': 'm3u8', @@ -803,8 +808,10 @@ def _real_extract(self, url): smil = self._download_xml( f'https://link.theplatform.com/s/{pdk_acct}/{player_id}', video_id, note='Downloading SMIL data', query=query, fatal=is_live) - subtitles = self._parse_smil_subtitles(smil, default_ns) if smil else {} - for video in smil.findall(self._xpath_ns('.//video', default_ns)) if smil else []: + if not isinstance(smil, xml.etree.ElementTree.Element): + smil = None + subtitles = self._parse_smil_subtitles(smil, default_ns) if smil is not None else {} + for video in smil.findall(self._xpath_ns('.//video', default_ns)) if smil is not None else []: info['duration'] = float_or_none(remove_end(video.get('dur'), 'ms'), 1000) video_src_url = video.get('src') ext = mimetype2ext(video.get('type'), default=determine_ext(video_src_url)) diff --git a/yt_dlp/extractor/nebula.py b/yt_dlp/extractor/nebula.py index 8fba2bcf7..136b0e10a 100644 --- a/yt_dlp/extractor/nebula.py +++ b/yt_dlp/extractor/nebula.py @@ -3,230 +3,306 @@ from .common import InfoExtractor from ..networking.exceptions import HTTPError -from ..utils import ExtractorError, make_archive_id, parse_iso8601, remove_start +from ..utils import ( + ExtractorError, + int_or_none, + make_archive_id, + parse_iso8601, + smuggle_url, + try_call, + unsmuggle_url, + update_url_query, + url_or_none, + urljoin, +) +from ..utils.traversal import traverse_obj _BASE_URL_RE = r'https?://(?:www\.|beta\.)?(?:watchnebula\.com|nebula\.app|nebula\.tv)' class NebulaBaseIE(InfoExtractor): _NETRC_MACHINE = 'watchnebula' + _token = _api_token = None - _nebula_api_token = None - _nebula_bearer_token = None - - def _perform_nebula_auth(self, username, password): - if not username or not password: - self.raise_login_required(method='password') - - data = json.dumps({'email': username, 'password': password}).encode('utf8') - response = self._download_json( - 'https://api.watchnebula.com/api/v1/auth/login/', - data=data, fatal=False, video_id=None, - headers={ - 'content-type': 'application/json', - # Submitting the 'sessionid' cookie always causes a 403 on auth endpoint - 'cookie': '' - }, - note='Logging in to Nebula with supplied credentials', - errnote='Authentication failed or rejected') - if not response or not response.get('key'): - self.raise_login_required(method='password') - - return response['key'] - - def _call_nebula_api(self, url, video_id=None, method='GET', auth_type='api', note=''): - assert method in ('GET', 'POST',) - assert auth_type in ('api', 'bearer',) - - def inner_call(): - authorization = f'Token {self._nebula_api_token}' if auth_type == 'api' else f'Bearer {self._nebula_bearer_token}' - return self._download_json( - url, video_id, note=note, headers={'Authorization': authorization}, - data=b'' if method == 'POST' else None) - + def _perform_login(self, username, password): try: - return inner_call() - except ExtractorError as exc: - # if 401 or 403, attempt credential re-auth and retry - if exc.cause and isinstance(exc.cause, HTTPError) and exc.cause.status in (401, 403): - self.to_screen(f'Reauthenticating to Nebula and retrying, because last {auth_type} call resulted in error {exc.cause.code}') - self._perform_login() - return inner_call() - else: + response = self._download_json( + 'https://nebula.tv/auth/login/', None, + 'Logging in to Nebula', 'Login failed', + data=json.dumps({'email': username, 'password': password}).encode(), + headers={'content-type': 'application/json'}) + except ExtractorError as e: + if 
isinstance(e.cause, HTTPError) and e.cause.status == 400: + raise ExtractorError('Login failed: Invalid username or password', expected=True) + raise + self._api_token = traverse_obj(response, ('key', {str})) + if not self._api_token: + raise ExtractorError('Login failed: No token') + + def _call_api(self, *args, **kwargs): + if self._token: + kwargs.setdefault('headers', {})['Authorization'] = f'Bearer {self._token}' + try: + return self._download_json(*args, **kwargs) + except ExtractorError as e: + if not isinstance(e.cause, HTTPError) or e.cause.status not in (401, 403): + raise + self.to_screen( + f'Reauthorizing with Nebula and retrying, because last API call resulted in error {e.cause.status}') + self._real_initialize() + if self._token: + kwargs.setdefault('headers', {})['Authorization'] = f'Bearer {self._token}' + return self._download_json(*args, **kwargs) + + def _real_initialize(self): + if not self._api_token: + self._api_token = try_call( + lambda: self._get_cookies('https://nebula.tv')['nebula_auth.apiToken'].value) + self._token = self._download_json( + 'https://users.api.nebula.app/api/v1/authorization/', None, + headers={'Authorization': f'Token {self._api_token}'} if self._api_token else None, + note='Authorizing to Nebula', data=b'')['token'] + + def _extract_formats(self, content_id, slug): + for retry in (False, True): + try: + fmts, subs = self._extract_m3u8_formats_and_subtitles( + f'https://content.api.nebula.app/{content_id.split(":")[0]}s/{content_id}/manifest.m3u8', + slug, 'mp4', query={ + 'token': self._token, + 'app_version': '23.10.0', + 'platform': 'ios', + }) + return {'formats': fmts, 'subtitles': subs} + except ExtractorError as e: + if isinstance(e.cause, HTTPError) and e.cause.status == 401: + self.raise_login_required() + if not retry and isinstance(e.cause, HTTPError) and e.cause.status == 403: + self.to_screen('Reauthorizing with Nebula and retrying, because fetching video resulted in error') + self._real_initialize() + continue raise - def _fetch_nebula_bearer_token(self): - """ - Get a Bearer token for the Nebula API. This will be required to fetch video meta data. 
- """ - response = self._call_nebula_api('https://api.watchnebula.com/api/v1/authorization/', - method='POST', - note='Authorizing to Nebula') - return response['token'] - - def _fetch_video_formats(self, slug): - stream_info = self._call_nebula_api(f'https://content.api.nebula.app/video/{slug}/stream/', - video_id=slug, - auth_type='bearer', - note='Fetching video stream info') - manifest_url = stream_info['manifest'] - return self._extract_m3u8_formats_and_subtitles(manifest_url, slug, 'mp4') - - def _build_video_info(self, episode): - fmts, subs = self._fetch_video_formats(episode['slug']) - channel_slug = episode['channel_slug'] - channel_title = episode['channel_title'] - zype_id = episode.get('zype_id') + def _extract_video_metadata(self, episode): + channel_url = traverse_obj( + episode, (('channel_slug', 'class_slug'), {lambda x: urljoin('https://nebula.tv/', x)}), get_all=False) return { - 'id': remove_start(episode['id'], 'video_episode:'), - 'display_id': episode['slug'], - 'formats': fmts, - 'subtitles': subs, - 'webpage_url': f'https://nebula.tv/{episode["slug"]}', - 'title': episode['title'], - 'description': episode['description'], - 'timestamp': parse_iso8601(episode['published_at']), - 'thumbnails': [{ - # 'id': tn.get('name'), # this appears to be null - 'url': tn['original'], - 'height': key, - } for key, tn in episode['assets']['thumbnail'].items()], - 'duration': episode['duration'], - 'channel': channel_title, - 'channel_id': channel_slug, - 'channel_url': f'https://nebula.tv/{channel_slug}', - 'uploader': channel_title, - 'uploader_id': channel_slug, - 'uploader_url': f'https://nebula.tv/{channel_slug}', - 'series': channel_title, - 'creator': channel_title, - 'extractor_key': NebulaIE.ie_key(), - 'extractor': NebulaIE.IE_NAME, - '_old_archive_ids': [make_archive_id(NebulaIE, zype_id)] if zype_id else None, + 'id': episode['id'].partition(':')[2], + **traverse_obj(episode, { + 'display_id': 'slug', + 'title': 'title', + 'description': 'description', + 'timestamp': ('published_at', {parse_iso8601}), + 'duration': ('duration', {int_or_none}), + 'channel_id': 'channel_slug', + 'uploader_id': 'channel_slug', + 'channel': 'channel_title', + 'uploader': 'channel_title', + 'series': 'channel_title', + 'creator': 'channel_title', + 'thumbnail': ('images', 'thumbnail', 'src', {url_or_none}), + 'episode_number': ('order', {int_or_none}), + # Old code was wrongly setting extractor_key from NebulaSubscriptionsIE + '_old_archive_ids': ('zype_id', {lambda x: [ + make_archive_id(NebulaIE, x), make_archive_id(NebulaSubscriptionsIE, x)] if x else None}), + }), + 'channel_url': channel_url, + 'uploader_url': channel_url, } - def _perform_login(self, username=None, password=None): - self._nebula_api_token = self._perform_nebula_auth(username, password) - self._nebula_bearer_token = self._fetch_nebula_bearer_token() - class NebulaIE(NebulaBaseIE): _VALID_URL = rf'{_BASE_URL_RE}/videos/(?P<id>[-\w]+)' - _TESTS = [ - { - 'url': 'https://nebula.tv/videos/that-time-disney-remade-beauty-and-the-beast', - 'md5': '14944cfee8c7beeea106320c47560efc', - 'info_dict': { - 'id': '84ed544d-4afd-4723-8cd5-2b95261f0abf', - 'ext': 'mp4', - 'title': 'That Time Disney Remade Beauty and the Beast', - 'description': 'Note: this video was originally posted on YouTube with the sponsor read included. 
We weren’t able to remove it without reducing video quality, so it’s presented here in its original context.', - 'upload_date': '20180731', - 'timestamp': 1533009600, - 'channel': 'Lindsay Ellis', - 'channel_id': 'lindsayellis', - 'uploader': 'Lindsay Ellis', - 'uploader_id': 'lindsayellis', - 'uploader_url': 'https://nebula.tv/lindsayellis', - 'series': 'Lindsay Ellis', - 'display_id': 'that-time-disney-remade-beauty-and-the-beast', - 'channel_url': 'https://nebula.tv/lindsayellis', - 'creator': 'Lindsay Ellis', - 'duration': 2212, - 'thumbnail': r're:https://\w+\.cloudfront\.net/[\w-]+\.jpeg?.*', - }, + _TESTS = [{ + 'url': 'https://nebula.tv/videos/that-time-disney-remade-beauty-and-the-beast', + 'info_dict': { + 'id': '84ed544d-4afd-4723-8cd5-2b95261f0abf', + 'ext': 'mp4', + 'title': 'That Time Disney Remade Beauty and the Beast', + 'description': 'md5:2aae3c4cfc5ee09a1ecdff0909618cf4', + 'upload_date': '20180731', + 'timestamp': 1533009600, + 'channel': 'Lindsay Ellis', + 'channel_id': 'lindsayellis', + 'uploader': 'Lindsay Ellis', + 'uploader_id': 'lindsayellis', + 'uploader_url': r're:https://nebula\.(tv|app)/lindsayellis', + 'series': 'Lindsay Ellis', + 'display_id': 'that-time-disney-remade-beauty-and-the-beast', + 'channel_url': r're:https://nebula\.(tv|app)/lindsayellis', + 'creator': 'Lindsay Ellis', + 'duration': 2212, + 'thumbnail': r're:https://\w+\.cloudfront\.net/[\w-]+', + '_old_archive_ids': ['nebula 5c271b40b13fd613090034fd', 'nebulasubscriptions 5c271b40b13fd613090034fd'], }, - { - 'url': 'https://nebula.tv/videos/the-logistics-of-d-day-landing-craft-how-the-allies-got-ashore', - 'md5': 'd05739cf6c38c09322422f696b569c23', - 'info_dict': { - 'id': '7e623145-1b44-4ca3-aa0b-ed25a247ea34', - 'ext': 'mp4', - 'title': 'Landing Craft - How The Allies Got Ashore', - 'description': r're:^In this episode we explore the unsung heroes of D-Day, the landing craft.', - 'upload_date': '20200327', - 'timestamp': 1585348140, - 'channel': 'Real Engineering — The Logistics of D-Day', - 'channel_id': 'd-day', - 'uploader': 'Real Engineering — The Logistics of D-Day', - 'uploader_id': 'd-day', - 'series': 'Real Engineering — The Logistics of D-Day', - 'display_id': 'the-logistics-of-d-day-landing-craft-how-the-allies-got-ashore', - 'creator': 'Real Engineering — The Logistics of D-Day', - 'duration': 841, - 'channel_url': 'https://nebula.tv/d-day', - 'uploader_url': 'https://nebula.tv/d-day', - 'thumbnail': r're:https://\w+\.cloudfront\.net/[\w-]+\.jpeg?.*', - }, + 'params': {'skip_download': 'm3u8'}, + }, { + 'url': 'https://nebula.tv/videos/the-logistics-of-d-day-landing-craft-how-the-allies-got-ashore', + 'md5': 'd05739cf6c38c09322422f696b569c23', + 'info_dict': { + 'id': '7e623145-1b44-4ca3-aa0b-ed25a247ea34', + 'ext': 'mp4', + 'title': 'Landing Craft - How The Allies Got Ashore', + 'description': r're:^In this episode we explore the unsung heroes of D-Day, the landing craft.', + 'upload_date': '20200327', + 'timestamp': 1585348140, + 'channel': 'Real Engineering — The Logistics of D-Day', + 'channel_id': 'd-day', + 'uploader': 'Real Engineering — The Logistics of D-Day', + 'uploader_id': 'd-day', + 'series': 'Real Engineering — The Logistics of D-Day', + 'display_id': 'the-logistics-of-d-day-landing-craft-how-the-allies-got-ashore', + 'creator': 'Real Engineering — The Logistics of D-Day', + 'duration': 841, + 'channel_url': 'https://nebula.tv/d-day', + 'uploader_url': 'https://nebula.tv/d-day', + 'thumbnail': r're:https://\w+\.cloudfront\.net/[\w-]+', + '_old_archive_ids': ['nebula 
5e7e78171aaf320001fbd6be', 'nebulasubscriptions 5e7e78171aaf320001fbd6be'], }, - { - 'url': 'https://nebula.tv/videos/money-episode-1-the-draw', - 'md5': 'ebe28a7ad822b9ee172387d860487868', - 'info_dict': { - 'id': 'b96c5714-9e2b-4ec3-b3f1-20f6e89cc553', - 'ext': 'mp4', - 'title': 'Episode 1: The Draw', - 'description': r'contains:There’s free money on offer… if the players can all work together.', - 'upload_date': '20200323', - 'timestamp': 1584980400, - 'channel': 'Tom Scott Presents: Money', - 'channel_id': 'tom-scott-presents-money', - 'uploader': 'Tom Scott Presents: Money', - 'uploader_id': 'tom-scott-presents-money', - 'uploader_url': 'https://nebula.tv/tom-scott-presents-money', - 'duration': 825, - 'channel_url': 'https://nebula.tv/tom-scott-presents-money', - 'series': 'Tom Scott Presents: Money', - 'display_id': 'money-episode-1-the-draw', - 'thumbnail': r're:https://\w+\.cloudfront\.net/[\w-]+\.jpeg?.*', - 'creator': 'Tom Scott Presents: Money', - }, + 'params': {'skip_download': 'm3u8'}, + }, { + 'url': 'https://nebula.tv/videos/money-episode-1-the-draw', + 'md5': 'ebe28a7ad822b9ee172387d860487868', + 'info_dict': { + 'id': 'b96c5714-9e2b-4ec3-b3f1-20f6e89cc553', + 'ext': 'mp4', + 'title': 'Episode 1: The Draw', + 'description': r'contains:There’s free money on offer… if the players can all work together.', + 'upload_date': '20200323', + 'timestamp': 1584980400, + 'channel': 'Tom Scott Presents: Money', + 'channel_id': 'tom-scott-presents-money', + 'uploader': 'Tom Scott Presents: Money', + 'uploader_id': 'tom-scott-presents-money', + 'uploader_url': 'https://nebula.tv/tom-scott-presents-money', + 'duration': 825, + 'channel_url': 'https://nebula.tv/tom-scott-presents-money', + 'series': 'Tom Scott Presents: Money', + 'display_id': 'money-episode-1-the-draw', + 'thumbnail': r're:https://\w+\.cloudfront\.net/[\w-]+', + 'creator': 'Tom Scott Presents: Money', + '_old_archive_ids': ['nebula 5e779ebdd157bc0001d1c75a', 'nebulasubscriptions 5e779ebdd157bc0001d1c75a'], }, - { - 'url': 'https://watchnebula.com/videos/money-episode-1-the-draw', - 'only_matching': True, + 'params': {'skip_download': 'm3u8'}, + }, { + 'url': 'https://watchnebula.com/videos/money-episode-1-the-draw', + 'only_matching': True, + }, { + 'url': 'https://nebula.tv/videos/tldrnewseu-did-the-us-really-blow-up-the-nordstream-pipelines', + 'info_dict': { + 'id': 'e389af9d-1dab-44f2-8788-ee24deb7ff0d', + 'ext': 'mp4', + 'display_id': 'tldrnewseu-did-the-us-really-blow-up-the-nordstream-pipelines', + 'title': 'Did the US Really Blow Up the NordStream Pipelines?', + 'description': 'md5:b4e2a14e3ff08f546a3209c75261e789', + 'upload_date': '20230223', + 'timestamp': 1677144070, + 'channel': 'TLDR News EU', + 'channel_id': 'tldrnewseu', + 'uploader': 'TLDR News EU', + 'uploader_id': 'tldrnewseu', + 'uploader_url': r're:https://nebula\.(tv|app)/tldrnewseu', + 'duration': 524, + 'channel_url': r're:https://nebula\.(tv|app)/tldrnewseu', + 'series': 'TLDR News EU', + 'thumbnail': r're:https://\w+\.cloudfront\.net/[\w-]+', + 'creator': 'TLDR News EU', + '_old_archive_ids': ['nebula 63f64c74366fcd00017c1513', 'nebulasubscriptions 63f64c74366fcd00017c1513'], }, - { - 'url': 'https://beta.nebula.tv/videos/money-episode-1-the-draw', - 'only_matching': True, - }, - ] - - def _fetch_video_metadata(self, slug): - return self._call_nebula_api(f'https://content.api.nebula.app/video/{slug}/', - video_id=slug, - auth_type='bearer', - note='Fetching video meta data') + 'params': {'skip_download': 'm3u8'}, + }, { + 'url': 
'https://beta.nebula.tv/videos/money-episode-1-the-draw', + 'only_matching': True, + }] def _real_extract(self, url): slug = self._match_id(url) - video = self._fetch_video_metadata(slug) - return self._build_video_info(video) + url, smuggled_data = unsmuggle_url(url, {}) + if smuggled_data.get('id'): + return { + 'id': smuggled_data['id'], + 'display_id': slug, + 'title': '', + **self._extract_formats(smuggled_data['id'], slug), + } + + metadata = self._call_api( + f'https://content.api.nebula.app/content/videos/{slug}', + slug, note='Fetching video metadata') + return { + **self._extract_video_metadata(metadata), + **self._extract_formats(metadata['id'], slug), + } + + +class NebulaClassIE(NebulaBaseIE): + IE_NAME = 'nebula:class' + _VALID_URL = rf'{_BASE_URL_RE}/(?P<id>[-\w]+)/(?P<ep>\d+)' + _TESTS = [{ + 'url': 'https://nebula.tv/copyright-for-fun-and-profit/14', + 'info_dict': { + 'id': 'd7432cdc-c608-474d-942c-f74345daed7b', + 'ext': 'mp4', + 'display_id': '14', + 'channel_url': 'https://nebula.tv/copyright-for-fun-and-profit', + 'episode_number': 14, + 'thumbnail': 'https://dj423fildxgac.cloudfront.net/d533718d-9307-42d4-8fb0-e283285e99c9', + 'uploader_url': 'https://nebula.tv/copyright-for-fun-and-profit', + 'duration': 646, + 'episode': 'Episode 14', + 'title': 'Photos, Sculpture, and Video', + }, + 'params': {'skip_download': 'm3u8'}, + }] + + def _real_extract(self, url): + slug, episode = self._match_valid_url(url).group('id', 'ep') + url, smuggled_data = unsmuggle_url(url, {}) + if smuggled_data.get('id'): + return { + 'id': smuggled_data['id'], + 'display_id': slug, + 'title': '', + **self._extract_formats(smuggled_data['id'], slug), + } + + metadata = self._call_api( + f'https://content.api.nebula.app/content/{slug}/{episode}/?include=lessons', + slug, note='Fetching video metadata') + return { + **self._extract_video_metadata(metadata), + **self._extract_formats(metadata['id'], slug), + } class NebulaSubscriptionsIE(NebulaBaseIE): IE_NAME = 'nebula:subscriptions' - _VALID_URL = rf'{_BASE_URL_RE}/myshows' - _TESTS = [ - { - 'url': 'https://nebula.tv/myshows', - 'playlist_mincount': 1, - 'info_dict': { - 'id': 'myshows', - }, + _VALID_URL = rf'{_BASE_URL_RE}/(?P<id>myshows|library/latest-videos)' + _TESTS = [{ + 'url': 'https://nebula.tv/myshows', + 'playlist_mincount': 1, + 'info_dict': { + 'id': 'myshows', }, - ] + }] def _generate_playlist_entries(self): - next_url = 'https://content.watchnebula.com/library/video/?page_size=100' - page_num = 1 - while next_url: - channel = self._call_nebula_api(next_url, 'myshows', auth_type='bearer', - note=f'Retrieving subscriptions page {page_num}') + next_url = update_url_query('https://content.api.nebula.app/video_episodes/', { + 'following': 'true', + 'include': 'engagement', + 'ordering': '-published_at', + }) + for page_num in itertools.count(1): + channel = self._call_api( + next_url, 'myshows', note=f'Retrieving subscriptions page {page_num}') for episode in channel['results']: - yield self._build_video_info(episode) - next_url = channel['next'] - page_num += 1 + metadata = self._extract_video_metadata(episode) + yield self.url_result(smuggle_url( + f'https://nebula.tv/videos/{metadata["display_id"]}', + {'id': episode['id']}), NebulaIE, url_transparent=True, **metadata) + next_url = channel.get('next') + if not next_url: + return def _real_extract(self, url): return self.playlist_result(self._generate_playlist_entries(), 'myshows') @@ -234,48 +310,74 @@ def _real_extract(self, url): class NebulaChannelIE(NebulaBaseIE): IE_NAME = 
'nebula:channel' - _VALID_URL = rf'{_BASE_URL_RE}/(?!myshows|videos/)(?P<id>[-\w]+)' - _TESTS = [ - { - 'url': 'https://nebula.tv/tom-scott-presents-money', - 'info_dict': { - 'id': 'tom-scott-presents-money', - 'title': 'Tom Scott Presents: Money', - 'description': 'Tom Scott hosts a series all about trust, negotiation and money.', - }, - 'playlist_count': 5, - }, { - 'url': 'https://nebula.tv/lindsayellis', - 'info_dict': { - 'id': 'lindsayellis', - 'title': 'Lindsay Ellis', - 'description': 'Enjoy these hottest of takes on Disney, Transformers, and Musicals.', - }, - 'playlist_mincount': 2, + _VALID_URL = rf'{_BASE_URL_RE}/(?!myshows|library|videos/)(?P<id>[-\w]+)/?(?:$|[?#])' + _TESTS = [{ + 'url': 'https://nebula.tv/tom-scott-presents-money', + 'info_dict': { + 'id': 'tom-scott-presents-money', + 'title': 'Tom Scott Presents: Money', + 'description': 'Tom Scott hosts a series all about trust, negotiation and money.', }, - ] + 'playlist_count': 5, + }, { + 'url': 'https://nebula.tv/lindsayellis', + 'info_dict': { + 'id': 'lindsayellis', + 'title': 'Lindsay Ellis', + 'description': 'Enjoy these hottest of takes on Disney, Transformers, and Musicals.', + }, + 'playlist_mincount': 2, + }, { + 'url': 'https://nebula.tv/johnnyharris', + 'info_dict': { + 'id': 'johnnyharris', + 'title': 'Johnny Harris', + 'description': 'I make videos about maps and many other things.', + }, + 'playlist_mincount': 90, + }, { + 'url': 'https://nebula.tv/copyright-for-fun-and-profit', + 'info_dict': { + 'id': 'copyright-for-fun-and-profit', + 'title': 'Copyright for Fun and Profit', + 'description': 'md5:6690248223eed044a9f11cd5a24f9742', + }, + 'playlist_count': 23, + }] - def _generate_playlist_entries(self, collection_id, channel): - episodes = channel['episodes']['results'] - for page_num in itertools.count(2): - for episode in episodes: - yield self._build_video_info(episode) - next_url = channel['episodes']['next'] + def _generate_playlist_entries(self, collection_id, collection_slug): + next_url = f'https://content.api.nebula.app/video_channels/{collection_id}/video_episodes/?ordering=-published_at' + for page_num in itertools.count(1): + episodes = self._call_api(next_url, collection_slug, note=f'Retrieving channel page {page_num}') + for episode in episodes['results']: + metadata = self._extract_video_metadata(episode) + yield self.url_result(smuggle_url( + episode.get('share_url') or f'https://nebula.tv/videos/{metadata["display_id"]}', + {'id': episode['id']}), NebulaIE, url_transparent=True, **metadata) + next_url = episodes.get('next') if not next_url: break - channel = self._call_nebula_api(next_url, collection_id, auth_type='bearer', - note=f'Retrieving channel page {page_num}') - episodes = channel['episodes']['results'] + + def _generate_class_entries(self, channel): + for lesson in channel['lessons']: + metadata = self._extract_video_metadata(lesson) + yield self.url_result(smuggle_url( + lesson.get('share_url') or f'https://nebula.tv/{metadata["class_slug"]}/{metadata["slug"]}', + {'id': lesson['id']}), NebulaClassIE, url_transparent=True, **metadata) def _real_extract(self, url): - collection_id = self._match_id(url) - channel_url = f'https://content.watchnebula.com/video/channels/{collection_id}/' - channel = self._call_nebula_api(channel_url, collection_id, auth_type='bearer', note='Retrieving channel') - channel_details = channel['details'] + collection_slug = self._match_id(url) + channel = self._call_api( + f'https://content.api.nebula.app/content/{collection_slug}/?include=lessons', + 
collection_slug, note='Retrieving channel') + + if channel.get('type') == 'class': + entries = self._generate_class_entries(channel) + else: + entries = self._generate_playlist_entries(channel['id'], collection_slug) return self.playlist_result( - entries=self._generate_playlist_entries(collection_id, channel), - playlist_id=collection_id, - playlist_title=channel_details['title'], - playlist_description=channel_details['description'] - ) + entries=entries, + playlist_id=collection_slug, + playlist_title=channel.get('title'), + playlist_description=channel.get('description')) diff --git a/yt_dlp/extractor/neteasemusic.py b/yt_dlp/extractor/neteasemusic.py index 68bfcb6ba..d332b840c 100644 --- a/yt_dlp/extractor/neteasemusic.py +++ b/yt_dlp/extractor/neteasemusic.py @@ -142,6 +142,9 @@ class NetEaseMusicIE(NetEaseMusicBaseIE): 'subtitles': {'lyrics': [{'ext': 'lrc'}]}, "duration": 256, 'thumbnail': r're:^http.*\.jpg', + 'album': '偶像练习生 表演曲目合集', + 'average_rating': int, + 'album_artist': '偶像练习生', }, }, { 'note': 'No lyrics.', @@ -155,6 +158,9 @@ class NetEaseMusicIE(NetEaseMusicBaseIE): 'timestamp': 1202745600, 'duration': 263, 'thumbnail': r're:^http.*\.jpg', + 'album': 'Piano Solos Vol. 2', + 'album_artist': 'Dustin O\'Halloran', + 'average_rating': int, }, }, { 'url': 'https://y.music.163.com/m/song?app_version=8.8.45&id=95670&uct2=sKnvS4+0YStsWkqsPhFijw%3D%3D&dlt=0846', @@ -171,6 +177,9 @@ class NetEaseMusicIE(NetEaseMusicBaseIE): 'duration': 268, 'alt_title': '伴唱:现代人乐队 合唱:总政歌舞团', 'thumbnail': r're:^http.*\.jpg', + 'average_rating': int, + 'album': '红色摇滚', + 'album_artist': '侯牧人', }, }, { 'url': 'http://music.163.com/#/song?id=32102397', @@ -186,6 +195,9 @@ class NetEaseMusicIE(NetEaseMusicBaseIE): 'subtitles': {'lyrics': [{'ext': 'lrc'}]}, 'duration': 199, 'thumbnail': r're:^http.*\.jpg', + 'album': 'Bad Blood', + 'average_rating': int, + 'album_artist': 'Taylor Swift', }, 'skip': 'Blocked outside Mainland China', }, { @@ -203,6 +215,9 @@ class NetEaseMusicIE(NetEaseMusicBaseIE): 'duration': 229, 'alt_title': '说出愿望吧(Genie)', 'thumbnail': r're:^http.*\.jpg', + 'average_rating': int, + 'album': 'Oh!', + 'album_artist': '少女时代', }, 'skip': 'Blocked outside Mainland China', }] @@ -253,12 +268,15 @@ def _real_extract(self, url): 'formats': formats, 'alt_title': '/'.join(traverse_obj(info, (('transNames', 'alias'), ...))) or None, 'creator': ' / '.join(traverse_obj(info, ('artists', ..., 'name'))) or None, + 'album_artist': ' / '.join(traverse_obj(info, ('album', 'artists', ..., 'name'))) or None, **lyric_data, **traverse_obj(info, { 'title': ('name', {str}), 'timestamp': ('album', 'publishTime', {self.kilo_or_none}), 'thumbnail': ('album', 'picUrl', {url_or_none}), 'duration': ('duration', {self.kilo_or_none}), + 'album': ('album', 'name', {str}), + 'average_rating': ('score', {int_or_none}), }), } diff --git a/yt_dlp/extractor/newstube.py b/yt_dlp/extractor/newstube.py deleted file mode 100644 index 820eb4ba7..000000000 --- a/yt_dlp/extractor/newstube.py +++ /dev/null @@ -1,75 +0,0 @@ -import base64 -import hashlib - -from .common import InfoExtractor -from ..aes import aes_cbc_decrypt_bytes, unpad_pkcs7 -from ..utils import ( - int_or_none, - parse_codecs, - parse_duration, -) - - -class NewstubeIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?newstube\.ru/media/(?P<id>.+)' - _TEST = { - 'url': 'http://www.newstube.ru/media/telekanal-cnn-peremestil-gorod-slavyansk-v-krym', - 'md5': '9d10320ad473444352f72f746ccb8b8c', - 'info_dict': { - 'id': '728e0ef2-e187-4012-bac0-5a081fdcb1f6', - 
'ext': 'mp4', - 'title': 'Телеканал CNN переместил город Славянск в Крым', - 'description': 'md5:419a8c9f03442bc0b0a794d689360335', - 'duration': 31.05, - }, - } - - def _real_extract(self, url): - video_id = self._match_id(url) - - page = self._download_webpage(url, video_id) - title = self._html_search_meta(['og:title', 'twitter:title'], page, fatal=True) - - video_guid = self._html_search_regex( - r'<meta\s+property="og:video(?::(?:(?:secure_)?url|iframe))?"\s+content="https?://(?:www\.)?newstube\.ru/embed/(?P<guid>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})', - page, 'video GUID') - - enc_data = base64.b64decode(self._download_webpage( - 'https://www.newstube.ru/embed/api/player/getsources2', - video_guid, query={ - 'guid': video_guid, - 'ff': 3, - })) - key = hashlib.pbkdf2_hmac( - 'sha1', video_guid.replace('-', '').encode(), enc_data[:16], 1)[:16] - dec_data = unpad_pkcs7(aes_cbc_decrypt_bytes(enc_data[32:], key, enc_data[16:32])) - sources = self._parse_json(dec_data, video_guid) - - formats = [] - for source in sources: - source_url = source.get('Src') - if not source_url: - continue - height = int_or_none(source.get('Height')) - f = { - 'format_id': 'http' + ('-%dp' % height if height else ''), - 'url': source_url, - 'width': int_or_none(source.get('Width')), - 'height': height, - } - source_type = source.get('Type') - if source_type: - f.update(parse_codecs(self._search_regex( - r'codecs="([^"]+)"', source_type, 'codecs', fatal=False))) - formats.append(f) - - self._check_formats(formats, video_guid) - - return { - 'id': video_guid, - 'title': title, - 'description': self._html_search_meta(['description', 'og:description'], page), - 'thumbnail': self._html_search_meta(['og:image:secure_url', 'og:image', 'twitter:image'], page), - 'duration': parse_duration(self._html_search_meta('duration', page)), - 'formats': formats, - } diff --git a/yt_dlp/extractor/nfl.py b/yt_dlp/extractor/nfl.py index bd060dba9..3f83cd20e 100644 --- a/yt_dlp/extractor/nfl.py +++ b/yt_dlp/extractor/nfl.py @@ -247,7 +247,7 @@ def _real_extract(self, url): class NFLPlusReplayIE(NFLBaseIE): IE_NAME = 'nfl.com:plus:replay' - _VALID_URL = r'https?://(?:www\.)?nfl.com/plus/games/(?P<slug>[\w-]+)(?:/(?P<id>\d+))?' + _VALID_URL = r'https?://(?:www\.)?nfl\.com/plus/games/(?P<slug>[\w-]+)(?:/(?P<id>\d+))?' 
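The one-character change above is load-bearing: in a regular expression an unescaped `.` matches any character, so `nfl.com` would also match lookalike hosts. A minimal standalone sketch of the failure mode (illustrative only, not part of the patch):

    import re

    # Unescaped dot: matches any character in the host, so 'nflxcom' slips through.
    UNESCAPED = r'https?://(?:www\.)?nfl.com/plus/games/(?P<slug>[\w-]+)'
    # Escaped dot: only a literal '.' is accepted.
    ESCAPED = r'https?://(?:www\.)?nfl\.com/plus/games/(?P<slug>[\w-]+)'

    lookalike = 'https://www.nflxcom/plus/games/giants-at-vikings-2022-post-1'
    assert re.match(UNESCAPED, lookalike) is not None   # false positive
    assert re.match(ESCAPED, lookalike) is None         # correctly rejected

The same escaping fix recurs below in `NFLPlusEpisodeIE`, `NovaPlayIE`, `NubilesPornIE` and `OfTVIE`.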
_TESTS = [{ 'url': 'https://www.nfl.com/plus/games/giants-at-vikings-2022-post-1/1572108', 'info_dict': { @@ -342,7 +342,7 @@ def entries(): class NFLPlusEpisodeIE(NFLBaseIE): IE_NAME = 'nfl.com:plus:episode' - _VALID_URL = r'https?://(?:www\.)?nfl.com/plus/episodes/(?P<id>[\w-]+)' + _VALID_URL = r'https?://(?:www\.)?nfl\.com/plus/episodes/(?P<id>[\w-]+)' _TESTS = [{ 'note': 'Subscription required', 'url': 'https://www.nfl.com/plus/episodes/kurt-s-qb-insider-conference-championships', diff --git a/yt_dlp/extractor/nhk.py b/yt_dlp/extractor/nhk.py index bcbc2279f..4b3d185a3 100644 --- a/yt_dlp/extractor/nhk.py +++ b/yt_dlp/extractor/nhk.py @@ -3,6 +3,8 @@ from .common import InfoExtractor from ..utils import ( ExtractorError, + clean_html, + get_element_by_class, int_or_none, join_nonempty, parse_duration, @@ -45,42 +47,54 @@ def _get_api_info(self, refresh=True): self.cache.store('nhk', 'api_info', api_info) return api_info - def _extract_formats_and_subtitles(self, vod_id): + def _extract_stream_info(self, vod_id): for refresh in (False, True): api_info = self._get_api_info(refresh) if not api_info: continue api_url = api_info.pop('url') - stream_url = traverse_obj( + meta = traverse_obj( self._download_json( api_url, vod_id, 'Downloading stream url info', fatal=False, query={ **api_info, 'type': 'json', 'optional_id': vod_id, 'active_flg': 1, - }), - ('meta', 0, 'movie_url', ('mb_auto', 'auto_sp', 'auto_pc'), {url_or_none}), get_all=False) - if stream_url: - return self._extract_m3u8_formats_and_subtitles(stream_url, vod_id) + }), ('meta', 0)) + stream_url = traverse_obj( + meta, ('movie_url', ('mb_auto', 'auto_sp', 'auto_pc'), {url_or_none}), get_all=False) + if stream_url: + formats, subtitles = self._extract_m3u8_formats_and_subtitles(stream_url, vod_id) + return { + **traverse_obj(meta, { + 'duration': ('duration', {int_or_none}), + 'timestamp': ('publication_date', {unified_timestamp}), + 'release_timestamp': ('insert_date', {unified_timestamp}), + 'modified_timestamp': ('update_date', {unified_timestamp}), + }), + 'formats': formats, + 'subtitles': subtitles, + } raise ExtractorError('Unable to extract stream url') def _extract_episode_info(self, url, episode=None): fetch_episode = episode is None - lang, m_type, episode_id = NhkVodIE._match_valid_url(url).groups() - if len(episode_id) == 7: + lang, m_type, episode_id = NhkVodIE._match_valid_url(url).group('lang', 'type', 'id') + is_video = m_type == 'video' + + if is_video: episode_id = episode_id[:4] + '-' + episode_id[4:] - is_video = m_type == 'video' if fetch_episode: episode = self._call_api( episode_id, lang, is_video, True, episode_id[:4] == '9999')[0] - title = episode.get('sub_title_clean') or episode['sub_title'] def get_clean_field(key): - return episode.get(key + '_clean') or episode.get(key) + return clean_html(episode.get(key + '_clean') or episode.get(key)) + title = get_clean_field('sub_title') series = get_clean_field('title') thumbnails = [] @@ -95,22 +109,30 @@ def get_clean_field(key): 'url': 'https://www3.nhk.or.jp' + img_path, }) + episode_name = title + if series and title: + title = f'{series} - {title}' + elif series and not title: + title = series + series = None + episode_name = None + else: # title, no series + episode_name = None + info = { 'id': episode_id + '-' + lang, - 'title': '%s - %s' % (series, title) if series and title else title, + 'title': title, 'description': get_clean_field('description'), 'thumbnails': thumbnails, 'series': series, - 'episode': title, + 'episode': episode_name, } + if 
is_video: vod_id = episode['vod_id'] - formats, subs = self._extract_formats_and_subtitles(vod_id) - info.update({ + **self._extract_stream_info(vod_id), 'id': vod_id, - 'formats': formats, - 'subtitles': subs, }) else: @@ -133,47 +155,61 @@ def get_clean_field(key): class NhkVodIE(NhkBaseIE): # the 7-character IDs can have alphabetic chars too: assume [a-z] rather than just [a-f], eg - _VALID_URL = r'%s%s(?P<id>[0-9a-z]{7}|[^/]+?-\d{8}-[0-9a-z]+)' % (NhkBaseIE._BASE_URL_REGEX, NhkBaseIE._TYPE_REGEX) + _VALID_URL = [rf'{NhkBaseIE._BASE_URL_REGEX}/(?P<type>video)/(?P<id>[0-9a-z]+)', + rf'{NhkBaseIE._BASE_URL_REGEX}/(?P<type>audio)/(?P<id>[^/?#]+?-\d{{8}}-[0-9a-z]+)'] # Content available only for a limited period of time. Visit # https://www3.nhk.or.jp/nhkworld/en/ondemand/ for working samples. _TESTS = [{ - 'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/video/2061601/', + 'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/video/2049126/', 'info_dict': { - 'id': 'yd8322ch', + 'id': 'nw_vod_v_en_2049_126_20230413233000_01_1681398302', 'ext': 'mp4', - 'description': 'md5:109c8b05d67a62d0592f2b445d2cd898', - 'title': 'GRAND SUMO Highlights - [Recap] May Tournament Day 1 (Opening Day)', - 'upload_date': '20230514', - 'timestamp': 1684083791, - 'series': 'GRAND SUMO Highlights', - 'episode': '[Recap] May Tournament Day 1 (Opening Day)', - 'thumbnail': 'https://mz-edge.stream.co.jp/thumbs/aid/t1684084443/4028649.jpg?w=1920&h=1080', + 'title': 'Japan Railway Journal - The Tohoku Shinkansen: Full Speed Ahead', + 'description': 'md5:49f7c5b206e03868a2fdf0d0814b92f6', + 'thumbnail': 'md5:51bcef4a21936e7fea1ff4e06353f463', + 'episode': 'The Tohoku Shinkansen: Full Speed Ahead', + 'series': 'Japan Railway Journal', + 'modified_timestamp': 1694243656, + 'timestamp': 1681428600, + 'release_timestamp': 1693883728, + 'duration': 1679, + 'upload_date': '20230413', + 'modified_date': '20230909', + 'release_date': '20230905', + }, }, { # video clip 'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/video/9999011/', - 'md5': '7a90abcfe610ec22a6bfe15bd46b30ca', + 'md5': '153c3016dfd252ba09726588149cf0e7', 'info_dict': { - 'id': 'a95j5iza', + 'id': 'lpZXIwaDE6_Z-976CPsFdxyICyWUzlT5', 'ext': 'mp4', - 'title': "Dining with the Chef - Chef Saito's Family recipe: MENCHI-KATSU", + 'title': 'Dining with the Chef - Chef Saito\'s Family recipe: MENCHI-KATSU', 'description': 'md5:5aee4a9f9d81c26281862382103b0ea5', - 'timestamp': 1565965194, - 'upload_date': '20190816', - 'thumbnail': 'https://mz-edge.stream.co.jp/thumbs/aid/t1567086278/3715195.jpg?w=1920&h=1080', + 'thumbnail': 'md5:d6a4d9b6e9be90aaadda0bcce89631ed', 'series': 'Dining with the Chef', 'episode': 'Chef Saito\'s Family recipe: MENCHI-KATSU', + 'duration': 148, + 'upload_date': '20190816', + 'release_date': '20230902', + 'release_timestamp': 1693619292, + 'modified_timestamp': 1694168033, + 'modified_date': '20230908', + 'timestamp': 1565997540, }, }, { - # audio clip - 'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/audio/r_inventions-20201104-1/', + # radio + 'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/audio/livinginjapan-20231001-1/', 'info_dict': { - 'id': 'r_inventions-20201104-1-en', + 'id': 'livinginjapan-20231001-1-en', 'ext': 'm4a', - 'title': "Japan's Top Inventions - Miniature Video Cameras", - 'description': 'md5:07ea722bdbbb4936fdd360b6a480c25b', + 'title': 'Living in Japan - Tips for Travelers to Japan / Ramen Vending Machines', + 'series': 'Living in Japan', + 'description': 'md5:0a0e2077d8f07a03071e990a6f51bfab', + 
'thumbnail': 'md5:960622fb6e06054a4a1a0c97ea752545', + 'episode': 'Tips for Travelers to Japan / Ramen Vending Machines' }, - 'skip': '404 Not Found', }, { 'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/video/2015173/', 'only_matching': True, @@ -199,6 +235,36 @@ class NhkVodIE(NhkBaseIE): 'timestamp': 1623722008, }, 'skip': '404 Not Found', + }, { + # japanese-language, longer id than english + 'url': 'https://www3.nhk.or.jp/nhkworld/ja/ondemand/video/0020271111/', + 'info_dict': { + 'id': 'nw_ja_v_jvod_ohayou_20231008', + 'ext': 'mp4', + 'title': 'おはよう日本(7時台) - 10月8日放送', + 'series': 'おはよう日本(7時台)', + 'episode': '10月8日放送', + 'thumbnail': 'md5:d733b1c8e965ab68fb02b2d347d0e9b4', + 'description': 'md5:9c1d6cbeadb827b955b20e99ab920ff0', + }, + 'skip': 'expires 2023-10-15', + }, { + # a one-off (single-episode series). title from the api is just '<p></p>' + 'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/video/3004952/', + 'info_dict': { + 'id': 'nw_vod_v_en_3004_952_20230723091000_01_1690074552', + 'ext': 'mp4', + 'title': 'Barakan Discovers AMAMI OSHIMA: Isson\'s Treasure Island', + 'description': 'md5:5db620c46a0698451cc59add8816b797', + 'thumbnail': 'md5:67d9ff28009ba379bfa85ad1aaa0e2bd', + 'release_date': '20230905', + 'timestamp': 1690103400, + 'duration': 2939, + 'release_timestamp': 1693898699, + 'modified_timestamp': 1698057495, + 'modified_date': '20231023', + 'upload_date': '20230723', + }, }] def _real_extract(self, url): @@ -206,20 +272,22 @@ def _real_extract(self, url): class NhkVodProgramIE(NhkBaseIE): - _VALID_URL = r'%s/program%s(?P<id>[0-9a-z]+)(?:.+?\btype=(?P<episode_type>clip|(?:radio|tv)Episode))?' % (NhkBaseIE._BASE_URL_REGEX, NhkBaseIE._TYPE_REGEX) + _VALID_URL = rf'{NhkBaseIE._BASE_URL_REGEX}/program{NhkBaseIE._TYPE_REGEX}(?P<id>\w+)(?:.+?\btype=(?P<episode_type>clip|(?:radio|tv)Episode))?' 
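The NHK extractors now read match groups by name (`.group('lang', 'type', 'id', ...)`) instead of positionally via `.groups()`, which keeps `_real_extract` correct even when a pattern gains or reorders groups — relevant here because `NhkVodIE._VALID_URL` became a list of two alternatives. A simplified sketch with plain `re` (the pattern is an illustration, not the extractor's actual `_VALID_URL`):

    import re

    # Named groups stay addressable by name no matter where they sit in the pattern;
    # re.Match.group() with several names returns a tuple.
    pattern = re.compile(r'/(?P<lang>[a-z]{2})/ondemand/(?P<type>video|audio)/(?P<id>[^/?#]+)')
    m = pattern.search('https://www3.nhk.or.jp/nhkworld/en/ondemand/video/2049126/')
    assert m.group('lang', 'type', 'id') == ('en', 'video', '2049126')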
_TESTS = [{ # video program episodes 'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/program/video/sumo', 'info_dict': { 'id': 'sumo', 'title': 'GRAND SUMO Highlights', + 'description': 'md5:fc20d02dc6ce85e4b72e0273aa52fdbf', }, - 'playlist_mincount': 12, + 'playlist_mincount': 0, }, { 'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/program/video/japanrailway', 'info_dict': { 'id': 'japanrailway', 'title': 'Japan Railway Journal', + 'description': 'md5:ea39d93af7d05835baadf10d1aae0e3f', }, 'playlist_mincount': 12, }, { @@ -228,6 +296,7 @@ class NhkVodProgramIE(NhkBaseIE): 'info_dict': { 'id': 'japanrailway', 'title': 'Japan Railway Journal', + 'description': 'md5:ea39d93af7d05835baadf10d1aae0e3f', }, 'playlist_mincount': 5, }, { @@ -240,8 +309,7 @@ class NhkVodProgramIE(NhkBaseIE): }] def _real_extract(self, url): - lang, m_type, program_id, episode_type = self._match_valid_url(url).groups() - + lang, m_type, program_id, episode_type = self._match_valid_url(url).group('lang', 'type', 'id', 'episode_type') episodes = self._call_api( program_id, lang, m_type == 'video', False, episode_type == 'clip') @@ -253,11 +321,11 @@ def _real_extract(self, url): entries.append(self._extract_episode_info( urljoin(url, episode_path), episode)) - program_title = None - if entries: - program_title = entries[0].get('series') + html = self._download_webpage(url, program_id) + program_title = clean_html(get_element_by_class('p-programDetail__title', html)) + program_description = clean_html(get_element_by_class('p-programDetail__text', html)) - return self.playlist_result(entries, program_id, program_title) + return self.playlist_result(entries, program_id, program_title, program_description) class NhkForSchoolBangumiIE(InfoExtractor): @@ -409,6 +477,7 @@ class NhkRadiruIE(InfoExtractor): 'skip': 'Episode expired on 2023-04-16', 'info_dict': { 'channel': 'NHK-FM', + 'uploader': 'NHK-FM', 'description': 'md5:94b08bdeadde81a97df4ec882acce3e9', 'ext': 'm4a', 'id': '0449_01_3853544', @@ -429,6 +498,7 @@ class NhkRadiruIE(InfoExtractor): 'title': 'ベストオブクラシック', 'description': '世界中の上質な演奏会をじっくり堪能する本格派クラシック番組。', 'channel': 'NHK-FM', + 'uploader': 'NHK-FM', 'thumbnail': 'https://www.nhk.or.jp/prog/img/458/g458.jpg', }, 'playlist_mincount': 3, @@ -442,6 +512,7 @@ class NhkRadiruIE(InfoExtractor): 'title': '有島武郎「一房のぶどう」', 'description': '朗読:川野一宇(ラジオ深夜便アンカー)\r\n\r\n(2016年12月8日放送「ラジオ深夜便『アンカー朗読シリーズ』」より)', 'channel': 'NHKラジオ第1、NHK-FM', + 'uploader': 'NHKラジオ第1、NHK-FM', 'timestamp': 1635757200, 'thumbnail': 'https://www.nhk.or.jp/radioondemand/json/F300/img/corner/box_109_thumbnail.jpg', 'release_date': '20161207', @@ -457,6 +528,7 @@ class NhkRadiruIE(InfoExtractor): 'id': 'F261_01_3855109', 'ext': 'm4a', 'channel': 'NHKラジオ第1', + 'uploader': 'NHKラジオ第1', 'timestamp': 1681635900, 'release_date': '20230416', 'series': 'NHKラジオニュース', @@ -501,6 +573,7 @@ def _real_extract(self, url): series_meta = traverse_obj(meta, { 'title': 'program_name', 'channel': 'media_name', + 'uploader': 'media_name', 'thumbnail': (('thumbnail_c', 'thumbnail_p'), {url_or_none}), }, get_all=False) @@ -529,6 +602,7 @@ class NhkRadioNewsPageIE(InfoExtractor): 'thumbnail': 'https://www.nhk.or.jp/radioondemand/json/F261/img/RADIONEWS_640.jpg', 'description': 'md5:bf2c5b397e44bc7eb26de98d8f15d79d', 'channel': 'NHKラジオ第1', + 'uploader': 'NHKラジオ第1', 'title': 'NHKラジオニュース', } }] @@ -591,7 +665,7 @@ def _real_extract(self, url): noa_info = self._download_json( f'https:{config.find(".//url_program_noa").text}'.format(area=data.find('areakey').text), - station, 
note=f'Downloading {area} station metadata') + station, note=f'Downloading {area} station metadata', fatal=False) present_info = traverse_obj(noa_info, ('nowonair_list', self._NOA_STATION_IDS.get(station), 'present')) return { diff --git a/yt_dlp/extractor/nick.py b/yt_dlp/extractor/nick.py index de22cb8d6..165d8ce9d 100644 --- a/yt_dlp/extractor/nick.py +++ b/yt_dlp/extractor/nick.py @@ -188,26 +188,6 @@ def _get_feed_url(self, uri, url=None): return self._remove_template_parameter(config['feedWithQueryParams']) -class NickNightIE(NickDeIE): # XXX: Do not subclass from concrete IE - IE_NAME = 'nicknight' - _VALID_URL = r'https?://(?:www\.)(?P<host>nicknight\.(?:de|at|tv))/(?:playlist|shows)/(?:[^/]+/)*(?P<id>[^/?#&]+)' - _TESTS = [{ - 'url': 'http://www.nicknight.at/shows/977-awkward/videos/85987-nimmer-beste-freunde', - 'only_matching': True, - }, { - 'url': 'http://www.nicknight.at/shows/977-awkward', - 'only_matching': True, - }, { - 'url': 'http://www.nicknight.at/shows/1900-faking-it', - 'only_matching': True, - }] - - def _extract_mrss_url(self, webpage, *args): - return self._search_regex( - r'mrss\s*:\s*(["\'])(?P<url>http.+?)\1', webpage, - 'mrss url', group='url') - - class NickRuIE(MTVServicesInfoExtractor): IE_NAME = 'nickelodeonru' _VALID_URL = r'https?://(?:www\.)nickelodeon\.(?:ru|fr|es|pt|ro|hu|com\.tr)/[^/]+/(?:[^/]+/)*(?P<id>[^/?#&]+)' diff --git a/yt_dlp/extractor/niconico.py b/yt_dlp/extractor/niconico.py index fa2d709d2..797b5268a 100644 --- a/yt_dlp/extractor/niconico.py +++ b/yt_dlp/extractor/niconico.py @@ -8,12 +8,11 @@ from urllib.parse import urlparse from .common import InfoExtractor, SearchInfoExtractor -from ..dependencies import websockets +from ..networking import Request from ..networking.exceptions import HTTPError from ..utils import ( ExtractorError, OnDemandPagedList, - WebSocketsWrapper, bug_reports_message, clean_html, float_or_none, @@ -934,8 +933,6 @@ class NiconicoLiveIE(InfoExtractor): _KNOWN_LATENCY = ('high', 'low') def _real_extract(self, url): - if not websockets: - raise ExtractorError('websockets library is not available. 
Please install it.', expected=True) video_id = self._match_id(url) webpage, urlh = self._download_webpage_handle(f'https://live.nicovideo.jp/watch/{video_id}', video_id) @@ -950,17 +947,13 @@ def _real_extract(self, url): }) hostname = remove_start(urlparse(urlh.url).hostname, 'sp.') - cookies = try_get(urlh.url, self._downloader._calc_cookies) latency = try_get(self._configuration_arg('latency'), lambda x: x[0]) if latency not in self._KNOWN_LATENCY: latency = 'high' - ws = WebSocketsWrapper(ws_url, { - 'Cookies': str_or_none(cookies) or '', - 'Origin': f'https://{hostname}', - 'Accept': '*/*', - 'User-Agent': self.get_param('http_headers')['User-Agent'], - }) + ws = self._request_webpage( + Request(ws_url, headers={'Origin': f'https://{hostname}'}), + video_id=video_id, note='Connecting to WebSocket server') self.write_debug('[debug] Sending HLS server request') ws.send(json.dumps({ @@ -1034,7 +1027,6 @@ def _real_extract(self, url): 'protocol': 'niconico_live', 'ws': ws, 'video_id': video_id, - 'cookies': cookies, 'live_latency': latency, 'origin': hostname, }) diff --git a/yt_dlp/extractor/ninenews.py b/yt_dlp/extractor/ninenews.py new file mode 100644 index 000000000..900d9ba60 --- /dev/null +++ b/yt_dlp/extractor/ninenews.py @@ -0,0 +1,72 @@ +from .common import InfoExtractor +from .brightcove import BrightcoveNewIE +from ..utils import ExtractorError +from ..utils.traversal import traverse_obj + + +class NineNewsIE(InfoExtractor): + IE_NAME = '9News' + _VALID_URL = r'https?://(?:www\.)?9news\.com\.au/(?:[\w-]+/){2,3}(?P<id>[\w-]+)/?(?:$|[?#])' + _TESTS = [{ + 'url': 'https://www.9news.com.au/videos/national/fair-trading-pulls-dozens-of-toys-from-shelves/clqgc7dvj000y0jnvfism0w5m', + 'md5': 'd1a65b2e9d126e5feb9bc5cb96e62c80', + 'info_dict': { + 'id': '6343717246112', + 'ext': 'mp4', + 'title': 'Fair Trading pulls dozens of toys from shelves', + 'description': 'Fair Trading Australia have been forced to pull dozens of toys from shelves over hazard fears.', + 'thumbnail': 'md5:bdbe44294e2323b762d97acf8843f66c', + 'duration': 93.44, + 'timestamp': 1703231748, + 'upload_date': '20231222', + 'uploader_id': '664969388001', + 'tags': ['networkclip', 'aunews_aunationalninenews', 'christmas presents', 'toys', 'fair trading', 'au_news'], + } + }, { + 'url': 'https://www.9news.com.au/world/tape-reveals-donald-trump-pressured-michigan-officials-not-to-certify-2020-vote-a-new-report-says/0b8b880e-7d3c-41b9-b2bd-55bc7e492259', + 'md5': 'a885c44d20898c3e70e9a53e8188cea1', + 'info_dict': { + 'id': '6343587450112', + 'ext': 'mp4', + 'title': 'Trump found ineligible to run for president by state court', + 'description': 'md5:40e6e7db7a4ac6be0e960569a5af6066', + 'thumbnail': 'md5:3e132c48c186039fd06c10787de9bff2', + 'duration': 104.64, + 'timestamp': 1703058034, + 'upload_date': '20231220', + 'uploader_id': '664969388001', + 'tags': ['networkclip', 'aunews_aunationalninenews', 'ineligible', 'presidential candidate', 'donald trump', 'au_news'], + } + }, { + 'url': 'https://www.9news.com.au/national/outrage-as-parents-banned-from-giving-gifts-to-kindergarten-teachers/e19b49d4-a1a4-4533-9089-6e10e2d9386a', + 'info_dict': { + 'id': '6343716797112', + 'ext': 'mp4', + 'title': 'Outrage as parents banned from giving gifts to kindergarten teachers', + 'description': 'md5:7a8b0ed2f9e08875fd9a3e86e462bc46', + 'thumbnail': 'md5:5ee4d66717bdd0dee9fc9a705ef041b8', + 'duration': 91.307, + 'timestamp': 1703229584, + 'upload_date': '20231222', + 'uploader_id': '664969388001', + 'tags': ['networkclip', 
'aunews_aunationalninenews', 'presents', 'teachers', 'kindergarten', 'au_news'], + }, + }] + + def _real_extract(self, url): + article_id = self._match_id(url) + webpage = self._download_webpage(url, article_id) + initial_state = self._search_json( + r'var\s+__INITIAL_STATE__\s*=', webpage, 'initial state', article_id) + video_id = traverse_obj( + initial_state, ('videoIndex', 'currentVideo', 'brightcoveId', {str}), + ('article', ..., 'media', lambda _, v: v['type'] == 'video', 'urn', {str}), get_all=False) + account = traverse_obj(initial_state, ( + 'videoIndex', 'config', (None, 'video'), 'account', {str}), get_all=False) + + if not video_id or not account: + raise ExtractorError('Unable to get the required video data') + + return self.url_result( + f'https://players.brightcove.net/{account}/default_default/index.html?videoId={video_id}', + BrightcoveNewIE, video_id) diff --git a/yt_dlp/extractor/nintendo.py b/yt_dlp/extractor/nintendo.py index ed839af25..853a169bb 100644 --- a/yt_dlp/extractor/nintendo.py +++ b/yt_dlp/extractor/nintendo.py @@ -1,57 +1,131 @@ -import re +import json +import urllib.parse from .common import InfoExtractor -from .ooyala import OoyalaIE +from ..utils import ( + ExtractorError, + make_archive_id, + unified_timestamp, + urljoin, +) +from ..utils.traversal import traverse_obj class NintendoIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?nintendo\.com/(?:games/detail|nintendo-direct)/(?P<id>[^/?#&]+)' + _VALID_URL = r'https?://(?:www\.)?nintendo\.com/(?:(?P<locale>\w{2}(?:-\w{2})?)/)?nintendo-direct/(?P<slug>[^/?#]+)' _TESTS = [{ - 'url': 'https://www.nintendo.com/games/detail/duck-hunt-wii-u/', - 'info_dict': { - 'id': 'MzMmticjp0VPzO3CCj4rmFOuohEuEWoW', - 'ext': 'flv', - 'title': 'Duck Hunt Wii U VC NES - Trailer', - 'duration': 60.326, - }, - 'params': { - 'skip_download': True, - }, - 'add_ie': ['Ooyala'], - }, { - 'url': 'http://www.nintendo.com/games/detail/tokyo-mirage-sessions-fe-wii-u', - 'info_dict': { - 'id': 'tokyo-mirage-sessions-fe-wii-u', - 'title': 'Tokyo Mirage Sessions ♯FE', - }, - 'playlist_count': 4, - }, { 'url': 'https://www.nintendo.com/nintendo-direct/09-04-2019/', 'info_dict': { - 'id': 'J2bXdmaTE6fe3dWJTPcc7m23FNbc_A1V', 'ext': 'mp4', - 'title': 'Switch_ROS_ND0904-H264.mov', - 'duration': 2324.758, + 'id': '2oPmiviVePUA1IqAZzjuVh', + 'display_id': '09-04-2019', + 'title': 'Nintendo Direct 9.4.2019', + 'timestamp': 1567580400, + 'description': 'md5:8aac2780361d8cb772b6d1de66d7d6f4', + 'upload_date': '20190904', + 'age_limit': 17, + '_old_archive_ids': ['nintendo J2bXdmaTE6fe3dWJTPcc7m23FNbc_A1V'], }, - 'params': { - 'skip_download': True, + }, { + 'url': 'https://www.nintendo.com/en-ca/nintendo-direct/08-31-2023/', + 'info_dict': { + 'ext': 'mp4', + 'id': '2TB2w2rJhNYF84qQ9E57hU', + 'display_id': '08-31-2023', + 'title': 'Super Mario Bros. Wonder Direct 8.31.2023', + 'timestamp': 1693465200, + 'description': 'md5:3067c5b824bcfdae9090a7f38ab2d200', + 'tags': ['Mild Fantasy Violence', 'In-Game Purchases'], + 'upload_date': '20230831', + 'age_limit': 6, + }, + }, { + 'url': 'https://www.nintendo.com/us/nintendo-direct/50-fact-extravaganza/', + 'info_dict': { + 'ext': 'mp4', + 'id': 'j0BBGzfw0pQ', + 'channel_follower_count': int, + 'view_count': int, + 'description': 'Learn new details about Super Smash Bros. 
for Wii U, which launches on November 21.', + 'duration': 2123, + 'availability': 'public', + 'thumbnail': 'https://i.ytimg.com/vi_webp/j0BBGzfw0pQ/maxresdefault.webp', + 'timestamp': 1414047600, + 'channel_id': 'UCGIY_O-8vW4rfX98KlMkvRg', + 'chapters': 'count:53', + 'heatmap': 'count:100', + 'upload_date': '20141023', + 'uploader_id': '@NintendoAmerica', + 'playable_in_embed': True, + 'categories': ['Gaming'], + 'display_id': '50-fact-extravaganza', + 'channel': 'Nintendo of America', + 'tags': ['Comic Mischief', 'Cartoon Violence', 'Mild Suggestive Themes'], + 'like_count': int, + 'channel_url': 'https://www.youtube.com/channel/UCGIY_O-8vW4rfX98KlMkvRg', + 'age_limit': 10, + 'uploader_url': 'https://www.youtube.com/@NintendoAmerica', + 'comment_count': int, + 'live_status': 'not_live', + 'uploader': 'Nintendo of America', + 'title': '50-FACT Extravaganza', }, - 'add_ie': ['Ooyala'], }] + def _create_asset_url(self, path): + return urljoin('https://assets.nintendo.com/', urllib.parse.quote(path)) + def _real_extract(self, url): - page_id = self._match_id(url) + locale, slug = self._match_valid_url(url).group('locale', 'slug') - webpage = self._download_webpage(url, page_id) + language, _, country = (locale or 'US').rpartition('-') + parsed_locale = f'{language.lower() or "en"}_{country.upper()}' + self.write_debug(f'Using locale {parsed_locale} (from {locale})', only_once=True) - entries = [ - OoyalaIE._build_url_result(m.group('code')) - for m in re.finditer( - r'data-(?:video-id|directVideoId)=(["\'])(?P<code>(?:(?!\1).)+)\1', webpage)] + response = self._download_json('https://graph.nintendo.com/', slug, query={ + 'operationName': 'NintendoDirect', + 'variables': json.dumps({ + 'locale': parsed_locale, + 'slug': slug, + }, separators=(',', ':')), + 'extensions': json.dumps({ + 'persistedQuery': { + 'version': 1, + 'sha256Hash': '969b16fe9f08b686fa37bc44d1fd913b6188e65794bb5e341c54fa683a8004cb' + }, + }, separators=(',', ':')), + }) + # API returns `{"data": {"direct": null}}` if no matching id + direct_info = traverse_obj(response, ('data', 'direct', {dict})) + if not direct_info: + raise ExtractorError(f'No Nintendo Direct with id {slug} exists', expected=True) - title = self._html_search_regex( - r'(?s)<(?:span|div)[^>]+class="(?:title|wrapper)"[^>]*>.*?<h1>(.+?)</h1>', - webpage, 'title', fatal=False) + errors = ', '.join(traverse_obj(response, ('errors', ..., 'message'))) + if errors: + raise ExtractorError(f'GraphQL API error: {errors or "Unknown error"}') - return self.playlist_result( - entries, page_id, title) + result = traverse_obj(direct_info, { + 'id': ('id', {str}), + 'title': ('name', {str}), + 'timestamp': ('startDate', {unified_timestamp}), + 'description': ('description', 'text', {str}), + 'age_limit': ('contentRating', 'order', {int}), + 'tags': ('contentDescriptors', ..., 'label', {str}), + 'thumbnail': ('thumbnail', {self._create_asset_url}), + }) + result['display_id'] = slug + + asset_id = traverse_obj(direct_info, ('video', 'publicId', {str})) + if not asset_id: + youtube_id = traverse_obj(direct_info, ('liveStream', {str})) + if not youtube_id: + self.raise_no_formats('Could not find any video formats', video_id=slug) + + return self.url_result(youtube_id, **result, url_transparent=True) + + if asset_id.startswith('Legacy Videos/'): + result['_old_archive_ids'] = [make_archive_id(self, asset_id[14:])] + result['formats'] = self._extract_m3u8_formats( + self._create_asset_url(f'/video/upload/sp_full_hd/v1/{asset_id}.m3u8'), slug) + + return result diff --git 
a/yt_dlp/extractor/njpwworld.py b/yt_dlp/extractor/njpwworld.py deleted file mode 100644 index 607838133..000000000 --- a/yt_dlp/extractor/njpwworld.py +++ /dev/null @@ -1,82 +0,0 @@ -import re - -from .common import InfoExtractor -from ..compat import compat_urlparse -from ..utils import ( - get_element_by_class, - urlencode_postdata, -) - - -class NJPWWorldIE(InfoExtractor): - _VALID_URL = r'https?://(front\.)?njpwworld\.com/p/(?P<id>[a-z0-9_]+)' - IE_DESC = '新日本プロレスワールド' - _NETRC_MACHINE = 'njpwworld' - - _TESTS = [{ - 'url': 'http://njpwworld.com/p/s_series_00155_1_9/', - 'info_dict': { - 'id': 's_series_00155_1_9', - 'ext': 'mp4', - 'title': '闘強導夢2000 2000年1月4日 東京ドーム 第9試合 ランディ・サベージ VS リック・スタイナー', - 'tags': list, - }, - 'params': { - 'skip_download': True, # AES-encrypted m3u8 - }, - 'skip': 'Requires login', - }, { - 'url': 'https://front.njpwworld.com/p/s_series_00563_16_bs', - 'info_dict': { - 'id': 's_series_00563_16_bs', - 'ext': 'mp4', - 'title': 'WORLD TAG LEAGUE 2020 & BEST OF THE SUPER Jr.27 2020年12月6日 福岡・福岡国際センター バックステージコメント(字幕あり)', - 'tags': ["福岡・福岡国際センター", "バックステージコメント", "2020", "20年代"], - }, - 'params': { - 'skip_download': True, - }, - }] - - _LOGIN_URL = 'https://front.njpwworld.com/auth/login' - - def _perform_login(self, username, password): - # Setup session (will set necessary cookies) - self._request_webpage( - 'https://njpwworld.com/', None, note='Setting up session') - - webpage, urlh = self._download_webpage_handle( - self._LOGIN_URL, None, - note='Logging in', errnote='Unable to login', - data=urlencode_postdata({'login_id': username, 'pw': password}), - headers={'Referer': 'https://front.njpwworld.com/auth'}) - # /auth/login will return 302 for successful logins - if urlh.url == self._LOGIN_URL: - self.report_warning('unable to login') - return False - - return True - - def _real_extract(self, url): - video_id = self._match_id(url) - - webpage = self._download_webpage(url, video_id) - - formats = [] - for kind, vid in re.findall(r'if\s+\(\s*imageQualityType\s*==\s*\'([^\']+)\'\s*\)\s*{\s*video_id\s*=\s*"(\d+)"', webpage): - player_path = '/intent?id=%s&type=url' % vid - player_url = compat_urlparse.urljoin(url, player_path) - formats += self._extract_m3u8_formats( - player_url, video_id, 'mp4', 'm3u8_native', m3u8_id=kind, fatal=False, quality=int(kind == 'high')) - - tag_block = get_element_by_class('tag-block', webpage) - tags = re.findall( - r'<a[^>]+class="tag-[^"]+"[^>]*>([^<]+)</a>', tag_block - ) if tag_block else None - - return { - 'id': video_id, - 'title': get_element_by_class('article-title', webpage) or self._og_search_title(webpage), - 'formats': formats, - 'tags': tags, - } diff --git a/yt_dlp/extractor/normalboots.py b/yt_dlp/extractor/normalboots.py deleted file mode 100644 index 07babcd2c..000000000 --- a/yt_dlp/extractor/normalboots.py +++ /dev/null @@ -1,51 +0,0 @@ -from .common import InfoExtractor -from .jwplatform import JWPlatformIE - -from ..utils import ( - unified_strdate, -) - - -class NormalbootsIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?normalboots\.com/video/(?P<id>[0-9a-z-]*)/?$' - _TEST = { - 'url': 'http://normalboots.com/video/home-alone-games-jontron/', - 'info_dict': { - 'id': 'home-alone-games-jontron', - 'ext': 'mp4', - 'title': 'Home Alone Games - JonTron - NormalBoots', - 'description': 'Jon is late for Christmas. Typical. 
Thanks to: Paul Ritchey for Co-Writing/Filming: http://www.youtube.com/user/ContinueShow Michael Azzi for Christmas Intro Animation: http://michafrar.tumblr.com/ Jerrod Waters for Christmas Intro Music: http://www.youtube.com/user/xXJerryTerryXx Casey Ormond for ‘Tense Battle Theme’:\xa0http://www.youtube.com/Kiamet/', - 'uploader': 'JonTron', - 'upload_date': '20140125', - }, - 'params': { - # m3u8 download - 'skip_download': True, - }, - 'add_ie': ['JWPlatform'], - } - - def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - - video_uploader = self._html_search_regex( - r'Posted\sby\s<a\shref="[A-Za-z0-9/]*">(?P<uploader>[A-Za-z]*)\s</a>', - webpage, 'uploader', fatal=False) - video_upload_date = unified_strdate(self._html_search_regex( - r'<span style="text-transform:uppercase; font-size:inherit;">[A-Za-z]+, (?P<date>.*)</span>', - webpage, 'date', fatal=False)) - - jwplatform_url = JWPlatformIE._extract_url(webpage) - - return { - '_type': 'url_transparent', - 'id': video_id, - 'url': jwplatform_url, - 'ie_key': JWPlatformIE.ie_key(), - 'title': self._og_search_title(webpage), - 'description': self._og_search_description(webpage), - 'thumbnail': self._og_search_thumbnail(webpage), - 'uploader': video_uploader, - 'upload_date': video_upload_date, - } diff --git a/yt_dlp/extractor/nosvideo.py b/yt_dlp/extractor/nosvideo.py deleted file mode 100644 index 7e9688c0b..000000000 --- a/yt_dlp/extractor/nosvideo.py +++ /dev/null @@ -1,72 +0,0 @@ -import re - -from .common import InfoExtractor -from ..networking import Request -from ..utils import ( - ExtractorError, - urlencode_postdata, - xpath_text, - xpath_with_ns, -) - -_x = lambda p: xpath_with_ns(p, {'xspf': 'http://xspf.org/ns/0/'}) - - -class NosVideoIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?nosvideo\.com/' + \ - r'(?:embed/|\?v=)(?P<id>[A-Za-z0-9]{12})/?' 
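For context on the removed `nosvideo.py` module: its `_x` helper (above) wrapped `xpath_with_ns` because XSPF playlists place every element in the `http://xspf.org/ns/0/` namespace, and unqualified tag names find nothing in ElementTree. A standalone sketch of that behaviour using only the stdlib (the XML sample is invented for illustration):

    import xml.etree.ElementTree as ET

    XSPF = '{http://xspf.org/ns/0/}'  # ElementTree's Clark notation for the namespace
    doc = ET.fromstring(
        '<playlist xmlns="http://xspf.org/ns/0/"><trackList><track>'
        '<title>sample</title><location>http://example.com/v.mp4</location>'
        '</track></trackList></playlist>')

    # An unqualified search misses namespaced elements entirely...
    assert doc.find('.//track') is None
    # ...while the namespace-qualified tag resolves as expected.
    track = doc.find(f'.//{XSPF}track')
    assert track.find(f'{XSPF}title').text == 'sample'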
- _PLAYLIST_URL = 'http://nosvideo.com/xml/{xml_id:s}.xml' - _FILE_DELETED_REGEX = r'<b>File Not Found</b>' - _TEST = { - 'url': 'http://nosvideo.com/?v=mu8fle7g7rpq', - 'md5': '6124ed47130d8be3eacae635b071e6b6', - 'info_dict': { - 'id': 'mu8fle7g7rpq', - 'ext': 'mp4', - 'title': 'big_buck_bunny_480p_surround-fix.avi.mp4', - 'thumbnail': r're:^https?://.*\.jpg$', - } - } - - def _real_extract(self, url): - video_id = self._match_id(url) - - fields = { - 'id': video_id, - 'op': 'download1', - 'method_free': 'Continue to Video', - } - req = Request(url, urlencode_postdata(fields)) - req.headers['Content-type'] = 'application/x-www-form-urlencoded' - webpage = self._download_webpage(req, video_id, - 'Downloading download page') - if re.search(self._FILE_DELETED_REGEX, webpage) is not None: - raise ExtractorError('Video %s does not exist' % video_id, - expected=True) - - xml_id = self._search_regex(r'php\|([^\|]+)\|', webpage, 'XML ID') - playlist_url = self._PLAYLIST_URL.format(xml_id=xml_id) - playlist = self._download_xml(playlist_url, video_id) - - track = playlist.find(_x('.//xspf:track')) - if track is None: - raise ExtractorError( - 'XML playlist is missing the \'track\' element', - expected=True) - title = xpath_text(track, _x('./xspf:title'), 'title') - url = xpath_text(track, _x('./xspf:file'), 'URL', fatal=True) - thumbnail = xpath_text(track, _x('./xspf:image'), 'thumbnail') - if title is not None: - title = title.strip() - - formats = [{ - 'format_id': 'sd', - 'url': url, - }] - - return { - 'id': video_id, - 'title': title, - 'thumbnail': thumbnail, - 'formats': formats, - } diff --git a/yt_dlp/extractor/nova.py b/yt_dlp/extractor/nova.py index bd0c4ebe3..8a7dfceeb 100644 --- a/yt_dlp/extractor/nova.py +++ b/yt_dlp/extractor/nova.py @@ -13,7 +13,7 @@ class NovaEmbedIE(InfoExtractor): - _VALID_URL = r'https?://media\.cms\.nova\.cz/embed/(?P<id>[^/?#&]+)' + _VALID_URL = r'https?://media(?:tn)?\.cms\.nova\.cz/embed/(?P<id>[^/?#&]+)' _TESTS = [{ 'url': 'https://media.cms.nova.cz/embed/8o0n0r?autoplay=1', 'info_dict': { @@ -37,6 +37,16 @@ class NovaEmbedIE(InfoExtractor): 'duration': 114, }, 'params': {'skip_download': 'm3u8'}, + }, { + 'url': 'https://mediatn.cms.nova.cz/embed/EU5ELEsmOHt?autoplay=1', + 'info_dict': { + 'id': 'EU5ELEsmOHt', + 'ext': 'mp4', + 'title': 'Haptické křeslo, bionická ruka nebo roboti. 
Reportérka se podívala na Týden inovací', + 'thumbnail': r're:^https?://.*\.jpg', + 'duration': 1780, + }, + 'params': {'skip_download': 'm3u8'}, }] def _real_extract(self, url): diff --git a/yt_dlp/extractor/novaplay.py b/yt_dlp/extractor/novaplay.py index 92d1d136c..d8849cd88 100644 --- a/yt_dlp/extractor/novaplay.py +++ b/yt_dlp/extractor/novaplay.py @@ -3,7 +3,7 @@ class NovaPlayIE(InfoExtractor): - _VALID_URL = r'https://play.nova\.bg/video/.*/(?P<id>\d+)' + _VALID_URL = r'https://play\.nova\.bg/video/[^?#]+/(?P<id>\d+)' _TESTS = [ { 'url': 'https://play.nova.bg/video/ochakvaite/season-0/ochakvaite-2022-07-22-sybudi-se-sat/606627', diff --git a/yt_dlp/extractor/npo.py b/yt_dlp/extractor/npo.py index 40fee24d0..4d5ff50de 100644 --- a/yt_dlp/extractor/npo.py +++ b/yt_dlp/extractor/npo.py @@ -245,7 +245,7 @@ def _real_extract(self, url): 'quality': 'npoplus', 'tokenId': player_token, 'streamType': 'broadcast', - }) + }, data=b'') # endpoint requires POST if not streams: continue stream = streams.get('stream') diff --git a/yt_dlp/extractor/nrl.py b/yt_dlp/extractor/nrl.py index 798d03417..1e8cf0b75 100644 --- a/yt_dlp/extractor/nrl.py +++ b/yt_dlp/extractor/nrl.py @@ -2,6 +2,7 @@ class NRLTVIE(InfoExtractor): + _WORKING = False _VALID_URL = r'https?://(?:www\.)?nrl\.com/tv(/[^/]+)*/(?P<id>[^/?&#]+)' _TEST = { 'url': 'https://www.nrl.com/tv/news/match-highlights-titans-v-knights-862805/', diff --git a/yt_dlp/extractor/ntvde.py b/yt_dlp/extractor/ntvde.py index 6d7ea3d18..9f3a498ab 100644 --- a/yt_dlp/extractor/ntvde.py +++ b/yt_dlp/extractor/ntvde.py @@ -1,21 +1,21 @@ import re from .common import InfoExtractor -from ..compat import compat_urlparse from ..utils import ( int_or_none, js_to_json, - parse_duration, + url_or_none, ) +from ..utils.traversal import traverse_obj class NTVDeIE(InfoExtractor): IE_NAME = 'n-tv.de' - _VALID_URL = r'https?://(?:www\.)?n-tv\.de/mediathek/videos/[^/?#]+/[^/?#]+-article(?P<id>.+)\.html' + _VALID_URL = r'https?://(?:www\.)?n-tv\.de/mediathek/(?:videos|magazine)/[^/?#]+/[^/?#]+-article(?P<id>[^/?#]+)\.html' _TESTS = [{ 'url': 'http://www.n-tv.de/mediathek/videos/panorama/Schnee-und-Glaette-fuehren-zu-zahlreichen-Unfaellen-und-Staus-article14438086.html', - 'md5': '6ef2514d4b1e8e03ca24b49e2f167153', + 'md5': '6bcf2a6638cb83f45d5561659a1cb498', 'info_dict': { 'id': '14438086', 'ext': 'mp4', @@ -23,51 +23,61 @@ class NTVDeIE(InfoExtractor): 'title': 'Schnee und Glätte führen zu zahlreichen Unfällen und Staus', 'alt_title': 'Winterchaos auf deutschen Straßen', 'description': 'Schnee und Glätte sorgen deutschlandweit für einen chaotischen Start in die Woche: Auf den Straßen kommt es zu kilometerlangen Staus und Dutzenden Glätteunfällen. In Düsseldorf und München wirbelt der Schnee zudem den Flugplan durcheinander. Dutzende Flüge landen zu spät, einige fallen ganz aus.', - 'duration': 4020, + 'duration': 67, 'timestamp': 1422892797, 'upload_date': '20150202', }, + }, { + 'url': 'https://www.n-tv.de/mediathek/magazine/auslandsreport/Juedische-Siedler-wollten-Rache-die-wollten-nur-toeten-article24523089.html', + 'md5': 'c5c6014c014ccc3359470e1d34472bfd', + 'info_dict': { + 'id': '24523089', + 'ext': 'mp4', + 'thumbnail': r're:^https?://.*\.jpg$', + 'title': 'Jüdische Siedler "wollten Rache, die wollten nur töten"', + 'alt_title': 'Israelische Gewalt fern von Gaza', + 'description': 'Vier Tage nach dem Massaker der Hamas greifen jüdische Siedler das Haus einer palästinensischen Familie im Westjordanland an. 
Die Überlebenden berichten, sie waren unbewaffnet, die Angreifer seien nur auf "Rache und Töten" aus gewesen. Als die Toten beerdigt werden sollen, eröffnen die Siedler erneut das Feuer.', + 'duration': 326, + 'timestamp': 1699688294, + 'upload_date': '20231111', + }, }] def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - info = self._parse_json(self._search_regex( - r'(?s)ntv\.pageInfo\.article\s*=\s*(\{.*?\});', webpage, 'info'), - video_id, transform_source=js_to_json) - timestamp = int_or_none(info.get('publishedDateAsUnixTimeStamp')) - vdata = self._parse_json(self._search_regex( - r'(?s)\$\(\s*"\#player"\s*\)\s*\.data\(\s*"player",\s*(\{.*?\})\);', - webpage, 'player data'), video_id, - transform_source=lambda s: js_to_json(re.sub(r'advertising:\s*{[^}]+},', '', s))) - duration = parse_duration(vdata.get('duration')) + info = self._search_json( + r'article:', webpage, 'info', video_id, transform_source=js_to_json) + + vdata = self._search_json( + r'\$\(\s*"#playerwrapper"\s*\)\s*\.data\(\s*"player",', + webpage, 'player data', video_id, + transform_source=lambda s: js_to_json(re.sub(r'ivw:[^},]+', '', s)))['setup']['source'] formats = [] - if vdata.get('video'): + if vdata.get('progressive'): formats.append({ - 'format_id': 'flash', - 'url': 'rtmp://fms.n-tv.de/%s' % vdata['video'], + 'format_id': 'http', + 'url': vdata['progressive'], }) - if vdata.get('videoMp4'): - formats.append({ - 'format_id': 'mobile', - 'url': compat_urlparse.urljoin('http://video.n-tv.de', vdata['videoMp4']), - 'tbr': 400, # estimation - }) - if vdata.get('videoM3u8'): - m3u8_url = compat_urlparse.urljoin('http://video.n-tv.de', vdata['videoM3u8']) + if vdata.get('hls'): formats.extend(self._extract_m3u8_formats( - m3u8_url, video_id, ext='mp4', entry_protocol='m3u8_native', - quality=1, m3u8_id='hls', fatal=False)) + vdata['hls'], video_id, 'mp4', m3u8_id='hls', fatal=False)) + if vdata.get('dash'): + formats.extend(self._extract_mpd_formats(vdata['dash'], video_id, fatal=False, mpd_id='dash')) return { 'id': video_id, - 'title': info['headline'], - 'description': info.get('intro'), - 'alt_title': info.get('kicker'), - 'timestamp': timestamp, - 'thumbnail': vdata.get('html5VideoPoster'), - 'duration': duration, + **traverse_obj(info, { + 'title': 'headline', + 'description': 'intro', + 'alt_title': 'kicker', + 'timestamp': ('publishedDateAsUnixTimeStamp', {int_or_none}), + }), + **traverse_obj(vdata, { + 'thumbnail': ('poster', {url_or_none}), + 'duration': ('length', {int_or_none}), + }), 'formats': formats, } diff --git a/yt_dlp/extractor/nubilesporn.py b/yt_dlp/extractor/nubilesporn.py index d4f1d9d67..1d630f547 100644 --- a/yt_dlp/extractor/nubilesporn.py +++ b/yt_dlp/extractor/nubilesporn.py @@ -19,7 +19,7 @@ class NubilesPornIE(InfoExtractor): _NETRC_MACHINE = 'nubiles-porn' _VALID_URL = r'''(?x) - https://members.nubiles-porn.com/video/watch/(?P<id>\d+) + https://members\.nubiles-porn\.com/video/watch/(?P<id>\d+) (?:/(?P<display_id>[\w\-]+-s(?P<season>\d+)e(?P<episode>\d+)))? 
''' diff --git a/yt_dlp/extractor/oftv.py b/yt_dlp/extractor/oftv.py index 3ae7278fb..4cac51846 100644 --- a/yt_dlp/extractor/oftv.py +++ b/yt_dlp/extractor/oftv.py @@ -4,7 +4,7 @@ class OfTVIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?of.tv/video/(?P<id>\w+)' + _VALID_URL = r'https?://(?:www\.)?of\.tv/video/(?P<id>\w+)' _TESTS = [{ 'url': 'https://of.tv/video/627d7d95b353db0001dadd1a', 'md5': 'cb9cd5db3bb9ee0d32bfd7e373d6ef0a', @@ -34,7 +34,7 @@ def _real_extract(self, url): class OfTVPlaylistIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?of.tv/creators/(?P<id>[a-zA-Z0-9-]+)/.?' + _VALID_URL = r'https?://(?:www\.)?of\.tv/creators/(?P<id>[a-zA-Z0-9-]+)/?(?:$|[?#])' _TESTS = [{ 'url': 'https://of.tv/creators/this-is-fire/', 'playlist_count': 8, diff --git a/yt_dlp/extractor/ondemandkorea.py b/yt_dlp/extractor/ondemandkorea.py index dd7d1d7de..94fcac720 100644 --- a/yt_dlp/extractor/ondemandkorea.py +++ b/yt_dlp/extractor/ondemandkorea.py @@ -1,87 +1,168 @@ +import functools import re +import uuid from .common import InfoExtractor from ..utils import ( ExtractorError, - js_to_json, + OnDemandPagedList, + float_or_none, + int_or_none, + join_nonempty, + parse_age_limit, + parse_qs, + unified_strdate, + url_or_none, ) +from ..utils.traversal import traverse_obj class OnDemandKoreaIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?ondemandkorea\.com/(?P<id>[^/]+)\.html' + _VALID_URL = r'https?://(?:www\.)?ondemandkorea\.com/(?:en/)?player/vod/[a-z0-9-]+\?(?:[^#]+&)?contentId=(?P<id>\d+)' _GEO_COUNTRIES = ['US', 'CA'] + _TESTS = [{ - 'url': 'https://www.ondemandkorea.com/ask-us-anything-e351.html', + 'url': 'https://www.ondemandkorea.com/player/vod/ask-us-anything?contentId=686471', + 'md5': 'e2ff77255d989e3135bde0c5889fbce8', 'info_dict': { - 'id': 'ask-us-anything-e351', + 'id': '686471', 'ext': 'mp4', - 'title': 'Ask Us Anything : Jung Sung-ho, Park Seul-gi, Kim Bo-min, Yang Seung-won - 09/24/2022', - 'description': 'A talk show/game show with a school theme where celebrity guests appear as “transfer students.”', - 'thumbnail': r're:^https?://.*\.jpg$', + 'title': 'Ask Us Anything: Jung Sung-ho, Park Seul-gi, Kim Bo-min, Yang Seung-won', + 'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)', + 'duration': 5486.955, + 'release_date': '20220924', + 'series': 'Ask Us Anything', + 'series_id': 11790, + 'episode_number': 351, + 'episode': 'Jung Sung-ho, Park Seul-gi, Kim Bo-min, Yang Seung-won', }, - 'params': { - 'skip_download': 'm3u8 download' - } }, { - 'url': 'https://www.ondemandkorea.com/work-later-drink-now-e1.html', + 'url': 'https://www.ondemandkorea.com/player/vod/breakup-probation-a-week?contentId=1595796', + 'md5': '57266c720006962be7ff415b24775caa', 'info_dict': { - 'id': 'work-later-drink-now-e1', + 'id': '1595796', 'ext': 'mp4', - 'title': 'Work Later, Drink Now : E01', - 'description': 'Work Later, Drink First follows three women who find solace in a glass of liquor at the end of the day. 
So-hee, who gets comfort from a cup of soju af', - 'thumbnail': r're:^https?://.*\.png$', - 'subtitles': { - 'English': 'mincount:1', - }, + 'title': 'Breakup Probation, A Week: E08', + 'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)', + 'duration': 1586.0, + 'release_date': '20231001', + 'series': 'Breakup Probation, A Week', + 'series_id': 22912, + 'episode_number': 8, + 'episode': 'E08', }, - 'params': { - 'skip_download': 'm3u8 download' - } + }, { + 'url': 'https://www.ondemandkorea.com/player/vod/the-outlaws?contentId=369531', + 'md5': 'fa5523b87aa1f6d74fc622a97f2b47cd', + 'info_dict': { + 'id': '369531', + 'ext': 'mp4', + 'release_date': '20220519', + 'duration': 7267.0, + 'title': 'The Outlaws: Main Movie', + 'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)', + 'age_limit': 18, + }, + }, { + 'url': 'https://www.ondemandkorea.com/en/player/vod/capture-the-moment-how-is-that-possible?contentId=1605006', + 'only_matching': True, }] def _real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id, fatal=False) - if not webpage: - # Page sometimes returns captcha page with HTTP 403 - raise ExtractorError( - 'Unable to access page. You may have been blocked.', - expected=True) + data = self._download_json( + f'https://odkmedia.io/odx/api/v3/playback/{video_id}/', video_id, fatal=False, + headers={'service-name': 'odk'}, query={'did': str(uuid.uuid4())}, expected_status=(403, 404)) + if not traverse_obj(data, ('result', {dict})): + msg = traverse_obj(data, ('messages', '__default'), 'title', expected_type=str) + raise ExtractorError(msg or 'Got empty response from playback API', expected=True) - if 'msg_block_01.png' in webpage: - self.raise_geo_restricted( - msg='This content is not available in your region', - countries=self._GEO_COUNTRIES) + data = data['result'] - if 'This video is only available to ODK PLUS members.' 
in webpage: - raise ExtractorError( - 'This video is only available to ODK PLUS members.', - expected=True) + def try_geo_bypass(url): + return traverse_obj(url, ({parse_qs}, 'stream_url', 0, {url_or_none})) or url - if 'ODK PREMIUM Members Only' in webpage: - raise ExtractorError( - 'This video is only available to ODK PREMIUM members.', - expected=True) + formats = [] + for m3u8_url in traverse_obj(data, (('sources', 'manifest'), ..., 'url', {url_or_none}, {try_geo_bypass})): + mod_url = re.sub(r'_720(p?)\.m3u8', r'_1080\1.m3u8', m3u8_url) + if mod_url != m3u8_url: + mod_format = self._extract_m3u8_formats( + mod_url, video_id, note='Checking for higher quality format', + errnote='No higher quality format found', fatal=False) + if mod_format: + formats.extend(mod_format) + continue + formats.extend(self._extract_m3u8_formats(m3u8_url, video_id, fatal=False)) - title = self._search_regex( - r'class=["\']episode_title["\'][^>]*>([^<]+)', - webpage, 'episode_title', fatal=False) or self._og_search_title(webpage) + subtitles = {} + for track in traverse_obj(data, ('text_tracks', lambda _, v: url_or_none(v['url']))): + subtitles.setdefault(track.get('language', 'und'), []).append({ + 'url': track['url'], + 'ext': track.get('codec'), + 'name': track.get('label'), + }) - jw_config = self._parse_json( - self._search_regex(( - r'(?P<options>{\s*[\'"]tracks[\'"].*?})[)\];]+$', - r'playlist\s*=\s*\[(?P<options>.+)];?$', - r'odkPlayer\.init.*?(?P<options>{[^;]+}).*?;', - ), webpage, 'jw config', flags=re.MULTILINE | re.DOTALL, group='options'), - video_id, transform_source=js_to_json) - info = self._parse_jwplayer_data( - jw_config, video_id, require_title=False, m3u8_id='hls', - base_url=url) + def if_series(key=None): + return lambda obj: obj[key] if key and obj['kind'] == 'series' else None - info.update({ - 'title': title, - 'description': self._og_search_description(webpage), - 'thumbnail': self._og_search_thumbnail(webpage) - }) - return info + return { + 'id': video_id, + 'title': join_nonempty( + ('episode', 'program', 'title'), + ('episode', 'title'), from_dict=data, delim=': '), + **traverse_obj(data, { + 'thumbnail': ('episode', 'images', 'thumbnail', {url_or_none}), + 'release_date': ('episode', 'release_date', {lambda x: x.replace('-', '')}, {unified_strdate}), + 'duration': ('duration', {functools.partial(float_or_none, scale=1000)}), + 'age_limit': ('age_rating', 'name', {lambda x: x.replace('R', '')}, {parse_age_limit}), + 'series': ('episode', {if_series(key='program')}, 'title'), + 'series_id': ('episode', {if_series(key='program')}, 'id'), + 'episode': ('episode', {if_series(key='title')}), + 'episode_number': ('episode', {if_series(key='number')}, {int_or_none}), + }, get_all=False), + 'formats': formats, + 'subtitles': subtitles, + } + + +class OnDemandKoreaProgramIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?ondemandkorea\.com/(?:en/)?player/vod/(?P<id>[a-z0-9-]+)(?:$|#)' + _GEO_COUNTRIES = ['US', 'CA'] + + _TESTS = [{ + 'url': 'https://www.ondemandkorea.com/player/vod/uskn-news', + 'info_dict': { + 'id': 'uskn-news', + }, + 'playlist_mincount': 755, + }, { + 'url': 'https://www.ondemandkorea.com/en/player/vod/the-land', + 'info_dict': { + 'id': 'the-land', + }, + 'playlist_count': 52, + }] + + _PAGE_SIZE = 100 + + def _fetch_page(self, display_id, page): + page += 1 + page_data = self._download_json( + f'https://odkmedia.io/odx/api/v3/program/{display_id}/episodes/', display_id, + headers={'service-name': 'odk'}, query={ + 'page': page, + 'page_size': self._PAGE_SIZE, 
+ }, note=f'Downloading page {page}', expected_status=404) + for episode in traverse_obj(page_data, ('result', 'results', ...)): + yield self.url_result( + f'https://www.ondemandkorea.com/player/vod/{display_id}?contentId={episode["id"]}', + ie=OnDemandKoreaIE, video_title=episode.get('title')) + + def _real_extract(self, url): + display_id = self._match_id(url) + + entries = OnDemandPagedList(functools.partial( + self._fetch_page, display_id), self._PAGE_SIZE) + + return self.playlist_result(entries, display_id) diff --git a/yt_dlp/extractor/ooyala.py b/yt_dlp/extractor/ooyala.py deleted file mode 100644 index 65afccdb1..000000000 --- a/yt_dlp/extractor/ooyala.py +++ /dev/null @@ -1,230 +0,0 @@ -import base64 -import re - -from .common import InfoExtractor -from ..compat import ( - compat_b64decode, - compat_str, -) -from ..utils import ( - determine_ext, - float_or_none, - int_or_none, - smuggle_url, - try_get, - unsmuggle_url, -) - - -class OoyalaBaseIE(InfoExtractor): - _PLAYER_BASE = 'http://player.ooyala.com/' - _CONTENT_TREE_BASE = _PLAYER_BASE + 'player_api/v1/content_tree/' - _AUTHORIZATION_URL_TEMPLATE = _PLAYER_BASE + 'sas/player_api/v2/authorization/embed_code/%s/%s' - - def _extract(self, content_tree_url, video_id, domain=None, supportedformats=None, embed_token=None): - content_tree = self._download_json(content_tree_url, video_id)['content_tree'] - metadata = content_tree[list(content_tree)[0]] - embed_code = metadata['embed_code'] - pcode = metadata.get('asset_pcode') or embed_code - title = metadata['title'] - - auth_data = self._download_json( - self._AUTHORIZATION_URL_TEMPLATE % (pcode, embed_code), - video_id, headers=self.geo_verification_headers(), query={ - 'domain': domain or 'player.ooyala.com', - 'supportedFormats': supportedformats or 'mp4,rtmp,m3u8,hds,dash,smooth', - 'embedToken': embed_token, - })['authorization_data'][embed_code] - - urls = [] - formats = [] - streams = auth_data.get('streams') or [{ - 'delivery_type': 'hls', - 'url': { - 'data': base64.b64encode(('http://player.ooyala.com/hls/player/all/%s.m3u8' % embed_code).encode()).decode(), - } - }] - for stream in streams: - url_data = try_get(stream, lambda x: x['url']['data'], compat_str) - if not url_data: - continue - s_url = compat_b64decode(url_data).decode('utf-8') - if not s_url or s_url in urls: - continue - urls.append(s_url) - ext = determine_ext(s_url, None) - delivery_type = stream.get('delivery_type') - if delivery_type == 'hls' or ext == 'm3u8': - formats.extend(self._extract_m3u8_formats( - re.sub(r'/ip(?:ad|hone)/', '/all/', s_url), embed_code, 'mp4', 'm3u8_native', - m3u8_id='hls', fatal=False)) - elif delivery_type == 'hds' or ext == 'f4m': - formats.extend(self._extract_f4m_formats( - s_url + '?hdcore=3.7.0', embed_code, f4m_id='hds', fatal=False)) - elif delivery_type == 'dash' or ext == 'mpd': - formats.extend(self._extract_mpd_formats( - s_url, embed_code, mpd_id='dash', fatal=False)) - elif delivery_type == 'smooth': - self._extract_ism_formats( - s_url, embed_code, ism_id='mss', fatal=False) - elif ext == 'smil': - formats.extend(self._extract_smil_formats( - s_url, embed_code, fatal=False)) - else: - formats.append({ - 'url': s_url, - 'ext': ext or delivery_type, - 'vcodec': stream.get('video_codec'), - 'format_id': delivery_type, - 'width': int_or_none(stream.get('width')), - 'height': int_or_none(stream.get('height')), - 'abr': int_or_none(stream.get('audio_bitrate')), - 'vbr': int_or_none(stream.get('video_bitrate')), - 'fps': float_or_none(stream.get('framerate')), - }) - 
if not formats and not auth_data.get('authorized'): - self.raise_no_formats('%s said: %s' % ( - self.IE_NAME, auth_data['message']), expected=True) - - subtitles = {} - for lang, sub in metadata.get('closed_captions_vtt', {}).get('captions', {}).items(): - sub_url = sub.get('url') - if not sub_url: - continue - subtitles[lang] = [{ - 'url': sub_url, - }] - - return { - 'id': embed_code, - 'title': title, - 'description': metadata.get('description'), - 'thumbnail': metadata.get('thumbnail_image') or metadata.get('promo_image'), - 'duration': float_or_none(metadata.get('duration'), 1000), - 'subtitles': subtitles, - 'formats': formats, - } - - -class OoyalaIE(OoyalaBaseIE): - _VALID_URL = r'(?:ooyala:|https?://.+?\.ooyala\.com/.*?(?:embedCode|ec)=)(?P<id>.+?)(&|$)' - - _TESTS = [ - { - # From http://it.slashdot.org/story/13/04/25/178216/recovering-data-from-broken-hard-drives-and-ssds-video - 'url': 'http://player.ooyala.com/player.js?embedCode=pxczE2YjpfHfn1f3M-ykG_AmJRRn0PD8', - 'info_dict': { - 'id': 'pxczE2YjpfHfn1f3M-ykG_AmJRRn0PD8', - 'ext': 'mp4', - 'title': 'Explaining Data Recovery from Hard Drives and SSDs', - 'description': 'How badly damaged does a drive have to be to defeat Russell and his crew? Apparently, smashed to bits.', - 'duration': 853.386, - }, - # The video in the original webpage now uses PlayWire - 'skip': 'Ooyala said: movie expired', - }, { - # Only available for ipad - 'url': 'http://player.ooyala.com/player.js?embedCode=x1b3lqZDq9y_7kMyC2Op5qo-p077tXD0', - 'info_dict': { - 'id': 'x1b3lqZDq9y_7kMyC2Op5qo-p077tXD0', - 'ext': 'mp4', - 'title': 'Simulation Overview - Levels of Simulation', - 'duration': 194.948, - }, - }, - { - # Information available only through SAS api - # From http://community.plm.automation.siemens.com/t5/News-NX-Manufacturing/Tool-Path-Divide/ba-p/4187 - 'url': 'http://player.ooyala.com/player.js?embedCode=FiOG81ZTrvckcchQxmalf4aQj590qTEx', - 'md5': 'a84001441b35ea492bc03736e59e7935', - 'info_dict': { - 'id': 'FiOG81ZTrvckcchQxmalf4aQj590qTEx', - 'ext': 'mp4', - 'title': 'Divide Tool Path.mp4', - 'duration': 204.405, - } - }, - { - # empty stream['url']['data'] - 'url': 'http://player.ooyala.com/player.js?embedCode=w2bnZtYjE6axZ_dw1Cd0hQtXd_ige2Is', - 'only_matching': True, - } - ] - - def _extract_from_webpage(self, url, webpage): - mobj = (re.search(r'player\.ooyala\.com/[^"?]+[?#][^"]*?(?:embedCode|ec)=(?P<ec>[^"&]+)', webpage) - or re.search(r'OO\.Player\.create\([\'"].*?[\'"],\s*[\'"](?P<ec>.{32})[\'"]', webpage) - or re.search(r'OO\.Player\.create\.apply\(\s*OO\.Player\s*,\s*op\(\s*\[\s*[\'"][^\'"]*[\'"]\s*,\s*[\'"](?P<ec>.{32})[\'"]', webpage) - or re.search(r'SBN\.VideoLinkset\.ooyala\([\'"](?P<ec>.{32})[\'"]\)', webpage) - or re.search(r'data-ooyala-video-id\s*=\s*[\'"](?P<ec>.{32})[\'"]', webpage)) - if mobj is not None: - embed_token = self._search_regex( - r'embedToken[\'"]?\s*:\s*[\'"]([^\'"]+)', - webpage, 'ooyala embed token', default=None) - yield self._build_url_result(smuggle_url( - mobj.group('ec'), { - 'domain': url, - 'embed_token': embed_token, - })) - return - - # Look for multiple Ooyala embeds on SBN network websites - mobj = re.search(r'SBN\.VideoLinkset\.entryGroup\((\[.*?\])', webpage) - if mobj is not None: - for v in self._parse_json(mobj.group(1), self._generic_id(url), fatal=False) or []: - yield self._build_url_result(smuggle_url(v['provider_video_id'], {'domain': url})) - - @staticmethod - def _url_for_embed_code(embed_code): - return 'http://player.ooyala.com/player.js?embedCode=%s' % embed_code - - 
@classmethod - def _build_url_result(cls, embed_code): - return cls.url_result(cls._url_for_embed_code(embed_code), - ie=cls.ie_key()) - - def _real_extract(self, url): - url, smuggled_data = unsmuggle_url(url, {}) - embed_code = self._match_id(url) - domain = smuggled_data.get('domain') - supportedformats = smuggled_data.get('supportedformats') - embed_token = smuggled_data.get('embed_token') - content_tree_url = self._CONTENT_TREE_BASE + 'embed_code/%s/%s' % (embed_code, embed_code) - return self._extract(content_tree_url, embed_code, domain, supportedformats, embed_token) - - -class OoyalaExternalIE(OoyalaBaseIE): - _VALID_URL = r'''(?x) - (?: - ooyalaexternal:| - https?://.+?\.ooyala\.com/.*?\bexternalId= - ) - (?P<partner_id>[^:]+) - : - (?P<id>.+) - (?: - :| - .*?&pcode= - ) - (?P<pcode>.+?) - (?:&|$) - ''' - - _TEST = { - 'url': 'https://player.ooyala.com/player.js?externalId=espn:10365079&pcode=1kNG061cgaoolOncv54OAO1ceO-I&adSetCode=91cDU6NuXTGKz3OdjOxFdAgJVtQcKJnI&callback=handleEvents&hasModuleParams=1&height=968&playerBrandingId=7af3bd04449c444c964f347f11873075&targetReplaceId=videoPlayer&width=1656&wmode=opaque&allowScriptAccess=always', - 'info_dict': { - 'id': 'FkYWtmazr6Ed8xmvILvKLWjd4QvYZpzG', - 'ext': 'mp4', - 'title': 'dm_140128_30for30Shorts___JudgingJewellv2', - 'duration': 1302.0, - }, - 'params': { - # m3u8 download - 'skip_download': True, - }, - } - - def _real_extract(self, url): - partner_id, video_id, pcode = self._match_valid_url(url).groups() - content_tree_url = self._CONTENT_TREE_BASE + 'external_id/%s/%s:%s' % (pcode, partner_id, video_id) - return self._extract(content_tree_url, video_id) diff --git a/yt_dlp/extractor/orf.py b/yt_dlp/extractor/orf.py index cc3c003fa..9a48ae1b3 100644 --- a/yt_dlp/extractor/orf.py +++ b/yt_dlp/extractor/orf.py @@ -4,15 +4,16 @@ from .common import InfoExtractor from ..networking import HEADRequest from ..utils import ( + InAdvancePagedList, clean_html, determine_ext, float_or_none, - InAdvancePagedList, int_or_none, join_nonempty, + make_archive_id, + mimetype2ext, orderedSet, remove_end, - make_archive_id, smuggle_url, strip_jsonp, try_call, @@ -21,6 +22,7 @@ unsmuggle_url, url_or_none, ) +from ..utils.traversal import traverse_obj class ORFTVthekIE(InfoExtractor): @@ -334,6 +336,45 @@ def _real_extract(self, url): self._entries(data, station or station2), show_id, data.get('title'), clean_html(data.get('subtitle'))) +class ORFPodcastIE(InfoExtractor): + IE_NAME = 'orf:podcast' + _STATION_RE = '|'.join(map(re.escape, ( + 'bgl', 'fm4', 'ktn', 'noe', 'oe1', 'oe3', + 'ooe', 'sbg', 'stm', 'tir', 'tv', 'vbg', 'wie'))) + _VALID_URL = rf'https?://sound\.orf\.at/podcast/(?P<station>{_STATION_RE})/(?P<show>[\w-]+)/(?P<id>[\w-]+)' + _TESTS = [{ + 'url': 'https://sound.orf.at/podcast/oe3/fruehstueck-bei-mir/nicolas-stockhammer-15102023', + 'md5': '526a5700e03d271a1505386a8721ab9b', + 'info_dict': { + 'id': 'nicolas-stockhammer-15102023', + 'ext': 'mp3', + 'title': 'Nicolas Stockhammer (15.10.2023)', + 'duration': 3396.0, + 'series': 'Frühstück bei mir', + }, + 'skip': 'ORF podcasts are only available for a limited time' + }] + + def _real_extract(self, url): + station, show, show_id = self._match_valid_url(url).group('station', 'show', 'id') + data = self._download_json( + f'https://audioapi.orf.at/radiothek/api/2.0/podcast/{station}/{show}/{show_id}', show_id) + + return { + 'id': show_id, + 'ext': 'mp3', + 'vcodec': 'none', + **traverse_obj(data, ('payload', { + 'url': ('enclosures', 0, 'url'), + 'ext': ('enclosures', 0, 'type', 
{mimetype2ext}), + 'title': 'title', + 'description': ('description', {clean_html}), + 'duration': ('duration', {functools.partial(float_or_none, scale=1000)}), + 'series': ('podcast', 'title'), + })), + } + + class ORFIPTVIE(InfoExtractor): IE_NAME = 'orf:iptv' IE_DESC = 'iptv.ORF.at' diff --git a/yt_dlp/extractor/pandoratv.py b/yt_dlp/extractor/pandoratv.py deleted file mode 100644 index ccc78da57..000000000 --- a/yt_dlp/extractor/pandoratv.py +++ /dev/null @@ -1,128 +0,0 @@ -from .common import InfoExtractor -from ..compat import ( - compat_str, -) -from ..utils import ( - ExtractorError, - float_or_none, - parse_duration, - parse_qs, - str_to_int, - urlencode_postdata, -) - - -class PandoraTVIE(InfoExtractor): - IE_NAME = 'pandora.tv' - IE_DESC = '판도라TV' - _VALID_URL = r'''(?x) - https?:// - (?: - (?:www\.)?pandora\.tv/view/(?P<user_id>[^/]+)/(?P<id>\d+)| # new format - (?:.+?\.)?channel\.pandora\.tv/channel/video\.ptv\?| # old format - m\.pandora\.tv/?\? # mobile - ) - ''' - _TESTS = [{ - 'url': 'http://jp.channel.pandora.tv/channel/video.ptv?c1=&prgid=53294230&ch_userid=mikakim&ref=main&lot=cate_01_2', - 'info_dict': { - 'id': '53294230', - 'ext': 'flv', - 'title': '頭を撫でてくれる?', - 'description': '頭を撫でてくれる?', - 'thumbnail': r're:^https?://.*\.jpg$', - 'duration': 39, - 'upload_date': '20151218', - 'uploader': 'カワイイ動物まとめ', - 'uploader_id': 'mikakim', - 'view_count': int, - 'like_count': int, - } - }, { - 'url': 'http://channel.pandora.tv/channel/video.ptv?ch_userid=gogoucc&prgid=54721744', - 'info_dict': { - 'id': '54721744', - 'ext': 'flv', - 'title': '[HD] JAPAN COUNTDOWN 170423', - 'description': '[HD] JAPAN COUNTDOWN 170423', - 'thumbnail': r're:^https?://.*\.jpg$', - 'duration': 1704.9, - 'upload_date': '20170423', - 'uploader': 'GOGO_UCC', - 'uploader_id': 'gogoucc', - 'view_count': int, - 'like_count': int, - }, - 'params': { - # Test metadata only - 'skip_download': True, - }, - }, { - 'url': 'http://www.pandora.tv/view/mikakim/53294230#36797454_new', - 'only_matching': True, - }, { - 'url': 'http://m.pandora.tv/?c=view&ch_userid=mikakim&prgid=54600346', - 'only_matching': True, - }] - - def _real_extract(self, url): - mobj = self._match_valid_url(url) - user_id = mobj.group('user_id') - video_id = mobj.group('id') - - if not user_id or not video_id: - qs = parse_qs(url) - video_id = qs.get('prgid', [None])[0] - user_id = qs.get('ch_userid', [None])[0] - if any(not f for f in (video_id, user_id,)): - raise ExtractorError('Invalid URL', expected=True) - - data = self._download_json( - 'http://m.pandora.tv/?c=view&m=viewJsonApi&ch_userid=%s&prgid=%s' - % (user_id, video_id), video_id) - - info = data['data']['rows']['vod_play_info']['result'] - - formats = [] - for format_id, format_url in info.items(): - if not format_url: - continue - height = self._search_regex( - r'^v(\d+)[Uu]rl$', format_id, 'height', default=None) - if not height: - continue - - play_url = self._download_json( - 'http://m.pandora.tv/?c=api&m=play_url', video_id, - data=urlencode_postdata({ - 'prgid': video_id, - 'runtime': info.get('runtime'), - 'vod_url': format_url, - }), - headers={ - 'Origin': url, - 'Content-Type': 'application/x-www-form-urlencoded', - }) - format_url = play_url.get('url') - if not format_url: - continue - - formats.append({ - 'format_id': '%sp' % height, - 'url': format_url, - 'height': int(height), - }) - - return { - 'id': video_id, - 'title': info['subject'], - 'description': info.get('body'), - 'thumbnail': info.get('thumbnail') or info.get('poster'), - 'duration': 
float_or_none(info.get('runtime'), 1000) or parse_duration(info.get('time')), - 'upload_date': info['fid'].split('/')[-1][:8] if isinstance(info.get('fid'), compat_str) else None, - 'uploader': info.get('nickname'), - 'uploader_id': info.get('upload_userid'), - 'view_count': str_to_int(info.get('hit')), - 'like_count': str_to_int(info.get('likecnt')), - 'formats': formats, - } diff --git a/yt_dlp/extractor/panopto.py b/yt_dlp/extractor/panopto.py index 5ab2b2bce..ddea32d70 100644 --- a/yt_dlp/extractor/panopto.py +++ b/yt_dlp/extractor/panopto.py @@ -536,7 +536,7 @@ def _fetch_page(self, base_url, query_params, display_id, page): } response = self._call_api( - base_url, '/Services/Data.svc/GetSessions', f'{display_id} page {page+1}', + base_url, '/Services/Data.svc/GetSessions', f'{display_id} page {page + 1}', data={'queryParameters': params}, fatal=False) for result in get_first(response, 'Results', default=[]): diff --git a/yt_dlp/extractor/people.py b/yt_dlp/extractor/people.py deleted file mode 100644 index c5143c3ed..000000000 --- a/yt_dlp/extractor/people.py +++ /dev/null @@ -1,29 +0,0 @@ -from .common import InfoExtractor - - -class PeopleIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?people\.com/people/videos/0,,(?P<id>\d+),00\.html' - - _TEST = { - 'url': 'http://www.people.com/people/videos/0,,20995451,00.html', - 'info_dict': { - 'id': 'ref:20995451', - 'ext': 'mp4', - 'title': 'Astronaut Love Triangle Victim Speaks Out: “The Crime in 2007 Hasn’t Defined Us”', - 'description': 'Colleen Shipman speaks to PEOPLE for the first time about life after the attack', - 'thumbnail': r're:^https?://.*\.jpg', - 'duration': 246.318, - 'timestamp': 1458720585, - 'upload_date': '20160323', - 'uploader_id': '416418724', - }, - 'params': { - 'skip_download': True, - }, - 'add_ie': ['BrightcoveNew'], - } - - def _real_extract(self, url): - return self.url_result( - 'http://players.brightcove.net/416418724/default_default/index.html?videoId=ref:%s' - % self._match_id(url), 'BrightcoveNew') diff --git a/yt_dlp/extractor/periscope.py b/yt_dlp/extractor/periscope.py index 84bcf1573..d2351df1a 100644 --- a/yt_dlp/extractor/periscope.py +++ b/yt_dlp/extractor/periscope.py @@ -4,6 +4,7 @@ parse_iso8601, unescapeHTML, ) +from ..utils.traversal import traverse_obj class PeriscopeBaseIE(InfoExtractor): @@ -20,22 +21,26 @@ def _parse_broadcast_data(self, broadcast, video_id): title = broadcast.get('status') or 'Periscope Broadcast' uploader = broadcast.get('user_display_name') or broadcast.get('username') title = '%s - %s' % (uploader, title) if uploader else title - is_live = broadcast.get('state').lower() == 'running' - thumbnails = [{ 'url': broadcast[image], - } for image in ('image_url', 'image_url_small') if broadcast.get(image)] + } for image in ('image_url', 'image_url_medium', 'image_url_small') if broadcast.get(image)] return { 'id': broadcast.get('id') or video_id, 'title': title, - 'timestamp': parse_iso8601(broadcast.get('created_at')), + 'timestamp': parse_iso8601(broadcast.get('created_at')) or int_or_none( + broadcast.get('created_at_ms'), scale=1000), + 'release_timestamp': int_or_none(broadcast.get('scheduled_start_ms'), scale=1000), 'uploader': uploader, 'uploader_id': broadcast.get('user_id') or broadcast.get('username'), 'thumbnails': thumbnails, 'view_count': int_or_none(broadcast.get('total_watched')), + 'concurrent_view_count': int_or_none(broadcast.get('total_watching')), 'tags': broadcast.get('tags'), - 'is_live': is_live, + 'live_status': { + 'running': 'is_live', + 
'not_started': 'is_upcoming', + }.get(traverse_obj(broadcast, ('state', {str.lower}))) or 'was_live' } @staticmethod diff --git a/yt_dlp/extractor/piapro.py b/yt_dlp/extractor/piapro.py index 5f39e0639..3ae985da2 100644 --- a/yt_dlp/extractor/piapro.py +++ b/yt_dlp/extractor/piapro.py @@ -12,7 +12,7 @@ class PiaproIE(InfoExtractor): _NETRC_MACHINE = 'piapro' - _VALID_URL = r'https?://piapro\.jp/(?:t|content)/(?P<id>\w+)/?' + _VALID_URL = r'https?://piapro\.jp/(?:t|content)/(?P<id>[\w-]+)/?' _TESTS = [{ 'url': 'https://piapro.jp/t/NXYR', 'md5': 'f7c0f760913fb1d44a1c45a4af793909', @@ -49,6 +49,9 @@ class PiaproIE(InfoExtractor): }, { 'url': 'https://piapro.jp/content/hcw0z3a169wtemz6', 'only_matching': True + }, { + 'url': 'https://piapro.jp/t/-SO-', + 'only_matching': True }] _login_status = False diff --git a/yt_dlp/extractor/playfm.py b/yt_dlp/extractor/playfm.py deleted file mode 100644 index e895ba480..000000000 --- a/yt_dlp/extractor/playfm.py +++ /dev/null @@ -1,70 +0,0 @@ -from .common import InfoExtractor -from ..compat import compat_str -from ..utils import ( - ExtractorError, - int_or_none, - parse_iso8601, -) - - -class PlayFMIE(InfoExtractor): - IE_NAME = 'play.fm' - _VALID_URL = r'https?://(?:www\.)?play\.fm/(?P<slug>(?:[^/]+/)+(?P<id>[^/]+))/?(?:$|[?#])' - - _TEST = { - 'url': 'https://www.play.fm/dan-drastic/sven-tasnadi-leipzig-electronic-music-batofar-paris-fr-2014-07-12', - 'md5': 'c505f8307825a245d0c7ad1850001f22', - 'info_dict': { - 'id': '71276', - 'ext': 'mp3', - 'title': 'Sven Tasnadi - LEIPZIG ELECTRONIC MUSIC @ Batofar (Paris,FR) - 2014-07-12', - 'description': '', - 'duration': 5627, - 'timestamp': 1406033781, - 'upload_date': '20140722', - 'uploader': 'Dan Drastic', - 'uploader_id': '71170', - 'view_count': int, - 'comment_count': int, - }, - } - - def _real_extract(self, url): - mobj = self._match_valid_url(url) - video_id = mobj.group('id') - slug = mobj.group('slug') - - recordings = self._download_json( - 'http://v2api.play.fm/recordings/slug/%s' % slug, video_id) - - error = recordings.get('error') - if isinstance(error, dict): - raise ExtractorError( - '%s returned error: %s' % (self.IE_NAME, error.get('message')), - expected=True) - - audio_url = recordings['audio'] - video_id = compat_str(recordings.get('id') or video_id) - title = recordings['title'] - description = recordings.get('description') - duration = int_or_none(recordings.get('recordingDuration')) - timestamp = parse_iso8601(recordings.get('created_at')) - uploader = recordings.get('page', {}).get('title') - uploader_id = compat_str(recordings.get('page', {}).get('id')) - view_count = int_or_none(recordings.get('playCount')) - comment_count = int_or_none(recordings.get('commentCount')) - categories = [tag['name'] for tag in recordings.get('tags', []) if tag.get('name')] - - return { - 'id': video_id, - 'url': audio_url, - 'title': title, - 'description': description, - 'duration': duration, - 'timestamp': timestamp, - 'uploader': uploader, - 'uploader_id': uploader_id, - 'view_count': view_count, - 'comment_count': comment_count, - 'categories': categories, - } diff --git a/yt_dlp/extractor/plays.py b/yt_dlp/extractor/plays.py deleted file mode 100644 index 9371f7b23..000000000 --- a/yt_dlp/extractor/plays.py +++ /dev/null @@ -1,49 +0,0 @@ -import re - -from .common import InfoExtractor -from ..utils import int_or_none - - -class PlaysTVIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?plays\.tv/(?:video|embeds)/(?P<id>[0-9a-f]{18})' - _TESTS = [{ - 'url': 
'https://plays.tv/video/56af17f56c95335490/when-you-outplay-the-azir-wall', - 'md5': 'dfeac1198506652b5257a62762cec7bc', - 'info_dict': { - 'id': '56af17f56c95335490', - 'ext': 'mp4', - 'title': 'Bjergsen - When you outplay the Azir wall', - 'description': 'Posted by Bjergsen', - } - }, { - 'url': 'https://plays.tv/embeds/56af17f56c95335490', - 'only_matching': True, - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage( - 'https://plays.tv/video/%s' % video_id, video_id) - - info = self._search_json_ld(webpage, video_id,) - - mpd_url, sources = re.search( - r'(?s)<video[^>]+data-mpd="([^"]+)"[^>]*>(.+?)</video>', - webpage).groups() - formats = self._extract_mpd_formats( - self._proto_relative_url(mpd_url), video_id, mpd_id='DASH') - for format_id, height, format_url in re.findall(r'<source\s+res="((\d+)h?)"\s+src="([^"]+)"', sources): - formats.append({ - 'url': self._proto_relative_url(format_url), - 'format_id': 'http-' + format_id, - 'height': int_or_none(height), - }) - - info.update({ - 'id': video_id, - 'description': self._og_search_description(webpage), - 'thumbnail': info.get('thumbnail') or self._og_search_thumbnail(webpage), - 'formats': formats, - }) - - return info diff --git a/yt_dlp/extractor/playvid.py b/yt_dlp/extractor/playvid.py deleted file mode 100644 index 1e0989d0a..000000000 --- a/yt_dlp/extractor/playvid.py +++ /dev/null @@ -1,90 +0,0 @@ -import re -import urllib.parse - -from .common import InfoExtractor -from ..compat import compat_urllib_parse_unquote -from ..utils import ExtractorError, clean_html - - -class PlayvidIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?playvid\.com/watch(\?v=|/)(?P<id>.+?)(?:#|$)' - _TESTS = [{ - 'url': 'http://www.playvid.com/watch/RnmBNgtrrJu', - 'md5': 'ffa2f6b2119af359f544388d8c01eb6c', - 'info_dict': { - 'id': 'RnmBNgtrrJu', - 'ext': 'mp4', - 'title': 'md5:9256d01c6317e3f703848b5906880dc8', - 'duration': 82, - 'age_limit': 18, - }, - 'skip': 'Video removed due to ToS', - }, { - 'url': 'http://www.playvid.com/watch/hwb0GpNkzgH', - 'md5': '39d49df503ad7b8f23a4432cbf046477', - 'info_dict': { - 'id': 'hwb0GpNkzgH', - 'ext': 'mp4', - 'title': 'Ellen Euro Cutie Blond Takes a Sexy Survey Get Facial in The Park', - 'age_limit': 18, - 'thumbnail': r're:^https?://.*\.jpg$', - }, - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - - m_error = re.search( - r'<div class="block-error">\s*<div class="heading">\s*<div>(?P<msg>.+?)</div>\s*</div>', webpage) - if m_error: - raise ExtractorError(clean_html(m_error.group('msg')), expected=True) - - video_title = None - duration = None - video_thumbnail = None - formats = [] - - # most of the information is stored in the flashvars - flashvars = self._html_search_regex( - r'flashvars="(.+?)"', webpage, 'flashvars') - - infos = compat_urllib_parse_unquote(flashvars).split(r'&') - for info in infos: - videovars_match = re.match(r'^video_vars\[(.+?)\]=(.+?)$', info) - if videovars_match: - key = videovars_match.group(1) - val = videovars_match.group(2) - - if key == 'title': - video_title = urllib.parse.unquote_plus(val) - if key == 'duration': - try: - duration = int(val) - except ValueError: - pass - if key == 'big_thumb': - video_thumbnail = val - - videourl_match = re.match( - r'^video_urls\]\[(?P<resolution>[0-9]+)p', key) - if videourl_match: - height = int(videourl_match.group('resolution')) - formats.append({ - 'height': height, - 'url': val, - }) - - # Extract 
title - should be in the flashvars; if not, look elsewhere - if video_title is None: - video_title = self._html_extract_title(webpage) - - return { - 'id': video_id, - 'formats': formats, - 'title': video_title, - 'thumbnail': video_thumbnail, - 'duration': duration, - 'description': None, - 'age_limit': 18 - } diff --git a/yt_dlp/extractor/polskieradio.py b/yt_dlp/extractor/polskieradio.py index 5bf92b9b5..e0b22fffd 100644 --- a/yt_dlp/extractor/polskieradio.py +++ b/yt_dlp/extractor/polskieradio.py @@ -262,14 +262,14 @@ def _call_lp3(self, path, query, video_id, note): query=query, headers={'x-api-key': '9bf6c5a2-a7d0-4980-9ed7-a3f7291f2a81'}) def _entries(self, playlist_id, has_episodes, has_articles): - for i in itertools.count(1) if has_episodes else []: + for i in itertools.count(0) if has_episodes else []: page = self._call_lp3( 'AudioArticle/GetListByCategoryId', { 'categoryId': playlist_id, 'PageSize': 10, 'skip': i, 'format': 400, - }, playlist_id, f'Downloading episode list page {i}') + }, playlist_id, f'Downloading episode list page {i + 1}') if not traverse_obj(page, 'data'): break for episode in page['data']: @@ -281,14 +281,14 @@ def _entries(self, playlist_id, has_episodes, has_articles): 'timestamp': parse_iso8601(episode.get('datePublic')), } - for i in itertools.count(1) if has_articles else []: + for i in itertools.count(0) if has_articles else []: page = self._call_lp3( 'Article/GetListByCategoryId', { 'categoryId': playlist_id, 'PageSize': 9, 'skip': i, 'format': 400, - }, playlist_id, f'Downloading article list page {i}') + }, playlist_id, f'Downloading article list page {i + 1}') if not traverse_obj(page, 'data'): break for article in page['data']: diff --git a/yt_dlp/extractor/porncom.py b/yt_dlp/extractor/porncom.py deleted file mode 100644 index c8ef240d7..000000000 --- a/yt_dlp/extractor/porncom.py +++ /dev/null @@ -1,99 +0,0 @@ -import re - -from .common import InfoExtractor -from ..compat import compat_urlparse -from ..utils import ( - int_or_none, - js_to_json, - parse_filesize, - str_to_int, -) - - -class PornComIE(InfoExtractor): - _VALID_URL = r'https?://(?:[a-zA-Z]+\.)?porn\.com/videos/(?:(?P<display_id>[^/]+)-)?(?P<id>\d+)' - _TESTS = [{ - 'url': 'http://www.porn.com/videos/teen-grabs-a-dildo-and-fucks-her-pussy-live-on-1hottie-i-rec-2603339', - 'md5': '3f30ce76267533cd12ba999263156de7', - 'info_dict': { - 'id': '2603339', - 'display_id': 'teen-grabs-a-dildo-and-fucks-her-pussy-live-on-1hottie-i-rec', - 'ext': 'mp4', - 'title': 'Teen grabs a dildo and fucks her pussy live on 1hottie, I rec', - 'thumbnail': r're:^https?://.*\.jpg$', - 'duration': 551, - 'view_count': int, - 'age_limit': 18, - 'categories': list, - 'tags': list, - }, - }, { - 'url': 'http://se.porn.com/videos/marsha-may-rides-seth-on-top-of-his-thick-cock-2658067', - 'only_matching': True, - }] - - def _real_extract(self, url): - mobj = self._match_valid_url(url) - video_id = mobj.group('id') - display_id = mobj.group('display_id') or video_id - - webpage = self._download_webpage(url, display_id) - - config = self._parse_json( - self._search_regex( - (r'=\s*({.+?})\s*;\s*v1ar\b', - r'=\s*({.+?})\s*,\s*[\da-zA-Z_]+\s*='), - webpage, 'config', default='{}'), - display_id, transform_source=js_to_json, fatal=False) - - if config: - title = config['title'] - formats = [{ - 'url': stream['url'], - 'format_id': stream.get('id'), - 'height': int_or_none(self._search_regex( - r'^(\d+)[pP]', stream.get('id') or '', 'height', default=None)) - } for stream in config['streams'] if stream.get('url')] - 
thumbnail = (compat_urlparse.urljoin(
-                config['thumbCDN'], config['poster'])
-                if config.get('thumbCDN') and config.get('poster') else None)
-            duration = int_or_none(config.get('length'))
-        else:
-            title = self._search_regex(
-                (r'<title>([^<]+)</title>', r'<h1[^>]*>([^<]+)</h1>'),
-                webpage, 'title')
-            formats = [{
-                'url': compat_urlparse.urljoin(url, format_url),
-                'format_id': '%sp' % height,
-                'height': int(height),
-                'filesize_approx': parse_filesize(filesize),
-            } for format_url, height, filesize in re.findall(
-                r'<a[^>]+href="(/download/[^"]+)">[^<]*?(\d+)p<span[^>]*>(\d+\s*[a-zA-Z]+)<',
-                webpage)]
-            thumbnail = None
-            duration = None
-
-        view_count = str_to_int(self._search_regex(
-            (r'Views:\s*</span>\s*<span>\s*([\d,.]+)',
-             r'class=["\']views["\'][^>]*><p>([\d,.]+)'), webpage,
-            'view count', fatal=False))
-
-        def extract_list(kind):
-            s = self._search_regex(
-                (r'(?s)%s:\s*</span>\s*<span>(.+?)</span>' % kind.capitalize(),
-                 r'(?s)<p[^>]*>%s:(.+?)</p>' % kind.capitalize()),
-                webpage, kind, fatal=False)
-            return re.findall(r'<a[^>]+>([^<]+)</a>', s or '')
-
-        return {
-            'id': video_id,
-            'display_id': display_id,
-            'title': title,
-            'thumbnail': thumbnail,
-            'duration': duration,
-            'view_count': view_count,
-            'formats': formats,
-            'age_limit': 18,
-            'categories': extract_list('categories'),
-            'tags': extract_list('tags'),
-        }
diff --git a/yt_dlp/extractor/pornez.py b/yt_dlp/extractor/pornez.py
deleted file mode 100644
index bc45f865e..000000000
--- a/yt_dlp/extractor/pornez.py
+++ /dev/null
@@ -1,60 +0,0 @@
-from .common import InfoExtractor
-from ..utils import (
-    clean_html,
-    int_or_none,
-    get_element_by_class,
-    urljoin,
-)
-
-
-class PornezIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:www\.)?pornez\.net/(?:video(?P<id>\w+)|watch)/'
-    _TESTS = [{
-        'url': 'https://pornez.net/video344819/mistresst-funny_penis_names-wmv/',
-        'info_dict': {
-            'id': '344819',
-            'ext': 'mp4',
-            'title': 'mistresst funny_penis_names wmv',
-            'thumbnail': r're:^https?://.*\.jpg$',
-            'age_limit': 18,
-        },
-        'params': {'skip_download': 'm3u8'},
-    }, {
-        'url': 'https://pornez.net/watch/leana+lovings+stiff+for+stepdaughter/',
-        'info_dict': {
-            'id': '156161',
-            'ext': 'mp4',
-            'title': 'Watch leana lovings stiff for stepdaughter porn video.',
-            'age_limit': 18,
-        },
-        'params': {'skip_download': 'm3u8'},
-    }, {
-        'url': 'https://pornez.net/videovzs27fj/tutor4k-e14-blue-wave-1080p-nbq-tutor4k-e14-blue-wave/',
-        'only_matching': True,
-    }]
-
-    def _real_extract(self, url):
-        video_id = self._match_id(url)
-        webpage = self._download_webpage(url, video_id)
-        if not video_id:
-            video_id = self._search_regex(
-                r'<link[^>]+\bhref=["\']https?://pornez.net/\?p=(\w+)["\']', webpage, 'id')
-
-        iframe_src = self._html_search_regex(r'<iframe[^>]+src="([^"]+)"', webpage, 'iframe')
-        iframe = self._download_webpage(urljoin('https://pornez.net', iframe_src), video_id)
-
-        entries = self._parse_html5_media_entries(iframe_src, iframe, video_id)[0]
-        for fmt in entries['formats']:
-            height = self._search_regex(r'_(\d+)\.m3u8', fmt['url'], 'height')
-            fmt['format_id'] = '%sp' % height
-            fmt['height'] = int_or_none(height)
-
-        entries.update({
-            'id': video_id,
-            'title': (clean_html(get_element_by_class('video-title', webpage))
-                      or self._html_search_meta(
-                          ['twitter:title', 'og:title', 'description'], webpage, 'title', default=None)),
-            'thumbnail': self._html_search_meta(['thumbnailUrl'], webpage, 'thumb', default=None),
-            'age_limit': 18,
-        })
-        return entries
diff --git a/yt_dlp/extractor/pornhd.py b/yt_dlp/extractor/pornhd.py
deleted file mode 100644
index c8a1ec80b..000000000
--- a/yt_dlp/extractor/pornhd.py
+++ /dev/null
@@ -1,116 +0,0 @@
-from .common import InfoExtractor
-from ..utils import (
-    determine_ext,
-    ExtractorError,
-    int_or_none,
-    js_to_json,
-    merge_dicts,
-    urljoin,
-)
-
-
-class PornHdIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:www\.)?pornhd\.com/(?:[a-z]{2,4}/)?videos/(?P<id>\d+)(?:/(?P<display_id>.+))?'
-    _TESTS = [{
-        'url': 'http://www.pornhd.com/videos/9864/selfie-restroom-masturbation-fun-with-chubby-cutie-hd-porn-video',
-        'md5': '87f1540746c1d32ec7a2305c12b96b25',
-        'info_dict': {
-            'id': '9864',
-            'display_id': 'selfie-restroom-masturbation-fun-with-chubby-cutie-hd-porn-video',
-            'ext': 'mp4',
-            'title': 'Restroom selfie masturbation',
-            'description': 'md5:3748420395e03e31ac96857a8f125b2b',
-            'thumbnail': r're:^https?://.*\.jpg',
-            'view_count': int,
-            'like_count': int,
-            'age_limit': 18,
-        },
-        'skip': 'HTTP Error 404: Not Found',
-    }, {
-        'url': 'http://www.pornhd.com/videos/1962/sierra-day-gets-his-cum-all-over-herself-hd-porn-video',
-        'md5': '1b7b3a40b9d65a8e5b25f7ab9ee6d6de',
-        'info_dict': {
-            'id': '1962',
-            'display_id': 'sierra-day-gets-his-cum-all-over-herself-hd-porn-video',
-            'ext': 'mp4',
-            'title': 'md5:98c6f8b2d9c229d0f0fde47f61a1a759',
-            'description': 'md5:8ff0523848ac2b8f9b065ba781ccf294',
-            'thumbnail': r're:^https?://.*\.jpg',
-            'view_count': int,
-            'like_count': int,
-            'age_limit': 18,
-        },
-    }]
-
-    def _real_extract(self, url):
-        mobj = self._match_valid_url(url)
-        video_id = mobj.group('id')
-        display_id = mobj.group('display_id')
-
-        webpage = self._download_webpage(url, display_id or video_id)
-
-        title = self._html_search_regex(
-            [r'<span[^>]+class=["\']video-name["\'][^>]*>([^<]+)',
-             r'<title>(.+?) - .*?[Pp]ornHD.*?</title>'], webpage, 'title')
-
-        sources = self._parse_json(js_to_json(self._search_regex(
-            r"(?s)sources'?\s*[:=]\s*(\{.+?\})",
-            webpage, 'sources', default='{}')), video_id)
-
-        info = {}
-        if not sources:
-            entries = self._parse_html5_media_entries(url, webpage, video_id)
-            if entries:
-                info = entries[0]
-
-        if not sources and not info:
-            message = self._html_search_regex(
-                r'(?s)<(div|p)[^>]+class="no-video"[^>]*>(?P<value>.+?)</\1',
-                webpage, 'error message', group='value')
-            raise ExtractorError(
-                '%s said: %s' % (self.IE_NAME, message), expected=True)
-
-        formats = []
-        for format_id, video_url in sources.items():
-            video_url = urljoin(url, video_url)
-            if not video_url:
-                continue
-            height = int_or_none(self._search_regex(
-                r'^(\d+)[pP]', format_id, 'height', default=None))
-            formats.append({
-                'url': video_url,
-                'ext': determine_ext(video_url, 'mp4'),
-                'format_id': format_id,
-                'height': height,
-            })
-        if formats:
-            info['formats'] = formats
-
-        description = self._html_search_regex(
-            (r'(?s)<section[^>]+class=["\']video-description[^>]+>(?P<value>.+?)</section>',
-             r'<(div|p)[^>]+class="description"[^>]*>(?P<value>[^<]+)</\1>'),
-            webpage, 'description', fatal=False,
-            group='value') or None
-        view_count = int_or_none(self._html_search_regex(
-            r'(\d+) views\s*<', webpage, 'view count', fatal=False))
-        thumbnail = self._search_regex(
-            r"poster'?\s*:\s*([\"'])(?P<url>(?:(?!\1).)+)\1", webpage,
-            'thumbnail', default=None, group='url')
-
-        like_count = int_or_none(self._search_regex(
-            (r'(\d+)</span>\s*likes',
-             r'(\d+)\s*<[^>]+>(?:&nbsp;|\s)*\blikes',
-             r'class=["\']save-count["\'][^>]*>\s*(\d+)'),
-            webpage, 'like count', fatal=False))
-
-        return merge_dicts(info, {
-            'id': video_id,
-            'display_id': display_id,
-            'title': title,
-            'description': description,
-            'thumbnail': thumbnail,
-            'view_count': view_count,
-            'like_count': like_count,
-            'formats': formats,
-            'age_limit': 18,
-        })
diff --git a/yt_dlp/extractor/pr0gramm.py b/yt_dlp/extractor/pr0gramm.py
index c8e0bb493..2a6794208 100644
--- a/yt_dlp/extractor/pr0gramm.py
+++ b/yt_dlp/extractor/pr0gramm.py
@@ -4,7 +4,14 @@
 from .common import InfoExtractor
 from ..compat import functools
-from ..utils import ExtractorError, make_archive_id, urljoin
+from ..utils import (
+    ExtractorError,
+    float_or_none,
+    int_or_none,
+    make_archive_id,
+    mimetype2ext,
+    urljoin,
+)
 from ..utils.traversal import traverse_obj
 
@@ -26,6 +33,7 @@ class Pr0grammIE(InfoExtractor):
             'dislike_count': int,
             'age_limit': 0,
             'thumbnail': r're:^https://thumb\.pr0gramm\.com/.*\.jpg',
+            '_old_archive_ids': ['pr0grammstatic 5466437'],
         },
     }, {
         # Tags require account
@@ -43,6 +51,7 @@ class Pr0grammIE(InfoExtractor):
             'dislike_count': int,
             'age_limit': 0,
             'thumbnail': r're:^https://thumb\.pr0gramm\.com/.*\.jpg',
+            '_old_archive_ids': ['pr0grammstatic 3052805'],
         },
     }, {
         # Requires verified account
@@ -60,6 +69,7 @@ class Pr0grammIE(InfoExtractor):
             'dislike_count': int,
             'age_limit': 18,
             'thumbnail': r're:^https://thumb\.pr0gramm\.com/.*\.jpg',
+            '_old_archive_ids': ['pr0grammstatic 5848332'],
        },
    }, {
        'url': 'https://pr0gramm.com/static/5466437',
@@ -110,37 +120,61 @@ def _call_api(self, endpoint, video_id, query={}, note='Downloading API json'):
 
         return data
 
+    @staticmethod
+    def _create_source_url(path):
+        return urljoin('https://img.pr0gramm.com', path)
+
     def _real_extract(self, url):
         video_id = self._match_id(url)
         video_info = traverse_obj(
             self._call_api('get', video_id, {'id': video_id, 'flags': self._maximum_flags}),
             ('items', 0, {dict}))
 
-        source = urljoin('https://img.pr0gramm.com', video_info.get('image'))
+        source = video_info.get('image')
         if not source or not source.endswith('mp4'):
             self.raise_no_formats('Could not extract a video', expected=bool(source), video_id=video_id)
 
         tags = None
         if self._is_logged_in:
-            metadata = self._call_api('info', video_id, {'itemId': video_id})
+            metadata = self._call_api('info', video_id, {'itemId': video_id}, note='Downloading tags')
             tags = traverse_obj(metadata, ('tags', ..., 'tag', {str}))
             # Sorted by "confidence", higher confidence = earlier in list
             confidences = traverse_obj(metadata, ('tags', ..., 'confidence', ({int}, {float})))
             if confidences:
                 tags = [tag for _, tag in sorted(zip(confidences, tags), reverse=True)]
 
+        formats = traverse_obj(video_info, ('variants', ..., {
+            'format_id': ('name', {str}),
+            'url': ('path', {self._create_source_url}),
+            'ext': ('mimeType', {mimetype2ext}),
+            'vcodec': ('codec', {str}),
+            'width': ('width', {int_or_none}),
+            'height': ('height', {int_or_none}),
+            'bitrate': ('bitRate', {float_or_none}),
+            'filesize': ('fileSize', {int_or_none}),
+        })) if video_info.get('variants') else [{
+            'ext': 'mp4',
+            'format_id': 'source',
+            **traverse_obj(video_info, {
+                'url': ('image', {self._create_source_url}),
+                'width': ('width', {int_or_none}),
+                'height': ('height', {int_or_none}),
+            }),
+        }]
+
+        subtitles = {}
+        for subtitle in traverse_obj(video_info, ('subtitles', lambda _, v: v['language'])):
+            subtitles.setdefault(subtitle['language'], []).append(traverse_obj(subtitle, {
+                'url': ('path', 
{self._create_source_url}), + 'note': ('label', {str}), + })) + return { 'id': video_id, 'title': f'pr0gramm-{video_id} by {video_info.get("user")}', - 'formats': [{ - 'url': source, - 'ext': 'mp4', - **traverse_obj(video_info, { - 'width': ('width', {int}), - 'height': ('height', {int}), - }), - }], 'tags': tags, + 'formats': formats, + 'subtitles': subtitles, 'age_limit': 18 if traverse_obj(video_info, ('flags', {0b110.__and__})) else 0, '_old_archive_ids': [make_archive_id('Pr0grammStatic', video_id)], **traverse_obj(video_info, { diff --git a/yt_dlp/extractor/qdance.py b/yt_dlp/extractor/qdance.py index d817677f0..934ebbfd7 100644 --- a/yt_dlp/extractor/qdance.py +++ b/yt_dlp/extractor/qdance.py @@ -15,7 +15,7 @@ class QDanceIE(InfoExtractor): _NETRC_MACHINE = 'qdance' - _VALID_URL = r'https?://(?:www\.)?q-dance\.com/network/(?:library|live)/(?P\d+)' + _VALID_URL = r'https?://(?:www\.)?q-dance\.com/network/(?:library|live)/(?P[\w-]+)' _TESTS = [{ 'note': 'vod', 'url': 'https://www.q-dance.com/network/library/146542138', @@ -53,6 +53,27 @@ class QDanceIE(InfoExtractor): 'channel_id': 'qdancenetwork.video_149170353', }, 'skip': 'Completed livestream', + }, { + 'note': 'vod with alphanumeric id', + 'url': 'https://www.q-dance.com/network/library/WhDleSIWSfeT3Q9ObBKBeA', + 'info_dict': { + 'id': 'WhDleSIWSfeT3Q9ObBKBeA', + 'ext': 'mp4', + 'title': 'Aftershock I Defqon.1 Weekend Festival 2023 I Sunday I BLUE', + 'display_id': 'naam-i-defqon-1-weekend-festival-2023-i-dag-i-podium', + 'description': 'Relive Defqon.1 Path of the Warrior with Aftershock at the BLUE 🔥', + 'series': 'Defqon.1', + 'series_id': '31840378', + 'season': 'Defqon.1 Weekend Festival 2023', + 'season_id': '141735599', + 'duration': 3507, + 'availability': 'premium_only', + 'thumbnail': 'https://images.q-dance.network/1698158361-230625-135716-defqon-1-aftershock.jpg', + }, + 'params': {'skip_download': 'm3u8'}, + }, { + 'url': 'https://www.q-dance.com/network/library/-uRFKXwmRZGVnve7av9uqA', + 'only_matching': True, }] _access_token = None diff --git a/yt_dlp/extractor/radiko.py b/yt_dlp/extractor/radiko.py index 8c8fb1a8f..c363d9ba5 100644 --- a/yt_dlp/extractor/radiko.py +++ b/yt_dlp/extractor/radiko.py @@ -154,7 +154,7 @@ def _extract_formats(self, video_id, station, is_onair, ft, cursor, auth_token, sf['preference'] = -100 sf['format_note'] = 'not preferred' if not is_onair and timefree_int == 1 and time_to_skip: - sf['downloader_options'] = {'ffmpeg_args': ['-ss', time_to_skip]} + sf['downloader_options'] = {'ffmpeg_args': ['-ss', str(time_to_skip)]} formats.extend(subformats) return formats diff --git a/yt_dlp/extractor/radiobremen.py b/yt_dlp/extractor/radiobremen.py deleted file mode 100644 index 99ba050d0..000000000 --- a/yt_dlp/extractor/radiobremen.py +++ /dev/null @@ -1,59 +0,0 @@ -import re - -from .common import InfoExtractor -from ..utils import parse_duration - - -class RadioBremenIE(InfoExtractor): - _VALID_URL = r'http?://(?:www\.)?radiobremen\.de/mediathek/(?:index\.html)?\?id=(?P[0-9]+)' - IE_NAME = 'radiobremen' - - _TEST = { - 'url': 'http://www.radiobremen.de/mediathek/?id=141876', - 'info_dict': { - 'id': '141876', - 'ext': 'mp4', - 'duration': 178, - 'width': 512, - 'title': 'Druck auf Patrick Öztürk', - 'thumbnail': r're:https?://.*\.jpg$', - 'description': 'Gegen den SPD-Bürgerschaftsabgeordneten Patrick Öztürk wird wegen Beihilfe zum gewerbsmäßigen Betrug ermittelt. 
Am Donnerstagabend sollte er dem Vorstand des SPD-Unterbezirks Bremerhaven dazu Rede und Antwort stehen.',
-        },
-    }
-
-    def _real_extract(self, url):
-        video_id = self._match_id(url)
-
-        meta_url = 'http://www.radiobremen.de/apps/php/mediathek/metadaten.php?id=%s' % video_id
-        meta_doc = self._download_webpage(
-            meta_url, video_id, 'Downloading metadata')
-        title = self._html_search_regex(
-            r'<h1.*>(?P<title>.+)</h1>', meta_doc, 'title')
-        description = self._html_search_regex(
-            r'<p>(?P<description>.*)</p>', meta_doc, 'description', fatal=False)
-        duration = parse_duration(self._html_search_regex(
-            r'Länge:</td>\s+<td>(?P<duration>[0-9]+:[0-9]+)</td>',
-            meta_doc, 'duration', fatal=False))
-
-        page_doc = self._download_webpage(
-            url, video_id, 'Downloading video information')
-        mobj = re.search(
-            r"ardformatplayerclassic\(\'playerbereich\',\'(?P<width>[0-9]+)\',\'.*\',\'(?P<video_id>[0-9]+)\',\'(?P<secret>[0-9]+)\',\'(?P<thumbnail>.+)\',\'\'\)",
-            page_doc)
-        video_url = (
-            "http://dl-ondemand.radiobremen.de/mediabase/%s/%s_%s_%s.mp4" %
-            (video_id, video_id, mobj.group("secret"), mobj.group('width')))
-
-        formats = [{
-            'url': video_url,
-            'ext': 'mp4',
-            'width': int(mobj.group('width')),
-        }]
-        return {
-            'id': video_id,
-            'title': title,
-            'description': description,
-            'duration': duration,
-            'formats': formats,
-            'thumbnail': mobj.group('thumbnail'),
-        }
diff --git a/yt_dlp/extractor/radiocomercial.py b/yt_dlp/extractor/radiocomercial.py
new file mode 100644
index 000000000..07891fe41
--- /dev/null
+++ b/yt_dlp/extractor/radiocomercial.py
@@ -0,0 +1,150 @@
+import itertools
+
+from .common import InfoExtractor
+from ..networking.exceptions import HTTPError
+from ..utils import (
+    ExtractorError,
+    extract_attributes,
+    get_element_by_class,
+    get_element_html_by_class,
+    get_element_text_and_html_by_tag,
+    get_elements_html_by_class,
+    int_or_none,
+    join_nonempty,
+    try_call,
+    unified_strdate,
+    update_url,
+    urljoin
+)
+from ..utils.traversal import traverse_obj
+
+
+class RadioComercialIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?radiocomercial\.pt/podcasts/[^/?#]+/t?(?P<season>\d+)/(?P<id>[\w-]+)'
+    _TESTS = [{
+        'url': 'https://radiocomercial.pt/podcasts/o-homem-que-mordeu-o-cao/t6/taylor-swift-entranhando-se-que-nem-uma-espada-no-ventre-dos-fas#page-content-wrapper',
+        'md5': '5f4fe8e485b29d2e8fd495605bc2c7e4',
+        'info_dict': {
+            'id': 'taylor-swift-entranhando-se-que-nem-uma-espada-no-ventre-dos-fas',
+            'ext': 'mp3',
+            'title': 'Taylor Swift entranhando-se que nem uma espada no ventre dos fãs.',
+            'release_date': '20231025',
+            'thumbnail': r're:https://radiocomercial.pt/upload/[^.]+.jpg',
+            'season': 6
+        }
+    }, {
+        'url': 'https://radiocomercial.pt/podcasts/convenca-me-num-minuto/t3/convenca-me-num-minuto-que-os-lobisomens-existem',
+        'md5': '47e96c273aef96a8eb160cd6cf46d782',
+        'info_dict': {
+            'id': 'convenca-me-num-minuto-que-os-lobisomens-existem',
+            'ext': 'mp3',
+            'title': 'Convença-me num minuto que os lobisomens existem',
+            'release_date': '20231026',
+            'thumbnail': r're:https://radiocomercial.pt/upload/[^.]+.jpg',
+            'season': 3
+        }
+    }, {
+        'url': 'https://radiocomercial.pt/podcasts/inacreditavel-by-ines-castel-branco/t2/o-desastre-de-aviao',
+        'md5': '69be64255420fec23b7259955d771e54',
+        'info_dict': {
+            'id': 'o-desastre-de-aviao',
+            'ext': 'mp3',
+            'title': 'O desastre de avião',
+            'description': 'md5:8a82beeb372641614772baab7246245f',
+            'release_date': '20231101',
+            'thumbnail': r're:https://radiocomercial.pt/upload/[^.]+.jpg',
+            'season': 2
+        },
+        'params': {
+            # inconsistent md5
+            'skip_download': True,
+        },
+    }, {
+        'url': 'https://radiocomercial.pt/podcasts/tnt-todos-no-top/2023/t-n-t-29-de-outubro',
+        'md5': '91d32d4d4b1407272068b102730fc9fa',
+        'info_dict': {
+            'id': 't-n-t-29-de-outubro',
+            'ext': 'mp3',
+            'title': 'T.N.T 29 de outubro',
+            'release_date': '20231029',
+            'thumbnail': r're:https://radiocomercial.pt/upload/[^.]+.jpg',
+            'season': 2023
+        }
+    }]
+
+    def _real_extract(self, url):
+        video_id, season = self._match_valid_url(url).group('id', 'season')
+        webpage = self._download_webpage(url, video_id)
+        return {
+            'id': video_id,
+            'title': self._html_extract_title(webpage),
+            'description': self._og_search_description(webpage, default=None),
+            'release_date': unified_strdate(get_element_by_class(
+                'date', get_element_html_by_class('descriptions', webpage) or '')),
+            'thumbnail': self._og_search_thumbnail(webpage),
+            'season': int_or_none(season),
+            'url': extract_attributes(get_element_html_by_class('audiofile', webpage) or '').get('href'),
+        }
+
+
+class RadioComercialPlaylistIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?radiocomercial\.pt/podcasts/(?P<id>[\w-]+)(?:/t?(?P<season>\d+))?/?(?:$|[?#])'
+    _TESTS = [{
+        'url': 'https://radiocomercial.pt/podcasts/convenca-me-num-minuto/t3',
+        'info_dict': {
+            'id': 'convenca-me-num-minuto_t3',
+            'title': 'Convença-me num Minuto - Temporada 3',
+        },
+        'playlist_mincount': 32
+    }, {
+        'url': 'https://radiocomercial.pt/podcasts/o-homem-que-mordeu-o-cao',
+        'info_dict': {
+            'id': 'o-homem-que-mordeu-o-cao',
+            'title': 'O Homem Que Mordeu o Cão',
+        },
+        'playlist_mincount': 19
+    }, {
+        'url': 'https://radiocomercial.pt/podcasts/as-minhas-coisas-favoritas',
+        'info_dict': {
+            'id': 'as-minhas-coisas-favoritas',
+            'title': 'As Minhas Coisas Favoritas',
+        },
+        'playlist_mincount': 131
+    }, {
+        'url': 'https://radiocomercial.pt/podcasts/tnt-todos-no-top/t2023',
+        'info_dict': {
+            'id': 'tnt-todos-no-top_t2023',
+            'title': 'TNT - Todos No Top - Temporada 2023',
+        },
+        'playlist_mincount': 39
+    }]
+
+    def _entries(self, url, playlist_id):
+        for page in itertools.count(1):
+            try:
+                webpage = self._download_webpage(
+                    f'{url}/{page}', playlist_id, f'Downloading page {page}')
+            except ExtractorError as e:
+                if isinstance(e.cause, HTTPError) and e.cause.status == 404:
+                    break
+                raise
+
+            episodes = get_elements_html_by_class('tm-ouvir-podcast', webpage)
+            if not episodes:
+                break
+            for url_path in traverse_obj(episodes, (..., {extract_attributes}, 'href')):
+                episode_url = urljoin(url, url_path)
+                if RadioComercialIE.suitable(episode_url):
+                    yield episode_url
+
+    def _real_extract(self, url):
+        podcast, season = self._match_valid_url(url).group('id', 'season')
+        playlist_id = join_nonempty(podcast, season, delim='_t')
+        url = update_url(url, query=None, fragment=None)
+        webpage = self._download_webpage(url, playlist_id)
+
+        name = try_call(lambda: get_element_text_and_html_by_tag('h1', webpage)[0])
+        title = name if name == season else join_nonempty(name, season, delim=' - Temporada ')
+
+        return self.playlist_from_matches(
+            self._entries(url, playlist_id), playlist_id, title, ie=RadioComercialIE)
diff --git a/yt_dlp/extractor/radiofrance.py b/yt_dlp/extractor/radiofrance.py
index ec1b97631..6bd6fe9b6 100644
--- a/yt_dlp/extractor/radiofrance.py
+++ b/yt_dlp/extractor/radiofrance.py
@@ -264,7 +264,7 @@ def _real_extract(self, url):
     }
 
 
-class RadioFrancePlaylistBase(RadioFranceBaseIE):
+class RadioFrancePlaylistBaseIE(RadioFranceBaseIE):
     """Subclasses must set _METADATA_KEY"""
 
     def 
_call_api(self, content_id, cursor, page_num): @@ -308,7 +308,7 @@ def _real_extract(self, url): })}) -class RadioFrancePodcastIE(RadioFrancePlaylistBase): +class RadioFrancePodcastIE(RadioFrancePlaylistBaseIE): _VALID_URL = rf'''(?x) {RadioFranceBaseIE._VALID_URL_BASE} /(?:{RadioFranceBaseIE._STATIONS_RE}) @@ -369,7 +369,7 @@ def _call_api(self, podcast_id, cursor, page_num): note=f'Downloading page {page_num}', query={'pageCursor': cursor}) -class RadioFranceProfileIE(RadioFrancePlaylistBase): +class RadioFranceProfileIE(RadioFrancePlaylistBaseIE): _VALID_URL = rf'{RadioFranceBaseIE._VALID_URL_BASE}/personnes/(?P<id>[\w-]+)' _TESTS = [{ diff --git a/yt_dlp/extractor/recurbate.py b/yt_dlp/extractor/recurbate.py deleted file mode 100644 index d7294cb14..000000000 --- a/yt_dlp/extractor/recurbate.py +++ /dev/null @@ -1,42 +0,0 @@ -from .common import InfoExtractor -from ..networking.exceptions import HTTPError -from ..utils import ExtractorError, merge_dicts - - -class RecurbateIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?recurbate\.com/play\.php\?video=(?P<id>\d+)' - _TESTS = [{ - 'url': 'https://recurbate.com/play.php?video=39161415', - 'md5': 'dd2b4ec57aa3e3572cb5cf0997fca99f', - 'info_dict': { - 'id': '39161415', - 'ext': 'mp4', - 'description': 'md5:db48d09e4d93fc715f47fd3d6b7edd51', - 'title': 'Performer zsnicole33 show on 2022-10-25 20:23, Chaturbate Archive – Recurbate', - 'age_limit': 18, - }, - 'skip': 'Website require membership.', - }] - - def _real_extract(self, url): - SUBSCRIPTION_MISSING_MESSAGE = 'This video is only available for registered users; Set your authenticated browser user agent via the --user-agent parameter.' - video_id = self._match_id(url) - try: - webpage = self._download_webpage(url, video_id) - except ExtractorError as e: - if isinstance(e.cause, HTTPError) and e.cause.status == 403: - self.raise_login_required(msg=SUBSCRIPTION_MISSING_MESSAGE, method='cookies') - raise - token = self._html_search_regex(r'data-token="([^"]+)"', webpage, 'token') - video_url = f'https://recurbate.com/api/get.php?video={video_id}&token={token}' - - video_webpage = self._download_webpage(video_url, video_id) - if video_webpage == 'shall_subscribe': - self.raise_login_required(msg=SUBSCRIPTION_MISSING_MESSAGE, method='cookies') - entries = self._parse_html5_media_entries(video_url, video_webpage, video_id) - return merge_dicts({ - 'id': video_id, - 'title': self._html_extract_title(webpage, 'title'), - 'description': self._og_search_description(webpage), - 'age_limit': self._rta_search(webpage), - }, entries[0]) diff --git a/yt_dlp/extractor/redtube.py b/yt_dlp/extractor/redtube.py index 49076ccd8..172c31b39 100644 --- a/yt_dlp/extractor/redtube.py +++ b/yt_dlp/extractor/redtube.py @@ -39,7 +39,7 @@ class RedTubeIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage( - 'http://www.redtube.com/%s' % video_id, video_id) + f'https://www.redtube.com/{video_id}', video_id) ERRORS = ( (('video-deleted-info', '>This video has been removed'), 'has been removed'), diff --git a/yt_dlp/extractor/rice.py b/yt_dlp/extractor/rice.py deleted file mode 100644 index 3dd4d31de..000000000 --- a/yt_dlp/extractor/rice.py +++ /dev/null @@ -1,112 +0,0 @@ -import re - -from .common import InfoExtractor -from ..compat import compat_parse_qs -from ..utils import ( - xpath_text, - xpath_element, - int_or_none, - parse_iso8601, - ExtractorError, -) - - -class RICEIE(InfoExtractor): - _VALID_URL = 
r'https?://mediahub\.rice\.edu/app/[Pp]ortal/video\.aspx\?(?P<query>.+)' - _TEST = { - 'url': 'https://mediahub.rice.edu/app/Portal/video.aspx?PortalID=25ffd62c-3d01-4b29-8c70-7c94270efb3e&DestinationID=66bc9434-03bd-4725-b47e-c659d8d809db&ContentID=YEWIvbhb40aqdjMD1ALSqw', - 'md5': '9b83b4a2eead4912dc3b7fac7c449b6a', - 'info_dict': { - 'id': 'YEWIvbhb40aqdjMD1ALSqw', - 'ext': 'mp4', - 'title': 'Active Learning in Archeology', - 'upload_date': '20140616', - 'timestamp': 1402926346, - } - } - _NS = 'http://schemas.datacontract.org/2004/07/ensembleVideo.Data.Service.Contracts.Models.Player.Config' - - def _real_extract(self, url): - qs = compat_parse_qs(self._match_valid_url(url).group('query')) - if not qs.get('PortalID') or not qs.get('DestinationID') or not qs.get('ContentID'): - raise ExtractorError('Invalid URL', expected=True) - - portal_id = qs['PortalID'][0] - playlist_id = qs['DestinationID'][0] - content_id = qs['ContentID'][0] - - content_data = self._download_xml('https://mediahub.rice.edu/api/portal/GetContentTitle', content_id, query={ - 'portalId': portal_id, - 'playlistId': playlist_id, - 'contentId': content_id - }) - metadata = xpath_element(content_data, './/metaData', fatal=True) - title = xpath_text(metadata, 'primaryTitle', fatal=True) - encodings = xpath_element(content_data, './/encodings', fatal=True) - player_data = self._download_xml('https://mediahub.rice.edu/api/player/GetPlayerConfig', content_id, query={ - 'temporaryLinkId': xpath_text(encodings, 'temporaryLinkId', fatal=True), - 'contentId': content_id, - }) - - common_fmt = {} - dimensions = xpath_text(encodings, 'dimensions') - if dimensions: - wh = dimensions.split('x') - if len(wh) == 2: - common_fmt.update({ - 'width': int_or_none(wh[0]), - 'height': int_or_none(wh[1]), - }) - - formats = [] - rtsp_path = xpath_text(player_data, self._xpath_ns('RtspPath', self._NS)) - if rtsp_path: - fmt = { - 'url': rtsp_path, - 'format_id': 'rtsp', - } - fmt.update(common_fmt) - formats.append(fmt) - for source in player_data.findall(self._xpath_ns('.//Source', self._NS)): - video_url = xpath_text(source, self._xpath_ns('File', self._NS)) - if not video_url: - continue - if '.m3u8' in video_url: - formats.extend(self._extract_m3u8_formats(video_url, content_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False)) - else: - fmt = { - 'url': video_url, - 'format_id': video_url.split(':')[0], - } - fmt.update(common_fmt) - rtmp = re.search(r'^(?P<url>rtmp://[^/]+/(?P<app>.+))/(?P<playpath>mp4:.+)$', video_url) - if rtmp: - fmt.update({ - 'url': rtmp.group('url'), - 'play_path': rtmp.group('playpath'), - 'app': rtmp.group('app'), - 'ext': 'flv', - }) - formats.append(fmt) - - thumbnails = [] - for content_asset in content_data.findall('.//contentAssets'): - asset_type = xpath_text(content_asset, 'type') - if asset_type == 'image': - image_url = xpath_text(content_asset, 'httpPath') - if not image_url: - continue - thumbnails.append({ - 'id': xpath_text(content_asset, 'ID'), - 'url': image_url, - }) - - return { - 'id': content_id, - 'title': title, - 'description': xpath_text(metadata, 'abstract'), - 'duration': int_or_none(xpath_text(metadata, 'duration')), - 'timestamp': parse_iso8601(xpath_text(metadata, 'dateUpdated')), - 'thumbnails': thumbnails, - 'formats': formats, - } diff --git a/yt_dlp/extractor/rinsefm.py b/yt_dlp/extractor/rinsefm.py new file mode 100644 index 000000000..f87b895df --- /dev/null +++ b/yt_dlp/extractor/rinsefm.py @@ -0,0 +1,89 @@ +from .common import InfoExtractor +from ..utils import ( + 
MEDIA_EXTENSIONS, + determine_ext, + parse_iso8601, + traverse_obj, + url_or_none, +) + + +class RinseFMBaseIE(InfoExtractor): + @staticmethod + def _parse_entry(entry): + return { + **traverse_obj(entry, { + 'id': ('id', {str}), + 'title': ('title', {str}), + 'url': ('fileUrl', {url_or_none}), + 'release_timestamp': ('episodeDate', {parse_iso8601}), + 'thumbnail': ('featuredImage', 0, 'filename', {str}, + {lambda x: x and f'https://rinse.imgix.net/media/{x}'}), + 'webpage_url': ('slug', {str}, + {lambda x: x and f'https://rinse.fm/episodes/{x}'}), + }), + 'vcodec': 'none', + 'extractor_key': RinseFMIE.ie_key(), + 'extractor': RinseFMIE.IE_NAME, + } + + +class RinseFMIE(RinseFMBaseIE): + _VALID_URL = r'https?://(?:www\.)?rinse\.fm/episodes/(?P<id>[^/?#]+)' + _TESTS = [{ + 'url': 'https://rinse.fm/episodes/club-glow-15-12-2023-2000/', + 'md5': '76ee0b719315617df42e15e710f46c7b', + 'info_dict': { + 'id': '1536535', + 'ext': 'mp3', + 'title': 'Club Glow - 15/12/2023 - 20:00', + 'thumbnail': r're:^https://.+\.(?:jpg|JPG)$', + 'release_timestamp': 1702598400, + 'release_date': '20231215' + } + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + entry = self._search_nextjs_data(webpage, display_id)['props']['pageProps']['entry'] + + return self._parse_entry(entry) + + +class RinseFMArtistPlaylistIE(RinseFMBaseIE): + _VALID_URL = r'https?://(?:www\.)?rinse\.fm/shows/(?P<id>[^/?#]+)' + _TESTS = [{ + 'url': 'https://rinse.fm/shows/resources/', + 'info_dict': { + 'id': 'resources', + 'title': '[re]sources', + 'description': '[re]sources est un label parisien piloté par le DJ et producteur Tommy Kid.' + }, + 'playlist_mincount': 40 + }, { + 'url': 'https://rinse.fm/shows/ivy/', + 'info_dict': { + 'id': 'ivy', + 'title': '[IVY]', + 'description': 'A dedicated space for DNB/Turbo House and 4x4.' 
+ }, + 'playlist_mincount': 7 + }] + + def _entries(self, data): + for episode in traverse_obj(data, ( + 'props', 'pageProps', 'episodes', lambda _, v: determine_ext(v['fileUrl']) in MEDIA_EXTENSIONS.audio) + ): + yield self._parse_entry(episode) + + def _real_extract(self, url): + playlist_id = self._match_id(url) + webpage = self._download_webpage(url, playlist_id) + title = self._og_search_title(webpage) or self._html_search_meta('title', webpage) + description = self._og_search_description(webpage) or self._html_search_meta( + 'description', webpage) + data = self._search_nextjs_data(webpage, playlist_id) + + return self.playlist_result( + self._entries(data), playlist_id, title, description=description) diff --git a/yt_dlp/extractor/rtl2.py b/yt_dlp/extractor/rtl2.py index 056cf87d2..07e1aa3ce 100644 --- a/yt_dlp/extractor/rtl2.py +++ b/yt_dlp/extractor/rtl2.py @@ -1,16 +1,7 @@ import re from .common import InfoExtractor -from ..aes import aes_cbc_decrypt_bytes, unpad_pkcs7 -from ..compat import ( - compat_b64decode, - compat_str, -) -from ..utils import ( - ExtractorError, - int_or_none, - strip_or_none, -) +from ..utils import int_or_none class RTL2IE(InfoExtractor): @@ -102,92 +93,3 @@ def _real_extract(self, url): 'duration': int_or_none(video_info.get('duration')), 'formats': formats, } - - -class RTL2YouBaseIE(InfoExtractor): - _BACKWERK_BASE_URL = 'https://p-you-backwerk.rtl2apps.de/' - - -class RTL2YouIE(RTL2YouBaseIE): - IE_NAME = 'rtl2:you' - _VALID_URL = r'http?://you\.rtl2\.de/(?:video/\d+/|youplayer/index\.html\?.*?\bvid=)(?P<id>\d+)' - _TESTS = [{ - 'url': 'http://you.rtl2.de/video/3002/15740/MJUNIK%20%E2%80%93%20Home%20of%20YOU/307-hirn-wo-bist-du', - 'info_dict': { - 'id': '15740', - 'ext': 'mp4', - 'title': 'MJUNIK – Home of YOU - #307 Hirn, wo bist du?!', - 'description': 'md5:ddaa95c61b372b12b66e115b2772fe01', - 'age_limit': 12, - }, - }, { - 'url': 'http://you.rtl2.de/youplayer/index.html?vid=15712', - 'only_matching': True, - }] - _AES_KEY = b'\xe9W\xe4.<*\xb8\x1a\xd2\xb6\x92\xf3C\xd3\xefL\x1b\x03*\xbbbH\xc0\x03\xffo\xc2\xf2(\xaa\xaa!' 
- _GEO_COUNTRIES = ['DE'] - - def _real_extract(self, url): - video_id = self._match_id(url) - - stream_data = self._download_json( - self._BACKWERK_BASE_URL + 'stream/video/' + video_id, video_id) - - data, iv = compat_b64decode(stream_data['streamUrl']).decode().split(':') - stream_url = unpad_pkcs7(aes_cbc_decrypt_bytes( - compat_b64decode(data), self._AES_KEY, compat_b64decode(iv))) - if b'rtl2_you_video_not_found' in stream_url: - raise ExtractorError('video not found', expected=True) - - formats = self._extract_m3u8_formats(stream_url.decode(), video_id, 'mp4', 'm3u8_native') - - video_data = self._download_json( - self._BACKWERK_BASE_URL + 'video/' + video_id, video_id) - - series = video_data.get('formatTitle') - title = episode = video_data.get('title') or series - if series and series != title: - title = '%s - %s' % (series, title) - - return { - 'id': video_id, - 'title': title, - 'formats': formats, - 'description': strip_or_none(video_data.get('description')), - 'thumbnail': video_data.get('image'), - 'duration': int_or_none(stream_data.get('duration') or video_data.get('duration'), 1000), - 'series': series, - 'episode': episode, - 'age_limit': int_or_none(video_data.get('minimumAge')), - } - - -class RTL2YouSeriesIE(RTL2YouBaseIE): - IE_NAME = 'rtl2:you:series' - _VALID_URL = r'http?://you\.rtl2\.de/videos/(?P<id>\d+)' - _TEST = { - 'url': 'http://you.rtl2.de/videos/115/dragon-ball', - 'info_dict': { - 'id': '115', - }, - 'playlist_mincount': 5, - } - - def _real_extract(self, url): - series_id = self._match_id(url) - stream_data = self._download_json( - self._BACKWERK_BASE_URL + 'videos', - series_id, query={ - 'formatId': series_id, - 'limit': 1000000000, - }) - - entries = [] - for video in stream_data.get('videos', []): - video_id = compat_str(video['videoId']) - if not video_id: - continue - entries.append(self.url_result( - 'http://you.rtl2.de/video/%s/%s' % (series_id, video_id), - 'RTL2You', video_id)) - return self.playlist_result(entries, series_id) diff --git a/yt_dlp/extractor/rtvnh.py b/yt_dlp/extractor/rtvnh.py deleted file mode 100644 index 7c6174494..000000000 --- a/yt_dlp/extractor/rtvnh.py +++ /dev/null @@ -1,58 +0,0 @@ -from .common import InfoExtractor -from ..utils import ExtractorError - - -class RTVNHIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?rtvnh\.nl/video/(?P<id>[0-9]+)' - _TEST = { - 'url': 'http://www.rtvnh.nl/video/131946', - 'md5': 'cdbec9f44550763c8afc96050fa747dc', - 'info_dict': { - 'id': '131946', - 'ext': 'mp4', - 'title': 'Grote zoektocht in zee bij Zandvoort naar vermiste vrouw', - 'thumbnail': r're:^https?:.*\.jpg$' - } - } - - def _real_extract(self, url): - video_id = self._match_id(url) - - meta = self._parse_json(self._download_webpage( - 'http://www.rtvnh.nl/video/json?m=' + video_id, video_id), video_id) - - status = meta.get('status') - if status != 200: - raise ExtractorError( - '%s returned error code %d' % (self.IE_NAME, status), expected=True) - - formats = [] - rtmp_formats = self._extract_smil_formats( - 'http://www.rtvnh.nl/video/smil?m=' + video_id, video_id) - formats.extend(rtmp_formats) - - for rtmp_format in rtmp_formats: - rtmp_url = '%s/%s' % (rtmp_format['url'], rtmp_format['play_path']) - rtsp_format = rtmp_format.copy() - del rtsp_format['play_path'] - del rtsp_format['ext'] - rtsp_format.update({ - 'format_id': rtmp_format['format_id'].replace('rtmp', 'rtsp'), - 'url': rtmp_url.replace('rtmp://', 'rtsp://'), - 'protocol': 'rtsp', - }) - formats.append(rtsp_format) - http_base_url = 
rtmp_url.replace('rtmp://', 'http://') - formats.extend(self._extract_m3u8_formats( - http_base_url + '/playlist.m3u8', video_id, 'mp4', - 'm3u8_native', m3u8_id='hls', fatal=False)) - formats.extend(self._extract_f4m_formats( - http_base_url + '/manifest.f4m', - video_id, f4m_id='hds', fatal=False)) - - return { - 'id': video_id, - 'title': meta['title'].strip(), - 'thumbnail': meta.get('image'), - 'formats': formats - } diff --git a/yt_dlp/extractor/rudovideo.py b/yt_dlp/extractor/rudovideo.py new file mode 100644 index 000000000..1b8595593 --- /dev/null +++ b/yt_dlp/extractor/rudovideo.py @@ -0,0 +1,135 @@ +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + determine_ext, + js_to_json, + traverse_obj, + update_url_query, + url_or_none, +) + + +class RudoVideoIE(InfoExtractor): + _VALID_URL = r'https?://rudo\.video/(?P<type>vod|podcast|live)/(?P<id>[^/?&#]+)' + _EMBED_REGEX = [r'<iframe[^>]+src=[\'"](?P<url>(?:https?:)//rudo\.video/(?:vod|podcast|live)/[^\'"]+)'] + _TESTS = [{ + 'url': 'https://rudo.video/podcast/cz2wrUy8l0o', + 'md5': '28ed82b477708dc5e12e072da2449221', + 'info_dict': { + 'id': 'cz2wrUy8l0o', + 'title': 'Diego Cabot', + 'ext': 'mp4', + 'thumbnail': r're:^(?:https?:)?//.*\.(png|jpg)$', + }, + }, { + 'url': 'https://rudo.video/podcast/bQkt07', + 'md5': '36b22a9863de0f47f00fc7532a32a898', + 'info_dict': { + 'id': 'bQkt07', + 'title': 'Tubular Bells', + 'ext': 'mp4', + 'thumbnail': r're:^(?:https?:)?//.*\.(png|jpg)$', + }, + }, { + 'url': 'https://rudo.video/podcast/b42ZUznHX0', + 'md5': 'b91c70d832938871367f8ad10c895821', + 'info_dict': { + 'id': 'b42ZUznHX0', + 'title': 'Columna Ruperto Concha', + 'ext': 'mp3', + 'thumbnail': r're:^(?:https?:)?//.*\.(png|jpg)$', + }, + }, { + 'url': 'https://rudo.video/vod/bN5AaJ', + 'md5': '01324a329227e2591530ecb4f555c881', + 'info_dict': { + 'id': 'bN5AaJ', + 'title': 'Ucrania 19.03', + 'creator': 'La Tercera', + 'ext': 'mp4', + 'thumbnail': r're:^(?:https?:)?//.*\.(png|jpg)$', + }, + }, { + 'url': 'https://rudo.video/live/bbtv', + 'info_dict': { + 'id': 'bbtv', + 'ext': 'mp4', + 'creator': 'BioBioTV', + 'live_status': 'is_live', + 'title': r're:^LIVE BBTV\s\d{4}-\d{2}-\d{2}\s\d{2}:\d{2}$', + 'thumbnail': r're:^(?:https?:)?//.*\.(png|jpg)$', + }, + }, { + 'url': 'https://rudo.video/live/c13', + 'info_dict': { + 'id': 'c13', + 'title': 'CANAL13', + 'ext': 'mp4', + }, + 'skip': 'Geo-restricted to Chile', + }, { + 'url': 'https://rudo.video/live/t13-13cl', + 'info_dict': { + 'id': 't13-13cl', + 'title': 'T13', + 'ext': 'mp4', + }, + 'skip': 'Geo-restricted to Chile', + }] + + def _real_extract(self, url): + video_id, type_ = self._match_valid_url(url).group('id', 'type') + is_live = type_ == 'live' + + webpage = self._download_webpage(url, video_id) + if 'Streaming is not available in your area' in webpage: + self.raise_geo_restricted() + + media_url = ( + self._search_regex( + r'var\s+streamURL\s*=\s*[\'"]([^?\'"]+)', webpage, 'stream url', default=None) + # Source URL must be used only if streamURL is unavailable + or self._search_regex( + r'<source[^>]+src=[\'"]([^\'"]+)', webpage, 'source url', default=None)) + if not media_url: + youtube_url = self._search_regex(r'file:\s*[\'"]((?:https?:)//(?:www\.)?youtube\.com[^\'"]+)', + webpage, 'youtube url', default=None) + if youtube_url: + return self.url_result(youtube_url, 'Youtube') + raise ExtractorError('Unable to extract stream url') + + token_array = self._search_json( + r'<script>var\s+_\$_[a-zA-Z0-9]+\s*=', webpage, 'access token array', video_id, + 
contains_pattern=r'\[(?s:.+)\]', default=None, transform_source=js_to_json) + if token_array: + token_url = traverse_obj(token_array, (..., {url_or_none}), get_all=False) + if not token_url: + raise ExtractorError('Invalid access token array') + access_token = self._download_json( + token_url, video_id, note='Downloading access token')['data']['authToken'] + media_url = update_url_query(media_url, {'auth-token': access_token}) + + ext = determine_ext(media_url) + if ext == 'm3u8': + formats = self._extract_m3u8_formats(media_url, video_id, live=is_live) + elif ext == 'mp3': + formats = [{ + 'url': media_url, + 'vcodec': 'none', + }] + else: + formats = [{'url': media_url}] + + return { + 'id': video_id, + 'title': (self._search_regex(r'var\s+titleVideo\s*=\s*[\'"]([^\'"]+)', + webpage, 'title', default=None) + or self._og_search_title(webpage)), + 'creator': self._search_regex(r'var\s+videoAuthor\s*=\s*[\'"]([^?\'"]+)', + webpage, 'videoAuthor', default=None), + 'thumbnail': (self._search_regex(r'var\s+posterIMG\s*=\s*[\'"]([^?\'"]+)', + webpage, 'thumbnail', default=None) + or self._og_search_thumbnail(webpage)), + 'formats': formats, + 'is_live': is_live, + } diff --git a/yt_dlp/extractor/ruhd.py b/yt_dlp/extractor/ruhd.py deleted file mode 100644 index abaa3f9ea..000000000 --- a/yt_dlp/extractor/ruhd.py +++ /dev/null @@ -1,42 +0,0 @@ -from .common import InfoExtractor - - -class RUHDIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?ruhd\.ru/play\.php\?vid=(?P<id>\d+)' - _TEST = { - 'url': 'http://www.ruhd.ru/play.php?vid=207', - 'md5': 'd1a9ec4edf8598e3fbd92bb16072ba83', - 'info_dict': { - 'id': '207', - 'ext': 'divx', - 'title': 'КОТ бааааам', - 'description': 'классный кот)', - 'thumbnail': r're:^http://.*\.jpg$', - } - } - - def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - - video_url = self._html_search_regex( - r'<param name="src" value="([^"]+)"', webpage, 'video url') - title = self._html_search_regex( - r'<title>([^<]+)   RUHD\.ru - Видео Высокого качества №1 в России!', - webpage, 'title') - description = self._html_search_regex( - r'(?s)
<div id="longdesc">(.+?)<span id="showlink">',
-            webpage, 'description', fatal=False)
-        thumbnail = self._html_search_regex(
-            r'<param name="previewImage" value="([^"]+)"',
-            webpage, 'thumbnail', fatal=False)
-        if thumbnail:
-            thumbnail = 'http://www.ruhd.ru' + thumbnail
-
-        return {
-            'id': video_id,
-            'url': video_url,
-            'title': title,
-            'description': description,
-            'thumbnail': thumbnail,
-        }
diff --git a/yt_dlp/extractor/rule34video.py b/yt_dlp/extractor/rule34video.py
--- a/yt_dlp/extractor/rule34video.py
+++ b/yt_dlp/extractor/rule34video.py
-        duration = self._html_search_regex(
-            r'"icon-clock"></i>\s+((?:\d+:?)+)', webpage, 'duration', default=None)
+        categories, creator, uploader, uploader_url = [None] * 4
+        for col in get_elements_by_class('col', webpage):
+            label = clean_html(get_element_by_class('label', col))
+            if label == 'Categories:':
+                categories = list(map(clean_html, get_elements_by_class('item', col)))
+            elif label == 'Artist:':
+                creator = join_nonempty(*map(clean_html, get_elements_by_class('item', col)), delim=', ')
+            elif label == 'Uploaded By:':
+                uploader = clean_html(get_element_by_class('name', col))
+                uploader_url = extract_attributes(get_element_html_by_class('name', col) or '').get('href')
 
         return {
+            **traverse_obj(self._search_json_ld(webpage, video_id, default={}), ({
+                'title': 'title',
+                'view_count': 'view_count',
+                'like_count': 'like_count',
+                'duration': 'duration',
+                'timestamp': 'timestamp',
+                'description': 'description',
+                'thumbnail': ('thumbnails', 0, 'url'),
+            })),
             'id': video_id,
             'formats': formats,
-            'title': title,
-            'thumbnail': thumbnail,
-            'duration': parse_duration(duration),
+            'title': self._html_extract_title(webpage),
+            'thumbnail': self._html_search_regex(
+                r'preview_url:\s+\'([^\']+)\'', webpage, 'thumbnail', default=None),
+            'duration': parse_duration(self._html_search_regex(
+                r'"icon-clock"></i>\s+((?:\d+:?)+)', webpage, 'duration', default=None)),
+            'view_count': int_or_none(self._html_search_regex(
+                r'"icon-eye"></i>\s+([ \d]+)', webpage, 'views', default='').replace(' ', '')),
+            'like_count': parse_count(get_element_by_class('voters count', webpage)),
+            'comment_count': int_or_none(self._search_regex(
+                r'[^(]+\((\d+)\)', get_element_by_attribute('href', '#tab_comments', webpage), 'comment count', fatal=False)),
             'age_limit': 18,
+            'creator': creator,
+            'uploader': uploader,
+            'uploader_url': uploader_url,
+            'categories': categories,
             'tags': list(map(unescapeHTML, re.findall(
                 r'<a[^>]+\bhref="https://rule34video\.com/tags/\d+/"[^>]*>(?P<tag>[^>]*)</a>', webpage))),
         }
diff --git a/yt_dlp/extractor/sbscokr.py b/yt_dlp/extractor/sbscokr.py
new file mode 100644
index 000000000..001d19ee1
--- /dev/null
+++ b/yt_dlp/extractor/sbscokr.py
@@ -0,0 +1,200 @@
+from .common import InfoExtractor
+from ..utils import (
+    clean_html,
+    int_or_none,
+    parse_iso8601,
+    parse_resolution,
+    url_or_none,
+)
+from ..utils.traversal import traverse_obj
+
+
+class SBSCoKrIE(InfoExtractor):
+    IE_NAME = 'sbs.co.kr'
+    _VALID_URL = [r'https?://allvod\.sbs\.co\.kr/allvod/vod(?:Package)?EndPage\.do\?(?:[^#]+&)?mdaId=(?P<id>\d+)',
+                  r'https?://programs\.sbs\.co\.kr/(?:enter|drama|culture|sports|plus|mtv|kth)/[a-z0-9]+/(?:vod|clip|movie)/\d+/(?P<id>(?:OC)?\d+)']
+
+    _TESTS = [{
+        'url': 'https://programs.sbs.co.kr/enter/dongsang2/clip/52007/OC467706746?div=main_pop_clip',
+        'md5': 'c3f6d45e1fb5682039d94cda23c36f19',
+        'info_dict': {
+            'id': 'OC467706746',
+            'ext': 'mp4',
+            'title': '‘아슬아슬’ 박군♥한영의 새 집 인테리어 대첩♨',
+            'description': 'md5:6a71eb1979ee4a94ea380310068ccab4',
+            'thumbnail': 'https://img2.sbs.co.kr/ops_clip_img/2023/10/10/34c4c0f9-a9a5-4ff6-a92e-9bb4b5f6fa65915w1280.jpg',
+            'release_timestamp': 1696889400,
+            'release_date': '20231009',
+            'view_count': int,
+            'like_count': int,
+            'duration': 238,
+            'age_limit': 15,
+            'series': '동상이몽2_너는 내 운명',
+            'episode': '레이디제인, ‘혼전임신설’ ‘3개월’ 앞당긴 결혼식 비하인드 스토리 최초 공개!',
+            'episode_number': 311,
+        },
+    }, {
+        'url': 'https://allvod.sbs.co.kr/allvod/vodPackageEndPage.do?mdaId=22000489324&combiId=PA000000284&packageType=A&isFreeYN=',
+        'md5': 'bf46b2e89fda7ae7de01f5743cef7236',
+        'info_dict': {
+            'id': '22000489324',
+            'ext': 'mp4',
+            'title': '[다시보기] 트롤리 15회',
+            'description': 'md5:0e55d74bef1ac55c61ae90c73ac485f4',
+            'thumbnail': 'https://img2.sbs.co.kr/img/sbs_cms/WE/2023/02/14/arC1676333794938-1280-720.jpg',
+            'release_timestamp': 1676325600,
+            'release_date': '20230213',
+            'view_count': int,
+            'like_count': int,
+            'duration': 5931,
+            'age_limit': 15,
+            'series': '트롤리',
+            'episode': '이거 다 거짓말이야',
+            'episode_number': 15,
+        },
+    }, {
+        'url': 'https://programs.sbs.co.kr/enter/fourman/vod/69625/22000508948',
+        'md5': '41e8ae4cc6c8424f4e4d76661a4becbf',
+        'info_dict': {
+            'id': '22000508948',
+            'ext': 'mp4',
+            'title': '[다시보기] 신발 벗고 돌싱포맨 104회',
+            'description': 'md5:c6a247383c4dd661e4b956bf4d3b586e',
+            'thumbnail': 'https://img2.sbs.co.kr/img/sbs_cms/WE/2023/08/30/2vb1693355446261-1280-720.jpg',
+            'release_timestamp': 1693342800,
+            'release_date': '20230829',
+            'view_count': int,
+            'like_count': int,
+            'duration': 7036,
+            'age_limit': 15,
+            'series': '신발 벗고 돌싱포맨',
+            'episode': '돌싱포맨 저격수들 등장!',
+            'episode_number': 104,
+        },
+    }]
+
+    def _call_api(self, video_id, rscuse=''):
+        return self._download_json(
+            f'https://api.play.sbs.co.kr/1.0/sbs_vodall/{video_id}', video_id,
+            note=f'Downloading m3u8 information {rscuse}',
+            query={
+                'platform': 'pcweb',
+                'protocol': 'download',
+                'absolute_show': 'Y',
+                'service': 'program',
+                'ssl': 'Y',
+                'rscuse': rscuse,
+            })
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        details = self._call_api(video_id)
+        source = traverse_obj(details, ('vod', 'source', 'mediasource', {dict})) or {}
+
+        formats = []
+        for stream in traverse_obj(details, (
+                'vod', 'source', 'mediasourcelist', lambda _, v: v['mediaurl'] or v['mediarscuse']
+        ), default=[source]):
+            if not stream.get('mediaurl'):
+                new_source = traverse_obj(
+                    self._call_api(video_id, rscuse=stream['mediarscuse']),
+                    ('vod', 'source', 'mediasource', {dict})) or {}
+                if new_source.get('mediarscuse') == source.get('mediarscuse') or not new_source.get('mediaurl'):
+                    continue
+                stream = new_source
+            formats.append({
+                'url': stream['mediaurl'],
+                'format_id': stream.get('mediarscuse'),
+                'format_note': stream.get('medianame'),
+                **parse_resolution(stream.get('quality')),
+                'preference': int_or_none(stream.get('mediarscuse'))
+            })
+
+        caption_url = traverse_obj(details, ('vod', 'source', 'subtitle', {url_or_none}))
+
+        return {
+            'id': video_id,
+            **traverse_obj(details, ('vod', {
+                'title': ('info', 'title'),
+                'duration': ('info', 'duration', {int_or_none}),
+                'view_count': ('info', 'viewcount', {int_or_none}),
+                'like_count': ('info', 'likecount', {int_or_none}),
+                'description': ('info', 'synopsis', {clean_html}),
+                'episode': ('info', 'content', ('contenttitle', 'title')),
+                'episode_number': ('info', 'content', 'number', {int_or_none}),
+                'series': ('info', 'program', 'programtitle'),
+                'age_limit': ('info', 'targetage', {int_or_none}),
+                'release_timestamp': ('info', 'broaddate', {parse_iso8601}),
+                'thumbnail': ('source', 'thumbnail', 'origin', {url_or_none}),
+            }), get_all=False),
+            'formats': formats,
+            'subtitles': {'ko': [{'url': caption_url}]} if caption_url else None,
+        }
+
+
+class SBSCoKrAllvodProgramIE(InfoExtractor):
+    IE_NAME = 'sbs.co.kr:allvod_program'
+    _VALID_URL = r'https?://allvod\.sbs\.co\.kr/allvod/vod(?:Free)?ProgramDetail\.do\?(?:[^#]+&)?pgmId=(?P<id>P?\d+)'
+
+    _TESTS = [{
+        'url': 'https://allvod.sbs.co.kr/allvod/vodFreeProgramDetail.do?type=legend&pgmId=22000010159&listOrder=vodCntAsc',
+        'info_dict': {
+            '_type': 'playlist',
+            'id': '22000010159',
+        },
+        'playlist_count': 18,
+    }, {
+        'url': 'https://allvod.sbs.co.kr/allvod/vodProgramDetail.do?pgmId=P460810577',
+        'info_dict': {
+            '_type': 'playlist',
+            'id': 'P460810577',
+        },
+        'playlist_count': 13,
+    }]
+
+    def _real_extract(self, url):
+        program_id = self._match_id(url)
+
+        details = self._download_json(
+            'https://allvod.sbs.co.kr/allvod/vodProgramDetail/vodProgramDetailAjax.do',
+            program_id, note='Downloading program details',
+            query={
+                'pgmId': program_id,
+                'currentCount': '10000',
+            })
+
+        return self.playlist_result(
+            [self.url_result(f'https://allvod.sbs.co.kr/allvod/vodEndPage.do?mdaId={video_id}', SBSCoKrIE)
+             for video_id in traverse_obj(details, ('list', ..., 'mdaId'))], program_id)
+
+
+class SBSCoKrProgramsVodIE(InfoExtractor):
+    IE_NAME = 'sbs.co.kr:programs_vod'
+    _VALID_URL = r'https?://programs\.sbs\.co\.kr/(?:enter|drama|culture|sports|plus|mtv)/(?P<id>[a-z0-9]+)/vods'
+
+    _TESTS = [{
+        'url': 'https://programs.sbs.co.kr/culture/morningwide/vods/65007',
+        'info_dict': {
+            '_type': 'playlist',
+            'id': '00000210215',
+        },
+        'playlist_mincount': 9782,
+    }, {
+        'url': 'https://programs.sbs.co.kr/enter/dongsang2/vods/52006',
+        'info_dict': {
+            '_type': 'playlist',
+            'id': '22000010476',
+        },
+        'playlist_mincount': 312,
+    }]
+
+    def _real_extract(self, url):
+        program_slug = self._match_id(url)
+
+        program_id = self._download_json(
+            f'https://static.apis.sbs.co.kr/program-api/1.0/menu/{program_slug}', program_slug,
+            note='Downloading program menu data')['program']['programid']
+
+        return self.url_result(
+            f'https://allvod.sbs.co.kr/allvod/vodProgramDetail.do?pgmId={program_id}', SBSCoKrAllvodProgramIE)
diff --git a/yt_dlp/extractor/scrippsnetworks.py b/yt_dlp/extractor/scrippsnetworks.py
index 7f0bc9645..3912f7786 100644
--- a/yt_dlp/extractor/scrippsnetworks.py
+++ b/yt_dlp/extractor/scrippsnetworks.py
@@ -114,6 +114,8 @@ class ScrippsNetworksIE(InfoExtractor):
             'timestamp': 1475678834,
             'upload_date': '20161005',
             'uploader': 'SCNI-SCND',
+            'tags': 'count:10',
+            'creator': 'Cooking Channel',
             'duration': 29.995,
             'chapters': [{'start_time': 0.0, 'end_time': 29.995, 'title': ''}],
             'thumbnail': 'https://images.dds.discovery.com/up/tp/Scripps_-_Food_Category_Prod/122/987/0260338_630x355.jpg',
diff --git a/yt_dlp/extractor/scte.py b/yt_dlp/extractor/scte.py
index d839ffcde..9c2ca8c51 100644
--- a/yt_dlp/extractor/scte.py
+++ b/yt_dlp/extractor/scte.py
@@ -46,6 +46,7 @@ def is_logged(webpage):
 
 
 class SCTEIE(SCTEBaseIE):
+    _WORKING = False
     _VALID_URL = r'https?://learning\.scte\.org/mod/scorm/view\.php?.*?\bid=(?P<id>\d+)'
     _TESTS = [{
         'url': 'https://learning.scte.org/mod/scorm/view.php?id=31484',
@@ -93,6 +94,7 @@ def _real_extract(self, url):
 
 
 class SCTECourseIE(SCTEBaseIE):
+    _WORKING = False
     _VALID_URL = r'https?://learning\.scte\.org/(?:mod/sub)?course/view\.php?.*?\bid=(?P<id>\d+)'
     _TESTS = [{
         'url': 'https://learning.scte.org/mod/subcourse/view.php?id=31491',
diff --git a/yt_dlp/extractor/shared.py b/yt_dlp/extractor/shared.py
deleted file mode 100644
index 9a237b320..000000000
--- a/yt_dlp/extractor/shared.py
+++ /dev/null
@@ -1,138 +0,0 @@
-import urllib.parse
-
-from .common import InfoExtractor
-from ..compat import compat_b64decode
-from ..utils import (
-    KNOWN_EXTENSIONS,
-    ExtractorError,
-    determine_ext,
-    int_or_none,
-    js_to_json,
-    parse_filesize,
-    rot47,
-    url_or_none,
-    urlencode_postdata,
-)
-
-
-class SharedBaseIE(InfoExtractor):
-    def _real_extract(self, url):
-        video_id = self._match_id(url)
-
-
webpage, urlh = self._download_webpage_handle(url, video_id) - - if self._FILE_NOT_FOUND in webpage: - raise ExtractorError( - 'Video %s does not exist' % video_id, expected=True) - - video_url = self._extract_video_url(webpage, video_id, url) - - title = self._extract_title(webpage) - filesize = int_or_none(self._extract_filesize(webpage)) - - return { - 'id': video_id, - 'url': video_url, - 'ext': 'mp4', - 'filesize': filesize, - 'title': title, - } - - def _extract_title(self, webpage): - return compat_b64decode(self._html_search_meta( - 'full:title', webpage, 'title')).decode('utf-8') - - def _extract_filesize(self, webpage): - return self._html_search_meta( - 'full:size', webpage, 'file size', fatal=False) - - -class SharedIE(SharedBaseIE): - IE_DESC = 'shared.sx' - _VALID_URL = r'https?://shared\.sx/(?P[\da-z]{10})' - _FILE_NOT_FOUND = '>File does not exist<' - - _TEST = { - 'url': 'http://shared.sx/0060718775', - 'md5': '106fefed92a8a2adb8c98e6a0652f49b', - 'info_dict': { - 'id': '0060718775', - 'ext': 'mp4', - 'title': 'Bmp4', - 'filesize': 1720110, - }, - } - - def _extract_video_url(self, webpage, video_id, url): - download_form = self._hidden_inputs(webpage) - - video_page = self._download_webpage( - url, video_id, 'Downloading video page', - data=urlencode_postdata(download_form), - headers={ - 'Content-Type': 'application/x-www-form-urlencoded', - 'Referer': url, - }) - - video_url = self._html_search_regex( - r'data-url=(["\'])(?P(?:(?!\1).)+)\1', - video_page, 'video URL', group='url') - - return video_url - - -class VivoIE(SharedBaseIE): - IE_DESC = 'vivo.sx' - _VALID_URL = r'https?://vivo\.s[xt]/(?P[\da-z]{10})' - _FILE_NOT_FOUND = '>The file you have requested does not exists or has been removed' - - _TESTS = [{ - 'url': 'http://vivo.sx/d7ddda0e78', - 'md5': '15b3af41be0b4fe01f4df075c2678b2c', - 'info_dict': { - 'id': 'd7ddda0e78', - 'ext': 'mp4', - 'title': 'Chicken', - 'filesize': 515659, - }, - }, { - 'url': 'http://vivo.st/d7ddda0e78', - 'only_matching': True, - }] - - def _extract_title(self, webpage): - title = self._html_search_regex( - r'data-name\s*=\s*(["\'])(?P(?:(?!\1).)+)\1', webpage, - 'title', default=None, group='title') - if title: - ext = determine_ext(title) - if ext.lower() in KNOWN_EXTENSIONS: - title = title.rpartition('.' 
+ ext)[0] - return title - return self._og_search_title(webpage) - - def _extract_filesize(self, webpage): - return parse_filesize(self._search_regex( - r'data-type=["\']video["\'][^>]*>Watch.*?<strong>\s*\((.+?)\)', - webpage, 'filesize', fatal=False)) - - def _extract_video_url(self, webpage, video_id, url): - def decode_url_old(encoded_url): - return compat_b64decode(encoded_url).decode('utf-8') - - stream_url = self._search_regex( - r'data-stream\s*=\s*(["\'])(?P<url>(?:(?!\1).)+)\1', webpage, - 'stream url', default=None, group='url') - if stream_url: - stream_url = url_or_none(decode_url_old(stream_url)) - if stream_url: - return stream_url - - def decode_url(encoded_url): - return rot47(urllib.parse.unquote_plus(encoded_url)) - - return decode_url(self._parse_json( - self._search_regex( - r'(?s)InitializeStream\s*\(\s*({.+?})\s*\)\s*;', webpage, - 'stream'), - video_id, transform_source=js_to_json)['source']) diff --git a/yt_dlp/extractor/sina.py b/yt_dlp/extractor/sina.py index 984281188..eeb9ebb44 100644 --- a/yt_dlp/extractor/sina.py +++ b/yt_dlp/extractor/sina.py @@ -11,7 +11,7 @@ class SinaIE(InfoExtractor): - _VALID_URL = r'''(?x)https?://(?:.*?\.)?video\.sina\.com\.cn/ + _VALID_URL = r'''(?x)https?://(?:[^/?#]+\.)?video\.sina\.com\.cn/ (?: (?:view/|.*\#)(?P<id>\d+)| .+?/(?P<pseudo_id>[^/?#]+)(?:\.s?html)| diff --git a/yt_dlp/extractor/sky.py b/yt_dlp/extractor/sky.py index 0a8b6cc76..574ac219c 100644 --- a/yt_dlp/extractor/sky.py +++ b/yt_dlp/extractor/sky.py @@ -3,9 +3,7 @@ from .common import InfoExtractor from ..utils import ( extract_attributes, - smuggle_url, strip_or_none, - urljoin, ) @@ -13,29 +11,10 @@ class SkyBaseIE(InfoExtractor): BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/%s/%s_default/index.html?videoId=%s' _SDC_EL_REGEX = r'(?s)(<div[^>]+data-(?:component-name|fn)="sdc-(?:articl|sit)e-video"[^>]*>)' - def _process_ooyala_element(self, webpage, sdc_el, url): + def _process_video_element(self, webpage, sdc_el, url): sdc = extract_attributes(sdc_el) provider = sdc.get('data-provider') - if provider == 'ooyala': - video_id = sdc['data-sdc-video-id'] - video_url = 'ooyala:%s' % video_id - ie_key = 'Ooyala' - ooyala_el = self._search_regex( - r'(<div[^>]+class="[^"]*\bsdc-article-video__media-ooyala\b[^"]*"[^>]+data-video-id="%s"[^>]*>)' % video_id, - webpage, 'video data', fatal=False) - if ooyala_el: - ooyala_attrs = extract_attributes(ooyala_el) or {} - if ooyala_attrs.get('data-token-required') == 'true': - token_fetch_url = (self._parse_json(ooyala_attrs.get( - 'data-token-fetch-options', '{}'), - video_id, fatal=False) or {}).get('url') - if token_fetch_url: - embed_token = self._download_json(urljoin( - url, token_fetch_url), video_id, fatal=False) - if embed_token: - video_url = smuggle_url( - video_url, {'embed_token': embed_token}) - elif provider == 'brightcove': + if provider == 'brightcove': video_id = sdc['data-video-id'] account_id = sdc.get('data-account-id') or '6058004172001' player_id = sdc.get('data-player-id') or 'RC9PQUaJ6' @@ -52,7 +31,7 @@ def _process_ooyala_element(self, webpage, sdc_el, url): def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - info = self._process_ooyala_element(webpage, self._search_regex( + info = self._process_video_element(webpage, self._search_regex( self._SDC_EL_REGEX, webpage, 'sdc element'), url) info.update({ 'title': self._og_search_title(webpage), @@ -73,7 +52,7 @@ class SkySportsIE(SkyBaseIE): 'title': 'Bale: It\'s our time to shine', 
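For reference, the Brightcove path that replaces the removed Ooyala handling above simply templates an embed URL from the sdc element's data attributes. A condensed sketch using the same fallback IDs as _process_video_element in the diff above; the sample attribute dict at the end is hypothetical:

    # mirrors how SkyBaseIE._process_video_element builds the BrightcoveNew URL
    BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/%s/%s_default/index.html?videoId=%s'

    def brightcove_embed_url(sdc):
        account_id = sdc.get('data-account-id') or '6058004172001'
        player_id = sdc.get('data-player-id') or 'RC9PQUaJ6'
        return BRIGHTCOVE_URL_TEMPLATE % (account_id, player_id, sdc['data-video-id'])

    # e.g. brightcove_embed_url({'data-video-id': '1234567890'})  # dummy ID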
'description': 'md5:e88bda94ae15f7720c5cb467e777bb6d', }, - 'add_ie': ['Ooyala'], + 'add_ie': ['BrightcoveNew'], }, { 'url': 'https://www.skysports.com/watch/video/sports/f1/12160544/abu-dhabi-gp-the-notebook', 'only_matching': True, @@ -122,7 +101,7 @@ def _real_extract(self, url): article_id = self._match_id(url) webpage = self._download_webpage(url, article_id) - entries = [self._process_ooyala_element(webpage, sdc_el, url) + entries = [self._process_video_element(webpage, sdc_el, url) for sdc_el in re.findall(self._SDC_EL_REGEX, webpage)] return self.playlist_result( @@ -149,7 +128,7 @@ def _real_extract(self, url): entries = [] for sdc_el in re.findall(self._SDC_EL_REGEX, webpage): - entries.append(self._process_ooyala_element(webpage, sdc_el, url)) + entries.append(self._process_video_element(webpage, sdc_el, url)) return self.playlist_result( entries, article_id, self._og_search_title(webpage), diff --git a/yt_dlp/extractor/slideslive.py b/yt_dlp/extractor/slideslive.py index 25f867a60..df2af3b35 100644 --- a/yt_dlp/extractor/slideslive.py +++ b/yt_dlp/extractor/slideslive.py @@ -1,5 +1,6 @@ import re import urllib.parse +import xml.etree.ElementTree from .common import InfoExtractor from ..utils import ( @@ -469,11 +470,12 @@ def _real_extract(self, url): slides = self._download_xml( player_info['slides_xml_url'], video_id, fatal=False, note='Downloading slides XML', errnote='Failed to download slides info') - slide_url_template = 'https://cdn.slideslive.com/data/presentations/%s/slides/big/%s%s' - for slide_id, slide in enumerate(slides.findall('./slide') if slides else [], 1): - slides_info.append(( - slide_id, xpath_text(slide, './slideName', 'name'), '.jpg', - int_or_none(xpath_text(slide, './timeSec', 'time')))) + if isinstance(slides, xml.etree.ElementTree.Element): + slide_url_template = 'https://cdn.slideslive.com/data/presentations/%s/slides/big/%s%s' + for slide_id, slide in enumerate(slides.findall('./slide')): + slides_info.append(( + slide_id, xpath_text(slide, './slideName', 'name'), '.jpg', + int_or_none(xpath_text(slide, './timeSec', 'time')))) chapters, thumbnails = [], [] if url_or_none(player_info.get('thumbnail')): @@ -528,7 +530,7 @@ def _real_extract(self, url): if service_name == 'vimeo': info['url'] = smuggle_url( f'https://player.vimeo.com/video/{service_id}', - {'http_headers': {'Referer': url}}) + {'referer': url}) video_slides = traverse_obj(slides, ('slides', ..., 'video', 'id')) if not video_slides: diff --git a/yt_dlp/extractor/spankwire.py b/yt_dlp/extractor/spankwire.py deleted file mode 100644 index 334b29773..000000000 --- a/yt_dlp/extractor/spankwire.py +++ /dev/null @@ -1,174 +0,0 @@ -import re - -from .common import InfoExtractor -from ..utils import ( - float_or_none, - int_or_none, - merge_dicts, - str_or_none, - str_to_int, - url_or_none, -) - - -class SpankwireIE(InfoExtractor): - _VALID_URL = r'''(?x) - https?:// - (?:www\.)?spankwire\.com/ - (?: - [^/]+/video| - EmbedPlayer\.aspx/?\?.*?\bArticleId= - ) - (?P<id>\d+) - ''' - _EMBED_REGEX = [r'<iframe[^>]+\bsrc=["\'](?P<url>(?:https?:)?//(?:www\.)?spankwire\.com/EmbedPlayer\.aspx/?\?.*?\bArticleId=\d+)'] - _TESTS = [{ - # download URL pattern: */<height>P_<tbr>K_<video_id>.mp4 - 'url': 'http://www.spankwire.com/Buckcherry-s-X-Rated-Music-Video-Crazy-Bitch/video103545/', - 'md5': '5aa0e4feef20aad82cbcae3aed7ab7cd', - 'info_dict': { - 'id': '103545', - 'ext': 'mp4', - 'title': 'Buckcherry`s X Rated Music Video Crazy Bitch', - 'description': 'Crazy Bitch X rated music video.', - 'duration': 
222, - 'uploader': 'oreusz', - 'uploader_id': '124697', - 'timestamp': 1178587885, - 'upload_date': '20070508', - 'average_rating': float, - 'view_count': int, - 'comment_count': int, - 'age_limit': 18, - 'categories': list, - 'tags': list, - }, - }, { - # download URL pattern: */mp4_<format_id>_<video_id>.mp4 - 'url': 'http://www.spankwire.com/Titcums-Compiloation-I/video1921551/', - 'md5': '09b3c20833308b736ae8902db2f8d7e6', - 'info_dict': { - 'id': '1921551', - 'ext': 'mp4', - 'title': 'Titcums Compiloation I', - 'description': 'cum on tits', - 'uploader': 'dannyh78999', - 'uploader_id': '3056053', - 'upload_date': '20150822', - 'age_limit': 18, - }, - 'params': { - 'proxy': '127.0.0.1:8118' - }, - 'skip': 'removed', - }, { - 'url': 'https://www.spankwire.com/EmbedPlayer.aspx/?ArticleId=156156&autostart=true', - 'only_matching': True, - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - - video = self._download_json( - 'https://www.spankwire.com/api/video/%s.json' % video_id, video_id) - - title = video['title'] - - formats = [] - videos = video.get('videos') - if isinstance(videos, dict): - for format_id, format_url in videos.items(): - video_url = url_or_none(format_url) - if not format_url: - continue - height = int_or_none(self._search_regex( - r'(\d+)[pP]', format_id, 'height', default=None)) - m = re.search( - r'/(?P<height>\d+)[pP]_(?P<tbr>\d+)[kK]', video_url) - if m: - tbr = int(m.group('tbr')) - height = height or int(m.group('height')) - else: - tbr = None - formats.append({ - 'url': video_url, - 'format_id': '%dp' % height if height else format_id, - 'height': height, - 'tbr': tbr, - }) - m3u8_url = url_or_none(video.get('HLS')) - if m3u8_url: - formats.extend(self._extract_m3u8_formats( - m3u8_url, video_id, 'mp4', entry_protocol='m3u8_native', - m3u8_id='hls', fatal=False)) - - view_count = str_to_int(video.get('viewed')) - - thumbnails = [] - for preference, t in enumerate(('', '2x'), start=0): - thumbnail_url = url_or_none(video.get('poster%s' % t)) - if not thumbnail_url: - continue - thumbnails.append({ - 'url': thumbnail_url, - 'preference': preference, - }) - - def extract_names(key): - entries_list = video.get(key) - if not isinstance(entries_list, list): - return - entries = [] - for entry in entries_list: - name = str_or_none(entry.get('name')) - if name: - entries.append(name) - return entries - - categories = extract_names('categories') - tags = extract_names('tags') - - uploader = None - info = {} - - webpage = self._download_webpage( - 'https://www.spankwire.com/_/video%s/' % video_id, video_id, - fatal=False) - if webpage: - info = self._search_json_ld(webpage, video_id, default={}) - thumbnail_url = None - if 'thumbnail' in info: - thumbnail_url = url_or_none(info['thumbnail']) - del info['thumbnail'] - if not thumbnail_url: - thumbnail_url = self._og_search_thumbnail(webpage) - if thumbnail_url: - thumbnails.append({ - 'url': thumbnail_url, - 'preference': 10, - }) - uploader = self._html_search_regex( - r'(?s)by\s*<a[^>]+\bclass=["\']uploaded__by[^>]*>(.+?)</a>', - webpage, 'uploader', fatal=False) - if not view_count: - view_count = str_to_int(self._search_regex( - r'data-views=["\']([\d,.]+)', webpage, 'view count', - fatal=False)) - - return merge_dicts({ - 'id': video_id, - 'title': title, - 'description': video.get('description'), - 'duration': int_or_none(video.get('duration')), - 'thumbnails': thumbnails, - 'uploader': uploader, - 'uploader_id': str_or_none(video.get('userId')), - 'timestamp': 
int_or_none(video.get('time_approved_on')), - 'average_rating': float_or_none(video.get('rating')), - 'view_count': view_count, - 'comment_count': int_or_none(video.get('comments')), - 'age_limit': 18, - 'categories': categories, - 'tags': tags, - 'formats': formats, - }, info) diff --git a/yt_dlp/extractor/srmediathek.py b/yt_dlp/extractor/srmediathek.py index 3cc39870f..f0b3b585f 100644 --- a/yt_dlp/extractor/srmediathek.py +++ b/yt_dlp/extractor/srmediathek.py @@ -6,6 +6,7 @@ class SRMediathekIE(ARDMediathekBaseIE): + _WORKING = False IE_NAME = 'sr:mediathek' IE_DESC = 'Saarländischer Rundfunk' _VALID_URL = r'https?://sr-mediathek(?:\.sr-online)?\.de/index\.php\?.*?&id=(?P<id>[0-9]+)' diff --git a/yt_dlp/extractor/stacommu.py b/yt_dlp/extractor/stacommu.py index 6f58f06dc..1308c595d 100644 --- a/yt_dlp/extractor/stacommu.py +++ b/yt_dlp/extractor/stacommu.py @@ -38,9 +38,48 @@ def _extract_hls_key(self, data, path, decrypt): return None return traverse_obj(encryption_data, {'key': ('key', {decrypt}), 'iv': ('iv', {decrypt})}) + def _extract_vod(self, url): + video_id = self._match_id(url) + video_info = self._download_metadata( + url, video_id, 'ja', ('dehydratedState', 'queries', 0, 'state', 'data')) + hls_info, decrypt = self._call_encrypted_api( + video_id, ':watch', 'stream information', data={'method': 1}) + + return { + 'id': video_id, + 'formats': self._get_formats(hls_info, ('protocolHls', 'url', {url_or_none}), video_id), + 'hls_aes': self._extract_hls_key(hls_info, 'protocolHls', decrypt), + **traverse_obj(video_info, { + 'title': ('displayName', {str}), + 'description': ('description', {str}), + 'timestamp': ('watchStartTime', {int_or_none}), + 'thumbnail': ('keyVisualUrl', {url_or_none}), + 'cast': ('casts', ..., 'displayName', {str}), + 'duration': ('duration', {int}), + }), + } + + def _extract_ppv(self, url): + video_id = self._match_id(url) + video_info = self._call_api(video_id, msg='video information', query={'al': 'ja'}, auth=False) + hls_info, decrypt = self._call_encrypted_api( + video_id, ':watchArchive', 'stream information', data={'method': 1}) + + return { + 'id': video_id, + 'formats': self._get_formats(hls_info, ('hls', 'urls', ..., {url_or_none}), video_id), + 'hls_aes': self._extract_hls_key(hls_info, 'hls', decrypt), + **traverse_obj(video_info, { + 'title': ('displayName', {str}), + 'timestamp': ('startTime', {int_or_none}), + 'thumbnail': ('keyVisualUrl', {url_or_none}), + 'duration': ('duration', {int_or_none}), + }), + } + class StacommuVODIE(StacommuBaseIE): - _VALID_URL = r'https?://www\.stacommu\.jp/videos/episodes/(?P<id>[\da-zA-Z]+)' + _VALID_URL = r'https?://www\.stacommu\.jp/(?:en/)?videos/episodes/(?P<id>[\da-zA-Z]+)' _TESTS = [{ # not encrypted 'url': 'https://www.stacommu.jp/videos/episodes/aXcVKjHyAENEjard61soZZ', @@ -79,34 +118,19 @@ class StacommuVODIE(StacommuBaseIE): 'params': { 'skip_download': 'm3u8', }, + }, { + 'url': 'https://www.stacommu.jp/en/videos/episodes/aXcVKjHyAENEjard61soZZ', + 'only_matching': True, }] _API_PATH = 'videoEpisodes' def _real_extract(self, url): - video_id = self._match_id(url) - video_info = self._download_metadata( - url, video_id, 'ja', ('dehydratedState', 'queries', 0, 'state', 'data')) - hls_info, decrypt = self._call_encrypted_api( - video_id, ':watch', 'stream information', data={'method': 1}) - - return { - 'id': video_id, - 'formats': self._get_formats(hls_info, ('protocolHls', 'url', {url_or_none}), video_id), - 'hls_aes': self._extract_hls_key(hls_info, 'protocolHls', decrypt), - 
**traverse_obj(video_info, { - 'title': ('displayName', {str}), - 'description': ('description', {str}), - 'timestamp': ('watchStartTime', {int_or_none}), - 'thumbnail': ('keyVisualUrl', {url_or_none}), - 'cast': ('casts', ..., 'displayName', {str}), - 'duration': ('duration', {int}), - }), - } + return self._extract_vod(url) class StacommuLiveIE(StacommuBaseIE): - _VALID_URL = r'https?://www\.stacommu\.jp/live/(?P<id>[\da-zA-Z]+)' + _VALID_URL = r'https?://www\.stacommu\.jp/(?:en/)?live/(?P<id>[\da-zA-Z]+)' _TESTS = [{ 'url': 'https://www.stacommu.jp/live/d2FJ3zLnndegZJCAEzGM3m', 'info_dict': { @@ -125,24 +149,83 @@ class StacommuLiveIE(StacommuBaseIE): 'params': { 'skip_download': 'm3u8', }, + }, { + 'url': 'https://www.stacommu.jp/en/live/d2FJ3zLnndegZJCAEzGM3m', + 'only_matching': True, }] _API_PATH = 'events' def _real_extract(self, url): - video_id = self._match_id(url) - video_info = self._call_api(video_id, msg='video information', query={'al': 'ja'}, auth=False) - hls_info, decrypt = self._call_encrypted_api( - video_id, ':watchArchive', 'stream information', data={'method': 1}) + return self._extract_ppv(url) - return { - 'id': video_id, - 'formats': self._get_formats(hls_info, ('hls', 'urls', ..., {url_or_none}), video_id), - 'hls_aes': self._extract_hls_key(hls_info, 'hls', decrypt), - **traverse_obj(video_info, { - 'title': ('displayName', {str}), - 'timestamp': ('startTime', {int_or_none}), - 'thumbnail': ('keyVisualUrl', {url_or_none}), - 'duration': ('duration', {int_or_none}), - }), - } + +class TheaterComplexTownBaseIE(StacommuBaseIE): + _NETRC_MACHINE = 'theatercomplextown' + _API_HOST = 'api.theater-complex.town' + _LOGIN_QUERY = {'key': 'AIzaSyAgNCqToaIz4a062EeIrkhI_xetVfAOrfc'} + _LOGIN_HEADERS = { + 'Accept': '*/*', + 'Content-Type': 'application/json', + 'X-Client-Version': 'Chrome/JsCore/9.23.0/FirebaseCore-web', + 'Referer': 'https://www.theater-complex.town/', + 'Origin': 'https://www.theater-complex.town', + } + + +class TheaterComplexTownVODIE(TheaterComplexTownBaseIE): + _VALID_URL = r'https?://(?:www\.)?theater-complex\.town/(?:en/)?videos/episodes/(?P<id>\w+)' + IE_NAME = 'theatercomplextown:vod' + _TESTS = [{ + 'url': 'https://www.theater-complex.town/videos/episodes/hoxqidYNoAn7bP92DN6p78', + 'info_dict': { + 'id': 'hoxqidYNoAn7bP92DN6p78', + 'ext': 'mp4', + 'title': '演劇ドラフトグランプリ2023 劇団『恋のぼり』〜劇団名決定秘話ラジオ', + 'description': 'md5:a7e2e9cf570379ea67fb630f345ff65d', + 'cast': ['玉城 裕規', '石川 凌雅'], + 'thumbnail': 'https://image.theater-complex.town/5URnXX6KCeDysuFrPkP38o/5URnXX6KCeDysuFrPkP38o', + 'upload_date': '20231103', + 'timestamp': 1699016400, + 'duration': 868, + }, + 'params': { + 'skip_download': 'm3u8', + }, + }, { + 'url': 'https://www.theater-complex.town/en/videos/episodes/6QT7XYwM9dJz5Gf9VB6K5y', + 'only_matching': True, + }] + + _API_PATH = 'videoEpisodes' + + def _real_extract(self, url): + return self._extract_vod(url) + + +class TheaterComplexTownPPVIE(TheaterComplexTownBaseIE): + _VALID_URL = r'https?://(?:www\.)?theater-complex\.town/(?:en/)?ppv/(?P<id>\w+)' + IE_NAME = 'theatercomplextown:ppv' + _TESTS = [{ + 'url': 'https://www.theater-complex.town/ppv/wytW3X7khrjJBUpKuV3jen', + 'info_dict': { + 'id': 'wytW3X7khrjJBUpKuV3jen', + 'ext': 'mp4', + 'title': 'BREAK FREE STARS 11月5日(日)12:30千秋楽公演', + 'thumbnail': 'https://image.theater-complex.town/5GWEB31JcTUfjtgdeV5t6o/5GWEB31JcTUfjtgdeV5t6o', + 'upload_date': '20231105', + 'timestamp': 1699155000, + 'duration': 8378, + }, + 'params': { + 'skip_download': 'm3u8', + }, + }, { + 'url': 
'https://www.theater-complex.town/en/ppv/wytW3X7khrjJBUpKuV3jen', + 'only_matching': True, + }] + + _API_PATH = 'events' + + def _real_extract(self, url): + return self._extract_ppv(url) diff --git a/yt_dlp/extractor/storyfire.py b/yt_dlp/extractor/storyfire.py index 035747c31..566f77782 100644 --- a/yt_dlp/extractor/storyfire.py +++ b/yt_dlp/extractor/storyfire.py @@ -32,9 +32,7 @@ def _parse_video(self, video): 'description': video.get('description'), 'url': smuggle_url( 'https://player.vimeo.com/video/' + vimeo_id, { - 'http_headers': { - 'Referer': 'https://storyfire.com/', - } + 'referer': 'https://storyfire.com/', }), 'thumbnail': video.get('storyImage'), 'view_count': int_or_none(video.get('views')), diff --git a/yt_dlp/extractor/streamcloud.py b/yt_dlp/extractor/streamcloud.py deleted file mode 100644 index 728980921..000000000 --- a/yt_dlp/extractor/streamcloud.py +++ /dev/null @@ -1,75 +0,0 @@ -import re - -from .common import InfoExtractor -from ..utils import ( - ExtractorError, - urlencode_postdata, -) - - -class StreamcloudIE(InfoExtractor): - IE_NAME = 'streamcloud.eu' - _VALID_URL = r'https?://streamcloud\.eu/(?P<id>[a-zA-Z0-9_-]+)(?:/(?P<fname>[^#?]*)\.html)?' - - _TESTS = [{ - 'url': 'http://streamcloud.eu/skp9j99s4bpz/youtube-dl_test_video_____________-BaW_jenozKc.mp4.html', - 'md5': '6bea4c7fa5daaacc2a946b7146286686', - 'info_dict': { - 'id': 'skp9j99s4bpz', - 'ext': 'mp4', - 'title': 'youtube-dl test video \'/\\ ä ↭', - }, - 'skip': 'Only available from the EU' - }, { - 'url': 'http://streamcloud.eu/ua8cmfh1nbe6/NSHIP-148--KUC-NG--H264-.mp4.html', - 'only_matching': True, - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - url = 'http://streamcloud.eu/%s' % video_id - - orig_webpage = self._download_webpage(url, video_id) - - if '>File Not Found<' in orig_webpage: - raise ExtractorError( - 'Video %s does not exist' % video_id, expected=True) - - fields = re.findall(r'''(?x)<input\s+ - type="(?:hidden|submit)"\s+ - name="([^"]+)"\s+ - (?:id="[^"]+"\s+)? 
- value="([^"]*)" - ''', orig_webpage) - - self._sleep(6, video_id) - - webpage = self._download_webpage( - url, video_id, data=urlencode_postdata(fields), headers={ - b'Content-Type': b'application/x-www-form-urlencoded', - }) - - try: - title = self._html_search_regex( - r'<h1[^>]*>([^<]+)<', webpage, 'title') - video_url = self._search_regex( - r'file:\s*"([^"]+)"', webpage, 'video URL') - except ExtractorError: - message = self._html_search_regex( - r'(?s)<div[^>]+class=(["\']).*?msgboxinfo.*?\1[^>]*>(?P<message>.+?)</div>', - webpage, 'message', default=None, group='message') - if message: - raise ExtractorError('%s said: %s' % (self.IE_NAME, message), expected=True) - raise - thumbnail = self._search_regex( - r'image:\s*"([^"]+)"', webpage, 'thumbnail URL', fatal=False) - - return { - 'id': video_id, - 'title': title, - 'url': video_url, - 'thumbnail': thumbnail, - 'http_headers': { - 'Referer': url, - }, - } diff --git a/yt_dlp/extractor/swrmediathek.py b/yt_dlp/extractor/swrmediathek.py deleted file mode 100644 index 38bdfced7..000000000 --- a/yt_dlp/extractor/swrmediathek.py +++ /dev/null @@ -1,111 +0,0 @@ -from .common import InfoExtractor -from ..utils import ( - parse_duration, - int_or_none, - determine_protocol, -) - - -class SWRMediathekIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?swrmediathek\.de/(?:content/)?player\.htm\?show=(?P<id>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})' - - _TESTS = [{ - 'url': 'http://swrmediathek.de/player.htm?show=849790d0-dab8-11e3-a953-0026b975f2e6', - 'md5': '8c5f6f0172753368547ca8413a7768ac', - 'info_dict': { - 'id': '849790d0-dab8-11e3-a953-0026b975f2e6', - 'ext': 'mp4', - 'title': 'SWR odysso', - 'description': 'md5:2012e31baad36162e97ce9eb3f157b8a', - 'thumbnail': r're:^http:.*\.jpg$', - 'duration': 2602, - 'upload_date': '20140515', - 'uploader': 'SWR Fernsehen', - 'uploader_id': '990030', - }, - }, { - 'url': 'http://swrmediathek.de/player.htm?show=0e1a8510-ddf2-11e3-9be3-0026b975f2e6', - 'md5': 'b10ab854f912eecc5a6b55cd6fc1f545', - 'info_dict': { - 'id': '0e1a8510-ddf2-11e3-9be3-0026b975f2e6', - 'ext': 'mp4', - 'title': 'Nachtcafé - Alltagsdroge Alkohol - zwischen Sektempfang und Komasaufen', - 'description': 'md5:e0a3adc17e47db2c23aab9ebc36dbee2', - 'thumbnail': r're:http://.*\.jpg', - 'duration': 5305, - 'upload_date': '20140516', - 'uploader': 'SWR Fernsehen', - 'uploader_id': '990030', - }, - 'skip': 'redirect to http://swrmediathek.de/index.htm?hinweis=swrlink', - }, { - 'url': 'http://swrmediathek.de/player.htm?show=bba23e10-cb93-11e3-bf7f-0026b975f2e6', - 'md5': '4382e4ef2c9d7ce6852535fa867a0dd3', - 'info_dict': { - 'id': 'bba23e10-cb93-11e3-bf7f-0026b975f2e6', - 'ext': 'mp3', - 'title': 'Saša Stanišic: Vor dem Fest', - 'description': 'md5:5b792387dc3fbb171eb709060654e8c9', - 'thumbnail': r're:http://.*\.jpg', - 'duration': 3366, - 'upload_date': '20140520', - 'uploader': 'SWR 2', - 'uploader_id': '284670', - }, - 'skip': 'redirect to http://swrmediathek.de/index.htm?hinweis=swrlink', - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - - video = self._download_json( - 'http://swrmediathek.de/AjaxEntry?ekey=%s' % video_id, - video_id, 'Downloading video JSON') - - attr = video['attr'] - title = attr['entry_title'] - media_type = attr.get('entry_etype') - - formats = [] - for entry in video.get('sub', []): - if entry.get('name') != 'entry_media': - continue - - entry_attr = entry.get('attr', {}) - f_url = entry_attr.get('val2') - if not f_url: - continue - codec = entry_attr.get('val0') 
- if codec == 'm3u8': - formats.extend(self._extract_m3u8_formats( - f_url, video_id, 'mp4', 'm3u8_native', - m3u8_id='hls', fatal=False)) - elif codec == 'f4m': - formats.extend(self._extract_f4m_formats( - f_url + '?hdcore=3.7.0', video_id, - f4m_id='hds', fatal=False)) - else: - formats.append({ - 'format_id': determine_protocol({'url': f_url}), - 'url': f_url, - 'quality': int_or_none(entry_attr.get('val1')), - 'vcodec': codec if media_type == 'Video' else 'none', - 'acodec': codec if media_type == 'Audio' else None, - }) - - upload_date = None - entry_pdatet = attr.get('entry_pdatet') - if entry_pdatet: - upload_date = entry_pdatet[:-4] - - return { - 'id': video_id, - 'title': title, - 'description': attr.get('entry_descl'), - 'thumbnail': attr.get('entry_image_16_9'), - 'duration': parse_duration(attr.get('entry_durat')), - 'upload_date': upload_date, - 'uploader': attr.get('channel_title'), - 'uploader_id': attr.get('channel_idkey'), - 'formats': formats, - } diff --git a/yt_dlp/extractor/techtalks.py b/yt_dlp/extractor/techtalks.py deleted file mode 100644 index d37de360b..000000000 --- a/yt_dlp/extractor/techtalks.py +++ /dev/null @@ -1,80 +0,0 @@ -import re - -from .common import InfoExtractor -from ..utils import ( - get_element_by_attribute, - clean_html, -) - - -class TechTalksIE(InfoExtractor): - _VALID_URL = r'https?://techtalks\.tv/talks/(?:[^/]+/)?(?P<id>\d+)' - - _TESTS = [{ - 'url': 'http://techtalks.tv/talks/learning-topic-models-going-beyond-svd/57758/', - 'info_dict': { - 'id': '57758', - 'title': 'Learning Topic Models --- Going beyond SVD', - }, - 'playlist': [ - { - 'info_dict': { - 'id': '57758', - 'ext': 'flv', - 'title': 'Learning Topic Models --- Going beyond SVD', - }, - }, - { - 'info_dict': { - 'id': '57758-slides', - 'ext': 'flv', - 'title': 'Learning Topic Models --- Going beyond SVD', - }, - }, - ], - 'params': { - # rtmp download - 'skip_download': True, - }, - }, { - 'url': 'http://techtalks.tv/talks/57758', - 'only_matching': True, - }] - - def _real_extract(self, url): - mobj = self._match_valid_url(url) - talk_id = mobj.group('id') - webpage = self._download_webpage(url, talk_id) - rtmp_url = self._search_regex( - r'netConnectionUrl: \'(.*?)\'', webpage, 'rtmp url') - play_path = self._search_regex( - r'href=\'(.*?)\' [^>]*id="flowplayer_presenter"', - webpage, 'presenter play path') - title = clean_html(get_element_by_attribute('class', 'title', webpage)) - video_info = { - 'id': talk_id, - 'title': title, - 'url': rtmp_url, - 'play_path': play_path, - 'ext': 'flv', - } - m_slides = re.search(r'<a class="slides" href=\'(.*?)\'', webpage) - if m_slides is None: - return video_info - else: - return { - '_type': 'playlist', - 'id': talk_id, - 'title': title, - 'entries': [ - video_info, - # The slides video - { - 'id': talk_id + '-slides', - 'title': title, - 'url': rtmp_url, - 'play_path': m_slides.group(1), - 'ext': 'flv', - }, - ], - } diff --git a/yt_dlp/extractor/telecinco.py b/yt_dlp/extractor/telecinco.py index 20bb82420..a3f0c7cda 100644 --- a/yt_dlp/extractor/telecinco.py +++ b/yt_dlp/extractor/telecinco.py @@ -77,7 +77,6 @@ class TelecincoIE(InfoExtractor): 'url': 'http://www.telecinco.es/espanasinirmaslejos/Espana-gran-destino-turistico_2_1240605043.html', 'only_matching': True, }, { - # ooyala video 'url': 'http://www.cuatro.com/chesterinlove/a-carta/chester-chester_in_love-chester_edu_2_2331030022.html', 'only_matching': True, }] diff --git a/yt_dlp/extractor/telewebion.py b/yt_dlp/extractor/telewebion.py index 550549f05..9378ed021 100644 
--- a/yt_dlp/extractor/telewebion.py +++ b/yt_dlp/extractor/telewebion.py @@ -1,52 +1,133 @@ +from __future__ import annotations + +import json +from functools import partial +from textwrap import dedent + from .common import InfoExtractor +from ..utils import ExtractorError, format_field, int_or_none, parse_iso8601 +from ..utils.traversal import traverse_obj + + +def _fmt_url(url): + return partial(format_field, template=url, default=None) class TelewebionIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?telewebion\.com/#!/episode/(?P<id>\d+)' - - _TEST = { - 'url': 'http://www.telewebion.com/#!/episode/1263668/', + _VALID_URL = r'https?://(?:www\.)?telewebion\.com/episode/(?P<id>(?:0x[a-fA-F\d]+|\d+))' + _TESTS = [{ + 'url': 'http://www.telewebion.com/episode/0x1b3139c/', 'info_dict': { - 'id': '1263668', + 'id': '0x1b3139c', 'ext': 'mp4', - 'title': 'قرعه\u200cکشی لیگ قهرمانان اروپا', - 'thumbnail': r're:^https?://.*\.jpg', + 'title': 'قرعه‌کشی لیگ قهرمانان اروپا', + 'series': '+ فوتبال', + 'series_id': '0x1b2505c', + 'channel': 'شبکه 3', + 'channel_id': '0x1b1a761', + 'channel_url': 'https://telewebion.com/live/tv3', + 'timestamp': 1425522414, + 'upload_date': '20150305', + 'release_timestamp': 1425517020, + 'release_date': '20150305', + 'duration': 420, 'view_count': int, + 'tags': ['ورزشی', 'لیگ اروپا', 'اروپا'], + 'thumbnail': 'https://static.telewebion.com/episodeImages/YjFhM2MxMDBkMDNiZTU0MjE5YjQ3ZDY0Mjk1ZDE0ZmUwZWU3OTE3OWRmMDAyODNhNzNkNjdmMWMzMWIyM2NmMA/default', }, - 'params': { - # m3u8 download - 'skip_download': True, + 'skip_download': 'm3u8', + }, { + 'url': 'https://telewebion.com/episode/162175536', + 'info_dict': { + 'id': '0x9aa9a30', + 'ext': 'mp4', + 'title': 'کارما یعنی این !', + 'series': 'پاورقی', + 'series_id': '0x29a7426', + 'channel': 'شبکه 2', + 'channel_id': '0x1b1a719', + 'channel_url': 'https://telewebion.com/live/tv2', + 'timestamp': 1699979968, + 'upload_date': '20231114', + 'release_timestamp': 1699991638, + 'release_date': '20231114', + 'duration': 78, + 'view_count': int, + 'tags': ['کلیپ های منتخب', ' کلیپ طنز ', ' کلیپ سیاست ', 'پاورقی', 'ویژه فلسطین'], + 'thumbnail': 'https://static.telewebion.com/episodeImages/871e9455-7567-49a5-9648-34c22c197f5f/default', }, - } + 'skip_download': 'm3u8', + }] + + def _call_graphql_api( + self, operation, video_id, query, + variables: dict[str, tuple[str, str]] | None = None, + note='Downloading GraphQL JSON metadata', + ): + parameters = '' + if variables: + parameters = ', '.join(f'${name}: {type_}' for name, (type_, _) in variables.items()) + parameters = f'({parameters})' + + result = self._download_json('https://graph.telewebion.com/graphql', video_id, note, data=json.dumps({ + 'operationName': operation, + 'query': f'query {operation}{parameters} @cacheControl(maxAge: 60) {{{query}\n}}\n', + 'variables': {name: value for name, (_, value) in (variables or {}).items()} + }, separators=(',', ':')).encode(), headers={ + 'Content-Type': 'application/json', + 'Accept': 'application/json', + }) + if not result or traverse_obj(result, 'errors'): + message = ', '.join(traverse_obj(result, ('errors', ..., 'message', {str}))) + raise ExtractorError(message or 'Unknown GraphQL API error') + + return result['data'] def _real_extract(self, url): video_id = self._match_id(url) + if not video_id.startswith('0x'): + video_id = hex(int(video_id)) - secure_token = self._download_webpage( - 'http://m.s2.telewebion.com/op/op?action=getSecurityToken', video_id) - episode_details = self._download_json( - 
'http://m.s2.telewebion.com/op/op', video_id, - query={'action': 'getEpisodeDetails', 'episode_id': video_id}) + episode_data = self._call_graphql_api('getEpisodeDetail', video_id, dedent(''' + queryEpisode(filter: {EpisodeID: $EpisodeId}, first: 1) { + title + program { + ProgramID + title + } + image + view_count + duration + started_at + created_at + channel { + ChannelID + name + descriptor + } + tags { + name + } + } + '''), {'EpisodeId': ('[ID!]', video_id)}) - m3u8_url = 'http://m.s1.telewebion.com/smil/%s.m3u8?filepath=%s&m3u8=1&secure_token=%s' % ( - video_id, episode_details['file_path'], secure_token) - formats = self._extract_m3u8_formats( - m3u8_url, video_id, ext='mp4', m3u8_id='hls') - - picture_paths = [ - episode_details.get('picture_path'), - episode_details.get('large_picture_path'), - ] - - thumbnails = [{ - 'url': picture_path, - 'preference': idx, - } for idx, picture_path in enumerate(picture_paths) if picture_path is not None] - - return { - 'id': video_id, - 'title': episode_details['title'], - 'formats': formats, - 'thumbnails': thumbnails, - 'view_count': episode_details.get('view_count'), - } + info_dict = traverse_obj(episode_data, ('queryEpisode', 0, { + 'title': ('title', {str}), + 'view_count': ('view_count', {int_or_none}), + 'duration': ('duration', {int_or_none}), + 'tags': ('tags', ..., 'name', {str}), + 'release_timestamp': ('started_at', {parse_iso8601}), + 'timestamp': ('created_at', {parse_iso8601}), + 'series': ('program', 'title', {str}), + 'series_id': ('program', 'ProgramID', {str}), + 'channel': ('channel', 'name', {str}), + 'channel_id': ('channel', 'ChannelID', {str}), + 'channel_url': ('channel', 'descriptor', {_fmt_url('https://telewebion.com/live/%s')}), + 'thumbnail': ('image', {_fmt_url('https://static.telewebion.com/episodeImages/%s/default')}), + 'formats': ( + 'channel', 'descriptor', {str}, + {_fmt_url(f'https://cdna.telewebion.com/%s/episode/{video_id}/playlist.m3u8')}, + {partial(self._extract_m3u8_formats, video_id=video_id, ext='mp4', m3u8_id='hls')}), + })) + info_dict['id'] = video_id + return info_dict diff --git a/yt_dlp/extractor/tenplay.py b/yt_dlp/extractor/tenplay.py index c7097cf02..7ce7cbf84 100644 --- a/yt_dlp/extractor/tenplay.py +++ b/yt_dlp/extractor/tenplay.py @@ -1,9 +1,11 @@ -from datetime import datetime import base64 +import functools +import itertools +from datetime import datetime from .common import InfoExtractor from ..networking import HEADRequest -from ..utils import int_or_none, urlencode_postdata +from ..utils import int_or_none, traverse_obj, urlencode_postdata, urljoin class TenPlayIE(InfoExtractor): @@ -113,3 +115,55 @@ def _real_extract(self, url): 'uploader': 'Channel 10', 'uploader_id': '2199827728001', } + + +class TenPlaySeasonIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?10play\.com\.au/(?P<show>[^/?#]+)/episodes/(?P<season>[^/?#]+)/?(?:$|[?#])' + _TESTS = [{ + 'url': 'https://10play.com.au/masterchef/episodes/season-14', + 'info_dict': { + 'title': 'Season 14', + 'id': 'MjMyOTIy', + }, + 'playlist_mincount': 64, + }, { + 'url': 'https://10play.com.au/the-bold-and-the-beautiful-fast-tracked/episodes/season-2022', + 'info_dict': { + 'title': 'Season 2022', + 'id': 'Mjc0OTIw', + }, + 'playlist_mincount': 256, + }] + + def _entries(self, load_more_url, display_id=None): + skip_ids = [] + for page in itertools.count(1): + episodes_carousel = self._download_json( + load_more_url, display_id, query={'skipIds[]': skip_ids}, + note=f'Fetching episodes page {page}') + + episodes_chunk = 
episodes_carousel['items'] + skip_ids.extend(ep['id'] for ep in episodes_chunk) + + for ep in episodes_chunk: + yield ep['cardLink'] + if not episodes_carousel['hasMore']: + break + + def _real_extract(self, url): + show, season = self._match_valid_url(url).group('show', 'season') + season_info = self._download_json( + f'https://10play.com.au/api/shows/{show}/episodes/{season}', f'{show}/{season}') + + episodes_carousel = traverse_obj(season_info, ( + 'content', 0, 'components', ( + lambda _, v: v['title'].lower() == 'episodes', + (..., {dict}), + )), get_all=False) or {} + + playlist_id = episodes_carousel['tpId'] + + return self.playlist_from_matches( + self._entries(urljoin(url, episodes_carousel['loadMoreUrl']), playlist_id), + playlist_id, traverse_obj(season_info, ('content', 0, 'title', {str})), + getter=functools.partial(urljoin, url)) diff --git a/yt_dlp/extractor/theguardian.py b/yt_dlp/extractor/theguardian.py new file mode 100644 index 000000000..a231eccf4 --- /dev/null +++ b/yt_dlp/extractor/theguardian.py @@ -0,0 +1,135 @@ +import itertools + +from .common import InfoExtractor +from ..utils import ( + clean_html, + extract_attributes, + get_element_by_class, + get_element_html_by_class, + get_elements_html_by_class, + parse_qs, + traverse_obj, + unified_strdate, + urljoin +) + + +class TheGuardianPodcastIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?theguardian\.com/\w+/audio/\d{4}/\w{3}/\d{1,2}/(?P<id>[\w-]+)' + _TESTS = [{ + 'url': 'https://www.theguardian.com/news/audio/2023/nov/03/we-are-just-getting-started-the-plastic-eating-bacteria-that-could-change-the-world-podcast', + 'md5': 'd1771744681789b4cd7da2a08e487702', + 'info_dict': { + 'id': 'we-are-just-getting-started-the-plastic-eating-bacteria-that-could-change-the-world-podcast', + 'ext': 'mp3', + 'title': '‘We are just getting started’: the plastic-eating bacteria that could change the world – podcast', + 'description': 'md5:cfd3df2791d394d2ab62cd571d5207ee', + 'creator': 'Stephen Buranyi', + 'thumbnail': 'md5:73c12558fcb3b0e2a59422bfb33b3f79', + 'release_date': '20231103' + } + }, { + 'url': 'https://www.theguardian.com/news/audio/2023/oct/30/the-trials-of-robert-habeck-is-the-worlds-most-powerful-green-politician-doomed-to-fail-podcast', + 'md5': 'd1771744681789b4cd7da2a08e487702', + 'info_dict': { + 'id': 'the-trials-of-robert-habeck-is-the-worlds-most-powerful-green-politician-doomed-to-fail-podcast', + 'ext': 'mp3', + 'title': 'The trials of Robert Habeck: is the world’s most powerful green politician doomed to fail? 
– podcast', + 'description': 'md5:1b5cf6582d1771c6b7077784b5456994', + 'creator': 'Philip Oltermann', + 'thumbnail': 'md5:6e5c5ec43843e956e20be793722e9080', + 'release_date': '20231030' + } + }, { + 'url': 'https://www.theguardian.com/football/audio/2023/nov/06/arsenal-feel-hard-done-by-and-luton-hold-liverpool-football-weekly', + 'md5': 'a2fcff6f8e060a95b1483295273dc35e', + 'info_dict': { + 'id': 'arsenal-feel-hard-done-by-and-luton-hold-liverpool-football-weekly', + 'ext': 'mp3', + 'title': 'Arsenal feel hard done by and Luton hold Liverpool – Football Weekly', + 'description': 'md5:286a9fbddaeb7c83cc65d1c4a5330b2a', + 'creator': 'Max Rushden', + 'thumbnail': 'md5:93eb7d6440f1bb94eb3a6cad63f48afd', + 'release_date': '20231106' + } + }, { + 'url': 'https://www.theguardian.com/politics/audio/2023/nov/02/the-covid-inquiry-politics-weekly-uk-podcast', + 'md5': '06a0f7e9701a80c8064a5d35690481ec', + 'info_dict': { + 'id': 'the-covid-inquiry-politics-weekly-uk-podcast', + 'ext': 'mp3', + 'title': 'The Covid inquiry | Politics Weekly UK - podcast', + 'description': 'md5:207c98859c14903582b17d25b014046e', + 'creator': 'Gaby Hinsliff', + 'thumbnail': 'md5:28932a7b5a25b057be330d2ed70ea7f3', + 'release_date': '20231102' + } + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + return { + 'id': video_id, + 'title': self._og_search_title(webpage) or get_element_by_class('content__headline', webpage), + 'description': self._og_search_description(webpage), + 'creator': self._html_search_meta('author', webpage), + 'thumbnail': self._og_search_thumbnail(webpage), + 'release_date': unified_strdate(self._html_search_meta('article:published_time', webpage)), + 'url': extract_attributes(get_element_html_by_class( + 'podcast__player', webpage) or '').get('data-source'), + } + + +class TheGuardianPodcastPlaylistIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?theguardian\.com/\w+/series/(?P<id>[\w-]+)(?:\?page=\d+)?' 
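The series extractor added below walks ?page=N and stops once the site redirects to a URL whose query no longer carries a 'page' parameter, which marks the end of the listing. A condensed sketch of that loop; download() is a hypothetical stand-in for _download_webpage_handle:

    import itertools

    from yt_dlp.utils import parse_qs

    def iter_pages(download):
        # download(page) -> (webpage, final_url_after_redirects)
        for page in itertools.count(1):
            webpage, final_url = download(page)
            if 'page' not in parse_qs(final_url):  # redirected past the last page
                break
            yield webpage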
+ _TESTS = [{ + 'url': 'https://www.theguardian.com/football/series/theguardianswomensfootballweekly', + 'info_dict': { + 'id': 'theguardianswomensfootballweekly', + 'title': "The Guardian's Women's Football Weekly", + 'description': 'md5:e2cc021311e582d29935a73614a43f51' + }, + 'playlist_mincount': 69 + }, { + 'url': 'https://www.theguardian.com/news/series/todayinfocus?page=2', + 'info_dict': { + 'id': 'todayinfocus', + 'title': 'Today in Focus', + 'description': 'md5:0f097764fc0d359e0b6eb537be0387e2' + }, + 'playlist_mincount': 1261 + }, { + 'url': 'https://www.theguardian.com/news/series/the-audio-long-read', + 'info_dict': { + 'id': 'the-audio-long-read', + 'title': 'The Audio Long Read', + 'description': 'md5:5462994a27527309562b25b6defc4ef3' + }, + 'playlist_mincount': 996 + }] + + def _entries(self, url, playlist_id): + for page in itertools.count(1): + webpage, urlh = self._download_webpage_handle( + url, playlist_id, f'Downloading page {page}', query={'page': page}) + if 'page' not in parse_qs(urlh.url): + break + + episodes = get_elements_html_by_class('fc-item--type-media', webpage) + for url_path in traverse_obj(episodes, (..., {extract_attributes}, 'data-id')): + yield url_path + + def _real_extract(self, url): + podcast_id = self._match_id(url) + + webpage = self._download_webpage(url, podcast_id) + + title = clean_html(get_element_by_class( + 'index-page-header__title', webpage) or get_element_by_class('flagship-audio__title', webpage)) + description = self._og_search_description(webpage) or self._html_search_meta( + 'description', webpage) + + return self.playlist_from_matches( + self._entries(url, podcast_id), podcast_id, title, description=description, + ie=TheGuardianPodcastIE, getter=lambda x: urljoin('https://www.theguardian.com', x)) diff --git a/yt_dlp/extractor/theplatform.py b/yt_dlp/extractor/theplatform.py index 433ce8427..9160f5ec6 100644 --- a/yt_dlp/extractor/theplatform.py +++ b/yt_dlp/extractor/theplatform.py @@ -104,6 +104,10 @@ def _add_chapter(start_time, end_time): _add_chapter(chapter.get('startTime'), chapter.get('endTime')) _add_chapter(tp_chapters[-1].get('startTime'), tp_chapters[-1].get('endTime') or duration) + def extract_site_specific_field(field): + # A number of sites have custom-prefixed keys, e.g. 
'cbc$seasonNumber' + return traverse_obj(info, lambda k, v: v and k.endswith(f'${field}'), get_all=False) + return { 'title': info['title'], 'subtitles': subtitles, @@ -113,6 +117,14 @@ def _add_chapter(start_time, end_time): 'timestamp': int_or_none(info.get('pubDate'), 1000) or None, 'uploader': info.get('billingCode'), 'chapters': chapters, + 'creator': traverse_obj(info, ('author', {str})) or None, + 'categories': traverse_obj(info, ( + 'categories', lambda _, v: v.get('label') in ('category', None), 'name', {str})) or None, + 'tags': traverse_obj(info, ('keywords', {lambda x: re.split(r'[;,]\s?', x) if x else None})), + 'location': extract_site_specific_field('region'), + 'series': extract_site_specific_field('show'), + 'season_number': int_or_none(extract_site_specific_field('seasonNumber')), + 'media_type': extract_site_specific_field('programmingType') or extract_site_specific_field('type'), } def _extract_theplatform_metadata(self, path, video_id): diff --git a/yt_dlp/extractor/thisav.py b/yt_dlp/extractor/thisav.py deleted file mode 100644 index b1cd57d1f..000000000 --- a/yt_dlp/extractor/thisav.py +++ /dev/null @@ -1,66 +0,0 @@ -from .common import InfoExtractor -from ..utils import remove_end - - -class ThisAVIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?thisav\.com/video/(?P<id>[0-9]+)/.*' - _TESTS = [{ - # jwplayer - 'url': 'http://www.thisav.com/video/47734/%98%26sup1%3B%83%9E%83%82---just-fit.html', - 'md5': '0480f1ef3932d901f0e0e719f188f19b', - 'info_dict': { - 'id': '47734', - 'ext': 'flv', - 'title': '高樹マリア - Just fit', - 'uploader': 'dj7970', - 'uploader_id': 'dj7970' - } - }, { - # html5 media - 'url': 'http://www.thisav.com/video/242352/nerdy-18yo-big-ass-tattoos-and-glasses.html', - 'md5': 'ba90c076bd0f80203679e5b60bf523ee', - 'info_dict': { - 'id': '242352', - 'ext': 'mp4', - 'title': 'Nerdy 18yo Big Ass Tattoos and Glasses', - 'uploader': 'cybersluts', - 'uploader_id': 'cybersluts', - }, - }] - - def _real_extract(self, url): - mobj = self._match_valid_url(url) - - video_id = mobj.group('id') - webpage = self._download_webpage(url, video_id) - title = remove_end(self._html_extract_title(webpage), ' - 視頻 - ThisAV.com-世界第一中文成人娛樂網站') - video_url = self._html_search_regex( - r"addVariable\('file','([^']+)'\);", webpage, 'video url', default=None) - if video_url: - info_dict = { - 'formats': [{ - 'url': video_url, - }], - } - else: - entries = self._parse_html5_media_entries(url, webpage, video_id) - if entries: - info_dict = entries[0] - else: - info_dict = self._extract_jwplayer_data( - webpage, video_id, require_title=False) - uploader = self._html_search_regex( - r': <a href="http://www\.thisav\.com/user/[0-9]+/(?:[^"]+)">([^<]+)</a>', - webpage, 'uploader name', fatal=False) - uploader_id = self._html_search_regex( - r': <a href="http://www\.thisav\.com/user/[0-9]+/([^"]+)">(?:[^<]+)</a>', - webpage, 'uploader id', fatal=False) - - info_dict.update({ - 'id': video_id, - 'uploader': uploader, - 'uploader_id': uploader_id, - 'title': title, - }) - - return info_dict diff --git a/yt_dlp/extractor/thisoldhouse.py b/yt_dlp/extractor/thisoldhouse.py index cc7beeea5..15f8380d3 100644 --- a/yt_dlp/extractor/thisoldhouse.py +++ b/yt_dlp/extractor/thisoldhouse.py @@ -1,11 +1,23 @@ +import json + from .common import InfoExtractor +from .zype import ZypeIE from ..networking import HEADRequest +from ..networking.exceptions import HTTPError +from ..utils import ( + ExtractorError, + filter_dict, + parse_qs, + try_call, + urlencode_postdata, +) class 
ThisOldHouseIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?thisoldhouse\.com/(?:watch|how-to|tv-episode|(?:[^/]+/)?\d+)/(?P<id>[^/?#]+)' + _NETRC_MACHINE = 'thisoldhouse' + _VALID_URL = r'https?://(?:www\.)?thisoldhouse\.com/(?:watch|how-to|tv-episode|(?:[^/?#]+/)?\d+)/(?P<id>[^/?#]+)' _TESTS = [{ - 'url': 'https://www.thisoldhouse.com/how-to/how-to-build-storage-bench', + 'url': 'https://www.thisoldhouse.com/furniture/21017078/how-to-build-a-storage-bench', 'info_dict': { 'id': '5dcdddf673c3f956ef5db202', 'ext': 'mp4', @@ -23,13 +35,16 @@ class ThisOldHouseIE(InfoExtractor): 'skip_download': True, }, }, { + # Page no longer has video 'url': 'https://www.thisoldhouse.com/watch/arlington-arts-crafts-arts-and-crafts-class-begins', 'only_matching': True, }, { + # 404 Not Found 'url': 'https://www.thisoldhouse.com/tv-episode/ask-toh-shelf-rough-electric', 'only_matching': True, }, { - 'url': 'https://www.thisoldhouse.com/furniture/21017078/how-to-build-a-storage-bench', + # 404 Not Found + 'url': 'https://www.thisoldhouse.com/how-to/how-to-build-storage-bench', 'only_matching': True, }, { 'url': 'https://www.thisoldhouse.com/21113884/s41-e13-paradise-lost', @@ -39,17 +54,51 @@ class ThisOldHouseIE(InfoExtractor): 'url': 'https://www.thisoldhouse.com/21083431/seaside-transformation-the-westerly-project', 'only_matching': True, }] - _ZYPE_TMPL = 'https://player.zype.com/embed/%s.html?api_key=hsOk_yMSPYNrT22e9pu8hihLXjaZf0JW5jsOWv4ZqyHJFvkJn6rtToHl09tbbsbe' + + _LOGIN_URL = 'https://login.thisoldhouse.com/usernamepassword/login' + + def _perform_login(self, username, password): + self._request_webpage( + HEADRequest('https://www.thisoldhouse.com/insider'), None, 'Requesting session cookies') + urlh = self._request_webpage( + 'https://www.thisoldhouse.com/wp-login.php', None, 'Requesting login info', + errnote='Unable to login', query={'redirect_to': 'https://www.thisoldhouse.com/insider'}) + + try: + auth_form = self._download_webpage( + self._LOGIN_URL, None, 'Submitting credentials', headers={ + 'Content-Type': 'application/json', + 'Referer': urlh.url, + }, data=json.dumps(filter_dict({ + **{('client_id' if k == 'client' else k): v[0] for k, v in parse_qs(urlh.url).items()}, + 'tenant': 'thisoldhouse', + 'username': username, + 'password': password, + 'popup_options': {}, + 'sso': True, + '_csrf': try_call(lambda: self._get_cookies(self._LOGIN_URL)['_csrf'].value), + '_intstate': 'deprecated', + }), separators=(',', ':')).encode()) + except ExtractorError as e: + if isinstance(e.cause, HTTPError) and e.cause.status == 401: + raise ExtractorError('Invalid username or password', expected=True) + raise + + self._request_webpage( + 'https://login.thisoldhouse.com/login/callback', None, 'Completing login', + data=urlencode_postdata(self._hidden_inputs(auth_form))) def _real_extract(self, url): display_id = self._match_id(url) webpage = self._download_webpage(url, display_id) if 'To Unlock This content' in webpage: - self.raise_login_required(method='cookies') - video_url = self._search_regex( + self.raise_login_required( + 'This video is only available for subscribers. 
' + 'Note that --cookies-from-browser may not work due to this site using session cookies') + + video_url, video_id = self._search_regex( r'<iframe[^>]+src=[\'"]((?:https?:)?//(?:www\.)?thisoldhouse\.(?:chorus\.build|com)/videos/zype/([0-9a-f]{24})[^\'"]*)[\'"]', - webpage, 'video url') - if 'subscription_required=true' in video_url or 'c-entry-group-labels__image' in webpage: - return self.url_result(self._request_webpage(HEADRequest(video_url), display_id).url, 'Zype', display_id) - video_id = self._search_regex(r'(?:https?:)?//(?:www\.)?thisoldhouse\.(?:chorus\.build|com)/videos/zype/([0-9a-f]{24})', video_url, 'video id') - return self.url_result(self._ZYPE_TMPL % video_id, 'Zype', video_id) + webpage, 'video url', group=(1, 2)) + video_url = self._request_webpage(HEADRequest(video_url), video_id, 'Resolving Zype URL').url + + return self.url_result(video_url, ZypeIE, video_id) diff --git a/yt_dlp/extractor/tinypic.py b/yt_dlp/extractor/tinypic.py deleted file mode 100644 index 216208cbd..000000000 --- a/yt_dlp/extractor/tinypic.py +++ /dev/null @@ -1,54 +0,0 @@ -import re - -from .common import InfoExtractor -from ..utils import ExtractorError - - -class TinyPicIE(InfoExtractor): - IE_NAME = 'tinypic' - IE_DESC = 'tinypic.com videos' - _VALID_URL = r'https?://(?:.+?\.)?tinypic\.com/player\.php\?v=(?P<id>[^&]+)&s=\d+' - - _TESTS = [ - { - 'url': 'http://tinypic.com/player.php?v=6xw7tc%3E&s=5#.UtqZmbRFCM8', - 'md5': '609b74432465364e72727ebc6203f044', - 'info_dict': { - 'id': '6xw7tc', - 'ext': 'flv', - 'title': 'shadow phenomenon weird', - }, - }, - { - 'url': 'http://de.tinypic.com/player.php?v=dy90yh&s=8', - 'only_matching': True, - } - ] - - def _real_extract(self, url): - mobj = self._match_valid_url(url) - video_id = mobj.group('id') - - webpage = self._download_webpage(url, video_id, 'Downloading page') - - mobj = re.search(r'(?m)fo\.addVariable\("file",\s"(?P<fileid>[\da-z]+)"\);\n' - r'\s+fo\.addVariable\("s",\s"(?P<serverid>\d+)"\);', webpage) - if mobj is None: - raise ExtractorError('Video %s does not exist' % video_id, expected=True) - - file_id = mobj.group('fileid') - server_id = mobj.group('serverid') - - KEYWORDS_SUFFIX = ', Video, images, photos, videos, myspace, ebay, video hosting, photo hosting' - keywords = self._html_search_meta('keywords', webpage, 'title') - title = keywords[:-len(KEYWORDS_SUFFIX)] if keywords.endswith(KEYWORDS_SUFFIX) else '' - - video_url = 'http://v%s.tinypic.com/%s.flv' % (server_id, file_id) - thumbnail = 'http://v%s.tinypic.com/%s_th.jpg' % (server_id, file_id) - - return { - 'id': file_id, - 'url': video_url, - 'thumbnail': thumbnail, - 'title': title - } diff --git a/yt_dlp/extractor/tokentube.py b/yt_dlp/extractor/tokentube.py deleted file mode 100644 index d022e2753..000000000 --- a/yt_dlp/extractor/tokentube.py +++ /dev/null @@ -1,153 +0,0 @@ -import functools -import re - -from .common import InfoExtractor -from ..utils import ( - clean_html, - get_element_by_class, - parse_count, - remove_end, - unified_strdate, - js_to_json, - OnDemandPagedList, -) - - -class TokentubeIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?tokentube\.net/(?:view\?[vl]=|[vl]/)(?P<id>\d+)' - _TESTS = [{ - 'url': 'https://tokentube.net/l/3236632011/Praise-A-Thon-Pastori-Chrisin-ja-Pastori-Bennyn-kanssa-27-8-2021', - 'info_dict': { - 'id': '3236632011', - 'ext': 'mp4', - 'title': 'Praise-A-Thon Pastori Chrisin ja Pastori Bennyn kanssa 27.8.2021', - 'description': '', - 'uploader': 'Pastori Chris - Rapsodia.fi', - 'upload_date': '20210827', - }, - 
'params': { - 'skip_download': True, - }, - }, { - 'url': 'https://tokentube.net/v/3950239124/Linux-Ubuntu-Studio-perus-k%C3%A4ytt%C3%B6', - 'md5': '0e1f00421f501f5eada9890d38fcfb56', - 'info_dict': { - 'id': '3950239124', - 'ext': 'mp4', - 'title': 'Linux Ubuntu Studio perus käyttö', - 'description': 'md5:46077d0daaba1974f2dc381257f9d64c', - 'uploader': 'jyrilehtonen', - 'upload_date': '20210825', - }, - }, { - 'url': 'https://tokentube.net/view?v=3582463289', - 'info_dict': { - 'id': '3582463289', - 'ext': 'mp4', - 'title': 'Police for Freedom - toiminta aloitetaan Suomessa ❤️??', - 'description': 'md5:37ebf1cb44264e0bf23ed98b337ee63e', - 'uploader': 'Voitontie', - 'upload_date': '20210428', - } - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - - title = self._html_search_regex(r'<h1\s*class=["\']title-text["\']>(.+?)</h1>', webpage, 'title') - - data_json = self._html_search_regex(r'({["\']html5["\'].+?}}}+)', webpage, 'data json') - data_json = self._parse_json(js_to_json(data_json), video_id, fatal=False) - - sources = data_json.get('sources') or self._parse_json( - self._html_search_regex(r'updateSrc\(([^\)]+)\)', webpage, 'sources'), - video_id, transform_source=js_to_json) - - formats = [{ - 'url': format.get('src'), - 'format_id': format.get('label'), - 'height': format.get('res'), - } for format in sources] - - view_count = parse_count(self._html_search_regex( - r'<p\s*class=["\']views_counter["\']>\s*([\d\.,]+)\s*<span>views?</span></p>', - webpage, 'view_count', fatal=False)) - - like_count = parse_count(self._html_search_regex( - r'<div\s*class="sh_button\s*likes_count">\s*(\d+)\s*</div>', - webpage, 'like count', fatal=False)) - - dislike_count = parse_count(self._html_search_regex( - r'<div\s*class="sh_button\s*dislikes_count">\s*(\d+)\s*</div>', - webpage, 'dislike count', fatal=False)) - - upload_date = unified_strdate(self._html_search_regex( - r'<span\s*class="p-date">Published\s*on\s+([^<]+)', - webpage, 'upload date', fatal=False)) - - uploader = self._html_search_regex( - r'<a\s*class="place-left"[^>]+>(.+?)</a>', - webpage, 'uploader', fatal=False) - - description = (clean_html(get_element_by_class('p-d-txt', webpage)) - or self._html_search_meta(('og:description', 'description', 'twitter:description'), webpage)) - - description = remove_end(description, 'Category') - - return { - 'id': video_id, - 'formats': formats, - 'title': title, - 'view_count': view_count, - 'like_count': like_count, - 'dislike_count': dislike_count, - 'upload_date': upload_date, - 'description': description, - 'uploader': uploader, - } - - -class TokentubeChannelIE(InfoExtractor): - _PAGE_SIZE = 20 - IE_NAME = 'Tokentube:channel' - _VALID_URL = r'https?://(?:www\.)?tokentube\.net/channel/(?P<id>\d+)/[^/]+(?:/videos)?' 
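The removed Tokentube channel extractor below is a compact example of yt-dlp's lazy pagination: `OnDemandPagedList` wraps a page-fetching callback so that only the pages a consumer actually reads are downloaded. A hedged sketch of the mechanism with the network call stubbed out:

```python
# Sketch of the OnDemandPagedList pattern used by the removed
# TokentubeChannelIE; fetch_page stands in for the real HTTP request.
from yt_dlp.utils import OnDemandPagedList

PAGE_SIZE = 20


def fetch_page(page_num):  # page numbers start at 0
    # A real extractor would download f'...&page={page_num + 1}' here
    # and yield one url_result per regex match on the response
    for i in range(PAGE_SIZE):
        yield f'video-{page_num * PAGE_SIZE + i}'


entries = OnDemandPagedList(fetch_page, PAGE_SIZE)
print(entries.getslice(0, 5))  # only page 0 is ever generated
```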
- _TESTS = [{ - 'url': 'https://tokentube.net/channel/3697658904/TokenTube', - 'info_dict': { - 'id': '3697658904', - }, - 'playlist_mincount': 7, - }, { - 'url': 'https://tokentube.net/channel/3353234420/Linux/videos', - 'info_dict': { - 'id': '3353234420', - }, - 'playlist_mincount': 20, - }, { - 'url': 'https://tokentube.net/channel/3475834195/Voitontie', - 'info_dict': { - 'id': '3475834195', - }, - 'playlist_mincount': 150, - }] - - def _fetch_page(self, channel_id, page): - page += 1 - videos_info = self._download_webpage( - f'https://tokentube.net/videos?p=0&m=1&sort=recent&u={channel_id}&page={page}', - channel_id, headers={'X-Requested-With': 'XMLHttpRequest'}, - note=f'Downloading page {page}', fatal=False) - if '</i> Sorry, no results were found.' not in videos_info: - for path, media_id in re.findall( - r'<a[^>]+\bhref=["\']([^"\']+/[lv]/(\d+)/\S+)["\'][^>]+>', - videos_info): - yield self.url_result(path, ie=TokentubeIE.ie_key(), video_id=media_id) - - def _real_extract(self, url): - channel_id = self._match_id(url) - - entries = OnDemandPagedList(functools.partial( - self._fetch_page, channel_id), self._PAGE_SIZE) - - return self.playlist_result(entries, channel_id) diff --git a/yt_dlp/extractor/toypics.py b/yt_dlp/extractor/toypics.py index bc7336186..aa7ee6c48 100644 --- a/yt_dlp/extractor/toypics.py +++ b/yt_dlp/extractor/toypics.py @@ -3,6 +3,7 @@ class ToypicsIE(InfoExtractor): + _WORKING = False IE_DESC = 'Toypics video' _VALID_URL = r'https?://videos\.toypics\.net/view/(?P<id>[0-9]+)' _TEST = { @@ -43,6 +44,7 @@ def _real_extract(self, url): class ToypicsUserIE(InfoExtractor): + _WORKING = False IE_DESC = 'Toypics user profile' _VALID_URL = r'https?://videos\.toypics\.net/(?!view)(?P<id>[^/?#&]+)' _TEST = { diff --git a/yt_dlp/extractor/trilulilu.py b/yt_dlp/extractor/trilulilu.py deleted file mode 100644 index fb97be737..000000000 --- a/yt_dlp/extractor/trilulilu.py +++ /dev/null @@ -1,100 +0,0 @@ -from .common import InfoExtractor -from ..utils import ( - ExtractorError, - int_or_none, - parse_iso8601, -) - - -class TriluliluIE(InfoExtractor): - _VALID_URL = r'https?://(?:(?:www|m)\.)?trilulilu\.ro/(?:[^/]+/)?(?P<id>[^/#\?]+)' - _TESTS = [{ - 'url': 'http://www.trilulilu.ro/big-buck-bunny-1', - 'md5': '68da087b676a6196a413549212f60cc6', - 'info_dict': { - 'id': 'ae2899e124140b', - 'ext': 'mp4', - 'title': 'Big Buck Bunny', - 'description': ':) pentru copilul din noi', - 'uploader_id': 'chipy', - 'upload_date': '20120304', - 'timestamp': 1330830647, - 'uploader': 'chipy', - 'view_count': int, - 'like_count': int, - 'comment_count': int, - }, - }, { - 'url': 'http://www.trilulilu.ro/adena-ft-morreti-inocenta', - 'md5': '929dfb8729dc71750463af88bbbbf4a4', - 'info_dict': { - 'id': 'f299710e3c91c5', - 'ext': 'mp4', - 'title': 'Adena ft. 
Morreti - Inocenta', - 'description': 'pop music', - 'uploader_id': 'VEVOmixt', - 'upload_date': '20151204', - 'uploader': 'VEVOmixt', - 'timestamp': 1449187937, - 'view_count': int, - 'like_count': int, - 'comment_count': int, - }, - }] - - def _real_extract(self, url): - display_id = self._match_id(url) - media_info = self._download_json('http://m.trilulilu.ro/%s?format=json' % display_id, display_id) - - age_limit = 0 - errors = media_info.get('errors', {}) - if errors.get('friends'): - raise ExtractorError('This video is private.', expected=True) - elif errors.get('geoblock'): - raise ExtractorError('This video is not available in your country.', expected=True) - elif errors.get('xxx_unlogged'): - age_limit = 18 - - media_class = media_info.get('class') - if media_class not in ('video', 'audio'): - raise ExtractorError('not a video or an audio') - - user = media_info.get('user', {}) - - thumbnail = media_info.get('cover_url') - if thumbnail: - thumbnail.format(width='1600', height='1200') - - # TODO: get correct ext for audio files - stream_type = media_info.get('stream_type') - formats = [{ - 'url': media_info['href'], - 'ext': stream_type, - }] - if media_info.get('is_hd'): - formats.append({ - 'format_id': 'hd', - 'url': media_info['hrefhd'], - 'ext': stream_type, - }) - if media_class == 'audio': - formats[0]['vcodec'] = 'none' - else: - formats[0]['format_id'] = 'sd' - - return { - 'id': media_info['identifier'].split('|')[1], - 'display_id': display_id, - 'formats': formats, - 'title': media_info['title'], - 'description': media_info.get('description'), - 'thumbnail': thumbnail, - 'uploader_id': user.get('username'), - 'uploader': user.get('fullname'), - 'timestamp': parse_iso8601(media_info.get('published'), ' '), - 'duration': int_or_none(media_info.get('duration')), - 'view_count': int_or_none(media_info.get('count_views')), - 'like_count': int_or_none(media_info.get('count_likes')), - 'comment_count': int_or_none(media_info.get('count_comments')), - 'age_limit': age_limit, - } diff --git a/yt_dlp/extractor/trtworld.py b/yt_dlp/extractor/trtworld.py new file mode 100644 index 000000000..dbb72a4fe --- /dev/null +++ b/yt_dlp/extractor/trtworld.py @@ -0,0 +1,101 @@ +from .common import InfoExtractor +from ..utils import ExtractorError, determine_ext, parse_iso8601, url_or_none +from ..utils.traversal import traverse_obj + + +class TrtWorldIE(InfoExtractor): + _VALID_URL = r'https?://www\.trtworld\.com/video/[\w-]+/[\w-]+-(?P<id>\d+)' + + _TESTS = [{ + 'url': 'https://www.trtworld.com/video/news/turkiye-switches-to-sustainable-tourism-16067690', + 'info_dict': { + 'id': '16067690', + 'ext': 'mp4', + 'title': 'Türkiye switches to sustainable tourism', + 'release_timestamp': 1701529569, + 'release_date': '20231202', + 'thumbnail': 'https://cdn-i.pr.trt.com.tr/trtworld/17647563_0-0-1920-1080.jpeg', + 'description': 'md5:0a975c04257fb529c8f99c7b76a2cf12', + } + }, { + 'url': 'https://www.trtworld.com/video/one-offs/frames-from-anatolia-recreating-a-james-bond-scene-in-istanbuls-grand-bazaar-14541780', + 'info_dict': { + 'id': '14541780', + 'ext': 'mp4', + 'title': 'Frames From Anatolia: Recreating a ‘James Bond’ Scene in Istanbul’s Grand Bazaar', + 'release_timestamp': 1692440844, + 'release_date': '20230819', + 'thumbnail': 'https://cdn-i.pr.trt.com.tr/trtworld/16939810_0-0-1920-1080.jpeg', + 'description': 'md5:4050e21570cc3c40b6c9badae800a94f', + } + }, { + 'url': 'https://www.trtworld.com/video/the-newsmakers/can-sudan-find-peace-amidst-failed-transition-to-democracy-12904760', + 
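The new TrtWorldIE below carries an `NB` comment worth spelling out: the site sometimes serves plain MP4 files under its `hls_url` key, so the extractor branches on `determine_ext()` rather than trusting the key name. The branch in isolation (a sketch, not patch code):

```python
# Mirrors the format branching in TrtWorldIE._real_extract below:
# m3u8 manifests are expanded via _extract_m3u8_formats(); anything
# else is registered as a single direct HTTP format.
from yt_dlp.utils import determine_ext


def classify_media_url(media_url):
    return 'hls' if determine_ext(media_url) == 'm3u8' else 'http'


print(classify_media_url('https://example.com/master.m3u8?token=abc'))  # hls
print(classify_media_url('https://example.com/clip.mp4'))  # http
```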
'info_dict': { + 'id': '12904760', + 'ext': 'mp4', + 'title': 'Can Sudan find peace amidst failed transition to democracy?', + 'release_timestamp': 1681972747, + 'release_date': '20230420', + 'thumbnail': 'http://cdni0.trtworld.com/w768/q70/154214_NMYOUTUBETEMPLATE1_1681833018736.jpg' + } + }, { + 'url': 'https://www.trtworld.com/video/africa-matters/locals-learning-to-cope-with-rising-tides-of-kenyas-great-lakes-16059545', + 'info_dict': { + 'id': 'zEns2dWl00w', + 'ext': 'mp4', + 'title': "Locals learning to cope with rising tides of Kenya's Great Lakes", + 'thumbnail': 'https://i.ytimg.com/vi/zEns2dWl00w/maxresdefault.jpg', + 'description': 'md5:3ad9d7c5234d752a4ead4340c79c6b8d', + 'channel_id': 'UC7fWeaHhqgM4Ry-RMpM2YYw', + 'channel_url': 'https://www.youtube.com/channel/UC7fWeaHhqgM4Ry-RMpM2YYw', + 'duration': 210, + 'view_count': int, + 'age_limit': 0, + 'webpage_url': 'https://www.youtube.com/watch?v=zEns2dWl00w', + 'categories': ['News & Politics'], + 'channel': 'TRT World', + 'channel_follower_count': int, + 'channel_is_verified': True, + 'uploader': 'TRT World', + 'uploader_id': '@trtworld', + 'uploader_url': 'https://www.youtube.com/@trtworld', + 'upload_date': '20231202', + 'availability': 'public', + 'comment_count': int, + 'playable_in_embed': True, + 'tags': [], + 'live_status': 'not_live', + 'like_count': int, + } + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + nuxtjs_data = self._search_nuxt_data(webpage, display_id)['videoData']['content']['platforms'] + formats = [] + for media_url in traverse_obj(nuxtjs_data, ( + ('website', 'ott'), 'metadata', ('hls_url', 'url'), {url_or_none})): + # NB: Website sometimes serves mp4 files under `hls_url` key + if determine_ext(media_url) == 'm3u8': + formats.extend(self._extract_m3u8_formats(media_url, display_id, fatal=False)) + else: + formats.append({ + 'format_id': 'http', + 'url': media_url, + }) + if not formats: + if youtube_id := traverse_obj(nuxtjs_data, ('youtube', 'metadata', 'youtubeId')): + return self.url_result(youtube_id, 'Youtube') + raise ExtractorError('No video found', expected=True) + + return { + 'id': display_id, + 'formats': formats, + **traverse_obj(nuxtjs_data, (('website', 'ott'), { + 'title': ('fields', 'title', 'text', {str}), + 'description': ('fields', 'description', 'text', {str}), + 'thumbnail': ('fields', 'thumbnail', 'url', {url_or_none}), + 'release_timestamp': ('published', 'date', {parse_iso8601}), + }), get_all=False), + } diff --git a/yt_dlp/extractor/tube8.py b/yt_dlp/extractor/tube8.py index 77ed05ffd..5f15b4581 100644 --- a/yt_dlp/extractor/tube8.py +++ b/yt_dlp/extractor/tube8.py @@ -1,13 +1,20 @@ import re +from .common import InfoExtractor +from ..aes import aes_decrypt_text +from ..compat import compat_urllib_parse_unquote from ..utils import ( + determine_ext, + format_field, int_or_none, str_to_int, + strip_or_none, + url_or_none, ) -from .keezmovies import KeezMoviesIE -class Tube8IE(KeezMoviesIE): # XXX: Do not subclass from concrete IE +class Tube8IE(InfoExtractor): + _WORKING = False _VALID_URL = r'https?://(?:www\.)?tube8\.com/(?:[^/]+/)+(?P<display_id>[^/]+)/(?P<id>\d+)' _EMBED_REGEX = [r'<iframe[^>]+\bsrc=["\'](?P<url>(?:https?:)?//(?:www\.)?tube8\.com/embed/(?:[^/]+/)+\d+)'] _TESTS = [{ @@ -30,6 +37,90 @@ class Tube8IE(KeezMoviesIE): # XXX: Do not subclass from concrete IE 'only_matching': True, }] + def _extract_info(self, url, fatal=True): + mobj = self._match_valid_url(url) + video_id = 
mobj.group('id') + display_id = (mobj.group('display_id') + if 'display_id' in mobj.groupdict() + else None) or mobj.group('id') + + webpage = self._download_webpage( + url, display_id, headers={'Cookie': 'age_verified=1'}) + + formats = [] + format_urls = set() + + title = None + thumbnail = None + duration = None + encrypted = False + + def extract_format(format_url, height=None): + format_url = url_or_none(format_url) + if not format_url or not format_url.startswith(('http', '//')): + return + if format_url in format_urls: + return + format_urls.add(format_url) + tbr = int_or_none(self._search_regex( + r'[/_](\d+)[kK][/_]', format_url, 'tbr', default=None)) + if not height: + height = int_or_none(self._search_regex( + r'[/_](\d+)[pP][/_]', format_url, 'height', default=None)) + if encrypted: + format_url = aes_decrypt_text( + video_url, title, 32).decode('utf-8') + formats.append({ + 'url': format_url, + 'format_id': format_field(height, None, '%dp'), + 'height': height, + 'tbr': tbr, + }) + + flashvars = self._parse_json( + self._search_regex( + r'flashvars\s*=\s*({.+?});', webpage, + 'flashvars', default='{}'), + display_id, fatal=False) + + if flashvars: + title = flashvars.get('video_title') + thumbnail = flashvars.get('image_url') + duration = int_or_none(flashvars.get('video_duration')) + encrypted = flashvars.get('encrypted') is True + for key, value in flashvars.items(): + mobj = re.search(r'quality_(\d+)[pP]', key) + if mobj: + extract_format(value, int(mobj.group(1))) + video_url = flashvars.get('video_url') + if video_url and determine_ext(video_url, None): + extract_format(video_url) + + video_url = self._html_search_regex( + r'flashvars\.video_url\s*=\s*(["\'])(?P<url>http.+?)\1', + webpage, 'video url', default=None, group='url') + if video_url: + extract_format(compat_urllib_parse_unquote(video_url)) + + if not formats: + if 'title="This video is no longer available"' in webpage: + self.raise_no_formats( + 'Video %s is no longer available' % video_id, expected=True) + + if not title: + title = self._html_search_regex( + r'<h1[^>]*>([^<]+)', webpage, 'title') + + return webpage, { + 'id': video_id, + 'display_id': display_id, + 'title': strip_or_none(title), + 'thumbnail': thumbnail, + 'duration': duration, + 'age_limit': 18, + 'formats': formats, + } + def _real_extract(self, url): webpage, info = self._extract_info(url) diff --git a/yt_dlp/extractor/tunepk.py b/yt_dlp/extractor/tunepk.py deleted file mode 100644 index e4e507b00..000000000 --- a/yt_dlp/extractor/tunepk.py +++ /dev/null @@ -1,87 +0,0 @@ -from .common import InfoExtractor -from ..compat import compat_str -from ..utils import ( - int_or_none, - try_get, - unified_timestamp, -) - - -class TunePkIE(InfoExtractor): - _VALID_URL = r'''(?x) - https?:// - (?: - (?:www\.)?tune\.pk/(?:video/|player/embed_player.php?.*?\bvid=)| - embed\.tune\.pk/play/ - ) - (?P<id>\d+) - ''' - _TESTS = [{ - 'url': 'https://tune.pk/video/6919541/maudie-2017-international-trailer-1-ft-ethan-hawke-sally-hawkins', - 'md5': '0c537163b7f6f97da3c5dd1e3ef6dd55', - 'info_dict': { - 'id': '6919541', - 'ext': 'mp4', - 'title': 'Maudie (2017) | International Trailer # 1 ft Ethan Hawke, Sally Hawkins', - 'description': 'md5:eb5a04114fafef5cec90799a93a2d09c', - 'thumbnail': r're:^https?://.*\.jpg$', - 'timestamp': 1487327564, - 'upload_date': '20170217', - 'uploader': 'Movie Trailers', - 'duration': 107, - 'view_count': int, - } - }, { - 'url': 
'https://tune.pk/player/embed_player.php?vid=6919541&folder=2017/02/17/&width=600&height=350&autoplay=no', - 'only_matching': True, - }, { - 'url': 'https://embed.tune.pk/play/6919541?autoplay=no&ssl=yes&inline=true', - 'only_matching': True, - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - - webpage = self._download_webpage( - 'https://tune.pk/video/%s' % video_id, video_id) - - details = self._parse_json( - self._search_regex( - r'new\s+TunePlayer\(({.+?})\)\s*;\s*\n', webpage, 'tune player'), - video_id)['details'] - - video = details['video'] - title = video.get('title') or self._og_search_title( - webpage, default=None) or self._html_search_meta( - 'title', webpage, 'title', fatal=True) - - formats = self._parse_jwplayer_formats( - details['player']['sources'], video_id) - - description = self._og_search_description( - webpage, default=None) or self._html_search_meta( - 'description', webpage, 'description') - - thumbnail = video.get('thumb') or self._og_search_thumbnail( - webpage, default=None) or self._html_search_meta( - 'thumbnail', webpage, 'thumbnail') - - timestamp = unified_timestamp(video.get('date_added')) - uploader = try_get( - video, lambda x: x['uploader']['name'], - compat_str) or self._html_search_meta('author', webpage, 'author') - - duration = int_or_none(video.get('duration')) - view_count = int_or_none(video.get('views')) - - return { - 'id': video_id, - 'title': title, - 'description': description, - 'thumbnail': thumbnail, - 'timestamp': timestamp, - 'uploader': uploader, - 'duration': duration, - 'view_count': view_count, - 'formats': formats, - } diff --git a/yt_dlp/extractor/turbo.py b/yt_dlp/extractor/turbo.py deleted file mode 100644 index cdb7dcff8..000000000 --- a/yt_dlp/extractor/turbo.py +++ /dev/null @@ -1,64 +0,0 @@ -import re - -from .common import InfoExtractor -from ..compat import compat_str -from ..utils import ( - ExtractorError, - int_or_none, - qualities, - xpath_text, -) - - -class TurboIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?turbo\.fr/videos-voiture/(?P<id>[0-9]+)-' - _API_URL = 'http://www.turbo.fr/api/tv/xml.php?player_generique=player_generique&id={0:}' - _TEST = { - 'url': 'http://www.turbo.fr/videos-voiture/454443-turbo-du-07-09-2014-renault-twingo-3-bentley-continental-gt-speed-ces-guide-achat-dacia.html', - 'md5': '33f4b91099b36b5d5a91f84b5bcba600', - 'info_dict': { - 'id': '454443', - 'ext': 'mp4', - 'duration': 3715, - 'title': 'Turbo du 07/09/2014 : Renault Twingo 3, Bentley Continental GT Speed, CES, Guide Achat Dacia... 
', - 'description': 'Turbo du 07/09/2014 : Renault Twingo 3, Bentley Continental GT Speed, CES, Guide Achat Dacia...', - 'thumbnail': r're:^https?://.*\.jpg$', - } - } - - def _real_extract(self, url): - mobj = self._match_valid_url(url) - video_id = mobj.group('id') - - webpage = self._download_webpage(url, video_id) - - playlist = self._download_xml(self._API_URL.format(video_id), video_id) - item = playlist.find('./channel/item') - if item is None: - raise ExtractorError('Playlist item was not found', expected=True) - - title = xpath_text(item, './title', 'title') - duration = int_or_none(xpath_text(item, './durate', 'duration')) - thumbnail = xpath_text(item, './visuel_clip', 'thumbnail') - description = self._html_search_meta('description', webpage) - - formats = [] - get_quality = qualities(['3g', 'sd', 'hq']) - for child in item: - m = re.search(r'url_video_(?P<quality>.+)', child.tag) - if m: - quality = compat_str(m.group('quality')) - formats.append({ - 'format_id': quality, - 'url': child.text, - 'quality': get_quality(quality), - }) - - return { - 'id': video_id, - 'title': title, - 'duration': duration, - 'thumbnail': thumbnail, - 'description': description, - 'formats': formats, - } diff --git a/yt_dlp/extractor/tv5mondeplus.py b/yt_dlp/extractor/tv5mondeplus.py index 4da1b26d1..a445fae85 100644 --- a/yt_dlp/extractor/tv5mondeplus.py +++ b/yt_dlp/extractor/tv5mondeplus.py @@ -84,6 +84,13 @@ class TV5MondePlusIE(InfoExtractor): }] _GEO_BYPASS = False + @staticmethod + def _extract_subtitles(data_captions): + subtitles = {} + for f in traverse_obj(data_captions, ('files', lambda _, v: url_or_none(v['file']))): + subtitles.setdefault(f.get('label') or 'fra', []).append({'url': f['file']}) + return subtitles + def _real_extract(self, url): display_id = self._match_id(url) webpage = self._download_webpage(url, display_id) @@ -176,6 +183,8 @@ def process_video_files(v): 'duration': duration, 'upload_date': upload_date, 'formats': formats, + 'subtitles': self._extract_subtitles(self._parse_json( + traverse_obj(vpl_data, ('data-captions', {str}), default='{}'), display_id, fatal=False)), 'series': series, 'episode': episode, } diff --git a/yt_dlp/extractor/tvnet.py b/yt_dlp/extractor/tvnet.py deleted file mode 100644 index 77426f7e6..000000000 --- a/yt_dlp/extractor/tvnet.py +++ /dev/null @@ -1,138 +0,0 @@ -import re - -from .common import InfoExtractor -from ..utils import ( - int_or_none, - unescapeHTML, - url_or_none, -) - - -class TVNetIE(InfoExtractor): - _VALID_URL = r'https?://(?:[^/]+)\.tvnet\.gov\.vn/[^/]+/(?:\d+/)?(?P<id>\d+)(?:/|$)' - _TESTS = [{ - # video - 'url': 'http://de.tvnet.gov.vn/video/109788/vtv1---bac-tuyet-tai-lao-cai-va-ha-giang/tin-nong-24h', - 'md5': 'b4d7abe0252c9b47774760b7519c7558', - 'info_dict': { - 'id': '109788', - 'ext': 'mp4', - 'title': 'VTV1 - Bắc tuyết tại Lào Cai và Hà Giang', - 'thumbnail': r're:(?i)https?://.*\.(?:jpg|png)', - 'is_live': False, - 'view_count': int, - }, - }, { - # audio - 'url': 'http://vn.tvnet.gov.vn/radio/27017/vov1---ban-tin-chieu-10062018/doi-song-va-xa-hoi', - 'md5': 'b5875ce9b0a2eecde029216d0e6db2ae', - 'info_dict': { - 'id': '27017', - 'ext': 'm4a', - 'title': 'VOV1 - Bản tin chiều (10/06/2018)', - 'thumbnail': r're:(?i)https?://.*\.(?:jpg|png)', - 'is_live': False, - }, - }, { - 'url': 'http://us.tvnet.gov.vn/video/118023/129999/ngay-0705', - 'info_dict': { - 'id': '129999', - 'ext': 'mp4', - 'title': 'VTV1 - Quốc hội với cử tri (11/06/2018)', - 'thumbnail': r're:(?i)https?://.*\.(?:jpg|png)', - 'is_live': False, - }, 
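The TV5MondePlus hunk just above adds a small `_extract_subtitles` helper whose `traverse_obj` predicate is easy to misread: caption entries without a valid URL are silently dropped, and entries with no language label fall back to 'fra'. The same logic as a self-contained function with a toy input (the caption dict shape is inferred from the helper itself):

```python
# Self-contained copy of the _extract_subtitles helper added to
# TV5MondePlusIE above, plus a toy input demonstrating the filtering.
from yt_dlp.utils import traverse_obj, url_or_none


def extract_subtitles(data_captions):
    subtitles = {}
    for f in traverse_obj(data_captions, ('files', lambda _, v: url_or_none(v['file']))):
        subtitles.setdefault(f.get('label') or 'fra', []).append({'url': f['file']})
    return subtitles


print(extract_subtitles({'files': [
    {'file': 'https://example.com/fr.vtt', 'label': 'fra'},
    {'file': None, 'label': 'eng'},  # dropped: no valid URL
]}))
# {'fra': [{'url': 'https://example.com/fr.vtt'}]}
```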
- 'params': { - 'skip_download': True, - }, - }, { - # live stream - 'url': 'http://us.tvnet.gov.vn/kenh-truyen-hinh/1011/vtv1', - 'info_dict': { - 'id': '1011', - 'ext': 'mp4', - 'title': r're:^VTV1 \| LiveTV [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', - 'thumbnail': r're:(?i)https?://.*\.(?:jpg|png)', - 'is_live': True, - }, - 'params': { - 'skip_download': True, - }, - }, { - # radio live stream - 'url': 'http://vn.tvnet.gov.vn/kenh-truyen-hinh/1014', - 'info_dict': { - 'id': '1014', - 'ext': 'm4a', - 'title': r're:VOV1 \| LiveTV [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', - 'thumbnail': r're:(?i)https?://.*\.(?:jpg|png)', - 'is_live': True, - }, - 'params': { - 'skip_download': True, - }, - }, { - 'url': 'http://us.tvnet.gov.vn/phim/6136/25510/vtv3---ca-mot-doi-an-oan-tap-1-50/phim-truyen-hinh', - 'only_matching': True, - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - - webpage = self._download_webpage(url, video_id) - - title = self._og_search_title( - webpage, default=None) or self._html_search_meta( - 'title', webpage, default=None) or self._search_regex( - r'<title>([^<]+)<', webpage, 'title') - title = re.sub(r'\s*-\s*TV Net\s*$', '', title) - - if '/video/' in url or '/radio/' in url: - is_live = False - elif '/kenh-truyen-hinh/' in url: - is_live = True - else: - is_live = None - - data_file = unescapeHTML(self._search_regex( - r'data-file=(["\'])(?P<url>(?:https?:)?//.+?)\1', webpage, - 'data file', group='url')) - - stream_urls = set() - formats = [] - for stream in self._download_json(data_file, video_id): - if not isinstance(stream, dict): - continue - stream_url = url_or_none(stream.get('url')) - if stream_url in stream_urls or not stream_url: - continue - stream_urls.add(stream_url) - formats.extend(self._extract_m3u8_formats( - stream_url, video_id, 'mp4', live=is_live, m3u8_id='hls', fatal=False)) - - # better support for radio streams - if title.startswith('VOV'): - for f in formats: - f.update({ - 'ext': 'm4a', - 'vcodec': 'none', - }) - - thumbnail = self._og_search_thumbnail( - webpage, default=None) or unescapeHTML( - self._search_regex( - r'data-image=(["\'])(?P<url>(?:https?:)?//.+?)\1', webpage, - 'thumbnail', default=None, group='url')) - - view_count = int_or_none(self._search_regex( - r'(?s)<div[^>]+\bclass=["\'].*?view-count[^>]+>.*?(\d+).*?</div>', - webpage, 'view count', default=None)) - - return { - 'id': video_id, - 'title': title, - 'thumbnail': thumbnail, - 'is_live': is_live, - 'view_count': view_count, - 'formats': formats, - } diff --git a/yt_dlp/extractor/tvnow.py b/yt_dlp/extractor/tvnow.py deleted file mode 100644 index 0acc306df..000000000 --- a/yt_dlp/extractor/tvnow.py +++ /dev/null @@ -1,639 +0,0 @@ -import re - -from .common import InfoExtractor -from ..compat import compat_str -from ..utils import ( - ExtractorError, - get_element_by_id, - int_or_none, - parse_iso8601, - parse_duration, - str_or_none, - try_get, - update_url_query, - urljoin, -) - - -class TVNowBaseIE(InfoExtractor): - _VIDEO_FIELDS = ( - 'id', 'title', 'free', 'geoblocked', 'articleLong', 'articleShort', - 'broadcastStartDate', 'isDrm', 'duration', 'season', 'episode', - 'manifest.dashclear', 'manifest.hlsclear', 'manifest.smoothclear', - 'format.title', 'format.defaultImage169Format', 'format.defaultImage169Logo') - - def _call_api(self, path, video_id, query): - return self._download_json( - 'https://api.tvnow.de/v3/' + path, video_id, query=query) - - def _extract_video(self, info, display_id): - video_id = compat_str(info['id']) - title 
= info['title'] - - paths = [] - for manifest_url in (info.get('manifest') or {}).values(): - if not manifest_url: - continue - manifest_url = update_url_query(manifest_url, {'filter': ''}) - path = self._search_regex(r'https?://[^/]+/(.+?)\.ism/', manifest_url, 'path') - if path in paths: - continue - paths.append(path) - - def url_repl(proto, suffix): - return re.sub( - r'(?:hls|dash|hss)([.-])', proto + r'\1', re.sub( - r'\.ism/(?:[^.]*\.(?:m3u8|mpd)|[Mm]anifest)', - '.ism/' + suffix, manifest_url)) - - def make_urls(proto, suffix): - urls = [url_repl(proto, suffix)] - hd_url = urls[0].replace('/manifest/', '/ngvod/') - if hd_url != urls[0]: - urls.append(hd_url) - return urls - - for man_url in make_urls('dash', '.mpd'): - formats = self._extract_mpd_formats( - man_url, video_id, mpd_id='dash', fatal=False) - for man_url in make_urls('hss', 'Manifest'): - formats.extend(self._extract_ism_formats( - man_url, video_id, ism_id='mss', fatal=False)) - for man_url in make_urls('hls', '.m3u8'): - formats.extend(self._extract_m3u8_formats( - man_url, video_id, 'mp4', 'm3u8_native', m3u8_id='hls', - fatal=False)) - if formats: - break - else: - if not self.get_param('allow_unplayable_formats') and info.get('isDrm'): - raise ExtractorError( - 'Video %s is DRM protected' % video_id, expected=True) - if info.get('geoblocked'): - raise self.raise_geo_restricted() - if not info.get('free', True): - raise ExtractorError( - 'Video %s is not available for free' % video_id, expected=True) - - description = info.get('articleLong') or info.get('articleShort') - timestamp = parse_iso8601(info.get('broadcastStartDate'), ' ') - duration = parse_duration(info.get('duration')) - - f = info.get('format', {}) - - thumbnails = [{ - 'url': 'https://aistvnow-a.akamaihd.net/tvnow/movie/%s' % video_id, - }] - thumbnail = f.get('defaultImage169Format') or f.get('defaultImage169Logo') - if thumbnail: - thumbnails.append({ - 'url': thumbnail, - }) - - return { - 'id': video_id, - 'display_id': display_id, - 'title': title, - 'description': description, - 'thumbnails': thumbnails, - 'timestamp': timestamp, - 'duration': duration, - 'series': f.get('title'), - 'season_number': int_or_none(info.get('season')), - 'episode_number': int_or_none(info.get('episode')), - 'episode': title, - 'formats': formats, - } - - -class TVNowIE(TVNowBaseIE): - _VALID_URL = r'''(?x) - https?:// - (?:www\.)?tvnow\.(?:de|at|ch)/(?P<station>[^/]+)/ - (?P<show_id>[^/]+)/ - (?!(?:list|jahr)(?:/|$))(?P<id>[^/?\#&]+) - ''' - - @classmethod - def suitable(cls, url): - return (False if TVNowNewIE.suitable(url) or TVNowSeasonIE.suitable(url) or TVNowAnnualIE.suitable(url) or TVNowShowIE.suitable(url) - else super(TVNowIE, cls).suitable(url)) - - _TESTS = [{ - 'url': 'https://www.tvnow.de/rtl2/grip-das-motormagazin/der-neue-porsche-911-gt-3/player', - 'info_dict': { - 'id': '331082', - 'display_id': 'grip-das-motormagazin/der-neue-porsche-911-gt-3', - 'ext': 'mp4', - 'title': 'Der neue Porsche 911 GT 3', - 'description': 'md5:6143220c661f9b0aae73b245e5d898bb', - 'timestamp': 1495994400, - 'upload_date': '20170528', - 'duration': 5283, - 'series': 'GRIP - Das Motormagazin', - 'season_number': 14, - 'episode_number': 405, - 'episode': 'Der neue Porsche 911 GT 3', - }, - }, { - # rtl2 - 'url': 'https://www.tvnow.de/rtl2/armes-deutschland/episode-0008/player', - 'only_matching': True, - }, { - # rtlnitro - 'url': 'https://www.tvnow.de/nitro/alarm-fuer-cobra-11-die-autobahnpolizei/auf-eigene-faust-pilot/player', - 'only_matching': True, - }, { - # superrtl 
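The deleted TVNow code above relied on a `url_repl`/`make_urls` trick: one CDN manifest URL is rewritten into its DASH/HLS/Smooth Streaming siblings by swapping the protocol token and the manifest suffix. A runnable sketch of the rewrite, with a hypothetical URL shape:

```python
# Re-creation of url_repl() from the removed TVNowBaseIE._extract_video
# above; the example URL is hypothetical, not a real TVNow manifest.
import re


def url_repl(manifest_url, proto, suffix):
    return re.sub(
        r'(?:hls|dash|hss)([.-])', proto + r'\1', re.sub(
            r'\.ism/(?:[^.]*\.(?:m3u8|mpd)|[Mm]anifest)',
            '.ism/' + suffix, manifest_url))


src = 'https://cdn.example/hls-vod/video.ism/video.m3u8'
print(url_repl(src, 'dash', '.mpd'))
# https://cdn.example/dash-vod/video.ism/.mpd
```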
- 'url': 'https://www.tvnow.de/superrtl/die-lustigsten-schlamassel-der-welt/u-a-ketchup-effekt/player', - 'only_matching': True, - }, { - # ntv - 'url': 'https://www.tvnow.de/ntv/startup-news/goetter-in-weiss/player', - 'only_matching': True, - }, { - # vox - 'url': 'https://www.tvnow.de/vox/auto-mobil/neues-vom-automobilmarkt-2017-11-19-17-00-00/player', - 'only_matching': True, - }, { - # rtlplus - 'url': 'https://www.tvnow.de/rtlplus/op-ruft-dr-bruckner/die-vernaehte-frau/player', - 'only_matching': True, - }, { - 'url': 'https://www.tvnow.de/rtl2/grip-das-motormagazin/der-neue-porsche-911-gt-3', - 'only_matching': True, - }] - - def _real_extract(self, url): - mobj = self._match_valid_url(url) - display_id = '%s/%s' % mobj.group(2, 3) - - info = self._call_api( - 'movies/' + display_id, display_id, query={ - 'fields': ','.join(self._VIDEO_FIELDS), - }) - - return self._extract_video(info, display_id) - - -class TVNowNewIE(InfoExtractor): - _VALID_URL = r'''(?x) - (?P<base_url>https?:// - (?:www\.)?tvnow\.(?:de|at|ch)/ - (?:shows|serien))/ - (?P<show>[^/]+)-\d+/ - [^/]+/ - episode-\d+-(?P<episode>[^/?$&]+)-(?P<id>\d+) - ''' - - _TESTS = [{ - 'url': 'https://www.tvnow.de/shows/grip-das-motormagazin-1669/2017-05/episode-405-der-neue-porsche-911-gt-3-331082', - 'only_matching': True, - }] - - def _real_extract(self, url): - mobj = self._match_valid_url(url) - base_url = re.sub(r'(?:shows|serien)', '_', mobj.group('base_url')) - show, episode = mobj.group('show', 'episode') - return self.url_result( - # Rewrite new URLs to the old format and use extraction via old API - # at api.tvnow.de as a loophole for bypassing premium content checks - '%s/%s/%s' % (base_url, show, episode), - ie=TVNowIE.ie_key(), video_id=mobj.group('id')) - - -class TVNowFilmIE(TVNowBaseIE): - _VALID_URL = r'''(?x) - (?P<base_url>https?:// - (?:www\.)?tvnow\.(?:de|at|ch)/ - (?:filme))/ - (?P<title>[^/?$&]+)-(?P<id>\d+) - ''' - _TESTS = [{ - 'url': 'https://www.tvnow.de/filme/lord-of-war-haendler-des-todes-7959', - 'info_dict': { - 'id': '1426690', - 'display_id': 'lord-of-war-haendler-des-todes', - 'ext': 'mp4', - 'title': 'Lord of War', - 'description': 'md5:5eda15c0d5b8cb70dac724c8a0ff89a9', - 'timestamp': 1550010000, - 'upload_date': '20190212', - 'duration': 7016, - }, - }, { - 'url': 'https://www.tvnow.de/filme/the-machinist-12157', - 'info_dict': { - 'id': '328160', - 'display_id': 'the-machinist', - 'ext': 'mp4', - 'title': 'The Machinist', - 'description': 'md5:9a0e363fdd74b3a9e1cdd9e21d0ecc28', - 'timestamp': 1496469720, - 'upload_date': '20170603', - 'duration': 5836, - }, - }, { - 'url': 'https://www.tvnow.de/filme/horst-schlaemmer-isch-kandidiere-17777', - 'only_matching': True, # DRM protected - }] - - def _real_extract(self, url): - mobj = self._match_valid_url(url) - display_id = mobj.group('title') - - webpage = self._download_webpage(url, display_id, fatal=False) - if not webpage: - raise ExtractorError('Cannot download "%s"' % url, expected=True) - - json_text = get_element_by_id('now-web-state', webpage) - if not json_text: - raise ExtractorError('Cannot read video data', expected=True) - - json_data = self._parse_json( - json_text, - display_id, - transform_source=lambda x: x.replace('&q;', '"'), - fatal=False) - if not json_data: - raise ExtractorError('Cannot read video data', expected=True) - - player_key = next( - (key for key in json_data.keys() if 'module/player' in key), - None) - page_key = next( - (key for key in json_data.keys() if 'page/filme' in key), - None) - movie_id = try_get( - 
json_data, - [ - lambda x: x[player_key]['body']['id'], - lambda x: x[page_key]['body']['modules'][0]['id'], - lambda x: x[page_key]['body']['modules'][1]['id']], - int) - if not movie_id: - raise ExtractorError('Cannot extract movie ID', expected=True) - - info = self._call_api( - 'movies/%d' % movie_id, - display_id, - query={'fields': ','.join(self._VIDEO_FIELDS)}) - - return self._extract_video(info, display_id) - - -class TVNowNewBaseIE(InfoExtractor): - def _call_api(self, path, video_id, query={}): - result = self._download_json( - 'https://apigw.tvnow.de/module/' + path, video_id, query=query) - error = result.get('error') - if error: - raise ExtractorError( - '%s said: %s' % (self.IE_NAME, error), expected=True) - return result - - -r""" -TODO: new apigw.tvnow.de based version of TVNowIE. Replace old TVNowIE with it -when api.tvnow.de is shut down. This version can't bypass premium checks though. -class TVNowIE(TVNowNewBaseIE): - _VALID_URL = r'''(?x) - https?:// - (?:www\.)?tvnow\.(?:de|at|ch)/ - (?:shows|serien)/[^/]+/ - (?:[^/]+/)+ - (?P<display_id>[^/?$&]+)-(?P<id>\d+) - ''' - - _TESTS = [{ - # episode with annual navigation - 'url': 'https://www.tvnow.de/shows/grip-das-motormagazin-1669/2017-05/episode-405-der-neue-porsche-911-gt-3-331082', - 'info_dict': { - 'id': '331082', - 'display_id': 'grip-das-motormagazin/der-neue-porsche-911-gt-3', - 'ext': 'mp4', - 'title': 'Der neue Porsche 911 GT 3', - 'description': 'md5:6143220c661f9b0aae73b245e5d898bb', - 'thumbnail': r're:^https?://.*\.jpg$', - 'timestamp': 1495994400, - 'upload_date': '20170528', - 'duration': 5283, - 'series': 'GRIP - Das Motormagazin', - 'season_number': 14, - 'episode_number': 405, - 'episode': 'Der neue Porsche 911 GT 3', - }, - }, { - # rtl2, episode with season navigation - 'url': 'https://www.tvnow.de/shows/armes-deutschland-11471/staffel-3/episode-14-bernd-steht-seit-der-trennung-von-seiner-frau-allein-da-526124', - 'only_matching': True, - }, { - # rtlnitro - 'url': 'https://www.tvnow.de/serien/alarm-fuer-cobra-11-die-autobahnpolizei-1815/staffel-13/episode-5-auf-eigene-faust-pilot-366822', - 'only_matching': True, - }, { - # superrtl - 'url': 'https://www.tvnow.de/shows/die-lustigsten-schlamassel-der-welt-1221/staffel-2/episode-14-u-a-ketchup-effekt-364120', - 'only_matching': True, - }, { - # ntv - 'url': 'https://www.tvnow.de/shows/startup-news-10674/staffel-2/episode-39-goetter-in-weiss-387630', - 'only_matching': True, - }, { - # vox - 'url': 'https://www.tvnow.de/shows/auto-mobil-174/2017-11/episode-46-neues-vom-automobilmarkt-2017-11-19-17-00-00-380072', - 'only_matching': True, - }, { - 'url': 'https://www.tvnow.de/shows/grip-das-motormagazin-1669/2017-05/episode-405-der-neue-porsche-911-gt-3-331082', - 'only_matching': True, - }] - - def _extract_video(self, info, url, display_id): - config = info['config'] - source = config['source'] - - video_id = compat_str(info.get('id') or source['videoId']) - title = source['title'].strip() - - paths = [] - for manifest_url in (info.get('manifest') or {}).values(): - if not manifest_url: - continue - manifest_url = update_url_query(manifest_url, {'filter': ''}) - path = self._search_regex(r'https?://[^/]+/(.+?)\.ism/', manifest_url, 'path') - if path in paths: - continue - paths.append(path) - - def url_repl(proto, suffix): - return re.sub( - r'(?:hls|dash|hss)([.-])', proto + r'\1', re.sub( - r'\.ism/(?:[^.]*\.(?:m3u8|mpd)|[Mm]anifest)', - '.ism/' + suffix, manifest_url)) - - formats = self._extract_mpd_formats( - url_repl('dash', '.mpd'), video_id, - 
mpd_id='dash', fatal=False) - formats.extend(self._extract_ism_formats( - url_repl('hss', 'Manifest'), - video_id, ism_id='mss', fatal=False)) - formats.extend(self._extract_m3u8_formats( - url_repl('hls', '.m3u8'), video_id, 'mp4', - 'm3u8_native', m3u8_id='hls', fatal=False)) - if formats: - break - else: - if try_get(info, lambda x: x['rights']['isDrm']): - raise ExtractorError( - 'Video %s is DRM protected' % video_id, expected=True) - if try_get(config, lambda x: x['boards']['geoBlocking']['block']): - raise self.raise_geo_restricted() - if not info.get('free', True): - raise ExtractorError( - 'Video %s is not available for free' % video_id, expected=True) - - description = source.get('description') - thumbnail = url_or_none(source.get('poster')) - timestamp = unified_timestamp(source.get('previewStart')) - duration = parse_duration(source.get('length')) - - series = source.get('format') - season_number = int_or_none(self._search_regex( - r'staffel-(\d+)', url, 'season number', default=None)) - episode_number = int_or_none(self._search_regex( - r'episode-(\d+)', url, 'episode number', default=None)) - - return { - 'id': video_id, - 'display_id': display_id, - 'title': title, - 'description': description, - 'thumbnail': thumbnail, - 'timestamp': timestamp, - 'duration': duration, - 'series': series, - 'season_number': season_number, - 'episode_number': episode_number, - 'episode': title, - 'formats': formats, - } - - def _real_extract(self, url): - display_id, video_id = self._match_valid_url(url).groups() - info = self._call_api('player/' + video_id, video_id) - return self._extract_video(info, video_id, display_id) - - -class TVNowFilmIE(TVNowIE): # XXX: Do not subclass from concrete IE - _VALID_URL = r'''(?x) - (?P<base_url>https?:// - (?:www\.)?tvnow\.(?:de|at|ch)/ - (?:filme))/ - (?P<title>[^/?$&]+)-(?P<id>\d+) - ''' - _TESTS = [{ - 'url': 'https://www.tvnow.de/filme/lord-of-war-haendler-des-todes-7959', - 'info_dict': { - 'id': '1426690', - 'display_id': 'lord-of-war-haendler-des-todes', - 'ext': 'mp4', - 'title': 'Lord of War', - 'description': 'md5:5eda15c0d5b8cb70dac724c8a0ff89a9', - 'timestamp': 1550010000, - 'upload_date': '20190212', - 'duration': 7016, - }, - }, { - 'url': 'https://www.tvnow.de/filme/the-machinist-12157', - 'info_dict': { - 'id': '328160', - 'display_id': 'the-machinist', - 'ext': 'mp4', - 'title': 'The Machinist', - 'description': 'md5:9a0e363fdd74b3a9e1cdd9e21d0ecc28', - 'timestamp': 1496469720, - 'upload_date': '20170603', - 'duration': 5836, - }, - }, { - 'url': 'https://www.tvnow.de/filme/horst-schlaemmer-isch-kandidiere-17777', - 'only_matching': True, # DRM protected - }] - - def _real_extract(self, url): - mobj = self._match_valid_url(url) - display_id = mobj.group('title') - - webpage = self._download_webpage(url, display_id, fatal=False) - if not webpage: - raise ExtractorError('Cannot download "%s"' % url, expected=True) - - json_text = get_element_by_id('now-web-state', webpage) - if not json_text: - raise ExtractorError('Cannot read video data', expected=True) - - json_data = self._parse_json( - json_text, - display_id, - transform_source=lambda x: x.replace('&q;', '"'), - fatal=False) - if not json_data: - raise ExtractorError('Cannot read video data', expected=True) - - player_key = next( - (key for key in json_data.keys() if 'module/player' in key), - None) - page_key = next( - (key for key in json_data.keys() if 'page/filme' in key), - None) - movie_id = try_get( - json_data, - [ - lambda x: x[player_key]['body']['id'], - lambda x: 
x[page_key]['body']['modules'][0]['id'], - lambda x: x[page_key]['body']['modules'][1]['id']], - int) - if not movie_id: - raise ExtractorError('Cannot extract movie ID', expected=True) - - info = self._call_api('player/%d' % movie_id, display_id) - return self._extract_video(info, url, display_id) -""" - - -class TVNowListBaseIE(TVNowNewBaseIE): - _SHOW_VALID_URL = r'''(?x) - (?P<base_url> - https?:// - (?:www\.)?tvnow\.(?:de|at|ch)/(?:shows|serien)/ - [^/?#&]+-(?P<show_id>\d+) - ) - ''' - - @classmethod - def suitable(cls, url): - return (False if TVNowNewIE.suitable(url) - else super(TVNowListBaseIE, cls).suitable(url)) - - def _extract_items(self, url, show_id, list_id, query): - items = self._call_api( - 'teaserrow/format/episode/' + show_id, list_id, - query=query)['items'] - - entries = [] - for item in items: - if not isinstance(item, dict): - continue - item_url = urljoin(url, item.get('url')) - if not item_url: - continue - video_id = str_or_none(item.get('id') or item.get('videoId')) - item_title = item.get('subheadline') or item.get('text') - entries.append(self.url_result( - item_url, ie=TVNowNewIE.ie_key(), video_id=video_id, - video_title=item_title)) - - return self.playlist_result(entries, '%s/%s' % (show_id, list_id)) - - -class TVNowSeasonIE(TVNowListBaseIE): - _VALID_URL = r'%s/staffel-(?P<id>\d+)' % TVNowListBaseIE._SHOW_VALID_URL - _TESTS = [{ - 'url': 'https://www.tvnow.de/serien/alarm-fuer-cobra-11-die-autobahnpolizei-1815/staffel-13', - 'info_dict': { - 'id': '1815/13', - }, - 'playlist_mincount': 22, - }] - - def _real_extract(self, url): - _, show_id, season_id = self._match_valid_url(url).groups() - return self._extract_items( - url, show_id, season_id, {'season': season_id}) - - -class TVNowAnnualIE(TVNowListBaseIE): - _VALID_URL = r'%s/(?P<year>\d{4})-(?P<month>\d{2})' % TVNowListBaseIE._SHOW_VALID_URL - _TESTS = [{ - 'url': 'https://www.tvnow.de/shows/grip-das-motormagazin-1669/2017-05', - 'info_dict': { - 'id': '1669/2017-05', - }, - 'playlist_mincount': 2, - }] - - def _real_extract(self, url): - _, show_id, year, month = self._match_valid_url(url).groups() - return self._extract_items( - url, show_id, '%s-%s' % (year, month), { - 'year': int(year), - 'month': int(month), - }) - - -class TVNowShowIE(TVNowListBaseIE): - _VALID_URL = TVNowListBaseIE._SHOW_VALID_URL - _TESTS = [{ - # annual navigationType - 'url': 'https://www.tvnow.de/shows/grip-das-motormagazin-1669', - 'info_dict': { - 'id': '1669', - }, - 'playlist_mincount': 73, - }, { - # season navigationType - 'url': 'https://www.tvnow.de/shows/armes-deutschland-11471', - 'info_dict': { - 'id': '11471', - }, - 'playlist_mincount': 3, - }] - - @classmethod - def suitable(cls, url): - return (False if TVNowNewIE.suitable(url) or TVNowSeasonIE.suitable(url) or TVNowAnnualIE.suitable(url) - else super(TVNowShowIE, cls).suitable(url)) - - def _real_extract(self, url): - base_url, show_id = self._match_valid_url(url).groups() - - result = self._call_api( - 'teaserrow/format/navigation/' + show_id, show_id) - - items = result['items'] - - entries = [] - navigation = result.get('navigationType') - if navigation == 'annual': - for item in items: - if not isinstance(item, dict): - continue - year = int_or_none(item.get('year')) - if year is None: - continue - months = item.get('months') - if not isinstance(months, list): - continue - for month_dict in months: - if not isinstance(month_dict, dict) or not month_dict: - continue - month_number = int_or_none(list(month_dict.keys())[0]) - if month_number is None: - 
continue - entries.append(self.url_result( - '%s/%04d-%02d' % (base_url, year, month_number), - ie=TVNowAnnualIE.ie_key())) - elif navigation == 'season': - for item in items: - if not isinstance(item, dict): - continue - season_number = int_or_none(item.get('season')) - if season_number is None: - continue - entries.append(self.url_result( - '%s/staffel-%d' % (base_url, season_number), - ie=TVNowSeasonIE.ie_key())) - else: - raise ExtractorError('Unknown navigationType') - - return self.playlist_result(entries, show_id) diff --git a/yt_dlp/extractor/twentyfourvideo.py b/yt_dlp/extractor/twentyfourvideo.py deleted file mode 100644 index baeb85d47..000000000 --- a/yt_dlp/extractor/twentyfourvideo.py +++ /dev/null @@ -1,128 +0,0 @@ -from .common import InfoExtractor -from ..utils import ( - parse_iso8601, - int_or_none, - xpath_attr, - xpath_element, -) - - -class TwentyFourVideoIE(InfoExtractor): - IE_NAME = '24video' - _VALID_URL = r'''(?x) - https?:// - (?P<host> - (?:(?:www|porno?)\.)?24video\. - (?:net|me|xxx|sexy?|tube|adult|site|vip) - )/ - (?: - video/(?:(?:view|xml)/)?| - player/new24_play\.swf\?id= - ) - (?P<id>\d+) - ''' - - _TESTS = [{ - 'url': 'http://www.24video.net/video/view/1044982', - 'md5': 'e09fc0901d9eaeedac872f154931deeb', - 'info_dict': { - 'id': '1044982', - 'ext': 'mp4', - 'title': 'Эротика каменного века', - 'description': 'Как смотрели порно в каменном веке.', - 'thumbnail': r're:^https?://.*\.jpg$', - 'uploader': 'SUPERTELO', - 'duration': 31, - 'timestamp': 1275937857, - 'upload_date': '20100607', - 'age_limit': 18, - 'like_count': int, - 'dislike_count': int, - }, - }, { - 'url': 'http://www.24video.net/player/new24_play.swf?id=1044982', - 'only_matching': True, - }, { - 'url': 'http://www.24video.me/video/view/1044982', - 'only_matching': True, - }, { - 'url': 'http://www.24video.tube/video/view/2363750', - 'only_matching': True, - }, { - 'url': 'https://www.24video.site/video/view/2640421', - 'only_matching': True, - }, { - 'url': 'https://porno.24video.net/video/2640421-vsya-takaya-gibkaya-i-v-masle', - 'only_matching': True, - }, { - 'url': 'https://www.24video.vip/video/view/1044982', - 'only_matching': True, - }, { - 'url': 'https://porn.24video.net/video/2640421-vsya-takay', - 'only_matching': True, - }] - - def _real_extract(self, url): - mobj = self._match_valid_url(url) - video_id = mobj.group('id') - host = mobj.group('host') - - webpage = self._download_webpage( - 'http://%s/video/view/%s' % (host, video_id), video_id) - - title = self._og_search_title(webpage) - description = self._html_search_regex( - r'<(p|span)[^>]+itemprop="description"[^>]*>(?P<description>[^<]+)</\1>', - webpage, 'description', fatal=False, group='description') - thumbnail = self._og_search_thumbnail(webpage) - duration = int_or_none(self._og_search_property( - 'duration', webpage, 'duration', fatal=False)) - timestamp = parse_iso8601(self._search_regex( - r'<time[^>]+\bdatetime="([^"]+)"[^>]+itemprop="uploadDate"', - webpage, 'upload date', fatal=False)) - - uploader = self._html_search_regex( - r'class="video-uploaded"[^>]*>\s*<a href="/jsecUser/movies/[^"]+"[^>]*>([^<]+)</a>', - webpage, 'uploader', fatal=False) - - view_count = int_or_none(self._html_search_regex( - r'<span class="video-views">(\d+) просмотр', - webpage, 'view count', fatal=False)) - comment_count = int_or_none(self._html_search_regex( - r'<a[^>]+href="#tab-comments"[^>]*>(\d+) комментари', - webpage, 'comment count', default=None)) - - # Sets some cookies - self._download_xml( - 
r'http://%s/video/xml/%s?mode=init' % (host, video_id), - video_id, 'Downloading init XML') - - video_xml = self._download_xml( - 'http://%s/video/xml/%s?mode=play' % (host, video_id), - video_id, 'Downloading video XML') - - video = xpath_element(video_xml, './/video', 'video', fatal=True) - - formats = [{ - 'url': xpath_attr(video, '', 'url', 'video URL', fatal=True), - }] - - like_count = int_or_none(video.get('ratingPlus')) - dislike_count = int_or_none(video.get('ratingMinus')) - age_limit = 18 if video.get('adult') == 'true' else 0 - - return { - 'id': video_id, - 'title': title, - 'description': description, - 'thumbnail': thumbnail, - 'uploader': uploader, - 'duration': duration, - 'timestamp': timestamp, - 'view_count': view_count, - 'comment_count': comment_count, - 'like_count': like_count, - 'dislike_count': dislike_count, - 'age_limit': age_limit, - 'formats': formats, - } diff --git a/yt_dlp/extractor/twitcasting.py b/yt_dlp/extractor/twitcasting.py index 540e217fd..28ea16cc2 100644 --- a/yt_dlp/extractor/twitcasting.py +++ b/yt_dlp/extractor/twitcasting.py @@ -142,7 +142,7 @@ def _real_extract(self, url): 'https://twitcasting.tv/streamserver.php?target=%s&mode=client' % uploader_id, video_id, 'Downloading live info', fatal=False) - is_live = 'data-status="online"' in webpage + is_live = any(f'data-{x}' in webpage for x in ['is-onlive="true"', 'live-type="live"', 'status="online"']) if not traverse_obj(stream_server_data, 'llfmp4') and is_live: self.raise_login_required(method='cookies') @@ -247,25 +247,26 @@ def _real_extract(self, url): 'Downloading live video of user {0}. ' 'Pass "https://twitcasting.tv/{0}/show" to download the history'.format(uploader_id)) - webpage = self._download_webpage(url, uploader_id) - current_live = self._search_regex( - (r'data-type="movie" data-id="(\d+)">', - r'tw-sound-flag-open-link" data-id="(\d+)" style=',), - webpage, 'current live ID', default=None) - if not current_live: - # fetch unfiltered /show to find running livestreams; we can't get ID of the password-protected livestream above - webpage = self._download_webpage( - f'https://twitcasting.tv/{uploader_id}/show/', uploader_id, - note='Downloading live history') - is_live = self._search_regex(r'(?s)(<span\s*class="tw-movie-thumbnail-badge"\s*data-status="live">\s*LIVE)', webpage, 'is live?', default=None) - if is_live: - # get the first live; running live is always at the first - current_live = self._search_regex( - r'(?s)<a\s+class="tw-movie-thumbnail"\s*href="/[^/]+/movie/(?P<video_id>\d+)"\s*>.+?</a>', - webpage, 'current live ID 2', default=None, group='video_id') - if not current_live: + is_live = traverse_obj(self._download_json( + f'https://frontendapi.twitcasting.tv/watch/user/{uploader_id}', + uploader_id, 'Checking live status', data=b'', fatal=False), ('is_live', {bool})) + if is_live is False: # only raise here if API response was as expected raise UserNotLive(video_id=uploader_id) - return self.url_result('https://twitcasting.tv/%s/movie/%s' % (uploader_id, current_live)) + + # Use /show/ page so that password-protected and members-only livestreams can be found + webpage = self._download_webpage( + f'https://twitcasting.tv/{uploader_id}/show/', uploader_id, 'Downloading live history') + is_live = is_live or self._search_regex( + r'(?s)(<span\s*class="tw-movie-thumbnail2-badge"\s*data-status="live">\s*LIVE)', + webpage, 'is live?', default=False) + # Current live is always the first match + current_live = self._search_regex( + 
r'(?s)<a\s+class="tw-movie-thumbnail2"\s+href="/[^/"]+/movie/(?P<video_id>\d+)"', + webpage, 'current live ID', default=None, group='video_id') + if not is_live or not current_live: + raise UserNotLive(video_id=uploader_id) + + return self.url_result(f'https://twitcasting.tv/{uploader_id}/movie/{current_live}', TwitCastingIE) class TwitCastingUserIE(InfoExtractor): @@ -288,8 +289,7 @@ def _entries(self, uploader_id): webpage = self._download_webpage( next_url, uploader_id, query={'filter': 'watchable'}, note='Downloading page %d' % page_num) matches = re.finditer( - r'''(?isx)<a\s+class="tw-movie-thumbnail"\s*href="(?P<url>/[^/]+/movie/\d+)"\s*>.+?</a>''', - webpage) + r'(?s)<a\s+class="tw-movie-thumbnail2"\s+href="(?P<url>/[^/"]+/movie/\d+)"', webpage) for mobj in matches: yield self.url_result(urljoin(base_url, mobj.group('url'))) diff --git a/yt_dlp/extractor/twitch.py b/yt_dlp/extractor/twitch.py index 3297ef091..6dc0993af 100644 --- a/yt_dlp/extractor/twitch.py +++ b/yt_dlp/extractor/twitch.py @@ -8,7 +8,6 @@ from ..compat import ( compat_parse_qs, compat_str, - compat_urllib_parse_urlencode, compat_urllib_parse_urlparse, ) from ..utils import ( @@ -191,6 +190,20 @@ def _get_thumbnails(self, thumbnail): 'url': thumbnail, }] if thumbnail else None + def _extract_twitch_m3u8_formats(self, video_id, token, signature): + """Subclasses must define _M3U8_PATH""" + return self._extract_m3u8_formats( + f'{self._USHER_BASE}/{self._M3U8_PATH}/{video_id}.m3u8', video_id, 'mp4', query={ + 'allow_source': 'true', + 'allow_audio_only': 'true', + 'allow_spectre': 'true', + 'p': random.randint(1000000, 10000000), + 'player': 'twitchweb', + 'playlist_include_framerate': 'true', + 'sig': signature, + 'token': token, + }) + class TwitchVodIE(TwitchBaseIE): IE_NAME = 'twitch:vod' @@ -203,6 +216,7 @@ class TwitchVodIE(TwitchBaseIE): ) (?P<id>\d+) ''' + _M3U8_PATH = 'vod' _TESTS = [{ 'url': 'http://www.twitch.tv/riotgames/v/6528877?t=5m10s', @@ -532,20 +546,8 @@ def _real_extract(self, url): info = self._extract_info_gql(video, vod_id) access_token = self._download_access_token(vod_id, 'video', 'id') - formats = self._extract_m3u8_formats( - '%s/vod/%s.m3u8?%s' % ( - self._USHER_BASE, vod_id, - compat_urllib_parse_urlencode({ - 'allow_source': 'true', - 'allow_audio_only': 'true', - 'allow_spectre': 'true', - 'player': 'twitchweb', - 'playlist_include_framerate': 'true', - 'nauth': access_token['value'], - 'nauthsig': access_token['signature'], - })), - vod_id, 'mp4', entry_protocol='m3u8_native') - + formats = self._extract_twitch_m3u8_formats( + vod_id, access_token['value'], access_token['signature']) formats.extend(self._extract_storyboard(vod_id, video.get('storyboard'), info.get('duration'))) self._prefer_source(formats) @@ -924,6 +926,7 @@ class TwitchStreamIE(TwitchBaseIE): ) (?P<id>[^/#?]+) ''' + _M3U8_PATH = 'api/channel/hls' _TESTS = [{ 'url': 'http://www.twitch.tv/shroomztv', @@ -1026,23 +1029,10 @@ def _real_extract(self, url): access_token = self._download_access_token( channel_name, 'stream', 'channelName') - token = access_token['value'] stream_id = stream.get('id') or channel_name - query = { - 'allow_source': 'true', - 'allow_audio_only': 'true', - 'allow_spectre': 'true', - 'p': random.randint(1000000, 10000000), - 'player': 'twitchweb', - 'playlist_include_framerate': 'true', - 'segment_preference': '4', - 'sig': access_token['signature'].encode('utf-8'), - 'token': token.encode('utf-8'), - } - formats = self._extract_m3u8_formats( - '%s/api/channel/hls/%s.m3u8' % (self._USHER_BASE, 
channel_name), - stream_id, 'mp4', query=query) + formats = self._extract_twitch_m3u8_formats( + channel_name, access_token['value'], access_token['signature']) self._prefer_source(formats) view_count = stream.get('viewers') diff --git a/yt_dlp/extractor/twitter.py b/yt_dlp/extractor/twitter.py index 4065acbaa..c3a6e406c 100644 --- a/yt_dlp/extractor/twitter.py +++ b/yt_dlp/extractor/twitter.py @@ -10,6 +10,7 @@ compat_urllib_parse_unquote, compat_urllib_parse_urlparse, ) +from ..networking.exceptions import HTTPError from ..utils import ( ExtractorError, dict_get, @@ -479,9 +480,9 @@ class TwitterIE(TwitterBaseIE): 'comment_count': int, 'repost_count': int, 'like_count': int, - 'view_count': int, 'tags': [], 'age_limit': 18, + '_old_archive_ids': ['twitter 643211948184596480'], }, }, { 'url': 'https://twitter.com/giphz/status/657991469417025536/photo/1', @@ -515,6 +516,7 @@ class TwitterIE(TwitterBaseIE): 'like_count': int, 'tags': ['TV', 'StarWars', 'TheForceAwakens'], 'age_limit': 0, + '_old_archive_ids': ['twitter 665052190608723968'], }, }, { 'url': 'https://twitter.com/BTNBrentYarina/status/705235433198714880', @@ -558,9 +560,9 @@ class TwitterIE(TwitterBaseIE): 'comment_count': int, 'repost_count': int, 'like_count': int, - 'view_count': int, 'tags': ['Damndaniel'], 'age_limit': 0, + '_old_archive_ids': ['twitter 700207533655363584'], }, }, { 'url': 'https://twitter.com/Filmdrunk/status/713801302971588609', @@ -599,9 +601,9 @@ class TwitterIE(TwitterBaseIE): 'comment_count': int, 'repost_count': int, 'like_count': int, - 'view_count': int, 'tags': [], 'age_limit': 0, + '_old_archive_ids': ['twitter 719944021058060289'], }, }, { 'url': 'https://twitter.com/OPP_HSD/status/779210622571536384', @@ -616,6 +618,7 @@ class TwitterIE(TwitterBaseIE): 'thumbnail': r're:^https?://.*\.jpg', }, 'add_ie': ['Periscope'], + 'skip': 'Broadcast not found', }, { # has mp4 formats via mobile API 'url': 'https://twitter.com/news_al3alm/status/852138619213144067', @@ -635,9 +638,9 @@ class TwitterIE(TwitterBaseIE): 'thumbnail': r're:^https?://.*\.jpg', 'tags': [], 'repost_count': int, - 'view_count': int, 'like_count': int, 'comment_count': int, + '_old_archive_ids': ['twitter 852138619213144067'], }, }, { 'url': 'https://twitter.com/i/web/status/910031516746514432', @@ -657,9 +660,9 @@ class TwitterIE(TwitterBaseIE): 'comment_count': int, 'repost_count': int, 'like_count': int, - 'view_count': int, 'tags': ['Maria'], 'age_limit': 0, + '_old_archive_ids': ['twitter 910031516746514432'], }, 'params': { 'skip_download': True, # requires ffmpeg @@ -683,9 +686,9 @@ class TwitterIE(TwitterBaseIE): 'comment_count': int, 'repost_count': int, 'like_count': int, - 'view_count': int, 'tags': [], 'age_limit': 0, + '_old_archive_ids': ['twitter 1001551623938805763'], }, 'params': { 'skip_download': True, # requires ffmpeg @@ -749,6 +752,7 @@ class TwitterIE(TwitterBaseIE): 'like_count': int, 'tags': [], 'age_limit': 0, + '_old_archive_ids': ['twitter 1349794411333394432'], }, 'params': { 'skip_download': True, @@ -771,18 +775,18 @@ class TwitterIE(TwitterBaseIE): 'comment_count': int, 'repost_count': int, 'like_count': int, - 'view_count': int, 'tags': [], 'age_limit': 0, + '_old_archive_ids': ['twitter 1577855540407197696'], }, 'params': {'skip_download': True}, }, { 'url': 'https://twitter.com/UltimaShadowX/status/1577719286659006464', 'info_dict': { 'id': '1577719286659006464', - 'title': 'Ultima📛| New Era - Test', + 'title': 'Ultima - Test', 'description': 'Test https://t.co/Y3KEZD7Dad', - 'uploader': 'Ultima📛| 
New Era', + 'uploader': 'Ultima', 'uploader_id': 'UltimaShadowX', 'uploader_url': 'https://twitter.com/UltimaShadowX', 'upload_date': '20221005', @@ -813,9 +817,9 @@ class TwitterIE(TwitterBaseIE): 'comment_count': int, 'repost_count': int, 'like_count': int, - 'view_count': int, 'tags': ['HurricaneIan'], 'age_limit': 0, + '_old_archive_ids': ['twitter 1575560063510810624'], }, }, { # Adult content, fails if not logged in @@ -951,10 +955,10 @@ class TwitterIE(TwitterBaseIE): 'uploader_url': 'https://twitter.com/CTVJLaidlaw', 'display_id': '1600649710662213632', 'like_count': int, - 'view_count': int, 'description': 'md5:591c19ce66fadc2359725d5cd0d1052c', 'upload_date': '20221208', 'age_limit': 0, + '_old_archive_ids': ['twitter 1600649710662213632'], }, 'params': {'noplaylist': True}, }, { @@ -979,7 +983,7 @@ class TwitterIE(TwitterBaseIE): 'like_count': int, 'repost_count': int, 'comment_count': int, - 'view_count': int, + '_old_archive_ids': ['twitter 1621117700482416640'], }, }, { 'url': 'https://twitter.com/hlo_again/status/1599108751385972737/video/2', @@ -995,13 +999,13 @@ class TwitterIE(TwitterBaseIE): 'repost_count': int, 'duration': 9.531, 'comment_count': int, - 'view_count': int, 'upload_date': '20221203', 'age_limit': 0, 'timestamp': 1670092210.0, 'tags': [], 'uploader': '\u06ea', 'description': '\U0001F48B https://t.co/bTj9Qz7vQP', + '_old_archive_ids': ['twitter 1599108751385972737'], }, 'params': {'noplaylist': True}, }, { @@ -1012,7 +1016,6 @@ class TwitterIE(TwitterBaseIE): 'ext': 'mp4', 'uploader_url': 'https://twitter.com/MunTheShinobi', 'description': 'This is a genius ad by Apple. \U0001f525\U0001f525\U0001f525\U0001f525\U0001f525 https://t.co/cNsA0MoOml', - 'view_count': int, 'thumbnail': 'https://pbs.twimg.com/ext_tw_video_thumb/1600009362759733248/pu/img/XVhFQivj75H_YxxV.jpg?name=orig', 'age_limit': 0, 'uploader': 'Mün', @@ -1025,6 +1028,7 @@ class TwitterIE(TwitterBaseIE): 'uploader_id': 'MunTheShinobi', 'duration': 139.987, 'timestamp': 1670306984.0, + '_old_archive_ids': ['twitter 1600009574919962625'], }, }, { # retweeted_status (private) @@ -1068,8 +1072,8 @@ class TwitterIE(TwitterBaseIE): 'thumbnail': r're:https://pbs\.twimg\.com/amplify_video_thumb/.+', 'like_count': int, 'repost_count': int, - 'view_count': int, 'comment_count': int, + '_old_archive_ids': ['twitter 1695424220702888009'], }, }, { # retweeted_status w/ legacy API @@ -1091,18 +1095,24 @@ class TwitterIE(TwitterBaseIE): 'thumbnail': r're:https://pbs\.twimg\.com/amplify_video_thumb/.+', 'like_count': int, 'repost_count': int, + '_old_archive_ids': ['twitter 1695424220702888009'], }, 'params': {'extractor_args': {'twitter': {'api': ['legacy']}}}, }, { # Broadcast embedded in tweet - 'url': 'https://twitter.com/JessicaDobsonWX/status/1693057346933600402', + 'url': 'https://twitter.com/JessicaDobsonWX/status/1731121063248175384', 'info_dict': { - 'id': '1yNGaNLjEblJj', + 'id': '1rmxPMjLzAXKN', 'ext': 'mp4', - 'title': 'Jessica Dobson - WAVE Weather Now - Saturday 8/19/23 Update', + 'title': 'WAVE Weather Now - Saturday 12/2/23 Update', 'uploader': 'Jessica Dobson', - 'uploader_id': '1DZEoDwDovRQa', - 'thumbnail': r're:^https?://.*\.jpg', + 'uploader_id': 'JessicaDobsonWX', + 'uploader_url': 'https://twitter.com/JessicaDobsonWX', + 'timestamp': 1701566398, + 'upload_date': '20231203', + 'live_status': 'was_live', + 'thumbnail': r're:https://[^/]+pscp\.tv/.+\.jpg', + 'concurrent_view_count': int, 'view_count': int, }, 'add_ie': ['TwitterBroadcast'], @@ -1125,6 +1135,30 @@ class 
TwitterIE(TwitterBaseIE): }, 'params': {'extractor_args': {'twitter': {'api': ['syndication']}}}, 'expected_warnings': ['Not all metadata'], + }, { + # "stale tweet" with typename "TweetWithVisibilityResults" + 'url': 'https://twitter.com/RobertKennedyJr/status/1724884212803834154', + 'md5': '62b1e11cdc2cdd0e527f83adb081f536', + 'info_dict': { + 'id': '1724883339285544960', + 'ext': 'mp4', + 'title': 'md5:cc56716f9ed0b368de2ba54c478e493c', + 'description': 'md5:9dc14f5b0f1311fc7caf591ae253a164', + 'display_id': '1724884212803834154', + 'uploader': 'Robert F. Kennedy Jr', + 'uploader_id': 'RobertKennedyJr', + 'uploader_url': 'https://twitter.com/RobertKennedyJr', + 'upload_date': '20231115', + 'timestamp': 1700079417.0, + 'duration': 341.048, + 'thumbnail': r're:https://pbs\.twimg\.com/amplify_video_thumb/.+', + 'tags': ['Kennedy24'], + 'repost_count': int, + 'like_count': int, + 'comment_count': int, + 'age_limit': 0, + '_old_archive_ids': ['twitter 1724884212803834154'], + }, }, { # onion route 'url': 'https://twitter3e4tixl4xyajtrzo62zg5vztmjuricljdp2c5kshju4avyoid.onion/TwitterBlue/status/1484226494708662273', @@ -1179,19 +1213,23 @@ def _graphql_to_legacy(self, data, twid): ), default={}, get_all=False) if self.is_logged_in else traverse_obj( data, ('tweetResult', 'result', {dict}), default={}) - if result.get('__typename') not in ('Tweet', 'TweetTombstone', 'TweetUnavailable', None): - self.report_warning(f'Unknown typename: {result.get("__typename")}', twid, only_once=True) + typename = result.get('__typename') + if typename not in ('Tweet', 'TweetWithVisibilityResults', 'TweetTombstone', 'TweetUnavailable', None): + self.report_warning(f'Unknown typename: {typename}', twid, only_once=True) if 'tombstone' in result: cause = remove_end(traverse_obj(result, ('tombstone', 'text', 'text', {str})), '. 
Learn more') raise ExtractorError(f'Twitter API says: {cause or "Unknown error"}', expected=True) - elif result.get('__typename') == 'TweetUnavailable': + elif typename == 'TweetUnavailable': reason = result.get('reason') if reason == 'NsfwLoggedOut': self.raise_login_required('NSFW tweet requires authentication') elif reason == 'Protected': self.raise_login_required('You are not authorized to view this protected tweet') raise ExtractorError(reason or 'Requested tweet is unavailable', expected=True) + # Result for "stale tweet" needs additional transformation + elif typename == 'TweetWithVisibilityResults': + result = traverse_obj(result, ('tweet', {dict})) or {} status = result.get('legacy', {}) status.update(traverse_obj(result, { @@ -1280,41 +1318,51 @@ def _build_graphql_query(self, media_id): } } - def _extract_status(self, twid): - if self.is_logged_in or self._selected_api == 'graphql': - status = self._graphql_to_legacy(self._call_graphql_api(self._GRAPHQL_ENDPOINT, twid), twid) - - elif self._selected_api == 'legacy': - status = self._call_api(f'statuses/show/{twid}.json', twid, { - 'cards_platform': 'Web-12', - 'include_cards': 1, - 'include_reply_count': 1, - 'include_user_entities': 0, - 'tweet_mode': 'extended', + def _call_syndication_api(self, twid): + self.report_warning( + 'Not all metadata or media is available via syndication endpoint', twid, only_once=True) + status = self._download_json( + 'https://cdn.syndication.twimg.com/tweet-result', twid, 'Downloading syndication JSON', + headers={'User-Agent': 'Googlebot'}, query={ + 'id': twid, + # TODO: token = ((Number(twid) / 1e15) * Math.PI).toString(36).replace(/(0+|\.)/g, '') + 'token': ''.join(random.choices('123456789abcdefghijklmnopqrstuvwxyz', k=10)), }) + if not status: + raise ExtractorError('Syndication endpoint returned empty JSON response') + # Transform the result so its structure matches that of legacy/graphql + media = [] + for detail in traverse_obj(status, ((None, 'quoted_tweet'), 'mediaDetails', ..., {dict})): + detail['id_str'] = traverse_obj(detail, ( + 'video_info', 'variants', ..., 'url', {self._MEDIA_ID_RE.search}, 1), get_all=False) or twid + media.append(detail) + status['extended_entities'] = {'media': media} - elif self._selected_api == 'syndication': - self.report_warning( - 'Not all metadata or media is available via syndication endpoint', twid, only_once=True) - status = self._download_json( - 'https://cdn.syndication.twimg.com/tweet-result', twid, 'Downloading syndication JSON', - headers={'User-Agent': 'Googlebot'}, query={ - 'id': twid, - # TODO: token = ((Number(twid) / 1e15) * Math.PI).toString(36).replace(/(0+|\.)/g, '') - 'token': ''.join(random.choices('123456789abcdefghijklmnopqrstuvwxyz', k=10)), + return status + + def _extract_status(self, twid): + if self._selected_api not in ('graphql', 'legacy', 'syndication'): + raise ExtractorError(f'{self._selected_api!r} is not a valid API selection', expected=True) + + try: + if self.is_logged_in or self._selected_api == 'graphql': + status = self._graphql_to_legacy(self._call_graphql_api(self._GRAPHQL_ENDPOINT, twid), twid) + elif self._selected_api == 'legacy': + status = self._call_api(f'statuses/show/{twid}.json', twid, { + 'cards_platform': 'Web-12', + 'include_cards': 1, + 'include_reply_count': 1, + 'include_user_entities': 0, + 'tweet_mode': 'extended', }) - if not status: - raise ExtractorError('Syndication endpoint returned empty JSON response') - # Transform the result so its structure matches that of legacy/graphql - media = [] - 
for detail in traverse_obj(status, ((None, 'quoted_tweet'), 'mediaDetails', ..., {dict})): - detail['id_str'] = traverse_obj(detail, ( - 'video_info', 'variants', ..., 'url', {self._MEDIA_ID_RE.search}, 1), get_all=False) or twid - media.append(detail) - status['extended_entities'] = {'media': media} + except ExtractorError as e: + if not isinstance(e.cause, HTTPError) or not e.cause.status == 429: + raise + self.report_warning('Rate-limit exceeded; falling back to syndication endpoint') + status = self._call_syndication_api(twid) - else: - raise ExtractorError(f'"{self._selected_api}" is not a valid API selection', expected=True) + if self._selected_api == 'syndication': + status = self._call_syndication_api(twid) return traverse_obj(status, 'retweeted_status', None, expected_type=dict) or {} @@ -1377,10 +1425,10 @@ def add_thumbnail(name, size): 'formats': formats, 'subtitles': subtitles, 'thumbnails': thumbnails, - 'view_count': traverse_obj(media, ('mediaStats', 'viewCount', {int_or_none})), + 'view_count': traverse_obj(media, ('mediaStats', 'viewCount', {int_or_none})), # No longer available 'duration': float_or_none(traverse_obj(media, ('video_info', 'duration_millis')), 1000), - # The codec of http formats are unknown - '_format_sort_fields': ('res', 'br', 'size', 'proto'), + # Prioritize m3u8 formats for compat, see https://github.com/yt-dlp/yt-dlp/issues/8117 + '_format_sort_fields': ('res', 'proto:m3u8', 'br', 'size'), # http format codec is unknown } def extract_from_card_info(card): @@ -1563,7 +1611,7 @@ class TwitterBroadcastIE(TwitterBaseIE, PeriscopeBaseIE): IE_NAME = 'twitter:broadcast' _VALID_URL = TwitterBaseIE._BASE_REGEX + r'i/broadcasts/(?P<id>[0-9a-zA-Z]{13})' - _TEST = { + _TESTS = [{ # untitled Periscope video 'url': 'https://twitter.com/i/broadcasts/1yNGaQLWpejGj', 'info_dict': { @@ -1571,11 +1619,42 @@ class TwitterBroadcastIE(TwitterBaseIE, PeriscopeBaseIE): 'ext': 'mp4', 'title': 'Andrea May Sahouri - Periscope Broadcast', 'uploader': 'Andrea May Sahouri', - 'uploader_id': '1PXEdBZWpGwKe', + 'uploader_id': 'andreamsahouri', + 'uploader_url': 'https://twitter.com/andreamsahouri', + 'timestamp': 1590973638, + 'upload_date': '20200601', 'thumbnail': r're:^https?://[^?#]+\.jpg\?token=', 'view_count': int, }, - } + }, { + 'url': 'https://twitter.com/i/broadcasts/1ZkKzeyrPbaxv', + 'info_dict': { + 'id': '1ZkKzeyrPbaxv', + 'ext': 'mp4', + 'title': 'Starship | SN10 | High-Altitude Flight Test', + 'uploader': 'SpaceX', + 'uploader_id': 'SpaceX', + 'uploader_url': 'https://twitter.com/SpaceX', + 'timestamp': 1614812942, + 'upload_date': '20210303', + 'thumbnail': r're:^https?://[^?#]+\.jpg\?token=', + 'view_count': int, + }, + }, { + 'url': 'https://twitter.com/i/broadcasts/1OyKAVQrgzwGb', + 'info_dict': { + 'id': '1OyKAVQrgzwGb', + 'ext': 'mp4', + 'title': 'Starship Flight Test', + 'uploader': 'SpaceX', + 'uploader_id': 'SpaceX', + 'uploader_url': 'https://twitter.com/SpaceX', + 'timestamp': 1681993964, + 'upload_date': '20230420', + 'thumbnail': r're:^https?://[^?#]+\.jpg\?token=', + 'view_count': int, + }, + }] def _real_extract(self, url): broadcast_id = self._match_id(url) @@ -1585,6 +1664,12 @@ def _real_extract(self, url): if not broadcast: raise ExtractorError('Broadcast no longer exists', expected=True) info = self._parse_broadcast_data(broadcast, broadcast_id) + info['title'] = broadcast.get('status') or info.get('title') + info['uploader_id'] = broadcast.get('twitter_username') or info.get('uploader_id') + info['uploader_url'] = format_field(broadcast, 
'twitter_username', 'https://twitter.com/%s', default=None) + if info['live_status'] == 'is_upcoming': + return info + media_key = broadcast['media_key'] source = self._call_api( f'live_video_stream/status/{media_key}', media_key)['source'] @@ -1741,7 +1826,7 @@ def _real_extract(self, url): class TwitterShortenerIE(TwitterBaseIE): IE_NAME = 'twitter:shortener' - _VALID_URL = r'https?://t.co/(?P<id>[^?]+)|tco:(?P<eid>[^?]+)' + _VALID_URL = r'https?://t\.co/(?P<id>[^?#]+)|tco:(?P<eid>[^?#]+)' _BASE_URL = 'https://t.co/' def _real_extract(self, url): diff --git a/yt_dlp/extractor/unscripted.py b/yt_dlp/extractor/unscripted.py deleted file mode 100644 index 6643a71b1..000000000 --- a/yt_dlp/extractor/unscripted.py +++ /dev/null @@ -1,53 +0,0 @@ -from .common import InfoExtractor -from ..utils import parse_duration, traverse_obj - - -class UnscriptedNewsVideoIE(InfoExtractor): - _VALID_URL = r'https?://www\.unscripted\.news/videos/(?P<id>[\w-]+)' - _TESTS = [{ - 'url': 'https://www.unscripted.news/videos/a-day-at-the-farmers-protest', - 'info_dict': { - 'id': '60c0a55cd1e99b1079918a57', - 'display_id': 'a-day-at-the-farmers-protest', - 'ext': 'mp4', - 'title': 'A Day at the Farmers\' Protest', - 'description': 'md5:4b3df22747a03e8f14f746dd72190384', - 'thumbnail': 'https://s3.unscripted.news/anj2/60c0a55cd1e99b1079918a57/5f199a65-c803-4a5c-8fce-2077359c3b72.jpg', - 'duration': 2251.0, - 'series': 'Ground Reports', - } - }, { - 'url': 'https://www.unscripted.news/videos/you-get-the-politicians-you-deserve-ft-shashi-tharoor', - 'info_dict': { - 'id': '5fb3afbf18ac817d341a74d8', - 'display_id': 'you-get-the-politicians-you-deserve-ft-shashi-tharoor', - 'ext': 'mp4', - 'cast': ['Avalok Langer', 'Ashwin Mehta'], - 'thumbnail': 'https://s3.unscripted.news/anj2/5fb3afbf18ac817d341a74d8/82bd7942-4f20-4cd8-98ae-83f9e814f998.jpg', - 'description': 'md5:1e91b069238a705ca3a40f87e6f1182c', - 'duration': 1046.0, - 'series': 'Dumb Questions Only', - 'title': 'You Get The Politicians You Deserve! ft. 
Shashi Tharoor', - } - }] - - def _real_extract(self, url): - display_id = self._match_id(url) - webpage = self._download_webpage(url, display_id) - nextjs_data = self._search_nextjs_data(webpage, display_id)['props']['pageProps']['dataLocal'] - - # TODO: get subtitle from srt key - formats, subtitles = self._extract_m3u8_formats_and_subtitles(nextjs_data['alt_content'], display_id) - - return { - 'id': nextjs_data['_id'], - 'display_id': display_id, - 'title': nextjs_data.get('title') or self._og_search_title(webpage), - 'description': nextjs_data.get('sh_heading') or self._og_search_description(webpage), - 'formats': formats, - 'subtitles': subtitles, - 'thumbnail': self._og_search_thumbnail(webpage), - 'duration': parse_duration(nextjs_data.get('duration')), - 'series': traverse_obj(nextjs_data, ('show', 'topic')), - 'cast': traverse_obj(nextjs_data, ('cast_crew', ..., 'displayname')), - } diff --git a/yt_dlp/extractor/unsupported.py b/yt_dlp/extractor/unsupported.py index bbcbf3acb..a3f9911e2 100644 --- a/yt_dlp/extractor/unsupported.py +++ b/yt_dlp/extractor/unsupported.py @@ -48,6 +48,7 @@ class KnownDRMIE(UnsupportedInfoExtractor): r'joyn\.de', r'amazon\.(?:\w{2}\.)?\w+/gp/video', r'music\.amazon\.(?:\w{2}\.)?\w+', + r'(?:watch|front)\.njpwworld\.com', ) _TESTS = [{ @@ -141,6 +142,13 @@ class KnownDRMIE(UnsupportedInfoExtractor): # https://github.com/yt-dlp/yt-dlp/issues/5767 'url': 'https://www.hulu.com/movie/anthem-6b25fac9-da2b-45a3-8e09-e4156b0471cc', 'only_matching': True, + }, { + # https://github.com/yt-dlp/yt-dlp/pull/8570 + 'url': 'https://watch.njpwworld.com/player/36447/series?assetType=series', + 'only_matching': True, + }, { + 'url': 'https://front.njpwworld.com/p/s_series_00563_16_bs', + 'only_matching': True, }] def _real_extract(self, url): @@ -164,11 +172,15 @@ class KnownPiracyIE(UnsupportedInfoExtractor): r'viewsb\.com', r'filemoon\.sx', r'hentai\.animestigma\.com', + r'thisav\.com', ) _TESTS = [{ 'url': 'http://dood.to/e/5s1wmbdacezb', 'only_matching': True, + }, { + 'url': 'https://thisav.com/en/terms', + 'only_matching': True, }] def _real_extract(self, url): diff --git a/yt_dlp/extractor/utreon.py b/yt_dlp/extractor/utreon.py index 90c10c051..8a9169101 100644 --- a/yt_dlp/extractor/utreon.py +++ b/yt_dlp/extractor/utreon.py @@ -10,7 +10,7 @@ class UtreonIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?utreon.com/v/(?P<id>[a-zA-Z0-9_-]+)' + _VALID_URL = r'https?://(?:www\.)?utreon\.com/v/(?P<id>[\w-]+)' _TESTS = [{ 'url': 'https://utreon.com/v/z_I7ikQbuDw', 'info_dict': { diff --git a/yt_dlp/extractor/veehd.py b/yt_dlp/extractor/veehd.py deleted file mode 100644 index 5ecd88726..000000000 --- a/yt_dlp/extractor/veehd.py +++ /dev/null @@ -1,116 +0,0 @@ -import re -import json - -from .common import InfoExtractor -from ..compat import ( - compat_urllib_parse_unquote, - compat_urlparse, -) -from ..utils import ( - ExtractorError, - clean_html, - get_element_by_id, -) - - -class VeeHDIE(InfoExtractor): - _VALID_URL = r'https?://veehd\.com/video/(?P<id>\d+)' - - # Seems VeeHD videos have multiple copies on several servers, all of - # whom have different MD5 checksums, so omit md5 field in all tests - _TESTS = [{ - 'url': 'http://veehd.com/video/4639434_Solar-Sinter', - 'info_dict': { - 'id': '4639434', - 'ext': 'mp4', - 'title': 'Solar Sinter', - 'uploader_id': 'VideoEyes', - 'description': 'md5:46a840e8692ddbaffb5f81d9885cb457', - }, - 'skip': 'Video deleted', - }, { - 'url': 'http://veehd.com/video/4905758_Elysian-Fields-Channeling', - 'info_dict': { - 'id': 
'4905758', - 'ext': 'mp4', - 'title': 'Elysian Fields - Channeling', - 'description': 'md5:360e4e95fdab58aefbea0f2a19e5604b', - 'uploader_id': 'spotted', - } - }, { - 'url': 'http://veehd.com/video/2046729_2012-2009-DivX-Trailer', - 'info_dict': { - 'id': '2046729', - 'ext': 'avi', - 'title': '2012 (2009) DivX Trailer', - 'description': 'md5:75435ee95255e6a9838ac6f6f3a2396b', - 'uploader_id': 'Movie_Trailers', - } - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - - # VeeHD seems to send garbage on the first request. - # See https://github.com/ytdl-org/youtube-dl/issues/2102 - self._download_webpage(url, video_id, 'Requesting webpage') - webpage = self._download_webpage(url, video_id) - - if 'This video has been removed<' in webpage: - raise ExtractorError('Video %s has been removed' % video_id, expected=True) - - player_path = self._search_regex( - r'\$\("#playeriframe"\).attr\({src : "(.+?)"', - webpage, 'player path') - player_url = compat_urlparse.urljoin(url, player_path) - - self._download_webpage(player_url, video_id, 'Requesting player page') - player_page = self._download_webpage( - player_url, video_id, 'Downloading player page') - - video_url = None - - config_json = self._search_regex( - r'value=\'config=({.+?})\'', player_page, 'config json', default=None) - - if config_json: - config = json.loads(config_json) - video_url = compat_urllib_parse_unquote(config['clip']['url']) - - if not video_url: - video_url = self._html_search_regex( - r'<embed[^>]+type="video/divx"[^>]+src="([^"]+)"', - player_page, 'video url', default=None) - - if not video_url: - iframe_src = self._search_regex( - r'<iframe[^>]+src="/?([^"]+)"', player_page, 'iframe url') - iframe_url = 'http://veehd.com/%s' % iframe_src - - self._download_webpage(iframe_url, video_id, 'Requesting iframe page') - iframe_page = self._download_webpage( - iframe_url, video_id, 'Downloading iframe page') - - video_url = self._search_regex( - r"file\s*:\s*'([^']+)'", iframe_page, 'video url') - - title = clean_html(get_element_by_id('videoName', webpage).rpartition('|')[0]) - uploader_id = self._html_search_regex( - r'<a href="/profile/\d+">(.+?)</a>', - webpage, 'uploader') - thumbnail = self._search_regex( - r'<img id="veehdpreview" src="(.+?)"', - webpage, 'thumbnail') - description = self._html_search_regex( - r'<td class="infodropdown".*?<div>(.*?)<ul', - webpage, 'description', flags=re.DOTALL) - - return { - '_type': 'video', - 'id': video_id, - 'title': title, - 'url': video_url, - 'uploader_id': uploader_id, - 'thumbnail': thumbnail, - 'description': description, - } diff --git a/yt_dlp/extractor/vice.py b/yt_dlp/extractor/vice.py index 8a7126853..1a2d667e7 100644 --- a/yt_dlp/extractor/vice.py +++ b/yt_dlp/extractor/vice.py @@ -302,12 +302,6 @@ def _url_res(video_url, ie_key): if vice_url: return _url_res(vice_url, ViceIE.ie_key()) - embed_code = self._search_regex( - r'embedCode=([^&\'"]+)', body, - 'ooyala embed code', default=None) - if embed_code: - return _url_res('ooyala:%s' % embed_code, 'Ooyala') - youtube_url = YoutubeIE._extract_url(body) if youtube_url: return _url_res(youtube_url, YoutubeIE.ie_key()) diff --git a/yt_dlp/extractor/vidbit.py b/yt_dlp/extractor/vidbit.py deleted file mode 100644 index 2813032db..000000000 --- a/yt_dlp/extractor/vidbit.py +++ /dev/null @@ -1,82 +0,0 @@ -from .common import InfoExtractor -from ..compat import compat_urlparse -from ..utils import ( - int_or_none, - js_to_json, - remove_end, - unified_strdate, -) - - -class VidbitIE(InfoExtractor): - 
_VALID_URL = r'https?://(?:www\.)?vidbit\.co/(?:watch|embed)\?.*?\bv=(?P<id>[\da-zA-Z]+)' - _TESTS = [{ - 'url': 'http://www.vidbit.co/watch?v=jkL2yDOEq2', - 'md5': '1a34b7f14defe3b8fafca9796892924d', - 'info_dict': { - 'id': 'jkL2yDOEq2', - 'ext': 'mp4', - 'title': 'Intro to VidBit', - 'description': 'md5:5e0d6142eec00b766cbf114bfd3d16b7', - 'thumbnail': r're:https?://.*\.jpg$', - 'upload_date': '20160618', - 'view_count': int, - 'comment_count': int, - } - }, { - 'url': 'http://www.vidbit.co/embed?v=jkL2yDOEq2&auto=0&water=0', - 'only_matching': True, - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - - webpage = self._download_webpage( - compat_urlparse.urljoin(url, '/watch?v=%s' % video_id), video_id) - - video_url, title = [None] * 2 - - config = self._parse_json(self._search_regex( - r'(?s)\.setup\(({.+?})\);', webpage, 'setup', default='{}'), - video_id, transform_source=js_to_json) - if config: - if config.get('file'): - video_url = compat_urlparse.urljoin(url, config['file']) - title = config.get('title') - - if not video_url: - video_url = compat_urlparse.urljoin(url, self._search_regex( - r'file\s*:\s*(["\'])(?P<url>(?:(?!\1).)+)\1', - webpage, 'video URL', group='url')) - - if not title: - title = remove_end( - self._html_search_regex( - (r'<h1>(.+?)</h1>', r'<title>(.+?)'), - webpage, 'title', default=None) or self._og_search_title(webpage), - ' - VidBit') - - description = self._html_search_meta( - ('description', 'og:description', 'twitter:description'), - webpage, 'description') - - upload_date = unified_strdate(self._html_search_meta( - 'datePublished', webpage, 'upload date')) - - view_count = int_or_none(self._search_regex( - r'(\d+) views', - webpage, 'view count', fatal=False)) - comment_count = int_or_none(self._search_regex( - r'id=["\']cmt_num["\'][^>]*>\((\d+)\)', - webpage, 'comment count', fatal=False)) - - return { - 'id': video_id, - 'url': video_url, - 'title': title, - 'description': description, - 'thumbnail': self._og_search_thumbnail(webpage), - 'upload_date': upload_date, - 'view_count': view_count, - 'comment_count': comment_count, - } diff --git a/yt_dlp/extractor/videoken.py b/yt_dlp/extractor/videoken.py index 560b41a6d..eaf0cc8ae 100644 --- a/yt_dlp/extractor/videoken.py +++ b/yt_dlp/extractor/videoken.py @@ -11,6 +11,7 @@ ExtractorError, InAdvancePagedList, int_or_none, + remove_start, traverse_obj, update_url_query, url_or_none, @@ -39,11 +40,11 @@ def _create_slideslive_url(self, video_url, video_id, referer): if not video_url and not video_id: return elif not video_url or 'embed/sign-in' in video_url: - video_url = f'https://slideslive.com/embed/{video_id.lstrip("slideslive-")}' + video_url = f'https://slideslive.com/embed/{remove_start(video_id, "slideslive-")}' if url_or_none(referer): return update_url_query(video_url, { 'embed_parent_url': referer, - 'embed_container_origin': f'https://{urllib.parse.urlparse(referer).netloc}', + 'embed_container_origin': f'https://{urllib.parse.urlparse(referer).hostname}', }) return video_url @@ -57,12 +58,12 @@ def _extract_videos(self, videos, url): video_url = video_id ie_key = 'Youtube' else: - video_url = traverse_obj(video, 'embed_url', 'embeddableurl') - if urllib.parse.urlparse(video_url).netloc == 'slideslive.com': + video_url = traverse_obj(video, 'embed_url', 'embeddableurl', expected_type=url_or_none) + if not video_url: + continue + elif urllib.parse.urlparse(video_url).hostname == 'slideslive.com': ie_key = SlidesLiveIE video_url = self._create_slideslive_url(video_url, 
video_id, url) - if not video_url: - continue yield self.url_result(video_url, ie_key, video_id) @@ -178,7 +179,7 @@ def _real_extract(self, url): return self.url_result( self._create_slideslive_url(None, video_id, url), SlidesLiveIE, video_id) elif re.match(r'^[\w-]{11}$', video_id): - self.url_result(video_id, 'Youtube', video_id) + return self.url_result(video_id, 'Youtube', video_id) else: raise ExtractorError('Unable to extract without VideoKen API response') diff --git a/yt_dlp/extractor/vidly.py b/yt_dlp/extractor/vidly.py new file mode 100644 index 000000000..49a196041 --- /dev/null +++ b/yt_dlp/extractor/vidly.py @@ -0,0 +1,83 @@ +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + mimetype2ext, + url_or_none, +) +from ..utils.traversal import traverse_obj + + +class VidlyIE(InfoExtractor): + _VALID_URL = r'https?://(?:vid\.ly/|(?:s\.)?vid\.ly/embeded\.html\?(?:[^#]+&)?link=)(?P<id>\w+)' + _EMBED_REGEX = [r'<script[^>]+\bsrc=[\'"](?P<url>(?:https?:)?//vid\.ly/\w+/embed[^\'"]+)', + r'<iframe[^>]+\bsrc=[\'"](?P<url>(?:https?:)?//(?:s\.)?vid\.ly/embeded\.html\?(?:[^#\'"]+&)?link=\w+[^\'"]+)'] + _TESTS = [{ + # JWPlayer 7, Embeds forbidden + 'url': 'https://vid.ly/2i3o9j/embed', + 'info_dict': { + 'id': '2i3o9j', + 'ext': 'mp4', + 'title': '2i3o9j', + 'thumbnail': r're:https://\w+\.cloudfront\.net/', + }, + }, { + # JWPlayer 6 + 'url': 'http://s.vid.ly/embeded.html?link=jw_test&new=1&autoplay=true&controls=true', + 'info_dict': { + 'id': 'jw_test', + 'ext': 'mp4', + 'title': '2x8m8t', + 'thumbnail': r're:https://\w+\.cloudfront\.net/', + }, + }, { + # Vidlyplayer + 'url': 'https://vid.ly/7x0e6l', + 'info_dict': { + 'id': '7x0e6l', + 'ext': 'mp4', + 'title': '7x0e6l', + }, + }] + _WEBPAGE_TESTS = [{ + 'url': 'https://www.petfinder.com/dog/gus-57378930/tn/ooltewah/furever-furkids-rescue-tn592/', + 'info_dict': { + 'id': 'w8p5b0', + 'ext': 'mp4', + 'title': 'w8p5b0', + 'thumbnail': r're:https://\w+\.cloudfront\.net/', + } }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + embed_script = self._download_webpage( + f'https://vid.ly/{video_id}/embed', video_id, headers={'Referer': 'https://vid.ly/'}) + player = self._search_json(r'initCallback\(', embed_script, 'player', video_id) + + player_type = player.get('player') or '' + if player_type.startswith('jwplayer'): + return self._parse_jwplayer_data(player['config'], video_id) + elif not player_type.startswith('vidly'): + raise ExtractorError(f'Unknown player type {player_type!r}') + + formats = [] + ext = mimetype2ext(traverse_obj(player, ('config', 'type'))) + for source, fid in [('source', 'sd'), ('source_hd', 'hd')]: + if traverse_obj(player, ('config', source, {url_or_none})): + formats.append({ + 'url': player['config'][source], + 'format_id': f'http-{fid}', + 'ext': ext, + }) + # Has higher quality formats + formats.extend(self._extract_m3u8_formats( + f'https://d3fenhwk93s16g.cloudfront.net/{video_id}/hls.m3u8', video_id, + fatal=False, note='Requesting higher quality m3u8 formats', + errnote='No higher quality m3u8 formats found') or []) + + return { + 'id': video_id, + 'title': video_id, + 'formats': formats, + } diff --git a/yt_dlp/extractor/vimeo.py b/yt_dlp/extractor/vimeo.py index e72fa50fa..e5e8144bb 100644 --- a/yt_dlp/extractor/vimeo.py +++ b/yt_dlp/extractor/vimeo.py @@ -37,14 +37,14 @@ class VimeoBaseInfoExtractor(InfoExtractor): @staticmethod def _smuggle_referrer(url, referrer_url): - return smuggle_url(url, {'http_headers': {'Referer': referrer_url}}) + return smuggle_url(url, {'referer': referrer_url}) 
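+ # NOTE: smuggle_url() stashes this dict in the URL fragment as '#__youtubedl_smuggle=' + JSON, + # and _unsmuggle_headers() below pops the 'referer' key back out into a Referer header, + # so the referrer survives being passed around as a plain URL string.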
def _unsmuggle_headers(self, url): """@returns (url, smuggled_data, headers)""" url, data = unsmuggle_url(url, {}) headers = self.get_param('http_headers').copy() - if 'http_headers' in data: - headers.update(data['http_headers']) + if 'referer' in data: + headers['Referer'] = data['referer'] return url, data, headers def _perform_login(self, username, password): diff --git a/yt_dlp/extractor/vimple.py b/yt_dlp/extractor/vimple.py deleted file mode 100644 index fdccf465e..000000000 --- a/yt_dlp/extractor/vimple.py +++ /dev/null @@ -1,58 +0,0 @@ -from .common import InfoExtractor -from ..utils import int_or_none - - -class SprutoBaseIE(InfoExtractor): - def _extract_spruto(self, spruto, video_id): - playlist = spruto['playlist'][0] - title = playlist['title'] - video_id = playlist.get('videoId') or video_id - thumbnail = playlist.get('posterUrl') or playlist.get('thumbnailUrl') - duration = int_or_none(playlist.get('duration')) - - formats = [{ - 'url': f['url'], - } for f in playlist['video']] - - return { - 'id': video_id, - 'title': title, - 'thumbnail': thumbnail, - 'duration': duration, - 'formats': formats, - } - - -class VimpleIE(SprutoBaseIE): - IE_DESC = 'Vimple - one-click video hosting' - _VALID_URL = r'https?://(?:player\.vimple\.(?:ru|co)/iframe|vimple\.(?:ru|co))/(?P<id>[\da-f-]{32,36})' - _TESTS = [{ - 'url': 'http://vimple.ru/c0f6b1687dcd4000a97ebe70068039cf', - 'md5': '2e750a330ed211d3fd41821c6ad9a279', - 'info_dict': { - 'id': 'c0f6b168-7dcd-4000-a97e-be70068039cf', - 'ext': 'mp4', - 'title': 'Sunset', - 'duration': 20, - 'thumbnail': r're:https?://.*?\.jpg', - }, - }, { - 'url': 'http://player.vimple.ru/iframe/52e1beec-1314-4a83-aeac-c61562eadbf9', - 'only_matching': True, - }, { - 'url': 'http://vimple.co/04506a053f124483b8fb05ed73899f19', - 'only_matching': True, - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - - webpage = self._download_webpage( - 'http://player.vimple.ru/iframe/%s' % video_id, video_id) - - spruto = self._parse_json( - self._search_regex( - r'sprutoData\s*:\s*({.+?}),\r\n', webpage, 'spruto data'), - video_id) - - return self._extract_spruto(spruto, video_id) diff --git a/yt_dlp/extractor/viously.py b/yt_dlp/extractor/viously.py new file mode 100644 index 000000000..9ec7ed35f --- /dev/null +++ b/yt_dlp/extractor/viously.py @@ -0,0 +1,60 @@ +import base64 +import re + +from .common import InfoExtractor +from ..utils import ( + extract_attributes, + int_or_none, + parse_iso8601, +) +from ..utils.traversal import traverse_obj + + +class ViouslyIE(InfoExtractor): + _VALID_URL = False + _WEBPAGE_TESTS = [{ 'url': 'http://www.turbo.fr/videos-voiture/454443-turbo-du-07-09-2014-renault-twingo-3-bentley-continental-gt-speed-ces-guide-achat-dacia.html', + 'md5': '37a6c3381599381ff53a7e1e0575c0bc', + 'info_dict': { + 'id': 'F_xQzS2jwb3', + 'ext': 'mp4', + 'title': 'Turbo du 07/09/2014\xa0: Renault Twingo 3, Bentley Continental GT Speed, CES, Guide Achat Dacia...', + 'description': 'Turbo du 07/09/2014\xa0: Renault Twingo 3, Bentley Continental GT Speed, CES, Guide Achat Dacia...', + 'age_limit': 0, + 'upload_date': '20230328', + 'timestamp': 1680037507, + 'duration': 3716, + 'categories': ['motors'], + } }] + + def _extract_from_webpage(self, url, webpage): + viously_players = re.findall(r'<div[^>]*class="(?:[^"]*\s)?v(?:iou)?sly-player(?:\s[^"]*)?"[^>]*>', webpage) + if not viously_players: + return + + def custom_decode(text): + STANDARD_ALPHABET = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/=' + CUSTOM_ALPHABET = 
'VIOUSLYABCDEFGHJKMNPQRTWXZviouslyabcdefghjkmnpqrtwxz9876543210+/=' + data = base64.b64decode(text.translate(str.maketrans(CUSTOM_ALPHABET, STANDARD_ALPHABET))) + return data.decode('utf-8').strip('\x00') + + for video_id in traverse_obj(viously_players, (..., {extract_attributes}, 'id')): + formats = self._extract_m3u8_formats( + f'https://www.viously.com/video/hls/{video_id}/index.m3u8', video_id, fatal=False) + if not formats: + continue + data = self._download_json( + f'https://www.viously.com/export/json/{video_id}', video_id, + transform_source=custom_decode, fatal=False) + yield { + 'id': video_id, + 'formats': formats, + **traverse_obj(data, ('video', { + 'title': ('title', {str}), + 'description': ('description', {str}), + 'duration': ('duration', {int_or_none}), + 'timestamp': ('iso_date', {parse_iso8601}), + 'categories': ('category', 'name', {str}, {lambda x: [x] if x else None}), + })), + } diff --git a/yt_dlp/extractor/vk.py b/yt_dlp/extractor/vk.py index 915422817..c12e87362 100644 --- a/yt_dlp/extractor/vk.py +++ b/yt_dlp/extractor/vk.py @@ -97,12 +97,12 @@ class VKIE(VKBaseIE): (?: (?: (?:(?:m|new)\.)?vk\.com/video_| - (?:www\.)?daxab.com/ + (?:www\.)?daxab\.com/ ) ext\.php\?(?P<embed_query>.*?\boid=(?P<oid>-?\d+).*?\bid=(?P<id>\d+).*)| (?: (?:(?:m|new)\.)?vk\.com/(?:.+?\?.*?z=)?(?:video|clip)| - (?:www\.)?daxab.com/embed/ + (?:www\.)?daxab\.com/embed/ ) (?P<videoid>-?\d+_\d+)(?:.*\blist=(?P<list_id>([\da-f]+)|(ln-[\da-zA-Z]+)))? ) diff --git a/yt_dlp/extractor/vocaroo.py b/yt_dlp/extractor/vocaroo.py index d98fbfd2d..e30c9597f 100644 --- a/yt_dlp/extractor/vocaroo.py +++ b/yt_dlp/extractor/vocaroo.py @@ -57,7 +57,7 @@ def _real_extract(self, url): 'title': '', 'url': url, 'ext': 'mp3', - 'timestamp': float_or_none(resp.getheader('x-bz-upload-timestamp'), scale=1000), + 'timestamp': float_or_none(resp.headers.get('x-bz-upload-timestamp'), scale=1000), 'vcodec': 'none', 'http_headers': http_headers, } diff --git a/yt_dlp/extractor/vodlocker.py b/yt_dlp/extractor/vodlocker.py deleted file mode 100644 index b215d6c9d..000000000 --- a/yt_dlp/extractor/vodlocker.py +++ /dev/null @@ -1,73 +0,0 @@ -from .common import InfoExtractor -from ..networking import Request -from ..utils import NO_DEFAULT, ExtractorError, urlencode_postdata - - -class VodlockerIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?vodlocker\.(?:com|city)/(?:embed-)?(?P<id>[0-9a-zA-Z]+)(?:\..*?)?' - - _TESTS = [{ - 'url': 'http://vodlocker.com/e8wvyzz4sl42', - 'md5': 'ce0c2d18fa0735f1bd91b69b0e54aacf', - 'info_dict': { - 'id': 'e8wvyzz4sl42', - 'ext': 'mp4', - 'title': 'Germany vs Brazil', - 'thumbnail': r're:http://.*\.jpg', - }, - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - - if any(p in webpage for p in ( - '>THIS FILE WAS DELETED<', - '>File Not Found<', - 'The file you were looking for could not be found, sorry for any inconvenience.<', - '>The file was removed')): - raise ExtractorError('Video %s does not exist' % video_id, expected=True) - - fields = self._hidden_inputs(webpage) - - if fields['op'] == 'download1': - self._sleep(3, video_id) # they do detect when requests happen too fast! 
- post = urlencode_postdata(fields) - req = Request(url, post) - req.headers['Content-type'] = 'application/x-www-form-urlencoded' - webpage = self._download_webpage( - req, video_id, 'Downloading video page') - - def extract_file_url(html, default=NO_DEFAULT): - return self._search_regex( - r'file:\s*"(http[^\"]+)",', html, 'file url', default=default) - - video_url = extract_file_url(webpage, default=None) - - if not video_url: - embed_url = self._search_regex( - r'<iframe[^>]+src=(["\'])(?P<url>(?:https?://)?vodlocker\.(?:com|city)/embed-.+?)\1', - webpage, 'embed url', group='url') - embed_webpage = self._download_webpage( - embed_url, video_id, 'Downloading embed webpage') - video_url = extract_file_url(embed_webpage) - thumbnail_webpage = embed_webpage - else: - thumbnail_webpage = webpage - - title = self._search_regex( - r'id="file_title".*?>\s*(.*?)\s*<(?:br|span)', webpage, 'title') - thumbnail = self._search_regex( - r'image:\s*"(http[^\"]+)",', thumbnail_webpage, 'thumbnail', fatal=False) - - formats = [{ - 'format_id': 'sd', - 'url': video_url, - }] - - return { - 'id': video_id, - 'title': title, - 'thumbnail': thumbnail, - 'formats': formats, - } diff --git a/yt_dlp/extractor/voicerepublic.py b/yt_dlp/extractor/voicerepublic.py deleted file mode 100644 index 47502afb4..000000000 --- a/yt_dlp/extractor/voicerepublic.py +++ /dev/null @@ -1,59 +0,0 @@ -from .common import InfoExtractor -from ..compat import compat_str -from ..utils import ( - ExtractorError, - determine_ext, - int_or_none, - urljoin, -) - - -class VoiceRepublicIE(InfoExtractor): - _VALID_URL = r'https?://voicerepublic\.com/(?:talks|embed)/(?P<id>[0-9a-z-]+)' - _TESTS = [{ - 'url': 'http://voicerepublic.com/talks/watching-the-watchers-building-a-sousveillance-state', - 'md5': 'b9174d651323f17783000876347116e3', - 'info_dict': { - 'id': '2296', - 'display_id': 'watching-the-watchers-building-a-sousveillance-state', - 'ext': 'm4a', - 'title': 'Watching the Watchers: Building a Sousveillance State', - 'description': 'Secret surveillance programs have metadata too. 
The people and companies that operate secret surveillance programs can be surveilled.', - 'duration': 1556, - 'view_count': int, - } - }, { - 'url': 'http://voicerepublic.com/embed/watching-the-watchers-building-a-sousveillance-state', - 'only_matching': True, - }] - - def _real_extract(self, url): - display_id = self._match_id(url) - - webpage = self._download_webpage(url, display_id) - - if '>Queued for processing, please stand by...<' in webpage: - raise ExtractorError( - 'Audio is still queued for processing', expected=True) - - talk = self._parse_json(self._search_regex( - r'initialSnapshot\s*=\s*({.+?});', - webpage, 'talk'), display_id)['talk'] - title = talk['title'] - formats = [{ - 'url': urljoin(url, talk_url), - 'format_id': format_id, - 'ext': determine_ext(talk_url) or format_id, - 'vcodec': 'none', - } for format_id, talk_url in talk['media_links'].items()] - - return { - 'id': compat_str(talk.get('id') or display_id), - 'display_id': display_id, - 'title': title, - 'description': talk.get('teaser'), - 'thumbnail': talk.get('image_url'), - 'duration': int_or_none(talk.get('archived_duration')), - 'view_count': int_or_none(talk.get('play_count')), - 'formats': formats, - } diff --git a/yt_dlp/extractor/voot.py b/yt_dlp/extractor/voot.py index b19a27934..ef77bedd2 100644 --- a/yt_dlp/extractor/voot.py +++ b/yt_dlp/extractor/voot.py @@ -81,6 +81,7 @@ def _real_initialize(self): class VootIE(VootBaseIE): + _WORKING = False _VALID_URL = r'''(?x) (?: voot:| @@ -169,6 +170,7 @@ def _real_extract(self, url): class VootSeriesIE(VootBaseIE): + _WORKING = False _VALID_URL = r'https?://(?:www\.)?voot\.com/shows/[^/]+/(?P<id>\d{3,})' _TESTS = [{ 'url': 'https://www.voot.com/shows/chakravartin-ashoka-samrat/100002', diff --git a/yt_dlp/extractor/voxmedia.py b/yt_dlp/extractor/voxmedia.py index f9362002f..f36908754 100644 --- a/yt_dlp/extractor/voxmedia.py +++ b/yt_dlp/extractor/voxmedia.py @@ -51,7 +51,7 @@ def _real_extract(self, url): info['duration'] = int_or_none(asset.get('duration')) return info - for provider_video_type in ('ooyala', 'youtube', 'brightcove'): + for provider_video_type in ('youtube', 'brightcove'): provider_video_id = video_data.get('%s_id' % provider_video_type) if not provider_video_id: continue @@ -177,7 +177,6 @@ def _real_extract(self, url): def create_entry(provider_video_id, provider_video_type, title=None, description=None): video_url = { 'youtube': '%s', - 'ooyala': 'ooyala:%s', 'volume': 'http://volume.vox-cdn.com/embed/%s', }[provider_video_type] % provider_video_id return { @@ -205,11 +204,6 @@ def create_entry(provider_video_id, provider_video_type, title=None, description provider_video_id, provider_video_type, video_data.get('title'), video_data.get('description'))) - provider_video_id = self._search_regex( - r'data-ooyala-id="([^"]+)"', webpage, 'ooyala id', default=None) - if provider_video_id: - entries.append(create_entry(provider_video_id, 'ooyala')) - volume_uuid = self._search_regex( r'data-volume-uuid="([^"]+)"', webpage, 'volume uuid', default=None) if volume_uuid: diff --git a/yt_dlp/extractor/vrak.py b/yt_dlp/extractor/vrak.py deleted file mode 100644 index 198c0a294..000000000 --- a/yt_dlp/extractor/vrak.py +++ /dev/null @@ -1,77 +0,0 @@ -import re - -from .common import InfoExtractor -from .brightcove import BrightcoveNewIE -from ..utils import ( - int_or_none, - parse_age_limit, - smuggle_url, - unescapeHTML, -) - - -class VrakIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?vrak\.tv/videos\?.*?\btarget=(?P<id>[\d.]+)' - _TEST = { - 'url': 
'http://www.vrak.tv/videos?target=1.2306782&filtre=emission&id=1.1806721', - 'info_dict': { - 'id': '5345661243001', - 'ext': 'mp4', - 'title': 'Obésité, film de hockey et Roseline Filion', - 'timestamp': 1488492126, - 'upload_date': '20170302', - 'uploader_id': '2890187628001', - 'creator': 'VRAK.TV', - 'age_limit': 8, - 'series': 'ALT (Actualité Légèrement Tordue)', - 'episode': 'Obésité, film de hockey et Roseline Filion', - 'tags': list, - }, - 'params': { - 'skip_download': True, - }, - } - BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/2890187628001/default_default/index.html?videoId=%s' - - def _real_extract(self, url): - video_id = self._match_id(url) - - webpage = self._download_webpage(url, video_id) - - title = self._html_search_regex( - r'<h\d\b[^>]+\bclass=["\']videoTitle["\'][^>]*>([^<]+)', - webpage, 'title', default=None) or self._og_search_title(webpage) - - content = self._parse_json( - self._search_regex( - r'data-player-options-content=(["\'])(?P<content>{.+?})\1', - webpage, 'content', default='{}', group='content'), - video_id, transform_source=unescapeHTML) - - ref_id = content.get('refId') or self._search_regex( - r'refId":"([^&]+)"', webpage, 'ref id') - - brightcove_id = self._search_regex( - r'''(?x) - java\.lang\.String\s+value\s*=\s*["']brightcove\.article\.\d+\.%s - [^>]* - java\.lang\.String\s+value\s*=\s*["'](\d+) - ''' % re.escape(ref_id), webpage, 'brightcove id') - - return { - '_type': 'url_transparent', - 'ie_key': BrightcoveNewIE.ie_key(), - 'url': smuggle_url( - self.BRIGHTCOVE_URL_TEMPLATE % brightcove_id, - {'geo_countries': ['CA']}), - 'id': brightcove_id, - 'description': content.get('description'), - 'creator': content.get('brand'), - 'age_limit': parse_age_limit(content.get('rating')), - 'series': content.get('showName') or content.get( - 'episodeName'), # this is intentional - 'season_number': int_or_none(content.get('seasonNumber')), - 'episode': title, - 'episode_number': int_or_none(content.get('episodeNumber')), - 'tags': content.get('tags', []), - } diff --git a/yt_dlp/extractor/vrv.py b/yt_dlp/extractor/vrv.py deleted file mode 100644 index 523c442e6..000000000 --- a/yt_dlp/extractor/vrv.py +++ /dev/null @@ -1,269 +0,0 @@ -import base64 -import hashlib -import hmac -import json -import random -import string -import time -import urllib.parse - -from .common import InfoExtractor -from ..compat import compat_urllib_parse_urlencode -from ..networking.exceptions import HTTPError -from ..utils import ( - ExtractorError, - float_or_none, - int_or_none, - join_nonempty, - traverse_obj, -) - - -class VRVBaseIE(InfoExtractor): - _API_DOMAIN = None - _API_PARAMS = {} - _CMS_SIGNING = {} - _TOKEN = None - _TOKEN_SECRET = '' - - def _call_api(self, path, video_id, note, data=None): - # https://tools.ietf.org/html/rfc5849#section-3 - base_url = self._API_DOMAIN + '/core/' + path - query = [ - ('oauth_consumer_key', self._API_PARAMS['oAuthKey']), - ('oauth_nonce', ''.join(random.choices(string.ascii_letters, k=32))), - ('oauth_signature_method', 'HMAC-SHA1'), - ('oauth_timestamp', int(time.time())), - ] - if self._TOKEN: - query.append(('oauth_token', self._TOKEN)) - encoded_query = compat_urllib_parse_urlencode(query) - headers = self.geo_verification_headers() - if data: - data = json.dumps(data).encode() - headers['Content-Type'] = 'application/json' - base_string = '&'.join([ - 'POST' if data else 'GET', - urllib.parse.quote(base_url, ''), - urllib.parse.quote(encoded_query, '')]) - oauth_signature = base64.b64encode(hmac.new( 
(self._API_PARAMS['oAuthSecret'] + '&' + self._TOKEN_SECRET).encode('ascii'), - base_string.encode(), hashlib.sha1).digest()).decode() - encoded_query += '&oauth_signature=' + urllib.parse.quote(oauth_signature, '') - try: - return self._download_json( - '?'.join([base_url, encoded_query]), video_id, - note='Downloading %s JSON metadata' % note, headers=headers, data=data) - except ExtractorError as e: - if isinstance(e.cause, HTTPError) and e.cause.status == 401: - raise ExtractorError(json.loads(e.cause.response.read().decode())['message'], expected=True) - raise - - def _call_cms(self, path, video_id, note): - if not self._CMS_SIGNING: - index = self._call_api('index', video_id, 'CMS Signing') - self._CMS_SIGNING = index.get('cms_signing') or {} - if not self._CMS_SIGNING: - for signing_policy in index.get('signing_policies', []): - signing_path = signing_policy.get('path') - if signing_path and signing_path.startswith('/cms/'): - name, value = signing_policy.get('name'), signing_policy.get('value') - if name and value: - self._CMS_SIGNING[name] = value - return self._download_json( - self._API_DOMAIN + path, video_id, query=self._CMS_SIGNING, - note='Downloading %s JSON metadata' % note, headers=self.geo_verification_headers()) - - def _get_cms_resource(self, resource_key, video_id): - return self._call_api( - 'cms_resource', video_id, 'resource path', data={ - 'resource_key': resource_key, - })['__links__']['cms_resource']['href'] - - def _extract_vrv_formats(self, url, video_id, stream_format, audio_lang, hardsub_lang): - if not url or stream_format not in ('hls', 'dash', 'adaptive_hls'): - return [] - format_id = join_nonempty( - stream_format, - audio_lang and 'audio-%s' % audio_lang, - hardsub_lang and 'hardsub-%s' % hardsub_lang) - if 'hls' in stream_format: - adaptive_formats = self._extract_m3u8_formats( - url, video_id, 'mp4', m3u8_id=format_id, - note='Downloading %s information' % format_id, - fatal=False) - elif stream_format == 'dash': - adaptive_formats = self._extract_mpd_formats( - url, video_id, mpd_id=format_id, - note='Downloading %s information' % format_id, - fatal=False) - if audio_lang: - for f in adaptive_formats: - if f.get('acodec') != 'none': - f['language'] = audio_lang - return adaptive_formats - - def _set_api_params(self): - webpage = self._download_webpage( - 'https://vrv.co/', None, headers=self.geo_verification_headers()) - self._API_PARAMS = self._parse_json(self._search_regex( - [ - r'window\.__APP_CONFIG__\s*=\s*({.+?})(?:</script>|;)', - r'window\.__APP_CONFIG__\s*=\s*({.+})' - ], webpage, 'app config'), None)['cxApiParams'] - self._API_DOMAIN = self._API_PARAMS.get('apiDomain', 'https://api.vrv.co') - - -class VRVIE(VRVBaseIE): - IE_NAME = 'vrv' - _VALID_URL = r'https?://(?:www\.)?vrv\.co/watch/(?P<id>[A-Z0-9]+)' - _TESTS = [{ - 'url': 'https://vrv.co/watch/GR9PNZ396/Hidden-America-with-Jonah-Ray:BOSTON-WHERE-THE-PAST-IS-THE-PRESENT', - 'info_dict': { - 'id': 'GR9PNZ396', - 'ext': 'mp4', - 'title': 'BOSTON: WHERE THE PAST IS THE PRESENT', - 'description': 'md5:4ec8844ac262ca2df9e67c0983c6b83f', - 'uploader_id': 'seeso', - }, - 'params': { - # m3u8 download - 'skip_download': True, - }, - }, { - # movie listing - 'url': 'https://vrv.co/watch/G6NQXZ1J6/Lily-CAT', - 'info_dict': { - 'id': 'G6NQXZ1J6', - 'title': 'Lily C.A.T', - 'description': 'md5:988b031e7809a6aeb60968be4af7db07', - }, - 'playlist_count': 2, - }] - _NETRC_MACHINE = 'vrv' - - def _perform_login(self, username, password): - token_credentials = self._call_api( - 'authenticate/by:credentials', None, 
-                'email': username,
-                'password': password,
-            })
-        self._TOKEN = token_credentials['oauth_token']
-        self._TOKEN_SECRET = token_credentials['oauth_token_secret']
-
-    def _initialize_pre_login(self):
-        return self._set_api_params()
-
-    def _real_extract(self, url):
-        video_id = self._match_id(url)
-
-        object_data = self._call_cms(self._get_cms_resource(
-            'cms:/objects/' + video_id, video_id), video_id, 'object')['items'][0]
-        resource_path = object_data['__links__']['resource']['href']
-        video_data = self._call_cms(resource_path, video_id, 'video')
-        title = video_data['title']
-        description = video_data.get('description')
-
-        if video_data.get('__class__') == 'movie_listing':
-            items = self._call_cms(
-                video_data['__links__']['movie_listing/movies']['href'],
-                video_id, 'movie listing').get('items') or []
-            if len(items) != 1:
-                entries = []
-                for item in items:
-                    item_id = item.get('id')
-                    if not item_id:
-                        continue
-                    entries.append(self.url_result(
-                        'https://vrv.co/watch/' + item_id,
-                        self.ie_key(), item_id, item.get('title')))
-                return self.playlist_result(entries, video_id, title, description)
-            video_data = items[0]
-
-        streams_path = video_data['__links__'].get('streams', {}).get('href')
-        if not streams_path:
-            self.raise_login_required()
-        streams_json = self._call_cms(streams_path, video_id, 'streams')
-
-        audio_locale = streams_json.get('audio_locale')
-        formats = []
-        for stream_type, streams in streams_json.get('streams', {}).items():
-            if stream_type in ('adaptive_hls', 'adaptive_dash'):
-                for stream in streams.values():
-                    formats.extend(self._extract_vrv_formats(
-                        stream.get('url'), video_id, stream_type.split('_')[1],
-                        audio_locale, stream.get('hardsub_locale')))
-
-        subtitles = {}
-        for k in ('captions', 'subtitles'):
-            for subtitle in streams_json.get(k, {}).values():
-                subtitle_url = subtitle.get('url')
-                if not subtitle_url:
-                    continue
-                subtitles.setdefault(subtitle.get('locale', 'en-US'), []).append({
-                    'url': subtitle_url,
-                    'ext': subtitle.get('format', 'ass'),
-                })
-
-        thumbnails = []
-        for thumbnail in traverse_obj(video_data, ('images', 'thumbnail', ..., ...)) or []:
-            thumbnail_url = thumbnail.get('source')
-            if not thumbnail_url:
-                continue
-            thumbnails.append({
-                'url': thumbnail_url,
-                'width': int_or_none(thumbnail.get('width')),
-                'height': int_or_none(thumbnail.get('height')),
-            })
-
-        return {
-            'id': video_id,
-            'title': title,
-            'formats': formats,
-            'subtitles': subtitles,
-            'thumbnails': thumbnails,
-            'description': description,
-            'duration': float_or_none(video_data.get('duration_ms'), 1000),
-            'uploader_id': video_data.get('channel_id'),
-            'series': video_data.get('series_title'),
-            'season': video_data.get('season_title'),
-            'season_number': int_or_none(video_data.get('season_number')),
-            'season_id': video_data.get('season_id'),
-            'episode': title,
-            'episode_number': int_or_none(video_data.get('episode_number')),
-            'episode_id': video_data.get('production_episode_id'),
-        }
-
-
-class VRVSeriesIE(VRVBaseIE):
-    IE_NAME = 'vrv:series'
-    _VALID_URL = r'https?://(?:www\.)?vrv\.co/series/(?P<id>[A-Z0-9]+)'
-    _TEST = {
-        'url': 'https://vrv.co/series/G68VXG3G6/The-Perfect-Insider',
-        'info_dict': {
-            'id': 'G68VXG3G6',
-        },
-        'playlist_mincount': 11,
-    }
-
-    def _initialize_pre_login(self):
-        return self._set_api_params()
-
-    def _real_extract(self, url):
-        series_id = self._match_id(url)
-
-        seasons_path = self._get_cms_resource(
-            'cms:/seasons?series_id=' + series_id, series_id)
-        seasons_data = self._call_cms(seasons_path, series_id, 'seasons')
-
-        entries = []
-        for season in seasons_data.get('items', []):
-            episodes_path = season['__links__']['season/episodes']['href']
-            episodes = self._call_cms(episodes_path, series_id, 'episodes')
-            for episode in episodes.get('items', []):
-                episode_id = episode['id']
-                entries.append(self.url_result(
-                    'https://vrv.co/watch/' + episode_id,
-                    'VRV', episode_id, episode.get('title')))
-
-        return self.playlist_result(entries, series_id)
diff --git a/yt_dlp/extractor/vshare.py b/yt_dlp/extractor/vshare.py
deleted file mode 100644
index 443ed43cc..000000000
--- a/yt_dlp/extractor/vshare.py
+++ /dev/null
@@ -1,57 +0,0 @@
-from .common import InfoExtractor
-from ..utils import ExtractorError, decode_packed_codes
-
-
-class VShareIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:www\.)?vshare\.io/[dv]/(?P<id>[^/?#&]+)'
-    _EMBED_REGEX = [r'<iframe[^>]+?src=["\'](?P<url>(?:https?:)?//(?:www\.)?vshare\.io/v/[^/?#&]+)']
-    _TESTS = [{
-        'url': 'https://vshare.io/d/0f64ce6',
-        'md5': '17b39f55b5497ae8b59f5fbce8e35886',
-        'info_dict': {
-            'id': '0f64ce6',
-            'title': 'vl14062007715967',
-            'ext': 'mp4',
-        }
-    }, {
-        'url': 'https://vshare.io/v/0f64ce6/width-650/height-430/1',
-        'only_matching': True,
-    }]
-
-    def _extract_packed(self, webpage):
-        packed = self._search_regex(
-            r'(eval\(function.+)', webpage, 'packed code')
-        unpacked = decode_packed_codes(packed)
-        digits = self._search_regex(r'\[([\d,]+)\]', unpacked, 'digits')
-        digits = [int(digit) for digit in digits.split(',')]
-        key_digit = self._search_regex(
-            r'fromCharCode\(.+?(\d+)\)}', unpacked, 'key digit')
-        chars = [chr(d - int(key_digit)) for d in digits]
-        return ''.join(chars)
-
-    def _real_extract(self, url):
-        video_id = self._match_id(url)
-
-        webpage = self._download_webpage(
-            'https://vshare.io/v/%s/width-650/height-430/1' % video_id,
-            video_id, headers={'Referer': url})
-
-        title = self._html_extract_title(webpage)
-        title = title.split(' - ')[0]
-
-        error = self._html_search_regex(
-            r'(?s)<div[^>]+\bclass=["\']xxx-error[^>]+>(.+?)</div>', webpage,
-            'error', default=None)
-        if error:
-            raise ExtractorError(error, expected=True)
-
-        info = self._parse_html5_media_entries(
-            url, '<video>%s</video>' % self._extract_packed(webpage),
-            video_id)[0]
-
-        info.update({
-            'id': video_id,
-            'title': title,
-        })
-
-        return info
diff --git a/yt_dlp/extractor/vupload.py b/yt_dlp/extractor/vupload.py
deleted file mode 100644
index 23ea70c77..000000000
--- a/yt_dlp/extractor/vupload.py
+++ /dev/null
@@ -1,52 +0,0 @@
-from .common import InfoExtractor
-from ..utils import (
-    parse_duration,
-    parse_filesize,
-    extract_attributes,
-    int_or_none,
-    js_to_json
-)
-
-
-class VuploadIE(InfoExtractor):
-    _VALID_URL = r'https://vupload\.com/v/(?P<id>[a-z0-9]+)'
-    _TESTS = [{
-        'url': 'https://vupload.com/v/u28d0pl2tphy',
-        'md5': '9b42a4a193cca64d80248e58527d83c8',
-        'info_dict': {
-            'id': 'u28d0pl2tphy',
-            'ext': 'mp4',
-            'description': 'md5:e9e6c0045c78cbf0d5bb19a55ce199fb',
-            'title': 'md5:e9e6c0045c78cbf0d5bb19a55ce199fb',
-        }
-    }]
-
-    def _real_extract(self, url):
-        video_id = self._match_id(url)
-        webpage = self._download_webpage(url, video_id)
-
-        title = self._html_extract_title(webpage)
-        video_json = self._parse_json(self._html_search_regex(r'sources:\s*(.+?]),', webpage, 'video'), video_id, transform_source=js_to_json)
-        formats = []
-        for source in video_json:
-            if source['src'].endswith('.m3u8'):
-                formats.extend(self._extract_m3u8_formats(source['src'], video_id, m3u8_id='hls'))
-        duration = parse_duration(self._html_search_regex(
-            r'<i\s*class=["\']fad\s*fa-clock["\']></i>\s*([\d:]+)\s*</div>', webpage, 'duration', fatal=False))
-        filesize_approx = parse_filesize(self._html_search_regex(
-            r'<i\s*class=["\']fad\s*fa-save["\']></i>\s*([^<]+)\s*</div>', webpage, 'filesize', fatal=False))
-        extra_video_info = extract_attributes(self._html_search_regex(
-            r'(<video[^>]+>)', webpage, 'video_info', fatal=False))
-        description = self._html_search_meta('description', webpage)
-
-        return {
-            'id': video_id,
-            'formats': formats,
-            'duration': duration,
-            'filesize_approx': filesize_approx,
-            'width': int_or_none(extra_video_info.get('width')),
-            'height': int_or_none(extra_video_info.get('height')),
-            'format_id': extra_video_info.get('height', '') + 'p',
-            'title': title,
-            'description': description,
-        }
diff --git a/yt_dlp/extractor/vvvvid.py b/yt_dlp/extractor/vvvvid.py
index ed725a55d..b42ba8537 100644
--- a/yt_dlp/extractor/vvvvid.py
+++ b/yt_dlp/extractor/vvvvid.py
@@ -1,3 +1,4 @@
+import functools
 import re
 
 from .common import InfoExtractor
@@ -14,21 +15,21 @@ class VVVVIDIE(InfoExtractor):
     _VALID_URL = r'%s(?P<show_id>\d+)/[^/]+/(?P<season_id>\d+)/(?P<id>[0-9]+)' % _VALID_URL_BASE
     _TESTS = [{
         # video_type == 'video/vvvvid'
-        'url': 'https://www.vvvvid.it/#!show/434/perche-dovrei-guardarlo-di-dario-moccia/437/489048/ping-pong',
-        'md5': 'b8d3cecc2e981adc3835adf07f6df91b',
+        'url': 'https://www.vvvvid.it/show/498/the-power-of-computing/518/505692/playstation-vr-cambiera-il-nostro-modo-di-giocare',
         'info_dict': {
-            'id': '489048',
+            'id': '505692',
             'ext': 'mp4',
-            'title': 'Ping Pong',
-            'duration': 239,
-            'series': '"Perché dovrei guardarlo?" di Dario Moccia',
-            'season_id': '437',
-            'episode': 'Ping Pong',
-            'episode_number': 1,
-            'episode_id': '3334',
+            'title': 'Playstation VR cambierà il nostro modo di giocare',
+            'duration': 93,
+            'series': 'The Power of Computing',
+            'season_id': '518',
+            'episode': 'Playstation VR cambierà il nostro modo di giocare',
+            'episode_number': None,
+            'episode_id': '4747',
             'view_count': int,
             'like_count': int,
             'repost_count': int,
+            'thumbnail': 'https://static.vvvvid.it/img/zoomin/28CA2409-E663-34F0-2B02E72356556EA3_500k.jpg',
         },
         'params': {
             'skip_download': True,
@@ -36,7 +37,6 @@ class VVVVIDIE(InfoExtractor):
     }, {
         # video_type == 'video/rcs'
         'url': 'https://www.vvvvid.it/#!show/376/death-note-live-action/377/482493/episodio-01',
-        'md5': '33e0edfba720ad73a8782157fdebc648',
         'info_dict': {
             'id': '482493',
             'ext': 'mp4',
@@ -45,6 +45,7 @@ class VVVVIDIE(InfoExtractor):
         'params': {
             'skip_download': True,
         },
+        'skip': 'Every video/rcs is not working even in real website',
     }, {
         # video_type == 'video/youtube'
         'url': 'https://www.vvvvid.it/show/404/one-punch-man/406/486683/trailer',
@@ -55,19 +56,54 @@ class VVVVIDIE(InfoExtractor):
             'title': 'Trailer',
             'upload_date': '20150906',
             'description': 'md5:a5e802558d35247fee285875328c0b80',
-            'uploader_id': 'BandaiVisual',
-            'uploader': 'BANDAI NAMCO Arts Channel',
+            'uploader_id': '@EMOTIONLabelChannel',
+            'uploader': 'EMOTION Label Channel',
+            'episode_number': None,
+            'episode_id': '3115',
+            'view_count': int,
+            'like_count': int,
+            'repost_count': int,
+            'availability': str,
+            'categories': list,
+            'age_limit': 0,
+            'channel': 'EMOTION Label Channel',
+            'channel_follower_count': int,
+            'channel_id': 'UCQ5URCSs1f5Cz9rh-cDGxNQ',
+            'channel_url': 'https://www.youtube.com/channel/UCQ5URCSs1f5Cz9rh-cDGxNQ',
+            'comment_count': int,
+            'duration': 133,
+            'episode': 'Trailer',
+            'heatmap': list,
+            'live_status': 'not_live',
+            'playable_in_embed': True,
+            'season_id': '406',
+            'series': 'One-Punch Man',
+            'tags': list,
+            'uploader_url': 'https://www.youtube.com/@EMOTIONLabelChannel',
+            'thumbnail': 'https://i.ytimg.com/vi/RzmFKUDOUgw/maxresdefault.jpg',
         },
         'params': {
             'skip_download': True,
         },
     }, {
         # video_type == 'video/dash'
-        'url': 'https://www.vvvvid.it/show/683/made-in-abyss/1542/693786/nanachi',
+        'url': 'https://www.vvvvid.it/show/844/le-bizzarre-avventure-di-jojo-vento-aureo/938/527551/golden-wind',
         'info_dict': {
-            'id': '693786',
+            'id': '527551',
             'ext': 'mp4',
-            'title': 'Nanachi',
+            'title': 'Golden Wind',
+            'duration': 1430,
+            'series': 'Le bizzarre avventure di Jojo - Vento Aureo',
+            'season_id': '938',
+            'episode': 'Golden Wind',
+            'episode_number': 1,
+            'episode_id': '9089',
+            'view_count': int,
+            'like_count': int,
+            'repost_count': int,
+            'thumbnail': 'https://static.vvvvid.it/img/thumbs/Dynit/Jojo/Jojo_S05Ep01-t.jpg',
+            'season': 'Season 5',
+            'season_number': 5,
         },
         'params': {
             'skip_download': True,
@@ -79,10 +115,17 @@ class VVVVIDIE(InfoExtractor):
     }]
     _conn_id = None
 
+    @functools.cached_property
+    def _headers(self):
+        return {
+            **self.geo_verification_headers(),
+            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.50 Safari/537.37',
+        }
+
     def _real_initialize(self):
         self._conn_id = self._download_json(
             'https://www.vvvvid.it/user/login',
-            None, headers=self.geo_verification_headers())['data']['conn_id']
+            None, headers=self._headers)['data']['conn_id']
@@ -92,7 +135,7 @@ def _download_info(self, show_id, path, video_id, fatal=True, query=None):
         q.update(query)
         response = self._download_json(
             'https://www.vvvvid.it/vvvvid/ondemand/%s/%s' % (show_id, path),
-            video_id, headers=self.geo_verification_headers(), query=q, fatal=fatal)
+            video_id, headers=self._headers, query=q, fatal=fatal)
         if not (response or fatal):
             return
         if response.get('result') == 'error':
@@ -219,7 +262,7 @@ def metadata_from_url(r_url):
                         embed_code, video_id, 'mp4', m3u8_id='hls', fatal=False))
                 else:
                     formats.extend(self._extract_wowza_formats(
-                        'http://sb.top-ix.org/videomg/_definst_/mp4:%s/playlist.m3u8' % embed_code, video_id))
+                        'http://sb.top-ix.org/videomg/_definst_/mp4:%s/playlist.m3u8' % embed_code, video_id, skip_protocols=['f4m']))
                 metadata_from_url(embed_code)
 
             if not is_youtube:
diff --git a/yt_dlp/extractor/vyborymos.py b/yt_dlp/extractor/vyborymos.py
deleted file mode 100644
index 386518795..000000000
--- a/yt_dlp/extractor/vyborymos.py
+++ /dev/null
@@ -1,52 +0,0 @@
-from .common import InfoExtractor
-from ..compat import compat_str
-
-
-class VyboryMosIE(InfoExtractor):
-    _VALID_URL = r'https?://vybory\.mos\.ru/(?:#precinct/|account/channels\?.*?\bstation_id=)(?P<id>\d+)'
-    _TESTS = [{
-        'url': 'http://vybory.mos.ru/#precinct/13636',
-        'info_dict': {
-            'id': '13636',
-            'ext': 'mp4',
-            'title': 're:^Участковая избирательная комиссия №2231 [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',
-            'description': 'Россия, Москва, улица Введенского, 32А',
-            'is_live': True,
-        },
-        'params': {
-            'skip_download': True,
-        }
-    }, {
-        'url': 'http://vybory.mos.ru/account/channels?station_id=13636',
-        'only_matching': True,
-    }]
-
-    def _real_extract(self, url):
-        station_id = self._match_id(url)
-
-        channels = self._download_json(
-            'http://vybory.mos.ru/account/channels?station_id=%s' % station_id,
-            station_id, 'Downloading channels JSON')
-
-        formats = []
-        for cam_num, (sid, hosts, name, _) in enumerate(channels, 1):
-            for num, host in enumerate(hosts, 1):
-                formats.append({
-                    'url': 'http://%s/master.m3u8?sid=%s' % (host, sid),
-                    'ext': 'mp4',
-                    'format_id': 'camera%d-host%d' % (cam_num, num),
-                    'format_note': '%s, %s' % (name, host),
-                })
-
-        info = self._download_json(
-            'http://vybory.mos.ru/json/voting_stations/%s/%s.json'
-            % (compat_str(station_id)[:3], station_id),
-            station_id, 'Downloading station JSON', fatal=False) or {}
-
-        return {
-            'id': station_id,
-            'title': info.get('name') or station_id,
-            'description': info.get('address'),
-            'is_live': True,
-            'formats': formats,
-        }
diff --git a/yt_dlp/extractor/vzaar.py b/yt_dlp/extractor/vzaar.py
deleted file mode 100644
index 19908a929..000000000
--- a/yt_dlp/extractor/vzaar.py
+++ /dev/null
@@ -1,100 +0,0 @@
-from .common import InfoExtractor
-from ..compat import compat_str
-from ..utils import (
-    int_or_none,
-    float_or_none,
-    unified_timestamp,
-    url_or_none,
-)
-
-
-class VzaarIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:(?:www|view)\.)?vzaar\.com/(?:videos/)?(?P<id>\d+)'
-    _EMBED_REGEX = [r'<iframe[^>]+src=["\'](?P<url>(?:https?:)?//(?:view\.vzaar\.com)/[0-9]+)']
-    _TESTS = [{
-        # HTTP and HLS
-        'url': 'https://vzaar.com/videos/1152805',
-        'md5': 'bde5ddfeb104a6c56a93a06b04901dbf',
-        'info_dict': {
-            'id': '1152805',
-            'ext': 'mp4',
-            'title': 'sample video (public)',
-        },
-    }, {
-        'url': 'https://view.vzaar.com/27272/player',
-        'md5': '3b50012ac9bbce7f445550d54e0508f2',
-        'info_dict': {
-            'id': '27272',
-            'ext': 'mp3',
-            'title': 'MP3',
-        },
-    }, {
-        # hlsAes = true
-        'url': 'https://view.vzaar.com/11379930/player',
-        'info_dict': {
-            'id': '11379930',
-            'ext': 'mp4',
-            'title': 'Videoaula',
-        },
-        'params': {
-            # m3u8 download
-            'skip_download': True,
-        },
-    }, {
-        # with null videoTitle
-        'url': 'https://view.vzaar.com/20313539/download',
-        'only_matching': True,
-    }]
-
-    def _real_extract(self, url):
-        video_id = self._match_id(url)
-        video_data = self._download_json(
-            'http://view.vzaar.com/v2/%s/video' % video_id, video_id)
-
-        title = video_data.get('videoTitle') or video_id
-
-        formats = []
-
-        source_url = url_or_none(video_data.get('sourceUrl'))
-        if source_url:
-            f = {
-                'url': source_url,
-                'format_id': 'http',
-                'quality': 1,
-            }
-            if 'audio' in source_url:
-                f.update({
-                    'vcodec': 'none',
-                    'ext': 'mp3',
-                })
-            else:
-                f.update({
-                    'width': int_or_none(video_data.get('width')),
-                    'height': int_or_none(video_data.get('height')),
-                    'ext': 'mp4',
-                    'fps': float_or_none(video_data.get('fps')),
-                })
-            formats.append(f)
-
-        video_guid = video_data.get('guid')
-        usp = video_data.get('usp')
-        if video_data.get('uspEnabled') and isinstance(video_guid, compat_str) and isinstance(usp, dict):
-            hls_aes = video_data.get('hlsAes')
-            qs = '&'.join('%s=%s' % (k, v) for k, v in usp.items())
-            url_templ = 'http://%%s.vzaar.com/v5/usp%s/%s/%s.ism%%s?' % ('aes' if hls_aes else '', video_guid, video_id)
-            m3u8_formats = self._extract_m3u8_formats(
-                url_templ % ('fable', '/.m3u8') + qs, video_id, 'mp4', 'm3u8_native',
-                m3u8_id='hls', fatal=False)
-            if hls_aes:
-                for f in m3u8_formats:
-                    f['hls_aes'] = {'uri': url_templ % ('goose', '') + qs}
-            formats.extend(m3u8_formats)
-
-        return {
-            'id': video_id,
-            'title': title,
-            'thumbnail': self._proto_relative_url(video_data.get('poster')),
-            'duration': float_or_none(video_data.get('videoDuration')),
-            'timestamp': unified_timestamp(video_data.get('ts')),
-            'formats': formats,
-        }
diff --git a/yt_dlp/extractor/wakanim.py b/yt_dlp/extractor/wakanim.py
deleted file mode 100644
index 155008f8c..000000000
--- a/yt_dlp/extractor/wakanim.py
+++ /dev/null
@@ -1,75 +0,0 @@
-from urllib.parse import unquote
-
-from .common import InfoExtractor
-from ..utils import (
-    merge_dicts,
-    urljoin,
-)
-
-
-class WakanimIE(InfoExtractor):
-    _VALID_URL = r'https://(?:www\.)?wakanim\.tv/[^/]+/v2/catalogue/episode/(?P<id>\d+)'
-    _TESTS = [{
-        'url': 'https://www.wakanim.tv/de/v2/catalogue/episode/2997/the-asterisk-war-omu-staffel-1-episode-02-omu',
-        'info_dict': {
-            'id': '2997',
-            'ext': 'mp4',
-            'title': 'Episode 02',
-            'description': 'md5:2927701ea2f7e901de8bfa8d39b2852d',
-            'series': 'The Asterisk War (OmU.)',
-            'season_number': 1,
-            'episode': 'Episode 02',
-            'episode_number': 2,
-        },
-        'params': {
-            'skip_download': True,
-        },
-    }, {
-        # DRM Protected
-        'url': 'https://www.wakanim.tv/de/v2/catalogue/episode/7843/sword-art-online-alicization-omu-arc-2-folge-15-omu',
-        'only_matching': True,
-    }]
-    _GEO_BYPASS = False
-
-    def _real_extract(self, url):
-        video_id = self._match_id(url)
-
-        webpage = self._download_webpage(url, video_id)
-
-        if 'Geoblocking' in webpage:
-            if '/de/' in url:
-                self.raise_geo_restricted(countries=['DE', 'AT', 'CH'])
-            else:
-                self.raise_geo_restricted(countries=['RU'])
-
-        manifest_url = urljoin(url, self._search_regex(
-            r'file\s*:\s*(["\'])(?P<url>(?:(?!\1).)+)\1', webpage, 'manifest url',
-            group='url'))
-        if not self.get_param('allow_unplayable_formats'):
-            # https://docs.microsoft.com/en-us/azure/media-services/previous/media-services-content-protection-overview#streaming-urls
-            encryption = self._search_regex(
-                r'encryption%3D(c(?:enc|bc(?:s-aapl)?))',
-                manifest_url, 'encryption', default=None)
-            if encryption in ('cenc', 'cbcs-aapl'):
-                self.report_drm(video_id)
-
-        if 'format=mpd-time-cmaf' in unquote(manifest_url):
-            formats = self._extract_mpd_formats(
-                manifest_url, video_id, mpd_id='dash')
-        else:
-            formats = self._extract_m3u8_formats(
-                manifest_url, video_id, 'mp4', entry_protocol='m3u8_native',
-                m3u8_id='hls')
-
-        info = self._search_json_ld(webpage, video_id, default={})
-
-        title = self._search_regex(
-            (r'<h1[^>]+\bclass=["\']episode_h1[^>]+\btitle=(["\'])(?P<title>(?:(?!\1).)+)\1',
-             r'<span[^>]+\bclass=["\']episode_title["\'][^>]*>(?P<title>[^<]+)'),
-            webpage, 'title', default=None, group='title')
-
-        return merge_dicts(info, {
-            'id': video_id,
-            'title': title,
-            'formats': formats,
-        })
diff --git a/yt_dlp/extractor/watchbox.py b/yt_dlp/extractor/watchbox.py
deleted file mode 100644
index c973ca998..000000000
--- a/yt_dlp/extractor/watchbox.py
+++ /dev/null
@@ -1,153 +0,0 @@
-from .common import InfoExtractor
-from ..compat import compat_str
-from ..utils import (
-    int_or_none,
-    js_to_json,
-    strip_or_none,
-    try_get,
-    unescapeHTML,
-    unified_timestamp,
-)
-
-
-class WatchBoxIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:www\.)?watchbox\.de/(?P<kind>serien|filme)/(?:[^/]+/)*[^/]+-(?P<id>\d+)'
-    _TESTS = [{
-        # film
-        'url': 'https://www.watchbox.de/filme/free-jimmy-12325.html',
-        'info_dict': {
-            'id': '341368',
-            'ext': 'mp4',
-            'title': 'Free Jimmy',
-            'description': 'md5:bcd8bafbbf9dc0ef98063d344d7cc5f6',
-            'thumbnail': r're:^https?://.*\.jpg$',
-            'duration': 4890,
-            'age_limit': 16,
-            'release_year': 2009,
-        },
-        'params': {
-            'skip_download': True,
-        },
-        'expected_warnings': ['Failed to download m3u8 information'],
-    }, {
-        # episode
-        'url': 'https://www.watchbox.de/serien/ugly-americans-12231/staffel-1/date-in-der-hoelle-328286.html',
-        'info_dict': {
-            'id': '328286',
-            'ext': 'mp4',
-            'title': 'S01 E01 - Date in der Hölle',
-            'description': 'md5:2f31c74a8186899f33cb5114491dae2b',
-            'thumbnail': r're:^https?://.*\.jpg$',
-            'duration': 1291,
-            'age_limit': 12,
-            'release_year': 2010,
-            'series': 'Ugly Americans',
-            'season_number': 1,
-            'episode': 'Date in der Hölle',
-            'episode_number': 1,
-        },
-        'params': {
-            'skip_download': True,
-        },
-        'expected_warnings': ['Failed to download m3u8 information'],
-    }, {
-        'url': 'https://www.watchbox.de/serien/ugly-americans-12231/staffel-2/der-ring-des-powers-328270',
-        'only_matching': True,
-    }]
-
-    def _real_extract(self, url):
-        mobj = self._match_valid_url(url)
-        kind, video_id = mobj.group('kind', 'id')
-
-        webpage = self._download_webpage(url, video_id)
-
-        player_config = self._parse_json(
-            self._search_regex(
-                r'data-player-conf=(["\'])(?P<data>{.+?})\1', webpage,
-                'player config', default='{}', group='data'),
-            video_id, transform_source=unescapeHTML, fatal=False)
-
-        if not player_config:
-            player_config = self._parse_json(
-                self._search_regex(
-                    r'playerConf\s*=\s*({.+?})\s*;', webpage, 'player config',
-                    default='{}'),
-                video_id, transform_source=js_to_json, fatal=False) or {}
-
-        source = player_config.get('source') or {}
-
-        video_id = compat_str(source.get('videoId') or video_id)
-
-        devapi = self._download_json(
-            'http://api.watchbox.de/devapi/id/%s' % video_id, video_id, query={
-                'format': 'json',
-                'apikey': 'hbbtv',
-            }, fatal=False)
-
-        item = try_get(devapi, lambda x: x['items'][0], dict) or {}
-
-        title = item.get('title') or try_get(
-            item, lambda x: x['movie']['headline_movie'],
-            compat_str) or source['title']
-
-        formats = []
-        hls_url = item.get('media_videourl_hls') or source.get('hls')
-        if hls_url:
-            formats.extend(self._extract_m3u8_formats(
-                hls_url, video_id, 'mp4', entry_protocol='m3u8_native',
-                m3u8_id='hls', fatal=False))
-        dash_url = item.get('media_videourl_wv') or source.get('dash')
-        if dash_url:
-            formats.extend(self._extract_mpd_formats(
-                dash_url, video_id, mpd_id='dash', fatal=False))
-        mp4_url = item.get('media_videourl')
-        if mp4_url:
-            formats.append({
-                'url': mp4_url,
-                'format_id': 'mp4',
-                'width': int_or_none(item.get('width')),
-                'height': int_or_none(item.get('height')),
-                'tbr': int_or_none(item.get('bitrate')),
-            })
-
-        description = strip_or_none(item.get('descr'))
-        thumbnail = item.get('media_content_thumbnail_large') or source.get('poster') or item.get('media_thumbnail')
-        duration = int_or_none(item.get('media_length') or source.get('length'))
-        timestamp = unified_timestamp(item.get('pubDate'))
-        view_count = int_or_none(item.get('media_views'))
-        age_limit = int_or_none(try_get(item, lambda x: x['movie']['fsk']))
-        release_year = int_or_none(try_get(item, lambda x: x['movie']['rel_year']))
-
-        info = {
-            'id': video_id,
-            'title': title,
-            'description': description,
-            'thumbnail': thumbnail,
-            'duration': duration,
-            'timestamp': timestamp,
-            'view_count': view_count,
-            'age_limit': age_limit,
-            'release_year': release_year,
-            'formats': formats,
-        }
-
-        if kind.lower() == 'serien':
-            series = try_get(
-                item, lambda x: x['special']['title'],
-                compat_str) or source.get('format')
-            season_number = int_or_none(self._search_regex(
-                r'^S(\d{1,2})\s*E\d{1,2}', title, 'season number',
-                default=None) or self._search_regex(
-                r'/staffel-(\d+)/', url, 'season number', default=None))
-            episode = source.get('title')
-            episode_number = int_or_none(self._search_regex(
-                r'^S\d{1,2}\s*E(\d{1,2})', title, 'episode number',
-                default=None))
-            info.update({
-                'series': series,
-                'season_number': season_number,
-                'episode': episode,
-                'episode_number': episode_number,
-            })
-
-        return info
diff --git a/yt_dlp/extractor/watchindianporn.py b/yt_dlp/extractor/watchindianporn.py
deleted file mode 100644
index 3ded2d1d4..000000000
--- a/yt_dlp/extractor/watchindianporn.py
+++ /dev/null
@@ -1,65 +0,0 @@
-import re
-
-from .common import InfoExtractor
-from ..utils import parse_duration
-
-
-class WatchIndianPornIE(InfoExtractor):
-    IE_DESC = 'Watch Indian Porn'
-    _VALID_URL = r'https?://(?:www\.)?watchindianporn\.net/(?:[^/]+/)*video/(?P<display_id>[^/]+)-(?P<id>[a-zA-Z0-9]+)\.html'
-    _TEST = {
-        'url': 'http://www.watchindianporn.net/video/hot-milf-from-kerala-shows-off-her-gorgeous-large-breasts-on-camera-RZa2avywNPa.html',
-        'md5': '249589a164dde236ec65832bfce17440',
-        'info_dict': {
-            'id': 'RZa2avywNPa',
-            'display_id': 'hot-milf-from-kerala-shows-off-her-gorgeous-large-breasts-on-camera',
-            'ext': 'mp4',
-            'title': 'Hot milf from kerala shows off her gorgeous large breasts on camera',
-            'thumbnail': r're:^https?://.*\.jpg$',
-            'duration': 226,
-            'view_count': int,
-            'categories': list,
-            'age_limit': 18,
-        }
-    }
-
-    def _real_extract(self, url):
-        mobj = self._match_valid_url(url)
-        video_id = mobj.group('id')
-        display_id = mobj.group('display_id')
-
-        webpage = self._download_webpage(url, display_id)
-
-        info_dict = self._parse_html5_media_entries(url, webpage, video_id)[0]
-
-        title = self._html_search_regex((
-            r'<title>(.+?)\s*-\s*Indian\s+Porn</title>',
-            r'<h4>(.+?)</h4>'
-        ), webpage, 'title')
-
-        duration = parse_duration(self._search_regex(
-            r'Time:\s*<strong>\s*(.+?)\s*</strong>',
-            webpage, 'duration', fatal=False))
-
-        view_count = int(self._search_regex(
-            r'(?s)Time:\s*<strong>.*?</strong>.*?<strong>\s*(\d+)\s*</strong>',
-            webpage, 'view count', fatal=False))
-
-        categories = re.findall(
-            r'<a[^>]+class=[\'"]categories[\'"][^>]*>\s*([^<]+)\s*</a>',
-            webpage)
-
-        info_dict.update({
-            'id': video_id,
-            'display_id': display_id,
-            'http_headers': {
-                'Referer': url,
-            },
-            'title': title,
-            'duration': duration,
-            'view_count': view_count,
-            'categories': categories,
-            'age_limit': 18,
-        })
-
-        return info_dict
diff --git a/yt_dlp/extractor/weibo.py b/yt_dlp/extractor/weibo.py
index b0c3052b6..2fca745aa 100644
--- a/yt_dlp/extractor/weibo.py
+++ b/yt_dlp/extractor/weibo.py
@@ -1,3 +1,4 @@
+import json
 import random
 import itertools
 import urllib.parse
@@ -18,24 +19,33 @@ class WeiboBaseIE(InfoExtractor):
-    def _update_visitor_cookies(self, video_id):
+    def _update_visitor_cookies(self, visitor_url, video_id):
+        headers = {'Referer': visitor_url}
+        chrome_ver = self._search_regex(
+            r'Chrome/(\d+)', self.get_param('http_headers')['User-Agent'], 'user agent version', default='90')
         visitor_data = self._download_json(
             'https://passport.weibo.com/visitor/genvisitor', video_id,
             note='Generating first-visit guest request',
-            transform_source=strip_jsonp,
+            headers=headers, transform_source=strip_jsonp,
             data=urlencode_postdata({
                 'cb': 'gen_callback',
-                'fp': '{"os":"2","browser":"Gecko57,0,0,0","fonts":"undefined","screenInfo":"1440*900*24","plugins":""}',
-            }))
+                'fp': json.dumps({
+                    'os': '1',
+                    'browser': f'Chrome{chrome_ver},0,0,0',
+                    'fonts': 'undefined',
+                    'screenInfo': '1920*1080*24',
+                    'plugins': ''
+                }, separators=(',', ':'))}))['data']
 
         self._download_webpage(
             'https://passport.weibo.com/visitor/visitor', video_id,
             note='Running first-visit callback to get guest cookies',
-            query={
+            headers=headers, query={
                 'a': 'incarnate',
-                't': visitor_data['data']['tid'],
-                'w': 2,
-                'c': '%03d' % visitor_data['data']['confidence'],
+                't': visitor_data['tid'],
+                'w': 3 if visitor_data.get('new_tid') else 2,
+                'c': f'{visitor_data.get("confidence", 100):03d}',
+                'gc': '',
                 'cb': 'cross_domain',
                 'from': 'weibo',
                 '_rand': random.random(),
@@ -44,7 +54,7 @@ def _update_visitor_cookies(self, video_id):
     def _weibo_download_json(self, url, video_id, *args, fatal=True, note='Downloading JSON metadata', **kwargs):
         webpage, urlh = self._download_webpage_handle(url, video_id, *args, fatal=fatal, note=note, **kwargs)
         if urllib.parse.urlparse(urlh.url).netloc == 'passport.weibo.com':
-            self._update_visitor_cookies(video_id)
+            self._update_visitor_cookies(urlh.url, video_id)
             webpage = self._download_webpage(url, video_id, *args, fatal=fatal, note=note, **kwargs)
         return self._parse_json(webpage, video_id, fatal=fatal)
diff --git a/yt_dlp/extractor/weverse.py b/yt_dlp/extractor/weverse.py
index bbf62856a..c94ca9db9 100644
--- a/yt_dlp/extractor/weverse.py
+++ b/yt_dlp/extractor/weverse.py
@@ -45,10 +45,10 @@ def _perform_login(self, username, password):
             'x-acc-trace-id': str(uuid.uuid4()),
             'x-clog-user-device-id': str(uuid.uuid4()),
         }
-        check_username = self._download_json(
-            f'{self._ACCOUNT_API_BASE}/signup/email/status', None,
-            note='Checking username', query={'email': username}, headers=headers)
-        if not check_username.get('hasPassword'):
+        valid_username = traverse_obj(self._download_json(
+            f'{self._ACCOUNT_API_BASE}/signup/email/status', None, note='Checking username',
+            query={'email': username}, headers=headers, expected_status=(400, 404)), 'hasPassword')
+        if not valid_username:
             raise ExtractorError('Invalid username provided', expected=True)
 
         headers['content-type'] = 'application/json'
@@ -182,7 +182,7 @@ def _extract_live_status(self, data):
 
 
 class WeverseIE(WeverseBaseIE):
-    _VALID_URL = r'https?://(?:www\.|m\.)?weverse.io/(?P<artist>[^/?#]+)/live/(?P<id>[\d-]+)'
+    _VALID_URL = r'https?://(?:www\.|m\.)?weverse\.io/(?P<artist>[^/?#]+)/live/(?P<id>[\d-]+)'
     _TESTS = [{
         'url': 'https://weverse.io/billlie/live/0-107323480',
         'md5': '1fa849f00181eef9100d3c8254c47979',
@@ -344,7 +344,7 @@ def _real_extract(self, url):
 
 
 class WeverseMediaIE(WeverseBaseIE):
-    _VALID_URL = r'https?://(?:www\.|m\.)?weverse.io/(?P<artist>[^/?#]+)/media/(?P<id>[\d-]+)'
+    _VALID_URL = r'https?://(?:www\.|m\.)?weverse\.io/(?P<artist>[^/?#]+)/media/(?P<id>[\d-]+)'
     _TESTS = [{
        'url': 'https://weverse.io/billlie/media/4-116372884',
         'md5': '8efc9cfd61b2f25209eb1a5326314d28',
@@ -420,7 +420,7 @@ def _real_extract(self, url):
 
 
 class WeverseMomentIE(WeverseBaseIE):
-    _VALID_URL = r'https?://(?:www\.|m\.)?weverse.io/(?P<artist>[^/?#]+)/moment/(?P<uid>[\da-f]+)/post/(?P<id>[\d-]+)'
+    _VALID_URL = r'https?://(?:www\.|m\.)?weverse\.io/(?P<artist>[^/?#]+)/moment/(?P<uid>[\da-f]+)/post/(?P<id>[\d-]+)'
     _TESTS = [{
         'url': 'https://weverse.io/secretnumber/moment/66a07e164b56a696ee71c99315ffe27b/post/1-117229444',
         'md5': '87733ac19a54081b7dfc2442036d282b',
@@ -516,7 +516,7 @@ def _real_extract(self, url):
 
 
 class WeverseLiveTabIE(WeverseTabBaseIE):
-    _VALID_URL = r'https?://(?:www\.|m\.)?weverse.io/(?P<artist>[^/?#]+)/live/?(?:[?#]|$)'
+    _VALID_URL = r'https?://(?:www\.|m\.)?weverse\.io/(?P<artist>[^/?#]+)/live/?(?:[?#]|$)'
     _TESTS = [{
         'url': 'https://weverse.io/billlie/live/',
         'playlist_mincount': 55,
@@ -534,7 +534,7 @@ class WeverseLiveTabIE(WeverseTabBaseIE):
 
 
 class WeverseMediaTabIE(WeverseTabBaseIE):
-    _VALID_URL = r'https?://(?:www\.|m\.)?weverse.io/(?P<artist>[^/?#]+)/media(?:/|/all|/new)?(?:[?#]|$)'
+    _VALID_URL = r'https?://(?:www\.|m\.)?weverse\.io/(?P<artist>[^/?#]+)/media(?:/|/all|/new)?(?:[?#]|$)'
     _TESTS = [{
         'url': 'https://weverse.io/billlie/media/',
         'playlist_mincount': 231,
@@ -558,7 +558,7 @@ class WeverseMediaTabIE(WeverseTabBaseIE):
 
 
 class WeverseLiveIE(WeverseBaseIE):
-    _VALID_URL = r'https?://(?:www\.|m\.)?weverse.io/(?P<artist>[^/?#]+)/?(?:[?#]|$)'
+    _VALID_URL = r'https?://(?:www\.|m\.)?weverse\.io/(?P<artist>[^/?#]+)/?(?:[?#]|$)'
     _TESTS = [{
         'url': 'https://weverse.io/purplekiss',
         'info_dict': {
diff --git a/yt_dlp/extractor/willow.py b/yt_dlp/extractor/willow.py
deleted file mode 100644
index 0ec9c9d6e..000000000
--- a/yt_dlp/extractor/willow.py
+++ /dev/null
@@ -1,56 +0,0 @@
-from ..utils import ExtractorError
-from .common import InfoExtractor
-
-
-class WillowIE(InfoExtractor):
-    _VALID_URL = r'https?://(www\.)?willow\.tv/videos/(?P<id>[0-9a-z-_]+)'
-    _GEO_COUNTRIES = ['US']
-
-    _TESTS = [{
-        'url': 'http://willow.tv/videos/d5winning-moment-eng-vs-ind-streaming-online-4th-test-india-tour-of-england-2021',
-        'info_dict': {
-            'id': '169662',
-            'display_id': 'd5winning-moment-eng-vs-ind-streaming-online-4th-test-india-tour-of-england-2021',
-            'ext': 'mp4',
-            'title': 'Winning Moment: 4th Test, England vs India',
-            'thumbnail': 'https://aimages.willow.tv/ytThumbnails/6748_D5winning_moment.jpg',
-            'duration': 233,
-            'timestamp': 1630947954,
-            'upload_date': '20210906',
-            'location': 'Kennington Oval, London',
-            'series': 'India tour of England 2021',
-        },
-        'params': {
-            'skip_download': True,  # AES-encrypted m3u8
-        },
-    }, {
-        'url': 'http://willow.tv/videos/highlights-short-ind-vs-nz-streaming-online-2nd-t20i-new-zealand-tour-of-india-2021',
-        'only_matching': True,
-    }]
-
-    def _real_extract(self, url):
-        video_id = self._match_id(url)
-        webpage = self._download_webpage(url, video_id)
-        video_data = self._parse_json(self._html_search_regex(
-            r'var\s+data_js\s*=\s*JSON\.parse\(\'(.+)\'\)', webpage,
-            'data_js'), video_id)
-
-        video = next((v for v in video_data.get('trending_videos') or []
-                      if v.get('secureurl')), None)
-        if not video:
-            raise ExtractorError('No videos found')
-
-        formats = self._extract_m3u8_formats(video['secureurl'], video_id, 'mp4')
-
-        return {
-            'id': str(video.get('content_id')),
-            'display_id': video.get('video_slug'),
-            'title': video.get('video_name') or self._html_search_meta('twitter:title', webpage),
-            'formats': formats,
-            'thumbnail': video.get('yt_thumb_url') or self._html_search_meta(
-                'twitter:image', webpage, default=None),
-            'duration': video.get('duration_seconds'),
-            'timestamp': video.get('created_date'),
-            'location': video.get('venue'),
-            'series': video.get('series_name'),
-        }
diff --git a/yt_dlp/extractor/wimtv.py b/yt_dlp/extractor/wimtv.py
index 571112390..f9bf092df 100644
--- a/yt_dlp/extractor/wimtv.py
+++ b/yt_dlp/extractor/wimtv.py
@@ -11,7 +11,7 @@ class WimTVIE(InfoExtractor):
     _player = None
     _UUID_RE = r'[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12}'
     _VALID_URL = r'''(?x:
-        https?://platform.wim.tv/
+        https?://platform\.wim\.tv/
         (?:
             (?:embed/)?\?
            |\#/webtv/.+?/
diff --git a/yt_dlp/extractor/wordpress.py b/yt_dlp/extractor/wordpress.py
index 53820b57a..378d99dbc 100644
--- a/yt_dlp/extractor/wordpress.py
+++ b/yt_dlp/extractor/wordpress.py
@@ -70,7 +70,7 @@ def _extract_from_webpage(self, url, webpage):
                 'height': int_or_none(traverse_obj(track, ('dimensions', 'original', 'height'))),
                 'width': int_or_none(traverse_obj(track, ('dimensions', 'original', 'width'))),
             } for track in traverse_obj(playlist_json, ('tracks', ...), expected_type=dict)]
-            yield self.playlist_result(entries, self._generic_id(url) + f'-wp-playlist-{i+1}', 'Wordpress Playlist')
+            yield self.playlist_result(entries, self._generic_id(url) + f'-wp-playlist-{i + 1}', 'Wordpress Playlist')
 
 
 class WordpressMiniAudioPlayerEmbedIE(InfoExtractor):
diff --git a/yt_dlp/extractor/xbef.py b/yt_dlp/extractor/xbef.py
deleted file mode 100644
index ac69528a3..000000000
--- a/yt_dlp/extractor/xbef.py
+++ /dev/null
@@ -1,42 +0,0 @@
-from .common import InfoExtractor
-from ..compat import compat_urllib_parse_unquote
-
-
-class XBefIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:www\.)?xbef\.com/video/(?P<id>[0-9]+)'
-    _TEST = {
-        'url': 'http://xbef.com/video/5119-glamourous-lesbians-smoking-drinking-and-fucking',
-        'md5': 'a478b565baff61634a98f5e5338be995',
-        'info_dict': {
-            'id': '5119',
-            'ext': 'mp4',
-            'title': 'md5:7358a9faef8b7b57acda7c04816f170e',
-            'age_limit': 18,
-            'thumbnail': r're:^http://.*\.jpg',
-        }
-    }
-
-    def _real_extract(self, url):
-        video_id = self._match_id(url)
-        webpage = self._download_webpage(url, video_id)
-
-        title = self._html_search_regex(
-            r'<h1[^>]*>(.*?)</h1>', webpage, 'title')
-
-        config_url_enc = self._download_webpage(
-            'http://xbef.com/Main/GetVideoURLEncoded/%s' % video_id, video_id,
-            note='Retrieving config URL')
-        config_url = compat_urllib_parse_unquote(config_url_enc)
-        config = self._download_xml(
-            config_url, video_id, note='Retrieving config')
-
-        video_url = config.find('./file').text
-        thumbnail = config.find('./image').text
-
-        return {
-            'id': video_id,
-            'url': video_url,
-            'title': title,
-            'thumbnail': thumbnail,
-            'age_limit': 18,
-        }
diff --git a/yt_dlp/extractor/xhamster.py b/yt_dlp/extractor/xhamster.py
index aec1f20bb..01ac5ddb6 100644
--- a/yt_dlp/extractor/xhamster.py
+++ b/yt_dlp/extractor/xhamster.py
@@ -24,7 +24,7 @@ class XHamsterIE(InfoExtractor):
     _DOMAINS = r'(?:xhamster\.(?:com|one|desi)|xhms\.pro|xhamster\d+\.com|xhday\.com|xhvid\.com)'
     _VALID_URL = r'''(?x)
                     https?://
-                        (?:.+?\.)?%s/
+                        (?:[^/?#]+\.)?%s/
                         (?:
                             movies/(?P<id>[\dA-Za-z]+)/(?P<display_id>[^/]*)\.html|
                             videos/(?P<display_id_2>[^/]*)-(?P<id_2>[\dA-Za-z]+)
@@ -372,7 +372,7 @@ def get_height(s):
 
 
 class XHamsterEmbedIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:.+?\.)?%s/xembed\.php\?video=(?P<id>\d+)' % XHamsterIE._DOMAINS
+    _VALID_URL = r'https?://(?:[^/?#]+\.)?%s/xembed\.php\?video=(?P<id>\d+)' % XHamsterIE._DOMAINS
     _EMBED_REGEX = [r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?xhamster\.com/xembed\.php\?video=\d+)\1']
     _TEST = {
         'url': 'http://xhamster.com/xembed.php?video=3328539',
diff --git a/yt_dlp/extractor/xtube.py b/yt_dlp/extractor/xtube.py
deleted file mode 100644
index db8292589..000000000
--- a/yt_dlp/extractor/xtube.py
+++ /dev/null
@@ -1,214 +0,0 @@
-import itertools
-import re
-
-from .common import InfoExtractor
-from ..networking import Request
-from ..utils import (
-    int_or_none,
-    js_to_json,
-    orderedSet,
-    parse_duration,
-    str_to_int,
-    url_or_none,
-)
-
-
-class XTubeIE(InfoExtractor):
-    _VALID_URL = r'''(?x)
-                        (?:
-                            xtube:|
-                            https?://(?:www\.)?xtube\.com/(?:watch\.php\?.*\bv=|video-watch/(?:embedded/)?(?P<display_id>[^/]+)-)
-                        )
-                        (?P<id>[^/?&#]+)
-                    '''
-
-    _TESTS = [{
-        # old URL schema
-        'url': 'http://www.xtube.com/watch.php?v=kVTUy_G222_',
-        'md5': '092fbdd3cbe292c920ef6fc6a8a9cdab',
-        'info_dict': {
-            'id': 'kVTUy_G222_',
-            'ext': 'mp4',
-            'title': 'strange erotica',
-            'description': 'contains:an ET kind of thing',
-            'uploader': 'greenshowers',
-            'duration': 450,
-            'view_count': int,
-            'comment_count': int,
-            'age_limit': 18,
-        }
-    }, {
-        # new URL schema
-        'url': 'http://www.xtube.com/video-watch/strange-erotica-625837',
-        'only_matching': True,
-    }, {
-        'url': 'xtube:625837',
-        'only_matching': True,
-    }, {
-        'url': 'xtube:kVTUy_G222_',
-        'only_matching': True,
-    }, {
-        'url': 'https://www.xtube.com/video-watch/embedded/milf-tara-and-teen-shared-and-cum-covered-extreme-bukkake-32203482?embedsize=big',
-        'only_matching': True,
-    }]
-
-    def _real_extract(self, url):
-        mobj = self._match_valid_url(url)
-        video_id = mobj.group('id')
-        display_id = mobj.group('display_id')
-
-        if not display_id:
-            display_id = video_id
-
-        if video_id.isdigit() and len(video_id) < 11:
-            url_pattern = 'http://www.xtube.com/video-watch/-%s'
-        else:
-            url_pattern = 'http://www.xtube.com/watch.php?v=%s'
-
-        webpage = self._download_webpage(
-            url_pattern % video_id, display_id, headers={
-                'Cookie': 'age_verified=1; cookiesAccepted=1',
-            })
-
-        title, thumbnail, duration, sources, media_definition = [None] * 5
-
-        config = self._parse_json(self._search_regex(
-            r'playerConf\s*=\s*({.+?})\s*,\s*(?:\n|loaderConf|playerWrapper)', webpage, 'config',
-            default='{}'), video_id, transform_source=js_to_json, fatal=False)
-        if config:
-            config = config.get('mainRoll')
-            if isinstance(config, dict):
-                title = config.get('title')
-                thumbnail = config.get('poster')
-                duration = int_or_none(config.get('duration'))
-                sources = config.get('sources') or config.get('format')
-                media_definition = config.get('mediaDefinition')
-
-        if not isinstance(sources, dict) and not media_definition:
-            sources = self._parse_json(self._search_regex(
-                r'(["\'])?sources\1?\s*:\s*(?P<sources>{.+?}),',
-                webpage, 'sources', group='sources'), video_id,
-                transform_source=js_to_json)
-
-        formats = []
-        format_urls = set()
-
-        if isinstance(sources, dict):
-            for format_id, format_url in sources.items():
-                format_url = url_or_none(format_url)
-                if not format_url:
-                    continue
-                if format_url in format_urls:
-                    continue
-                format_urls.add(format_url)
-                formats.append({
-                    'url': format_url,
-                    'format_id': format_id,
-                    'height': int_or_none(format_id),
-                })
-
-        if isinstance(media_definition, list):
-            for media in media_definition:
-                video_url = url_or_none(media.get('videoUrl'))
-                if not video_url:
-                    continue
-                if video_url in format_urls:
-                    continue
-                format_urls.add(video_url)
-                format_id = media.get('format')
-                if format_id == 'hls':
-                    formats.extend(self._extract_m3u8_formats(
-                        video_url, video_id, 'mp4', entry_protocol='m3u8_native',
-                        m3u8_id='hls', fatal=False))
-                elif format_id == 'mp4':
-                    height = int_or_none(media.get('quality'))
-                    formats.append({
-                        'url': video_url,
-                        'format_id': '%s-%d' % (format_id, height) if height else format_id,
-                        'height': height,
-                    })
-
-        self._remove_duplicate_formats(formats)
-
-        if not title:
-            title = self._search_regex(
-                (r'<h1>\s*(?P<title>[^<]+?)\s*</h1>', r'videoTitle\s*:\s*(["\'])(?P<title>.+?)\1'),
-                webpage, 'title', group='title')
-        description = self._og_search_description(
-            webpage, default=None) or self._html_search_meta(
-            'twitter:description', webpage, default=None) or self._search_regex(
-            r'</h1>\s*<p>([^<]+)', webpage, 'description', fatal=False)
-        uploader = self._search_regex(
-            (r'<input[^>]+name="contentOwnerId"[^>]+value="([^"]+)"',
-             r'<span[^>]+class="nickname"[^>]*>([^<]+)'),
-            webpage, 'uploader', fatal=False)
-        if not duration:
-            duration = parse_duration(self._search_regex(
-                r'<dt>Runtime:?</dt>\s*<dd>([^<]+)</dd>',
-                webpage, 'duration', fatal=False))
-        view_count = str_to_int(self._search_regex(
-            (r'["\']viewsCount["\'][^>]*>(\d+)\s+views',
-             r'<dt>Views:?</dt>\s*<dd>([\d,\.]+)</dd>'),
-            webpage, 'view count', fatal=False))
-        comment_count = str_to_int(self._html_search_regex(
-            r'>Comments? \(([\d,\.]+)\)<',
-            webpage, 'comment count', fatal=False))
-
-        return {
-            'id': video_id,
-            'display_id': display_id,
-            'title': title,
-            'description': description,
-            'thumbnail': thumbnail,
-            'uploader': uploader,
-            'duration': duration,
-            'view_count': view_count,
-            'comment_count': comment_count,
-            'age_limit': 18,
-            'formats': formats,
-        }
-
-
-class XTubeUserIE(InfoExtractor):
-    IE_DESC = 'XTube user profile'
-    _VALID_URL = r'https?://(?:www\.)?xtube\.com/profile/(?P<id>[^/]+-\d+)'
-    _TEST = {
-        'url': 'http://www.xtube.com/profile/greenshowers-4056496',
-        'info_dict': {
-            'id': 'greenshowers-4056496',
-            'age_limit': 18,
-        },
-        'playlist_mincount': 154,
-    }
-
-    def _real_extract(self, url):
-        user_id = self._match_id(url)
-
-        entries = []
-        for pagenum in itertools.count(1):
-            request = Request(
-                'http://www.xtube.com/profile/%s/videos/%d' % (user_id, pagenum),
-                headers={
-                    'Cookie': 'popunder=4',
-                    'X-Requested-With': 'XMLHttpRequest',
-                    'Referer': url,
-                })
-
-            page = self._download_json(
-                request, user_id, 'Downloading videos JSON page %d' % pagenum)
-
-            html = page.get('html')
-            if not html:
-                break
-
-            for video_id in orderedSet([video_id for _, video_id in re.findall(
-                    r'data-plid=(["\'])(.+?)\1', html)]):
-                entries.append(self.url_result('xtube:%s' % video_id, XTubeIE.ie_key()))
-
-            page_count = int_or_none(page.get('pageCount'))
-            if not page_count or pagenum == page_count:
-                break
-
-        playlist = self.playlist_result(entries, user_id)
-        playlist['age_limit'] = 18
-        return playlist
diff --git a/yt_dlp/extractor/xuite.py b/yt_dlp/extractor/xuite.py
deleted file mode 100644
index 71ddadd42..000000000
--- a/yt_dlp/extractor/xuite.py
+++ /dev/null
@@ -1,149 +0,0 @@
-from .common import InfoExtractor
-from ..utils import (
-    ExtractorError,
-    float_or_none,
-    get_element_by_attribute,
-    parse_iso8601,
-    remove_end,
-)
-
-
-class XuiteIE(InfoExtractor):
-    IE_DESC = '隨意窩Xuite影音'
-    _REGEX_BASE64 = r'(?:[A-Za-z0-9+/]{4})*(?:[A-Za-z0-9+/]{2}==|[A-Za-z0-9+/]{3}=)?'
-    _VALID_URL = r'https?://vlog\.xuite\.net/(?:play|embed)/(?P<id>%s)' % _REGEX_BASE64
-    _TESTS = [{
-        # Audio
-        'url': 'http://vlog.xuite.net/play/RGkzc1ZULTM4NjA5MTQuZmx2',
-        'md5': 'e79284c87b371424885448d11f6398c8',
-        'info_dict': {
-            'id': '3860914',
-            'ext': 'mp3',
-            'title': '孤單南半球-歐德陽',
-            'description': '孤單南半球-歐德陽',
-            'thumbnail': r're:^https?://.*\.jpg$',
-            'duration': 247.246,
-            'timestamp': 1314932940,
-            'upload_date': '20110902',
-            'uploader': '阿能',
-            'uploader_id': '15973816',
-            'categories': ['個人短片'],
-        },
-    }, {
-        # Video with only one format
-        'url': 'http://vlog.xuite.net/play/WUxxR2xCLTI1OTI1MDk5LmZsdg==',
-        'md5': '21f7b39c009b5a4615b4463df6eb7a46',
-        'info_dict': {
-            'id': '25925099',
-            'ext': 'mp4',
-            'title': 'BigBuckBunny_320x180',
-            'thumbnail': r're:^https?://.*\.jpg$',
-            'duration': 596.458,
-            'timestamp': 1454242500,
-            'upload_date': '20160131',
-            'uploader': '屁姥',
-            'uploader_id': '12158353',
-            'categories': ['個人短片'],
-            'description': 'http://download.blender.org/peach/bigbuckbunny_movies/BigBuckBunny_320x180.mp4',
-        },
-    }, {
-        # Video with two formats
-        'url': 'http://vlog.xuite.net/play/bWo1N1pLLTIxMzAxMTcwLmZsdg==',
-        'md5': '1166e0f461efe55b62e26a2d2a68e6de',
-        'info_dict': {
-            'id': '21301170',
-            'ext': 'mp4',
-            'title': '暗殺教室 02',
-            'description': '字幕:【極影字幕社】',
-            'thumbnail': r're:^https?://.*\.jpg$',
-            'duration': 1384.907,
-            'timestamp': 1421481240,
-            'upload_date': '20150117',
-            'uploader': '我只是想認真點',
-            'uploader_id': '242127761',
-            'categories': ['電玩動漫'],
-        },
-        'skip': 'Video removed',
-    }, {
-        # Video with encoded media id
-        # from http://forgetfulbc.blogspot.com/2016/06/date.html
-        'url': 'http://vlog.xuite.net/embed/cE1xbENoLTI3NDQ3MzM2LmZsdg==?ar=0&as=0',
-        'info_dict': {
-            'id': '27447336',
-            'ext': 'mp4',
-            'title': '男女平權只是口號?專家解釋約會時男生是否該幫女生付錢 (中字)',
-            'description': 'md5:1223810fa123b179083a3aed53574706',
-            'timestamp': 1466160960,
-            'upload_date': '20160617',
-            'uploader': 'B.C. & Lowy',
-            'uploader_id': '232279340',
-        },
-    }, {
-        'url': 'http://vlog.xuite.net/play/S1dDUjdyLTMyOTc3NjcuZmx2/%E5%AD%AB%E7%87%95%E5%A7%BF-%E7%9C%BC%E6%B7%9A%E6%88%90%E8%A9%A9',
-        'only_matching': True,
-    }]
-
-    def _real_extract(self, url):
-        # /play/ URLs provide embedded video URL and more metadata
-        url = url.replace('/embed/', '/play/')
-        video_id = self._match_id(url)
-
-        webpage = self._download_webpage(url, video_id)
-
-        error_msg = self._search_regex(
-            r'<div id="error-message-content">([^<]+)',
-            webpage, 'error message', default=None)
-        if error_msg:
-            raise ExtractorError(
-                '%s returned error: %s' % (self.IE_NAME, error_msg),
-                expected=True)
-
-        media_info = self._parse_json(self._search_regex(
-            r'var\s+mediaInfo\s*=\s*({.*});', webpage, 'media info'), video_id)
-
-        video_id = media_info['MEDIA_ID']
-
-        formats = []
-        for key in ('html5Url', 'html5HQUrl'):
-            video_url = media_info.get(key)
-            if not video_url:
-                continue
-            format_id = self._search_regex(
-                r'\bq=(.+?)\b', video_url, 'format id', default=None)
-            formats.append({
-                'url': video_url,
-                'ext': 'mp4' if format_id.isnumeric() else format_id,
-                'format_id': format_id,
-                'height': int(format_id) if format_id.isnumeric() else None,
-            })
-
-        timestamp = media_info.get('PUBLISH_DATETIME')
-        if timestamp:
-            timestamp = parse_iso8601(timestamp + ' +0800', ' ')
-
-        category = media_info.get('catName')
-        categories = [category] if category else []
-
-        uploader = media_info.get('NICKNAME')
-        uploader_url = None
-
-        author_div = get_element_by_attribute('itemprop', 'author', webpage)
-        if author_div:
-            uploader = uploader or self._html_search_meta('name', author_div)
-            uploader_url = self._html_search_regex(
-                r'<link[^>]+itemprop="url"[^>]+href="([^"]+)"', author_div,
-                'uploader URL', fatal=False)
-
-        return {
-            'id': video_id,
-            'title': media_info['TITLE'],
-            'description': remove_end(media_info.get('metaDesc'), ' (Xuite 影音)'),
-            'thumbnail': media_info.get('ogImageUrl'),
-            'timestamp': timestamp,
-            'uploader': uploader,
-            'uploader_id': media_info.get('MEMBER_ID'),
-            'uploader_url': uploader_url,
-            'duration': float_or_none(media_info.get('MEDIA_DURATION'), 1000000),
-            'categories': categories,
-            'formats': formats,
-        }
diff --git a/yt_dlp/extractor/yandexvideo.py b/yt_dlp/extractor/yandexvideo.py
index 727250ee8..4382a5684 100644
--- a/yt_dlp/extractor/yandexvideo.py
+++ b/yt_dlp/extractor/yandexvideo.py
@@ -194,7 +194,7 @@ class ZenYandexIE(InfoExtractor):
             'id': '60c7c443da18892ebfe85ed7',
             'ext': 'mp4',
             'title': 'ВОТ ЭТО Focus. Деды Морозы на гидроциклах',
-            'description': 'md5:f3db3d995763b9bbb7b56d4ccdedea89',
+            'description': 'md5:8684912f6086f298f8078d4af0e8a600',
             'thumbnail': 're:^https://avatars.dzeninfra.ru/',
             'uploader': 'AcademeG DailyStream'
         },
@@ -209,7 +209,7 @@ class ZenYandexIE(InfoExtractor):
             'id': '60c7c443da18892ebfe85ed7',
             'ext': 'mp4',
             'title': 'ВОТ ЭТО Focus. Деды Морозы на гидроциклах',
-            'description': 'md5:f3db3d995763b9bbb7b56d4ccdedea89',
+            'description': 'md5:8684912f6086f298f8078d4af0e8a600',
             'thumbnail': r're:^https://avatars\.dzeninfra\.ru/',
             'uploader': 'AcademeG DailyStream',
             'upload_date': '20191111',
@@ -258,7 +258,7 @@ def _real_extract(self, url):
         video_id = self._match_id(redirect)
         webpage = self._download_webpage(redirect, video_id, note='Redirecting')
         data_json = self._search_json(
-            r'data\s*=', webpage, 'metadata', video_id, contains_pattern=r'{["\']_*serverState_*video.+}')
+            r'("data"\s*:|data\s*=)', webpage, 'metadata', video_id, contains_pattern=r'{["\']_*serverState_*video.+}')
         serverstate = self._search_regex(r'(_+serverState_+video-site_[^_]+_+)', webpage,
                                          'server state').replace('State', 'Settings')
         uploader = self._search_regex(r'(<a\s*class=["\']card-channel-link[^"\']+["\'][^>]+>)',
@@ -266,22 +266,25 @@ def _real_extract(self, url):
         uploader_name = extract_attributes(uploader).get('aria-label')
         video_json = try_get(data_json, lambda x: x[serverstate]['exportData']['video'], dict)
         stream_urls = try_get(video_json, lambda x: x['video']['streams'])
-        formats = []
+        formats, subtitles = [], {}
         for s_url in stream_urls:
             ext = determine_ext(s_url)
             if ext == 'mpd':
-                formats.extend(self._extract_mpd_formats(s_url, video_id, mpd_id='dash'))
+                fmts, subs = self._extract_mpd_formats_and_subtitles(s_url, video_id, mpd_id='dash')
             elif ext == 'm3u8':
-                formats.extend(self._extract_m3u8_formats(s_url, video_id, 'mp4'))
+                fmts, subs = self._extract_m3u8_formats_and_subtitles(s_url, video_id, 'mp4')
+            formats.extend(fmts)
+            subtitles = self._merge_subtitles(subtitles, subs)
 
         return {
             'id': video_id,
             'title': video_json.get('title') or self._og_search_title(webpage),
             'formats': formats,
+            'subtitles': subtitles,
             'duration': int_or_none(video_json.get('duration')),
             'view_count': int_or_none(video_json.get('views')),
             'timestamp': int_or_none(video_json.get('publicationDate')),
             'uploader': uploader_name or data_json.get('authorName') or try_get(data_json, lambda x: x['publisher']['name']),
-            'description': self._og_search_description(webpage) or try_get(data_json, lambda x: x['og']['description']),
+            'description': video_json.get('description') or self._og_search_description(webpage),
             'thumbnail': self._og_search_thumbnail(webpage) or try_get(data_json, lambda x: x['og']['imageUrl']),
         }
 
@@ -296,6 +299,7 @@ class ZenYandexChannelIE(InfoExtractor):
             'description': 'md5:a9e5b3c247b7fe29fd21371a428bcf56',
         },
         'playlist_mincount': 169,
+        'skip': 'The page does not exist',
     }, {
         'url': 'https://dzen.ru/tok_media',
         'info_dict': {
@@ -304,6 +308,7 @@ class ZenYandexChannelIE(InfoExtractor):
             'description': 'md5:a9e5b3c247b7fe29fd21371a428bcf56',
         },
         'playlist_mincount': 169,
+        'skip': 'The page does not exist',
    }, {
         'url': 'https://zen.yandex.ru/id/606fd806cc13cb3c58c05cf5',
         'info_dict': {
@@ -318,21 +323,21 @@ class ZenYandexChannelIE(InfoExtractor):
         'url': 'https://zen.yandex.ru/jony_me',
         'info_dict': {
             'id': 'jony_me',
-            'description': 'md5:a2c62b4ef5cf3e3efb13d25f61f739e1',
+            'description': 'md5:ce0a5cad2752ab58701b5497835b2cc5',
             'title': 'JONY ',
         },
-        'playlist_count': 20,
+        'playlist_count': 18,
     }, {
         # Test that the playlist extractor finishes extracting when the
         # channel has more than one page of entries
         'url': 'https://zen.yandex.ru/tatyanareva',
         'info_dict': {
             'id': 'tatyanareva',
-            'description': 'md5:296b588d60841c3756c9105f237b70c6',
+            'description': 'md5:40a1e51f174369ec3ba9d657734ac31f',
             'title': 'Татьяна Рева',
             'entries': 'maxcount:200',
         },
-        'playlist_count': 46,
+        'playlist_mincount': 46,
     }, {
         'url': 'https://dzen.ru/id/606fd806cc13cb3c58c05cf5',
         'info_dict': {
@@ -375,7 +380,7 @@ def _real_extract(self, url):
         item_id = self._match_id(redirect)
         webpage = self._download_webpage(redirect, item_id, note='Redirecting')
         data = self._search_json(
-            r'var\s+data\s*=', webpage, 'channel data', item_id, contains_pattern=r'{\"__serverState__.+}')
+            r'("data"\s*:|data\s*=)', webpage, 'channel data', item_id, contains_pattern=r'{\"__serverState__.+}')
         server_state_json = traverse_obj(data, lambda k, _: k.startswith('__serverState__'), get_all=False)
         server_settings_json = traverse_obj(data, lambda k, _: k.startswith('__serverSettings__'), get_all=False)
 
diff --git a/yt_dlp/extractor/yesjapan.py b/yt_dlp/extractor/yesjapan.py
deleted file mode 100644
index 94e41660d..000000000
--- a/yt_dlp/extractor/yesjapan.py
+++ /dev/null
@@ -1,56 +0,0 @@
-from .common import InfoExtractor
-from ..networking import HEADRequest
-from ..utils import get_element_by_attribute, parse_iso8601
-
-
-class YesJapanIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:www\.)?yesjapan\.com/video/(?P<slug>[A-Za-z0-9\-]*)_(?P<id>[A-Za-z0-9]+)\.html'
-    _TEST = {
-        'url': 'http://www.yesjapan.com/video/japanese-in-5-20-wa-and-ga-particle-usages_726497834.html',
-        'md5': 'f0be416314e5be21a12b499b330c21cf',
-        'info_dict': {
-            'id': '726497834',
-            'title': 'Japanese in 5! #20 - WA And GA Particle Usages',
-            'description': 'This should clear up some issues most students of Japanese encounter with WA and GA....',
-            'ext': 'mp4',
-            'timestamp': 1416391590,
-            'upload_date': '20141119',
-            'thumbnail': r're:^https?://.*\.jpg$',
-        }
-    }
-
-    def _real_extract(self, url):
-        video_id = self._match_id(url)
-
-        webpage = self._download_webpage(url, video_id)
-        title = self._og_search_title(webpage)
-        video_url = self._og_search_video_url(webpage)
-        description = self._og_search_description(webpage)
-        thumbnail = self._og_search_thumbnail(webpage)
-
-        timestamp = None
-        submit_info = get_element_by_attribute('class', 'pm-submit-data', webpage)
-        if submit_info:
-            timestamp = parse_iso8601(self._search_regex(
-                r'datetime="([^"]+)"', submit_info, 'upload date', fatal=False, default=None))
-
-        # attempt to resolve the final URL in order to get a proper extension
-        redirect_req = HEADRequest(video_url)
-        req = self._request_webpage(
-            redirect_req, video_id, note='Resolving final URL', errnote='Could not resolve final URL', fatal=False)
-        if req:
-            video_url = req.url
-
-        formats = [{
-            'format_id': 'sd',
-            'url': video_url,
-        }]
-
-        return {
-            'id': video_id,
-            'title': title,
-            'formats': formats,
-            'description': description,
-            'timestamp': timestamp,
-            'thumbnail': thumbnail,
-        }
diff --git a/yt_dlp/extractor/yinyuetai.py b/yt_dlp/extractor/yinyuetai.py
deleted file mode 100644
index b2e3172f9..000000000
--- a/yt_dlp/extractor/yinyuetai.py
+++ /dev/null
@@ -1,52 +0,0 @@
-from .common import InfoExtractor
-from ..utils import ExtractorError
-
-
-class YinYueTaiIE(InfoExtractor):
-    IE_NAME = 'yinyuetai:video'
-    IE_DESC = '音悦Tai'
-    _VALID_URL = r'https?://v\.yinyuetai\.com/video(?:/h5)?/(?P<id>[0-9]+)'
-    _TESTS = [{
-        'url': 'http://v.yinyuetai.com/video/2322376',
-        'md5': '6e3abe28d38e3a54b591f9f040595ce0',
-        'info_dict': {
-            'id': '2322376',
-            'ext': 'mp4',
-            'title': '少女时代_PARTY_Music Video Teaser',
-            'creator': '少女时代',
-            'duration': 25,
-            'thumbnail': r're:^https?://.*\.jpg$',
-        },
-    }, {
-        'url': 'http://v.yinyuetai.com/video/h5/2322376',
-        'only_matching': True,
-    }]
-
-    def _real_extract(self, url):
-        video_id = self._match_id(url)
-
-        info = self._download_json(
-            'http://ext.yinyuetai.com/main/get-h-mv-info?json=true&videoId=%s' % video_id, video_id,
-            'Downloading mv info')['videoInfo']['coreVideoInfo']
-
-        if info['error']:
-            raise ExtractorError(info['errorMsg'], expected=True)
-
-        formats = [{
-            'url': format_info['videoUrl'],
-            'format_id': format_info['qualityLevel'],
-            'format': format_info.get('qualityLevelName'),
-            'filesize': format_info.get('fileSize'),
-            # though URLs ends with .flv, the downloaded files are in fact mp4
-            'ext': 'mp4',
-            'tbr': format_info.get('bitrate'),
-        } for format_info in info['videoUrlModels']]
-
-        return {
-            'id': video_id,
-            'title': info['videoName'],
-            'thumbnail': info.get('bigHeadImage'),
-            'creator': info.get('artistNames'),
-            'duration': info.get('duration'),
-            'formats': formats,
-        }
diff --git a/yt_dlp/extractor/ynet.py b/yt_dlp/extractor/ynet.py
deleted file mode 100644
index a7d7371f3..000000000
--- a/yt_dlp/extractor/ynet.py
+++ /dev/null
@@ -1,48 +0,0 @@
-import json
-import re
-import urllib.parse
-
-from .common import InfoExtractor
-
-
-class YnetIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:.+?\.)?ynet\.co\.il/(?:.+?/)?0,7340,(?P<id>L(?:-[0-9]+)+),00\.html'
-    _TESTS = [
-        {
-            'url': 'http://hot.ynet.co.il/home/0,7340,L-11659-99244,00.html',
-            'info_dict': {
-                'id': 'L-11659-99244',
-                'ext': 'flv',
-                'title': 'איש לא יודע מאיפה באנו',
-                'thumbnail': r're:^https?://.*\.jpg',
-            }
-        }, {
-            'url': 'http://hot.ynet.co.il/home/0,7340,L-8859-84418,00.html',
-            'info_dict': {
-                'id': 'L-8859-84418',
-                'ext': 'flv',
-                'title': "צפו: הנשיקה הלוהטת של תורגי' ויוליה פלוטקין",
-                'thumbnail': r're:^https?://.*\.jpg',
-            }
-        }
-    ]
-
-    def _real_extract(self, url):
-        video_id = self._match_id(url)
-        webpage = self._download_webpage(url, video_id)
-
-        content = urllib.parse.unquote_plus(self._og_search_video_url(webpage))
-        config = json.loads(self._search_regex(r'config=({.+?})$', content, 'video config'))
-        f4m_url = config['clip']['url']
-        title = self._og_search_title(webpage)
-        m = re.search(r'ynet - HOT -- (["\']+)(?P<title>.+?)\1', title)
-        if m:
-            title = m.group('title')
-        formats = self._extract_f4m_formats(f4m_url, video_id)
-
-        return {
-            'id': video_id,
-            'title': title,
-            'formats': formats,
-            'thumbnail': self._og_search_thumbnail(webpage),
-        }
diff --git a/yt_dlp/extractor/youku.py b/yt_dlp/extractor/youku.py
index 7ecd9f183..e35176586 100644
--- a/yt_dlp/extractor/youku.py
+++ b/yt_dlp/extractor/youku.py
@@ -20,7 +20,7 @@ class YoukuIE(InfoExtractor):
     _VALID_URL = r'''(?x)
         (?:
             https?://(
-                (?:v|player)\.youku\.com/(?:v_show/id_|player\.php/sid/)|
+                (?:v|play(?:er)?)\.(?:youku|tudou)\.com/(?:v_show/id_|player\.php/sid/)|
                 video\.tudou\.com/v/)|
             youku:)
         (?P<id>[A-Za-z0-9]+)(?:\.html|/v\.swf|)
@@ -87,6 +87,19 @@ class YoukuIE(InfoExtractor):
         'uploader_url': 'https://www.youku.com/profile/index/?uid=UNjU2MzY1MzM1Ng==',
         'tags': list,
         },
+    }, {
+        'url': 'https://play.tudou.com/v_show/id_XNjAxNjI2OTU3Ng==.html?',
+        'info_dict': {
+            'id': 'XNjAxNjI2OTU3Ng',
+            'ext': 'mp4',
+            'title': '阿斯塔意识到哈里杀了人,自己被骗了',
+            'thumbnail': 'https://m.ykimg.com/0541010164F732752794D4D7B70331D1',
+            'uploader_id': '88758207',
+            'tags': [],
+            'uploader_url': 'https://www.youku.com/profile/index/?uid=UMzU1MDMyODI4',
+            'uploader': '英美剧场',
+            'duration': 72.91,
+        },
     }]
 
     @staticmethod
diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py
index ac99819ec..6b15d06ff 100644
--- a/yt_dlp/extractor/youtube.py
+++ b/yt_dlp/extractor/youtube.py
b/yt_dlp/extractor/youtube.py @@ -428,7 +428,7 @@ class YoutubeBaseInfoExtractor(InfoExtractor): r'(?:www\.)?piped\.adminforge\.de', r'(?:www\.)?watch\.whatevertinfoil\.de', r'(?:www\.)?piped\.qdi\.fi', - r'(?:www\.)?piped\.video', + r'(?:(?:www|cf)\.)?piped\.video', r'(?:www\.)?piped\.aeong\.one', r'(?:www\.)?piped\.moomoo\.me', r'(?:www\.)?piped\.chauvet\.pro', @@ -947,7 +947,10 @@ def _extract_response(self, item_id, query, note='Downloading API JSON', headers icd_rm = next(icd_retries) main_retries = iter(self.RetryManager()) main_rm = next(main_retries) - for _ in range(main_rm.retries + icd_rm.retries + 1): + # Manual retry loop for multiple RetryManagers + # The proper RetryManager MUST be advanced after an error + # and its result MUST be checked if the manager is non fatal + while True: try: response = self._call_api( ep=ep, fatal=True, headers=headers, @@ -2069,7 +2072,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'track': 'Voyeur Girl', 'album': 'it\'s too much love to know my dear', 'release_date': '20190313', - 'release_year': 2019, 'alt_title': 'Voyeur Girl', 'view_count': int, 'playable_in_embed': True, @@ -4510,14 +4512,13 @@ def process_language(container, base_url, lang_code, sub_name, query): if mobj: info[mobj.group('type') + '_count'] = str_to_int(mobj.group('count')) break - sbr_tooltip = try_get( - vpir, lambda x: x['sentimentBar']['sentimentBarRenderer']['tooltip']) - if sbr_tooltip: - like_count, dislike_count = sbr_tooltip.split(' / ') - info.update({ - 'like_count': str_to_int(like_count), - 'dislike_count': str_to_int(dislike_count), - }) + + info['like_count'] = traverse_obj(vpir, ( + 'videoActions', 'menuRenderer', 'topLevelButtons', ..., + 'segmentedLikeDislikeButtonViewModel', 'likeButtonViewModel', 'likeButtonViewModel', + 'toggleButtonViewModel', 'toggleButtonViewModel', 'defaultButtonViewModel', + 'buttonViewModel', 'accessibilityText', {parse_count}), get_all=False) + vcr = traverse_obj(vpir, ('viewCount', 'videoViewCountRenderer')) if vcr: vc = self._get_count(vcr, 'viewCount') @@ -4589,6 +4590,14 @@ def process_language(container, base_url, lang_code, sub_name, query): self._parse_time_text(self._get_text(vpir, 'dateText'))) or upload_date info['upload_date'] = upload_date + if upload_date and live_status not in ('is_live', 'post_live', 'is_upcoming'): + # Newly uploaded videos' HLS formats are potentially problematic and need to be checked + upload_datetime = datetime_from_str(upload_date).replace(tzinfo=datetime.timezone.utc) + if upload_datetime >= datetime_from_str('today-2days'): + for fmt in info['formats']: + if fmt.get('protocol') == 'm3u8_native': + fmt['__needs_testing'] = True + for s_k, d_k in [('artist', 'creator'), ('track', 'alt_title')]: v = info.get(s_k) if v: @@ -5320,6 +5329,7 @@ def _extract_webpage(self, url, item_id, fatal=True): # See: https://github.com/yt-dlp/yt-dlp/issues/116 if not traverse_obj(data, 'contents', 'currentVideoEndpoint', 'onResponseReceivedActions'): retry.error = ExtractorError('Incomplete yt initial data received') + data = None continue return webpage, data @@ -6491,6 +6501,9 @@ def _extract_tab_id_and_name(self, tab, base_url='https://www.youtube.com'): def _has_tab(self, tabs, tab_id): return any(self._extract_tab_id_and_name(tab)[0] == tab_id for tab in tabs) + def _empty_playlist(self, item_id, data): + return self.playlist_result([], item_id, **self._extract_metadata_from_tabs(item_id, data)) + @YoutubeTabBaseInfoExtractor.passthrough_smuggled_data def _real_extract(self, url, smuggled_data): item_id 
= self._match_id(url) @@ -6556,6 +6569,10 @@ def _real_extract(self, url, smuggled_data): selected_tab_id, selected_tab_name = self._extract_tab_id_and_name(selected_tab, url) # NB: Name may be translated self.write_debug(f'Selected tab: {selected_tab_id!r} ({selected_tab_name}), Requested tab: {original_tab_id!r}') + # /about is no longer a tab + if original_tab_id == 'about': + return self._empty_playlist(item_id, data) + if not original_tab_id and selected_tab_name: self.to_screen('Downloading all uploads of the channel. ' 'To download only the videos in a specific tab, pass the tab\'s URL') @@ -6568,7 +6585,7 @@ def _real_extract(self, url, smuggled_data): if not extra_tabs and selected_tab_id != 'videos': # Channel does not have streams, shorts or videos tabs if item_id[:2] != 'UC': - raise ExtractorError('This channel has no uploads', expected=True) + return self._empty_playlist(item_id, data) # Topic channels don't have /videos. Use the equivalent playlist instead pl_id = f'UU{item_id[2:]}' @@ -6576,7 +6593,7 @@ def _real_extract(self, url, smuggled_data): try: data, ytcfg = self._extract_data(pl_url, pl_id, ytcfg=ytcfg, fatal=True, webpage_fatal=True) except ExtractorError: - raise ExtractorError('This channel has no uploads', expected=True) + return self._empty_playlist(item_id, data) else: item_id, url = pl_id, pl_url self.to_screen( @@ -6708,7 +6725,7 @@ class YoutubePlaylistIE(InfoExtractor): 'uploader_url': 'https://www.youtube.com/@milan5503', 'availability': 'public', }, - 'expected_warnings': [r'[Uu]navailable videos? (is|are|will be) hidden'], + 'expected_warnings': [r'[Uu]navailable videos? (is|are|will be) hidden', 'Retrying', 'Giving up'], }, { 'url': 'http://www.youtube.com/embed/_xDOZElKyNU?list=PLsyOSbh5bs16vubvKePAQ1x3PhKavfBIl', 'playlist_mincount': 455, diff --git a/yt_dlp/extractor/zingmp3.py b/yt_dlp/extractor/zingmp3.py index 007658c65..f664d88d8 100644 --- a/yt_dlp/extractor/zingmp3.py +++ b/yt_dlp/extractor/zingmp3.py @@ -5,7 +5,15 @@ import urllib.parse from .common import InfoExtractor -from ..utils import int_or_none, traverse_obj, try_call, urljoin +from ..utils import ( + ExtractorError, + int_or_none, + join_nonempty, + try_call, + urljoin, + url_or_none +) +from ..utils.traversal import traverse_obj class ZingMp3BaseIE(InfoExtractor): @@ -20,9 +28,17 @@ class ZingMp3BaseIE(InfoExtractor): 'video-clip': '/api/v2/page/get/video', 'lyric': '/api/v2/lyric/get/lyric', 'song-streaming': '/api/v2/song/get/streaming', + 'liveradio': '/api/v2/livestream/get/info', + 'eps': '/api/v2/page/get/podcast-episode', + 'episode-streaming': '/api/v2/podcast/episode/get/streaming', # Playlist 'playlist': '/api/v2/page/get/playlist', 'album': '/api/v2/page/get/playlist', + 'pgr': '/api/v2/page/get/podcast-program', + 'pgr-list': '/api/v2/podcast/episode/get/list', + 'cgr': '/api/v2/page/get/podcast-category', + 'cgr-list': '/api/v2/podcast/program/get/list-by-cate', + 'cgrs': '/api/v2/page/get/podcast-categories', # Chart 'zing-chart': '/api/v2/page/get/chart-home', 'zing-chart-tuan': '/api/v2/page/get/week-chart', @@ -33,6 +49,10 @@ class ZingMp3BaseIE(InfoExtractor): 'user-list-song': '/api/v2/song/get/list', 'user-list-video': '/api/v2/video/get/list', 'hub': '/api/v2/page/get/hub-detail', + 'new-release': '/api/v2/chart/get/new-release', + 'top100': '/api/v2/page/get/top-100', + 'podcast-new': '/api/v2/podcast/program/get/list-by-type', + 'top-podcast': '/api/v2/podcast/program/get/top-episode', } def _api_url(self, url_type, params): @@ -78,7 +98,7 @@ def 
_paged_list(self, _id, url_type): class ZingMp3IE(ZingMp3BaseIE): - _VALID_URL = ZingMp3BaseIE._VALID_URL_TMPL % 'bai-hat|video-clip|embed' + _VALID_URL = ZingMp3BaseIE._VALID_URL_TMPL % 'bai-hat|video-clip|embed|eps' IE_NAME = 'zingmp3' IE_DESC = 'zingmp3.vn' _TESTS = [{ @@ -102,7 +122,7 @@ class ZingMp3IE(ZingMp3BaseIE): }, }, { 'url': 'https://zingmp3.vn/video-clip/Suong-Hoa-Dua-Loi-K-ICM-RYO/ZO8ZF7C7.html', - 'md5': '3c2081e79471a2f4a3edd90b70b185ea', + 'md5': '92c6e7a019f06b4682a6c35ae5785fab', 'info_dict': { 'id': 'ZO8ZF7C7', 'title': 'Sương Hoa Đưa Lối', @@ -128,6 +148,20 @@ class ZingMp3IE(ZingMp3BaseIE): 'album': 'Người Yêu Tôi Lạnh Lùng Sắt Đá (Single)', 'album_artist': 'Mr. Siro', }, + }, { + 'url': 'https://zingmp3.vn/eps/Cham-x-Ban-Noi-Goi-La-Nha/ZZD9ACWI.html', + 'md5': 'd52f9f63e2631e004e4f15188eedcf80', + 'info_dict': { + 'id': 'ZZD9ACWI', + 'title': 'Chạm x Bạn - Nơi Gọi Là Nhà', + 'ext': 'mp3', + 'duration': 3716, + 'thumbnail': r're:^https?://.+\.jpg', + 'track': 'Chạm x Bạn - Nơi Gọi Là Nhà', + 'artist': 'On Air', + 'album': 'Top Podcast', + 'album_artist': 'On Air', + }, }, { 'url': 'https://zingmp3.vn/embed/song/ZWZEI76B?start=false', 'only_matching': True, @@ -147,6 +181,8 @@ def _real_extract(self, url): 'http://api.mp3.zing.vn/api/mobile/video/getvideoinfo', item_id, query={'requestdata': json.dumps({'id': item_id})}, note='Downloading mp4 JSON metadata').get('source') + elif url_type == 'eps': + source = self._call_api('episode-streaming', {'id': item_id}) else: source = self._call_api('song-streaming', {'id': item_id}) @@ -189,9 +225,10 @@ def _real_extract(self, url): 'thumbnail': traverse_obj(item, 'thumbnail', 'thumbnailM'), 'duration': int_or_none(item.get('duration')), 'track': traverse_obj(item, 'title', 'alias'), - 'artist': traverse_obj(item, 'artistsNames', 'artists_names'), - 'album': traverse_obj(item, ('album', ('name', 'title')), get_all=False), - 'album_artist': traverse_obj(item, ('album', ('artistsNames', 'artists_names')), get_all=False), + 'artist': traverse_obj(item, 'artistsNames', 'artists_names', ('artists', 0, 'name')), + 'album': traverse_obj(item, ('album', ('name', 'title')), ('genres', 0, 'name'), get_all=False), + 'album_artist': traverse_obj(item, ('album', ('artistsNames', 'artists_names')), + ('artists', 0, 'name'), get_all=False), 'formats': formats, 'subtitles': {'origin': [{'url': lyric}]} if lyric else None, } @@ -200,12 +237,12 @@ def _real_extract(self, url): class ZingMp3AlbumIE(ZingMp3BaseIE): _VALID_URL = ZingMp3BaseIE._VALID_URL_TMPL % 'album|playlist' _TESTS = [{ - 'url': 'http://mp3.zing.vn/album/Lau-Dai-Tinh-Ai-Bang-Kieu-Minh-Tuyet/ZWZBWDAF.html', + 'url': 'https://zingmp3.vn/album/Ca-Phe-Quan-Quen-Hoang-Dung-My-Anh-Da-LAB-Thinh-Suy/ZOC7WUZC.html', 'info_dict': { - 'id': 'ZWZBWDAF', - 'title': 'Lâu Đài Tình Ái', + 'id': 'ZOC7WUZC', + 'title': 'Cà Phê Quán Quen', }, - 'playlist_mincount': 9, + 'playlist_mincount': 10, }, { 'url': 'https://zingmp3.vn/album/Nhung-Bai-Hat-Hay-Nhat-Cua-Mr-Siro-Mr-Siro/ZWZAEZZD.html', 'info_dict': { @@ -231,7 +268,7 @@ def _real_extract(self, url): class ZingMp3ChartHomeIE(ZingMp3BaseIE): - _VALID_URL = r'https?://(?:mp3\.zing|zingmp3)\.vn/(?P<id>(?:zing-chart|moi-phat-hanh))/?(?:[#?]|$)' + _VALID_URL = r'https?://(?:mp3\.zing|zingmp3)\.vn/(?P<id>(?:zing-chart|moi-phat-hanh|top100|podcast-discover))/?(?:[#?]|$)' _TESTS = [{ 'url': 'https://zingmp3.vn/zing-chart', 'info_dict': { @@ -244,13 +281,34 @@ class ZingMp3ChartHomeIE(ZingMp3BaseIE): 'id': 'moi-phat-hanh', }, 'playlist_mincount': 100, + 
}, {
+        'url': 'https://zingmp3.vn/top100',
+        'info_dict': {
+            'id': 'top100',
+        },
+        'playlist_mincount': 50,
+    }, {
+        'url': 'https://zingmp3.vn/podcast-discover',
+        'info_dict': {
+            'id': 'podcast-discover',
+        },
+        'playlist_mincount': 4,
     }]
     IE_NAME = 'zingmp3:chart-home'
 
     def _real_extract(self, url):
         url_type = self._match_id(url)
-        data = self._call_api(url_type, {'id': url_type})
-        items = traverse_obj(data, ('RTChart', 'items') if url_type == 'zing-chart' else 'items')
+        params = {'id': url_type}
+        if url_type == 'podcast-discover':
+            params['type'] = 'discover'
+        data = self._call_api(url_type, params)
+        items = []
+        if url_type == 'top100':
+            items.extend(traverse_obj(data, (..., 'items', ..., {dict})))
+        elif url_type == 'zing-chart':
+            items.extend(traverse_obj(data, ('RTChart', 'items', ..., {dict})))
+        else:
+            items.extend(traverse_obj(data, ('items', ..., {dict})))
         return self.playlist_result(self._parse_items(items), url_type)
@@ -334,7 +392,7 @@ def _real_extract(self, url):
 
 
 class ZingMp3UserIE(ZingMp3BaseIE):
-    _VALID_URL = r'https?://(?:mp3\.zing|zingmp3)\.vn/(?P<user>[^/]+)/(?P<type>bai-hat|single|album|video)/?(?:[?#]|$)'
+    _VALID_URL = r'https?://(?:mp3\.zing|zingmp3)\.vn/(?P<user>[^/]+)/(?P<type>bai-hat|single|album|video|song)/?(?:[?#]|$)'
     IE_NAME = 'zingmp3:user'
     _TESTS = [{
         'url': 'https://zingmp3.vn/Mr-Siro/bai-hat',
@@ -368,6 +426,18 @@ class ZingMp3UserIE(ZingMp3BaseIE):
             'description': 'md5:5bdcf45e955dc1b8d7f518f322ffef36',
         },
         'playlist_mincount': 15,
+    }, {
+        'url': 'https://zingmp3.vn/new-release/song',
+        'info_dict': {
+            'id': 'new-release-song',
+        },
+        'playlist_mincount': 50,
+    }, {
+        'url': 'https://zingmp3.vn/new-release/album',
+        'info_dict': {
+            'id': 'new-release-album',
+        },
+        'playlist_mincount': 20,
     }]
 
     def _fetch_page(self, user_id, url_type, page):
@@ -380,20 +450,28 @@ def _fetch_page(self, user_id, url_type, page):
         })
 
     def _real_extract(self, url):
-        user_alias, url_type = self._match_valid_url(url).group('user', 'type')
+        alias, url_type = self._match_valid_url(url).group('user', 'type')
         if not url_type:
             url_type = 'bai-hat'
 
-        user_info = self._call_api('info-artist', {}, user_alias, query={'alias': user_alias})
-        if url_type in ('bai-hat', 'video'):
-            entries = self._paged_list(user_info['id'], url_type)
+        user_info = self._call_api('info-artist', {}, alias, query={'alias': alias})
+
+        # Handle new-release listings
+        if alias == 'new-release' and url_type in ('song', 'album'):
+            _id = f'{alias}-{url_type}'
+            return self.playlist_result(self._parse_items(
+                self._call_api('new-release', params={'type': url_type}, display_id=_id)), _id)
         else:
-            entries = self._parse_items(traverse_obj(user_info, (
-                'sections',
-                lambda _, v: v['sectionId'] == 'aAlbum' if url_type == 'album' else v['sectionId'] == 'aSingle',
-                'items', ...)))
-        return self.playlist_result(
-            entries, user_info['id'], f'{user_info.get("name")} - {url_type}', user_info.get('biography'))
+            # Handle user/artist pages
+            if url_type in ('bai-hat', 'video'):
+                entries = self._paged_list(user_info['id'], url_type)
+            else:
+                section_id = 'aAlbum' if url_type == 'album' else 'aSingle'
+                entries = self._parse_items(traverse_obj(user_info, (
+                    'sections', lambda _, v: v['sectionId'] == section_id, 'items', ...)))
+            return self.playlist_result(
+                entries, user_info['id'], join_nonempty(user_info.get('name'), url_type, delim=' - '),
+                user_info.get('biography'))
 
 
 class ZingMp3HubIE(ZingMp3BaseIE):
@@ -403,7 +481,7 @@ class ZingMp3HubIE(ZingMp3BaseIE):
        'url': 
'https://zingmp3.vn/hub/Nhac-Moi/IWZ9Z0CA.html', 'info_dict': { 'id': 'IWZ9Z0CA', - 'title': 'Nhạc Mới', + 'title': 'BXH Nhạc Mới', 'description': 'md5:1cc31b68a6f746427b07b2756c22a558', }, 'playlist_mincount': 20, @@ -424,3 +502,129 @@ def _real_extract(self, url): 'sections', lambda _, v: v['sectionId'] == 'hub', 'items', ...))) return self.playlist_result( entries, song_id, hub_detail.get('title'), hub_detail.get('description')) + + +class ZingMp3LiveRadioIE(ZingMp3BaseIE): + IE_NAME = 'zingmp3:liveradio' + _VALID_URL = r'https?://(?:mp3\.zing|zingmp3)\.vn/(?P<type>(?:liveradio))/(?P<id>\w+)(?:\.html|\?)' + _TESTS = [{ + 'url': 'https://zingmp3.vn/liveradio/IWZ979UB.html', + 'info_dict': { + 'id': 'IWZ979UB', + 'title': r're:^V\-POP', + 'description': 'md5:aa857f8a91dc9ce69e862a809e4bdc10', + 'protocol': 'm3u8_native', + 'ext': 'mp4', + 'view_count': int, + 'thumbnail': r're:^https?://.*\.jpg', + 'like_count': int, + 'live_status': 'is_live', + }, + 'params': { + 'skip_download': True, + }, + }, { + 'url': 'https://zingmp3.vn/liveradio/IWZ97CWB.html', + 'info_dict': { + 'id': 'IWZ97CWB', + 'title': r're:^Live\s247', + 'description': 'md5:d41d8cd98f00b204e9800998ecf8427e', + 'protocol': 'm3u8_native', + 'ext': 'm4a', + 'view_count': int, + 'thumbnail': r're:^https?://.*\.jpg', + 'like_count': int, + 'live_status': 'is_live', + }, + 'params': { + 'skip_download': True, + }, + }] + + def _real_extract(self, url): + url_type, live_radio_id = self._match_valid_url(url).group('type', 'id') + info = self._call_api(url_type, {'id': live_radio_id}) + manifest_url = info.get('streaming') + if not manifest_url: + raise ExtractorError('This radio is offline.', expected=True) + fmts, subtitles = self._extract_m3u8_formats_and_subtitles(manifest_url, live_radio_id, fatal=False) + return { + 'id': live_radio_id, + 'is_live': True, + 'formats': fmts, + 'subtitles': subtitles, + **traverse_obj(info, { + 'title': 'title', + 'thumbnail': (('thumbnail', 'thumbnailM', 'thumbnailV', 'thumbnailH'), {url_or_none}), + 'view_count': ('activeUsers', {int_or_none}), + 'like_count': ('totalReaction', {int_or_none}), + 'description': 'description', + }, get_all=False), + } + + +class ZingMp3PodcastEpisodeIE(ZingMp3BaseIE): + IE_NAME = 'zingmp3:podcast-episode' + _VALID_URL = ZingMp3BaseIE._VALID_URL_TMPL % 'pgr|cgr' + _TESTS = [{ + 'url': 'https://zingmp3.vn/pgr/Nhac-Moi-Moi-Ngay/68Z9W66B.html', + 'info_dict': { + 'id': '68Z9W66B', + 'title': 'Nhạc Mới Mỗi Ngày', + 'description': 'md5:2875dfa951f8e5356742f1610cf20691' + }, + 'playlist_mincount': 20, + }, { + 'url': 'https://zingmp3.vn/cgr/Am-nhac/IWZ980AO.html', + 'info_dict': { + 'id': 'IWZ980AO', + 'title': 'Âm nhạc' + }, + 'playlist_mincount': 2, + }] + + def _fetch_page(self, eps_id, url_type, page): + return self._call_api(url_type, { + 'id': eps_id, + 'page': page, + 'count': self._PER_PAGE + }) + + def _real_extract(self, url): + podcast_id, url_type = self._match_valid_url(url).group('id', 'type') + podcast_info = self._call_api(url_type, {'id': podcast_id}) + entries = self._paged_list(podcast_id, 'pgr-list' if url_type == 'pgr' else 'cgr-list') + return self.playlist_result( + entries, podcast_id, podcast_info.get('title'), podcast_info.get('description')) + + +class ZingMp3PodcastIE(ZingMp3BaseIE): + IE_NAME = 'zingmp3:podcast' + _VALID_URL = r'https?://(?:mp3\.zing|zingmp3)\.vn/(?P<id>(?:cgr|top-podcast|podcast-new))/?(?:[#?]|$)' + _TESTS = [{ + 'url': 'https://zingmp3.vn/cgr', + 'info_dict': { + 'id': 'cgr', + }, + 'playlist_mincount': 5, + }, { + 
'url': 'https://zingmp3.vn/top-podcast', + 'info_dict': { + 'id': 'top-podcast', + }, + 'playlist_mincount': 7, + }, { + 'url': 'https://zingmp3.vn/podcast-new', + 'info_dict': { + 'id': 'podcast-new', + }, + 'playlist_mincount': 4, + }] + + def _real_extract(self, url): + url_type = self._match_id(url) + params = {'id': url_type} + if url_type == 'podcast-new': + params['type'] = 'new' + items = self._call_api('cgrs' if url_type == 'cgr' else url_type, params)['items'] + return self.playlist_result(self._parse_items(items), url_type) diff --git a/yt_dlp/extractor/zoom.py b/yt_dlp/extractor/zoom.py index 1e41d0434..e2bf81729 100644 --- a/yt_dlp/extractor/zoom.py +++ b/yt_dlp/extractor/zoom.py @@ -2,10 +2,12 @@ from ..utils import ( ExtractorError, int_or_none, - str_or_none, js_to_json, parse_filesize, + parse_resolution, + str_or_none, traverse_obj, + url_basename, urlencode_postdata, urljoin, ) @@ -13,7 +15,7 @@ class ZoomIE(InfoExtractor): IE_NAME = 'zoom' - _VALID_URL = r'(?P<base_url>https?://(?:[^.]+\.)?zoom.us/)rec(?:ording)?/(?P<type>play|share)/(?P<id>[A-Za-z0-9_.-]+)' + _VALID_URL = r'(?P<base_url>https?://(?:[^.]+\.)?zoom\.us/)rec(?:ording)?/(?P<type>play|share)/(?P<id>[\w.-]+)' _TESTS = [{ 'url': 'https://economist.zoom.us/rec/play/dUk_CNBETmZ5VA2BwEl-jjakPpJ3M1pcfVYAPRsoIbEByGsLjUZtaa4yCATQuOL3der8BlTwxQePl_j0.EImBkXzTIaPvdZO5', 'md5': 'ab445e8c911fddc4f9adc842c2c5d434', @@ -41,6 +43,18 @@ class ZoomIE(InfoExtractor): 'ext': 'mp4', 'title': 'Timea Andrea Lelik\'s Personal Meeting Room', }, + 'skip': 'This recording has expired', + }, { + # view_with_share URL + 'url': 'https://cityofdetroit.zoom.us/rec/share/VjE-5kW3xmgbEYqR5KzRgZ1OFZvtMtiXk5HyRJo5kK4m5PYE6RF4rF_oiiO_9qaM.UTAg1MI7JSnF3ZjX', + 'md5': 'bdc7867a5934c151957fb81321b3c024', + 'info_dict': { + 'id': 'VjE-5kW3xmgbEYqR5KzRgZ1OFZvtMtiXk5HyRJo5kK4m5PYE6RF4rF_oiiO_9qaM.UTAg1MI7JSnF3ZjX', + 'ext': 'mp4', + 'title': 'February 2022 Detroit Revenue Estimating Conference', + 'duration': 7299, + 'formats': 'mincount:3', + }, }] def _get_page_data(self, webpage, video_id): @@ -72,6 +86,7 @@ def _get_real_webpage(self, url, base_url, video_id, url_type): def _real_extract(self, url): base_url, url_type, video_id = self._match_valid_url(url).group('base_url', 'type', 'id') + query = {} if url_type == 'share': webpage = self._get_real_webpage(url, base_url, video_id, 'share') @@ -80,6 +95,7 @@ def _real_extract(self, url): f'{base_url}nws/recording/1.0/play/share-info/{meeting_id}', video_id, note='Downloading share info JSON')['result']['redirectUrl'] url = urljoin(base_url, redirect_path) + query['continueMode'] = 'true' webpage = self._get_real_webpage(url, base_url, video_id, 'play') file_id = self._get_page_data(webpage, video_id)['fileId'] @@ -88,7 +104,7 @@ def _real_extract(self, url): raise ExtractorError('Unable to extract file ID') data = self._download_json( - f'{base_url}nws/recording/1.0/play/info/{file_id}', video_id, + f'{base_url}nws/recording/1.0/play/info/{file_id}', video_id, query=query, note='Downloading play info JSON')['result'] subtitles = {} @@ -104,10 +120,10 @@ def _real_extract(self, url): if data.get('viewMp4Url'): formats.append({ 'format_note': 'Camera stream', - 'url': str_or_none(data.get('viewMp4Url')), + 'url': data['viewMp4Url'], 'width': int_or_none(traverse_obj(data, ('viewResolvtions', 0))), 'height': int_or_none(traverse_obj(data, ('viewResolvtions', 1))), - 'format_id': str_or_none(traverse_obj(data, ('recording', 'id'))), + 'format_id': 'view', 'ext': 'mp4', 'filesize_approx': 
parse_filesize(str_or_none(traverse_obj(data, ('recording', 'fileSizeInMB')))), 'preference': 0 @@ -116,14 +132,26 @@ def _real_extract(self, url): if data.get('shareMp4Url'): formats.append({ 'format_note': 'Screen share stream', - 'url': str_or_none(data.get('shareMp4Url')), + 'url': data['shareMp4Url'], 'width': int_or_none(traverse_obj(data, ('shareResolvtions', 0))), 'height': int_or_none(traverse_obj(data, ('shareResolvtions', 1))), - 'format_id': str_or_none(traverse_obj(data, ('shareVideo', 'id'))), + 'format_id': 'share', 'ext': 'mp4', 'preference': -1 }) + view_with_share_url = data.get('viewMp4WithshareUrl') + if view_with_share_url: + formats.append({ + **parse_resolution(self._search_regex( + r'_(\d+x\d+)\.mp4', url_basename(view_with_share_url), 'resolution', default=None)), + 'format_note': 'Screen share with camera', + 'url': view_with_share_url, + 'format_id': 'view_with_share', + 'ext': 'mp4', + 'preference': 1 + }) + return { 'id': video_id, 'title': str_or_none(traverse_obj(data, ('meet', 'topic'))), diff --git a/yt_dlp/networking/__init__.py b/yt_dlp/networking/__init__.py index 5b1599a6d..acadc0147 100644 --- a/yt_dlp/networking/__init__.py +++ b/yt_dlp/networking/__init__.py @@ -1,4 +1,6 @@ # flake8: noqa: F401 +import warnings + from .common import ( HEADRequest, PUTRequest, @@ -11,3 +13,18 @@ # isort: split # TODO: all request handlers should be safely imported from . import _urllib +from ..utils import bug_reports_message + +try: + from . import _requests +except ImportError: + pass +except Exception as e: + warnings.warn(f'Failed to import "requests" request handler: {e}' + bug_reports_message()) + +try: + from . import _websockets +except ImportError: + pass +except Exception as e: + warnings.warn(f'Failed to import "websockets" request handler: {e}' + bug_reports_message()) diff --git a/yt_dlp/networking/_helper.py b/yt_dlp/networking/_helper.py index 4c9dbf25d..d79dd7953 100644 --- a/yt_dlp/networking/_helper.py +++ b/yt_dlp/networking/_helper.py @@ -11,7 +11,7 @@ from .exceptions import RequestError, UnsupportedRequest from ..dependencies import certifi -from ..socks import ProxyType +from ..socks import ProxyType, sockssocket from ..utils import format_field, traverse_obj if typing.TYPE_CHECKING: @@ -219,7 +219,25 @@ def _socket_connect(ip_addr, timeout, source_address): sock.bind(source_address) sock.connect(sa) return sock - except socket.error: + except OSError: + sock.close() + raise + + +def create_socks_proxy_socket(dest_addr, proxy_args, proxy_ip_addr, timeout, source_address): + af, socktype, proto, canonname, sa = proxy_ip_addr + sock = sockssocket(af, socktype, proto) + try: + connect_proxy_args = proxy_args.copy() + connect_proxy_args.update({'addr': sa[0], 'port': sa[1]}) + sock.setproxy(**connect_proxy_args) + if timeout is not socket._GLOBAL_DEFAULT_TIMEOUT: # noqa: E721 + sock.settimeout(timeout) + if source_address: + sock.bind(source_address) + sock.connect(dest_addr) + return sock + except OSError: sock.close() raise @@ -237,7 +255,7 @@ def create_connection( host, port = address ip_addrs = socket.getaddrinfo(host, port, 0, socket.SOCK_STREAM) if not ip_addrs: - raise socket.error('getaddrinfo returns an empty list') + raise OSError('getaddrinfo returns an empty list') if source_address is not None: af = socket.AF_INET if ':' not in source_address[0] else socket.AF_INET6 ip_addrs = [addr for addr in ip_addrs if addr[0] == af] @@ -254,7 +272,7 @@ def create_connection( # https://bugs.python.org/issue36820 err = None return sock - except 
socket.error as e:
+        except OSError as e:
             err = e
 
     try:
diff --git a/yt_dlp/networking/_requests.py b/yt_dlp/networking/_requests.py
new file mode 100644
index 000000000..e129110ca
--- /dev/null
+++ b/yt_dlp/networking/_requests.py
@@ -0,0 +1,400 @@
+import contextlib
+import functools
+import http.client
+import logging
+import re
+import socket
+import warnings
+
+from ..dependencies import brotli, requests, urllib3
+from ..utils import bug_reports_message, int_or_none, variadic
+
+if requests is None:
+    raise ImportError('requests module is not installed')
+
+if urllib3 is None:
+    raise ImportError('urllib3 module is not installed')
+
+urllib3_version = tuple(int_or_none(x, default=0) for x in urllib3.__version__.split('.'))
+
+if urllib3_version < (1, 26, 17):
+    raise ImportError('Only urllib3 >= 1.26.17 is supported')
+
+if requests.__build__ < 0x023100:
+    raise ImportError('Only requests >= 2.31.0 is supported')
+
+import requests.adapters
+import requests.utils
+import urllib3.connection
+import urllib3.exceptions
+
+from ._helper import (
+    InstanceStoreMixin,
+    add_accept_encoding_header,
+    create_connection,
+    create_socks_proxy_socket,
+    get_redirect_method,
+    make_socks_proxy_opts,
+    select_proxy,
+)
+from .common import (
+    Features,
+    RequestHandler,
+    Response,
+    register_preference,
+    register_rh,
+)
+from .exceptions import (
+    CertificateVerifyError,
+    HTTPError,
+    IncompleteRead,
+    ProxyError,
+    RequestError,
+    SSLError,
+    TransportError,
+)
+from ..socks import ProxyError as SocksProxyError
+
+SUPPORTED_ENCODINGS = [
+    'gzip', 'deflate'
+]
+
+if brotli is not None:
+    SUPPORTED_ENCODINGS.append('br')
+
+"""
+Override urllib3's behavior to not convert lower-case percent-encoded characters
+to upper-case during the URL normalization process.
+
+RFC 3986 defines that lower- and upper-case percent-encoded hexadecimal characters are equivalent
+and normalizers should convert them to uppercase for consistency [1].
+
+However, some sites may have an incorrect implementation where they provide
+a percent-encoded URL that is then compared case-sensitively.[2]
+
+While this is a very rare case, since urllib does not do this normalization step, it
+is best to avoid it in requests too for compatibility reasons.
+
+1: https://tools.ietf.org/html/rfc3986#section-2.1
+2: https://github.com/streamlink/streamlink/pull/4003
+"""
+
+
+class Urllib3PercentREOverride:
+    def __init__(self, r: re.Pattern):
+        self.re = r
+
+    # pass through all other attribute calls to the original re
+    def __getattr__(self, item):
+        return self.re.__getattribute__(item)
+
+    def subn(self, repl, string, *args, **kwargs):
+        return string, self.re.subn(repl, string, *args, **kwargs)[1]
+
+
+# urllib3 >= 1.25.8 uses subn:
+# https://github.com/urllib3/urllib3/commit/a2697e7c6b275f05879b60f593c5854a816489f0
+import urllib3.util.url  # noqa: E305
+
+if hasattr(urllib3.util.url, 'PERCENT_RE'):
+    urllib3.util.url.PERCENT_RE = Urllib3PercentREOverride(urllib3.util.url.PERCENT_RE)
+elif hasattr(urllib3.util.url, '_PERCENT_RE'):  # urllib3 >= 2.0.0
+    urllib3.util.url._PERCENT_RE = Urllib3PercentREOverride(urllib3.util.url._PERCENT_RE)
+else:
+    warnings.warn('Failed to patch PERCENT_RE in urllib3 (does the attribute exist?)' + bug_reports_message())
+
+"""
+Workaround for an issue in urllib3.util.ssl_.py: ssl_wrap_socket does not pass
+server_hostname to SSLContext.wrap_socket if server_hostname is an IP,
+however this is an issue because we set check_hostname to True in our SSLContext.
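+Without a server_hostname, wrap_socket raises "check_hostname requires server_hostname",
+so HTTPS requests to raw-IP hosts would otherwise fail.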
+
+Monkey-patching IS_SECURETRANSPORT forces ssl_wrap_socket to pass server_hostname regardless.
+
+This has been fixed in urllib3 2.0+.
+See: https://github.com/urllib3/urllib3/issues/517
+"""
+
+if urllib3_version < (2, 0, 0):
+    with contextlib.suppress(Exception):
+        urllib3.util.IS_SECURETRANSPORT = urllib3.util.ssl_.IS_SECURETRANSPORT = True
+
+
+# Requests will not automatically handle no_proxy by default
+# due to buggy no_proxy handling with proxy dict [1].
+# 1. https://github.com/psf/requests/issues/5000
+requests.adapters.select_proxy = select_proxy
+
+
+class RequestsResponseAdapter(Response):
+    def __init__(self, res: requests.models.Response):
+        super().__init__(
+            fp=res.raw, headers=res.headers, url=res.url,
+            status=res.status_code, reason=res.reason)
+
+        self._requests_response = res
+
+    def read(self, amt: int = None):
+        try:
+            # Interact with urllib3 response directly.
+            return self.fp.read(amt, decode_content=True)
+
+        # See urllib3.response.HTTPResponse.read() for exceptions raised on read
+        except urllib3.exceptions.SSLError as e:
+            raise SSLError(cause=e) from e
+
+        except urllib3.exceptions.ProtocolError as e:
+            # IncompleteRead is always contained within ProtocolError
+            # See urllib3.response.HTTPResponse._error_catcher()
+            ir_err = next(
+                (err for err in (e.__context__, e.__cause__, *variadic(e.args))
+                 if isinstance(err, http.client.IncompleteRead)), None)
+            if ir_err is not None:
+                # `urllib3.exceptions.IncompleteRead` is a subclass of `http.client.IncompleteRead`
+                # but uses an `int` for its `partial` property.
+                partial = ir_err.partial if isinstance(ir_err.partial, int) else len(ir_err.partial)
+                raise IncompleteRead(partial=partial, expected=ir_err.expected) from e
+            raise TransportError(cause=e) from e
+
+        except urllib3.exceptions.HTTPError as e:
+            # catch-all for any other urllib3 response exceptions
+            raise TransportError(cause=e) from e
+
+
+class RequestsHTTPAdapter(requests.adapters.HTTPAdapter):
+    def __init__(self, ssl_context=None, proxy_ssl_context=None, source_address=None, **kwargs):
+        self._pm_args = {}
+        if ssl_context:
+            self._pm_args['ssl_context'] = ssl_context
+        if source_address:
+            self._pm_args['source_address'] = (source_address, 0)
+        self._proxy_ssl_context = proxy_ssl_context or ssl_context
+        super().__init__(**kwargs)
+
+    def init_poolmanager(self, *args, **kwargs):
+        return super().init_poolmanager(*args, **kwargs, **self._pm_args)
+
+    def proxy_manager_for(self, proxy, **proxy_kwargs):
+        extra_kwargs = {}
+        if not proxy.lower().startswith('socks') and self._proxy_ssl_context:
+            extra_kwargs['proxy_ssl_context'] = self._proxy_ssl_context
+        return super().proxy_manager_for(proxy, **proxy_kwargs, **self._pm_args, **extra_kwargs)
+
+    def cert_verify(*args, **kwargs):
+        # lean on SSLContext for cert verification
+        pass
+
+
+class RequestsSession(requests.sessions.Session):
+    """
+    Ensure unified redirect method handling with our urllib redirect handler.
+    """
+
+    def rebuild_method(self, prepared_request, response):
+        new_method = get_redirect_method(prepared_request.method, response.status_code)
+
+        # HACK: requests removes headers/body on redirect unless code was a 307/308.
+        if new_method == prepared_request.method:
+            response._real_status_code = response.status_code
+            response.status_code = 308
+
+        prepared_request.method = new_method
+
+    def rebuild_auth(self, prepared_request, response):
+        # HACK: undo status code change from rebuild_method, if applicable.
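+        # (only responses routed through the spoofed-308 path in rebuild_method carry _real_status_code)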
+ # rebuild_auth runs after requests would remove headers/body based on status code + if hasattr(response, '_real_status_code'): + response.status_code = response._real_status_code + del response._real_status_code + return super().rebuild_auth(prepared_request, response) + + +class Urllib3LoggingFilter(logging.Filter): + + def filter(self, record): + # Ignore HTTP request messages since HTTPConnection prints those + if record.msg == '%s://%s:%s "%s %s %s" %s %s': + return False + return True + + +class Urllib3LoggingHandler(logging.Handler): + """Redirect urllib3 logs to our logger""" + + def __init__(self, logger, *args, **kwargs): + super().__init__(*args, **kwargs) + self._logger = logger + + def emit(self, record): + try: + msg = self.format(record) + if record.levelno >= logging.ERROR: + self._logger.error(msg) + else: + self._logger.stdout(msg) + + except Exception: + self.handleError(record) + + +@register_rh +class RequestsRH(RequestHandler, InstanceStoreMixin): + + """Requests RequestHandler + https://github.com/psf/requests + """ + _SUPPORTED_URL_SCHEMES = ('http', 'https') + _SUPPORTED_ENCODINGS = tuple(SUPPORTED_ENCODINGS) + _SUPPORTED_PROXY_SCHEMES = ('http', 'https', 'socks4', 'socks4a', 'socks5', 'socks5h') + _SUPPORTED_FEATURES = (Features.NO_PROXY, Features.ALL_PROXY) + RH_NAME = 'requests' + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + # Forward urllib3 debug messages to our logger + logger = logging.getLogger('urllib3') + handler = Urllib3LoggingHandler(logger=self._logger) + handler.setFormatter(logging.Formatter('requests: %(message)s')) + handler.addFilter(Urllib3LoggingFilter()) + logger.addHandler(handler) + # TODO: Use a logger filter to suppress pool reuse warning instead + logger.setLevel(logging.ERROR) + + if self.verbose: + # Setting this globally is not ideal, but is easier than hacking with urllib3. + # It could technically be problematic for scripts embedding yt-dlp. + # However, it is unlikely debug traffic is used in that context in a way this will cause problems. 
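+            # (HTTPConnection.debuglevel is a class attribute, hence the process-wide effect)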
+            urllib3.connection.HTTPConnection.debuglevel = 1
+            logger.setLevel(logging.DEBUG)
+        # this is expected if we are using --no-check-certificate
+        urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
+
+    def close(self):
+        self._clear_instances()
+
+    def _check_extensions(self, extensions):
+        super()._check_extensions(extensions)
+        extensions.pop('cookiejar', None)
+        extensions.pop('timeout', None)
+
+    def _create_instance(self, cookiejar):
+        session = RequestsSession()
+        http_adapter = RequestsHTTPAdapter(
+            ssl_context=self._make_sslcontext(),
+            source_address=self.source_address,
+            max_retries=urllib3.util.retry.Retry(False),
+        )
+        session.adapters.clear()
+        session.headers = requests.models.CaseInsensitiveDict({'Connection': 'keep-alive'})
+        session.mount('https://', http_adapter)
+        session.mount('http://', http_adapter)
+        session.cookies = cookiejar
+        session.trust_env = False  # no need, we already load proxies from env
+        return session
+
+    def _send(self, request):
+
+        headers = self._merge_headers(request.headers)
+        add_accept_encoding_header(headers, SUPPORTED_ENCODINGS)
+
+        max_redirects_exceeded = False
+
+        session = self._get_instance(
+            cookiejar=request.extensions.get('cookiejar') or self.cookiejar)
+
+        try:
+            requests_res = session.request(
+                method=request.method,
+                url=request.url,
+                data=request.data,
+                headers=headers,
+                timeout=float(request.extensions.get('timeout') or self.timeout),
+                proxies=request.proxies or self.proxies,
+                allow_redirects=True,
+                stream=True
+            )
+
+        except requests.exceptions.TooManyRedirects as e:
+            max_redirects_exceeded = True
+            requests_res = e.response
+
+        except requests.exceptions.SSLError as e:
+            if 'CERTIFICATE_VERIFY_FAILED' in str(e):
+                raise CertificateVerifyError(cause=e) from e
+            raise SSLError(cause=e) from e
+
+        except requests.exceptions.ProxyError as e:
+            raise ProxyError(cause=e) from e
+
+        except (requests.exceptions.ConnectionError, requests.exceptions.Timeout) as e:
+            raise TransportError(cause=e) from e
+
+        except urllib3.exceptions.HTTPError as e:
+            # Catch any urllib3 exceptions that may leak through
+            raise TransportError(cause=e) from e
+
+        except requests.exceptions.RequestException as e:
+            # Miscellaneous Requests exceptions. May not necessarily be network related, e.g. InvalidURL
+            raise RequestError(cause=e) from e
+
+        res = RequestsResponseAdapter(requests_res)
+
+        if not 200 <= res.status < 300:
+            raise HTTPError(res, redirect_loop=max_redirects_exceeded)
+
+        return res
+
+
+@register_preference(RequestsRH)
+def requests_preference(rh, request):
+    return 100
+
+
+# Use our socks proxy implementation with requests to avoid an extra dependency.
+class SocksHTTPConnection(urllib3.connection.HTTPConnection):
+    def __init__(self, _socks_options, *args, **kwargs):  # must use _socks_options to pass PoolKey checks
+        self._proxy_args = _socks_options
+        super().__init__(*args, **kwargs)
+
+    def _new_conn(self):
+        try:
+            return create_connection(
+                address=(self._proxy_args['addr'], self._proxy_args['port']),
+                timeout=self.timeout,
+                source_address=self.source_address,
+                _create_socket_func=functools.partial(
+                    create_socks_proxy_socket, (self.host, self.port), self._proxy_args))
+        except (socket.timeout, TimeoutError) as e:
+            raise urllib3.exceptions.ConnectTimeoutError(
+                self, f'Connection to {self.host} timed out. 
(connect timeout={self.timeout})') from e + except SocksProxyError as e: + raise urllib3.exceptions.ProxyError(str(e), e) from e + except OSError as e: + raise urllib3.exceptions.NewConnectionError( + self, f'Failed to establish a new connection: {e}') from e + + +class SocksHTTPSConnection(SocksHTTPConnection, urllib3.connection.HTTPSConnection): + pass + + +class SocksHTTPConnectionPool(urllib3.HTTPConnectionPool): + ConnectionCls = SocksHTTPConnection + + +class SocksHTTPSConnectionPool(urllib3.HTTPSConnectionPool): + ConnectionCls = SocksHTTPSConnection + + +class SocksProxyManager(urllib3.PoolManager): + + def __init__(self, socks_proxy, username=None, password=None, num_pools=10, headers=None, **connection_pool_kw): + connection_pool_kw['_socks_options'] = make_socks_proxy_opts(socks_proxy) + super().__init__(num_pools, headers, **connection_pool_kw) + self.pool_classes_by_scheme = { + 'http': SocksHTTPConnectionPool, + 'https': SocksHTTPSConnectionPool + } + + +requests.adapters.SOCKSProxyManager = SocksProxyManager diff --git a/yt_dlp/networking/_urllib.py b/yt_dlp/networking/_urllib.py index 9e2bf33e4..68bab2b08 100644 --- a/yt_dlp/networking/_urllib.py +++ b/yt_dlp/networking/_urllib.py @@ -3,7 +3,6 @@ import functools import http.client import io -import socket import ssl import urllib.error import urllib.parse @@ -24,6 +23,7 @@ InstanceStoreMixin, add_accept_encoding_header, create_connection, + create_socks_proxy_socket, get_redirect_method, make_socks_proxy_opts, select_proxy, @@ -40,7 +40,6 @@ ) from ..dependencies import brotli from ..socks import ProxyError as SocksProxyError -from ..socks import sockssocket from ..utils import update_url_query from ..utils.networking import normalize_url @@ -190,25 +189,12 @@ class SocksConnection(base_class): _create_connection = create_connection def connect(self): - def sock_socket_connect(ip_addr, timeout, source_address): - af, socktype, proto, canonname, sa = ip_addr - sock = sockssocket(af, socktype, proto) - try: - connect_proxy_args = proxy_args.copy() - connect_proxy_args.update({'addr': sa[0], 'port': sa[1]}) - sock.setproxy(**connect_proxy_args) - if timeout is not socket._GLOBAL_DEFAULT_TIMEOUT: # noqa: E721 - sock.settimeout(timeout) - if source_address: - sock.bind(source_address) - sock.connect((self.host, self.port)) - return sock - except socket.error: - sock.close() - raise self.sock = create_connection( - (proxy_args['addr'], proxy_args['port']), timeout=self.timeout, - source_address=self.source_address, _create_socket_func=sock_socket_connect) + (proxy_args['addr'], proxy_args['port']), + timeout=self.timeout, + source_address=self.source_address, + _create_socket_func=functools.partial( + create_socks_proxy_socket, (self.host, self.port), proxy_args)) if isinstance(self, http.client.HTTPSConnection): self.sock = self._context.wrap_socket(self.sock, server_hostname=self.host) diff --git a/yt_dlp/networking/_websockets.py b/yt_dlp/networking/_websockets.py new file mode 100644 index 000000000..ed64080d6 --- /dev/null +++ b/yt_dlp/networking/_websockets.py @@ -0,0 +1,165 @@ +from __future__ import annotations + +import io +import logging +import ssl +import sys + +from ._helper import ( + create_connection, + create_socks_proxy_socket, + make_socks_proxy_opts, + select_proxy, +) +from .common import Features, Response, register_rh +from .exceptions import ( + CertificateVerifyError, + HTTPError, + ProxyError, + RequestError, + SSLError, + TransportError, +) +from .websocket import WebSocketRequestHandler, 
WebSocketResponse +from ..compat import functools +from ..dependencies import websockets +from ..socks import ProxyError as SocksProxyError +from ..utils import int_or_none + +if not websockets: + raise ImportError('websockets is not installed') + +import websockets.version + +websockets_version = tuple(map(int_or_none, websockets.version.version.split('.'))) +if websockets_version < (12, 0): + raise ImportError('Only websockets>=12.0 is supported') + +import websockets.sync.client +from websockets.uri import parse_uri + + +class WebsocketsResponseAdapter(WebSocketResponse): + + def __init__(self, wsw: websockets.sync.client.ClientConnection, url): + super().__init__( + fp=io.BytesIO(wsw.response.body or b''), + url=url, + headers=wsw.response.headers, + status=wsw.response.status_code, + reason=wsw.response.reason_phrase, + ) + self.wsw = wsw + + def close(self): + self.wsw.close() + super().close() + + def send(self, message): + # https://websockets.readthedocs.io/en/stable/reference/sync/client.html#websockets.sync.client.ClientConnection.send + try: + return self.wsw.send(message) + except (websockets.exceptions.WebSocketException, RuntimeError, TimeoutError) as e: + raise TransportError(cause=e) from e + except SocksProxyError as e: + raise ProxyError(cause=e) from e + except TypeError as e: + raise RequestError(cause=e) from e + + def recv(self): + # https://websockets.readthedocs.io/en/stable/reference/sync/client.html#websockets.sync.client.ClientConnection.recv + try: + return self.wsw.recv() + except SocksProxyError as e: + raise ProxyError(cause=e) from e + except (websockets.exceptions.WebSocketException, RuntimeError, TimeoutError) as e: + raise TransportError(cause=e) from e + + +@register_rh +class WebsocketsRH(WebSocketRequestHandler): + """ + Websockets request handler + https://websockets.readthedocs.io + https://github.com/python-websockets/websockets + """ + _SUPPORTED_URL_SCHEMES = ('wss', 'ws') + _SUPPORTED_PROXY_SCHEMES = ('socks4', 'socks4a', 'socks5', 'socks5h') + _SUPPORTED_FEATURES = (Features.ALL_PROXY, Features.NO_PROXY) + RH_NAME = 'websockets' + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + for name in ('websockets.client', 'websockets.server'): + logger = logging.getLogger(name) + handler = logging.StreamHandler(stream=sys.stdout) + handler.setFormatter(logging.Formatter(f'{self.RH_NAME}: %(message)s')) + logger.addHandler(handler) + if self.verbose: + logger.setLevel(logging.DEBUG) + + def _check_extensions(self, extensions): + super()._check_extensions(extensions) + extensions.pop('timeout', None) + extensions.pop('cookiejar', None) + + def _send(self, request): + timeout = float(request.extensions.get('timeout') or self.timeout) + headers = self._merge_headers(request.headers) + if 'cookie' not in headers: + cookiejar = request.extensions.get('cookiejar') or self.cookiejar + cookie_header = cookiejar.get_cookie_header(request.url) + if cookie_header: + headers['cookie'] = cookie_header + + wsuri = parse_uri(request.url) + create_conn_kwargs = { + 'source_address': (self.source_address, 0) if self.source_address else None, + 'timeout': timeout + } + proxy = select_proxy(request.url, request.proxies or self.proxies or {}) + try: + if proxy: + socks_proxy_options = make_socks_proxy_opts(proxy) + sock = create_connection( + address=(socks_proxy_options['addr'], socks_proxy_options['port']), + _create_socket_func=functools.partial( + create_socks_proxy_socket, (wsuri.host, wsuri.port), socks_proxy_options), + **create_conn_kwargs 
+ ) + else: + sock = create_connection( + address=(wsuri.host, wsuri.port), + **create_conn_kwargs + ) + conn = websockets.sync.client.connect( + sock=sock, + uri=request.url, + additional_headers=headers, + open_timeout=timeout, + user_agent_header=None, + ssl_context=self._make_sslcontext() if wsuri.secure else None, + close_timeout=0, # not ideal, but prevents yt-dlp hanging + ) + return WebsocketsResponseAdapter(conn, url=request.url) + + # Exceptions as per https://websockets.readthedocs.io/en/stable/reference/sync/client.html + except SocksProxyError as e: + raise ProxyError(cause=e) from e + except websockets.exceptions.InvalidURI as e: + raise RequestError(cause=e) from e + except ssl.SSLCertVerificationError as e: + raise CertificateVerifyError(cause=e) from e + except ssl.SSLError as e: + raise SSLError(cause=e) from e + except websockets.exceptions.InvalidStatus as e: + raise HTTPError( + Response( + fp=io.BytesIO(e.response.body), + url=request.url, + headers=e.response.headers, + status=e.response.status_code, + reason=e.response.reason_phrase), + ) from e + except (OSError, TimeoutError, websockets.exceptions.WebSocketException) as e: + raise TransportError(cause=e) from e diff --git a/yt_dlp/networking/exceptions.py b/yt_dlp/networking/exceptions.py index f58dc246e..12441901c 100644 --- a/yt_dlp/networking/exceptions.py +++ b/yt_dlp/networking/exceptions.py @@ -75,7 +75,7 @@ def __repr__(self): class IncompleteRead(TransportError): - def __init__(self, partial: int, expected: int = None, **kwargs): + def __init__(self, partial: int, expected: int | None = None, **kwargs): self.partial = partial self.expected = expected msg = f'{partial} bytes read' diff --git a/yt_dlp/networking/websocket.py b/yt_dlp/networking/websocket.py new file mode 100644 index 000000000..0e7e73c9e --- /dev/null +++ b/yt_dlp/networking/websocket.py @@ -0,0 +1,23 @@ +from __future__ import annotations + +import abc + +from .common import RequestHandler, Response + + +class WebSocketResponse(Response): + + def send(self, message: bytes | str): + """ + Send a message to the server. + + @param message: The message to send. A string (str) is sent as a text frame, bytes is sent as a binary frame. 
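+        @raises TransportError: if the message could not be sent, e.g. the connection is closed (handler-dependent)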
+ """ + raise NotImplementedError + + def recv(self): + raise NotImplementedError + + +class WebSocketRequestHandler(RequestHandler, abc.ABC): + pass diff --git a/yt_dlp/options.py b/yt_dlp/options.py index 23ba7015a..a2c7cc7c3 100644 --- a/yt_dlp/options.py +++ b/yt_dlp/options.py @@ -478,11 +478,12 @@ def _alias_callback(option, opt_str, value, parser, opts, nargs): 'no-attach-info-json', 'embed-thumbnail-atomicparsley', 'no-external-downloader-progress', 'embed-metadata', 'seperate-video-versions', 'no-clean-infojson', 'no-keep-subs', 'no-certifi', 'no-youtube-channel-redirect', 'no-youtube-unavailable-videos', 'no-youtube-prefer-utc-upload-date', + 'prefer-legacy-http-handler', 'manifest-filesize-approx' }, 'aliases': { - 'youtube-dl': ['all', '-multistreams', '-playlist-match-filter'], - 'youtube-dlc': ['all', '-no-youtube-channel-redirect', '-no-live-chat', '-playlist-match-filter'], + 'youtube-dl': ['all', '-multistreams', '-playlist-match-filter', '-manifest-filesize-approx'], + 'youtube-dlc': ['all', '-no-youtube-channel-redirect', '-no-live-chat', '-playlist-match-filter', '-manifest-filesize-approx'], '2021': ['2022', 'no-certifi', 'filename-sanitization', 'no-youtube-prefer-utc-upload-date'], - '2022': ['no-external-downloader-progress', 'playlist-match-filter'], + '2022': ['no-external-downloader-progress', 'playlist-match-filter', 'prefer-legacy-http-handler', 'manifest-filesize-approx'], } }, help=( 'Options that can help keep compatibility with youtube-dl or youtube-dlc ' @@ -734,7 +735,7 @@ def _alias_callback(option, opt_str, value, parser, opts, nargs): authentication.add_option( '--video-password', dest='videopassword', metavar='PASSWORD', - help='Video password (vimeo, youku)') + help='Video-specific password') authentication.add_option( '--ap-mso', dest='ap_mso', metavar='MSO', diff --git a/yt_dlp/postprocessor/ffmpeg.py b/yt_dlp/postprocessor/ffmpeg.py index 323f4303c..7c904417b 100644 --- a/yt_dlp/postprocessor/ffmpeg.py +++ b/yt_dlp/postprocessor/ffmpeg.py @@ -780,7 +780,7 @@ def add(meta_list, info_list=None): yield ('-metadata', f'{name}={value}') stream_idx = 0 - for fmt in info.get('requested_formats') or []: + for fmt in info.get('requested_formats') or [info]: stream_count = 2 if 'none' not in (fmt.get('vcodec'), fmt.get('acodec')) else 1 lang = ISO639Utils.short2long(fmt.get('language') or '') or fmt.get('language') for i in range(stream_idx, stream_idx + stream_count): diff --git a/yt_dlp/socks.py b/yt_dlp/socks.py index e7f41d7e2..b4957ac2e 100644 --- a/yt_dlp/socks.py +++ b/yt_dlp/socks.py @@ -49,7 +49,7 @@ class Socks5AddressType: ATYP_IPV6 = 0x04 -class ProxyError(socket.error): +class ProxyError(OSError): ERR_SUCCESS = 0x00 def __init__(self, code=None, msg=None): diff --git a/yt_dlp/update.py b/yt_dlp/update.py index db79df127..ba7eadf81 100644 --- a/yt_dlp/update.py +++ b/yt_dlp/update.py @@ -1,3 +1,5 @@ +from __future__ import annotations + import atexit import contextlib import hashlib @@ -7,6 +9,7 @@ import re import subprocess import sys +from dataclasses import dataclass from zipimport import zipimporter from .compat import functools # isort: split @@ -14,24 +17,35 @@ from .networking import Request from .networking.exceptions import HTTPError, network_exceptions from .utils import ( + NO_DEFAULT, Popen, - cached_method, deprecation_warning, + format_field, remove_end, - remove_start, shell_quote, system_identifier, version_tuple, ) -from .version import CHANNEL, UPDATE_HINT, VARIANT, __version__ +from .version import ( + CHANNEL, + ORIGIN, + 
RELEASE_GIT_HEAD, + UPDATE_HINT, + VARIANT, + __version__, +) UPDATE_SOURCES = { 'stable': 'yt-dlp/yt-dlp', 'nightly': 'yt-dlp/yt-dlp-nightly-builds', + 'master': 'yt-dlp/yt-dlp-master-builds', } REPOSITORY = UPDATE_SOURCES['stable'] +_INVERSE_UPDATE_SOURCES = {value: key for key, value in UPDATE_SOURCES.items()} _VERSION_RE = re.compile(r'(\d+\.)*\d+') +_HASH_PATTERN = r'[\da-f]{40}' +_COMMIT_RE = re.compile(rf'Generated from: https://(?:[^/?#]+/){{3}}commit/(?P<hash>{_HASH_PATTERN})') API_BASE_URL = 'https://api.github.com/repos' @@ -112,8 +126,12 @@ def is_non_updateable(): detect_variant(), _NON_UPDATEABLE_REASONS['unknown' if VARIANT else 'other']) +def _get_binary_name(): + return format_field(_FILE_SUFFIXES, detect_variant(), template='yt-dlp%s', ignore=None, default=None) + + def _get_system_deprecation(): - MIN_SUPPORTED, MIN_RECOMMENDED = (3, 7), (3, 8) + MIN_SUPPORTED, MIN_RECOMMENDED = (3, 8), (3, 8) if sys.version_info > MIN_RECOMMENDED: return None @@ -122,16 +140,8 @@ def _get_system_deprecation(): if sys.version_info < MIN_SUPPORTED: msg = f'Python version {major}.{minor} is no longer supported' else: - msg = f'Support for Python version {major}.{minor} has been deprecated. ' - # Temporary until `win_x86_exe` uses 3.8, which will deprecate Vista and Server 2008 - if detect_variant() == 'win_x86_exe': - platform_name = platform.platform() - if any(platform_name.startswith(f'Windows-{name}') for name in ('Vista', '2008Server')): - msg = 'Support for Windows Vista/Server 2008 has been deprecated. ' - else: - return None - msg += ('See https://github.com/yt-dlp/yt-dlp/issues/7803 for details.' - '\nYou may stop receiving updates on this version at any time') + msg = (f'Support for Python version {major}.{minor} has been deprecated. ' + '\nYou may stop receiving updates on this version at any time') major, minor = MIN_RECOMMENDED return f'{msg}! Please update to Python {major}.{minor} or above' @@ -146,73 +156,118 @@ def _sha256_file(path): return h.hexdigest() +def _make_label(origin, tag, version=None): + if '/' in origin: + channel = _INVERSE_UPDATE_SOURCES.get(origin, origin) + else: + channel = origin + label = f'{channel}@{tag}' + if version and version != tag: + label += f' build {version}' + if channel != origin: + label += f' from {origin}' + return label + + +@dataclass +class UpdateInfo: + """ + Update target information + + Can be created by `query_update()` or manually. + + Attributes: + tag The release tag that will be updated to. If from query_update, + the value is after API resolution and update spec processing. + The only property that is required. + version The actual numeric version (if available) of the binary to be updated to, + after API resolution and update spec processing. (default: None) + requested_version Numeric version of the binary being requested (if available), + after API resolution only. (default: None) + commit Commit hash (if available) of the binary to be updated to, + after API resolution and update spec processing. (default: None) + This value will only match the RELEASE_GIT_HEAD of prerelease builds. + binary_name Filename of the binary to be updated to. (default: current binary name) + checksum Expected checksum (if available) of the binary to be + updated to. 
(default: None)
+    """
+    tag: str
+    version: str | None = None
+    requested_version: str | None = None
+    commit: str | None = None
+
+    binary_name: str | None = _get_binary_name()
+    checksum: str | None = None
+
+    _has_update = True
+
+
 class Updater:
-    _exact = True
+    # XXX: use class variables to simplify testing
+    _channel = CHANNEL
+    _origin = ORIGIN
+    _update_sources = UPDATE_SOURCES
 
-    def __init__(self, ydl, target=None):
+    def __init__(self, ydl, target: str | None = None):
         self.ydl = ydl
+        # For backwards compat, target needs to be treated as if it could be None
+        self.requested_channel, sep, self.requested_tag = (target or self._channel).rpartition('@')
+        # Check if requested_tag is actually the requested repo/channel
+        if not sep and ('/' in self.requested_tag or self.requested_tag in self._update_sources):
+            self.requested_channel = self.requested_tag
+            self.requested_tag: str = None  # type: ignore (we set it later)
+        elif not self.requested_channel:
+            # User did not specify a channel, so we are requesting the default channel
+            self.requested_channel = self._channel.partition('@')[0]
 
-        self.target_channel, sep, self.target_tag = (target or CHANNEL).rpartition('@')
-        # stable => stable@latest
-        if not sep and ('/' in self.target_tag or self.target_tag in UPDATE_SOURCES):
-            self.target_channel = self.target_tag
-            self.target_tag = None
-        elif not self.target_channel:
-            self.target_channel = CHANNEL.partition('@')[0]
-
-        if not self.target_tag:
-            self.target_tag = 'latest'
+        # --update should not be treated as an exact tag request even if CHANNEL has a @tag
+        self._exact = bool(target) and target != self._channel
+        if not self.requested_tag:
+            # User did not specify a tag, so we request 'latest' and track that no exact tag was passed
+            self.requested_tag = 'latest'
             self._exact = False
-        elif self.target_tag != 'latest':
-            self.target_tag = f'tags/{self.target_tag}'
 
-        if '/' in self.target_channel:
-            self._target_repo = self.target_channel
-            if self.target_channel not in (CHANNEL, *UPDATE_SOURCES.values()):
+        if '/' in self.requested_channel:
+            # requested_channel is actually a repository
+            self.requested_repo = self.requested_channel
+            if not self.requested_repo.startswith('yt-dlp/') and self.requested_repo != self._origin:
                 self.ydl.report_warning(
                     f'You are switching to an {self.ydl._format_err("unofficial", "red")} executable '
-                    f'from {self.ydl._format_err(self._target_repo, self.ydl.Styles.EMPHASIS)}. '
+                    f'from {self.ydl._format_err(self.requested_repo, self.ydl.Styles.EMPHASIS)}. '
                     f'Run {self.ydl._format_err("at your own risk", "light red")}')
                 self._block_restart('Automatically restarting into custom builds is disabled for security reasons')
         else:
-            self._target_repo = UPDATE_SOURCES.get(self.target_channel)
-            if not self._target_repo:
+            # Check if requested_channel resolves to a known repository or else raise
+            self.requested_repo = self._update_sources.get(self.requested_channel)
+            if not self.requested_repo:
                 self._report_error(
-                    f'Invalid update channel {self.target_channel!r} requested. '
-                    f'Valid channels are {", ".join(UPDATE_SOURCES)}', True)
+                    f'Invalid update channel {self.requested_channel!r} requested. '
+                    f'Valid channels are {", ".join(self._update_sources)}', True)
 
-    def _version_compare(self, a, b, channel=CHANNEL):
-        if self._exact and channel != self.target_channel:
-            return False
+        self._identifier = f'{detect_variant()} {system_identifier()}'
 
-        if _VERSION_RE.fullmatch(f'{a}.{b}'):
-            a, b = version_tuple(a), version_tuple(b)
-            return a == b if self._exact else a >= b
-        return a == b
+    @property
+    def current_version(self):
+        """Current version"""
+        return __version__
 
-    @functools.cached_property
-    def _tag(self):
-        if self._version_compare(self.current_version, self.latest_version):
-            return self.target_tag
+    @property
+    def current_commit(self):
+        """Current commit hash"""
+        return RELEASE_GIT_HEAD
 
-        identifier = f'{detect_variant()} {self.target_channel} {system_identifier()}'
-        for line in self._download('_update_spec', 'latest').decode().splitlines():
-            if not line.startswith('lock '):
-                continue
-            _, tag, pattern = line.split(' ', 2)
-            if re.match(pattern, identifier):
-                if not self._exact:
-                    return f'tags/{tag}'
-                elif self.target_tag == 'latest' or not self._version_compare(
-                        tag, self.target_tag[5:], channel=self.target_channel):
-                    self._report_error(
-                        f'yt-dlp cannot be updated above {tag} since you are on an older Python version', True)
-                return f'tags/{self.current_version}'
-        return self.target_tag
+    def _download_asset(self, name, tag=None):
+        if not tag:
+            tag = self.requested_tag
 
-    @cached_method
-    def _get_version_info(self, tag):
-        url = f'{API_BASE_URL}/{self._target_repo}/releases/{tag}'
+        path = 'latest/download' if tag == 'latest' else f'download/{tag}'
+        url = f'https://github.com/{self.requested_repo}/releases/{path}/{name}'
+        self.ydl.write_debug(f'Downloading {name} from {url}')
+        return self.ydl.urlopen(url).read()
+
+    def _call_api(self, tag):
+        tag = f'tags/{tag}' if tag != 'latest' else tag
+        url = f'{API_BASE_URL}/{self.requested_repo}/releases/{tag}'
         self.ydl.write_debug(f'Fetching release info: {url}')
         return json.loads(self.ydl.urlopen(Request(url, headers={
             'Accept': 'application/vnd.github+json',
@@ -220,105 +275,175 @@ def _get_version_info(self, tag):
             'X-GitHub-Api-Version': '2022-11-28',
         })).read().decode())
 
-    @property
-    def current_version(self):
-        """Current version"""
-        return __version__
+    def _get_version_info(self, tag: str) -> tuple[str | None, str | None]:
+        if _VERSION_RE.fullmatch(tag):
+            return tag, None
 
-    @staticmethod
-    def _label(channel, tag):
-        """Label for a given channel and tag"""
-        return f'{channel}@{remove_start(tag, "tags/")}'
+        api_info = self._call_api(tag)
 
-    def _get_actual_tag(self, tag):
-        if tag.startswith('tags/'):
-            return tag[5:]
-        return self._get_version_info(tag)['tag_name']
+        if tag == 'latest':
+            requested_version = api_info['tag_name']
+        else:
+            match = re.search(rf'\s+(?P<version>{_VERSION_RE.pattern})$', api_info.get('name', ''))
+            requested_version = match.group('version') if match else None
 
-    @property
-    def new_version(self):
-        """Version of the latest release we can update to"""
-        return self._get_actual_tag(self._tag)
+        if re.fullmatch(_HASH_PATTERN, api_info.get('target_commitish', '')):
+            target_commitish = api_info['target_commitish']
+        else:
+            match = _COMMIT_RE.match(api_info.get('body', ''))
+            target_commitish = match.group('hash') if match else None
 
-    @property
-    def latest_version(self):
-        """Version of the target release"""
-        return self._get_actual_tag(self.target_tag)
+        if not (requested_version or target_commitish):
+            self._report_error('One of either version or commit hash must be available on the release', expected=True)
 
-    @property
-    def has_update(self):
-        """Whether there is an update available"""
-        return not self._version_compare(self.current_version, self.new_version)
+        return requested_version, target_commitish
 
-    @functools.cached_property
-    def filename(self):
-        """Filename of the executable"""
-        return compat_realpath(_get_variant_and_executable_path()[1])
+    def _download_update_spec(self, source_tags):
+        for tag in source_tags:
+            try:
+                return self._download_asset('_update_spec', tag=tag).decode()
+            except network_exceptions as error:
+                if isinstance(error, HTTPError) and error.status == 404:
+                    continue
+                self._report_network_error(f'fetch update spec: {error}')
 
-    def _download(self, name, tag):
-        slug = 'latest/download' if tag == 'latest' else f'download/{tag[5:]}'
-        url = f'https://github.com/{self._target_repo}/releases/{slug}/{name}'
-        self.ydl.write_debug(f'Downloading {name} from {url}')
-        return self.ydl.urlopen(url).read()
-
-    @functools.cached_property
-    def release_name(self):
-        """The release filename"""
-        return f'yt-dlp{_FILE_SUFFIXES[detect_variant()]}'
-
-    @functools.cached_property
-    def release_hash(self):
-        """Hash of the latest release"""
-        hash_data = dict(ln.split()[::-1] for ln in self._download('SHA2-256SUMS', self._tag).decode().splitlines())
-        return hash_data[self.release_name]
-
-    def _report_error(self, msg, expected=False):
-        self.ydl.report_error(msg, tb=False if expected else None)
-        self.ydl._download_retcode = 100
-
-    def _report_permission_error(self, file):
-        self._report_error(f'Unable to write to {file}; Try running as administrator', True)
-
-    def _report_network_error(self, action, delim=';'):
         self._report_error(
-            f'Unable to {action}{delim} visit '
-            f'https://github.com/{self._target_repo}/releases/{self.target_tag.replace("tags/", "tag/")}', True)
+            f'The requested tag {self.requested_tag} does not exist for {self.requested_repo}', True)
+        return None
+
+    def _process_update_spec(self, lockfile: str, resolved_tag: str):
+        lines = lockfile.splitlines()
+        is_version2 = any(line.startswith('lockV2 ') for line in lines)
+
+        for line in lines:
+            if is_version2:
+                if not line.startswith(f'lockV2 {self.requested_repo} '):
+                    continue
+                _, _, tag, pattern = line.split(' ', 3)
+            else:
+                if not line.startswith('lock '):
+                    continue
+                _, tag, pattern = line.split(' ', 2)
+
+            if re.match(pattern, self._identifier):
+                if _VERSION_RE.fullmatch(tag):
+                    if not self._exact:
+                        return tag
+                    elif self._version_compare(tag, resolved_tag):
+                        return resolved_tag
+                elif tag != resolved_tag:
+                    continue
+
+                self._report_error(
+                    f'yt-dlp cannot be updated to {resolved_tag} since you are on an older Python version', True)
+                return None
+
+        return resolved_tag
+
+    def _version_compare(self, a: str, b: str):
+        """
+        Compare two version strings
+
+        This function SHOULD NOT be called if self._exact == True
+        """
+        if _VERSION_RE.fullmatch(f'{a}.{b}'):
+            return version_tuple(a) >= version_tuple(b)
+        return a == b
+
+    def query_update(self, *, _output=False) -> UpdateInfo | None:
+        """Fetches and returns info about the available update"""
+        if not self.requested_repo:
+            self._report_error('No target repository could be determined from input')
+            return None
 
-    def check_update(self):
-        """Report whether there is an update available"""
-        if not self._target_repo:
-            return False
         try:
-            self.ydl.to_screen((
-                f'Available version: {self._label(self.target_channel, self.latest_version)}, ' if self.target_tag == 'latest' else ''
-            ) + f'Current version: {self._label(CHANNEL, self.current_version)}')
+            requested_version, target_commitish = self._get_version_info(self.requested_tag)
         except network_exceptions as e:
-            return self._report_network_error(f'obtain version info ({e})', delim='; Please try again later or')
+            self._report_network_error(f'obtain version info ({e})', delim='; Please try again later or')
+            return None
 
+        if self._exact and self._origin != self.requested_repo:
+            has_update = True
+        elif requested_version:
+            if self._exact:
+                has_update = self.current_version != requested_version
+            else:
+                has_update = not self._version_compare(self.current_version, requested_version)
+        elif target_commitish:
+            has_update = target_commitish != self.current_commit
+        else:
+            has_update = False
+
+        resolved_tag = requested_version if self.requested_tag == 'latest' else self.requested_tag
+        current_label = _make_label(self._origin, self._channel.partition('@')[2] or self.current_version, self.current_version)
+        requested_label = _make_label(self.requested_repo, resolved_tag, requested_version)
+        latest_or_requested = f'{"Latest" if self.requested_tag == "latest" else "Requested"} version: {requested_label}'
+        if not has_update:
+            if _output:
+                self.ydl.to_screen(f'{latest_or_requested}\nyt-dlp is up to date ({current_label})')
+            return None
+
+        update_spec = self._download_update_spec(('latest', None) if requested_version else (None,))
+        if not update_spec:
+            return None
+        # `result_` prefixed vars == post-_process_update_spec() values
+        result_tag = self._process_update_spec(update_spec, resolved_tag)
+        if not result_tag or result_tag == self.current_version:
+            return None
+        elif result_tag == resolved_tag:
+            result_version = requested_version
+        elif _VERSION_RE.fullmatch(result_tag):
+            result_version = result_tag
+        else:  # actual version being updated to is unknown
+            result_version = None
+
+        checksum = None
+        # Non-updateable variants can get update_info but need to skip checksum
         if not is_non_updateable():
-            self.ydl.to_screen(f'Current Build Hash: {_sha256_file(self.filename)}')
+            try:
+                hashes = self._download_asset('SHA2-256SUMS', result_tag)
+            except network_exceptions as error:
+                if not isinstance(error, HTTPError) or error.status != 404:
+                    self._report_network_error(f'fetch checksums: {error}')
+                    return None
+                self.ydl.report_warning('No hash information found for the release, skipping verification')
+            else:
+                for ln in hashes.decode().splitlines():
+                    if ln.endswith(_get_binary_name()):
+                        checksum = ln.split()[0]
+                        break
+                if not checksum:
+                    self.ydl.report_warning('The hash could not be found in the checksum file, skipping verification')
 
-        if self.has_update:
-            return True
+        if _output:
+            update_label = _make_label(self.requested_repo, result_tag, result_version)
+            self.ydl.to_screen(
+                f'Current version: {current_label}\n{latest_or_requested}'
+                + (f'\nUpgradable to: {update_label}' if update_label != requested_label else ''))
 
-        if self.target_tag == self._tag:
-            self.ydl.to_screen(f'yt-dlp is up to date ({self._label(CHANNEL, self.current_version)})')
-        elif not self._exact:
-            self.ydl.report_warning('yt-dlp cannot be updated any further since you are on an older Python version')
-        return False
+        return UpdateInfo(
+            tag=result_tag,
+            version=result_version,
+            requested_version=requested_version,
+            commit=target_commitish if result_tag == resolved_tag else None,
+            checksum=checksum)
 
-    def update(self):
+    def update(self, update_info=NO_DEFAULT):
         """Update yt-dlp executable to the latest version"""
-        if not self.check_update():
-            return
+        if update_info is NO_DEFAULT:
+            update_info = self.query_update(_output=True)
+        if not update_info:
+            return False
+
         err = is_non_updateable()
         if err:
-            return self._report_error(err, True)
-        self.ydl.to_screen(f'Updating to {self._label(self.target_channel, self.new_version)} ...')
-        if (_VERSION_RE.fullmatch(self.target_tag[5:])
-                and version_tuple(self.target_tag[5:]) < (2023, 3, 2)):
-            self.ydl.report_warning('You are downgrading to a version without --update-to')
-            self._block_restart('Cannot automatically restart to a version without --update-to')
+            self._report_error(err, True)
+            return False
+
+        self.ydl.to_screen(f'Current Build Hash: {_sha256_file(self.filename)}')
+
+        update_label = _make_label(self.requested_repo, update_info.tag, update_info.version)
+        self.ydl.to_screen(f'Updating to {update_label} ...')
 
         directory = os.path.dirname(self.filename)
         if not os.access(self.filename, os.W_OK):
@@ -337,20 +462,17 @@ def update(self):
             return self._report_error('Unable to remove the old version')
 
         try:
-            newcontent = self._download(self.release_name, self._tag)
+            newcontent = self._download_asset(update_info.binary_name, update_info.tag)
         except network_exceptions as e:
             if isinstance(e, HTTPError) and e.status == 404:
                 return self._report_error(
-                    f'The requested tag {self._label(self.target_channel, self.target_tag)} does not exist', True)
-            return self._report_network_error(f'fetch updates: {e}')
+                    f'The requested tag {self.requested_repo}@{update_info.tag} does not exist', True)
+            return self._report_network_error(f'fetch updates: {e}', tag=update_info.tag)
 
-        try:
-            expected_hash = self.release_hash
-        except Exception:
-            self.ydl.report_warning('no hash information found for the release')
-        else:
-            if hashlib.sha256(newcontent).hexdigest() != expected_hash:
-                return self._report_network_error('verify the new executable')
+        if not update_info.checksum:
+            self._block_restart('Automatically restarting into unverified builds is disabled for security reasons')
+        elif hashlib.sha256(newcontent).hexdigest() != update_info.checksum:
+            return self._report_network_error('verify the new executable', tag=update_info.tag)
 
         try:
             with open(new_filename, 'wb') as outf:
@@ -387,9 +509,14 @@ def update(self):
             return self._report_error(
                 f'Unable to set permissions. Run: sudo chmod a+rx {compat_shlex_quote(self.filename)}')
 
-        self.ydl.to_screen(f'Updated yt-dlp to {self._label(self.target_channel, self.new_version)}')
+        self.ydl.to_screen(f'Updated yt-dlp to {update_label}')
         return True
 
+    @functools.cached_property
+    def filename(self):
+        """Filename of the executable"""
+        return compat_realpath(_get_variant_and_executable_path()[1])
+
     @functools.cached_property
     def cmd(self):
         """The command-line to run the executable, if known"""
@@ -412,6 +539,71 @@ def wrapper():
             return self.ydl._download_retcode
         self.restart = wrapper
 
+    def _report_error(self, msg, expected=False):
+        self.ydl.report_error(msg, tb=False if expected else None)
+        self.ydl._download_retcode = 100
+
+    def _report_permission_error(self, file):
+        self._report_error(f'Unable to write to {file}; try running as administrator', True)
+
+    def _report_network_error(self, action, delim=';', tag=None):
+        if not tag:
+            tag = self.requested_tag
+        self._report_error(
+            f'Unable to {action}{delim} visit https://github.com/{self.requested_repo}/releases/'
+            + (tag if tag == 'latest' else f'tag/{tag}'), True)
+
+    # XXX: Everything below this line in this class is deprecated / for compat only
+    @property
+    def _target_tag(self):
+        """Deprecated; requested tag with 'tags/' prepended when necessary for API calls"""
+        return f'tags/{self.requested_tag}' if self.requested_tag != 'latest' else self.requested_tag
+
+    def _check_update(self):
+        """Deprecated; report whether there is an update available"""
+        return bool(self.query_update(_output=True))
+
+    def __getattr__(self, attribute: str):
+        """Compat getter function for deprecated attributes"""
+        deprecated_props_map = {
+            'check_update': '_check_update',
+            'target_tag': '_target_tag',
+            'target_channel': 'requested_channel',
+        }
+        update_info_props_map = {
+            'has_update': '_has_update',
+            'new_version': 'version',
+            'latest_version': 'requested_version',
+            'release_name': 'binary_name',
+            'release_hash': 'checksum',
+        }
+
+        if attribute not in deprecated_props_map and attribute not in update_info_props_map:
+            raise AttributeError(f'{type(self).__name__!r} object has no attribute {attribute!r}')
+
+        msg = f'{type(self).__name__}.{attribute} is deprecated and will be removed in a future version'
+        if attribute in deprecated_props_map:
+            source_name = deprecated_props_map[attribute]
+            if not source_name.startswith('_'):
+                msg += f'. Please use {source_name!r} instead'
+            source = self
+            mapping = deprecated_props_map
+
+        else:  # attribute in update_info_props_map
+            msg += '. Please call query_update() instead'
+            source = self.query_update()
+            if source is None:
+                source = UpdateInfo('', None, None, None)
+                source._has_update = False
+            mapping = update_info_props_map
+
+        deprecation_warning(msg)
+        for target_name, source_name in mapping.items():
+            value = getattr(source, source_name)
+            setattr(self, target_name, value)
+
+        return getattr(self, attribute)
+
 
 def run_update(ydl):
     """Update the program file with the latest version from the repository
@@ -420,45 +612,4 @@ def run_update(ydl):
     return Updater(ydl).update()
 
 
-# Deprecated
-def update_self(to_screen, verbose, opener):
-    import traceback
-
-    deprecation_warning(f'"{__name__}.update_self" is deprecated and may be removed '
-                        f'in a future version. Use "{__name__}.run_update(ydl)" instead')
-
-    printfn = to_screen
-
-    class FakeYDL():
-        to_screen = printfn
-
-        def report_warning(self, msg, *args, **kwargs):
-            return printfn(f'WARNING: {msg}', *args, **kwargs)
-
-        def report_error(self, msg, tb=None):
-            printfn(f'ERROR: {msg}')
-            if not verbose:
-                return
-            if tb is None:
-                # Copied from YoutubeDL.trouble
-                if sys.exc_info()[0]:
-                    tb = ''
-                    if hasattr(sys.exc_info()[1], 'exc_info') and sys.exc_info()[1].exc_info[0]:
-                        tb += ''.join(traceback.format_exception(*sys.exc_info()[1].exc_info))
-                    tb += traceback.format_exc()
-                else:
-                    tb_data = traceback.format_list(traceback.extract_stack())
-                    tb = ''.join(tb_data)
-            if tb:
-                printfn(tb)
-
-        def write_debug(self, msg, *args, **kwargs):
-            printfn(f'[debug] {msg}', *args, **kwargs)
-
-        def urlopen(self, url):
-            return opener.open(url)
-
-    return run_update(FakeYDL())
-
-
 __all__ = ['Updater']
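The `Updater.__init__` rewrite above folds channel, tag and repository parsing into a single `rpartition('@')` pass. As a standalone illustration (a minimal sketch, not yt-dlp API; `parse_target` is a hypothetical helper and `UPDATE_SOURCES` here is a stand-in for the real mapping):

    # Sketch of the CHANNEL@TAG parsing performed in Updater.__init__ above
    UPDATE_SOURCES = {'stable': 'yt-dlp/yt-dlp', 'nightly': 'yt-dlp/yt-dlp-nightly-builds'}

    def parse_target(target: str, default_channel: str = 'stable') -> tuple[str, str]:
        channel, sep, tag = target.rpartition('@')
        if not sep and ('/' in tag or tag in UPDATE_SOURCES):
            # A bare "nightly" or "owner/repo" names a channel/repo, not a tag
            channel, tag = tag, ''
        elif not channel:
            channel = default_channel
        return channel, tag or 'latest'

    assert parse_target('nightly') == ('nightly', 'latest')
    assert parse_target('2023.12.30') == ('stable', '2023.12.30')
    assert parse_target('stable@2023.12.30') == ('stable', '2023.12.30')
    assert parse_target('example/fork@master') == ('example/fork', 'master')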
Use "{__name__}.run_update(ydl)" instead') - - printfn = to_screen - - class FakeYDL(): - to_screen = printfn - - def report_warning(self, msg, *args, **kwargs): - return printfn(f'WARNING: {msg}', *args, **kwargs) - - def report_error(self, msg, tb=None): - printfn(f'ERROR: {msg}') - if not verbose: - return - if tb is None: - # Copied from YoutubeDL.trouble - if sys.exc_info()[0]: - tb = '' - if hasattr(sys.exc_info()[1], 'exc_info') and sys.exc_info()[1].exc_info[0]: - tb += ''.join(traceback.format_exception(*sys.exc_info()[1].exc_info)) - tb += traceback.format_exc() - else: - tb_data = traceback.format_list(traceback.extract_stack()) - tb = ''.join(tb_data) - if tb: - printfn(tb) - - def write_debug(self, msg, *args, **kwargs): - printfn(f'[debug] {msg}', *args, **kwargs) - - def urlopen(self, url): - return opener.open(url) - - return run_update(FakeYDL()) - - __all__ = ['Updater'] diff --git a/yt_dlp/utils/_legacy.py b/yt_dlp/utils/_legacy.py index dde02092c..aa9f46d20 100644 --- a/yt_dlp/utils/_legacy.py +++ b/yt_dlp/utils/_legacy.py @@ -1,4 +1,6 @@ """No longer used and new code should not use. Exists only for API compat.""" +import asyncio +import atexit import platform import struct import sys @@ -32,6 +34,77 @@ has_websockets = bool(websockets) +class WebSocketsWrapper: + """Wraps websockets module to use in non-async scopes""" + pool = None + + def __init__(self, url, headers=None, connect=True, **ws_kwargs): + self.loop = asyncio.new_event_loop() + # XXX: "loop" is deprecated + self.conn = websockets.connect( + url, extra_headers=headers, ping_interval=None, + close_timeout=float('inf'), loop=self.loop, ping_timeout=float('inf'), **ws_kwargs) + if connect: + self.__enter__() + atexit.register(self.__exit__, None, None, None) + + def __enter__(self): + if not self.pool: + self.pool = self.run_with_loop(self.conn.__aenter__(), self.loop) + return self + + def send(self, *args): + self.run_with_loop(self.pool.send(*args), self.loop) + + def recv(self, *args): + return self.run_with_loop(self.pool.recv(*args), self.loop) + + def __exit__(self, type, value, traceback): + try: + return self.run_with_loop(self.conn.__aexit__(type, value, traceback), self.loop) + finally: + self.loop.close() + self._cancel_all_tasks(self.loop) + + # taken from https://github.com/python/cpython/blob/3.9/Lib/asyncio/runners.py with modifications + # for contributors: If there's any new library using asyncio needs to be run in non-async, move these function out of this class + @staticmethod + def run_with_loop(main, loop): + if not asyncio.iscoroutine(main): + raise ValueError(f'a coroutine was expected, got {main!r}') + + try: + return loop.run_until_complete(main) + finally: + loop.run_until_complete(loop.shutdown_asyncgens()) + if hasattr(loop, 'shutdown_default_executor'): + loop.run_until_complete(loop.shutdown_default_executor()) + + @staticmethod + def _cancel_all_tasks(loop): + to_cancel = asyncio.all_tasks(loop) + + if not to_cancel: + return + + for task in to_cancel: + task.cancel() + + # XXX: "loop" is removed in python 3.10+ + loop.run_until_complete( + asyncio.gather(*to_cancel, loop=loop, return_exceptions=True)) + + for task in to_cancel: + if task.cancelled(): + continue + if task.exception() is not None: + loop.call_exception_handler({ + 'message': 'unhandled exception during asyncio.run() shutdown', + 'exception': task.exception(), + 'task': task, + }) + + def load_plugins(name, suffix, namespace): from ..plugins import load_plugins ret = load_plugins(name, suffix) diff --git 
diff --git a/yt_dlp/utils/_utils.py b/yt_dlp/utils/_utils.py
index b441fc273..1b910fdf0 100644
--- a/yt_dlp/utils/_utils.py
+++ b/yt_dlp/utils/_utils.py
@@ -1,5 +1,3 @@
-import asyncio
-import atexit
 import base64
 import binascii
 import calendar
@@ -54,7 +52,7 @@
     compat_os_name,
     compat_shlex_quote,
 )
-from ..dependencies import websockets, xattr
+from ..dependencies import xattr
 
 __name__ = __name__.rsplit('.', 1)[0]  # Pretend to be the parent module
 
@@ -560,7 +558,7 @@ def decode(self, s):
                 s = self._close_object(e)
                 if s is not None:
                     continue
-            raise type(e)(f'{e.msg} in {s[e.pos-10:e.pos+10]!r}', s, e.pos)
+            raise type(e)(f'{e.msg} in {s[e.pos - 10:e.pos + 10]!r}', s, e.pos)
         assert False, 'Too many attempts to decode JSON'
 
 
@@ -638,7 +636,7 @@ def replace_insane(char):
         elif char in '\\/|*<>':
             return '\0_'
         if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace() or ord(char) > 127):
-            return '\0_'
+            return '' if unicodedata.category(char)[0] in 'CM' else '\0_'
         return char
 
     # Replace look-alike Unicode glyphs
@@ -1887,6 +1885,7 @@ def setproctitle(title):
     buf = ctypes.create_string_buffer(len(title_bytes))
     buf.value = title_bytes
     try:
+        # PR_SET_NAME = 15      Ref: /usr/include/linux/prctl.h
         libc.prctl(15, buf, 0, 0, 0)
     except AttributeError:
         return  # Strange libc, just skip this
@@ -2267,6 +2266,9 @@ def __getitem__(self, idx):
             raise self.IndexError()
         return entries[0]
 
+    def __bool__(self):
+        return bool(self.getslice(0, 1))
+
 
 class OnDemandPagedList(PagedList):
     """Download pages until a page with less than maximum results"""
@@ -4446,10 +4448,12 @@ def write_xattr(path, key, value):
             raise XAttrMetadataError(e.errno, e.strerror)
         return
 
-    # UNIX Method 1. Use xattrs/pyxattrs modules
+    # UNIX Method 1. Use os.setxattr/xattrs/pyxattrs modules
     setxattr = None
-    if getattr(xattr, '_yt_dlp__identifier', None) == 'pyxattr':
+    if callable(getattr(os, 'setxattr', None)):
+        setxattr = os.setxattr
+    elif getattr(xattr, '_yt_dlp__identifier', None) == 'pyxattr':
         # Unicode arguments are not supported in pyxattr until version 0.5.0
         # See https://github.com/ytdl-org/youtube-dl/issues/5498
         if version_tuple(xattr.__version__) >= (0, 5, 0):
@@ -4794,8 +4798,9 @@ def parse_http_range(range):
 
 
 def read_stdin(what):
-    eof = 'Ctrl+Z' if compat_os_name == 'nt' else 'Ctrl+D'
-    write_string(f'Reading {what} from STDIN - EOF ({eof}) to end:\n')
+    if what:
+        eof = 'Ctrl+Z' if compat_os_name == 'nt' else 'Ctrl+D'
+        write_string(f'Reading {what} from STDIN - EOF ({eof}) to end:\n')
     return sys.stdin
 
 
@@ -4926,77 +4931,6 @@ def parse_args(self):
         return self.parser.parse_args(self.all_args)
 
 
-class WebSocketsWrapper:
-    """Wraps websockets module to use in non-async scopes"""
-    pool = None
-
-    def __init__(self, url, headers=None, connect=True):
-        self.loop = asyncio.new_event_loop()
-        # XXX: "loop" is deprecated
-        self.conn = websockets.connect(
-            url, extra_headers=headers, ping_interval=None,
-            close_timeout=float('inf'), loop=self.loop, ping_timeout=float('inf'))
-        if connect:
-            self.__enter__()
-        atexit.register(self.__exit__, None, None, None)
-
-    def __enter__(self):
-        if not self.pool:
-            self.pool = self.run_with_loop(self.conn.__aenter__(), self.loop)
-        return self
-
-    def send(self, *args):
-        self.run_with_loop(self.pool.send(*args), self.loop)
-
-    def recv(self, *args):
-        return self.run_with_loop(self.pool.recv(*args), self.loop)
-
-    def __exit__(self, type, value, traceback):
-        try:
-            return self.run_with_loop(self.conn.__aexit__(type, value, traceback), self.loop)
-        finally:
-            self.loop.close()
-            self._cancel_all_tasks(self.loop)
-
-    # taken from https://github.com/python/cpython/blob/3.9/Lib/asyncio/runners.py with modifications
-    # for contributors: If there's any new library using asyncio needs to be run in non-async, move these function out of this class
-    @staticmethod
-    def run_with_loop(main, loop):
-        if not asyncio.iscoroutine(main):
-            raise ValueError(f'a coroutine was expected, got {main!r}')
-
-        try:
-            return loop.run_until_complete(main)
-        finally:
-            loop.run_until_complete(loop.shutdown_asyncgens())
-            if hasattr(loop, 'shutdown_default_executor'):
-                loop.run_until_complete(loop.shutdown_default_executor())
-
-    @staticmethod
-    def _cancel_all_tasks(loop):
-        to_cancel = asyncio.all_tasks(loop)
-
-        if not to_cancel:
-            return
-
-        for task in to_cancel:
-            task.cancel()
-
-        # XXX: "loop" is removed in python 3.10+
-        loop.run_until_complete(
-            asyncio.gather(*to_cancel, loop=loop, return_exceptions=True))
-
-        for task in to_cancel:
-            if task.cancelled():
-                continue
-            if task.exception() is not None:
-                loop.call_exception_handler({
-                    'message': 'unhandled exception during asyncio.run() shutdown',
-                    'exception': task.exception(),
-                    'task': task,
-                })
-
-
 def merge_headers(*dicts):
     """Merge dicts of http headers case insensitively, prioritizing the latter ones"""
     return {k.title(): v for k, v in itertools.chain.from_iterable(map(dict.items, dicts))}
@@ -5145,7 +5079,7 @@ def truncate_string(s, left, right=0):
     assert left > 3 and right >= 0
     if s is None or len(s) <= left + right:
         return s
-    return f'{s[:left-3]}...{s[-right:] if right else ""}'
+    return f'{s[:left - 3]}...{s[-right:] if right else ""}'
 
 
 def orderedSet_from_options(options, alias_dict, *, use_regex=False, start=None):
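The `replace_insane` change above makes restricted-mode filename sanitization drop control and combining characters (Unicode categories starting with 'C' or 'M') entirely, instead of replacing them with an underscore. Roughly, as a standalone sketch outside yt-dlp:

    import unicodedata

    def restricted_char(char: str) -> str:
        # Mirrors the new branch: drop control/combining marks, replace other non-ASCII
        if ord(char) > 127 or char.isspace():
            return '' if unicodedata.category(char)[0] in 'CM' else '_'
        return char

    # Combining acute accent (Mn) and zero-width space (Cf) vanish instead of becoming '_'
    print(''.join(map(restricted_char, 'a\u0301b\u200bc')))  # -> 'abc'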
diff --git a/yt_dlp/utils/networking.py b/yt_dlp/utils/networking.py
index ba0493cc2..4b73252cb 100644
--- a/yt_dlp/utils/networking.py
+++ b/yt_dlp/utils/networking.py
@@ -67,7 +67,7 @@ def __init__(self, *args, **kwargs):
     def __setitem__(self, key, value):
         if isinstance(value, bytes):
             value = value.decode('latin-1')
-        super().__setitem__(key.title(), str(value))
+        super().__setitem__(key.title(), str(value).strip())
 
     def __getitem__(self, key):
         return super().__getitem__(key.title())
@@ -123,6 +123,7 @@ def clean_headers(headers: HTTPHeaderDict):
     if 'Youtubedl-No-Compression' in headers:  # compat
         del headers['Youtubedl-No-Compression']
         headers['Accept-Encoding'] = 'identity'
+    headers.pop('Ytdl-socks-proxy', None)
 
 
 def remove_dot_segments(path):
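With the `__setitem__` tweak above, header values are stripped of surrounding whitespace on assignment, so stray spaces or newlines cannot leak into outgoing requests. A toy model of the normalization (not the real `HTTPHeaderDict`):

    class NormalizedHeaders(dict):
        # Title-cased keys, stripped string values, as in the diff above
        def __setitem__(self, key, value):
            if isinstance(value, bytes):
                value = value.decode('latin-1')
            super().__setitem__(key.title(), str(value).strip())

        def __getitem__(self, key):
            return super().__getitem__(key.title())

    headers = NormalizedHeaders()
    headers['user-agent'] = ' yt-dlp/2023.12.30\n'
    print(headers['USER-AGENT'])  # -> 'yt-dlp/2023.12.30'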
diff --git a/yt_dlp/utils/traversal.py b/yt_dlp/utils/traversal.py
index 462c3ba5d..8938f4c78 100644
--- a/yt_dlp/utils/traversal.py
+++ b/yt_dlp/utils/traversal.py
@@ -3,12 +3,13 @@
 import inspect
 import itertools
 import re
+import xml.etree.ElementTree
 
 from ._utils import (
     IDENTITY,
     NO_DEFAULT,
     LazyList,
-    int_or_none,
+    deprecation_warning,
     is_iterable_like,
     try_call,
     variadic,
@@ -17,13 +18,13 @@
 
 def traverse_obj(
         obj, *paths, default=NO_DEFAULT, expected_type=None, get_all=True,
-        casesense=True, is_user_input=False, traverse_string=False):
+        casesense=True, is_user_input=NO_DEFAULT, traverse_string=False):
     """
     Safely traverse nested `dict`s and `Iterable`s
 
     >>> obj = [{}, {"key": "value"}]
     >>> traverse_obj(obj, (1, "key"))
-    "value"
+    'value'
 
     Each of the provided `paths` is tested and the first producing a valid result will be returned.
     The next path will also be tested if the path branched but no results could be found.
@@ -63,10 +64,8 @@ def traverse_obj(
     @param get_all          If `False`, return the first matching result, otherwise all matching ones.
     @param casesense        If `False`, consider string dictionary keys as case insensitive.
 
-    The following are only meant to be used by YoutubeDL.prepare_outtmpl and are not part of the API
+    `traverse_string` is only meant to be used by YoutubeDL.prepare_outtmpl and is not part of the API
 
-    @param is_user_input    Whether the keys are generated from user input.
-                            If `True` strings get converted to `int`/`slice` if needed.
     @param traverse_string  Whether to traverse into objects as strings.
                             If `True`, any non-compatible object will first be
                             converted into a string and then traversed into.
@@ -80,6 +79,9 @@ def traverse_obj(
     If no `default` is given and the last path branches, a `list` of results
     is always returned. If a path ends on a `dict` that result will always be a `dict`.
     """
+    if is_user_input is not NO_DEFAULT:
+        deprecation_warning('The is_user_input parameter is deprecated and no longer works')
+
     casefold = lambda k: k.casefold() if isinstance(k, str) else k
 
     if isinstance(expected_type, type):
@@ -117,7 +119,7 @@ def apply_key(key, obj, is_last):
                 branching = True
                 if isinstance(obj, collections.abc.Mapping):
                     result = obj.values()
-                elif is_iterable_like(obj):
+                elif is_iterable_like(obj) or isinstance(obj, xml.etree.ElementTree.Element):
                     result = obj
                 elif isinstance(obj, re.Match):
                     result = obj.groups()
@@ -131,7 +133,7 @@ def apply_key(key, obj, is_last):
                 branching = True
                 if isinstance(obj, collections.abc.Mapping):
                     iter_obj = obj.items()
-                elif is_iterable_like(obj):
+                elif is_iterable_like(obj) or isinstance(obj, xml.etree.ElementTree.Element):
                     iter_obj = enumerate(obj)
                 elif isinstance(obj, re.Match):
                     iter_obj = itertools.chain(
@@ -167,7 +169,7 @@ def apply_key(key, obj, is_last):
                 result = next((v for k, v in obj.groupdict().items() if casefold(k) == key), None)
 
             elif isinstance(key, (int, slice)):
-                if is_iterable_like(obj, collections.abc.Sequence):
+                if is_iterable_like(obj, (collections.abc.Sequence, xml.etree.ElementTree.Element)):
                     branching = isinstance(key, slice)
                     with contextlib.suppress(IndexError):
                         result = obj[key]
@@ -175,6 +177,34 @@ def apply_key(key, obj, is_last):
                     with contextlib.suppress(IndexError):
                         result = str(obj)[key]
 
+            elif isinstance(obj, xml.etree.ElementTree.Element) and isinstance(key, str):
+                xpath, _, special = key.rpartition('/')
+                if not special.startswith('@') and special != 'text()':
+                    xpath = key
+                    special = None
+
+                # Allow abbreviations of relative paths, absolute paths error
+                if xpath.startswith('/'):
+                    xpath = f'.{xpath}'
+                elif xpath and not xpath.startswith('./'):
+                    xpath = f'./{xpath}'
+
+                def apply_specials(element):
+                    if special is None:
+                        return element
+                    if special == '@':
+                        return element.attrib
+                    if special.startswith('@'):
+                        return try_call(element.attrib.get, args=(special[1:],))
+                    if special == 'text()':
+                        return element.text
+                    assert False, f'apply_specials is missing case for {special!r}'
+
+                if xpath:
+                    result = list(map(apply_specials, obj.iterfind(xpath)))
+                else:
+                    result = apply_specials(obj)
+
             return branching, result if branching else (result,)
 
         def lazy_last(iterable):
@@ -195,14 +225,6 @@ def apply_path(start_obj, path, test_type):
         key = None
         for last, key in lazy_last(variadic(path, (str, bytes, dict, set))):
-            if is_user_input and isinstance(key, str):
-                if key == ':':
-                    key = ...
-                elif ':' in key:
-                    key = slice(*map(int_or_none, key.split(':')))
-                elif int_or_none(key) is not None:
-                    key = int(key)
-
             if not casesense and isinstance(key, str):
                 key = key.casefold()
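With the additions above, `traverse_obj` can walk `xml.etree.ElementTree.Element` objects, where a trailing `@attr` or `text()` in a string key selects attributes or text. The underlying ElementTree calls, shown directly as a sketch (without `traverse_obj` itself):

    import xml.etree.ElementTree

    root = xml.etree.ElementTree.fromstring(
        '<playlist><video id="a1">First</video><video id="a2">Second</video></playlist>')

    # Equivalent of a 'video/@id' path: iterfind() the xpath, then read the attribute
    print([el.attrib.get('id') for el in root.iterfind('./video')])  # -> ['a1', 'a2']
    # Equivalent of a 'video/text()' path: element text
    print([el.text for el in root.iterfind('./video')])              # -> ['First', 'Second']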
diff --git a/yt_dlp/version.py b/yt_dlp/version.py
index 60c1c94cc..687ef8788 100644
--- a/yt_dlp/version.py
+++ b/yt_dlp/version.py
@@ -1,11 +1,15 @@
 # Autogenerated by devscripts/update-version.py
 
-__version__ = '2023.10.07'
+__version__ = '2023.12.30'
 
-RELEASE_GIT_HEAD = '377e85a1797db9e98b78b38203ed9d4ded229991'
+RELEASE_GIT_HEAD = 'f10589e3453009bb523f55849bba144c9b91cf2a'
 
 VARIANT = None
 
 UPDATE_HINT = None
 
 CHANNEL = 'stable'
+
+ORIGIN = 'yt-dlp/yt-dlp'
+
+_pkg_version = '2023.12.30'
diff --git a/yt_dlp/webvtt.py b/yt_dlp/webvtt.py
index dd7298277..c80c58631 100644
--- a/yt_dlp/webvtt.py
+++ b/yt_dlp/webvtt.py
@@ -95,6 +95,7 @@ def __init__(self, parser):
 _REGEX_EOF = re.compile(r'\Z')
 _REGEX_NL = re.compile(r'(?:\r\n|[\r\n]|$)')
 _REGEX_BLANK = re.compile(r'(?:\r\n|[\r\n])+')
+_REGEX_OPTIONAL_WHITESPACE = re.compile(r'[ \t]*')
 
 
 def _parse_ts(ts):
@@ -286,6 +287,7 @@ def parse(cls, parser):
         if not m1:
             return None
         m2 = parser.consume(cls._REGEX_SETTINGS)
+        parser.consume(_REGEX_OPTIONAL_WHITESPACE)
         if not parser.consume(_REGEX_NL):
            return None
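The `_REGEX_OPTIONAL_WHITESPACE` consumption above lets the WebVTT cue parser tolerate trailing spaces or tabs before the newline; since `[ \t]*` also matches the empty string, the consume step never fails. For instance:

    import re

    _REGEX_OPTIONAL_WHITESPACE = re.compile(r'[ \t]*')

    line = '00:00:01.000 --> 00:00:02.000 \t'
    match = _REGEX_OPTIONAL_WHITESPACE.match(line, 29)  # position right after the timings
    print(repr(match.group()))  # -> ' \t' (and '' when there is no trailing whitespace)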