Merge remote-tracking branch 'origin/master' into yt-live-from-start-range

This commit is contained in:
Elyse 2023-05-07 00:33:18 -06:00
commit 444e02ef3b
74 changed files with 5099 additions and 1935 deletions

View file

@ -192,7 +192,7 @@ jobs:
- name: Install Requirements
run: |
brew install coreutils
/usr/bin/python3 -m pip install -U --user pip Pyinstaller -r requirements.txt
/usr/bin/python3 -m pip install -U --user pip Pyinstaller==5.8 -r requirements.txt
- name: Prepare
run: |

View file

@ -463,15 +463,11 @@ ## Geo-restriction:
specified by --proxy (or none, if the option
is not present) is used for the actual
downloading
--geo-bypass Bypass geographic restriction via faking
X-Forwarded-For HTTP header (default)
--no-geo-bypass Do not bypass geographic restriction via
faking X-Forwarded-For HTTP header
--geo-bypass-country CODE Force bypass geographic restriction with
explicitly provided two-letter ISO 3166-2
country code
--geo-bypass-ip-block IP_BLOCK Force bypass geographic restriction with
explicitly provided IP block in CIDR notation
--xff VALUE How to fake X-Forwarded-For HTTP header to
try bypassing geographic restriction. One of
"default" (Only when known to be useful),
"never", a two-letter ISO 3166-1 alpha-2
country code, or an IP block in CIDR notation
## Video Selection:
-I, --playlist-items ITEM_SPEC Comma separated playlist_index of the items
@ -752,6 +748,7 @@ ## Internet Shortcut Options:
## Verbosity and Simulation Options:
-q, --quiet Activate quiet mode. If used with --verbose,
print the log to stderr
--no-quiet Deactivate quiet mode. (Default)
--no-warnings Ignore warnings
-s, --simulate Do not download the video and do not write
anything to disk
@ -1246,7 +1243,7 @@ # OUTPUT TEMPLATE
1. **Alternatives**: Alternate fields can be specified separated with a `,`. E.g. `%(release_date>%Y,upload_date>%Y|Unknown)s`
1. **Replacement**: A replacement value can be specified using a `&` separator. If the field is *not* empty, this replacement value will be used instead of the actual field content. This is done after alternate fields are considered; thus the replacement is used if *any* of the alternative fields is *not* empty.
1. **Replacement**: A replacement value can be specified using a `&` separator according to the [`str.format` mini-language](https://docs.python.org/3/library/string.html#format-specification-mini-language). If the field is *not* empty, this replacement value will be used instead of the actual field content. This is done after alternate fields are considered; thus the replacement is used if *any* of the alternative fields is *not* empty. E.g. `%(chapters&has chapters|no chapters)s`, `%(title&TITLE={:>20}|NO TITLE)s`
1. **Default**: A literal default value can be specified for when the field is empty using a `|` separator. This overrides `--output-na-placeholder`. E.g. `%(uploader|Unknown)s`
@ -1797,7 +1794,10 @@ #### youtubetab (YouTube playlists, channels, feeds, etc.)
* `approximate_date`: Extract approximate `upload_date` and `timestamp` in flat-playlist. This may cause date-based filters to be slightly off
#### generic
* `fragment_query`: Passthrough any query in mpd/m3u8 manifest URLs to their fragments. Does not apply to ffmpeg
* `fragment_query`: Passthrough any query in mpd/m3u8 manifest URLs to their fragments if no value is provided, or else apply the query string given as `fragment_query=VALUE`. Does not apply to ffmpeg
* `variant_query`: Passthrough the master m3u8 URL query to its variant playlist URLs if no value is provided, or else apply the query string given as `variant_query=VALUE`
* `hls_key`: An HLS AES-128 key URI *or* key (as hex), and optionally the IV (as hex), in the form of `(URI|KEY)[,IV]`; e.g. `generic:hls_key=ABCDEF1234567980,0xFEDCBA0987654321`. Passing any of these values will force usage of the native HLS downloader and override the corresponding values found in the m3u8 playlist
* `is_live`: Bypass live HLS detection and manually set `live_status` - a value of `false` will set `not_live`, any other value (or no value) will set `is_live`
#### funimation
* `language`: Audio languages to extract, e.g. `funimation:language=english,japanese`
@ -1833,7 +1833,7 @@ #### rokfinchannel
* `tab`: Which tab to download - one of `new`, `top`, `videos`, `podcasts`, `streams`, `stacks`
#### twitter
* `force_graphql`: Force usage of the GraphQL API. By default it will only be used if login cookies are provided
* `legacy_api`: Force usage of the legacy Twitter API instead of the GraphQL API for tweet extraction. Has no effect if login cookies are passed
**Note**: These options may be changed/removed in the future without concern for backward compatibility
@ -2164,6 +2164,10 @@ #### Not recommended
--youtube-skip-hls-manifest --extractor-args "youtube:skip=hls" (Alias: --no-youtube-include-hls-manifest)
--youtube-include-dash-manifest Default (Alias: --no-youtube-skip-dash-manifest)
--youtube-include-hls-manifest Default (Alias: --no-youtube-skip-hls-manifest)
--geo-bypass --xff "default"
--no-geo-bypass --xff "never"
--geo-bypass-country CODE --xff CODE
--geo-bypass-ip-block IP_BLOCK --xff IP_BLOCK
#### Developer options

View file

@ -1406,6 +1406,7 @@ def test_parse_ism_formats(self):
'vcodec': 'none',
'acodec': 'AACL',
'protocol': 'ism',
'audio_channels': 2,
'_download_params': {
'stream_type': 'audio',
'duration': 8880746666,
@ -1419,9 +1420,6 @@ def test_parse_ism_formats(self):
'bits_per_sample': 16,
'nal_unit_length_field': 4
},
'audio_ext': 'isma',
'video_ext': 'none',
'abr': 128,
}, {
'format_id': 'video-100',
'url': 'https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/Manifest',
@ -1445,9 +1443,6 @@ def test_parse_ism_formats(self):
'bits_per_sample': 16,
'nal_unit_length_field': 4
},
'video_ext': 'ismv',
'audio_ext': 'none',
'vbr': 100,
}, {
'format_id': 'video-326',
'url': 'https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/Manifest',
@ -1471,9 +1466,6 @@ def test_parse_ism_formats(self):
'bits_per_sample': 16,
'nal_unit_length_field': 4
},
'video_ext': 'ismv',
'audio_ext': 'none',
'vbr': 326,
}, {
'format_id': 'video-698',
'url': 'https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/Manifest',
@ -1497,9 +1489,6 @@ def test_parse_ism_formats(self):
'bits_per_sample': 16,
'nal_unit_length_field': 4
},
'video_ext': 'ismv',
'audio_ext': 'none',
'vbr': 698,
}, {
'format_id': 'video-1493',
'url': 'https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/Manifest',
@ -1523,9 +1512,6 @@ def test_parse_ism_formats(self):
'bits_per_sample': 16,
'nal_unit_length_field': 4
},
'video_ext': 'ismv',
'audio_ext': 'none',
'vbr': 1493,
}, {
'format_id': 'video-4482',
'url': 'https://sdn-global-streaming-cache-3qsdn.akamaized.net/stream/3144/files/17/07/672975/3144-kZT4LWMQw6Rh7Kpd.ism/Manifest',
@ -1549,9 +1535,6 @@ def test_parse_ism_formats(self):
'bits_per_sample': 16,
'nal_unit_length_field': 4
},
'video_ext': 'ismv',
'audio_ext': 'none',
'vbr': 4482,
}],
{
'eng': [
@ -1575,34 +1558,6 @@ def test_parse_ism_formats(self):
'ec-3_test',
'https://smstr01.dmm.t-online.de/smooth24/smoothstream_m1/streaming/sony/9221438342941275747/636887760842957027/25_km_h-Trailer-9221571562372022953_deu_20_1300k_HD_H_264_ISMV.ism/Manifest',
[{
'format_id': 'audio_deu_1-224',
'url': 'https://smstr01.dmm.t-online.de/smooth24/smoothstream_m1/streaming/sony/9221438342941275747/636887760842957027/25_km_h-Trailer-9221571562372022953_deu_20_1300k_HD_H_264_ISMV.ism/Manifest',
'manifest_url': 'https://smstr01.dmm.t-online.de/smooth24/smoothstream_m1/streaming/sony/9221438342941275747/636887760842957027/25_km_h-Trailer-9221571562372022953_deu_20_1300k_HD_H_264_ISMV.ism/Manifest',
'ext': 'isma',
'tbr': 224,
'asr': 48000,
'vcodec': 'none',
'acodec': 'EC-3',
'protocol': 'ism',
'_download_params':
{
'stream_type': 'audio',
'duration': 370000000,
'timescale': 10000000,
'width': 0,
'height': 0,
'fourcc': 'EC-3',
'language': 'deu',
'codec_private_data': '00063F000000AF87FBA7022DFB42A4D405CD93843BDD0700200F00',
'sampling_rate': 48000,
'channels': 6,
'bits_per_sample': 16,
'nal_unit_length_field': 4
},
'audio_ext': 'isma',
'video_ext': 'none',
'abr': 224,
}, {
'format_id': 'audio_deu-127',
'url': 'https://smstr01.dmm.t-online.de/smooth24/smoothstream_m1/streaming/sony/9221438342941275747/636887760842957027/25_km_h-Trailer-9221571562372022953_deu_20_1300k_HD_H_264_ISMV.ism/Manifest',
'manifest_url': 'https://smstr01.dmm.t-online.de/smooth24/smoothstream_m1/streaming/sony/9221438342941275747/636887760842957027/25_km_h-Trailer-9221571562372022953_deu_20_1300k_HD_H_264_ISMV.ism/Manifest',
@ -1612,8 +1567,9 @@ def test_parse_ism_formats(self):
'vcodec': 'none',
'acodec': 'AACL',
'protocol': 'ism',
'_download_params':
{
'language': 'deu',
'audio_channels': 2,
'_download_params': {
'stream_type': 'audio',
'duration': 370000000,
'timescale': 10000000,
@ -1627,9 +1583,32 @@ def test_parse_ism_formats(self):
'bits_per_sample': 16,
'nal_unit_length_field': 4
},
'audio_ext': 'isma',
'video_ext': 'none',
'abr': 127,
}, {
'format_id': 'audio_deu_1-224',
'url': 'https://smstr01.dmm.t-online.de/smooth24/smoothstream_m1/streaming/sony/9221438342941275747/636887760842957027/25_km_h-Trailer-9221571562372022953_deu_20_1300k_HD_H_264_ISMV.ism/Manifest',
'manifest_url': 'https://smstr01.dmm.t-online.de/smooth24/smoothstream_m1/streaming/sony/9221438342941275747/636887760842957027/25_km_h-Trailer-9221571562372022953_deu_20_1300k_HD_H_264_ISMV.ism/Manifest',
'ext': 'isma',
'tbr': 224,
'asr': 48000,
'vcodec': 'none',
'acodec': 'EC-3',
'protocol': 'ism',
'language': 'deu',
'audio_channels': 6,
'_download_params': {
'stream_type': 'audio',
'duration': 370000000,
'timescale': 10000000,
'width': 0,
'height': 0,
'fourcc': 'EC-3',
'language': 'deu',
'codec_private_data': '00063F000000AF87FBA7022DFB42A4D405CD93843BDD0700200F00',
'sampling_rate': 48000,
'channels': 6,
'bits_per_sample': 16,
'nal_unit_length_field': 4
},
}, {
'format_id': 'video_deu-23',
'url': 'https://smstr01.dmm.t-online.de/smooth24/smoothstream_m1/streaming/sony/9221438342941275747/636887760842957027/25_km_h-Trailer-9221571562372022953_deu_20_1300k_HD_H_264_ISMV.ism/Manifest',
@ -1641,8 +1620,8 @@ def test_parse_ism_formats(self):
'vcodec': 'AVC1',
'acodec': 'none',
'protocol': 'ism',
'_download_params':
{
'language': 'deu',
'_download_params': {
'stream_type': 'video',
'duration': 370000000,
'timescale': 10000000,
@ -1655,9 +1634,6 @@ def test_parse_ism_formats(self):
'bits_per_sample': 16,
'nal_unit_length_field': 4
},
'video_ext': 'ismv',
'audio_ext': 'none',
'vbr': 23,
}, {
'format_id': 'video_deu-403',
'url': 'https://smstr01.dmm.t-online.de/smooth24/smoothstream_m1/streaming/sony/9221438342941275747/636887760842957027/25_km_h-Trailer-9221571562372022953_deu_20_1300k_HD_H_264_ISMV.ism/Manifest',
@ -1669,8 +1645,8 @@ def test_parse_ism_formats(self):
'vcodec': 'AVC1',
'acodec': 'none',
'protocol': 'ism',
'_download_params':
{
'language': 'deu',
'_download_params': {
'stream_type': 'video',
'duration': 370000000,
'timescale': 10000000,
@ -1683,9 +1659,6 @@ def test_parse_ism_formats(self):
'bits_per_sample': 16,
'nal_unit_length_field': 4
},
'video_ext': 'ismv',
'audio_ext': 'none',
'vbr': 403,
}, {
'format_id': 'video_deu-680',
'url': 'https://smstr01.dmm.t-online.de/smooth24/smoothstream_m1/streaming/sony/9221438342941275747/636887760842957027/25_km_h-Trailer-9221571562372022953_deu_20_1300k_HD_H_264_ISMV.ism/Manifest',
@ -1697,8 +1670,8 @@ def test_parse_ism_formats(self):
'vcodec': 'AVC1',
'acodec': 'none',
'protocol': 'ism',
'_download_params':
{
'language': 'deu',
'_download_params': {
'stream_type': 'video',
'duration': 370000000,
'timescale': 10000000,
@ -1711,9 +1684,6 @@ def test_parse_ism_formats(self):
'bits_per_sample': 16,
'nal_unit_length_field': 4
},
'video_ext': 'ismv',
'audio_ext': 'none',
'vbr': 680,
}, {
'format_id': 'video_deu-1253',
'url': 'https://smstr01.dmm.t-online.de/smooth24/smoothstream_m1/streaming/sony/9221438342941275747/636887760842957027/25_km_h-Trailer-9221571562372022953_deu_20_1300k_HD_H_264_ISMV.ism/Manifest',
@ -1725,8 +1695,9 @@ def test_parse_ism_formats(self):
'vcodec': 'AVC1',
'acodec': 'none',
'protocol': 'ism',
'_download_params':
{
'vbr': 1253,
'language': 'deu',
'_download_params': {
'stream_type': 'video',
'duration': 370000000,
'timescale': 10000000,
@ -1739,9 +1710,6 @@ def test_parse_ism_formats(self):
'bits_per_sample': 16,
'nal_unit_length_field': 4
},
'video_ext': 'ismv',
'audio_ext': 'none',
'vbr': 1253,
}, {
'format_id': 'video_deu-2121',
'url': 'https://smstr01.dmm.t-online.de/smooth24/smoothstream_m1/streaming/sony/9221438342941275747/636887760842957027/25_km_h-Trailer-9221571562372022953_deu_20_1300k_HD_H_264_ISMV.ism/Manifest',
@ -1753,8 +1721,8 @@ def test_parse_ism_formats(self):
'vcodec': 'AVC1',
'acodec': 'none',
'protocol': 'ism',
'_download_params':
{
'language': 'deu',
'_download_params': {
'stream_type': 'video',
'duration': 370000000,
'timescale': 10000000,
@ -1767,9 +1735,6 @@ def test_parse_ism_formats(self):
'bits_per_sample': 16,
'nal_unit_length_field': 4
},
'video_ext': 'ismv',
'audio_ext': 'none',
'vbr': 2121,
}, {
'format_id': 'video_deu-3275',
'url': 'https://smstr01.dmm.t-online.de/smooth24/smoothstream_m1/streaming/sony/9221438342941275747/636887760842957027/25_km_h-Trailer-9221571562372022953_deu_20_1300k_HD_H_264_ISMV.ism/Manifest',
@ -1781,8 +1746,8 @@ def test_parse_ism_formats(self):
'vcodec': 'AVC1',
'acodec': 'none',
'protocol': 'ism',
'_download_params':
{
'language': 'deu',
'_download_params': {
'stream_type': 'video',
'duration': 370000000,
'timescale': 10000000,
@ -1795,9 +1760,6 @@ def test_parse_ism_formats(self):
'bits_per_sample': 16,
'nal_unit_length_field': 4
},
'video_ext': 'ismv',
'audio_ext': 'none',
'vbr': 3275,
}, {
'format_id': 'video_deu-5300',
'url': 'https://smstr01.dmm.t-online.de/smooth24/smoothstream_m1/streaming/sony/9221438342941275747/636887760842957027/25_km_h-Trailer-9221571562372022953_deu_20_1300k_HD_H_264_ISMV.ism/Manifest',
@ -1809,8 +1771,8 @@ def test_parse_ism_formats(self):
'vcodec': 'AVC1',
'acodec': 'none',
'protocol': 'ism',
'_download_params':
{
'language': 'deu',
'_download_params': {
'stream_type': 'video',
'duration': 370000000,
'timescale': 10000000,
@ -1823,9 +1785,6 @@ def test_parse_ism_formats(self):
'bits_per_sample': 16,
'nal_unit_length_field': 4
},
'video_ext': 'ismv',
'audio_ext': 'none',
'vbr': 5300,
}, {
'format_id': 'video_deu-8079',
'url': 'https://smstr01.dmm.t-online.de/smooth24/smoothstream_m1/streaming/sony/9221438342941275747/636887760842957027/25_km_h-Trailer-9221571562372022953_deu_20_1300k_HD_H_264_ISMV.ism/Manifest',
@ -1837,8 +1796,8 @@ def test_parse_ism_formats(self):
'vcodec': 'AVC1',
'acodec': 'none',
'protocol': 'ism',
'_download_params':
{
'language': 'deu',
'_download_params': {
'stream_type': 'video',
'duration': 370000000,
'timescale': 10000000,
@ -1851,9 +1810,6 @@ def test_parse_ism_formats(self):
'bits_per_sample': 16,
'nal_unit_length_field': 4
},
'video_ext': 'ismv',
'audio_ext': 'none',
'vbr': 8079,
}],
{},
),

View file

@ -822,6 +822,10 @@ def expect_same_infodict(out):
test('%(title&foo|baz)s.bar', 'baz.bar')
test('%(x,id&foo|baz)s.bar', 'foo.bar')
test('%(x,title&foo|baz)s.bar', 'baz.bar')
test('%(id&a\nb|)s', ('a\nb', 'a b'))
test('%(id&hi {:>10} {}|)s', 'hi 1234 1234')
test(R'%(id&{0} {}|)s', 'NA')
test(R'%(id&{0.1}|)s', 'NA')
# Laziness
def gen():

View file

@ -445,6 +445,22 @@ def test_bitwise_operators_overflow(self):
jsi = JSInterpreter('function x(){return 1236566549 << 5}')
self.assertEqual(jsi.call_function('x'), 915423904)
def test_negative(self):
    """Check that unary sign operators, including stacked chains, evaluate correctly."""
    # Each entry: (JS source defining `f`, value that calling `f` must return)
    cases = (
        ('function f(){return 2 * -2.0;}', -4),
        ('function f(){return 2 - - -2;}', 0),
        ('function f(){return 2 - - - -2;}', 4),
        ('function f(){return 2 - + + - -2;}', 0),
        ('function f(){return 2 + - + - -2;}', 0),
    )
    for code, expected in cases:
        interpreter = JSInterpreter(code)
        self.assertEqual(interpreter.call_function('f'), expected)
if __name__ == '__main__':
unittest.main()

View file

@ -1195,6 +1195,13 @@ def test_js_to_json_malformed(self):
self.assertEqual(js_to_json('42a1'), '42"a1"')
self.assertEqual(js_to_json('42a-1'), '42"a"-1')
def test_js_to_json_template_literal(self):
    """Check conversion of JS template literals whose `${...}` placeholders use the supplied mapping."""
    # Each entry: (template literal, variable mapping passed to js_to_json, expected JSON text)
    cases = (
        ('`Hello ${name}`', {'name': '"world"'}, '"Hello world"'),
        ('`${name}${name}`', {'name': '"X"'}, '"XX"'),
        ('`${name}${name}`', {'name': '5'}, '"55"'),
        ('`${name}"${name}"`', {'name': '5'}, '"5\\"5\\""'),
        ('`${name}`', {}, '"name"'),
    )
    for template, variables, expected in cases:
        self.assertEqual(js_to_json(template, variables), expected)
def test_extract_attributes(self):
self.assertEqual(extract_attributes('<e x="y">'), {'x': 'y'})
self.assertEqual(extract_attributes("<e x='y'>"), {'x': 'y'})
@ -2014,6 +2021,8 @@ def test_traverse_obj(self):
msg='nested `...` queries should work')
self.assertCountEqual(traverse_obj(_TEST_DATA, (..., ..., 'index')), range(4),
msg='`...` query result should be flattened')
self.assertEqual(traverse_obj(iter(range(4)), ...), list(range(4)),
msg='`...` should accept iterables')
# Test function as key
self.assertEqual(traverse_obj(_TEST_DATA, lambda x, y: x == 'urls' and isinstance(y, list)),
@ -2021,6 +2030,8 @@ def test_traverse_obj(self):
msg='function as query key should perform a filter based on (key, value)')
self.assertCountEqual(traverse_obj(_TEST_DATA, lambda _, x: isinstance(x[0], str)), {'str'},
msg='exceptions in the query function should be catched')
self.assertEqual(traverse_obj(iter(range(4)), lambda _, x: x % 2 == 0), [0, 2],
msg='function key should accept iterables')
if __debug__:
with self.assertRaises(Exception, msg='Wrong function signature should raise in debug'):
traverse_obj(_TEST_DATA, lambda a: ...)
@ -2045,6 +2056,17 @@ def test_traverse_obj(self):
with self.assertRaises(Exception, msg='Sets with length != 1 should raise in debug'):
traverse_obj(_TEST_DATA, {str.upper, str})
# Test `slice` as a key
_SLICE_DATA = [0, 1, 2, 3, 4]
self.assertEqual(traverse_obj(_TEST_DATA, ('dict', slice(1))), None,
msg='slice on a dictionary should not throw')
self.assertEqual(traverse_obj(_SLICE_DATA, slice(1)), _SLICE_DATA[:1],
msg='slice key should apply slice to sequence')
self.assertEqual(traverse_obj(_SLICE_DATA, slice(1, 2)), _SLICE_DATA[1:2],
msg='slice key should apply slice to sequence')
self.assertEqual(traverse_obj(_SLICE_DATA, slice(1, 4, 2)), _SLICE_DATA[1:4:2],
msg='slice key should apply slice to sequence')
# Test alternative paths
self.assertEqual(traverse_obj(_TEST_DATA, 'fail', 'str'), 'str',
msg='multiple `paths` should be treated as alternative paths')
@ -2228,6 +2250,12 @@ def test_traverse_obj(self):
self.assertEqual(traverse_obj(_TRAVERSE_STRING_DATA, ('str', (0, 2)),
traverse_string=True), ['s', 'r'],
msg='branching should result in list if `traverse_string`')
self.assertEqual(traverse_obj({}, (0, ...), traverse_string=True), [],
msg='branching should result in list if `traverse_string`')
self.assertEqual(traverse_obj({}, (0, lambda x, y: True), traverse_string=True), [],
msg='branching should result in list if `traverse_string`')
self.assertEqual(traverse_obj({}, (0, slice(1)), traverse_string=True), [],
msg='branching should result in list if `traverse_string`')
# Test is_user_input behavior
_IS_USER_INPUT_DATA = {'range8': list(range(8))}

View file

@ -142,6 +142,10 @@
'https://www.youtube.com/s/player/dac945fd/player_ias.vflset/en_US/base.js',
'o8BkRxXhuYsBCWi6RplPdP', '3Lx32v_hmzTm6A',
),
(
'https://www.youtube.com/s/player/6f20102c/player_ias.vflset/en_US/base.js',
'lE8DhoDmKqnmJJ', 'pJTTX6XyJP2BYw',
),
]

View file

@ -21,7 +21,7 @@
import traceback
import unicodedata
import urllib.request
from string import ascii_letters
from string import Formatter, ascii_letters
from .cache import Cache
from .compat import compat_os_name, compat_shlex_quote
@ -1161,7 +1161,7 @@ def prepare_outtmpl(self, outtmpl, info_dict, sanitize=False):
}
MATH_FIELD_RE = rf'(?:{FIELD_RE}|-?{NUMBER_RE})'
MATH_OPERATORS_RE = r'(?:%s)' % '|'.join(map(re.escape, MATH_FUNCTIONS.keys()))
INTERNAL_FORMAT_RE = re.compile(rf'''(?x)
INTERNAL_FORMAT_RE = re.compile(rf'''(?xs)
(?P<negate>-)?
(?P<fields>{FIELD_RE})
(?P<maths>(?:{MATH_OPERATORS_RE}{MATH_FIELD_RE})*)
@ -1242,6 +1242,14 @@ def _dumpjson_default(obj):
return list(obj)
return repr(obj)
class _ReplacementFormatter(Formatter):
def get_field(self, field_name, args, kwargs):
if field_name.isdigit():
return args[0], -1
raise ValueError('Unsupported field')
replacement_formatter = _ReplacementFormatter()
def create_key(outer_mobj):
if not outer_mobj.group('has_key'):
return outer_mobj.group(0)
@ -1263,7 +1271,13 @@ def create_key(outer_mobj):
if fmt == 's' and value is not None and key in field_size_compat_map.keys():
fmt = f'0{field_size_compat_map[key]:d}d'
value = default if value is None else value if replacement is None else replacement
if value is None:
value = default
elif replacement is not None:
try:
value = replacement_formatter.format(replacement, value)
except ValueError:
value = na
flags = outer_mobj.group('conversion') or ''
str_fmt = f'{fmt[:-1]}s'
@ -1668,7 +1682,7 @@ def process_ie_result(self, ie_result, download=True, extra_info=None):
self.add_extra_info(info_copy, extra_info)
info_copy, _ = self.pre_process(info_copy)
self._fill_common_fields(info_copy, False)
self.__forced_printings(info_copy, self.prepare_filename(info_copy), incomplete=True)
self.__forced_printings(info_copy)
self._raise_pending_errors(info_copy)
if self.params.get('force_write_download_archive', False):
self.record_download_archive(info_copy)
@ -1937,7 +1951,7 @@ def _build_format_filter(self, filter_spec):
'!=': operator.ne,
}
operator_rex = re.compile(r'''(?x)\s*
(?P<key>width|height|tbr|abr|vbr|asr|filesize|filesize_approx|fps)\s*
(?P<key>[\w.-]+)\s*
(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
(?P<value>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)\s*
''' % '|'.join(map(re.escape, OPERATORS.keys())))
@ -2710,7 +2724,7 @@ def is_wellformed(f):
self.list_formats(info_dict)
if list_only:
# Without this printing, -F --print-json will not work
self.__forced_printings(info_dict, self.prepare_filename(info_dict), incomplete=True)
self.__forced_printings(info_dict)
return info_dict
format_selector = self.format_selector
@ -2870,6 +2884,12 @@ def _forceprint(self, key, info_dict):
if info_dict is None:
return
info_copy = info_dict.copy()
info_copy.setdefault('filename', self.prepare_filename(info_dict))
if info_dict.get('requested_formats') is not None:
# For RTMP URLs, also include the playpath
info_copy['urls'] = '\n'.join(f['url'] + f.get('play_path', '') for f in info_dict['requested_formats'])
elif info_dict.get('url'):
info_copy['urls'] = info_dict['url'] + info_dict.get('play_path', '')
info_copy['formats_table'] = self.render_formats_table(info_dict)
info_copy['thumbnails_table'] = self.render_thumbnails_table(info_dict)
info_copy['subtitles_table'] = self.render_subtitles_table(info_dict.get('id'), info_dict.get('subtitles'))
@ -2895,46 +2915,36 @@ def format_tmpl(tmpl):
tmpl = format_tmpl(tmpl)
self.to_screen(f'[info] Writing {tmpl!r} to: {filename}')
if self._ensure_dir_exists(filename):
with open(filename, 'a', encoding='utf-8') as f:
f.write(self.evaluate_outtmpl(tmpl, info_copy) + '\n')
with open(filename, 'a', encoding='utf-8', newline='') as f:
f.write(self.evaluate_outtmpl(tmpl, info_copy) + os.linesep)
def __forced_printings(self, info_dict, filename, incomplete):
def print_mandatory(field, actual_field=None):
if actual_field is None:
actual_field = field
if (self.params.get('force%s' % field, False)
and (not incomplete or info_dict.get(actual_field) is not None)):
self.to_stdout(info_dict[actual_field])
def print_optional(field):
if (self.params.get('force%s' % field, False)
and info_dict.get(field) is not None):
self.to_stdout(info_dict[field])
info_dict = info_dict.copy()
if filename is not None:
info_dict['filename'] = filename
if info_dict.get('requested_formats') is not None:
# For RTMP URLs, also include the playpath
info_dict['urls'] = '\n'.join(f['url'] + f.get('play_path', '') for f in info_dict['requested_formats'])
elif info_dict.get('url'):
info_dict['urls'] = info_dict['url'] + info_dict.get('play_path', '')
return info_copy
def __forced_printings(self, info_dict, filename=None, incomplete=True):
if (self.params.get('forcejson')
or self.params['forceprint'].get('video')
or self.params['print_to_file'].get('video')):
self.post_extract(info_dict)
self._forceprint('video', info_dict)
if filename:
info_dict['filename'] = filename
info_copy = self._forceprint('video', info_dict)
print_mandatory('title')
print_mandatory('id')
print_mandatory('url', 'urls')
print_optional('thumbnail')
print_optional('description')
print_optional('filename')
if self.params.get('forceduration') and info_dict.get('duration') is not None:
self.to_stdout(formatSeconds(info_dict['duration']))
print_mandatory('format')
def print_field(field, actual_field=None, optional=False):
if actual_field is None:
actual_field = field
if self.params.get(f'force{field}') and (
info_copy.get(field) is not None or (not optional and not incomplete)):
self.to_stdout(info_copy[actual_field])
print_field('title')
print_field('id')
print_field('url', 'urls')
print_field('thumbnail', optional=True)
print_field('description', optional=True)
print_field('filename', optional=True)
if self.params.get('forceduration') and info_copy.get('duration') is not None:
self.to_stdout(formatSeconds(info_copy['duration']))
print_field('format')
if self.params.get('forcejson'):
self.to_stdout(json.dumps(self.sanitize_info(info_dict)))
@ -3316,7 +3326,7 @@ def ffmpeg_fixup(cndn, msg, cls):
or info_dict.get('is_live') and self.params.get('hls_use_mpegts') is None,
'Possible MPEG-TS in MP4 container or malformed AAC timestamps',
FFmpegFixupM3u8PP)
ffmpeg_fixup(info_dict.get('is_live') and downloader == 'DashSegmentsFD',
ffmpeg_fixup(info_dict.get('is_live') and downloader == 'dashsegments',
'Possible duplicate MOOV atoms', FFmpegFixupDuplicateMoovPP)
ffmpeg_fixup(downloader == 'web_socket_fragment', 'Malformed timestamps detected', FFmpegFixupTimestampPP)
@ -3482,7 +3492,7 @@ def run_pp(self, pp, infodict):
*files_to_delete, info=infodict, msg='Deleting original file %s (pass -k to keep)')
return infodict
def run_all_pps(self, key, info, *, additional_pps=None):
def run_all_pps(self, key, info, *, additional_pps=None, fatal=True):
if key != 'video':
self._forceprint(key, info)
for pp in (additional_pps or []) + self._pps[key]:

View file

@ -412,12 +412,17 @@ def metadataparser_actions(f):
except Exception as err:
raise ValueError(f'Invalid playlist-items {opts.playlist_items!r}: {err}')
geo_bypass_code = opts.geo_bypass_ip_block or opts.geo_bypass_country
if geo_bypass_code is not None:
opts.geo_bypass_country, opts.geo_bypass_ip_block = None, None
if opts.geo_bypass.lower() not in ('default', 'never'):
try:
GeoUtils.random_ipv4(geo_bypass_code)
GeoUtils.random_ipv4(opts.geo_bypass)
except Exception:
raise ValueError('unsupported geo-bypass country or ip-block')
raise ValueError(f'Unsupported --xff "{opts.geo_bypass}"')
if len(opts.geo_bypass) == 2:
opts.geo_bypass_country = opts.geo_bypass
else:
opts.geo_bypass_ip_block = opts.geo_bypass
opts.geo_bypass = opts.geo_bypass.lower() != 'never'
opts.match_filter = match_filter_func(opts.match_filter, opts.breaking_match_filter)
@ -720,7 +725,8 @@ def parse_options(argv=None):
'dumpjson', 'dump_single_json', 'getdescription', 'getduration', 'getfilename',
'getformat', 'getid', 'getthumbnail', 'gettitle', 'geturl'
))
opts.quiet = opts.quiet or any_getting or opts.print_json or bool(opts.forceprint)
if opts.quiet is None:
opts.quiet = any_getting or opts.print_json or bool(opts.forceprint)
playlist_pps = [pp for pp in postprocessors if pp.get('when') == 'playlist']
write_playlist_infojson = (opts.writeinfojson and not opts.clean_infojson

View file

@ -1,4 +1,4 @@
import types
from ..compat.compat_utils import passthrough_module
try:
import Cryptodome as _parent
@ -6,9 +6,11 @@
try:
import Crypto as _parent
except (ImportError, SyntaxError): # Old Crypto gives SyntaxError in newer Python
_parent = types.ModuleType('no_Cryptodome')
_parent = passthrough_module(__name__, 'no_Cryptodome')
__bool__ = lambda: False
del passthrough_module
__version__ = ''
AES = PKCS1_v1_5 = Blowfish = PKCS1_OAEP = SHA1 = CMAC = RSA = None
try:

View file

@ -254,6 +254,14 @@
BRMediathekIE,
)
from .bravotv import BravoTVIE
from .brainpop import (
BrainPOPIE,
BrainPOPJrIE,
BrainPOPELLIE,
BrainPOPEspIE,
BrainPOPFrIE,
BrainPOPIlIE,
)
from .breakcom import BreakIE
from .breitbart import BreitBartIE
from .brightcove import (
@ -298,7 +306,10 @@
CBCGemPlaylistIE,
CBCGemLiveIE,
)
from .cbs import CBSIE
from .cbs import (
CBSIE,
ParamountPressExpressIE,
)
from .cbslocal import (
CBSLocalIE,
CBSLocalArticleIE,
@ -345,6 +356,7 @@
)
from .ciscowebex import CiscoWebexIE
from .cjsw import CJSWIE
from .clipchamp import ClipchampIE
from .cliphunter import CliphunterIE
from .clippit import ClippitIE
from .cliprs import ClipRsIE
@ -441,6 +453,10 @@
)
from .democracynow import DemocracynowIE
from .detik import DetikEmbedIE
from .dlf import (
DLFIE,
DLFCorpusIE,
)
from .dfb import DFBIE
from .dhm import DHMIE
from .digg import DiggIE
@ -674,10 +690,18 @@
from .giantbomb import GiantBombIE
from .giga import GigaIE
from .glide import GlideIE
from .globalplayer import (
GlobalPlayerLiveIE,
GlobalPlayerLivePlaylistIE,
GlobalPlayerAudioIE,
GlobalPlayerAudioEpisodeIE,
GlobalPlayerVideoIE
)
from .globo import (
GloboIE,
GloboArticleIE,
)
from .gmanetwork import GMANetworkVideoIE
from .go import GoIE
from .godtube import GodTubeIE
from .gofile import GofileIE
@ -709,13 +733,16 @@
from .heise import HeiseIE
from .hellporno import HellPornoIE
from .helsinki import HelsinkiIE
from .hentaistigma import HentaiStigmaIE
from .hgtv import HGTVComShowIE
from .hketv import HKETVIE
from .hidive import HiDiveIE
from .historicfilms import HistoricFilmsIE
from .hitbox import HitboxIE, HitboxLiveIE
from .hitrecord import HitRecordIE
from .hollywoodreporter import (
HollywoodReporterIE,
HollywoodReporterPlaylistIE,
)
from .holodex import HolodexIE
from .hotnewhiphop import HotNewHipHopIE
from .hotstar import (
@ -727,6 +754,7 @@
)
from .howcast import HowcastIE
from .howstuffworks import HowStuffWorksIE
from .hrefli import HrefLiRedirectIE
from .hrfensehen import HRFernsehenIE
from .hrti import (
HRTiIE,
@ -936,10 +964,6 @@
LimelightChannelIE,
LimelightChannelListIE,
)
from .line import (
LineLiveIE,
LineLiveChannelIE,
)
from .linkedin import (
LinkedInIE,
LinkedInLearningIE,
@ -1219,6 +1243,8 @@
NhkForSchoolBangumiIE,
NhkForSchoolSubjectIE,
NhkForSchoolProgramListIE,
NhkRadioNewsPageIE,
NhkRadiruIE,
)
from .nhl import NHLIE
from .nick import (
@ -1390,6 +1416,7 @@
PeriscopeIE,
PeriscopeUserIE,
)
from .pgatour import PGATourIE
from .philharmoniedeparis import PhilharmonieDeParisIE
from .phoenix import PhoenixIE
from .photobucket import PhotobucketIE
@ -1606,6 +1633,11 @@
from .rtp import RTPIE
from .rtrfm import RTRFMIE
from .rts import RTSIE
from .rtvcplay import (
RTVCPlayIE,
RTVCPlayEmbedIE,
RTVCKalturaIE,
)
from .rtve import (
RTVEALaCartaIE,
RTVEAudioIE,
@ -1675,6 +1707,7 @@
)
from .scrolller import ScrolllerIE
from .seeker import SeekerIE
from .senalcolombia import SenalColombiaLiveIE
from .senategov import SenateISVPIE, SenateGovIE
from .sendtonews import SendtoNewsIE
from .servus import ServusIE
@ -1772,6 +1805,7 @@
BellatorIE,
ParamountNetworkIE,
)
from .stageplus import StagePlusVODConcertIE
from .startrek import StarTrekIE
from .stitcher import (
StitcherIE,
@ -1954,6 +1988,7 @@
from .triller import (
TrillerIE,
TrillerUserIE,
TrillerShortIE,
)
from .trilulilu import TriluliluIE
from .trovo import (
@ -2280,6 +2315,8 @@
WeiboMobileIE
)
from .weiqitv import WeiqiTVIE
from .wevidi import WeVidiIE
from .whyp import WhypIE
from .wikimedia import WikimediaIE
from .willow import WillowIE
from .wimtv import WimTVIE
@ -2334,8 +2371,6 @@
from .yahoo import (
YahooIE,
YahooSearchIE,
YahooGyaOPlayerIE,
YahooGyaOIE,
YahooJapanNewsIE,
)
from .yandexdisk import YandexDiskIE

View file

@ -436,6 +436,16 @@ def _real_extract(self, url):
if 3 not in ondemand_types:
# cannot acquire decryption key for these streams
self.report_warning('This is a premium-only stream')
info.update(traverse_obj(api_response, {
'series': ('series', 'title'),
'season': ('season', 'title'),
'season_number': ('season', 'sequence'),
'episode_number': ('episode', 'number'),
}))
if not title:
title = traverse_obj(api_response, ('episode', 'title'))
if not description:
description = traverse_obj(api_response, ('episode', 'content'))
m3u8_url = f'https://vod-abematv.akamaized.net/program/{video_id}/playlist.m3u8'
elif video_type == 'slots':

View file

@ -1573,7 +1573,7 @@ def extract_redirect_url(html, url=None, fatal=False):
}), headers={
'Content-Type': 'application/x-www-form-urlencoded'
})
elif mso_id == 'Spectrum':
elif mso_id in ('Spectrum', 'Charter_Direct'):
# Spectrum's login form is dynamically loaded via JS so we need to hardcode the flow
# as a one-off implementation.
provider_redirect_page, urlh = provider_redirect_page_res

View file

@ -1,5 +1,6 @@
from .common import InfoExtractor
from .vimeo import VimeoIE
from ..utils import ExtractorError, traverse_obj, url_or_none
class AeonCoIE(InfoExtractor):
@ -19,22 +20,55 @@ class AeonCoIE(InfoExtractor):
}
}, {
'url': 'https://aeon.co/videos/dazzling-timelapse-shows-how-microbes-spoil-our-food-and-sometimes-enrich-it',
'md5': '4e5f3dad9dbda0dbfa2da41a851e631e',
'md5': '03582d795382e49f2fd0b427b55de409',
'info_dict': {
'id': '728595228',
'id': '759576926',
'ext': 'mp4',
'title': 'Wrought',
'thumbnail': 'https://i.vimeocdn.com/video/1484618528-c91452611f9a4e4497735a533da60d45b2fe472deb0c880f0afaab0cd2efb22a-d_1280',
'uploader': 'Biofilm Productions',
'uploader_id': 'user140352216',
'uploader_url': 'https://vimeo.com/user140352216',
'thumbnail': 'https://i.vimeocdn.com/video/1525599692-84614af88e446612f49ca966cf8f80eab2c73376bedd80555741c521c26f9a3e-d_1280',
'uploader': 'Aeon Video',
'uploader_id': 'aeonvideo',
'uploader_url': 'https://vimeo.com/aeonvideo',
'duration': 1344
}
}, {
'url': 'https://aeon.co/videos/chew-over-the-prisoners-dilemma-and-see-if-you-can-find-the-rational-path-out',
'md5': '1cfda0bf3ae24df17d00f2c0cb6cc21b',
'info_dict': {
'id': 'emyi4z-O0ls',
'ext': 'mp4',
'title': 'How to outsmart the Prisoners Dilemma - Lucas Husted',
'thumbnail': 'https://i.ytimg.com/vi_webp/emyi4z-O0ls/maxresdefault.webp',
'uploader': 'TED-Ed',
'uploader_id': '@TEDEd',
'uploader_url': 'https://www.youtube.com/@TEDEd',
'duration': 344,
'upload_date': '20200827',
'channel_id': 'UCsooa4yRKGN_zEE8iknghZA',
'playable_in_embed': True,
'description': 'md5:c0959524f08cb60f96fd010f3dfb17f3',
'categories': ['Education'],
'like_count': int,
'channel': 'TED-Ed',
'chapters': 'count:7',
'channel_url': 'https://www.youtube.com/channel/UCsooa4yRKGN_zEE8iknghZA',
'tags': 'count:26',
'availability': 'public',
'channel_follower_count': int,
'view_count': int,
'age_limit': 0,
'live_status': 'not_live',
'comment_count': int,
},
}]
def _real_extract(self, url):
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
vimeo_id = self._search_regex(r'hosterId":\s*"(?P<id>[0-9]+)', webpage, 'vimeo id')
vimeo_url = VimeoIE._smuggle_referrer(f'https://player.vimeo.com/video/{vimeo_id}', 'https://aeon.co')
return self.url_result(vimeo_url, VimeoIE)
embed_url = traverse_obj(self._yield_json_ld(webpage, video_id), (
lambda _, v: v['@type'] == 'VideoObject', 'embedUrl', {url_or_none}), get_all=False)
if not embed_url:
raise ExtractorError('No embed URL found in webpage')
if 'player.vimeo.com' in embed_url:
embed_url = VimeoIE._smuggle_referrer(embed_url, 'https://aeon.co/')
return self.url_result(embed_url)

View file

@ -26,6 +26,7 @@
srt_subtitles_timecode,
str_or_none,
traverse_obj,
unified_timestamp,
unsmuggle_url,
url_or_none,
urlencode_postdata,
@ -133,7 +134,7 @@ def _get_all_children(self, reply):
class BiliBiliIE(BilibiliBaseIE):
_VALID_URL = r'https?://www\.bilibili\.com/video/[aAbB][vV](?P<id>[^/?#&]+)'
_VALID_URL = r'https?://www\.bilibili\.com/(?:video/|festival/\w+\?(?:[^#]*&)?bvid=)[aAbB][vV](?P<id>[^/?#&]+)'
_TESTS = [{
'url': 'https://www.bilibili.com/video/BV13x41117TL',
@ -281,19 +282,60 @@ class BiliBiliIE(BilibiliBaseIE):
'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)$',
},
'params': {'skip_download': True},
}, {
'note': 'video redirects to festival page',
'url': 'https://www.bilibili.com/video/BV1wP4y1P72h',
'info_dict': {
'id': 'BV1wP4y1P72h',
'ext': 'mp4',
'title': '牛虎年相交之际一首传统民族打击乐《牛斗虎》祝大家新春快乐虎年大吉【bilibili音乐虎闹新春】',
'timestamp': 1643947497,
'upload_date': '20220204',
'description': 'md5:8681a0d4d2c06b4ae27e59c8080a7fe6',
'uploader': '叨叨冯聊音乐',
'duration': 246.719,
'uploader_id': '528182630',
'view_count': int,
'like_count': int,
'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)$',
},
'params': {'skip_download': True},
}, {
'note': 'newer festival video',
'url': 'https://www.bilibili.com/festival/2023honkaiimpact3gala?bvid=BV1ay4y1d77f',
'info_dict': {
'id': 'BV1ay4y1d77f',
'ext': 'mp4',
'title': '【崩坏3新春剧场】为特别的你送上祝福',
'timestamp': 1674273600,
'upload_date': '20230121',
'description': 'md5:58af66d15c6a0122dc30c8adfd828dd8',
'uploader': '果蝇轰',
'duration': 1111.722,
'uploader_id': '8469526',
'view_count': int,
'like_count': int,
'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)$',
},
'params': {'skip_download': True},
}]
def _real_extract(self, url):
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
initial_state = self._search_json(r'window\.__INITIAL_STATE__\s*=', webpage, 'initial state', video_id)
play_info = self._search_json(r'window\.__playinfo__\s*=', webpage, 'play info', video_id)['data']
is_festival = 'videoData' not in initial_state
if is_festival:
video_data = initial_state['videoInfo']
else:
play_info = self._search_json(r'window\.__playinfo__\s*=', webpage, 'play info', video_id)['data']
video_data = initial_state['videoData']
video_id, title = video_data['bvid'], video_data.get('title')
# Bilibili anthologies are similar to playlists but all videos share the same video ID as the anthology itself.
page_list_json = traverse_obj(
page_list_json = not is_festival and traverse_obj(
self._download_json(
'https://api.bilibili.com/x/player/pagelist', video_id,
fatal=False, query={'bvid': video_id, 'jsonp': 'jsonp'},
@ -316,20 +358,39 @@ def _real_extract(self, url):
cid = traverse_obj(video_data, ('pages', part_id - 1, 'cid')) if part_id else video_data.get('cid')
festival_info = {}
if is_festival:
play_info = self._download_json(
'https://api.bilibili.com/x/player/playurl', video_id,
query={'bvid': video_id, 'cid': cid, 'fnval': 4048},
note='Extracting festival video formats')['data']
festival_info = traverse_obj(initial_state, {
'uploader': ('videoInfo', 'upName'),
'uploader_id': ('videoInfo', 'upMid', {str_or_none}),
'like_count': ('videoStatus', 'like', {int_or_none}),
'thumbnail': ('sectionEpisodes', lambda _, v: v['bvid'] == video_id, 'cover'),
}, get_all=False)
return {
**traverse_obj(initial_state, {
'uploader': ('upData', 'name'),
'uploader_id': ('upData', 'mid', {str_or_none}),
'like_count': ('videoData', 'stat', 'like', {int_or_none}),
'tags': ('tags', ..., 'tag_name'),
'thumbnail': ('videoData', 'pic', {url_or_none}),
}),
**festival_info,
**traverse_obj(video_data, {
'description': 'desc',
'timestamp': ('pubdate', {int_or_none}),
'view_count': (('viewCount', ('stat', 'view')), {int_or_none}),
'comment_count': ('stat', 'reply', {int_or_none}),
}, get_all=False),
'id': f'{video_id}{format_field(part_id, None, "_p%d")}',
'formats': self.extract_formats(play_info),
'_old_archive_ids': [make_archive_id(self, old_video_id)] if old_video_id else None,
'title': title,
'description': traverse_obj(initial_state, ('videoData', 'desc')),
'view_count': traverse_obj(initial_state, ('videoData', 'stat', 'view')),
'uploader': traverse_obj(initial_state, ('upData', 'name')),
'uploader_id': traverse_obj(initial_state, ('upData', 'mid')),
'like_count': traverse_obj(initial_state, ('videoData', 'stat', 'like')),
'comment_count': traverse_obj(initial_state, ('videoData', 'stat', 'reply')),
'tags': traverse_obj(initial_state, ('tags', ..., 'tag_name')),
'thumbnail': traverse_obj(initial_state, ('videoData', 'pic')),
'timestamp': traverse_obj(initial_state, ('videoData', 'pubdate')),
'duration': float_or_none(play_info.get('timelength'), scale=1000),
'chapters': self._get_chapters(aid, cid),
'subtitles': self.extract_subtitles(video_id, aid, cid),
@ -996,6 +1057,53 @@ class BiliIntlIE(BiliIntlBaseIE):
'thumbnail': r're:https?://pic[-\.]bstarstatic.+/ugc/.+\.jpg$',
'upload_date': '20221212',
'title': 'Kimetsu no Yaiba Season 3 Official Trailer - Bstation',
},
}, {
# episode comment extraction
'url': 'https://www.bilibili.tv/en/play/34580/340317',
'info_dict': {
'id': '340317',
'ext': 'mp4',
'timestamp': 1604057820,
'upload_date': '20201030',
'episode_number': 5,
'title': 'E5 - My Own Steel',
'description': 'md5:2b17ab10aebb33e3c2a54da9e8e487e2',
'thumbnail': r're:https?://pic\.bstarstatic\.com/ogv/.+\.png$',
'episode': 'Episode 5',
'comment_count': int,
'chapters': [{
'start_time': 0,
'end_time': 61.0,
'title': '<Untitled Chapter 1>'
}, {
'start_time': 61.0,
'end_time': 134.0,
'title': 'Intro'
}, {
'start_time': 1290.0,
'end_time': 1379.0,
'title': 'Outro'
}],
},
'params': {
'getcomments': True
}
}, {
# user generated content comment extraction
'url': 'https://www.bilibili.tv/en/video/2045730385',
'info_dict': {
'id': '2045730385',
'ext': 'mp4',
'description': 'md5:693b6f3967fb4e7e7764ea817857c33a',
'timestamp': 1667891924,
'upload_date': '20221108',
'title': 'That Time I Got Reincarnated as a Slime: Scarlet Bond - Official Trailer 3| AnimeStan - Bstation',
'comment_count': int,
'thumbnail': 'https://pic.bstarstatic.com/ugc/f6c363659efd2eabe5683fbb906b1582.jpg',
},
'params': {
'getcomments': True
}
}, {
# episode id without intro and outro
@ -1055,11 +1163,69 @@ def _extract_video_metadata(self, url, video_id, season_id):
# XXX: webpage metadata may not be accurate; it is only used to avoid crashing when video_data is not found
return merge_dicts(
self._parse_video_metadata(video_data), self._search_json_ld(webpage, video_id), {
self._parse_video_metadata(video_data), self._search_json_ld(webpage, video_id, fatal=False), {
'title': self._html_search_meta('og:title', webpage),
'description': self._html_search_meta('og:description', webpage)
})
def _get_comments_reply(self, root_id, next_id=0, display_id=None):
    """Yield every reply posted under the root comment ``root_id``.

    Pages through the ``reply/web/detail`` endpoint, starting at the cursor
    position ``next_id``, and yields one comment dict per reply.
    ``display_id`` is only used as the ID shown in download messages.

    Pagination is done with a loop rather than one recursive call per page,
    so long reply threads cannot exhaust the recursion limit.
    """
    while True:
        comment_api_raw_data = self._download_json(
            'https://api.bilibili.tv/reply/web/detail', display_id,
            note=f'Downloading reply comment of {root_id} - {next_id}',
            query={
                'platform': 'web',
                'ps': 20,  # replies per page (default: 3)
                'root': root_id,
                'next': next_id,
            })

        for replies in traverse_obj(comment_api_raw_data, ('data', 'replies', ...)):
            yield {
                'author': traverse_obj(replies, ('member', 'name')),
                'author_id': traverse_obj(replies, ('member', 'mid')),
                'author_thumbnail': traverse_obj(replies, ('member', 'face')),
                'text': traverse_obj(replies, ('content', 'message')),
                'id': replies.get('rpid'),
                'like_count': int_or_none(replies.get('like_count')),
                'parent': replies.get('parent'),
                'timestamp': unified_timestamp(replies.get('ctime_text'))
            }

        # A response without a usable cursor cannot be continued; stop instead
        # of failing on a missing key.
        cursor = traverse_obj(comment_api_raw_data, ('data', 'cursor'), expected_type=dict) or {}
        if cursor.get('is_end') or cursor.get('next') is None:
            break
        next_id = cursor['next']
def _get_comments(self, video_id, ep_id):
    """Yield all root comments of a video, each followed by its replies.

    Pages through the ``reply/web/root`` endpoint. ``ep_id`` only selects the
    comment type sent to the API: 3 when it is truthy, 1 otherwise.
    A root comment with a truthy ``count`` has its replies fetched through
    ``_get_comments_reply`` immediately after the comment itself is yielded.
    """
    for i in itertools.count(0):
        comment_api_raw_data = self._download_json(
            'https://api.bilibili.tv/reply/web/root', video_id,
            note=f'Downloading comment page {i + 1}',
            query={
                'platform': 'web',
                'pn': i,  # page number
                'ps': 20,  # comment per page (default: 20)
                'oid': video_id,
                'type': 3 if ep_id else 1,  # 1: user generated content, 3: series content
                'sort_type': 1,  # 1: best, 2: recent
            })

        page_replies = traverse_obj(comment_api_raw_data, ('data', 'replies', ...))
        for replies in page_replies:
            yield {
                'author': traverse_obj(replies, ('member', 'name')),
                'author_id': traverse_obj(replies, ('member', 'mid')),
                'author_thumbnail': traverse_obj(replies, ('member', 'face')),
                'text': traverse_obj(replies, ('content', 'message')),
                'id': replies.get('rpid'),
                'like_count': int_or_none(replies.get('like_count')),
                'timestamp': unified_timestamp(replies.get('ctime_text')),
                'author_is_uploader': bool(traverse_obj(replies, ('member', 'type'))),
            }
            if replies.get('count'):
                yield from self._get_comments_reply(replies.get('rpid'), display_id=video_id)

        # Stop on the API's end marker, and also on an empty page: without the
        # second check a response lacking ``cursor`` would be requested forever.
        if not page_replies or traverse_obj(comment_api_raw_data, ('data', 'cursor', 'is_end')):
            break
def _real_extract(self, url):
season_id, ep_id, aid = self._match_valid_url(url).group('season_id', 'ep_id', 'aid')
video_id = ep_id or aid
@ -1087,7 +1253,8 @@ def _real_extract(self, url):
**self._extract_video_metadata(url, video_id, season_id),
'formats': self._get_formats(ep_id=ep_id, aid=aid),
'subtitles': self.extract_subtitles(ep_id=ep_id, aid=aid),
'chapters': chapters
'chapters': chapters,
'__post_extractor': self.extract_comments(video_id, ep_id)
}

View file

@ -77,7 +77,10 @@ class BitChuteIE(InfoExtractor):
def _check_format(self, video_url, video_id):
urls = orderedSet(
re.sub(r'(^https?://)(seed\d+)(?=\.bitchute\.com)', fr'\g<1>{host}', video_url)
for host in (r'\g<2>', 'seed150', 'seed151', 'seed152', 'seed153'))
for host in (r'\g<2>', 'seed122', 'seed125', 'seed126', 'seed128',
'seed132', 'seed150', 'seed151', 'seed152', 'seed153',
'seed167', 'seed171', 'seed177', 'seed305', 'seed307',
'seedp29xb', 'zb10-7gsop1v78'))
for url in urls:
try:
response = self._request_webpage(

View file

@ -0,0 +1,318 @@
import json
import re
from .common import InfoExtractor
from ..utils import (
classproperty,
int_or_none,
traverse_obj,
urljoin
)
class BrainPOPBaseIE(InfoExtractor):
    # Shared login and format-building logic for the BrainPOP sites.
    # Subclasses set _ORIGIN (used for _VALID_URL and the login Referer) and
    # _VIDEO_URL / _HLS_URL (used by _assemble_formats).
    _NETRC_MACHINE = 'brainpop'
    _ORIGIN = ''  # So that _VALID_URL doesn't crash
    # Messages shown for the status codes returned by the login API
    _LOGIN_ERRORS = {
        1502: 'The username and password you entered did not match.',  # LOGIN_FAILED
        1503: 'Payment method is expired.',  # LOGIN_FAILED_ACCOUNT_NOT_ACTIVE
        1506: 'Your BrainPOP plan has expired.',  # LOGIN_FAILED_ACCOUNT_EXPIRED
        1507: 'Terms not accepted.',  # LOGIN_FAILED_TERMS_NOT_ACCEPTED
        1508: 'Account not activated.',  # LOGIN_FAILED_SUBSCRIPTION_NOT_ACTIVE
        1512: 'The maximum number of devices permitted are logged in with your account right now.',  # LOGIN_FAILED_LOGIN_LIMIT_REACHED
        1513: 'You are trying to access your account from outside of its allowed IP range.',  # LOGIN_FAILED_INVALID_IP
        1514: 'Individual accounts are not included in your plan. Try again with your shared username and password.',  # LOGIN_FAILED_MBP_DISABLED
        1515: 'Account not activated.',  # LOGIN_FAILED_TEACHER_NOT_ACTIVE
        1523: 'That username and password won\'t work on this BrainPOP site.',  # LOGIN_FAILED_NO_ACCESS
        1524: 'You\'ll need to join a class before you can login.',  # LOGIN_FAILED_STUDENT_NO_PERIOD
        1526: 'Your account is locked. Reset your password, or ask a teacher or administrator for help.',  # LOGIN_FAILED_ACCOUNT_LOCKED
    }

    @classproperty
    def _VALID_URL(cls):
        """Build the URL pattern from the subclass' ``_ORIGIN``.

        The scheme is relaxed to http/https and a leading ``www.`` is made
        optional; the path must have three segments, the last being the ID.
        """
        root = re.escape(cls._ORIGIN).replace(r'https:', r'https?:').replace(r'www\.', r'(?:www\.)?')
        return rf'{root}/(?P<slug>[^/]+/[^/]+/(?P<id>[^/?#&]+))'

    def _assemble_formats(self, slug, format_id, display_id, token='', extra_fields=None):
        """Return the HLS formats plus one direct format for a single media slug.

        ``token`` is appended as the query string of both URLs and
        ``extra_fields`` (optional dict) is merged into every returned format.
        The HLS request is non-fatal, so the direct format is always present.
        """
        formats = self._extract_m3u8_formats(
            f'{urljoin(self._HLS_URL, slug)}.m3u8?{token}',
            display_id, 'mp4', m3u8_id=f'{format_id}-hls', fatal=False)
        formats.append({
            'format_id': format_id,
            'url': f'{urljoin(self._VIDEO_URL, slug)}?{token}',
        })
        extra_fields = extra_fields or {}
        for f in formats:
            f.update(extra_fields)
        return formats

    def _extract_adaptive_formats(self, data, token, display_id, key_format='%s', extra_fields=None):
        """Collect formats for the 'high' and 'low' entries found in ``data``.

        Each quality name is passed through ``key_format`` and looked up in
        ``data`` both as-is and with an ``ad_`` prefix (audio description).
        Keys that are missing or hold a falsy value are skipped. 'high' gets
        quality -1 and 'low' gets -2; ``extra_fields`` is applied last.
        """
        extra_fields = extra_fields or {}
        formats = []
        additional_key_formats = {
            '%s': {},
            'ad_%s': {
                'format_note': 'Audio description',
                'source_preference': -2
            }
        }
        for additional_key_format, additional_key_fields in additional_key_formats.items():
            for key_quality, key_index in enumerate(('high', 'low')):
                full_key_index = additional_key_format % (key_format % key_index)
                if data.get(full_key_index):
                    formats.extend(self._assemble_formats(data[full_key_index], full_key_index, display_id, token, {
                        'quality': -1 - key_quality,
                        **additional_key_fields,
                        **extra_fields
                    }))
        return formats

    def _perform_login(self, username, password):
        """Log in through the BrainPOP API; warn instead of raising on failure.

        HTTP 400 is accepted so that the error payload can be read. Any status
        code other than 1505 is reported as a failed login.
        """
        login_res = self._download_json(
            'https://api.brainpop.com/api/login', None,
            data=json.dumps({'username': username, 'password': password}).encode(),
            headers={
                'Content-Type': 'application/json',
                'Referer': self._ORIGIN
            }, note='Logging in', errnote='Unable to log in', expected_status=400)
        status_code = int_or_none(login_res['status_code'])
        if status_code != 1505:
            # Resolve the reason before formatting: a formatted string is always
            # truthy, so an `or` placed after it would never be evaluated.
            reason = (
                self._LOGIN_ERRORS.get(status_code)
                or login_res.get('message')
                or f'Got status code {status_code}')
            self.report_warning(f'Unable to login: {reason}')
class BrainPOPIE(BrainPOPBaseIE):
    # Extractor for www.brainpop.com, which serves its metadata from the JSON API
    _ORIGIN = 'https://www.brainpop.com'
    _VIDEO_URL = 'https://svideos.brainpop.com'
    _HLS_URL = 'https://hls.brainpop.com'
    _CDN_URL = 'https://cdn.brainpop.com'
    _TESTS = [{
        'url': 'https://www.brainpop.com/health/conflictresolution/martinlutherkingjr/movie?ref=null',
        'md5': '3ead374233ae74c7f1b0029a01c972f0',
        'info_dict': {
            'id': '1f3259fa457292b4',
            'ext': 'mp4',
            'title': 'Martin Luther King, Jr.',
            'display_id': 'martinlutherkingjr',
            'description': 'md5:f403dbb2bf3ccc7cf4c59d9e43e3c349',
        },
    }, {
        'url': 'https://www.brainpop.com/science/space/bigbang/',
        'md5': '9a1ff0e77444dd9e437354eb669c87ec',
        'info_dict': {
            'id': 'acae52cd48c99acf',
            'ext': 'mp4',
            'title': 'Big Bang',
            'display_id': 'bigbang',
            'description': 'md5:3e53b766b0f116f631b13f4cae185d38',
        },
        'skip': 'Requires login',
    }]

    def _real_extract(self, url):
        """Extract a movie, its localized variants and its subtitles."""
        slug, display_id = self._match_valid_url(url).group('slug', 'id')
        movie_data = self._download_json(
            f'https://api.brainpop.com/api/content/published/bp/en/{slug}/movie?full=1', display_id,
            'Downloading movie data JSON', 'Unable to download movie data')['data']
        # The topic request is optional; fall back to the topic embedded in the movie data
        topic_data = traverse_obj(self._download_json(
            f'https://api.brainpop.com/api/content/published/bp/en/{slug}?full=1', display_id,
            'Downloading topic data JSON', 'Unable to download topic data', fatal=False),
            ('data', 'topic'), expected_type=dict) or movie_data['topic']

        if not traverse_obj(movie_data, ('access', 'allow')):
            # The API may deny access without a textual reason; use a generic
            # message so the substring test and the error below never see None.
            reason = traverse_obj(
                movie_data, ('access', 'reason'), expected_type=str) or 'This movie is not accessible'
            if 'logged' in reason:
                self.raise_login_required(reason, metadata_available=True)
            else:
                self.raise_no_formats(reason, video_id=display_id)
        movie_feature = movie_data['feature']
        movie_feature_data = movie_feature['data']

        formats, subtitles = [], {}
        # Formats of the main feature are preferred over the localized ones
        formats.extend(self._extract_adaptive_formats(movie_feature_data, movie_feature_data.get('token', ''), display_id, '%s_v2', {
            'language': movie_feature.get('language') or 'en',
            'language_preference': 10
        }))
        for lang, localized_feature in traverse_obj(movie_feature, 'localization', default={}, expected_type=dict).items():
            formats.extend(self._extract_adaptive_formats(localized_feature, localized_feature.get('token', ''), display_id, '%s_v2', {
                'language': lang,
                'language_preference': -10
            }))

        # TODO: Do localization fields also have subtitles?
        # Subtitles are the feature fields named "subtitles_<lang>"
        for name, sub_url in movie_feature_data.items():
            lang = self._search_regex(
                r'^subtitles_(?P<lang>\w+)$', name, 'subtitle metadata', default=None)
            if lang and sub_url:
                subtitles.setdefault(lang, []).append({
                    'url': urljoin(self._CDN_URL, sub_url)
                })

        return {
            'id': topic_data['topic_id'],
            'display_id': display_id,
            'title': topic_data.get('name'),
            'description': topic_data.get('synopsis'),
            'formats': formats,
            'subtitles': subtitles,
        }
class BrainPOPLegacyBaseIE(BrainPOPBaseIE):
    # Base for the BrainPOP sites whose topic data is embedded in the page's JavaScript

    def _parse_js_topic_data(self, topic_data, display_id, token):
        """Turn the embedded topic object into an info dict."""
        # TODO: Are there non-burned subtitles?
        adaptive_formats = self._extract_adaptive_formats(
            topic_data['movies'], token, display_id)

        info = {
            'id': topic_data['EntryID'],
            'display_id': display_id,
            'title': topic_data.get('name'),
            'alt_title': topic_data.get('title'),
            'description': topic_data.get('synopsis'),
        }
        info['formats'] = adaptive_formats
        return info

    def _real_extract(self, url):
        """Read the topic object and the video token from the page, then build the info dict."""
        display_id = self._match_valid_url(url).group('id')
        webpage = self._download_webpage(url, display_id)

        content = self._search_json(
            r'var\s+content\s*=\s*', webpage, 'content data',
            display_id, end_pattern=';')
        topic = content['category']['unit']['topic']
        ec_token = self._search_regex(
            r'ec_token\s*:\s*[\'"]([^\'"]+)', webpage, 'video token')

        return self._parse_js_topic_data(topic, display_id, ec_token)
class BrainPOPJrIE(BrainPOPLegacyBaseIE):
# Extractor for jr.brainpop.com; extraction itself is inherited from BrainPOPLegacyBaseIE.
# _ORIGIN feeds the inherited _VALID_URL and the login Referer header;
# _VIDEO_URL and _HLS_URL are the bases the inherited _assemble_formats joins slugs onto.
_ORIGIN = 'https://jr.brainpop.com'
_VIDEO_URL = 'https://svideos-jr.brainpop.com'
_HLS_URL = 'https://hls-jr.brainpop.com'
_CDN_URL = 'https://cdn-jr.brainpop.com'
_TESTS = [{
'url': 'https://jr.brainpop.com/health/feelingsandsel/emotions/',
'md5': '04e0561bb21770f305a0ce6cf0d869ab',
'info_dict': {
'id': '347',
'ext': 'mp4',
'title': 'Emotions',
'display_id': 'emotions',
},
}, {
# This case is skipped because it needs an account
'url': 'https://jr.brainpop.com/science/habitats/arctichabitats/',
'md5': 'b0ed063bbd1910df00220ee29340f5d6',
'info_dict': {
'id': '29',
'ext': 'mp4',
'title': 'Arctic Habitats',
'display_id': 'arctichabitats',
},
'skip': 'Requires login',
}]
class BrainPOPELLIE(BrainPOPLegacyBaseIE):
# Extractor for ell.brainpop.com; extraction itself is inherited from BrainPOPLegacyBaseIE.
# Note that the site host is "ell" while the media hosts below use the "-esl" suffix.
_ORIGIN = 'https://ell.brainpop.com'
_VIDEO_URL = 'https://svideos-esl.brainpop.com'
_HLS_URL = 'https://hls-esl.brainpop.com'
_CDN_URL = 'https://cdn-esl.brainpop.com'
_TESTS = [{
'url': 'https://ell.brainpop.com/level1/unit1/lesson1/',
'md5': 'a2012700cfb774acb7ad2e8834eed0d0',
'info_dict': {
'id': '1',
'ext': 'mp4',
'title': 'Lesson 1',
'display_id': 'lesson1',
'alt_title': 'Personal Pronouns',
},
}, {
# This case is skipped because it needs an account
'url': 'https://ell.brainpop.com/level3/unit6/lesson5/',
'md5': 'be19c8292c87b24aacfb5fda2f3f8363',
'info_dict': {
'id': '101',
'ext': 'mp4',
'title': 'Lesson 5',
'display_id': 'lesson5',
'alt_title': 'Review: Unit 6',
},
'skip': 'Requires login',
}]
class BrainPOPEspIE(BrainPOPLegacyBaseIE):
# Extractor for esp.brainpop.com; extraction itself is inherited from BrainPOPLegacyBaseIE.
# The video and HLS hosts are the same as the main site's; only _CDN_URL has a language path.
IE_DESC = 'BrainPOP Español'
_ORIGIN = 'https://esp.brainpop.com'
_VIDEO_URL = 'https://svideos.brainpop.com'
_HLS_URL = 'https://hls.brainpop.com'
_CDN_URL = 'https://cdn.brainpop.com/mx'
_TESTS = [{
'url': 'https://esp.brainpop.com/ciencia/la_diversidad_de_la_vida/ecosistemas/',
'md5': 'cb3f062db2b3c5240ddfcfde7108f8c9',
'info_dict': {
'id': '3893',
'ext': 'mp4',
'title': 'Ecosistemas',
'display_id': 'ecosistemas',
'description': 'md5:80fc55b07e241f8c8f2aa8d74deaf3c3',
},
}, {
# This case is skipped because it needs an account
'url': 'https://esp.brainpop.com/espanol/la_escritura/emily_dickinson/',
'md5': '98c1b9559e0e33777209c425cda7dac4',
'info_dict': {
'id': '7146',
'ext': 'mp4',
'title': 'Emily Dickinson',
'display_id': 'emily_dickinson',
'description': 'md5:2795ad87b1d239c9711c1e92ab5a978b',
},
'skip': 'Requires login',
}]
class BrainPOPFrIE(BrainPOPLegacyBaseIE):
# Extractor for fr.brainpop.com; extraction itself is inherited from BrainPOPLegacyBaseIE.
# The video and HLS hosts are the same as the main site's; only _CDN_URL has a language path.
IE_DESC = 'BrainPOP Français'
_ORIGIN = 'https://fr.brainpop.com'
_VIDEO_URL = 'https://svideos.brainpop.com'
_HLS_URL = 'https://hls.brainpop.com'
_CDN_URL = 'https://cdn.brainpop.com/fr'
_TESTS = [{
'url': 'https://fr.brainpop.com/sciencesdelaterre/energie/sourcesdenergie/',
'md5': '97e7f48af8af93f8a2be11709f239371',
'info_dict': {
'id': '1651',
'ext': 'mp4',
'title': 'Sources d\'énergie',
'display_id': 'sourcesdenergie',
'description': 'md5:7eece350f019a21ef9f64d4088b2d857',
},
}, {
# This case is skipped because it needs an account
'url': 'https://fr.brainpop.com/francais/ecrire/plagiat/',
'md5': '0cf2b4f89804d0dd4a360a51310d445a',
'info_dict': {
'id': '5803',
'ext': 'mp4',
'title': 'Plagiat',
'display_id': 'plagiat',
'description': 'md5:4496d87127ace28e8b1eda116e77cd2b',
},
'skip': 'Requires login',
}]
class BrainPOPIlIE(BrainPOPLegacyBaseIE):
# Extractor for il.brainpop.com; extraction itself is inherited from BrainPOPLegacyBaseIE.
# The video and HLS hosts are the same as the main site's; only _CDN_URL has a language path.
IE_DESC = 'BrainPOP Hebrew'
_ORIGIN = 'https://il.brainpop.com'
_VIDEO_URL = 'https://svideos.brainpop.com'
_HLS_URL = 'https://hls.brainpop.com'
_CDN_URL = 'https://cdn.brainpop.com/he'
_TESTS = [{
'url': 'https://il.brainpop.com/category_9/subcategory_150/subjects_3782/',
'md5': '9e4ea9dc60ecd385a6e5ca12ccf31641',
'info_dict': {
'id': '3782',
'ext': 'mp4',
# The expected title is given as an md5 digest rather than as literal text
'title': 'md5:e993632fcda0545d9205602ec314ad67',
'display_id': 'subjects_3782',
'description': 'md5:4cc084a8012beb01f037724423a4d4ed',
},
}]

View file

@ -1,117 +1,185 @@
import re
from .adobepass import AdobePassIE
from ..utils import (
smuggle_url,
update_url_query,
int_or_none,
extract_attributes,
float_or_none,
try_get,
dict_get,
get_element_html_by_class,
int_or_none,
merge_dicts,
parse_age_limit,
remove_end,
str_or_none,
traverse_obj,
unescapeHTML,
unified_timestamp,
update_url_query,
url_or_none,
)
class BravoTVIE(AdobePassIE):
_VALID_URL = r'https?://(?:www\.)?(?P<req_id>bravotv|oxygen)\.com/(?:[^/]+/)+(?P<id>[^/?#]+)'
_VALID_URL = r'https?://(?:www\.)?(?P<site>bravotv|oxygen)\.com/(?:[^/]+/)+(?P<id>[^/?#]+)'
_TESTS = [{
'url': 'https://www.bravotv.com/top-chef/season-16/episode-15/videos/the-top-chef-season-16-winner-is',
'md5': 'e34684cfea2a96cd2ee1ef3a60909de9',
'info_dict': {
'id': 'epL0pmK1kQlT',
'id': '3923059',
'ext': 'mp4',
'title': 'The Top Chef Season 16 Winner Is...',
'description': 'Find out who takes the title of Top Chef!',
'uploader': 'NBCU-BRAV',
'upload_date': '20190314',
'timestamp': 1552591860,
'season_number': 16,
'episode_number': 15,
'series': 'Top Chef',
'episode': 'The Top Chef Season 16 Winner Is...',
'duration': 190.0,
}
'duration': 190.357,
'season': 'Season 16',
'thumbnail': r're:^https://.+\.jpg',
},
'params': {'skip_download': 'm3u8'},
}, {
'url': 'http://www.bravotv.com/below-deck/season-3/ep-14-reunion-part-1',
'only_matching': True,
'url': 'https://www.bravotv.com/top-chef/season-20/episode-1/london-calling',
'info_dict': {
'id': '9000234570',
'ext': 'mp4',
'title': 'London Calling',
'description': 'md5:5af95a8cbac1856bd10e7562f86bb759',
'upload_date': '20230310',
'timestamp': 1678410000,
'season_number': 20,
'episode_number': 1,
'series': 'Top Chef',
'episode': 'London Calling',
'duration': 3266.03,
'season': 'Season 20',
'chapters': 'count:7',
'thumbnail': r're:^https://.+\.jpg',
'age_limit': 14,
},
'params': {'skip_download': 'm3u8'},
'skip': 'This video requires AdobePass MSO credentials',
}, {
'url': 'https://www.oxygen.com/in-ice-cold-blood/season-1/closing-night',
'info_dict': {
'id': '3692045',
'ext': 'mp4',
'title': 'Closing Night',
'description': 'md5:3170065c5c2f19548d72a4cbc254af63',
'upload_date': '20180401',
'timestamp': 1522623600,
'season_number': 1,
'episode_number': 1,
'series': 'In Ice Cold Blood',
'episode': 'Closing Night',
'duration': 2629.051,
'season': 'Season 1',
'chapters': 'count:6',
'thumbnail': r're:^https://.+\.jpg',
'age_limit': 14,
},
'params': {'skip_download': 'm3u8'},
'skip': 'This video requires AdobePass MSO credentials',
}, {
'url': 'https://www.oxygen.com/in-ice-cold-blood/season-2/episode-16/videos/handling-the-horwitz-house-after-the-murder-season-2',
'info_dict': {
'id': '3974019',
'ext': 'mp4',
'title': '\'Handling The Horwitz House After The Murder (Season 2, Episode 16)',
'description': 'md5:f9d638dd6946a1c1c0533a9c6100eae5',
'upload_date': '20190617',
'timestamp': 1560790800,
'season_number': 2,
'episode_number': 16,
'series': 'In Ice Cold Blood',
'episode': '\'Handling The Horwitz House After The Murder (Season 2, Episode 16)',
'duration': 68.235,
'season': 'Season 2',
'thumbnail': r're:^https://.+\.jpg',
'age_limit': 14,
},
'params': {'skip_download': 'm3u8'},
}, {
'url': 'https://www.bravotv.com/below-deck/season-3/ep-14-reunion-part-1',
'only_matching': True,
}]
def _real_extract(self, url):
site, display_id = self._match_valid_url(url).groups()
site, display_id = self._match_valid_url(url).group('site', 'id')
webpage = self._download_webpage(url, display_id)
settings = self._parse_json(self._search_regex(
r'<script[^>]+data-drupal-selector="drupal-settings-json"[^>]*>({.+?})</script>', webpage, 'drupal settings'),
display_id)
info = {}
settings = self._search_json(
r'<script[^>]+data-drupal-selector="drupal-settings-json"[^>]*>', webpage, 'settings', display_id)
tve = extract_attributes(get_element_html_by_class('tve-video-deck-app', webpage) or '')
query = {
'mbr': 'true',
'manifest': 'm3u',
'formats': 'm3u,mpeg4',
}
account_pid, release_pid = [None] * 2
tve = settings.get('ls_tve')
if tve:
query['manifest'] = 'm3u'
mobj = re.search(r'<[^>]+id="pdk-player"[^>]+data-url=["\']?(?:https?:)?//player\.theplatform\.com/p/([^/]+)/(?:[^/]+/)*select/([^?#&"\']+)', webpage)
if mobj:
account_pid, tp_path = mobj.groups()
release_pid = tp_path.strip('/').split('/')[-1]
else:
account_pid = 'HNK2IC'
tp_path = release_pid = tve['release_pid']
if tve.get('entitlement') == 'auth':
adobe_pass = settings.get('tve_adobe_auth', {})
if site == 'bravotv':
site = 'bravo'
account_pid = tve.get('data-mpx-media-account-pid') or 'HNK2IC'
account_id = tve['data-mpx-media-account-id']
metadata = self._parse_json(
tve.get('data-normalized-video', ''), display_id, fatal=False, transform_source=unescapeHTML)
video_id = tve.get('data-guid') or metadata['guid']
if tve.get('data-entitlement') == 'auth':
auth = traverse_obj(settings, ('tve_adobe_auth', {dict})) or {}
site = remove_end(site, 'tv')
release_pid = tve['data-release-pid']
resource = self._get_mvpd_resource(
adobe_pass.get('adobePassResourceId') or site,
tve['title'], release_pid, tve.get('rating'))
query['auth'] = self._extract_mvpd_auth(
url, release_pid,
adobe_pass.get('adobePassRequestorId') or site, resource)
tve.get('data-adobe-pass-resource-id') or auth.get('adobePassResourceId') or site,
tve['data-title'], release_pid, tve.get('data-rating'))
query.update({
'switch': 'HLSServiceSecure',
'auth': self._extract_mvpd_auth(
url, release_pid, auth.get('adobePassRequestorId') or site, resource),
})
else:
shared_playlist = settings['ls_playlist']
account_pid = shared_playlist['account_pid']
metadata = shared_playlist['video_metadata'][shared_playlist['default_clip']]
tp_path = release_pid = metadata.get('release_pid')
if not release_pid:
release_pid = metadata['guid']
tp_path = 'media/guid/2140479951/' + release_pid
info.update({
'title': metadata['title'],
'description': metadata.get('description'),
'season_number': int_or_none(metadata.get('season_num')),
'episode_number': int_or_none(metadata.get('episode_num')),
})
query['switch'] = 'progressive'
tp_url = 'http://link.theplatform.com/s/%s/%s' % (account_pid, tp_path)
ls_playlist = traverse_obj(settings, ('ls_playlist', ..., {dict}), get_all=False) or {}
account_pid = ls_playlist.get('mpxMediaAccountPid') or 'PHSl-B'
account_id = ls_playlist['mpxMediaAccountId']
video_id = ls_playlist['defaultGuid']
metadata = traverse_obj(
ls_playlist, ('videos', lambda _, v: v['guid'] == video_id, {dict}), get_all=False)
tp_url = f'https://link.theplatform.com/s/{account_pid}/media/guid/{account_id}/{video_id}'
tp_metadata = self._download_json(
update_url_query(tp_url, {'format': 'preview'}),
display_id, fatal=False)
if tp_metadata:
info.update({
'title': tp_metadata.get('title'),
'description': tp_metadata.get('description'),
'duration': float_or_none(tp_metadata.get('duration'), 1000),
'season_number': int_or_none(
dict_get(tp_metadata, ('pl1$seasonNumber', 'nbcu$seasonNumber'))),
'episode_number': int_or_none(
dict_get(tp_metadata, ('pl1$episodeNumber', 'nbcu$episodeNumber'))),
# For some reason the series is sometimes wrapped into a single element array.
'series': try_get(
dict_get(tp_metadata, ('pl1$show', 'nbcu$show')),
lambda x: x[0] if isinstance(x, list) else x,
expected_type=str),
'episode': dict_get(
tp_metadata, ('pl1$episodeName', 'nbcu$episodeName', 'title')),
})
update_url_query(tp_url, {'format': 'preview'}), video_id, fatal=False)
info.update({
'_type': 'url_transparent',
'id': release_pid,
'url': smuggle_url(update_url_query(tp_url, query), {'force_smil_url': True}),
'ie_key': 'ThePlatform',
})
return info
seconds_or_none = lambda x: float_or_none(x, 1000)
chapters = traverse_obj(tp_metadata, ('chapters', ..., {
'start_time': ('startTime', {seconds_or_none}),
'end_time': ('endTime', {seconds_or_none}),
}))
# prune pointless single chapters that span the entire duration from short videos
if len(chapters) == 1 and not traverse_obj(chapters, (0, 'end_time')):
chapters = None
formats, subtitles = self._extract_m3u8_formats_and_subtitles(
update_url_query(f'{tp_url}/stream.m3u8', query), video_id, 'mp4', m3u8_id='hls')
return {
'id': video_id,
'formats': formats,
'subtitles': subtitles,
'chapters': chapters,
**merge_dicts(traverse_obj(tp_metadata, {
'title': 'title',
'description': 'description',
'duration': ('duration', {seconds_or_none}),
'timestamp': ('pubDate', {seconds_or_none}),
'season_number': (('pl1$seasonNumber', 'nbcu$seasonNumber'), {int_or_none}),
'episode_number': (('pl1$episodeNumber', 'nbcu$episodeNumber'), {int_or_none}),
'series': (('pl1$show', 'nbcu$show'), (None, ...), {str}),
'episode': (('title', 'pl1$episodeNumber', 'nbcu$episodeNumber'), {str_or_none}),
'age_limit': ('ratings', ..., 'rating', {parse_age_limit}),
}, get_all=False), traverse_obj(metadata, {
'title': 'title',
'description': 'description',
'duration': ('durationInSeconds', {int_or_none}),
'timestamp': ('airDate', {unified_timestamp}),
'thumbnail': ('thumbnailUrl', {url_or_none}),
'season_number': ('seasonNumber', {int_or_none}),
'episode_number': ('episodeNumber', {int_or_none}),
'episode': 'episodeTitle',
'series': 'show',
}))
}

View file

@ -575,6 +575,7 @@ def build_format_id(kind):
self.raise_no_formats(
error.get('message') or error.get('error_subcode') or error['error_code'], expected=True)
headers.pop('Authorization', None) # or else http formats will give error 400
for f in formats:
f.setdefault('http_headers', {}).update(headers)
@ -895,8 +896,9 @@ def extract_policy_key():
store_pk(policy_key)
return policy_key
api_url = 'https://edge.api.brightcove.com/playback/v1/accounts/%s/%ss/%s' % (account_id, content_type, video_id)
headers = {}
token = smuggled_data.get('token')
api_url = f'https://{"edge-auth" if token else "edge"}.api.brightcove.com/playback/v1/accounts/{account_id}/{content_type}s/{video_id}'
headers = {'Authorization': f'Bearer {token}'} if token else {}
referrer = smuggled_data.get('referrer') # XXX: notice the spelling/case of the key
if referrer:
headers.update({

View file

@ -8,14 +8,16 @@
compat_str,
)
from ..utils import (
ExtractorError,
int_or_none,
join_nonempty,
js_to_json,
orderedSet,
parse_iso8601,
smuggle_url,
strip_or_none,
traverse_obj,
try_get,
ExtractorError,
)
@ -404,7 +406,7 @@ def _real_extract(self, url):
class CBCGemPlaylistIE(InfoExtractor):
IE_NAME = 'gem.cbc.ca:playlist'
_VALID_URL = r'https?://gem\.cbc\.ca/media/(?P<id>(?P<show>[0-9a-z-]+)/s(?P<season>[0-9]+))/?(?:[?#]|$)'
_VALID_URL = r'https?://gem\.cbc\.ca/(?:media/)?(?P<id>(?P<show>[0-9a-z-]+)/s(?P<season>[0-9]+))/?(?:[?#]|$)'
_TESTS = [{
# TV show playlist, all public videos
'url': 'https://gem.cbc.ca/media/schitts-creek/s06',
@ -414,6 +416,9 @@ class CBCGemPlaylistIE(InfoExtractor):
'title': 'Season 6',
'description': 'md5:6a92104a56cbeb5818cc47884d4326a2',
},
}, {
'url': 'https://gem.cbc.ca/schitts-creek/s06',
'only_matching': True,
}]
_API_BASE = 'https://services.radio-canada.ca/ott/cbc-api/v2/shows/'
@ -473,8 +478,9 @@ def _real_extract(self, url):
class CBCGemLiveIE(InfoExtractor):
IE_NAME = 'gem.cbc.ca:live'
_VALID_URL = r'https?://gem\.cbc\.ca/live/(?P<id>\d+)'
_TEST = {
_VALID_URL = r'https?://gem\.cbc\.ca/live(?:-event)?/(?P<id>\d+)'
_TESTS = [
{
'url': 'https://gem.cbc.ca/live/920604739687',
'info_dict': {
'title': 'Ottawa',
@ -488,34 +494,74 @@ class CBCGemLiveIE(InfoExtractor):
'uploader': 'CBCC-NEW',
},
'skip': 'Live might have ended',
},
{
'url': 'https://gem.cbc.ca/live/44',
'info_dict': {
'id': '44',
'ext': 'mp4',
'is_live': True,
'title': r're:^Ottawa [0-9\-: ]+',
'description': 'The live TV channel and local programming from Ottawa',
'live_status': 'is_live',
'thumbnail': r're:https://images.gem.cbc.ca/v1/cbc-gem/live/.*'
},
'params': {'skip_download': True},
'skip': 'Live might have ended',
},
{
'url': 'https://gem.cbc.ca/live-event/10835',
'info_dict': {
'id': '10835',
'ext': 'mp4',
'is_live': True,
'title': r're:^The National \| Bidens trip wraps up, Paltrow testifies, Bird flu [0-9\-: ]+',
'description': 'March 24, 2023 | President Bidens Ottawa visit ends with big pledges from both countries. Plus, Gwyneth Paltrow testifies in her ski collision trial.',
'live_status': 'is_live',
'thumbnail': r're:https://images.gem.cbc.ca/v1/cbc-gem/live/.*',
'timestamp': 1679706000,
'upload_date': '20230325',
},
'params': {'skip_download': True},
'skip': 'Live might have ended',
}
# It's unclear where the chars at the end come from, but they appear to be
# constant. Might need updating in the future.
# There are two URLs, some livestreams are in one, and some
# in the other. The JSON schema is the same for both.
_API_URLS = ['https://tpfeed.cbc.ca/f/ExhSPC/t_t3UKJR6MAT', 'https://tpfeed.cbc.ca/f/ExhSPC/FNiv9xQx_BnT']
]
def _real_extract(self, url):
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
video_info = self._search_nextjs_data(webpage, video_id)['props']['pageProps']['data']
for api_url in self._API_URLS:
video_info = next((
stream for stream in self._download_json(api_url, video_id)['entries']
if stream.get('guid') == video_id), None)
if video_info:
break
else:
# Two types of metadata JSON
if not video_info.get('formattedIdMedia'):
video_info = traverse_obj(
video_info, (('freeTv', ('streams', ...)), 'items', lambda _, v: v['key'] == video_id, {dict}),
get_all=False, default={})
video_stream_id = video_info.get('formattedIdMedia')
if not video_stream_id:
raise ExtractorError('Couldn\'t find video metadata, maybe this livestream is now offline', expected=True)
stream_data = self._download_json(
'https://services.radio-canada.ca/media/validation/v2/', video_id, query={
'appCode': 'mpx',
'connectionType': 'hd',
'deviceType': 'ipad',
'idMedia': video_stream_id,
'multibitrate': 'true',
'output': 'json',
'tech': 'hls',
'manifestType': 'desktop',
})
return {
'_type': 'url_transparent',
'ie_key': 'ThePlatform',
'url': video_info['content'][0]['url'],
'id': video_id,
'title': video_info.get('title'),
'description': video_info.get('description'),
'tags': try_get(video_info, lambda x: x['keywords'].split(', ')),
'thumbnail': video_info.get('cbc$staticImage'),
'formats': self._extract_m3u8_formats(stream_data['url'], video_id, 'mp4', live=True),
'is_live': True,
**traverse_obj(video_info, {
'title': 'title',
'description': 'description',
'thumbnail': ('images', 'card', 'url'),
'timestamp': ('airDate', {parse_iso8601}),
})
}

View file

@ -1,8 +1,14 @@
from .brightcove import BrightcoveNewIE
from .common import InfoExtractor
from .theplatform import ThePlatformFeedIE
from .youtube import YoutubeIE
from ..utils import (
ExtractorError,
extract_attributes,
get_element_html_by_id,
int_or_none,
find_xpath_attr,
smuggle_url,
xpath_element,
xpath_text,
update_url_query,
@ -162,3 +168,110 @@ def _extract_video_info(self, content_id, site='cbs', mpx_acc=2198311517):
'duration': int_or_none(xpath_text(video_data, 'videoLength'), 1000),
'thumbnail': url_or_none(xpath_text(video_data, 'previewImageURL')),
})
class ParamountPressExpressIE(InfoExtractor):
    """Extractor for paramountpressexpress.com video pages.

    Pages whose path contains ``yt-video`` are handed to the YouTube
    extractor; all other pages are resolved to a Brightcove player URL.
    """
    _VALID_URL = r'https?://(?:www\.)?paramountpressexpress\.com(?:/[\w-]+)+/(?P<yt>yt-)?video/?\?watch=(?P<id>[\w-]+)'
    _TESTS = [{
        'url': 'https://www.paramountpressexpress.com/cbs-entertainment/shows/survivor/video/?watch=pnzew7e2hx',
        'md5': '56631dbcadaab980d1fc47cb7b76cba4',
        'info_dict': {
            'id': '6322981580112',
            'ext': 'mp4',
            'title': 'Im Felicia',
            'description': 'md5:88fad93f8eede1c9c8f390239e4c6290',
            'uploader_id': '6055873637001',
            'upload_date': '20230320',
            'timestamp': 1679334960,
            'duration': 49.557,
            'thumbnail': r're:^https://.+\.jpg',
            'tags': [],
        },
    }, {
        'url': 'https://www.paramountpressexpress.com/cbs-entertainment/video/?watch=2s5eh8kppc',
        'md5': 'edcb03e3210b88a3e56c05aa863e0e5b',
        'info_dict': {
            'id': '6323036027112',
            'ext': 'mp4',
            'title': 'Y&R Set Visit: Jerry OConnell Quizzes Cast on Pre-Love Scene Rituals and More',
            'description': 'md5:b929867a357aac5544b783d834c78383',
            'uploader_id': '6055873637001',
            'upload_date': '20230321',
            'timestamp': 1679430180,
            'duration': 132.032,
            'thumbnail': r're:^https://.+\.jpg',
            'tags': [],
        },
    }, {
        'url': 'https://www.paramountpressexpress.com/paramount-plus/yt-video/?watch=OX9wJWOcqck',
        'info_dict': {
            'id': 'OX9wJWOcqck',
            'ext': 'mp4',
            'title': 'Rugrats | Season 2 Official Trailer | Paramount+',
            'description': 'md5:1f7e26f5625a9f0d6564d9ad97a9f7de',
            'uploader': 'Paramount Plus',
            'uploader_id': '@paramountplus',
            'uploader_url': 'http://www.youtube.com/@paramountplus',
            'channel': 'Paramount Plus',
            'channel_id': 'UCrRttZIypNTA1Mrfwo745Sg',
            'channel_url': 'https://www.youtube.com/channel/UCrRttZIypNTA1Mrfwo745Sg',
            'upload_date': '20230316',
            'duration': 88,
            'age_limit': 0,
            'availability': 'public',
            'live_status': 'not_live',
            'playable_in_embed': True,
            'view_count': int,
            'like_count': int,
            'channel_follower_count': int,
            'thumbnail': 'https://i.ytimg.com/vi/OX9wJWOcqck/maxresdefault.jpg',
            'categories': ['Entertainment'],
            'tags': ['Rugrats'],
        },
    }, {
        'url': 'https://www.paramountpressexpress.com/showtime/yt-video/?watch=_ljssSoDLkw',
        'info_dict': {
            'id': '_ljssSoDLkw',
            'ext': 'mp4',
            'title': 'Lavell Crawford: THEE Lavell Crawford Comedy Special Official Trailer | SHOWTIME',
            'description': 'md5:39581bcc3fd810209b642609f448af70',
            'uploader': 'SHOWTIME',
            'uploader_id': '@Showtime',
            'uploader_url': 'http://www.youtube.com/@Showtime',
            'channel': 'SHOWTIME',
            'channel_id': 'UCtwMWJr2BFPkuJTnSvCESSQ',
            'channel_url': 'https://www.youtube.com/channel/UCtwMWJr2BFPkuJTnSvCESSQ',
            'upload_date': '20230209',
            'duration': 49,
            'age_limit': 0,
            'availability': 'public',
            'live_status': 'not_live',
            'playable_in_embed': True,
            'view_count': int,
            'like_count': int,
            'comment_count': int,
            'channel_follower_count': int,
            'thumbnail': 'https://i.ytimg.com/vi_webp/_ljssSoDLkw/maxresdefault.webp',
            'categories': ['People & Blogs'],
            'tags': 'count:27',
        },
    }]

    def _real_extract(self, url):
        """Return a URL result pointing at YouTube or at the Brightcove player.

        For non-YouTube pages the token scraped from the page is smuggled
        into the Brightcove URL under the ``token`` key.
        """
        mobj = self._match_valid_url(url)
        display_id = mobj.group('id')
        if mobj.group('yt'):
            # The ``watch`` value of a yt-video page is passed straight to YoutubeIE
            return self.url_result(display_id, YoutubeIE)

        webpage = self._download_webpage(url, display_id)
        video_id = self._search_regex(
            r'\bvideo_id\s*=\s*["\'](\d+)["\']\s*,', webpage, 'Brightcove ID')
        token = self._search_regex(r'\btoken\s*=\s*["\']([\w.-]+)["\']', webpage, 'token')

        # Player attributes are optional; fall back to hard-coded values when absent
        player_html = get_element_html_by_id('vcbrightcoveplayer', webpage)
        player = extract_attributes(player_html or '')
        account_id = player.get('data-account') or '6055873637001'
        player_id = player.get('data-player') or 'OtLKgXlO9F'
        embed = player.get('data-embed') or 'default'

        brightcove_url = (
            f'https://players.brightcove.net/{account_id}/{player_id}_{embed}/index.html?videoId={video_id}')
        return self.url_result(smuggle_url(brightcove_url, {'token': token}), BrightcoveNewIE)

View file

@ -0,0 +1,61 @@
from .common import InfoExtractor
from ..utils import (
ExtractorError,
traverse_obj,
unified_timestamp,
url_or_none,
)
class ClipchampIE(InfoExtractor):
    """Extractor for ``clipchamp.com/watch/<id>`` pages."""
    _VALID_URL = r'https?://(?:www\.)?clipchamp\.com/watch/(?P<id>[\w-]+)'
    _TESTS = [{
        'url': 'https://clipchamp.com/watch/gRXZ4ZhdDaU',
        'info_dict': {
            'id': 'gRXZ4ZhdDaU',
            'ext': 'mp4',
            'title': 'Untitled video',
            'uploader': 'Alexander Schwartz',
            'timestamp': 1680805580,
            'upload_date': '20230406',
            'thumbnail': r're:^https?://.+\.jpg',
        },
        'params': {'skip_download': 'm3u8'},
    }]

    # Manifest URL template: (subdomain, stream path, manifest extension)
    _STREAM_URL_TMPL = 'https://%s.cloudflarestream.com/%s/manifest/video.%s'
    _STREAM_URL_QUERY = {'parentOrigin': 'https://clipchamp.com'}

    def _real_extract(self, url):
        """Build the info dict from the page's Next.js data and the stream manifests.

        Raises ExtractorError when the clip's ``storage_location`` is anything
        other than ``cf_stream``.
        """
        video_id = self._match_id(url)
        page = self._download_webpage(url, video_id)
        video = self._search_nextjs_data(page, video_id)['props']['pageProps']['video']

        location = video.get('storage_location')
        if location != 'cf_stream':
            raise ExtractorError(f'Unsupported clip storage location "{location}"')

        stream_path = video['download_url']
        player_page = self._download_webpage(
            f'https://iframe.cloudflarestream.com/{stream_path}', video_id, 'Downloading player iframe')
        # Fall back to a hard-coded prefix when the iframe does not expose one
        prefix = self._search_regex(
            r'\bcustomer-domain-prefix=["\']([\w-]+)["\']', player_page,
            'subdomain', fatal=False) or 'customer-2ut9yn3y6fta1yxe'

        dash_url = self._STREAM_URL_TMPL % (prefix, stream_path, 'mpd')
        hls_url = self._STREAM_URL_TMPL % (prefix, stream_path, 'm3u8')

        formats = self._extract_mpd_formats(
            dash_url, video_id, query=self._STREAM_URL_QUERY, fatal=False, mpd_id='dash')
        formats.extend(self._extract_m3u8_formats(
            hls_url, video_id, 'mp4', query=self._STREAM_URL_QUERY, fatal=False, m3u8_id='hls'))

        creator_names = traverse_obj(video, ('creator', ('first_name', 'last_name'), {str}))
        metadata = traverse_obj(video, {
            'title': ('project', 'project_name', {str}),
            'timestamp': ('created_at', {unified_timestamp}),
            'thumbnail': ('thumbnail_url', {url_or_none}),
        })

        return {
            'id': video_id,
            'formats': formats,
            'uploader': ' '.join(creator_names) or None,
            **metadata,
        }

View file

@ -2998,6 +2998,8 @@ def _parse_ism_formats_and_subtitles(self, ism_doc, ism_url, ism_id=None):
'protocol': 'ism',
'fragments': fragments,
'has_drm': ism_doc.find('Protection') is not None,
'language': stream_language,
'audio_channels': int_or_none(track.get('Channels')),
'_download_params': {
'stream_type': stream_type,
'duration': duration,
@ -3528,7 +3530,7 @@ def _RETURN_TYPE(cls):
@classmethod
def is_single_video(cls, url):
    """Returns whether the URL is of a single video, None if unknown"""
    # No assert here: an unsuitable URL is an "unknown" case and yields None
    # (asserts are also stripped under -O, so they must not validate input).
    if cls.suitable(url):
        # _RETURN_TYPE values other than 'video'/'playlist' map to None
        return {'video': True, 'playlist': False}.get(cls._RETURN_TYPE)
@classmethod
@ -3671,18 +3673,22 @@ def _extract_chapters_helper(self, chapter_list, start_function, title_function,
'start_time': start_function(chapter),
'title': title_function(chapter),
} for chapter in chapter_list or []]
if not strict:
if strict:
warn = self.report_warning
else:
warn = self.write_debug
chapter_list.sort(key=lambda c: c['start_time'] or 0)
chapters = [{'start_time': 0}]
for idx, chapter in enumerate(chapter_list):
if chapter['start_time'] is None:
self.report_warning(f'Incomplete chapter {idx}')
warn(f'Incomplete chapter {idx}')
elif chapters[-1]['start_time'] <= chapter['start_time'] <= duration:
chapters.append(chapter)
elif chapter not in chapters:
self.report_warning(
f'Invalid start time ({chapter["start_time"]} < {chapters[-1]["start_time"]}) for chapter "{chapter["title"]}"')
issue = (f'{chapter["start_time"]} > {duration}' if chapter['start_time'] > duration
else f'{chapter["start_time"]} < {chapters[-1]["start_time"]}')
warn(f'Invalid start time ({issue}) for chapter "{chapter["title"]}"')
return chapters[1:]
def _extract_chapters_from_description(self, description, duration):

192
yt_dlp/extractor/dlf.py Normal file
View file

@ -0,0 +1,192 @@
import re
from .common import InfoExtractor
from ..utils import (
determine_ext,
extract_attributes,
int_or_none,
traverse_obj,
url_or_none,
)
class DLFBaseIE(InfoExtractor):
    """Shared pieces for the deutschlandfunk.de extractors."""
    _VALID_URL_BASE = r'https?://(?:www\.)?deutschlandfunk\.de/'
    # Captures the whole opening <button> tag of a "listen" button
    _BUTTON_REGEX = r'(<button[^>]+alt="Anhören"[^>]+data-audio-diraid[^>]*>)'

    def _parse_button_attrs(self, button, audio_id=None):
        """Turn one audio ``<button ...>`` tag into an info dict.

        @param button    HTML of the opening button tag
        @param audio_id  ID to use; when falsy, the ``data-audio-diraid``
                         attribute is used (a missing attribute raises KeyError)
        @returns         info dict attributed to DLFIE; ``formats`` is empty
                         when the button carries no usable audio URL
        """
        attrs = extract_attributes(button)
        audio_id = audio_id or attrs['data-audio-diraid']

        # First attribute holding a valid URL wins
        url = traverse_obj(
            attrs, 'data-audio-download-src', 'data-audio', 'data-audioreference',
            'data-audio-src', expected_type=url_or_none)

        metadata = traverse_obj(attrs, {
            'title': (('data-audiotitle', 'data-audio-title', 'data-audio-download-tracking-title'), {str}),
            'duration': (('data-audioduration', 'data-audio-duration'), {int_or_none}),
            'thumbnail': ('data-audioimage', {url_or_none}),
            'uploader': 'data-audio-producer',
            'series': 'data-audio-series',
            'channel': 'data-audio-origin-site-name',
            'webpage_url': ('data-audio-download-tracking-path', {url_or_none}),
        }, get_all=False)

        if not url:
            # Do not emit a format entry whose 'url' is None
            formats = []
        else:
            ext = determine_ext(url)
            if ext == 'm3u8':
                formats = self._extract_m3u8_formats(url, audio_id, fatal=False)
            else:
                formats = [{'url': url, 'ext': ext, 'vcodec': 'none'}]

        return {
            'id': audio_id,
            'extractor_key': DLFIE.ie_key(),
            'extractor': DLFIE.IE_NAME,
            **metadata,
            'formats': formats,
        }
class DLFIE(DLFBaseIE):
    """Extractor for single deutschlandfunk.de audio pages (``...-dlf-<8 hex>-100.html``)."""
    IE_NAME = 'dlf'
    _VALID_URL = DLFBaseIE._VALID_URL_BASE + r'[\w-]+-dlf-(?P<id>[\da-f]{8})-100\.html'
    _TESTS = [
        # Audio as an HLS stream
        {
            'url': 'https://www.deutschlandfunk.de/tanz-der-saiteninstrumente-das-wild-strings-trio-aus-slowenien-dlf-03a3eb19-100.html',
            'info_dict': {
                'id': '03a3eb19',
                'title': r're:Tanz der Saiteninstrumente [-/] Das Wild Strings Trio aus Slowenien',
                'ext': 'm4a',
                'duration': 3298,
                'thumbnail': 'https://assets.deutschlandfunk.de/FALLBACK-IMAGE-AUDIO/512x512.png?t=1603714364673',
                'uploader': 'Deutschlandfunk',
                'series': 'On Stage',
                'channel': 'deutschlandfunk'
            },
            'params': {
                'skip_download': 'm3u8'
            },
            'skip': 'This webpage no longer exists'
        }, {
            'url': 'https://www.deutschlandfunk.de/russische-athleten-kehren-zurueck-auf-die-sportbuehne-ein-gefaehrlicher-tueroeffner-dlf-d9cc1856-100.html',
            'info_dict': {
                'id': 'd9cc1856',
                'title': 'Russische Athleten kehren zurück auf die Sportbühne: Ein gefährlicher Türöffner',
                'ext': 'mp3',
                'duration': 291,
                'thumbnail': 'https://assets.deutschlandfunk.de/FALLBACK-IMAGE-AUDIO/512x512.png?t=1603714364673',
                'uploader': 'Deutschlandfunk',
                'series': 'Kommentare und Themen der Woche',
                'channel': 'deutschlandfunk'
            }
        },
    ]

    def _real_extract(self, url):
        """Locate the audio button on the page and convert it into an info dict."""
        audio_id = self._match_id(url)
        page = self._download_webpage(url, audio_id)
        button = self._search_regex(self._BUTTON_REGEX, page, 'button')
        # The ID from the URL takes precedence over the button's own ID attribute
        return self._parse_button_attrs(button, audio_id)
class DLFCorpusIE(DLFBaseIE):
    """Playlist extractor for deutschlandfunk.de pages that list several audio buttons."""
    IE_NAME = 'dlf:corpus'
    IE_DESC = 'DLF Multi-feed Archives'
    # The negative lookahead excludes the single-audio URLs matched by DLFIE
    _VALID_URL = DLFBaseIE._VALID_URL_BASE + r'(?P<id>(?![\w-]+-dlf-[\da-f]{8})[\w-]+-\d+)\.html'
    _TESTS = [
        # Recorded news broadcast with referrals to related broadcasts
        {
            'url': 'https://www.deutschlandfunk.de/fechten-russland-belarus-ukraine-protest-100.html',
            'info_dict': {
                'id': 'fechten-russland-belarus-ukraine-protest-100',
                'title': r're:Wiederzulassung als neutrale Athleten [-/] Was die Rückkehr russischer und belarussischer Sportler beim Fechten bedeutet',
                'description': 'md5:91340aab29c71aa7518ad5be13d1e8ad'
            },
            'playlist_mincount': 5,
            'playlist': [{
                'info_dict': {
                    'id': '1fc5d64a',
                    'title': r're:Wiederzulassung als neutrale Athleten [-/] Was die Rückkehr russischer und belarussischer Sportler beim Fechten bedeutet',
                    'ext': 'mp3',
                    'duration': 252,
                    'thumbnail': 'https://assets.deutschlandfunk.de/aad16241-6b76-4a09-958b-96d0ee1d6f57/512x512.jpg?t=1679480020313',
                    'uploader': 'Deutschlandfunk',
                    'series': 'Sport',
                    'channel': 'deutschlandfunk'
                }
            }, {
                'info_dict': {
                    'id': '2ada145f',
                    'title': r're:(?:Sportpolitik / )?Fechtverband votiert für Rückkehr russischer Athleten',
                    'ext': 'mp3',
                    'duration': 336,
                    'thumbnail': 'https://assets.deutschlandfunk.de/FILE_93982766f7317df30409b8a184ac044a/512x512.jpg?t=1678547581005',
                    'uploader': 'Deutschlandfunk',
                    'series': 'Deutschlandfunk Nova',
                    'channel': 'deutschlandfunk-nova'
                }
            }, {
                'info_dict': {
                    'id': '5e55e8c9',
                    'title': r're:Wiederzulassung von Russland und Belarus [-/] "Herumlavieren" des Fechter-Bundes sorgt für Unverständnis',
                    'ext': 'mp3',
                    'duration': 187,
                    'thumbnail': 'https://assets.deutschlandfunk.de/a595989d-1ed1-4a2e-8370-b64d7f11d757/512x512.jpg?t=1679173825412',
                    'uploader': 'Deutschlandfunk',
                    'series': 'Sport am Samstag',
                    'channel': 'deutschlandfunk'
                }
            }, {
                'info_dict': {
                    'id': '47e1a096',
                    'title': r're:Rückkehr Russlands im Fechten [-/] "Fassungslos, dass es einfach so passiert ist"',
                    'ext': 'mp3',
                    'duration': 602,
                    'thumbnail': 'https://assets.deutschlandfunk.de/da4c494a-21cc-48b4-9cc7-40e09fd442c2/512x512.jpg?t=1678562155770',
                    'uploader': 'Deutschlandfunk',
                    'series': 'Sport am Samstag',
                    'channel': 'deutschlandfunk'
                }
            }, {
                'info_dict': {
                    'id': '5e55e8c9',
                    'title': r're:Wiederzulassung von Russland und Belarus [-/] "Herumlavieren" des Fechter-Bundes sorgt für Unverständnis',
                    'ext': 'mp3',
                    'duration': 187,
                    'thumbnail': 'https://assets.deutschlandfunk.de/a595989d-1ed1-4a2e-8370-b64d7f11d757/512x512.jpg?t=1679173825412',
                    'uploader': 'Deutschlandfunk',
                    'series': 'Sport am Samstag',
                    'channel': 'deutschlandfunk'
                }
            }]
        },
        # Podcast feed with tag buttons, playlist count fluctuates
        {
            'url': 'https://www.deutschlandfunk.de/kommentare-und-themen-der-woche-100.html',
            'info_dict': {
                'id': 'kommentare-und-themen-der-woche-100',
                'title': 'Meinung - Kommentare und Themen der Woche',
                'description': 'md5:2901bbd65cd2d45e116d399a099ce5d5',
            },
            'playlist_mincount': 10,
        },
        # Podcast feed with no description
        {
            'url': 'https://www.deutschlandfunk.de/podcast-tolle-idee-100.html',
            'info_dict': {
                'id': 'podcast-tolle-idee-100',
                'title': 'Wissenschaftspodcast - Tolle Idee! - Was wurde daraus?',
            },
            'playlist_mincount': 11,
        },
    ]

    def _real_extract(self, url):
        """Return a playlist with one lazily-built entry per audio button on the page."""
        playlist_id = self._match_id(url)
        page = self._download_webpage(url, playlist_id)

        description = self._html_search_meta(
            ['description', 'og:description', 'twitter:description'], page, default=None)
        title = self._html_search_meta(
            ['og:title', 'twitter:title'], page, default=None)

        # Buttons are collected up front; each entry is parsed only when consumed
        buttons = re.findall(self._BUTTON_REGEX, page)
        entries = (self._parse_button_attrs(button) for button in buttons)

        return {
            '_type': 'playlist',
            'id': playlist_id,
            'description': description,
            'title': title,
            'entries': entries,
        }

View file

@ -12,7 +12,6 @@
mimetype2ext,
str_or_none,
traverse_obj,
try_get,
unified_timestamp,
update_url_query,
url_or_none,
@ -25,7 +24,7 @@ class DRTVIE(InfoExtractor):
_VALID_URL = r'''(?x)
https?://
(?:
(?:www\.)?dr\.dk/(?:tv/se|nyheder|(?:radio|lyd)(?:/ondemand)?)/(?:[^/]+/)*|
(?:www\.)?dr\.dk/(?:tv/se|nyheder|(?P<radio>radio|lyd)(?:/ondemand)?)/(?:[^/]+/)*|
(?:www\.)?(?:dr\.dk|dr-massive\.com)/drtv/(?:se|episode|program)/
)
(?P<id>[\da-z_-]+)
@ -80,7 +79,7 @@ class DRTVIE(InfoExtractor):
'description': 'md5:8c66dcbc1669bbc6f873879880f37f2a',
'timestamp': 1546628400,
'upload_date': '20190104',
'duration': 3504.618,
'duration': 3504.619,
'formats': 'mincount:20',
'release_year': 2017,
'season_id': 'urn:dr:mu:bundle:5afc03ad6187a4065ca5fd35',
@ -101,14 +100,16 @@ class DRTVIE(InfoExtractor):
'ext': 'mp4',
'title': 'Bonderøven 2019 (1:8)',
'description': 'md5:b6dcfe9b6f0bea6703e9a0092739a5bd',
'timestamp': 1603188600,
'upload_date': '20201020',
'timestamp': 1654856100,
'upload_date': '20220610',
'duration': 2576.6,
'season': 'Bonderøven 2019',
'season_id': 'urn:dr:mu:bundle:5c201667a11fa01ca4528ce5',
'release_year': 2019,
'season_number': 2019,
'series': 'Frank & Kastaniegaarden'
'series': 'Frank & Kastaniegaarden',
'episode_number': 1,
'episode': 'Episode 1',
},
'params': {
'skip_download': True,
@ -140,10 +141,26 @@ class DRTVIE(InfoExtractor):
'params': {
'skip_download': True,
},
'skip': 'this video has been removed',
}, {
'url': 'https://www.dr.dk/lyd/p4kbh/regionale-nyheder-kh4/regionale-nyheder-2023-03-14-10-30-9',
'info_dict': {
'ext': 'mp4',
'id': '14802310112',
'timestamp': 1678786200,
'duration': 120.043,
'season_id': 'urn:dr:mu:bundle:63a4f7c87140143504b6710f',
'series': 'P4 København regionale nyheder',
'upload_date': '20230314',
'release_year': 0,
'description': 'Hør seneste regionale nyheder fra P4 København.',
'season': 'Regionale nyheder',
'title': 'Regionale nyheder',
},
}]
def _real_extract(self, url):
raw_video_id = self._match_id(url)
raw_video_id, is_radio_url = self._match_valid_url(url).group('id', 'radio')
webpage = self._download_webpage(url, raw_video_id)
@ -170,15 +187,17 @@ def _real_extract(self, url):
programcard_url = '%s/%s' % (_PROGRAMCARD_BASE, video_id)
else:
programcard_url = _PROGRAMCARD_BASE
page = self._parse_json(
self._search_regex(
r'data\s*=\s*({.+?})\s*(?:;|</script)', webpage,
'data'), '1')['cache']['page']
page = page[list(page.keys())[0]]
item = try_get(
page, (lambda x: x['item'], lambda x: x['entries'][0]['item']),
dict)
video_id = item['customId'].split(':')[-1]
if is_radio_url:
video_id = self._search_nextjs_data(
webpage, raw_video_id)['props']['pageProps']['episode']['productionNumber']
else:
json_data = self._search_json(
r'window\.__data\s*=', webpage, 'data', raw_video_id)
video_id = traverse_obj(json_data, (
'cache', 'page', ..., (None, ('entries', 0)), 'item', 'customId',
{lambda x: x.split(':')[-1]}), get_all=False)
if not video_id:
raise ExtractorError('Unable to extract video id')
query['productionnumber'] = video_id
data = self._download_json(
@ -269,10 +288,11 @@ def decrypt_uri(e):
f['vcodec'] = 'none'
formats.extend(f4m_formats)
elif target == 'HLS':
formats.extend(self._extract_m3u8_formats(
fmts, subs = self._extract_m3u8_formats_and_subtitles(
uri, video_id, 'mp4', entry_protocol='m3u8_native',
quality=preference, m3u8_id=format_id,
fatal=False))
quality=preference, m3u8_id=format_id, fatal=False)
formats.extend(fmts)
self._merge_subtitles(subs, target=subtitles)
else:
bitrate = link.get('Bitrate')
if bitrate:

View file

@ -14,6 +14,7 @@
ExtractorError,
UnsupportedError,
determine_ext,
determine_protocol,
dict_get,
extract_basic_auth,
format_field,
@ -32,6 +33,7 @@
unescapeHTML,
unified_timestamp,
unsmuggle_url,
update_url_query,
url_or_none,
urljoin,
variadic,
@ -866,7 +868,7 @@ class GenericIE(InfoExtractor):
},
},
{
# Video.js embed, multiple formats
# Youtube embed, formerly: Video.js embed, multiple formats
'url': 'http://ortcam.com/solidworks-урок-6-настройка-чертежа_33f9b7351.html',
'info_dict': {
'id': 'yygqldloqIk',
@ -893,6 +895,7 @@ class GenericIE(InfoExtractor):
'params': {
'skip_download': True,
},
'skip': '404 Not Found',
},
# rtl.nl embed
{
@ -2168,6 +2171,33 @@ class GenericIE(InfoExtractor):
'age_limit': 18,
},
},
{
'note': 'Live HLS direct link',
'url': 'https://d18j67ugtrocuq.cloudfront.net/out/v1/2767aec339144787926bd0322f72c6e9/index.m3u8',
'info_dict': {
'id': 'index',
'title': r're:index',
'ext': 'mp4',
'live_status': 'is_live',
},
'params': {
'skip_download': 'm3u8',
},
},
{
'note': 'Video.js VOD HLS',
'url': 'https://gist.githubusercontent.com/bashonly/2aae0862c50f4a4b84f220c315767208/raw/e3380d413749dabbe804c9c2d8fd9a45142475c7/videojs_hls_test.html',
'info_dict': {
'id': 'videojs_hls_test',
'title': 'video',
'ext': 'mp4',
'age_limit': 0,
'duration': 1800,
},
'params': {
'skip_download': 'm3u8',
},
},
]
def report_following_redirect(self, new_url):
@ -2184,12 +2214,41 @@ def report_detected(self, name, num=1, note=None):
self._downloader.write_debug(f'Identified {num} {name}{format_field(note, None, "; %s")}')
def _fragment_query(self, url):
    """Return the query string of *url* as segment-URL extra params.

    Yields an empty dict unless the ``fragment_query`` configuration
    argument is set and *url* actually carries a query string.
    """
    if not self._configuration_arg('fragment_query'):
        return {}
    query_string = urllib.parse.urlparse(url).query
    if not query_string:
        return {}
    return {'extra_param_to_segment_url': query_string}
def _extra_manifest_info(self, info, manifest_url):
    """Amend *info* in place using the manifest-related configuration arguments.

    Reads ``fragment_query``, ``hls_key``, ``variant_query`` and ``is_live``
    through ``self._configuration_arg``. For the first ``m3u8_native``
    format it then sets ``live_status`` and/or ``duration``. Returns None.
    """
    def hex_or_none(value):
        # Accept plain or 0x-prefixed hexadecimal strings only
        return value if re.fullmatch(r'(0x)?[\da-f]+', value, re.IGNORECASE) else None

    fragment_query = self._configuration_arg('fragment_query', [None], casesense=True)[0]
    if fragment_query is not None:
        # Preference: query part of the argument, the raw argument, then the manifest's own query
        segment_query = urllib.parse.urlparse(fragment_query).query or fragment_query
        if not segment_query:
            segment_query = urllib.parse.urlparse(manifest_url).query or None
        info['extra_param_to_segment_url'] = segment_query

    hls_key_args = self._configuration_arg('hls_key', casesense=True)
    info['hls_aes'] = traverse_obj(hls_key_args, {
        'uri': (0, {url_or_none}), 'key': (0, {hex_or_none}), 'iv': (1, {hex_or_none}),
    }) or None

    variant_query = self._configuration_arg('variant_query', [None], casesense=True)[0]
    if variant_query is not None:
        raw_query = (
            urllib.parse.urlparse(variant_query).query or variant_query
            or urllib.parse.urlparse(manifest_url).query)
        query = urllib.parse.parse_qs(raw_query)
        for fmt in self._downloader._get_formats(info):
            fmt['url'] = update_url_query(fmt['url'], query)

    # Attempt to detect live HLS or set VOD duration
    m3u8_format = None
    for candidate in self._downloader._get_formats(info):
        if determine_protocol(candidate) == 'm3u8_native':
            m3u8_format = candidate
            break
    if not m3u8_format:
        return

    is_live = self._configuration_arg('is_live', [None])[0]
    if is_live is not None:
        # An explicit argument overrides detection; anything but 'false' means live
        info['live_status'] = 'not_live' if is_live == 'false' else 'is_live'
        return

    headers = m3u8_format.get('http_headers') or info.get('http_headers')
    duration = self._extract_m3u8_vod_duration(
        m3u8_format['url'], info.get('id'), note='Checking m3u8 live status',
        errnote='Failed to download m3u8 media playlist', headers=headers)
    if not duration:
        info['live_status'] = 'is_live'
    # A duration already present in info is kept
    info['duration'] = info.get('duration') or duration
def _extract_rss(self, url, video_id, doc):
NS_MAP = {
@ -2397,10 +2456,8 @@ def _real_extract(self, url):
subtitles = {}
if format_id.endswith('mpegurl') or ext == 'm3u8':
formats, subtitles = self._extract_m3u8_formats_and_subtitles(url, video_id, 'mp4', headers=headers)
info_dict.update(self._fragment_query(url))
elif format_id.endswith('mpd') or format_id.endswith('dash+xml') or ext == 'mpd':
formats, subtitles = self._extract_mpd_formats_and_subtitles(url, video_id, headers=headers)
info_dict.update(self._fragment_query(url))
elif format_id == 'f4m' or ext == 'f4m':
formats = self._extract_f4m_formats(url, video_id, headers=headers)
else:
@ -2415,6 +2472,7 @@ def _real_extract(self, url):
'subtitles': subtitles,
'http_headers': headers or None,
})
self._extra_manifest_info(info_dict, url)
return info_dict
if not self.get_param('test', False) and not is_intentional:
@ -2427,7 +2485,7 @@ def _real_extract(self, url):
if first_bytes.startswith(b'#EXTM3U'):
self.report_detected('M3U playlist')
info_dict['formats'], info_dict['subtitles'] = self._extract_m3u8_formats_and_subtitles(url, video_id, 'mp4')
info_dict.update(self._fragment_query(url))
self._extra_manifest_info(info_dict, url)
return info_dict
# Maybe it's a direct link to a video?
@ -2478,7 +2536,7 @@ def _real_extract(self, url):
doc,
mpd_base_url=full_response.geturl().rpartition('/')[0],
mpd_url=url)
info_dict.update(self._fragment_query(url))
self._extra_manifest_info(info_dict, url)
self.report_detected('DASH manifest')
return info_dict
elif re.match(r'^{http://ns\.adobe\.com/f4m/[12]\.0}manifest$', doc.tag):
@ -2567,8 +2625,7 @@ def _extract_embeds(self, url, webpage, *, urlh=None, info_dict={}):
varname = mobj.group(1)
sources = variadic(self._parse_json(
mobj.group(2), video_id, transform_source=js_to_json, fatal=False) or [])
formats = []
subtitles = {}
formats, subtitles, src = [], {}, None
for source in sources:
src = source.get('src')
if not src or not isinstance(src, str):
@ -2591,8 +2648,6 @@ def _extract_embeds(self, url, webpage, *, urlh=None, info_dict={}):
m3u8_id='hls', fatal=False)
formats.extend(fmts)
self._merge_subtitles(subs, target=subtitles)
for fmt in formats:
fmt.update(self._fragment_query(src))
if not formats:
formats.append({
@ -2608,11 +2663,11 @@ def _extract_embeds(self, url, webpage, *, urlh=None, info_dict={}):
for sub_match in re.finditer(rf'(?s){re.escape(varname)}' r'\.addRemoteTextTrack\(({.+?})\s*,\s*(?:true|false)\)', webpage):
sub = self._parse_json(
sub_match.group(1), video_id, transform_source=js_to_json, fatal=False) or {}
src = str_or_none(sub.get('src'))
if not src:
sub_src = str_or_none(sub.get('src'))
if not sub_src:
continue
subtitles.setdefault(dict_get(sub, ('language', 'srclang')) or 'und', []).append({
'url': urllib.parse.urljoin(url, src),
'url': urllib.parse.urljoin(url, sub_src),
'name': sub.get('label'),
'http_headers': {
'Referer': actual_url,
@ -2620,7 +2675,10 @@ def _extract_embeds(self, url, webpage, *, urlh=None, info_dict={}):
})
if formats or subtitles:
self.report_detected('video.js embed')
return [{'formats': formats, 'subtitles': subtitles}]
info_dict = {'formats': formats, 'subtitles': subtitles}
if formats:
self._extra_manifest_info(info_dict, src)
return [info_dict]
# Look for generic KVS player (before json-ld bc of some urls that break otherwise)
found = self._search_regex((
@ -2795,10 +2853,10 @@ def filter_video(urls):
return [self._extract_xspf_playlist(video_url, video_id)]
elif ext == 'm3u8':
entry_info_dict['formats'], entry_info_dict['subtitles'] = self._extract_m3u8_formats_and_subtitles(video_url, video_id, ext='mp4', headers=headers)
entry_info_dict.update(self._fragment_query(video_url))
self._extra_manifest_info(entry_info_dict, video_url)
elif ext == 'mpd':
entry_info_dict['formats'], entry_info_dict['subtitles'] = self._extract_mpd_formats_and_subtitles(video_url, video_id, headers=headers)
entry_info_dict.update(self._fragment_query(video_url))
self._extra_manifest_info(entry_info_dict, video_url)
elif ext == 'f4m':
entry_info_dict['formats'] = self._extract_f4m_formats(video_url, video_id, headers=headers)
elif re.search(r'(?i)\.(?:ism|smil)/manifest', video_url) and video_url != url:

View file

@ -10,7 +10,7 @@
class GeniusIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?genius\.com/videos/(?P<id>[^?/#]+)'
_VALID_URL = r'https?://(?:www\.)?genius\.com/(?:videos|(?P<article>a))/(?P<id>[^?/#]+)'
_TESTS = [{
'url': 'https://genius.com/videos/Vince-staples-breaks-down-the-meaning-of-when-sparks-fly',
'md5': '64c2ad98cfafcfda23bfa0ad0c512f4c',
@ -41,19 +41,37 @@ class GeniusIE(InfoExtractor):
'timestamp': 1631209167,
'thumbnail': r're:^https?://.*\.jpg$',
},
}, {
'url': 'https://genius.com/a/cordae-anderson-paak-break-down-the-meaning-of-two-tens',
'md5': 'f98a4e03b16b0a2821bd6e52fb3cc9d7',
'info_dict': {
'id': '6321509903112',
'ext': 'mp4',
'title': 'Cordae & Anderson .Paak Breaks Down The Meaning Of “Two Tens”',
'description': 'md5:1255f0e1161d07342ce56a8464ac339d',
'tags': ['song id: 5457554'],
'uploader_id': '4863540648001',
'duration': 361.813,
'upload_date': '20230301',
'timestamp': 1677703908,
'thumbnail': r're:^https?://.*\.jpg$',
},
}]
def _real_extract(self, url):
display_id = self._match_id(url)
display_id, is_article = self._match_valid_url(url).group('id', 'article')
webpage = self._download_webpage(url, display_id)
metadata = self._search_json(
r'<meta content="', webpage, 'metadata', display_id, transform_source=unescapeHTML)
video_id = traverse_obj(
metadata, ('video', 'provider_id'),
('dfp_kv', lambda _, x: x['name'] == 'brightcove_video_id', 'values', 0), get_all=False)
r'<meta content="', webpage, 'metadata', display_id,
end_pattern=r'"\s+itemprop="page_data"', transform_source=unescapeHTML)
video_id = traverse_obj(metadata, (
(('article', 'media', ...), ('video', None)),
('provider_id', ('dfp_kv', lambda _, v: v['name'] == 'brightcove_video_id', 'values', ...))),
get_all=False)
if not video_id:
raise ExtractorError('Brightcove video id not found in webpage')
# Not all article pages have videos, expect the error
raise ExtractorError('Brightcove video ID not found in webpage', expected=bool(is_article))
config = self._search_json(r'var\s*APP_CONFIG\s*=', webpage, 'config', video_id, default={})
account_id = config.get('brightcove_account_id', '4863540648001')
@ -68,7 +86,7 @@ def _real_extract(self, url):
class GeniusLyricsIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?genius\.com/(?P<id>[^?/#]+)-lyrics[?/#]?'
_VALID_URL = r'https?://(?:www\.)?genius\.com/(?P<id>[^?/#]+)-lyrics(?:[?/#]|$)'
_TESTS = [{
'url': 'https://genius.com/Lil-baby-heyy-lyrics',
'playlist_mincount': 2,

254
yt_dlp/extractor/globalplayer.py Executable file
View file

@ -0,0 +1,254 @@
from .common import InfoExtractor
from ..utils import (
clean_html,
join_nonempty,
parse_duration,
str_or_none,
traverse_obj,
unified_strdate,
unified_timestamp,
urlhandle_detect_ext,
)
class GlobalPlayerBaseIE(InfoExtractor):
    # Helpers shared by the globalplayer.com extractors

    def _get_page_props(self, url, video_id):
        """Download `url` and return the 'pageProps' object of its Next.js data."""
        nextjs_data = self._search_nextjs_data(
            self._download_webpage(url, video_id), video_id)
        return nextjs_data['props']['pageProps']

    def _request_ext(self, url, video_id):
        """Request `url` and detect the file extension from the response handle."""
        # Server rejects HEAD requests
        urlh = self._request_webpage(
            url, video_id, note='Determining source extension')
        return urlhandle_detect_ext(urlh)

    def _extract_audio(self, episode, series):
        """Build an audio-only info dict from an episode and its parent series."""
        series_fields = traverse_obj(series, {
            'series': 'title',
            'series_id': 'id',
            'thumbnail': 'imageUrl',
            'uploader': 'itunesAuthor',  # podcasts only
        })
        episode_fields = traverse_obj(episode, {
            'id': 'id',
            'description': ('description', {clean_html}),
            'duration': ('duration', {parse_duration}),
            'thumbnail': 'imageUrl',
            'url': 'streamUrl',
            'timestamp': (('pubDate', 'startDate'), {unified_timestamp}),
            'title': 'title',
        }, get_all=False)
        # Episode fields are merged last, so they win over series fields
        # that share a key (e.g. 'thumbnail')
        return {
            'vcodec': 'none',
            **series_fields,
            **episode_fields,
        }
class GlobalPlayerLiveIE(GlobalPlayerBaseIE):
    # Live station pages: /live/<brand>/<region>
    _VALID_URL = r'https?://www\.globalplayer\.com/live/(?P<id>\w+)/\w+'
    _TESTS = [{
        'url': 'https://www.globalplayer.com/live/smoothchill/uk/',
        'info_dict': {
            'id': '2mx1E',
            'ext': 'aac',
            'display_id': 'smoothchill-uk',
            'title': 're:^Smooth Chill.+$',
            'thumbnail': 'https://herald.musicradio.com/media/f296ade8-50c9-4f60-911f-924e96873620.png',
            'description': 'Music To Chill To',
            'live_status': 'is_live',
        },
    }, {
        # national station
        'url': 'https://www.globalplayer.com/live/heart/uk/',
        'info_dict': {
            'id': '2mwx4',
            'ext': 'aac',
            'description': 'turn up the feel good!',
            'thumbnail': 'https://herald.musicradio.com/media/49b9e8cb-15bf-4bf2-8c28-a4850cc6b0f3.png',
            'live_status': 'is_live',
            'title': 're:^Heart UK.+$',
            'display_id': 'heart-uk',
        },
    }, {
        # regional variation
        'url': 'https://www.globalplayer.com/live/heart/london/',
        'info_dict': {
            'id': 'AMqg',
            'ext': 'aac',
            'thumbnail': 'https://herald.musicradio.com/media/49b9e8cb-15bf-4bf2-8c28-a4850cc6b0f3.png',
            'title': 're:^Heart London.+$',
            'live_status': 'is_live',
            'display_id': 'heart-london',
            'description': 'turn up the feel good!',
        },
    }]

    def _real_extract(self, url):
        """Return the live stream info dict for the station found in the page props."""
        video_id = self._match_id(url)
        station = self._get_page_props(url, video_id)['station']

        stream_url = station['streamUrl']
        station_id = station['id']
        # Prefer "<brandSlug>-<slug>"; fall back to the legacy station prefix
        display_id = (
            join_nonempty('brandSlug', 'slug', from_dict=station)
            or station.get('legacyStationPrefix'))
        ext = self._request_ext(stream_url, video_id)
        metadata = traverse_obj(station, {
            'title': (('name', 'brandName'), {str_or_none}),
            'description': 'tagline',
            'thumbnail': 'brandLogo',
        }, get_all=False)

        return {
            'id': station_id,
            'display_id': display_id,
            'url': stream_url,
            'ext': ext,
            'vcodec': 'none',
            'is_live': True,
            **metadata,
        }
class GlobalPlayerLivePlaylistIE(GlobalPlayerBaseIE):
    # "Live playlist" pages: /playlists/<id>
    _VALID_URL = r'https?://www\.globalplayer\.com/playlists/(?P<id>\w+)'
    _TESTS = [{
        # "live playlist"
        'url': 'https://www.globalplayer.com/playlists/8bLk/',
        'info_dict': {
            'id': '8bLk',
            'ext': 'aac',
            'live_status': 'is_live',
            'description': 'md5:e10f5e10b01a7f2c14ba815509fbb38d',
            'thumbnail': 'https://images.globalplayer.com/images/551379?width=450&signature=oMLPZIoi5_dBSHnTMREW0Xg76mA=',
            'title': 're:^Classic FM Hall of Fame.+$'
        },
    }]

    def _real_extract(self, url):
        """Return the live stream info dict built from the page's 'playlistData'."""
        video_id = self._match_id(url)
        playlist_data = self._get_page_props(url, video_id)['playlistData']

        stream_url = playlist_data['streamUrl']
        ext = self._request_ext(stream_url, video_id)
        metadata = traverse_obj(playlist_data, {
            'title': 'title',
            'description': 'description',
            'thumbnail': 'image',
        })

        return {
            'id': video_id,
            'url': stream_url,
            'ext': ext,
            'vcodec': 'none',
            'is_live': True,
            **metadata,
        }
class GlobalPlayerAudioIE(GlobalPlayerBaseIE):
    # Series pages for podcasts (/podcasts/<id>) and radio catchup (/catchup/<brand>/<region>/<id>)
    _VALID_URL = r'https?://www\.globalplayer\.com/(?:(?P<podcast>podcasts)/|catchup/\w+/\w+/)(?P<id>\w+)/?(?:$|[?#])'
    _TESTS = [{
        # podcast
        'url': 'https://www.globalplayer.com/podcasts/42KuaM/',
        'playlist_mincount': 5,
        'info_dict': {
            'id': '42KuaM',
            'title': 'Filthy Ritual',
            'thumbnail': 'md5:60286e7d12d795bd1bbc9efc6cee643e',
            'categories': ['Society & Culture', 'True Crime'],
            'uploader': 'Global',
            'description': 'md5:da5b918eac9ae319454a10a563afacf9',
        },
    }, {
        # radio catchup
        'url': 'https://www.globalplayer.com/catchup/lbc/uk/46vyD7z/',
        'playlist_mincount': 3,
        'info_dict': {
            'id': '46vyD7z',
            'description': 'Nick Ferrari At Breakfast is Leading Britain\'s Conversation.',
            'title': 'Nick Ferrari',
            'thumbnail': 'md5:4df24d8a226f5b2508efbcc6ae874ebf',
        },
    }]

    def _real_extract(self, url):
        """Return a playlist of the series' episodes that have both an id and a stream URL."""
        video_id, podcast = self._match_valid_url(url).group('id', 'podcast')
        props = self._get_page_props(url, video_id)
        # The page props key depends on which URL flavour matched
        series = props['podcastInfo' if podcast else 'catchupInfo']

        episodes = traverse_obj(
            series, ('episodes', lambda _, v: v['id'] and v['streamUrl']))
        entries = [self._extract_audio(episode, series) for episode in episodes]
        categories = traverse_obj(series, ('categories', ..., 'name')) or None
        metadata = traverse_obj(series, {
            'description': 'description',
            'thumbnail': 'imageUrl',
            'title': 'title',
            'uploader': 'itunesAuthor',  # podcasts only
        })

        return {
            '_type': 'playlist',
            'id': video_id,
            'entries': entries,
            'categories': categories,
            **metadata,
        }
class GlobalPlayerAudioEpisodeIE(GlobalPlayerBaseIE):
    # Single episode pages for podcasts and radio catchup (.../episodes/<id>)
    _VALID_URL = r'https?://www\.globalplayer\.com/(?:(?P<podcast>podcasts)|catchup/\w+/\w+)/episodes/(?P<id>\w+)/?(?:$|[?#])'
    _TESTS = [{
        # podcast
        'url': 'https://www.globalplayer.com/podcasts/episodes/7DrfNnE/',
        'info_dict': {
            'id': '7DrfNnE',
            'ext': 'mp3',
            'title': 'Filthy Ritual - Trailer',
            'description': 'md5:1f1562fd0f01b4773b590984f94223e0',
            'thumbnail': 'md5:60286e7d12d795bd1bbc9efc6cee643e',
            'duration': 225.0,
            'timestamp': 1681254900,
            'series': 'Filthy Ritual',
            'series_id': '42KuaM',
            'upload_date': '20230411',
            'uploader': 'Global',
        },
    }, {
        # radio catchup
        'url': 'https://www.globalplayer.com/catchup/lbc/uk/episodes/2zGq26Vcv1fCWhddC4JAwETXWe/',
        'info_dict': {
            'id': '2zGq26Vcv1fCWhddC4JAwETXWe',
            'ext': 'm4a',
            'timestamp': 1682056800,
            'series': 'Nick Ferrari',
            'thumbnail': 'md5:4df24d8a226f5b2508efbcc6ae874ebf',
            'upload_date': '20230421',
            'series_id': '46vyD7z',
            'description': 'Nick Ferrari At Breakfast is Leading Britain\'s Conversation.',
            'title': 'Nick Ferrari',
            'duration': 10800.0,
        },
    }]

    def _real_extract(self, url):
        """Return the audio info dict for the episode found in the page props."""
        video_id, podcast = self._match_valid_url(url).group('id', 'podcast')
        props = self._get_page_props(url, video_id)
        episode = props['podcastEpisode' if podcast else 'catchupEpisode']
        # The parent series is nested under 'podcast' or 'show'; use an empty
        # dict when neither holds a dict
        series = traverse_obj(episode, 'podcast', 'show', expected_type=dict) or {}
        return self._extract_audio(episode, series)
class GlobalPlayerVideoIE(GlobalPlayerBaseIE):
    # Video pages: /videos/<id>
    _VALID_URL = r'https?://www\.globalplayer\.com/videos/(?P<id>\w+)'
    _TESTS = [{
        'url': 'https://www.globalplayer.com/videos/2JsSZ7Gm2uP/',
        'info_dict': {
            'id': '2JsSZ7Gm2uP',
            'ext': 'mp4',
            'description': 'md5:6a9f063c67c42f218e42eee7d0298bfd',
            'thumbnail': 'md5:d4498af48e15aae4839ce77b97d39550',
            'upload_date': '20230420',
            'title': 'Treble Malakai Bayoh sings a sublime Handel aria at Classic FM Live',
        },
    }]

    def _real_extract(self, url):
        """Return the video info dict built from the page's 'videoData'."""
        video_id = self._match_id(url)
        video_data = self._get_page_props(url, video_id)['videoData']
        metadata = traverse_obj(video_data, {
            'url': 'url',
            'thumbnail': ('image', 'url'),
            'title': 'title',
            'upload_date': ('publish_date', {unified_strdate}),
            'description': 'description',
        })
        return {'id': video_id, **metadata}

View file

@ -0,0 +1,83 @@
from .common import InfoExtractor
from .dailymotion import DailymotionIE
from .youtube import YoutubeIE
class GMANetworkVideoIE(InfoExtractor):
    # gmanetwork.com video pages; the actual media is delegated to the YouTube
    # or Dailymotion extractor
    _VALID_URL = r'https?://(?:www)\.gmanetwork\.com/(?:\w+/){3}(?P<id>\d+)/(?P<display_id>[\w-]+)/video'
    _TESTS = [{
        'url': 'https://www.gmanetwork.com/fullepisodes/home/running_man_philippines/168677/running-man-philippines-catch-the-thief-full-chapter-2/video?section=home',
        'info_dict': {
            'id': '28BqW0AXPe0',
            'ext': 'mp4',
            'upload_date': '20220919',
            'uploader_url': 'http://www.youtube.com/channel/UChsoPNR5x-wdSO2GrOSIWqQ',
            'like_count': int,
            'view_count': int,
            'uploader': 'YoüLOL',
            'channel_id': 'UChsoPNR5x-wdSO2GrOSIWqQ',
            'duration': 5313,
            'comment_count': int,
            'tags': 'count:22',
            'uploader_id': 'UChsoPNR5x-wdSO2GrOSIWqQ',
            'title': 'Running Man Philippines: Catch the Thief (FULL CHAPTER 2)',
            'channel_url': 'https://www.youtube.com/channel/UChsoPNR5x-wdSO2GrOSIWqQ',
            'thumbnail': 'https://i.ytimg.com/vi/28BqW0AXPe0/maxresdefault.jpg',
            'release_timestamp': 1663594212,
            'age_limit': 0,
            'channel_follower_count': int,
            'categories': ['Entertainment'],
            'description': 'md5:811bdcea74f9c48051824e494756e926',
            'live_status': 'not_live',
            'playable_in_embed': True,
            'channel': 'YoüLOL',
            'availability': 'public',
            'release_date': '20220919',
        }
    }, {
        'url': 'https://www.gmanetwork.com/fullepisodes/home/more_than_words/87059/more-than-words-full-episode-80/video?section=home',
        'info_dict': {
            'id': 'yiDOExw2aSA',
            'ext': 'mp4',
            'live_status': 'not_live',
            'channel': 'GMANetwork',
            'like_count': int,
            'channel_follower_count': int,
            'description': 'md5:6d00cd658394fa1a5071200d3ed4be05',
            'duration': 1419,
            'age_limit': 0,
            'comment_count': int,
            'upload_date': '20181003',
            'thumbnail': 'https://i.ytimg.com/vi_webp/yiDOExw2aSA/maxresdefault.webp',
            'availability': 'public',
            'playable_in_embed': True,
            'channel_id': 'UCKL5hAuzgFQsyrsQKgU0Qng',
            'title': 'More Than Words: Full Episode 80 (Finale)',
            'uploader_id': 'GMANETWORK',
            'categories': ['Entertainment'],
            'uploader': 'GMANetwork',
            'channel_url': 'https://www.youtube.com/channel/UCKL5hAuzgFQsyrsQKgU0Qng',
            'tags': 'count:29',
            'view_count': int,
            'uploader_url': 'http://www.youtube.com/user/GMANETWORK',
        }
    }]

    def _real_extract(self, url):
        """Resolve the page to a YouTube or Dailymotion url result."""
        content_id, display_id = self._match_valid_url(url).group('id', 'display_id')
        webpage = self._download_webpage(url, display_id)

        # webpage route: the YouTube ID may be embedded directly in the page
        embedded_youtube_id = self._search_regex(
            r'var\s*YOUTUBE_VIDEO\s*=\s*[\'"]+(?P<yt_id>[\w-]+)', webpage, 'youtube_id', fatal=False)
        if embedded_youtube_id:
            return self.url_result(embedded_youtube_id, YoutubeIE, embedded_youtube_id)

        # api call route
        # more info at https://aphrodite.gmanetwork.com/fullepisodes/assets/fullepisodes/js/dist/fullepisodes_video.js?v=1.1.11
        network_url = self._search_regex(
            r'NETWORK_URL\s*=\s*[\'"](?P<url>[^\'"]+)', webpage, 'network_url')
        api_url = f'{network_url}api/data/content/video/{content_id}'
        json_data = self._download_json(api_url, display_id)

        # A non-empty 'video_file' is handed to the YouTube extractor;
        # otherwise 'dailymotion_file' is handed to the Dailymotion extractor
        video_file = json_data.get('video_file')
        if video_file:
            return self.url_result(video_file, YoutubeIE, video_file)
        dailymotion_file = json_data['dailymotion_file']
        return self.url_result(dailymotion_file, DailymotionIE, dailymotion_file)

View file

@ -3,6 +3,7 @@
from .common import InfoExtractor
from ..utils import (
OnDemandPagedList,
float_or_none,
traverse_obj,
unified_strdate,
)
@ -19,7 +20,9 @@ class GronkhIE(InfoExtractor):
'title': 'H.O.R.D.E. - DAS ZWEiTE ZEiTALTER 🎲 Session 1',
'view_count': int,
'thumbnail': 'https://01.cdn.vod.farm/preview/9e2555d3a23bf4e5c5b7c6b3b70a9d84.jpg',
'upload_date': '20221111'
'upload_date': '20221111',
'chapters': 'count:3',
'duration': 31463,
},
'params': {'skip_download': True}
}, {
@ -30,7 +33,8 @@ class GronkhIE(InfoExtractor):
'title': 'GTV0536, 2021-10-01 - MARTHA IS DEAD #FREiAB1830 !FF7 !horde !archiv',
'view_count': int,
'thumbnail': 'https://01.cdn.vod.farm/preview/6436746cce14e25f751260a692872b9b.jpg',
'upload_date': '20211001'
'upload_date': '20211001',
'duration': 32058,
},
'params': {'skip_download': True}
}, {
@ -56,6 +60,12 @@ def _real_extract(self, url):
'upload_date': unified_strdate(data_json.get('created_at')),
'formats': formats,
'subtitles': subtitles,
'duration': float_or_none(data_json.get('source_length')),
'chapters': traverse_obj(data_json, (
'chapters', lambda _, v: float_or_none(v['offset']) is not None, {
'title': 'title',
'start_time': ('offset', {float_or_none}),
})) or None,
}

View file

@ -1,37 +0,0 @@
from .common import InfoExtractor
class HentaiStigmaIE(InfoExtractor):
    _VALID_URL = r'^https?://hentai\.animestigma\.com/(?P<id>[^/]+)'
    _TEST = {
        'url': 'http://hentai.animestigma.com/inyouchuu-etsu-bonus/',
        'md5': '4e3d07422a68a4cc363d8f57c8bf0d23',
        'info_dict': {
            'id': 'inyouchuu-etsu-bonus',
            'ext': 'mp4',
            'title': 'Inyouchuu Etsu Bonus',
            'age_limit': 18,
        }
    }

    def _real_extract(self, url):
        """Extract the title from the post page and the file URL from its iframe page."""
        video_id = self._match_id(url)
        post_page = self._download_webpage(url, video_id)

        post_title = self._html_search_regex(
            r'<h2[^>]+class="posttitle"[^>]*><a[^>]*>([^<]+)</a>',
            post_page, 'title')

        # The video file URL is found in the page loaded by the iframe
        iframe_src = self._html_search_regex(
            r'<iframe[^>]+src="([^"]+mp4)"', post_page, 'wrapper url')
        iframe_page = self._download_webpage(iframe_src, video_id)
        file_url = self._html_search_regex(
            r'file\s*:\s*"([^"]+)"', iframe_page, 'video url')

        return {
            'id': video_id,
            'url': file_url,
            'title': post_title,
            'age_limit': 18,
        }

View file

@ -0,0 +1,72 @@
import functools
import re
from .common import InfoExtractor
from .jwplatform import JWPlatformIE
from ..utils import (
ExtractorError,
OnDemandPagedList,
extract_attributes,
get_element_by_class,
get_element_html_by_class,
)
class HollywoodReporterIE(InfoExtractor):
    # hollywoodreporter.com video pages; the media is delegated to JWPlatform or YouTube
    _VALID_URL = r'https?://(?:www\.)?hollywoodreporter\.com/video/(?P<id>[\w-]+)'
    _TESTS = [{
        'url': 'https://www.hollywoodreporter.com/video/chris-pine-michelle-rodriguez-dungeons-dragons-cast-directors-on-what-it-took-to-make-film-sxsw-2023/',
        'info_dict': {
            'id': 'zH4jZaR5',
            'ext': 'mp4',
            'title': 'md5:a9a1c073770a32f178955997712c4bd9',
            'description': 'The cast and directors of \'Dungeons & Dragons: Honor Among Thieves\' talk about their new film.',
            'thumbnail': 'https://cdn.jwplayer.com/v2/media/zH4jZaR5/poster.jpg?width=720',
            'upload_date': '20230312',
            'timestamp': 1678586423,
            'duration': 242.0,
        },
        'params': {'skip_download': 'm3u8'},
    }]

    def _real_extract(self, url):
        """Read the showcase attributes of the video card link and dispatch on its type."""
        display_id = self._match_id(url)
        webpage = self._download_webpage(url, display_id)

        card_link_html = get_element_html_by_class('vlanding-video-card__link', webpage)
        attrs = extract_attributes(card_link_html or '')
        video_id = attrs['data-video-showcase-trigger']
        showcase_type = attrs['data-video-showcase-type']

        if showcase_type == 'jwplayer':
            return self.url_result(f'jwplatform:{video_id}', JWPlatformIE)
        if showcase_type == 'youtube':
            return self.url_result(video_id, 'Youtube')
        raise ExtractorError(f'Unsupported showcase type "{showcase_type}"')
class HollywoodReporterPlaylistIE(InfoExtractor):
    # Video category listings: /vcategory/<slug>-<id>
    _VALID_URL = r'https?://(?:www\.)?hollywoodreporter\.com/vcategory/(?P<slug>[\w-]+)-(?P<id>\d+)'
    _TESTS = [{
        'url': 'https://www.hollywoodreporter.com/vcategory/heat-vision-breakdown-57822/',
        'playlist_mincount': 109,
        'info_dict': {
            'id': '57822',
            'title': 'heat-vision-breakdown',
        }
    }]

    def _fetch_page(self, slug, pl_id, page):
        """Yield url results for the video links on one listing page.

        `page` is 0-based; the site's page URLs are 1-based.
        """
        page_num = page + 1
        webpage = self._download_webpage(
            f'https://www.hollywoodreporter.com/vcategory/{slug}-{pl_id}/page/{page_num}/',
            pl_id, note=f'Downloading playlist page {page_num}')
        river = get_element_by_class('video-playlist-river', webpage) or ''
        yield from (
            self.url_result(video_url, HollywoodReporterIE)
            for video_url in re.findall(r'<a[^>]+href="([^"]+)"[^>]+class="c-title__link', river))

    def _real_extract(self, url):
        """Return a lazily paged playlist, using the URL slug as its title."""
        slug, pl_id = self._match_valid_url(url).group('slug', 'id')
        fetch_page = functools.partial(self._fetch_page, slug, pl_id)
        return self.playlist_result(OnDemandPagedList(fetch_page, 15), pl_id, slug)

View file

@ -0,0 +1,15 @@
from .common import InfoExtractor
class HrefLiRedirectIE(InfoExtractor):
    # href.li links carry the target URL as the whole query string
    IE_NAME = 'href.li'
    IE_DESC = False  # Do not list

    _VALID_URL = r'https?://href\.li/\?(?P<url>.+)'

    _TESTS = [{
        'url': 'https://href.li/?https://www.reddit.com/r/cats/comments/12bluel/my_cat_helps_me_with_water/?utm_source=share&utm_medium=android_app&utm_name=androidcss&utm_term=1&utm_content=share_button',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Return a url result for everything after the '?' in `url`."""
        target_url = self._match_valid_url(url).group('url')
        return self.url_result(target_url)

View file

@ -1,239 +1,199 @@
import itertools
import re
import functools
import urllib.parse
import hashlib
from .common import InfoExtractor
from ..utils import (
ExtractorError,
OnDemandPagedList,
int_or_none,
mimetype2ext,
remove_end,
strip_or_none,
unified_strdate,
url_or_none,
urljoin,
qualities,
traverse_obj,
unified_timestamp,
)
class IwaraBaseIE(InfoExtractor):
_BASE_REGEX = r'(?P<base_url>https?://(?:www\.|ecchi\.)?iwara\.tv)'
def _extract_playlist(self, base_url, webpage):
for path in re.findall(r'class="title">\s*<a[^<]+href="([^"]+)', webpage):
yield self.url_result(urljoin(base_url, path))
class IwaraIE(IwaraBaseIE):
_VALID_URL = fr'{IwaraBaseIE._BASE_REGEX}/videos/(?P<id>[a-zA-Z0-9]+)'
class IwaraIE(InfoExtractor):
IE_NAME = 'iwara'
_VALID_URL = r'https?://(?:www\.|ecchi\.)?iwara\.tv/videos?/(?P<id>[a-zA-Z0-9]+)'
_TESTS = [{
'url': 'http://iwara.tv/videos/amVwUl1EHpAD9RD',
# md5 is unstable
# this video cannot be played because of migration
'only_matching': True,
'url': 'https://www.iwara.tv/video/k2ayoueezfkx6gvq',
'info_dict': {
'id': 'amVwUl1EHpAD9RD',
'id': 'k2ayoueezfkx6gvq',
'ext': 'mp4',
'title': '【MMD R-18】ガールフレンド carry_me_off',
'age_limit': 18,
'thumbnail': 'https://i.iwara.tv/sites/default/files/videos/thumbnails/7951/thumbnail-7951_0001.png',
'uploader': 'Reimu丨Action',
'upload_date': '20150828',
'description': 'md5:1d4905ce48c66c9299c617f08e106e0f',
'title': 'Defeat of Irybelda - アイリベルダの敗北',
'description': 'md5:70278abebe706647a8b4cb04cf23e0d3',
'uploader': 'Inwerwm',
'uploader_id': 'inwerwm',
'tags': 'count:1',
'like_count': 6133,
'view_count': 1050343,
'comment_count': 1,
'timestamp': 1677843869,
'modified_timestamp': 1679056362,
},
}, {
'url': 'http://ecchi.iwara.tv/videos/Vb4yf2yZspkzkBO',
'md5': '7e5f1f359cd51a027ba4a7b7710a50f0',
'url': 'https://iwara.tv/video/1ywe1sbkqwumpdxz5/',
'md5': '20691ce1473ec2766c0788e14c60ce66',
'info_dict': {
'id': '0B1LvuHnL-sRFNXB1WHNqbGw4SXc',
'ext': 'mp4',
'title': '[3D Hentai] Kyonyu × Genkai × Emaki Shinobi Girls.mp4',
'age_limit': 18,
},
'add_ie': ['GoogleDrive'],
}, {
'url': 'http://www.iwara.tv/videos/nawkaumd6ilezzgq',
# md5 is unstable
'info_dict': {
'id': '6liAP9s2Ojc',
'id': '1ywe1sbkqwumpdxz5',
'ext': 'mp4',
'age_limit': 18,
'title': '[MMD] Do It Again Ver.2 [1080p 60FPS] (Motion,Camera,Wav+DL)',
'description': 'md5:590c12c0df1443d833fbebe05da8c47a',
'upload_date': '20160910',
'uploader': 'aMMDsork',
'uploader_id': 'UCVOFyOSCyFkXTYYHITtqB7A',
'title': 'Aponia 阿波尼亚SEX Party Tonight 手动脱衣 大奶 裸腿',
'description': 'md5:0c4c310f2e0592d68b9f771d348329ca',
'uploader': '龙也zZZ',
'uploader_id': 'user792540',
'tags': [
'uncategorized'
],
'like_count': 1809,
'view_count': 25156,
'comment_count': 1,
'timestamp': 1678732213,
'modified_timestamp': 1679110271,
},
'add_ie': ['Youtube'],
}]
def _extract_formats(self, video_id, fileurl):
up = urllib.parse.urlparse(fileurl)
q = urllib.parse.parse_qs(up.query)
paths = up.path.rstrip('/').split('/')
# https://github.com/yt-dlp/yt-dlp/issues/6549#issuecomment-1473771047
x_version = hashlib.sha1('_'.join((paths[-1], q['expires'][0], '5nFp9kmbNnHdAFhaqMvt')).encode()).hexdigest()
preference = qualities(['preview', '360', '540', 'Source'])
files = self._download_json(fileurl, video_id, headers={'X-Version': x_version})
for fmt in files:
yield traverse_obj(fmt, {
'format_id': 'name',
'url': ('src', ('view', 'download'), {self._proto_relative_url}),
'ext': ('type', {mimetype2ext}),
'quality': ('name', {preference}),
'height': ('name', {int_or_none}),
}, get_all=False)
def _real_extract(self, url):
video_id = self._match_id(url)
video_data = self._download_json(f'https://api.iwara.tv/video/{video_id}', video_id, expected_status=lambda x: True)
errmsg = video_data.get('message')
# at this point we can actually get uploaded user info, but do we need it?
if errmsg == 'errors.privateVideo':
self.raise_login_required('Private video. Login if you have permissions to watch')
elif errmsg:
raise ExtractorError(f'Iwara says: {errmsg}')
webpage, urlh = self._download_webpage_handle(url, video_id)
hostname = urllib.parse.urlparse(urlh.geturl()).hostname
# ecchi is 'sexy' in Japanese
age_limit = 18 if hostname.split('.')[0] == 'ecchi' else 0
video_data = self._download_json('http://www.iwara.tv/api/video/%s' % video_id, video_id)
if not video_data:
iframe_url = self._html_search_regex(
r'<iframe[^>]+src=([\'"])(?P<url>[^\'"]+)\1',
webpage, 'iframe URL', group='url')
return {
'_type': 'url_transparent',
'url': iframe_url,
'age_limit': age_limit,
}
title = remove_end(self._html_extract_title(webpage), ' | Iwara')
thumbnail = self._html_search_regex(
r'poster=[\'"]([^\'"]+)', webpage, 'thumbnail', default=None)
uploader = self._html_search_regex(
r'class="username">([^<]+)', webpage, 'uploader', fatal=False)
upload_date = unified_strdate(self._html_search_regex(
r'作成日:([^\s]+)', webpage, 'upload_date', fatal=False))
description = strip_or_none(self._search_regex(
r'<p>(.+?(?=</div))', webpage, 'description', fatal=False,
flags=re.DOTALL))
formats = []
for a_format in video_data:
format_uri = url_or_none(a_format.get('uri'))
if not format_uri:
continue
format_id = a_format.get('resolution')
height = int_or_none(self._search_regex(
r'(\d+)p', format_id, 'height', default=None))
formats.append({
'url': self._proto_relative_url(format_uri, 'https:'),
'format_id': format_id,
'ext': mimetype2ext(a_format.get('mime')) or 'mp4',
'height': height,
'width': int_or_none(height / 9.0 * 16.0 if height else None),
'quality': 1 if format_id == 'Source' else 0,
})
if not video_data.get('fileUrl'):
if video_data.get('embedUrl'):
return self.url_result(video_data.get('embedUrl'))
raise ExtractorError('This video is unplayable', expected=True)
return {
'id': video_id,
'title': title,
'age_limit': age_limit,
'formats': formats,
'thumbnail': self._proto_relative_url(thumbnail, 'https:'),
'uploader': uploader,
'upload_date': upload_date,
'description': description,
'age_limit': 18 if video_data.get('rating') == 'ecchi' else 0, # ecchi is 'sexy' in Japanese
**traverse_obj(video_data, {
'title': 'title',
'description': 'body',
'uploader': ('user', 'name'),
'uploader_id': ('user', 'username'),
'tags': ('tags', ..., 'id'),
'like_count': 'numLikes',
'view_count': 'numViews',
'comment_count': 'numComments',
'timestamp': ('createdAt', {unified_timestamp}),
'modified_timestamp': ('updatedAt', {unified_timestamp}),
'thumbnail': ('file', 'id', {str}, {
lambda x: f'https://files.iwara.tv/image/thumbnail/{x}/thumbnail-00.jpg'}),
}),
'formats': list(self._extract_formats(video_id, video_data.get('fileUrl'))),
}
class IwaraPlaylistIE(IwaraBaseIE):
_VALID_URL = fr'{IwaraBaseIE._BASE_REGEX}/playlist/(?P<id>[^/?#&]+)'
IE_NAME = 'iwara:playlist'
_TESTS = [{
'url': 'https://ecchi.iwara.tv/playlist/best-enf',
'info_dict': {
'title': 'Best enf',
'uploader': 'Jared98112',
'id': 'best-enf',
},
'playlist_mincount': 1097,
}, {
# urlencoded
'url': 'https://ecchi.iwara.tv/playlist/%E3%83%97%E3%83%AC%E3%82%A4%E3%83%AA%E3%82%B9%E3%83%88-2',
'info_dict': {
'id': 'プレイリスト-2',
'title': 'プレイリスト',
'uploader': 'mainyu',
},
'playlist_mincount': 91,
}]
def _real_extract(self, url):
playlist_id, base_url = self._match_valid_url(url).group('id', 'base_url')
playlist_id = urllib.parse.unquote(playlist_id)
webpage = self._download_webpage(url, playlist_id)
return {
'_type': 'playlist',
'id': playlist_id,
'title': self._html_search_regex(r'class="title"[^>]*>([^<]+)', webpage, 'title', fatal=False),
'uploader': self._html_search_regex(r'<h2>([^<]+)', webpage, 'uploader', fatal=False),
'entries': self._extract_playlist(base_url, webpage),
}
class IwaraUserIE(IwaraBaseIE):
_VALID_URL = fr'{IwaraBaseIE._BASE_REGEX}/users/(?P<id>[^/?#&]+)'
class IwaraUserIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?iwara\.tv/profile/(?P<id>[^/?#&]+)'
IE_NAME = 'iwara:user'
_PER_PAGE = 32
_TESTS = [{
'note': 'number of all videos page is just 1 page. less than 40 videos',
'url': 'https://ecchi.iwara.tv/users/infinityyukarip',
'url': 'https://iwara.tv/profile/user792540/videos',
'info_dict': {
'title': 'Uploaded videos from Infinity_YukariP',
'id': 'infinityyukarip',
'uploader': 'Infinity_YukariP',
'uploader_id': 'infinityyukarip',
'id': 'user792540',
},
'playlist_mincount': 39,
'playlist_mincount': 80,
}, {
'note': 'no even all videos page. probably less than 10 videos',
'url': 'https://ecchi.iwara.tv/users/mmd-quintet',
'url': 'https://iwara.tv/profile/theblackbirdcalls/videos',
'info_dict': {
'title': 'Uploaded videos from mmd quintet',
'id': 'mmd-quintet',
'uploader': 'mmd quintet',
'uploader_id': 'mmd-quintet',
},
'playlist_mincount': 6,
}, {
'note': 'has paging. more than 40 videos',
'url': 'https://ecchi.iwara.tv/users/theblackbirdcalls',
'info_dict': {
'title': 'Uploaded videos from TheBlackbirdCalls',
'id': 'theblackbirdcalls',
'uploader': 'TheBlackbirdCalls',
'uploader_id': 'theblackbirdcalls',
},
'playlist_mincount': 420,
'playlist_mincount': 723,
}, {
'note': 'foreign chars in URL. there must be foreign characters in URL',
'url': 'https://ecchi.iwara.tv/users/ぶた丼',
'info_dict': {
'title': 'Uploaded videos from ぶた丼',
'id': 'ぶた丼',
'uploader': 'ぶた丼',
'uploader_id': 'ぶた丼',
},
'playlist_mincount': 170,
'url': 'https://iwara.tv/profile/user792540',
'only_matching': True,
}, {
'url': 'https://iwara.tv/profile/theblackbirdcalls',
'only_matching': True,
}]
def _entries(self, playlist_id, base_url):
webpage = self._download_webpage(
f'{base_url}/users/{playlist_id}', playlist_id)
videos_url = self._search_regex(r'<a href="(/users/[^/]+/videos)(?:\?[^"]+)?">', webpage, 'all videos url', default=None)
if not videos_url:
yield from self._extract_playlist(base_url, webpage)
return
videos_url = urljoin(base_url, videos_url)
for n in itertools.count(1):
page = self._download_webpage(
videos_url, playlist_id, note=f'Downloading playlist page {n}',
query={'page': str(n - 1)} if n > 1 else {})
yield from self._extract_playlist(
base_url, page)
if f'page={n}' not in page:
break
def _entries(self, playlist_id, user_id, page):
videos = self._download_json(
'https://api.iwara.tv/videos', playlist_id,
note=f'Downloading page {page}',
query={
'page': page,
'sort': 'date',
'user': user_id,
'limit': self._PER_PAGE,
})
for x in traverse_obj(videos, ('results', ..., 'id')):
yield self.url_result(f'https://iwara.tv/video/{x}')
def _real_extract(self, url):
playlist_id, base_url = self._match_valid_url(url).group('id', 'base_url')
playlist_id = urllib.parse.unquote(playlist_id)
playlist_id = self._match_id(url)
user_info = self._download_json(
f'https://api.iwara.tv/profile/{playlist_id}', playlist_id,
note='Requesting user info')
user_id = traverse_obj(user_info, ('user', 'id'))
return self.playlist_result(
self._entries(playlist_id, base_url), playlist_id)
OnDemandPagedList(
functools.partial(self._entries, playlist_id, user_id),
self._PER_PAGE),
playlist_id, traverse_obj(user_info, ('user', 'name')))
class IwaraPlaylistIE(InfoExtractor):
    # The ID is a UUID, but a loose hex-and-dash pattern is enough to match it
    _VALID_URL = r'https?://(?:www\.)?iwara\.tv/playlist/(?P<id>[0-9a-f-]+)'
    IE_NAME = 'iwara:playlist'
    _PER_PAGE = 32

    _TESTS = [{
        'url': 'https://iwara.tv/playlist/458e5486-36a4-4ac0-b233-7e9eef01025f',
        'info_dict': {
            'id': '458e5486-36a4-4ac0-b233-7e9eef01025f',
        },
        'playlist_mincount': 3,
    }]

    def _entries(self, playlist_id, first_page, page):
        """Yield url results for the videos on one page of the playlist.

        Page 0 reuses `first_page`, the response already downloaded by
        `_real_extract`. Every later page is requested from the same
        per-playlist endpoint, so results stay restricted to this playlist;
        the site-wide `/videos` listing has no playlist filter.
        """
        videos = self._download_json(
            f'https://api.iwara.tv/playlist/{playlist_id}', playlist_id, f'Downloading page {page}',
            query={'page': page, 'limit': self._PER_PAGE}) if page else first_page
        for x in traverse_obj(videos, ('results', ..., 'id')):
            yield self.url_result(f'https://iwara.tv/video/{x}')

    def _real_extract(self, url):
        """Return a lazily paged playlist; the first page also supplies the title."""
        playlist_id = self._match_id(url)
        page_0 = self._download_json(
            f'https://api.iwara.tv/playlist/{playlist_id}?page=0&limit={self._PER_PAGE}', playlist_id,
            note='Requesting playlist info')

        return self.playlist_result(
            OnDemandPagedList(
                functools.partial(self._entries, playlist_id, page_0),
                self._PER_PAGE),
            playlist_id, traverse_obj(page_0, ('title', 'name')))

View file

@ -8,14 +8,16 @@ class JWPlatformIE(InfoExtractor):
_VALID_URL = r'(?:https?://(?:content\.jwplatform|cdn\.jwplayer)\.com/(?:(?:feed|player|thumb|preview|manifest)s|jw6|v2/media)/|jwplatform:)(?P<id>[a-zA-Z0-9]{8})'
_TESTS = [{
'url': 'http://content.jwplatform.com/players/nPripu9l-ALJ3XQCI.js',
'md5': 'fa8899fa601eb7c83a64e9d568bdf325',
'md5': '3aa16e4f6860e6e78b7df5829519aed3',
'info_dict': {
'id': 'nPripu9l',
'ext': 'mov',
'ext': 'mp4',
'title': 'Big Buck Bunny Trailer',
'description': 'Big Buck Bunny is a short animated film by the Blender Institute. It is made using free and open source software.',
'upload_date': '20081127',
'timestamp': 1227796140,
'duration': 32.0,
'thumbnail': 'https://cdn.jwplayer.com/v2/media/nPripu9l/poster.jpg?width=720',
}
}, {
'url': 'https://cdn.jwplayer.com/players/nPripu9l-ALJ3XQCI.js',
@ -37,18 +39,31 @@ class JWPlatformIE(InfoExtractor):
},
}, {
# Player url not surrounded by quotes
'url': 'https://www.deutsche-kinemathek.de/en/online/streaming/darling-berlin',
'url': 'https://www.deutsche-kinemathek.de/en/online/streaming/school-trip',
'info_dict': {
'id': 'R10NQdhY',
'title': 'Playgirl',
'id': 'jUxh5uin',
'title': 'Klassenfahrt',
'ext': 'mp4',
'upload_date': '20220624',
'thumbnail': 'https://cdn.jwplayer.com/v2/media/R10NQdhY/poster.jpg?width=720',
'timestamp': 1656064800,
'description': 'BRD 1966, Will Tremper',
'duration': 5146.0,
'upload_date': '20230109',
'thumbnail': 'https://cdn.jwplayer.com/v2/media/jUxh5uin/poster.jpg?width=720',
'timestamp': 1673270298,
'description': '',
'duration': 5193.0,
},
'params': {'allowed_extractors': ['generic', 'jwplatform']},
}, {
# iframe src attribute includes backslash before URL string
'url': 'https://www.elespectador.com/colombia/video-asi-se-evito-la-fuga-de-john-poulos-presunto-feminicida-de-valentina-trespalacios-explicacion',
'info_dict': {
'id': 'QD3gsexj',
'title': 'Así se evitó la fuga de John Poulos, presunto feminicida de Valentina Trespalacios',
'ext': 'mp4',
'upload_date': '20230127',
'thumbnail': 'https://cdn.jwplayer.com/v2/media/QD3gsexj/poster.jpg?width=720',
'timestamp': 1674862986,
'description': 'md5:128fd74591c4e1fc2da598c5cb6f5ce4',
'duration': 263.0,
},
}]
@classmethod
@ -57,7 +72,7 @@ def _extract_embed_urls(cls, url, webpage):
# <input value=URL> is used by hyland.com
# if we find <iframe>, dont look for <input>
ret = re.findall(
r'<%s[^>]+?%s=["\']?((?:https?:)?//(?:content\.jwplatform|cdn\.jwplayer)\.com/players/[a-zA-Z0-9]{8})' % (tag, key),
r'<%s[^>]+?%s=\\?["\']?((?:https?:)?//(?:content\.jwplatform|cdn\.jwplayer)\.com/players/[a-zA-Z0-9]{8})' % (tag, key),
webpage)
if ret:
return ret

View file

@ -14,7 +14,7 @@
class KickBaseIE(InfoExtractor):
def _real_initialize(self):
self._request_webpage(HEADRequest('https://kick.com/'), None, 'Setting up session')
self._request_webpage(HEADRequest('https://kick.com/'), None, 'Setting up session', fatal=False)
xsrf_token = self._get_cookies('https://kick.com/').get('XSRF-TOKEN')
if not xsrf_token:
self.write_debug('kick.com did not set XSRF-TOKEN cookie')

View file

@ -1,33 +1,24 @@
import itertools
import re
from .common import InfoExtractor
from ..utils import int_or_none, format_field
from ..utils import int_or_none, parse_qs, traverse_obj
class LastFMPlaylistBaseIE(InfoExtractor):
def _entries(self, url, playlist_id):
webpage = self._download_webpage(url, playlist_id)
start_page_number = int_or_none(self._search_regex(
r'\bpage=(\d+)', url, 'page', default=None)) or 1
last_page_number = int_or_none(self._search_regex(
r'>(\d+)</a>[^<]*</li>[^<]*<li[^>]+class="pagination-next', webpage, 'last_page', default=None))
for page_number in range(start_page_number, (last_page_number or start_page_number) + 1):
single_page = traverse_obj(parse_qs(url), ('page', -1, {int_or_none}))
for page in itertools.count(single_page or 1):
webpage = self._download_webpage(
url, playlist_id,
note='Downloading page %d%s' % (page_number, format_field(last_page_number, None, ' of %d')),
query={'page': page_number})
page_entries = [
self.url_result(player_url, 'Youtube')
for player_url in set(re.findall(r'data-youtube-url="([^"]+)"', webpage))
]
for e in page_entries:
yield e
url, playlist_id, f'Downloading page {page}', query={'page': page})
videos = re.findall(r'data-youtube-url="([^"]+)"', webpage)
yield from videos
if single_page or not videos:
return
def _real_extract(self, url):
playlist_id = self._match_id(url)
return self.playlist_result(self._entries(url, playlist_id), playlist_id)
return self.playlist_from_matches(self._entries(url, playlist_id), playlist_id, ie='Youtube')
class LastFMPlaylistIE(LastFMPlaylistBaseIE):
@ -37,7 +28,7 @@ class LastFMPlaylistIE(LastFMPlaylistBaseIE):
'info_dict': {
'id': 'Oasis',
},
'playlist_count': 11,
'playlist_mincount': 11,
}, {
'url': 'https://www.last.fm/music/Oasis',
'only_matching': True,
@ -73,6 +64,18 @@ class LastFMUserIE(LastFMPlaylistBaseIE):
'id': '12319471',
},
'playlist_count': 30,
}, {
'url': 'https://www.last.fm/user/naamloos1/playlists/12543760',
'info_dict': {
'id': '12543760',
},
'playlist_mincount': 80,
}, {
'url': 'https://www.last.fm/user/naamloos1/playlists/12543760?page=3',
'info_dict': {
'id': '12543760',
},
'playlist_count': 32,
}]

View file

@ -1,143 +0,0 @@
from .common import InfoExtractor
from ..compat import compat_str
from ..utils import (
format_field,
int_or_none,
str_or_none,
)
class LineLiveBaseIE(InfoExtractor):
    # Shared pieces for the LINE LIVE extractors: the API root and the
    # conversion of a broadcast item into an info dict.
    _API_BASE_URL = 'https://live-api.line-apps.com/web/v4.0/channel/'

    def _parse_broadcast_item(self, item):
        """Build the metadata part of an info dict (no formats) from a broadcast item.

        'id' and 'title' are read with plain indexing, so an item lacking
        either raises KeyError; every other field is optional.
        """
        broadcast_id = compat_str(item['id'])
        title = item['title']

        # Keep only the thumbnail variants that actually carry a URL
        thumbnail_urls = item.get('thumbnailURLs') or {}
        thumbnails = [
            {'id': name, 'url': thumb_url}
            for name, thumb_url in thumbnail_urls.items()
            if thumb_url
        ]

        channel_info = item.get('channel') or {}
        channel_id = str_or_none(channel_info.get('id'))

        info = {
            'id': broadcast_id,
            'title': title,
            'thumbnails': thumbnails,
            'timestamp': int_or_none(item.get('createdAt')),
            'channel': channel_info.get('name'),
            'channel_id': channel_id,
            'channel_url': format_field(channel_id, None, 'https://live.line.me/channels/%s'),
            'duration': int_or_none(item.get('archiveDuration')),
            'view_count': int_or_none(item.get('viewerCount')),
            'comment_count': int_or_none(item.get('chatCount')),
            'is_live': item.get('isBroadcastingNow'),
        }
        return info
class LineLiveIE(LineLiveBaseIE):
    # Extractor for a single broadcast page (channels/<channel_id>/broadcast/<id>).
    _VALID_URL = r'https?://live\.line\.me/channels/(?P<channel_id>\d+)/broadcast/(?P<id>\d+)'
    _TESTS = [{
        'url': 'https://live.line.me/channels/5833718/broadcast/18373277',
        'md5': '2c15843b8cb3acd55009ddcb2db91f7c',
        'info_dict': {
            'id': '18373277',
            'title': '2021/12/05 15分犬定例譲渡会🐶',
            'ext': 'mp4',
            'timestamp': 1638674925,
            'upload_date': '20211205',
            'thumbnail': 'md5:e1f5817e60f4a72b7e43377cf308d7ef',
            'channel_url': 'https://live.line.me/channels/5833718',
            'channel': 'Yahooニュース掲載🗞プロフ見てね🐕🐕',
            'channel_id': '5833718',
            'duration': 937,
            'view_count': int,
            'comment_count': int,
            'is_live': False,
        }
    }, {
        # archiveStatus == 'DELETED'
        'url': 'https://live.line.me/channels/4778159/broadcast/16378488',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Fetch the broadcast JSON and turn its HLS URL map into formats.

        Returns the info dict from `_parse_broadcast_item` with 'formats'
        added. When no format is found and the item is not reported as
        'ARCHIVED', the condition is reported through `raise_no_formats`.
        """
        channel_id, broadcast_id = self._match_valid_url(url).groups()
        broadcast = self._download_json(
            self._API_BASE_URL + '%s/broadcast/%s' % (channel_id, broadcast_id),
            broadcast_id)
        item = broadcast['item']
        info = self._parse_broadcast_item(item)
        protocol = 'm3u8' if info['is_live'] else 'm3u8_native'

        # The URL map lives under 'liveHLSURLs' or 'archivedHLSURLs'
        hls_urls_key = ('live' if info['is_live'] else 'archived') + 'HLSURLs'
        formats = []
        for k, v in (broadcast.get(hls_urls_key) or {}).items():
            if not v:
                continue
            if k == 'abr':
                # The 'abr' entry is a playlist to expand; the others are used as direct URLs
                formats.extend(self._extract_m3u8_formats(
                    v, broadcast_id, 'mp4', protocol,
                    m3u8_id='hls', fatal=False))
                continue
            f = {
                'ext': 'mp4',
                'format_id': 'hls-' + k,
                'protocol': protocol,
                'url': v,
            }
            # Non-numeric keys are flagged as having no video stream
            if not k.isdigit():
                f['vcodec'] = 'none'
            formats.append(f)

        if not formats:
            archive_status = item.get('archiveStatus')
            if archive_status != 'ARCHIVED':
                # 'archiveStatus' may be absent (None) or not a string; calling
                # .lower() on it unconditionally would raise AttributeError and
                # hide the real "no formats" condition.
                if isinstance(archive_status, str):
                    message = 'this video has been ' + archive_status.lower()
                else:
                    message = 'this video is unavailable (no archive status reported)'
                self.raise_no_formats(message, expected=True)

        info['formats'] = formats
        return info
class LineLiveChannelIE(LineLiveBaseIE):
    # Playlist extractor for a channel page; the lookahead keeps single
    # broadcast URLs out of this extractor.
    _VALID_URL = r'https?://live\.line\.me/channels/(?P<id>\d+)(?!/broadcast/\d+)(?:[/?&#]|$)'
    _TEST = {
        'url': 'https://live.line.me/channels/5893542',
        'info_dict': {
            'id': '5893542',
            'title': 'いくらちゃんだよぉ🦒',
            'description': 'md5:4d418087973ad081ceb1b3481f0b1816',
        },
        'playlist_mincount': 29
    }

    def _archived_broadcasts_entries(self, archived_broadcasts, channel_id):
        """Yield url-type entries for every archived broadcast, following pagination.

        `archived_broadcasts` is the first page (a dict with optional 'rows'
        and 'hasNextPage'); further pages are requested with the id of the
        last entry yielded from the current page as 'lastId'.
        """
        while True:
            # Reset per page: a page that yields nothing must not reuse an
            # id from an earlier page (that would request the same page again).
            last_id = None
            for row in (archived_broadcasts.get('rows') or []):
                share_url = str_or_none(row.get('shareURL'))
                if not share_url:
                    continue
                info = self._parse_broadcast_item(row)
                info.update({
                    '_type': 'url',
                    'url': share_url,
                    'ie_key': LineLiveIE.ie_key(),
                })
                # Capture the id before handing the dict to the consumer
                last_id = info['id']
                yield info
            if not archived_broadcasts.get('hasNextPage'):
                return
            if last_id is None:
                # Nothing usable on this page, so there is no cursor to continue from.
                # Previously this hit an unbound/stale `info` lookup.
                return
            archived_broadcasts = self._download_json(
                self._API_BASE_URL + channel_id + '/archived_broadcasts',
                channel_id, query={
                    'lastId': last_id,
                })

    def _real_extract(self, url):
        """Download the channel JSON and return its archived broadcasts as a playlist."""
        channel_id = self._match_id(url)
        channel = self._download_json(self._API_BASE_URL + channel_id, channel_id)
        return self.playlist_result(
            self._archived_broadcasts_entries(channel.get('archivedBroadcasts') or {}, channel_id),
            channel_id, channel.get('title'), channel.get('information'))

View file

@ -8,12 +8,12 @@
float_or_none,
int_or_none,
str_or_none,
traverse_obj,
traverse_obj
)
class MedalTVIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?medal\.tv/(?P<path>games/[^/?#&]+/clips)/(?P<id>[^/?#&]+)'
_VALID_URL = r'https?://(?:www\.)?medal\.tv/games/[^/?#&]+/clips/(?P<id>[^/?#&]+)'
_TESTS = [{
'url': 'https://medal.tv/games/valorant/clips/jTBFnLKdLy15K',
'md5': '6930f8972914b6b9fdc2bb3918098ba0',
@ -80,25 +80,14 @@ class MedalTVIE(InfoExtractor):
def _real_extract(self, url):
video_id = self._match_id(url)
path = self._match_valid_url(url).group('path')
webpage = self._download_webpage(url, video_id)
next_data = self._search_json(
'<script[^>]*__NEXT_DATA__[^>]*>', webpage,
hydration_data = self._search_json(
r'<script[^>]*>[^<]*\bhydrationData\s*=', webpage,
'next data', video_id, end_pattern='</script>', fatal=False)
build_id = next_data.get('buildId')
if not build_id:
raise ExtractorError(
'Could not find build ID.', video_id=video_id)
locale = next_data.get('locale', 'en')
api_response = self._download_json(
f'https://medal.tv/_next/data/{build_id}/{locale}/{path}/{video_id}.json', video_id)
clip = traverse_obj(api_response, ('pageProps', 'clip')) or {}
clip = traverse_obj(hydration_data, ('clips', ...), get_all=False)
if not clip:
raise ExtractorError(
'Could not find video information.', video_id=video_id)
@ -152,7 +141,7 @@ def add_item(container, item_url, height, id_key='format_id', item_id=None):
# Necessary because the id of the author is not known in advance.
# Won't raise an issue if no profile can be found as this is optional.
author = traverse_obj(api_response, ('pageProps', 'profile')) or {}
author = traverse_obj(hydration_data, ('profiles', ...), get_all=False) or {}
author_id = str_or_none(author.get('userId'))
author_url = format_field(author_id, None, 'https://medal.tv/users/%s')

View file

@ -2,16 +2,44 @@
from .common import InfoExtractor
from ..utils import (
clean_html,
remove_end,
str_or_none,
strip_or_none,
traverse_obj,
urljoin,
)
class MediaStreamIE(InfoExtractor):
_VALID_URL = r'https?://mdstrm.com/(?:embed|live-stream)/(?P<id>\w+)'
class MediaStreamBaseIE(InfoExtractor):
    # Common URL constants and embed discovery for extractors that hand off
    # to mdstrm.com player pages.
    _EMBED_BASE_URL = 'https://mdstrm.com/embed'
    _BASE_URL_RE = r'https?://mdstrm\.com/(?:embed|live-stream)'

    def _extract_mediastream_urls(self, webpage):
        """Yield mdstrm.com player URLs found in `webpage`.

        Sources are tried in this order: JSON-LD VideoObject URLs, inline
        `playerMdStream.mdstreamVideo(...)` calls, <iframe> sources, and
        MediaStreamVideoPlayer elements carrying a data-video-id attribute.
        """
        def only_player_url(candidate):
            # Drop JSON-LD URLs that do not point at an mdstrm.com player page
            return candidate if re.match(rf'{self._BASE_URL_RE}/\w+', candidate) else None

        json_ld_items = list(self._yield_json_ld(webpage, None))
        yield from traverse_obj(json_ld_items, (
            lambda _, v: v['@type'] == 'VideoObject', ('embedUrl', 'contentUrl'),
            {only_player_url}))

        for script_match in re.finditer(
                r'<script[^>]+>[^>]*playerMdStream\.mdstreamVideo\(\s*[\'"](?P<video_id>\w+)', webpage):
            yield f'{self._EMBED_BASE_URL}/{script_match.group("video_id")}'

        yield from re.findall(
            rf'<iframe[^>]+\bsrc="({self._BASE_URL_RE}/\w+)', webpage)

        for player_match in re.finditer(
                r'''(?x)
                <(?:div|ps-mediastream)[^>]+
                (class="[^"]*MediaStreamVideoPlayer)[^"]*"[^>]+
                data-video-id="(?P<video_id>\w+)"
                (?:\s*data-video-type="(?P<video_type>[^"]+))?
                (?:[^>]*>\s*<div[^>]+\1[^"]*"[^>]+data-mediastream=["\'][^>]+
                https://mdstrm\.com/(?P<live>live-stream))?
                ''', webpage):
            # Either an explicit "live" type or a nested live-stream link marks a livestream
            if player_match.group('video_type') == 'live' or player_match.group('live'):
                video_type = 'live-stream'
            else:
                video_type = 'embed'
            yield f'https://mdstrm.com/{video_type}/{player_match.group("video_id")}'
class MediaStreamIE(MediaStreamBaseIE):
_VALID_URL = MediaStreamBaseIE._BASE_URL_RE + r'/(?P<id>\w+)'
_TESTS = [{
'url': 'https://mdstrm.com/embed/6318e3f1d1d316083ae48831',
@ -23,6 +51,7 @@ class MediaStreamIE(InfoExtractor):
'thumbnail': r're:^https?://[^?#]+6318e3f1d1d316083ae48831',
'ext': 'mp4',
},
'params': {'skip_download': 'm3u8'},
}]
_WEBPAGE_TESTS = [{
@ -35,9 +64,7 @@ class MediaStreamIE(InfoExtractor):
'ext': 'mp4',
'live_status': 'is_live',
},
'params': {
'skip_download': 'Livestream'
},
'params': {'skip_download': 'Livestream'},
}, {
'url': 'https://www.multimedios.com/television/clases-de-llaves-y-castigos-quien-sabe-mas',
'md5': 'de31f0b1ecc321fb35bf22d58734ea40',
@ -48,6 +75,7 @@ class MediaStreamIE(InfoExtractor):
'thumbnail': 're:^https?://[^?#]+63731bab8ec9b308a2c9ed28',
'ext': 'mp4',
},
'params': {'skip_download': 'm3u8'},
}, {
'url': 'https://www.americatv.com.pe/videos/esto-es-guerra/facundo-gonzalez-sufrio-fuerte-golpe-durante-competencia-frente-hugo-garcia-eeg-noticia-139120',
'info_dict': {
@ -57,6 +85,7 @@ class MediaStreamIE(InfoExtractor):
'thumbnail': 're:^https?://[^?#]+63756df1c638b008a5659dec',
'ext': 'mp4',
},
'params': {'skip_download': 'm3u8'},
}, {
'url': 'https://www.americatv.com.pe/videos/al-fondo-hay-sitio/nuevas-lomas-town-bernardo-mata-se-enfrento-sujeto-luchar-amor-macarena-noticia-139083',
'info_dict': {
@ -66,26 +95,12 @@ class MediaStreamIE(InfoExtractor):
'thumbnail': 're:^https?://[^?#]+637307669609130f74cd3a6e',
'ext': 'mp4',
},
'params': {'skip_download': 'm3u8'},
}]
@classmethod
def _extract_embed_urls(cls, url, webpage):
for mobj in re.finditer(r'<script[^>]+>[^>]*playerMdStream.mdstreamVideo\(\s*[\'"](?P<video_id>\w+)', webpage):
yield f'https://mdstrm.com/embed/{mobj.group("video_id")}'
yield from re.findall(
r'<iframe[^>]src\s*=\s*"(https://mdstrm.com/[\w-]+/\w+)', webpage)
for mobj in re.finditer(
r'''(?x)
<(?:div|ps-mediastream)[^>]+
class\s*=\s*"[^"]*MediaStreamVideoPlayer[^"]*"[^>]+
data-video-id\s*=\s*"(?P<video_id>\w+)\s*"
(?:\s*data-video-type\s*=\s*"(?P<video_type>[^"]+))?
''', webpage):
video_type = 'live-stream' if mobj.group('video_type') == 'live' else 'embed'
yield f'https://mdstrm.com/{video_type}/{mobj.group("video_id")}'
def _extract_from_webpage(self, url, webpage):
for embed_url in self._extract_mediastream_urls(webpage):
yield self.url_result(embed_url, MediaStreamIE, None)
def _real_extract(self, url):
video_id = self._match_id(url)
@ -94,7 +109,7 @@ def _real_extract(self, url):
if 'Debido a tu ubicación no puedes ver el contenido' in webpage:
self.raise_geo_restricted()
player_config = self._search_json(r'window.MDSTRM.OPTIONS\s*=', webpage, 'metadata', video_id)
player_config = self._search_json(r'window\.MDSTRM\.OPTIONS\s*=', webpage, 'metadata', video_id)
formats, subtitles = [], {}
for video_format in player_config['src']:
@ -122,7 +137,7 @@ def _real_extract(self, url):
}
class WinSportsVideoIE(InfoExtractor):
class WinSportsVideoIE(MediaStreamBaseIE):
_VALID_URL = r'https?://www\.winsports\.co/videos/(?P<id>[\w-]+)'
_TESTS = [{
@ -158,21 +173,36 @@ class WinSportsVideoIE(InfoExtractor):
'ext': 'mp4',
},
'params': {'skip_download': 'm3u8'},
}, {
'url': 'https://www.winsports.co/videos/bucaramanga-se-quedo-con-el-grito-de-gol-en-la-garganta',
'info_dict': {
'id': '6402adb62bbf3b18d454e1b0',
'display_id': 'bucaramanga-se-quedo-con-el-grito-de-gol-en-la-garganta',
'title': '⚽Bucaramanga se quedó con el grito de gol en la garganta',
'description': 'Gol anulado Bucaramanga',
'thumbnail': r're:^https?://[^?#]+6402adb62bbf3b18d454e1b0',
'ext': 'mp4',
},
'params': {'skip_download': 'm3u8'},
}]
def _real_extract(self, url):
display_id = self._match_id(url)
webpage = self._download_webpage(url, display_id)
json_ld = self._search_json_ld(webpage, display_id, expected_type='VideoObject', default={})
media_setting_json = self._search_json(
r'<script\s*[^>]+data-drupal-selector="drupal-settings-json">', webpage, 'drupal-setting-json', display_id)
data = self._search_json(
r'<script\s*[^>]+data-drupal-selector="drupal-settings-json">', webpage, 'data', display_id)
mediastream_id = traverse_obj(
media_setting_json, ('settings', 'mediastream_formatter', ..., 'mediastream_id', {str_or_none}),
get_all=False) or json_ld.get('url')
if not mediastream_id:
mediastream_url = urljoin(f'{self._EMBED_BASE_URL}/', (
traverse_obj(data, (
(('settings', 'mediastream_formatter', ..., 'mediastream_id'), 'url'), {str}), get_all=False)
or next(self._extract_mediastream_urls(webpage), None)))
if not mediastream_url:
self.raise_no_formats('No MediaStream embed found in webpage')
title = clean_html(remove_end(
self._search_json_ld(webpage, display_id, expected_type='VideoObject', default={}).get('title')
or self._og_search_title(webpage), '| Win Sports'))
return self.url_result(
urljoin('https://mdstrm.com/embed/', mediastream_id), MediaStreamIE, display_id, url_transparent=True,
display_id=display_id, video_title=strip_or_none(remove_end(json_ld.get('title'), '| Win Sports')))
mediastream_url, MediaStreamIE, display_id, url_transparent=True, display_id=display_id, video_title=title)

View file

@ -12,9 +12,13 @@
RegexNotFoundError,
UserNotLive,
clean_html,
determine_ext,
float_or_none,
int_or_none,
mimetype2ext,
parse_age_limit,
parse_duration,
remove_end,
smuggle_url,
traverse_obj,
try_get,
@ -22,7 +26,6 @@
unified_timestamp,
update_url_query,
url_basename,
xpath_attr,
)
@ -660,6 +663,7 @@ class NBCStationsIE(InfoExtractor):
'ext': 'mp4',
'title': 'Large Structure Fire in Downtown LA Prompts Smoke Odor Advisory',
'description': 'md5:417ed3c2d91fe9d301e6db7b0942f182',
'duration': 112.513,
'timestamp': 1661135892,
'upload_date': '20220822',
'uploader': 'NBC 4',
@ -676,6 +680,7 @@ class NBCStationsIE(InfoExtractor):
'ext': 'mp4',
'title': 'Huracán complica que televidente de Tucson reciba reembolso',
'description': 'md5:af298dc73aab74d4fca6abfb12acb6cf',
'duration': 172.406,
'timestamp': 1660886507,
'upload_date': '20220819',
'uploader': 'Telemundo Arizona',
@ -685,6 +690,22 @@ class NBCStationsIE(InfoExtractor):
'params': {
'skip_download': 'm3u8',
},
}, {
# direct mp4 link
'url': 'https://www.nbcboston.com/weather/video-weather/highs-near-freezing-in-boston-on-wednesday/2961135/',
'md5': '9bf8c41dc7abbb75b1a44f1491a4cc85',
'info_dict': {
'id': '2961135',
'ext': 'mp4',
'title': 'Highs Near Freezing in Boston on Wednesday',
'description': 'md5:3ec486609a926c99f00a3512e6c0e85b',
'duration': 235.669,
'timestamp': 1675268656,
'upload_date': '20230201',
'uploader': '',
'channel_id': 'WBTS',
'channel': 'nbcboston',
},
}]
_RESOLUTIONS = {
@ -711,7 +732,7 @@ def _real_extract(self, url):
if not video_data:
raise ExtractorError('No video metadata found in webpage', expected=True)
info, formats, subtitles = {}, [], {}
info, formats = {}, []
is_live = int_or_none(video_data.get('mpx_is_livestream')) == 1
query = {
'formats': 'MPEG-DASH none,M3U none,MPEG-DASH none,MPEG4,MP3',
@ -747,13 +768,14 @@ def _real_extract(self, url):
video_url = traverse_obj(video_data, ((None, ('video', 'meta')), 'mp4_url'), get_all=False)
if video_url:
ext = determine_ext(video_url)
height = self._search_regex(r'\d+-(\d+)p', url_basename(video_url), 'height', default=None)
formats.append({
'url': video_url,
'ext': 'mp4',
'ext': ext,
'width': int_or_none(self._RESOLUTIONS.get(height)),
'height': int_or_none(height),
'format_id': 'http-mp4',
'format_id': f'http-{ext}',
})
info.update({
@ -770,14 +792,25 @@ def _real_extract(self, url):
smil = self._download_xml(
f'https://link.theplatform.com/s/{pdk_acct}/{player_id}', video_id,
note='Downloading SMIL data', query=query, fatal=is_live)
if smil:
manifest_url = xpath_attr(smil, f'.//{{{default_ns}}}video', 'src', fatal=is_live)
subtitles = self._parse_smil_subtitles(smil, default_ns)
subtitles = self._parse_smil_subtitles(smil, default_ns) if smil else {}
for video in smil.findall(self._xpath_ns('.//video', default_ns)) if smil else []:
info['duration'] = float_or_none(remove_end(video.get('dur'), 'ms'), 1000)
video_src_url = video.get('src')
ext = mimetype2ext(video.get('type'), default=determine_ext(video_src_url))
if ext == 'm3u8':
fmts, subs = self._extract_m3u8_formats_and_subtitles(
manifest_url, video_id, 'mp4', m3u8_id='hls', fatal=is_live,
video_src_url, video_id, 'mp4', m3u8_id='hls', fatal=is_live,
live=is_live, errnote='No HLS formats found')
formats.extend(fmts)
self._merge_subtitles(subs, target=subtitles)
elif video_src_url:
formats.append({
'url': video_src_url,
'format_id': f'https-{ext}',
'ext': ext,
'width': int_or_none(video.get('width')),
'height': int_or_none(video.get('height')),
})
if not formats:
self.raise_no_formats('No video content found in webpage', expected=True)

View file

@ -5,7 +5,7 @@
from .common import InfoExtractor
from ..utils import ExtractorError, parse_iso8601
_BASE_URL_RE = r'https?://(?:www\.)?(?:watchnebula\.com|nebula\.app|nebula\.tv)'
_BASE_URL_RE = r'https?://(?:www\.|beta\.)?(?:watchnebula\.com|nebula\.app|nebula\.tv)'
class NebulaBaseIE(InfoExtractor):
@ -183,6 +183,10 @@ class NebulaIE(NebulaBaseIE):
'url': 'https://watchnebula.com/videos/money-episode-1-the-draw',
'only_matching': True,
},
{
'url': 'https://beta.nebula.tv/videos/money-episode-1-the-draw',
'only_matching': True,
},
]
def _fetch_video_metadata(self, slug):

View file

@ -6,7 +6,8 @@
traverse_obj,
unescapeHTML,
unified_timestamp,
urljoin
urljoin,
url_or_none
)
@ -334,3 +335,140 @@ def _real_extract(self, url):
for x in traverse_obj(bangumi_list, ('part', ..., 'part-video-dasid')) or []]
return self.playlist_result(bangumis, program_id, title, description)
class NhkRadiruIE(InfoExtractor):
    # On-demand radio programmes. The `p` query value is
    # <site>_<corner>[_<headline>]; without a headline the URL is a playlist.
    _GEO_COUNTRIES = ['JP']
    IE_DESC = 'NHK らじる (Radiru/Rajiru)'
    _VALID_URL = r'https?://www\.nhk\.or\.jp/radio/(?:player/ondemand|ondemand/detail)\.html\?p=(?P<site>[\da-zA-Z]+)_(?P<corner>[\da-zA-Z]+)(?:_(?P<headline>[\da-zA-Z]+))?'
    _TESTS = [{
        'url': 'https://www.nhk.or.jp/radio/player/ondemand.html?p=0449_01_3853544',
        'skip': 'Episode expired on 2023-04-16',
        'info_dict': {
            'channel': 'NHK-FM',
            'description': 'md5:94b08bdeadde81a97df4ec882acce3e9',
            'ext': 'm4a',
            'id': '0449_01_3853544',
            'series': 'ジャズ・トゥナイト',
            'thumbnail': 'https://www.nhk.or.jp/prog/img/449/g449.jpg',
            'timestamp': 1680969600,
            'title': 'ジャズ・トゥナイト NEWジャズ特集',
            'upload_date': '20230408',
            'release_timestamp': 1680962400,
            'release_date': '20230408',
            'was_live': True,
        },
    }, {
        # playlist, airs every weekday so it should _hopefully_ be okay forever
        'url': 'https://www.nhk.or.jp/radio/ondemand/detail.html?p=0458_01',
        'info_dict': {
            'id': '0458_01',
            'title': 'ベストオブクラシック',
            'description': '世界中の上質な演奏会をじっくり堪能する本格派クラシック番組。',
            'channel': 'NHK-FM',
            'thumbnail': 'https://www.nhk.or.jp/prog/img/458/g458.jpg',
        },
        'playlist_mincount': 3,
    }, {
        # one with letters in the id
        'url': 'https://www.nhk.or.jp/radio/player/ondemand.html?p=F300_06_3738470',
        'note': 'Expires on 2024-03-31',
        'info_dict': {
            'id': 'F300_06_3738470',
            'ext': 'm4a',
            'title': '有島武郎「一房のぶどう」',
            'description': '朗読:川野一宇(ラジオ深夜便アンカー)\r\n\r\n2016年12月8日放送「ラジオ深夜便『アンカー朗読シリーズ』」より',
            'channel': 'NHKラジオ第1、NHK-FM',
            'timestamp': 1635757200,
            'thumbnail': 'https://www.nhk.or.jp/radioondemand/json/F300/img/corner/box_109_thumbnail.jpg',
            'release_date': '20161207',
            'series': 'らじる文庫 by ラジオ深夜便 ',
            'release_timestamp': 1481126700,
            'upload_date': '20211101',
        }
    }, {
        # news
        'url': 'https://www.nhk.or.jp/radio/player/ondemand.html?p=F261_01_3855109',
        'skip': 'Expires on 2023-04-17',
        'info_dict': {
            'id': 'F261_01_3855109',
            'ext': 'm4a',
            'channel': 'NHKラジオ第1',
            'timestamp': 1681635900,
            'release_date': '20230416',
            'series': 'NHKラジオニュース',
            'title': '午後時のNHKニュース',
            'thumbnail': 'https://www.nhk.or.jp/radioondemand/json/F261/img/RADIONEWS_640.jpg',
            'upload_date': '20230416',
            'release_timestamp': 1681635600,
        },
    }]

    def _extract_episode_info(self, headline, programme_id, series_meta):
        """Build the info dict for one episode.

        `headline` is one element of the programme's 'detail_list',
        `programme_id` is '<site>_<corner>', and `series_meta` holds the
        series-level fields that every episode inherits.
        """
        episode_id = f'{programme_id}_{headline["headline_id"]}'
        # 'file_list' may be missing or empty; use an empty mapping so the
        # lookups below yield nothing instead of failing on None.
        episode = traverse_obj(headline, ('file_list', 0, {dict})) or {}
        playlist_url = episode.get('file_name')

        return {
            **series_meta,
            'id': episode_id,
            # Without a playlist URL there is nothing to request
            'formats': self._extract_m3u8_formats(
                playlist_url, episode_id, fatal=False) if playlist_url else [],
            'container': 'm4a_dash',  # force fixup, AAC-only HLS
            'was_live': True,
            'series': series_meta.get('title'),
            'thumbnail': url_or_none(headline.get('headline_image')) or series_meta.get('thumbnail'),
            **traverse_obj(episode, {
                'title': 'file_title',
                'description': 'file_title_sub',
                'timestamp': ('open_time', {unified_timestamp}),
                'release_timestamp': ('aa_vinfo4', {lambda x: x.split('_')[0]}, {unified_timestamp}),
            }),
        }

    def _real_extract(self, url):
        """Return one episode when the URL names a headline, else a playlist of all episodes."""
        site_id, corner_id, headline_id = self._match_valid_url(url).group('site', 'corner', 'headline')
        programme_id = f'{site_id}_{corner_id}'

        # Site F261 reads its listing from a different endpoint than the others
        if site_id == 'F261':
            json_url = 'https://www.nhk.or.jp/s-media/news/news-site/list/v1/all.json'
        else:
            json_url = f'https://www.nhk.or.jp/radioondemand/json/{site_id}/bangumi_{programme_id}.json'

        meta = self._download_json(json_url, programme_id)['main']

        series_meta = traverse_obj(meta, {
            'title': 'program_name',
            'channel': 'media_name',
            'thumbnail': (('thumbnail_c', 'thumbnail_p'), {url_or_none}),
        }, get_all=False)

        if headline_id:
            headline = traverse_obj(meta, (
                'detail_list', lambda _, v: v['headline_id'] == headline_id), get_all=False)
            if not headline:
                # No entry in the listing matches the requested headline.
                # Fail with a clear message instead of a TypeError further down.
                from ..utils import ExtractorError
                raise ExtractorError(
                    f'Episode {headline_id} was not found in the programme listing', expected=True)
            return self._extract_episode_info(headline, programme_id, series_meta)

        def entries():
            for headline in traverse_obj(meta, ('detail_list', ..., {dict})):
                yield self._extract_episode_info(headline, programme_id, series_meta)

        return self.playlist_result(
            entries(), programme_id, playlist_description=meta.get('site_detail'), **series_meta)
class NhkRadioNewsPageIE(InfoExtractor):
    # The radio news landing page; extraction is delegated to NhkRadiruIE.
    _VALID_URL = r'https?://www\.nhk\.or\.jp/radionews/?(?:$|[?#])'
    _TESTS = [{
        # airs daily, on-the-hour most hours
        'url': 'https://www.nhk.or.jp/radionews/',
        'playlist_mincount': 5,
        'info_dict': {
            'id': 'F261_01',
            'thumbnail': 'https://www.nhk.or.jp/radioondemand/json/F261/img/RADIONEWS_640.jpg',
            'description': 'md5:bf2c5b397e44bc7eb26de98d8f15d79d',
            'channel': 'NHKラジオ第1',
            'title': 'NHKラジオニュース',
        }
    }]

    def _real_extract(self, url):
        """Redirect to the on-demand listing for programme F261_01."""
        ondemand_url = 'https://www.nhk.or.jp/radio/ondemand/detail.html?p=F261_01'
        return self.url_result(ondemand_url, NhkRadiruIE)

View file

@ -477,23 +477,32 @@ def _get_subtitles(self, video_id, api_data, session_api_data):
user_id_str = session_api_data.get('serviceUserId')
thread_ids = traverse_obj(api_data, ('comment', 'threads', lambda _, v: v['isActive']))
raw_danmaku = self._extract_all_comments(video_id, thread_ids, user_id_str, comment_user_key)
if not raw_danmaku:
legacy_danmaku = self._extract_legacy_comments(video_id, thread_ids, user_id_str, comment_user_key) or []
new_comments = traverse_obj(api_data, ('comment', 'nvComment'))
new_danmaku = self._extract_new_comments(
new_comments.get('server'), video_id,
new_comments.get('params'), new_comments.get('threadKey'))
if not legacy_danmaku and not new_danmaku:
self.report_warning(f'Failed to get comments. {bug_reports_message()}')
return
return {
'comments': [{
'ext': 'json',
'data': json.dumps(raw_danmaku),
'data': json.dumps(legacy_danmaku + new_danmaku),
}],
}
def _extract_all_comments(self, video_id, threads, user_id, user_key):
def _extract_legacy_comments(self, video_id, threads, user_id, user_key):
auth_data = {
'user_id': user_id,
'userkey': user_key,
} if user_id and user_key else {'user_id': ''}
api_url = traverse_obj(threads, (..., 'server'), get_all=False)
# Request Start
post_data = [{'ping': {'content': 'rs:0'}}]
for i, thread in enumerate(threads):
@ -532,17 +541,32 @@ def _extract_all_comments(self, video_id, threads, user_id, user_key):
# Request Final
post_data.append({'ping': {'content': 'rf:0'}})
for api_url in self._COMMENT_API_ENDPOINTS:
comments = self._download_json(
api_url, video_id, data=json.dumps(post_data).encode(), fatal=False,
return self._download_json(
f'{api_url}/api.json', video_id, data=json.dumps(post_data).encode(), fatal=False,
headers={
'Referer': 'https://www.nicovideo.jp/watch/%s' % video_id,
'Referer': f'https://www.nicovideo.jp/watch/{video_id}',
'Origin': 'https://www.nicovideo.jp',
'Content-Type': 'text/plain;charset=UTF-8',
},
note='Downloading comments', errnote=f'Failed to access endpoint {api_url}')
if comments:
return comments
def _extract_new_comments(self, endpoint, video_id, params, thread_key):
    """Request comments from `{endpoint}/v1/threads` and flatten them into one list.

    The request is non-fatal; the return value is whatever `traverse_obj`
    collects from data -> threads -> * -> comments -> * of the response.
    """
    request_body = json.dumps({
        'additionals': {},
        'params': params,
        'threadKey': thread_key,
    }).encode()
    request_headers = {
        'Referer': 'https://www.nicovideo.jp/',
        'Origin': 'https://www.nicovideo.jp',
        'Content-Type': 'text/plain;charset=UTF-8',
        'x-client-os-type': 'others',
        'x-frontend-id': '6',
        'x-frontend-version': '0',
    }
    response = self._download_json(
        f'{endpoint}/v1/threads', video_id, data=request_body, fatal=False,
        headers=request_headers,
        note='Downloading comments (new)', errnote='Failed to download comments (new)')
    return traverse_obj(response, ('data', 'threads', ..., 'comments', ...))
class NiconicoPlaylistBaseIE(InfoExtractor):
@ -636,10 +660,10 @@ def _real_extract(self, url):
class NiconicoSeriesIE(InfoExtractor):
IE_NAME = 'niconico:series'
_VALID_URL = r'https?://(?:(?:www\.|sp\.)?nicovideo\.jp|nico\.ms)/series/(?P<id>\d+)'
_VALID_URL = r'https?://(?:(?:www\.|sp\.)?nicovideo\.jp(?:/user/\d+)?|nico\.ms)/series/(?P<id>\d+)'
_TESTS = [{
'url': 'https://www.nicovideo.jp/series/110226',
'url': 'https://www.nicovideo.jp/user/44113208/series/110226',
'info_dict': {
'id': '110226',
'title': 'ご立派ァ!のシリーズ',
@ -659,7 +683,7 @@ class NiconicoSeriesIE(InfoExtractor):
def _real_extract(self, url):
list_id = self._match_id(url)
webpage = self._download_webpage(f'https://www.nicovideo.jp/series/{list_id}', list_id)
webpage = self._download_webpage(url, list_id)
title = self._search_regex(
(r'<title>「(.+)(全',
@ -667,10 +691,9 @@ def _real_extract(self, url):
webpage, 'title', fatal=False)
if title:
title = unescapeHTML(title)
playlist = [
self.url_result(f'https://www.nicovideo.jp/watch/{v_id}', video_id=v_id)
for v_id in re.findall(r'data-href=[\'"](?:https://www\.nicovideo\.jp)?/watch/([a-z0-9]+)', webpage)]
return self.playlist_result(playlist, list_id, title)
json_data = next(self._yield_json_ld(webpage, None, fatal=False))
return self.playlist_from_matches(
traverse_obj(json_data, ('itemListElement', ..., 'url')), list_id, title, ie=NiconicoIE)
class NiconicoHistoryIE(NiconicoPlaylistBaseIE):

View file

@ -1,13 +1,14 @@
import functools
from .common import InfoExtractor
from .youtube import YoutubeIE
from ..utils import (
clean_html,
format_field,
int_or_none,
strip_or_none,
traverse_obj,
unified_timestamp,
urlencode_postdata,
urljoin,
)
@ -24,7 +25,7 @@ class ParlerIE(InfoExtractor):
'thumbnail': 'https://bl-images.parler.com/videos/6ce7cdf3-a27a-4d72-bf9c-d3e17ce39a66/thumbnail.jpeg',
'title': 'Parler video #df79fdba-07cc-48fe-b085-3293897520d7',
'description': 'md5:6f220bde2df4a97cbb89ac11f1fd8197',
'timestamp': 1659744000,
'timestamp': 1659785481,
'upload_date': '20220806',
'uploader': 'Tulsi Gabbard',
'uploader_id': 'TulsiGabbard',
@ -34,78 +35,57 @@ class ParlerIE(InfoExtractor):
'repost_count': int,
},
},
{
'url': 'https://parler.com/feed/a7406eb4-91e5-4793-b5e3-ade57a24e287',
'md5': '11687e2f5bb353682cee338d181422ed',
'info_dict': {
'id': 'a7406eb4-91e5-4793-b5e3-ade57a24e287',
'ext': 'mp4',
'thumbnail': 'https://bl-images.parler.com/videos/317827a8-1e48-4cbc-981f-7dd17d4c1183/thumbnail.jpeg',
'title': 'Parler video #a7406eb4-91e5-4793-b5e3-ade57a24e287',
'description': 'This man should run for office',
'timestamp': 1659657600,
'upload_date': '20220805',
'uploader': 'Benny Johnson',
'uploader_id': 'BennyJohnson',
'uploader_url': 'https://parler.com/BennyJohnson',
'view_count': int,
'comment_count': int,
'repost_count': int,
},
},
{
'url': 'https://parler.com/feed/f23b85c1-6558-470f-b9ff-02c145f28da5',
'md5': 'eaba1ff4a10fe281f5ce74e930ab2cb4',
'info_dict': {
'id': 'r5vkSaz8PxQ',
'ext': 'mp4',
'thumbnail': 'https://i.ytimg.com/vi_webp/r5vkSaz8PxQ/maxresdefault.webp',
'title': 'Tom MacDonald Names Reaction',
'description': 'md5:33c21f0d35ae6dc2edf3007d6696baea',
'upload_date': '20220716',
'duration': 1267,
'uploader': 'Mahesh Chookolingo',
'uploader_id': 'maheshchookolingo',
'uploader_url': 'http://www.youtube.com/user/maheshchookolingo',
'channel': 'Mahesh Chookolingo',
'channel_id': 'UCox6YeMSY1PQInbCtTaZj_w',
'channel_url': 'https://www.youtube.com/channel/UCox6YeMSY1PQInbCtTaZj_w',
'categories': ['Entertainment'],
'tags': list,
'availability': 'public',
'live_status': 'not_live',
'view_count': int,
'comment_count': int,
'duration': 1267,
'like_count': int,
'channel_follower_count': int,
'age_limit': 0,
'channel_id': 'UCox6YeMSY1PQInbCtTaZj_w',
'upload_date': '20220716',
'thumbnail': 'https://i.ytimg.com/vi/r5vkSaz8PxQ/maxresdefault.jpg',
'tags': 'count:17',
'availability': 'public',
'categories': ['Entertainment'],
'playable_in_embed': True,
'channel': 'Who Knows What! With Mahesh & Friends',
'title': 'Tom MacDonald Names Reaction',
'uploader': 'Who Knows What! With Mahesh & Friends',
'uploader_id': '@maheshchookolingo',
'age_limit': 0,
'description': 'md5:33c21f0d35ae6dc2edf3007d6696baea',
'channel_url': 'https://www.youtube.com/channel/UCox6YeMSY1PQInbCtTaZj_w',
'view_count': int,
'uploader_url': 'http://www.youtube.com/@maheshchookolingo',
},
},
]
def _real_extract(self, url):
video_id = self._match_id(url)
data = self._download_json(
'https://parler.com/open-api/ParleyDetailEndpoint.php', video_id,
data=urlencode_postdata({'uuid': video_id}))['data'][0]
primary = data['primary']
embed = self._parse_json(primary.get('V2LINKLONG') or '', video_id, fatal=False)
if embed:
return self.url_result(embed[0], YoutubeIE)
data = self._download_json(f'https://api.parler.com/v0/public/parleys/{video_id}',
video_id)['data']
if data.get('link'):
return self.url_result(data['link'], YoutubeIE)
return {
'id': video_id,
'url': traverse_obj(primary, ('video_data', 'videoSrc')),
'thumbnail': traverse_obj(primary, ('video_data', 'thumbnailUrl')),
'title': '',
'description': strip_or_none(clean_html(primary.get('full_body'))) or None,
'timestamp': unified_timestamp(primary.get('date_created')),
'uploader': strip_or_none(primary.get('name')),
'uploader_id': strip_or_none(primary.get('username')),
'uploader_url': format_field(strip_or_none(primary.get('username')), None, 'https://parler.com/%s'),
'view_count': int_or_none(primary.get('view_count')),
'comment_count': int_or_none(traverse_obj(data, ('engagement', 'commentCount'))),
'repost_count': int_or_none(traverse_obj(data, ('engagement', 'echoCount'))),
'title': strip_or_none(data.get('title')) or '',
**traverse_obj(data, {
'url': ('video', 'videoSrc'),
'thumbnail': ('video', 'thumbnailUrl'),
'description': ('body', {clean_html}),
'timestamp': ('date_created', {unified_timestamp}),
'uploader': ('user', 'name', {strip_or_none}),
'uploader_id': ('user', 'username', {str}),
'uploader_url': ('user', 'username', {functools.partial(urljoin, 'https://parler.com/')}),
'view_count': ('views', {int_or_none}),
'comment_count': ('total_comments', {int_or_none}),
'repost_count': ('echos', {int_or_none}),
})
}

View file

@ -0,0 +1,47 @@
from .brightcove import BrightcoveNewIE
from .common import InfoExtractor
class PGATourIE(InfoExtractor):
    _VALID_URL = r'https?://(?:www\.)?pgatour\.com/video/[\w-]+/(?P<tc>T)?(?P<id>\d+)'
    _TESTS = [{
        'url': 'https://www.pgatour.com/video/competition/T6322447785112/adam-hadwin-2023-the-players-round-4-18th-hole-shot-1',
        'info_dict': {
            'id': '6322447785112',
            'ext': 'mp4',
            'title': 'Adam Hadwin | 2023 THE PLAYERS | Round 4 | 18th hole | Shot 1',
            'uploader_id': '6116716431001',
            'upload_date': '20230312',
            'timestamp': 1678653136,
            'duration': 20.011,
            'thumbnail': r're:^https://.+\.jpg',
            'tags': 'count:7',
        },
        'params': {'skip_download': 'm3u8'},
    }, {
        'url': 'https://www.pgatour.com/video/features/6322506425112/follow-the-players-trophy-on-championship-sunday',
        'info_dict': {
            'id': '6322506425112',
            'ext': 'mp4',
            'title': 'Follow THE PLAYERS trophy on Championship Sunday',
            'description': 'md5:4d29e4bdfa03694a0ebfd08950398568',
            'uploader_id': '6082840763001',
            'upload_date': '20230313',
            'timestamp': 1678739835,
            'duration': 123.435,
            'thumbnail': r're:^https://.+\.jpg',
            'tags': 'count:8',
        },
        'params': {'skip_download': 'm3u8'},
    }]

    def _real_extract(self, url):
        """Build the Brightcove player URL for the matched video and hand it to BrightcoveNewIE."""
        match = self._match_valid_url(url)
        video_id = match.group('id')

        # The optional "T" prefix captured as `tc` selects which account/player pair is used.
        # From https://www.pgatour.com/_next/static/chunks/pages/_app-8bcf849560daf38d.js
        if match.group('tc'):
            account_id, player_id = '6116716431001', 'Vsd5Umu8r'
        else:
            account_id, player_id = '6082840763001', 'FWIBYMBPj'

        player_url = (
            f'https://players.brightcove.net/{account_id}/{player_id}_default/index.html?videoId={video_id}')
        return self.url_result(player_url, BrightcoveNewIE)

View file

@ -1,42 +1,60 @@
from .common import InfoExtractor
from ..utils import int_or_none, urljoin
from ..utils import (
clean_html,
int_or_none,
get_element_by_class,
urljoin,
)
class PornezIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?pornez\.net/video(?P<id>[0-9]+)/'
_TEST = {
_VALID_URL = r'https?://(?:www\.)?pornez\.net/(?:video(?P<id>\w+)|watch)/'
_TESTS = [{
'url': 'https://pornez.net/video344819/mistresst-funny_penis_names-wmv/',
'md5': '2e19a0a1cff3a5dbea0ef1b9e80bcbbc',
'info_dict': {
'id': '344819',
'ext': 'mp4',
'title': r'mistresst funny_penis_names wmv',
'title': 'mistresst funny_penis_names wmv',
'thumbnail': r're:^https?://.*\.jpg$',
'age_limit': 18,
}
}
},
'params': {'skip_download': 'm3u8'},
}, {
'url': 'https://pornez.net/watch/leana+lovings+stiff+for+stepdaughter/',
'info_dict': {
'id': '156161',
'ext': 'mp4',
'title': 'Watch leana lovings stiff for stepdaughter porn video.',
'age_limit': 18,
},
'params': {'skip_download': 'm3u8'},
}, {
'url': 'https://pornez.net/videovzs27fj/tutor4k-e14-blue-wave-1080p-nbq-tutor4k-e14-blue-wave/',
'only_matching': True,
}]
def _real_extract(self, url):
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
iframe_src = self._html_search_regex(
r'<iframe[^>]+src="([^"]+)"', webpage, 'iframe', fatal=True)
iframe_src = urljoin('https://pornez.net', iframe_src)
title = self._html_search_meta(['name', 'twitter:title', 'og:title'], webpage, 'title', default=None)
if title is None:
title = self._search_regex(r'<h1>(.*?)</h1>', webpage, 'title', fatal=True)
thumbnail = self._html_search_meta(['thumbnailUrl'], webpage, 'title', default=None)
webpage = self._download_webpage(iframe_src, video_id)
entries = self._parse_html5_media_entries(iframe_src, webpage, video_id)[0]
for format in entries['formats']:
height = self._search_regex(r'_(\d+)\.m3u8', format['url'], 'height')
format['format_id'] = '%sp' % height
format['height'] = int_or_none(height)
if not video_id:
video_id = self._search_regex(
r'<link[^>]+\bhref=["\']https?://pornez.net/\?p=(\w+)["\']', webpage, 'id')
iframe_src = self._html_search_regex(r'<iframe[^>]+src="([^"]+)"', webpage, 'iframe')
iframe = self._download_webpage(urljoin('https://pornez.net', iframe_src), video_id)
entries = self._parse_html5_media_entries(iframe_src, iframe, video_id)[0]
for fmt in entries['formats']:
height = self._search_regex(r'_(\d+)\.m3u8', fmt['url'], 'height')
fmt['format_id'] = '%sp' % height
fmt['height'] = int_or_none(height)
entries.update({
'id': video_id,
'title': title,
'thumbnail': thumbnail,
'age_limit': 18
'title': (clean_html(get_element_by_class('video-title', webpage))
or self._html_search_meta(
['twitter:title', 'og:title', 'description'], webpage, 'title', default=None)),
'thumbnail': self._html_search_meta(['thumbnailUrl'], webpage, 'thumb', default=None),
'age_limit': 18,
})
return entries

View file

@ -58,6 +58,11 @@ def dl(*args, **kwargs):
def _real_initialize(self):
self._logged_in = False
def _set_age_cookies(self, host):
self._set_cookie(host, 'age_verified', '1')
self._set_cookie(host, 'accessAgeDisclaimerPH', '1')
self._set_cookie(host, 'accessPH', '1')
def _login(self, host):
if self._logged_in:
return
@ -267,8 +272,7 @@ def _real_extract(self, url):
video_id = mobj.group('id')
self._login(host)
self._set_cookie(host, 'age_verified', '1')
self._set_age_cookies(host)
def dl_webpage(platform):
self._set_cookie(host, 'platform', platform)
@ -569,6 +573,7 @@ def _real_extract(self, url):
mobj = self._match_valid_url(url)
user_id = mobj.group('id')
videos_url = '%s/videos' % mobj.group('url')
self._set_age_cookies(mobj.group('host'))
page = self._extract_page(url)
if page:
videos_url = update_url_query(videos_url, {'page': page})
@ -633,6 +638,7 @@ def _real_extract(self, url):
item_id = mobj.group('id')
self._login(host)
self._set_age_cookies(host)
return self.playlist_result(self._entries(url, host, item_id), item_id)
@ -812,5 +818,6 @@ def _real_extract(self, url):
item_id = mobj.group('id')
self._login(host)
self._set_age_cookies(host)
return self.playlist_result(self._entries(mobj.group('url'), host, item_id), item_id)

View file

@ -1,4 +1,3 @@
import random
import urllib.parse
from .common import InfoExtractor
@ -9,12 +8,14 @@
traverse_obj,
try_get,
unescapeHTML,
urlencode_postdata,
url_or_none,
)
class RedditIE(InfoExtractor):
_VALID_URL = r'https?://(?P<subdomain>[^/]+\.)?reddit(?:media)?\.com/(?P<slug>(?:r|user)/[^/]+/comments/(?P<id>[^/?#&]+))'
_NETRC_MACHINE = 'reddit'
_VALID_URL = r'https?://(?P<host>(?:\w+\.)?reddit(?:media)?\.com)/(?P<slug>(?:(?:r|user)/[^/]+/)?comments/(?P<id>[^/?#&]+))'
_TESTS = [{
'url': 'https://www.reddit.com/r/videos/comments/6rrwyj/that_small_heart_attack/',
'info_dict': {
@ -109,6 +110,46 @@ class RedditIE(InfoExtractor):
'age_limit': 0,
'channel_id': 'dumbfuckers_club',
},
}, {
# post link without subreddit
'url': 'https://www.reddit.com/comments/124pp33',
'md5': '15eec9d828adcef4468b741a7e45a395',
'info_dict': {
'id': 'antsenjc2jqa1',
'ext': 'mp4',
'display_id': '124pp33',
'title': 'Harmless prank of some old friends',
'uploader': 'Dudezila',
'channel_id': 'ContagiousLaughter',
'duration': 17,
'upload_date': '20230328',
'timestamp': 1680012043,
'thumbnail': r're:^https?://.*\.(?:jpg|png)',
'age_limit': 0,
'comment_count': int,
'dislike_count': int,
'like_count': int,
},
}, {
# quarantined subreddit post
'url': 'https://old.reddit.com/r/GenZedong/comments/12fujy3/based_hasan/',
'md5': '3156ea69e3c1f1b6259683c5abd36e71',
'info_dict': {
'id': '8bwtclfggpsa1',
'ext': 'mp4',
'display_id': '12fujy3',
'title': 'Based Hasan?',
'uploader': 'KingNigelXLII',
'channel_id': 'GenZedong',
'duration': 16,
'upload_date': '20230408',
'timestamp': 1680979138,
'age_limit': 0,
'comment_count': int,
'dislike_count': int,
'like_count': int,
},
'skip': 'Requires account that has opted-in to the GenZedong subreddit',
}, {
'url': 'https://www.reddit.com/r/videos/comments/6rrwyj',
'only_matching': True,
@ -137,21 +178,45 @@ class RedditIE(InfoExtractor):
'only_matching': True,
}]
@staticmethod
def _gen_session_id():
id_length = 16
rand_max = 1 << (id_length * 4)
return '%0.*x' % (id_length, random.randrange(rand_max))
def _perform_login(self, username, password):
    """Log in through the reddit.com JSON login endpoint.

    Raises ExtractorError when the captcha check reports a captcha is
    required, when the login response lists errors, or when the response
    carries no string cookie under json -> data -> cookie.
    """
    captcha_required = self._download_json(
        'https://www.reddit.com/api/requires_captcha/login.json', None,
        'Checking login requirement')['required']
    if captcha_required:
        raise ExtractorError('Reddit is requiring captcha before login', expected=True)

    login_form = urlencode_postdata({
        'op': 'login-main',
        'user': username,
        'passwd': password,
        'api_type': 'json',
    })
    login = self._download_json(
        f'https://www.reddit.com/api/login/{username}', None, data=login_form,
        note='Logging in', errnote='Login request failed')

    # Index 1 of every entry under json -> errors is joined into the message
    error_messages = traverse_obj(login, ('json', 'errors', ..., 1))
    errors = '; '.join(error_messages)
    if errors:
        raise ExtractorError(f'Unable to login, Reddit API says {errors}', expected=True)
    if not traverse_obj(login, ('json', 'data', 'cookie', {str})):
        raise ExtractorError('Unable to login, no cookie was returned')
def _real_extract(self, url):
subdomain, slug, video_id = self._match_valid_url(url).group('subdomain', 'slug', 'id')
host, slug, video_id = self._match_valid_url(url).group('host', 'slug', 'id')
self._set_cookie('.reddit.com', 'reddit_session', self._gen_session_id())
self._set_cookie('.reddit.com', '_options', '%7B%22pref_quarantine_optin%22%3A%20true%7D')
data = self._download_json(f'https://{subdomain}reddit.com/{slug}/.json', video_id, fatal=False)
data = self._download_json(
f'https://{host}/{slug}/.json', video_id, fatal=False, expected_status=403)
if not data:
# Fall back to old.reddit.com in case the requested subdomain fails
data = self._download_json(f'https://old.reddit.com/{slug}/.json', video_id)
fallback_host = 'old.reddit.com' if host != 'old.reddit.com' else 'www.reddit.com'
self.to_screen(f'{host} request failed, retrying with {fallback_host}')
data = self._download_json(
f'https://{fallback_host}/{slug}/.json', video_id, expected_status=403)
if traverse_obj(data, 'error') == 403:
reason = data.get('reason')
if reason == 'quarantined':
self.raise_login_required('Quarantined subreddit; an account that has opted in is required')
elif reason == 'private':
self.raise_login_required('Private subreddit; an account that has been approved is required')
else:
raise ExtractorError(f'HTTP Error 403 Forbidden; reason given: {reason}')
data = data[0]['data']['children'][0]['data']
video_url = data['url']

View file

@ -1,5 +1,12 @@
from .common import InfoExtractor
from ..utils import extract_attributes, int_or_none, remove_start, traverse_obj
from ..utils import (
extract_attributes,
int_or_none,
remove_start,
str_or_none,
traverse_obj,
url_or_none,
)
class RozhlasIE(InfoExtractor):
@ -50,7 +57,7 @@ class RozhlasVltavaIE(InfoExtractor):
'url': 'https://wave.rozhlas.cz/papej-masicko-porcujeme-a-bilancujeme-filmy-a-serialy-ktere-letos-zabily-8891337',
'md5': 'ba2fdbc1242fc16771c7695d271ec355',
'info_dict': {
'id': 8891337,
'id': '8891337',
'title': 'md5:21f99739d04ab49d8c189ec711eef4ec',
},
'playlist_count': 1,
@ -69,7 +76,7 @@ class RozhlasVltavaIE(InfoExtractor):
}, {
'url': 'https://wave.rozhlas.cz/poslechnete-si-neklid-podcastovy-thriller-o-vine-strachu-a-vztahu-ktery-zasel-8554744',
'info_dict': {
'id': 8554744,
'id': '8554744',
'title': 'Poslechněte si Neklid. Podcastový thriller o vině, strachu a vztahu, který zašel příliš daleko',
},
'playlist_count': 5,
@ -139,27 +146,62 @@ class RozhlasVltavaIE(InfoExtractor):
'chapter_number': 5,
},
}]
}, {
'url': 'https://dvojka.rozhlas.cz/karel-siktanc-cerny-jezdec-bily-kun-napinava-pohadka-o-tajemnem-prizraku-8946969',
'info_dict': {
'id': '8946969',
'title': 'Karel Šiktanc: Černý jezdec, bílý kůň. Napínavá pohádka o tajemném přízraku',
},
'playlist_count': 1,
'playlist': [{
'info_dict': {
'id': '10631121',
'ext': 'm4a',
'title': 'Karel Šiktanc: Černý jezdec, bílý kůň. Napínavá pohádka o tajemném přízraku',
'description': 'Karel Šiktanc: Černý jezdec, bílý kůň',
'duration': 2656,
'artist': 'Tvůrčí skupina Drama a literatura',
'channel_id': 'dvojka',
},
}],
'params': {'skip_download': 'dash'},
}]
def _extract_video(self, entry):
chapter_number = int_or_none(traverse_obj(entry, ('meta', 'ga', 'contentSerialPart')))
formats = []
audio_id = entry['meta']['ga']['contentId']
for audio in traverse_obj(entry, ('audioLinks', lambda _, v: url_or_none(v['url']))):
ext = audio.get('variant')
if ext == 'dash':
formats.extend(self._extract_mpd_formats(
audio['url'], audio_id, mpd_id=ext, fatal=False))
elif ext == 'hls':
formats.extend(self._extract_m3u8_formats(
audio['url'], audio_id, 'm4a', m3u8_id=ext, fatal=False))
else:
formats.append({
'url': audio['url'],
'ext': ext,
'format_id': ext,
'abr': int_or_none(audio.get('bitrate')),
'acodec': ext,
'vcodec': 'none',
})
chapter_number = traverse_obj(entry, ('meta', 'ga', 'contentSerialPart', {int_or_none}))
return {
'id': entry['meta']['ga']['contentId'],
'title': traverse_obj(entry, ('meta', 'ga', 'contentName')),
'description': entry.get('title'),
'duration': entry.get('duration'),
'artist': traverse_obj(entry, ('meta', 'ga', 'contentAuthor')),
'channel_id': traverse_obj(entry, ('meta', 'ga', 'contentCreator')),
'id': audio_id,
'chapter': traverse_obj(entry, ('meta', 'ga', 'contentNameShort')) if chapter_number else None,
'chapter_number': chapter_number,
'formats': [{
'url': audio_link['url'],
'ext': audio_link.get('variant'),
'format_id': audio_link.get('variant'),
'abr': audio_link.get('bitrate'),
'acodec': audio_link.get('variant'),
'vcodec': 'none',
} for audio_link in entry['audioLinks']],
'formats': formats,
**traverse_obj(entry, {
'title': ('meta', 'ga', 'contentName'),
'description': 'title',
'duration': ('duration', {int_or_none}),
'artist': ('meta', 'ga', 'contentAuthor'),
'channel_id': ('meta', 'ga', 'contentCreator'),
})
}
def _real_extract(self, url):
@ -173,7 +215,7 @@ def _real_extract(self, url):
return {
'_type': 'playlist',
'id': data.get('embedId'),
'id': str_or_none(data.get('embedId')) or video_id,
'title': traverse_obj(data, ('series', 'title')),
'entries': map(self._extract_video, data['playlist']),
}

View file

@ -0,0 +1,285 @@
import re
from .common import InfoExtractor, ExtractorError
from ..utils import (
clean_html,
determine_ext,
int_or_none,
float_or_none,
js_to_json,
mimetype2ext,
traverse_obj,
urljoin,
url_or_none,
)
class RTVCPlayBaseIE(InfoExtractor):
    _BASE_VALID_URL = r'https?://(?:www\.)?rtvcplay\.co'

    def _extract_player_config(self, webpage, video_id):
        """Find the inline `config` script variable in `webpage` and parse it as JSON."""
        # Drop every `" + "` sequence so string literals split by concatenation become one literal
        joined_webpage = re.sub(r'"\s*\+\s*"', '', webpage)
        return self._search_json(
            r'<script\b[^>]*>[^<]*(?:var|let|const)\s+config\s*=', joined_webpage,
            'player_config', video_id, transform_source=js_to_json)

    def _extract_formats_and_subtitles_player_config(self, player_config, video_id):
        """Return `(formats, subtitles)` built from the `sources` entries of `player_config`."""
        formats, subtitles = [], {}
        # Only sources whose 'url' passes url_or_none are considered
        valid_sources = traverse_obj(
            player_config, ('sources', ..., lambda _, v: url_or_none(v['url'])))

        for source in valid_sources:
            source_url = source['url']
            ext = mimetype2ext(source.get('mimetype'), default=determine_ext(source_url))
            if ext != 'm3u8':
                formats.append({
                    'url': source_url,
                    'ext': ext,
                })
                continue

            fmts, subs = self._extract_m3u8_formats_and_subtitles(
                source_url, video_id, 'mp4', fatal=False)
            formats.extend(fmts)
            self._merge_subtitles(subs, target=subtitles)

        return formats, subtitles
class RTVCPlayIE(RTVCPlayBaseIE):
    _VALID_URL = RTVCPlayBaseIE._BASE_VALID_URL + r'/(?P<category>(?!embed)[^/]+)/(?:[^?#]+/)?(?P<id>[\w-]+)'
    _TESTS = [{
        'url': 'https://www.rtvcplay.co/en-vivo/canal-institucional',
        'info_dict': {
            'id': 'canal-institucional',
            'title': r're:^Canal Institucional',
            'description': 'md5:eff9e548394175928059320c006031ea',
            'thumbnail': r're:^https?://.*\.(?:jpg|png)',
            'live_status': 'is_live',
            'ext': 'mp4',
        },
        'params': {
            'skip_download': 'Livestream',
        },
    }, {
        'url': 'https://www.rtvcplay.co/en-vivo/senal-colombia',
        'info_dict': {
            'id': 'senal-colombia',
            'title': r're:^Señal Colombia',
            'description': 'md5:799f16a401d97f40c33a2c6a3e2a507b',
            'thumbnail': r're:^https?://.*\.(?:jpg|png)',
            'live_status': 'is_live',
            'ext': 'mp4',
        },
        'params': {
            'skip_download': 'Livestream',
        },
    }, {
        'url': 'https://www.rtvcplay.co/en-vivo/radio-nacional',
        'info_dict': {
            'id': 'radio-nacional',
            'title': r're:^Radio Nacional',
            'description': 'md5:5de009bc6a9fa79d2a6cf0b73f977d53',
            'thumbnail': r're:^https?://.*\.(?:jpg|png)',
            'live_status': 'is_live',
            'ext': 'mp4',
        },
        'params': {
            'skip_download': 'Livestream',
        },
    }, {
        'url': 'https://www.rtvcplay.co/peliculas-ficcion/senoritas',
        'md5': '1288ee6f6d1330d880f98bff2ed710a3',
        'info_dict': {
            'id': 'senoritas',
            'title': 'Señoritas',
            'description': 'md5:f095a2bb52cb6cf279daf6302f86fb32',
            'thumbnail': r're:^https?://.*\.(?:jpg|png)',
            'ext': 'mp4',
        },
    }, {
        'url': 'https://www.rtvcplay.co/competencias-basicas-ciudadanas-y-socioemocionales/profe-en-tu-casa/james-regresa-clases-28022022',
        'md5': 'f040a7380a269ad633cf837384d5e9fc',
        'info_dict': {
            'id': 'james-regresa-clases-28022022',
            'title': 'James regresa a clases - 28/02/2022',
            'description': 'md5:c5dcdf757c7ab29305e8763c6007e675',
            'ext': 'mp4',
        },
    }, {
        'url': 'https://www.rtvcplay.co/peliculas-documentales/llinas-el-cerebro-y-el-universo',
        'info_dict': {
            'id': 'llinas-el-cerebro-y-el-universo',
            'title': 'Llinás, el cerebro y el universo',
            'description': 'md5:add875bf2309bb52b3e8b9b06116d9b0',
            'thumbnail': r're:^https?://.*\.(?:jpg|png)',
        },
        'playlist_mincount': 3,
    }, {
        'url': 'https://www.rtvcplay.co/competencias-basicas-ciudadanas-y-socioemocionales/profe-en-tu-casa',
        'info_dict': {
            'id': 'profe-en-tu-casa',
            'title': 'Profe en tu casa',
            'description': 'md5:47dbe20e263194413b1db2a2805a4f2e',
            'thumbnail': r're:^https?://.*\.(?:jpg|png)',
        },
        'playlist_mincount': 537,
    }, {
        'url': 'https://www.rtvcplay.co/series-al-oido/relato-de-un-naufrago-una-travesia-del-periodismo-a-la-literatura',
        'info_dict': {
            'id': 'relato-de-un-naufrago-una-travesia-del-periodismo-a-la-literatura',
            'title': 'Relato de un náufrago: una travesía del periodismo a la literatura',
            'description': 'md5:6da28fdca4a5a568ea47ef65ef775603',
            'thumbnail': r're:^https?://.*\.(?:jpg|png)',
        },
        'playlist_mincount': 5,
    }, {
        'url': 'https://www.rtvcplay.co/series-al-oido/diez-versiones',
        'info_dict': {
            'id': 'diez-versiones',
            'title': 'Diez versiones',
            'description': 'md5:997471ed971cb3fd8e41969457675306',
            'thumbnail': r're:^https?://.*\.(?:jpg|png)',
        },
        'playlist_mincount': 20,
    }]

    def _real_extract(self, url):
        """Extract a single stream, or a playlist of season episodes or podcast audios.

        The page state embedded as `window.__RTVCPLAY_STATE__` decides the result:
        an HLS URL yields one entry; otherwise a `seasonList` widget or the
        `audios` list yields a playlist. ExtractorError is raised when none is found.
        """
        video_id, category = self._match_valid_url(url).group('id', 'category')
        webpage = self._download_webpage(url, video_id)

        state = self._search_json(
            r'window\.__RTVCPLAY_STATE__\s*=', webpage, 'hydration',
            video_id, transform_source=js_to_json)
        hydration = state['content']['currentContent']

        # With an asset id the HLS URL comes from the template; otherwise use the channel's HLS URL
        asset_id = traverse_obj(hydration, ('video', 'assetid'))
        if asset_id:
            hls_url = hydration['base_url_hls'].replace('[node:field_asset_id]', asset_id)
        else:
            hls_url = traverse_obj(hydration, ('channel', 'hls'))

        metadata = traverse_obj(hydration, {
            'title': 'title',
            'description': 'description',
            'thumbnail': ((('channel', 'image', 'logo'), ('resource', 'image', 'cover_desktop')), 'path'),
        }, get_all=False)

        if hls_url:
            formats, subtitles = self._extract_m3u8_formats_and_subtitles(hls_url, video_id, 'mp4')
            return {
                'id': video_id,
                'formats': formats,
                'subtitles': subtitles,
                'is_live': category == 'en-vivo',
                **metadata,
            }

        # No stream URL: the page is presumably a programme listing rather than a single video
        seasons = traverse_obj(
            hydration, ('widgets', lambda _, y: y['type'] == 'seasonList', 'contents'),
            get_all=False)
        if seasons:
            entries = []
            for season in seasons:
                season_info = traverse_obj(season, {
                    'season': 'title',
                    'season_number': ('season', {int_or_none}),
                })
                for episode in traverse_obj(season, ('contents', ...)):
                    episode_info = traverse_obj(episode, {
                        'title': 'title',
                        'thumbnail': ('image', 'cover', 'path'),
                        'episode_number': ('chapter_number', {int_or_none}),
                    })
                    entries.append(self.url_result(
                        urljoin(url, episode['slug']), url_transparent=True,
                        **season_info, **episode_info))
            return self.playlist_result(entries, video_id, **metadata)

        podcast_episodes = hydration.get('audios')
        if not podcast_episodes:
            raise ExtractorError('Could not find asset_id nor program playlist nor podcast episodes')

        podcast_entries = []
        for episode in podcast_episodes:
            episode_info = traverse_obj(episode, {
                'title': 'title',
                'description': ('description', {clean_html}),
                'episode_number': ('chapter_number', {float_or_none}, {int_or_none}),
                'season_number': ('season', {int_or_none}),
            })
            podcast_entries.append(
                self.url_result(episode['file'], url_transparent=True, **episode_info))
        return self.playlist_result(podcast_entries, video_id, **metadata)
class RTVCPlayEmbedIE(RTVCPlayBaseIE):
    _VALID_URL = RTVCPlayBaseIE._BASE_VALID_URL + r'/embed/(?P<id>[\w-]+)'
    _TESTS = [{
        'url': 'https://www.rtvcplay.co/embed/72b0e699-248b-4929-a4a8-3782702fa7f9',
        'md5': 'ed529aeaee7aa2a72afe91ac7d1177a8',
        'info_dict': {
            'id': '72b0e699-248b-4929-a4a8-3782702fa7f9',
            'title': 'Tráiler: Señoritas',
            'thumbnail': r're:^https?://.*\.(?:jpg|png)',
            'ext': 'mp4',
        }
    }]

    def _real_extract(self, url):
        """Extract formats from the embed page's player config, plus CMS metadata when an asset id exists."""
        video_id = self._match_id(url)
        webpage = self._download_webpage(url, video_id)
        player_config = self._extract_player_config(webpage, video_id)
        formats, subtitles = self._extract_formats_and_subtitles_player_config(player_config, video_id)

        # The metadata request is non-fatal and is skipped entirely without an asset id
        asset_id = traverse_obj(player_config, ('rtvcplay', 'assetid'))
        if asset_id:
            metadata = self._download_json(
                f'https://cms.rtvcplay.co/api/v1/video/asset-id/{asset_id}', video_id, fatal=False)
        else:
            metadata = {}

        info = {
            'id': video_id,
            'formats': formats,
            'subtitles': subtitles,
        }
        info.update(traverse_obj(metadata, {
            'title': 'title',
            'description': 'description',
            'thumbnail': ('image', ..., 'thumbnail', 'path'),
        }, get_all=False))
        return info
class RTVCKalturaIE(RTVCPlayBaseIE):
    _VALID_URL = r'https?://media\.rtvc\.gov\.co/kalturartvc/(?P<id>[\w-]+)'
    _TESTS = [{
        'url': 'https://media.rtvc.gov.co/kalturartvc/indexSC.html',
        'info_dict': {
            'id': 'indexSC',
            'title': r're:^Señal Colombia',
            'description': 'md5:799f16a401d97f40c33a2c6a3e2a507b',
            'thumbnail': r're:^https?://.*\.(?:jpg|png)',
            'live_status': 'is_live',
            'ext': 'mp4',
        },
        'params': {
            'skip_download': 'Livestream',
        },
    }]

    def _real_extract(self, url):
        """Extract a live entry from the page's player config and the channel's CMS record."""
        video_id = self._match_id(url)
        webpage = self._download_webpage(url, video_id)
        player_config = self._extract_player_config(webpage, video_id)
        formats, subtitles = self._extract_formats_and_subtitles_player_config(player_config, video_id)

        # The channel lookup is non-fatal and is skipped entirely without a channel id
        channel_id = traverse_obj(player_config, ('rtvcplay', 'channelId'))
        if channel_id:
            metadata = self._download_json(
                f'https://cms.rtvcplay.co/api/v1/taxonomy_term/streaming/{channel_id}',
                video_id, fatal=False)
        else:
            metadata = {}

        # Add the HLS formats referenced by the channel record to those from the player config
        channel_hls_url = traverse_obj(metadata, ('channel', 'hls'))
        fmts, subs = self._extract_m3u8_formats_and_subtitles(
            channel_hls_url, video_id, 'mp4', fatal=False)
        formats.extend(fmts)
        self._merge_subtitles(subs, target=subtitles)

        info = {
            'id': video_id,
            'formats': formats,
            'subtitles': subtitles,
            'is_live': True,
        }
        info.update(traverse_obj(metadata, {
            'title': 'title',
            'description': 'description',
            'thumbnail': ('channel', 'image', 'logo', 'path'),
        }))
        return info

View file

@ -7,8 +7,11 @@
ExtractorError,
UnsupportedError,
clean_html,
determine_ext,
format_field,
get_element_by_class,
int_or_none,
join_nonempty,
parse_count,
parse_iso8601,
traverse_obj,
@ -164,7 +167,13 @@ def _real_extract(self, url):
formats = []
for ext, ext_info in (video.get('ua') or {}).items():
for height, video_info in (ext_info or {}).items():
if isinstance(ext_info, dict):
for height, video_info in ext_info.items():
if not traverse_obj(video_info, ('meta', 'h', {int_or_none})):
video_info.setdefault('meta', {})['h'] = height
ext_info = ext_info.values()
for video_info in ext_info:
meta = video_info.get('meta') or {}
if not video_info.get('url'):
continue
@ -175,12 +184,16 @@ def _real_extract(self, url):
video_info['url'], video_id,
ext='mp4', m3u8_id='hls', fatal=False, live=live_status == 'is_live'))
continue
timeline = ext == 'timeline'
if timeline:
ext = determine_ext(video_info['url'])
formats.append({
'ext': ext,
'acodec': 'none' if timeline else None,
'url': video_info['url'],
'format_id': '%s-%sp' % (ext, height),
'height': int_or_none(height),
'fps': video.get('fps'),
'format_id': join_nonempty(ext, format_field(meta, 'h', '%sp')),
'format_note': 'Timeline' if timeline else None,
'fps': None if timeline else video.get('fps'),
**traverse_obj(meta, {
'tbr': 'bitrate',
'filesize': 'size',
@ -247,6 +260,43 @@ class RumbleIE(InfoExtractor):
}, {
'url': 'http://www.rumble.com/vDMUM1?key=value',
'only_matching': True,
}, {
'note': 'timeline format',
'url': 'https://rumble.com/v2ea9qb-the-u.s.-cannot-hide-this-in-ukraine-anymore-redacted-with-natali-and-clayt.html',
'md5': '40d61fec6c0945bca3d0e1dc1aa53d79',
'params': {'format': 'wv'},
'info_dict': {
'id': 'v2bou5f',
'ext': 'mp4',
'uploader': 'Redacted News',
'upload_date': '20230322',
'timestamp': 1679445010,
'title': 'The U.S. CANNOT hide this in Ukraine anymore | Redacted with Natali and Clayton Morris',
'duration': 892,
'channel': 'Redacted News',
'description': 'md5:aaad0c5c3426d7a361c29bdaaced7c42',
'channel_url': 'https://rumble.com/c/Redacted',
'live_status': 'not_live',
'thumbnail': 'https://sp.rmbl.ws/s8/1/d/x/2/O/dx2Oi.qR4e-small-The-U.S.-CANNOT-hide-this-i.jpg',
},
}, {
'url': 'https://rumble.com/v2e7fju-the-covid-twitter-files-drop-protecting-fauci-while-censoring-the-truth-wma.html',
'info_dict': {
'id': 'v2blzyy',
'ext': 'mp4',
'live_status': 'was_live',
'release_timestamp': 1679446804,
'description': 'md5:2ac4908ccfecfb921f8ffa4b30c1e636',
'release_date': '20230322',
'timestamp': 1679445692,
'duration': 4435,
'upload_date': '20230322',
'title': 'The Covid Twitter Files Drop: Protecting Fauci While Censoring The Truth w/Matt Taibbi',
'uploader': 'Kim Iversen',
'channel_url': 'https://rumble.com/c/KimIversen',
'channel': 'Kim Iversen',
'thumbnail': 'https://sp.rmbl.ws/s8/1/6/b/w/O/6bwOi.qR4e-small-The-Covid-Twitter-Files-Dro.jpg',
},
}]
_WEBPAGE_TESTS = [{

View file

@ -1,7 +1,13 @@
from .common import InfoExtractor
from ..utils import (
smuggle_url,
ExtractorError,
HEADRequest,
float_or_none,
int_or_none,
parse_duration,
parse_iso8601,
traverse_obj,
update_url_query,
url_or_none,
)
@ -11,7 +17,7 @@ class SBSIE(InfoExtractor):
https?://(?:www\.)?sbs\.com\.au/(?:
ondemand(?:
/video/(?:single/)?|
/movie/[^/]+/|
/(?:movie|tv-program)/[^/]+/|
/(?:tv|news)-series/(?:[^/]+/){3}|
.*?\bplay=|/watch/
)|news/(?:embeds/)?video/
@ -27,18 +33,21 @@ class SBSIE(InfoExtractor):
# Original URL is handled by the generic IE which finds the iframe:
# http://www.sbs.com.au/thefeed/blog/2014/08/21/dingo-conservation
'url': 'http://www.sbs.com.au/ondemand/video/single/320403011771/?source=drupal&vertical=thefeed',
'md5': '3150cf278965eeabb5b4cea1c963fe0a',
'md5': '31f84a7a19b53635db63c73f8ab0c4a7',
'info_dict': {
'id': '_rFBPRPO4pMR',
'id': '320403011771', # '_rFBPRPO4pMR',
'ext': 'mp4',
'title': 'Dingo Conservation (The Feed)',
'description': 'md5:f250a9856fca50d22dec0b5b8015f8a5',
'thumbnail': r're:http://.*\.jpg',
'thumbnail': r're:https?://.*\.jpg',
'duration': 308,
'timestamp': 1408613220,
'upload_date': '20140821',
'uploader': 'SBSC',
'tags': None,
'categories': None,
},
'expected_warnings': ['Unable to download JSON metadata'],
}, {
'url': 'http://www.sbs.com.au/ondemand/video/320403011771/Dingo-Conservation-The-Feed',
'only_matching': True,
@ -70,34 +79,80 @@ class SBSIE(InfoExtractor):
}, {
'url': 'https://www.sbs.com.au/ondemand/tv-series/the-handmaids-tale/season-5/the-handmaids-tale-s5-ep1/2065631811776',
'only_matching': True,
}, {
'url': 'https://www.sbs.com.au/ondemand/tv-program/autun-romes-forgotten-sister/2116212803602',
'only_matching': True,
}]
_GEO_COUNTRIES = ['AU']
_AUS_TV_PARENTAL_GUIDELINES = {
'P': 0,
'C': 7,
'G': 0,
'PG': 0,
'M': 14,
'MA15+': 15,
'MAV15+': 15,
'R18+': 18,
}
_PLAYER_API = 'https://www.sbs.com.au/api/v3'
def _real_extract(self, url):
video_id = self._match_id(url)
player_params = self._download_json(
'http://www.sbs.com.au/api/video_pdkvars/id/%s?form=json' % video_id, video_id)
formats, subtitles = self._extract_smil_formats_and_subtitles(
update_url_query(f'{self._PLAYER_API}/video_smil', {'id': video_id}), video_id)
error = player_params.get('error')
if error:
error_message = 'Sorry, The video you are looking for does not exist.'
video_data = error.get('results') or {}
error_code = error.get('errorCode')
if error_code == 'ComingSoon':
error_message = '%s is not yet available.' % video_data.get('title', '')
elif error_code in ('Forbidden', 'intranetAccessOnly'):
error_message = 'Sorry, This video cannot be accessed via this website'
elif error_code == 'Expired':
error_message = 'Sorry, %s is no longer available.' % video_data.get('title', '')
raise ExtractorError('%s said: %s' % (self.IE_NAME, error_message), expected=True)
if not formats:
urlh = self._request_webpage(
HEADRequest('https://sbs-vod-prod-01.akamaized.net/'), video_id,
note='Checking geo-restriction', fatal=False, expected_status=403)
if urlh:
error_reasons = urlh.headers.get_all('x-error-reason') or []
if 'geo-blocked' in error_reasons:
self.raise_geo_restricted(countries=['AU'])
self.raise_no_formats('No formats are available', video_id=video_id)
urls = player_params['releaseUrls']
theplatform_url = (urls.get('progressive') or urls.get('html')
or urls.get('standard') or player_params['relatedItemsURL'])
media = traverse_obj(self._download_json(
f'{self._PLAYER_API}/video_stream', video_id, fatal=False,
query={'id': video_id, 'context': 'tv'}), ('video_object', {dict})) or {}
media.update(self._download_json(
f'https://catalogue.pr.sbsod.com/mpx-media/{video_id}',
video_id, fatal=not media) or {})
# For named episodes, use the catalogue's title to set episode, rather than generic 'Episode N'.
if traverse_obj(media, ('partOfSeries', {dict})):
media['epName'] = traverse_obj(media, ('title', {str}))
return {
'_type': 'url_transparent',
'ie_key': 'ThePlatform',
'id': video_id,
'url': smuggle_url(self._proto_relative_url(theplatform_url), {'force_smil_url': True}),
'is_live': player_params.get('streamType') == 'live',
**traverse_obj(media, {
'title': ('name', {str}),
'description': ('description', {str}),
'channel': ('taxonomy', 'channel', 'name', {str}),
'series': ((('partOfSeries', 'name'), 'seriesTitle'), {str}),
'series_id': ((('partOfSeries', 'uuid'), 'seriesID'), {str}),
'season_number': ('seasonNumber', {int_or_none}),
'episode': ('epName', {str}),
'episode_number': ('episodeNumber', {int_or_none}),
'timestamp': (('datePublished', ('publication', 'startDate')), {parse_iso8601}),
'release_year': ('releaseYear', {int_or_none}),
'duration': ('duration', ({float_or_none}, {parse_duration})),
'is_live': ('liveStream', {bool}),
'age_limit': (
('classificationID', 'contentRating'), {str.upper}, {self._AUS_TV_PARENTAL_GUIDELINES.get}),
}, get_all=False),
**traverse_obj(media, {
'categories': (('genres', ...), ('taxonomy', ('genre', 'subgenre'), 'name'), {str}),
'tags': (('consumerAdviceTexts', ('sbsSubCertification', 'consumerAdvice')), ..., {str}),
'thumbnails': ('thumbnails', lambda _, v: url_or_none(v['contentUrl']), {
'id': ('name', {str}),
'url': 'contentUrl',
'width': ('width', {int_or_none}),
'height': ('height', {int_or_none}),
}),
}),
'formats': formats,
'subtitles': subtitles,
'uploader': 'SBSC',
}

View file

@ -0,0 +1,31 @@
from .common import InfoExtractor
from .rtvcplay import RTVCKalturaIE
class SenalColombiaLiveIE(InfoExtractor):
    _VALID_URL = r'https?://(?:www\.)?senalcolombia\.tv/(?P<id>senal-en-vivo)'
    _TESTS = [{
        'url': 'https://www.senalcolombia.tv/senal-en-vivo',
        'info_dict': {
            'id': 'indexSC',
            'title': 're:^Señal Colombia',
            'description': 'md5:799f16a401d97f40c33a2c6a3e2a507b',
            'thumbnail': r're:^https?://.*\.(?:jpg|png)',
            'live_status': 'is_live',
            'ext': 'mp4',
        },
        'params': {
            'skip_download': 'Livestream',
        },
    }]

    def _real_extract(self, url):
        """Read `envivosrc` from the page's drupal-settings JSON and delegate it to RTVCKalturaIE."""
        display_id = self._match_id(url)
        webpage = self._download_webpage(url, display_id)

        # The settings JSON follows a <script> tag whose data-drupal-selector contains "drupal-settings-json"
        settings_pattern = r'<script\b[^>]*data-drupal-selector\s*=\s*"[^"]*drupal-settings-json[^"]*"[^>]*>'
        hydration = self._search_json(settings_pattern, webpage, 'hydration', display_id)

        live_source_url = hydration['envivosrc']
        return self.url_result(live_source_url, RTVCKalturaIE, display_id)

View file

@ -0,0 +1,518 @@
import json
import uuid
from .common import InfoExtractor
from ..utils import (
float_or_none,
traverse_obj,
try_call,
unified_timestamp,
url_or_none,
)
class StagePlusVODConcertIE(InfoExtractor):
    # Extracts a STAGE+ VOD concert as a playlist with one entry per performance work.
    # A bearer token is required: it is obtained either by logging in with
    # credentials or from the `dgplus_access_token` browser cookie.
    _NETRC_MACHINE = 'stageplus'
    _VALID_URL = r'https?://(?:www\.)?stage-plus\.com/video/(?P<id>vod_concert_\w+)'
    _TESTS = [{
        'url': 'https://www.stage-plus.com/video/vod_concert_APNM8GRFDPHMASJKBSPJACG',
        'playlist_count': 6,
        'info_dict': {
            'id': 'vod_concert_APNM8GRFDPHMASJKBSPJACG',
            'title': 'Yuja Wang plays Rachmaninoff\'s Piano Concerto No. 2 from Odeonsplatz',
            'description': 'md5:50f78ec180518c9bdb876bac550996fc',
            'artist': ['Yuja Wang', 'Lorenzo Viotti'],
            'upload_date': '20230331',
            'timestamp': 1680249600,
            'release_date': '20210709',
            'release_timestamp': 1625788800,
            'thumbnails': 'count:3',
        },
        'playlist': [{
            'info_dict': {
                'id': 'performance_work_A1IN4PJFE9MM2RJ3CLBMUSJBBSOJAD9O',
                'ext': 'mp4',
                'title': 'Piano Concerto No. 2 in C Minor, Op. 18',
                'description': 'md5:50f78ec180518c9bdb876bac550996fc',
                'upload_date': '20230331',
                'timestamp': 1680249600,
                'release_date': '20210709',
                'release_timestamp': 1625788800,
                'duration': 2207,
                'chapters': 'count:5',
                'artist': ['Yuja Wang'],
                'composer': ['Sergei Rachmaninoff'],
                'album': 'Yuja Wang plays Rachmaninoff\'s Piano Concerto No. 2 from Odeonsplatz',
                'album_artist': ['Yuja Wang', 'Lorenzo Viotti'],
                'track': 'Piano Concerto No. 2 in C Minor, Op. 18',
                'track_number': 1,
                'genre': 'Instrumental Concerto',
            },
        }],
        'params': {'skip_download': 'm3u8'},
    }]

    # TODO: Prune this after livestream and/or album extractors are added
    _GRAPHQL_QUERY = '''query videoDetailPage($videoId: ID!, $sliderItemsFirst: Int = 24) {
node(id: $videoId) {
__typename
...LiveConcertFields
... on LiveConcert {
artists {
edges {
role {
...RoleFields
}
node {
id
name
sortName
}
}
}
isAtmos
maxResolution
groups {
id
name
typeDisplayName
}
shortDescription
performanceWorks {
...livePerformanceWorkFields
}
totalDuration
sliders {
...contentContainerFields
}
vodConcert {
__typename
id
}
}
...VideoFields
... on Video {
artists {
edges {
role {
...RoleFields
}
node {
id
name
sortName
}
}
}
isAtmos
maxResolution
isLossless
description
productionDate
takedownDate
sliders {
...contentContainerFields
}
}
...VodConcertFields
... on VodConcert {
artists {
edges {
role {
...RoleFields
}
node {
id
name
sortName
}
}
}
isAtmos
maxResolution
groups {
id
name
typeDisplayName
}
performanceWorks {
...PerformanceWorkFields
}
shortDescription
productionDate
takedownDate
sliders {
...contentContainerFields
}
}
}
}
fragment LiveConcertFields on LiveConcert {
endTime
id
pictures {
...PictureFields
}
reruns {
...liveConcertRerunFields
}
publicationLevel
startTime
streamStartTime
subtitle
title
typeDisplayName
stream {
...liveStreamFields
}
trailerStream {
...streamFields
}
geoAccessCountries
geoAccessMode
}
fragment PictureFields on Picture {
id
url
type
}
fragment liveConcertRerunFields on LiveConcertRerun {
streamStartTime
endTime
startTime
stream {
...rerunStreamFields
}
}
fragment rerunStreamFields on RerunStream {
publicationLevel
streamType
url
}
fragment liveStreamFields on LiveStream {
publicationLevel
streamType
url
}
fragment streamFields on Stream {
publicationLevel
streamType
url
}
fragment RoleFields on Role {
__typename
id
type
displayName
}
fragment livePerformanceWorkFields on LivePerformanceWork {
__typename
id
artists {
...artistWithRoleFields
}
groups {
edges {
node {
id
name
typeDisplayName
}
}
}
work {
...workFields
}
}
fragment artistWithRoleFields on ArtistWithRoleConnection {
edges {
role {
...RoleFields
}
node {
id
name
sortName
}
}
}
fragment workFields on Work {
id
title
movements {
id
title
}
composers {
id
name
}
genre {
id
title
}
}
fragment contentContainerFields on CuratedContentContainer {
__typename
...SliderFields
...BannerFields
}
fragment SliderFields on Slider {
id
headline
items(first: $sliderItemsFirst) {
edges {
node {
id
__typename
...AlbumFields
...ArtistFields
...EpochFields
...GenreFields
...GroupFields
...LiveConcertFields
...PartnerFields
...PerformanceWorkFields
...VideoFields
...VodConcertFields
}
}
}
}
fragment AlbumFields on Album {
artistAndGroupDisplayInfo
id
pictures {
...PictureFields
}
title
}
fragment ArtistFields on Artist {
id
name
roles {
...RoleFields
}
pictures {
...PictureFields
}
}
fragment EpochFields on Epoch {
id
endYear
pictures {
...PictureFields
}
startYear
title
}
fragment GenreFields on Genre {
id
pictures {
...PictureFields
}
title
}
fragment GroupFields on Group {
id
name
typeDisplayName
pictures {
...PictureFields
}
}
fragment PartnerFields on Partner {
id
name
typeDisplayName
subtypeDisplayName
pictures {
...PictureFields
}
}
fragment PerformanceWorkFields on PerformanceWork {
__typename
id
artists {
...artistWithRoleFields
}
groups {
edges {
node {
id
name
typeDisplayName
}
}
}
work {
...workFields
}
stream {
...streamFields
}
vodConcert {
__typename
id
}
duration
cuePoints {
mark
title
}
}
fragment VideoFields on Video {
id
archiveReleaseDate
title
subtitle
pictures {
...PictureFields
}
stream {
...streamFields
}
trailerStream {
...streamFields
}
duration
typeDisplayName
duration
geoAccessCountries
geoAccessMode
publicationLevel
takedownDate
}
fragment VodConcertFields on VodConcert {
id
archiveReleaseDate
pictures {
...PictureFields
}
subtitle
title
typeDisplayName
totalDuration
geoAccessCountries
geoAccessMode
trailerStream {
...streamFields
}
publicationLevel
takedownDate
}
fragment BannerFields on Banner {
description
link
pictures {
...PictureFields
}
title
}'''

    # Bearer token used for the GraphQL API and for the stream requests
    _TOKEN = None

    def _perform_login(self, username, password):
        # Password-grant login; a fresh random device id is generated for every login
        auth = self._download_json('https://audience.api.stageplus.io/oauth/token', None, headers={
            'Content-Type': 'application/json',
            'Origin': 'https://www.stage-plus.com',
        }, data=json.dumps({
            'grant_type': 'password',
            'username': username,
            'password': password,
            'device_info': 'Chrome (Windows)',
            'client_device_id': str(uuid.uuid4()),
        }, separators=(',', ':')).encode(), note='Logging in')

        # Only keep the token if the response actually carries one;
        # otherwise _real_initialize falls back to cookies
        if auth.get('access_token'):
            self._TOKEN = auth['access_token']

    def _real_initialize(self):
        if self._TOKEN:
            return

        # No token from a login: try the `dgplus_access_token` cookie instead.
        # try_call yields None when the cookie is missing
        self._TOKEN = try_call(
            lambda: self._get_cookies('https://www.stage-plus.com/')['dgplus_access_token'].value)
        if not self._TOKEN:
            self.raise_login_required()

    def _real_extract(self, url):
        concert_id = self._match_id(url)

        data = self._download_json('https://audience.api.stageplus.io/graphql', concert_id, headers={
            'authorization': f'Bearer {self._TOKEN}',
            'content-type': 'application/json',
            'Origin': 'https://www.stage-plus.com',
        }, data=json.dumps({
            'query': self._GRAPHQL_QUERY,
            'variables': {'videoId': concert_id},
            'operationName': 'videoDetailPage'
        }, separators=(',', ':')).encode())['data']['node']

        # Concert-level metadata; it is also copied into every playlist entry below
        metadata = traverse_obj(data, {
            'title': 'title',
            'description': ('shortDescription', {str}),
            'artist': ('artists', 'edges', ..., 'node', 'name'),
            'timestamp': ('archiveReleaseDate', {unified_timestamp}),
            'release_timestamp': ('productionDate', {unified_timestamp}),
        })

        # The PictureFields fragment of the query selects only `id`, `url` and `type`,
        # so the thumbnail id has to come from `id` (a `name` field is never returned)
        thumbnails = traverse_obj(data, ('pictures', lambda _, v: url_or_none(v['url']), {
            'id': ('id', {str}),
            'url': 'url',
        })) or None

        # The token is passed as a `jwt` header to the m3u8 requests
        # and is also stored in each entry's http_headers
        m3u8_headers = {'jwt': self._TOKEN}

        entries = []
        # Only performance works with an id and a valid stream URL become entries;
        # track numbers count these, starting at 1
        for idx, video in enumerate(traverse_obj(data, (
                'performanceWorks', lambda _, v: v['id'] and url_or_none(v['stream']['url']))), 1):
            formats, subtitles = self._extract_m3u8_formats_and_subtitles(
                video['stream']['url'], video['id'], 'mp4', m3u8_id='hls', headers=m3u8_headers)
            entries.append({
                'id': video['id'],
                'formats': formats,
                'subtitles': subtitles,
                'http_headers': m3u8_headers,
                # The concert acts as the "album" of its works
                'album': metadata.get('title'),
                'album_artist': metadata.get('artist'),
                'track_number': idx,
                # Concert-level defaults first, then per-work values override them
                **metadata,
                **traverse_obj(video, {
                    'title': ('work', 'title'),
                    'track': ('work', 'title'),
                    'duration': ('duration', {float_or_none}),
                    'chapters': (
                        # Skip cue points whose mark is not numeric
                        'cuePoints', lambda _, v: float_or_none(v['mark']) is not None, {
                            'title': 'title',
                            'start_time': ('mark', {float_or_none}),
                        }),
                    'artist': ('artists', 'edges', ..., 'node', 'name'),
                    'composer': ('work', 'composers', ..., 'name'),
                    'genre': ('work', 'genre', 'title'),
                }),
            })

        return self.playlist_result(entries, concert_id, thumbnails=thumbnails, **metadata)

View file

@ -38,11 +38,23 @@ class TelecaribePlayIE(InfoExtractor):
'params': {
'skip_download': 'Livestream',
}
}, {
'url': 'https://www.play.telecaribe.co/liveplus',
'info_dict': {
'id': 'liveplus',
'title': r're:^Señal en vivo Plus',
'live_status': 'is_live',
'ext': 'mp4',
},
'params': {
'skip_download': 'Livestream',
},
'skip': 'Geo-restricted to Colombia',
}]
def _download_player_webpage(self, webpage, display_id):
page_id = self._search_regex(
(r'window.firstPageId\s*=\s*["\']([^"\']+)', r'<div[^>]+id\s*=\s*"pageBackground_([^"]+)'),
(r'window\.firstPageId\s*=\s*["\']([^"\']+)', r'<div[^>]+id\s*=\s*"pageBackground_([^"]+)'),
webpage, 'page_id')
props = self._download_json(self._search_regex(
@ -59,14 +71,16 @@ def _real_extract(self, url):
webpage = self._download_webpage(url, display_id)
player = self._download_player_webpage(webpage, display_id)
if display_id != 'live':
livestream_url = self._search_regex(
r'(?:let|const|var)\s+source\s*=\s*["\']([^"\']+)', player, 'm3u8 url', default=None)
if not livestream_url:
return self.playlist_from_matches(
re.findall(r'<a[^>]+href\s*=\s*"([^"]+\.mp4)', player), display_id,
self._get_clean_title(self._og_search_title(webpage)))
formats, subtitles = self._extract_m3u8_formats_and_subtitles(
self._search_regex(r'(?:let|const|var)\s+source\s*=\s*["\']([^"\']+)', player, 'm3u8 url'),
display_id, 'mp4')
livestream_url, display_id, 'mp4', live=True)
return {
'id': display_id,

View file

@ -5,15 +5,22 @@
class TheSunIE(InfoExtractor):
_VALID_URL = r'https://(?:www\.)?thesun\.co\.uk/[^/]+/(?P<id>\d+)'
_TEST = {
_VALID_URL = r'https?://(?:www\.)?the-?sun(\.co\.uk|\.com)/[^/]+/(?P<id>\d+)'
_TESTS = [{
'url': 'https://www.thesun.co.uk/tvandshowbiz/2261604/orlando-bloom-and-katy-perry-post-adorable-instagram-video-together-celebrating-thanksgiving-after-split-rumours/',
'info_dict': {
'id': '2261604',
'title': 'md5:cba22f48bad9218b64d5bbe0e16afddf',
},
'playlist_count': 2,
}
}, {
'url': 'https://www.the-sun.com/entertainment/7611415/1000lb-sisters-fans-rip-amy-dangerous-health-decision/',
'info_dict': {
'id': '7611415',
'title': 'md5:e0b9b976f79dc770e5c80f22f40bb844',
},
'playlist_count': 1,
}]
BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/%s/default_default/index.html?videoId=%s'
def _real_extract(self, url):

View file

@ -1,6 +1,7 @@
import itertools
import json
import random
import re
import string
import time
@ -12,15 +13,19 @@
LazyList,
UnsupportedError,
UserNotLive,
determine_ext,
format_field,
get_element_by_id,
get_first,
int_or_none,
join_nonempty,
merge_dicts,
qualities,
remove_start,
srt_subtitles_timecode,
str_or_none,
traverse_obj,
try_call,
try_get,
url_or_none,
)
@ -200,6 +205,16 @@ def parse_url_key(url_key):
known_resolutions = {}
def mp3_meta(url):
    # Format overrides for URLs that point to an mp3 (audio-only music track);
    # any other extension contributes no overrides
    if determine_ext(url) != 'mp3':
        return {}

    audio_only_fields = {
        'format_note': 'Music track',
        'ext': 'mp3',
        'acodec': 'mp3',
        'vcodec': 'none',
        # No video stream, so explicitly blank the dimensions
        'width': None,
        'height': None,
    }
    return audio_only_fields
def extract_addr(addr, add_meta={}):
parsed_meta, res = parse_url_key(addr.get('url_key', ''))
if res:
@ -215,7 +230,8 @@ def extract_addr(addr, add_meta={}):
'source_preference': -2 if 'aweme/v1' in url else -1, # Downloads from API might get blocked
**add_meta, **parsed_meta,
'format_note': join_nonempty(
add_meta.get('format_note'), '(API)' if 'aweme/v1' in url else None, delim=' ')
add_meta.get('format_note'), '(API)' if 'aweme/v1' in url else None, delim=' '),
**mp3_meta(url),
} for url in addr.get('url_list') or []]
# Hack: Add direct video links first to prioritize them when removing duplicate formats
@ -271,17 +287,15 @@ def extract_addr(addr, add_meta={}):
thumbnails = []
for cover_id in ('cover', 'ai_dynamic_cover', 'animated_cover', 'ai_dynamic_cover_bak',
'origin_cover', 'dynamic_cover'):
cover = video_info.get(cover_id)
if cover:
for cover_url in cover['url_list']:
for cover_url in traverse_obj(video_info, (cover_id, 'url_list', ...)):
thumbnails.append({
'id': cover_id,
'url': cover_url,
})
stats_info = aweme_detail.get('statistics', {})
author_info = aweme_detail.get('author', {})
music_info = aweme_detail.get('music', {})
stats_info = aweme_detail.get('statistics') or {}
author_info = aweme_detail.get('author') or {}
music_info = aweme_detail.get('music') or {}
user_url = self._UPLOADER_URL_FORMAT % (traverse_obj(author_info,
'sec_uid', 'id', 'uid', 'unique_id',
expected_type=str_or_none, get_all=False))
@ -303,20 +317,27 @@ def extract_addr(addr, add_meta={}):
'extractor_key': TikTokIE.ie_key(),
'extractor': TikTokIE.IE_NAME,
'webpage_url': self._create_url(author_info.get('uid'), aweme_id),
'title': aweme_detail.get('desc'),
'description': aweme_detail.get('desc'),
'view_count': int_or_none(stats_info.get('play_count')),
'like_count': int_or_none(stats_info.get('digg_count')),
'repost_count': int_or_none(stats_info.get('share_count')),
'comment_count': int_or_none(stats_info.get('comment_count')),
'uploader': str_or_none(author_info.get('unique_id')),
'creator': str_or_none(author_info.get('nickname')),
'uploader_id': str_or_none(author_info.get('uid')),
**traverse_obj(aweme_detail, {
'title': ('desc', {str}),
'description': ('desc', {str}),
'timestamp': ('create_time', {int_or_none}),
}),
**traverse_obj(stats_info, {
'view_count': 'play_count',
'like_count': 'digg_count',
'repost_count': 'share_count',
'comment_count': 'comment_count',
}, expected_type=int_or_none),
**traverse_obj(author_info, {
'uploader': 'unique_id',
'uploader_id': 'uid',
'creator': 'nickname',
'channel_id': 'sec_uid',
}, expected_type=str_or_none),
'uploader_url': user_url,
'track': music_track,
'album': str_or_none(music_info.get('album')) or None,
'artist': music_author or None,
'timestamp': int_or_none(aweme_detail.get('create_time')),
'formats': formats,
'subtitles': self.extract_subtitles(aweme_detail, aweme_id),
'thumbnails': thumbnails,
@ -328,37 +349,27 @@ def extract_addr(addr, add_meta={}):
'_format_sort_fields': ('quality', 'codec', 'size', 'br'),
}
def _parse_aweme_video_web(self, aweme_detail, webpage_url):
def _parse_aweme_video_web(self, aweme_detail, webpage_url, video_id):
video_info = aweme_detail['video']
author_info = traverse_obj(aweme_detail, 'authorInfo', 'author', expected_type=dict, default={})
music_info = aweme_detail.get('music') or {}
stats_info = aweme_detail.get('stats') or {}
user_url = self._UPLOADER_URL_FORMAT % (traverse_obj(author_info,
'secUid', 'id', 'uid', 'uniqueId',
expected_type=str_or_none, get_all=False)
or aweme_detail.get('authorSecId'))
channel_id = traverse_obj(author_info or aweme_detail, (('authorSecId', 'secUid'), {str}), get_all=False)
user_url = self._UPLOADER_URL_FORMAT % channel_id if channel_id else None
formats = []
play_url = video_info.get('playAddr')
width = video_info.get('width')
height = video_info.get('height')
if isinstance(play_url, str):
formats = [{
width = int_or_none(video_info.get('width'))
height = int_or_none(video_info.get('height'))
for play_url in traverse_obj(video_info, ('playAddr', ((..., 'src'), None), {url_or_none})):
formats.append({
'url': self._proto_relative_url(play_url),
'ext': 'mp4',
'width': width,
'height': height,
}]
elif isinstance(play_url, list):
formats = [{
'url': self._proto_relative_url(url),
'ext': 'mp4',
'width': width,
'height': height,
} for url in traverse_obj(play_url, (..., 'src'), expected_type=url_or_none) if url]
})
download_url = url_or_none(video_info.get('downloadAddr')) or traverse_obj(video_info, ('download', 'url'), expected_type=url_or_none)
if download_url:
for download_url in traverse_obj(video_info, (('downloadAddr', ('download', 'url')), {url_or_none})):
formats.append({
'format_id': 'download',
'url': self._proto_relative_url(download_url),
@ -366,38 +377,48 @@ def _parse_aweme_video_web(self, aweme_detail, webpage_url):
'width': width,
'height': height,
})
self._remove_duplicate_formats(formats)
thumbnails = []
for thumbnail_name in ('thumbnail', 'cover', 'dynamicCover', 'originCover'):
if aweme_detail.get(thumbnail_name):
thumbnails = [{
'url': self._proto_relative_url(aweme_detail[thumbnail_name]),
for thumb_url in traverse_obj(aweme_detail, (
(None, 'video'), ('thumbnail', 'cover', 'dynamicCover', 'originCover'), {url_or_none})):
thumbnails.append({
'url': self._proto_relative_url(thumb_url),
'width': width,
'height': height
}]
'height': height,
})
return {
'id': traverse_obj(aweme_detail, 'id', 'awemeId', expected_type=str_or_none),
'title': aweme_detail.get('desc'),
'duration': try_get(aweme_detail, lambda x: x['video']['duration'], int),
'view_count': int_or_none(stats_info.get('playCount')),
'like_count': int_or_none(stats_info.get('diggCount')),
'repost_count': int_or_none(stats_info.get('shareCount')),
'comment_count': int_or_none(stats_info.get('commentCount')),
'timestamp': int_or_none(aweme_detail.get('createTime')),
'creator': str_or_none(author_info.get('nickname')),
'uploader': str_or_none(author_info.get('uniqueId') or aweme_detail.get('author')),
'uploader_id': str_or_none(traverse_obj(author_info, 'id', 'uid', 'authorId')),
'id': video_id,
**traverse_obj(aweme_detail, {
'title': ('desc', {str}),
'description': ('desc', {str}),
'duration': ('video', 'duration', {int_or_none}),
'timestamp': ('createTime', {int_or_none}),
}),
**traverse_obj(author_info or aweme_detail, {
'creator': ('nickname', {str}),
'uploader': (('uniqueId', 'author'), {str}),
'uploader_id': (('authorId', 'uid', 'id'), {str_or_none}),
}, get_all=False),
**traverse_obj(stats_info, {
'view_count': 'playCount',
'like_count': 'diggCount',
'repost_count': 'shareCount',
'comment_count': 'commentCount',
}, expected_type=int_or_none),
**traverse_obj(music_info, {
'track': 'title',
'album': ('album', {lambda x: x or None}),
'artist': 'authorName',
}, expected_type=str),
'channel_id': channel_id,
'uploader_url': user_url,
'track': str_or_none(music_info.get('title')),
'album': str_or_none(music_info.get('album')) or None,
'artist': str_or_none(music_info.get('authorName')),
'formats': formats,
'thumbnails': thumbnails,
'description': str_or_none(aweme_detail.get('desc')),
'http_headers': {
'Referer': webpage_url
'Referer': webpage_url,
}
}
@ -431,7 +452,8 @@ class TikTokIE(TikTokBaseIE):
'artist': 'Ysrbeats',
'album': 'Lehanga',
'track': 'Lehanga',
}
},
'skip': '404 Not Found',
}, {
'url': 'https://www.tiktok.com/@patroxofficial/video/6742501081818877190?langCountry=en',
'md5': '6f3cf8cdd9b28cb8363fe0a9a160695b',
@ -446,6 +468,7 @@ class TikTokIE(TikTokBaseIE):
'uploader': 'patrox',
'uploader_id': '18702747',
'uploader_url': 'https://www.tiktok.com/@MS4wLjABAAAAiFnldaILebi5heDoVU6bn4jBWWycX6-9U3xuNPqZ8Ws',
'channel_id': 'MS4wLjABAAAAiFnldaILebi5heDoVU6bn4jBWWycX6-9U3xuNPqZ8Ws',
'creator': 'patroX',
'thumbnail': r're:^https?://[\w\/\.\-]+(~[\w\-]+\.image)?',
'upload_date': '20190930',
@ -456,7 +479,7 @@ class TikTokIE(TikTokBaseIE):
'comment_count': int,
'artist': 'Evan Todd, Jessica Keenan Wynn, Alice Lee, Barrett Wilbert Weed & Jon Eidson',
'track': 'Big Fun',
}
},
}, {
# Banned audio, only available on the app
'url': 'https://www.tiktok.com/@barudakhb_/video/6984138651336838402',
@ -469,6 +492,7 @@ class TikTokIE(TikTokBaseIE):
'creator': 'md5:29f238c49bc0c176cb3cef1a9cea9fa6',
'uploader_id': '6974687867511718913',
'uploader_url': 'https://www.tiktok.com/@MS4wLjABAAAAbhBwQC-R1iKoix6jDFsF-vBdfx2ABoDjaZrM9fX6arU3w71q3cOWgWuTXn1soZ7d',
'channel_id': 'MS4wLjABAAAAbhBwQC-R1iKoix6jDFsF-vBdfx2ABoDjaZrM9fX6arU3w71q3cOWgWuTXn1soZ7d',
'track': 'Boka Dance',
'artist': 'md5:29f238c49bc0c176cb3cef1a9cea9fa6',
'timestamp': 1626121503,
@ -479,7 +503,7 @@ class TikTokIE(TikTokBaseIE):
'like_count': int,
'repost_count': int,
'comment_count': int,
}
},
}, {
# Sponsored video, only available with feed workaround
'url': 'https://www.tiktok.com/@MS4wLjABAAAATh8Vewkn0LYM7Fo03iec3qKdeCUOcBIouRk1mkiag6h3o_pQu_dUXvZ2EZlGST7_/video/7042692929109986561',
@ -492,6 +516,7 @@ class TikTokIE(TikTokBaseIE):
'creator': 'Slap And Run',
'uploader_id': '7036055384943690754',
'uploader_url': 'https://www.tiktok.com/@MS4wLjABAAAATh8Vewkn0LYM7Fo03iec3qKdeCUOcBIouRk1mkiag6h3o_pQu_dUXvZ2EZlGST7_',
'channel_id': 'MS4wLjABAAAATh8Vewkn0LYM7Fo03iec3qKdeCUOcBIouRk1mkiag6h3o_pQu_dUXvZ2EZlGST7_',
'track': 'Promoted Music',
'timestamp': 1639754738,
'duration': 30,
@ -502,7 +527,6 @@ class TikTokIE(TikTokBaseIE):
'repost_count': int,
'comment_count': int,
},
'expected_warnings': ['trying with webpage', 'Unable to find video in feed']
}, {
# Video without title and description
'url': 'https://www.tiktok.com/@pokemonlife22/video/7059698374567611694',
@ -515,6 +539,7 @@ class TikTokIE(TikTokBaseIE):
'creator': 'Pokemon',
'uploader_id': '6820838815978423302',
'uploader_url': 'https://www.tiktok.com/@MS4wLjABAAAA0tF1nBwQVVMyrGu3CqttkNgM68Do1OXUFuCY0CRQk8fEtSVDj89HqoqvbSTmUP2W',
'channel_id': 'MS4wLjABAAAA0tF1nBwQVVMyrGu3CqttkNgM68Do1OXUFuCY0CRQk8fEtSVDj89HqoqvbSTmUP2W',
'track': 'original sound',
'timestamp': 1643714123,
'duration': 6,
@ -549,6 +574,56 @@ class TikTokIE(TikTokBaseIE):
'comment_count': int,
},
'skip': 'This video is unavailable',
}, {
# slideshow audio-only mp3 format
'url': 'https://www.tiktok.com/@_le_cannibale_/video/7139980461132074283',
'info_dict': {
'id': '7139980461132074283',
'ext': 'mp3',
'title': 'TikTok video #7139980461132074283',
'description': '',
'creator': 'Antaura',
'uploader': '_le_cannibale_',
'uploader_id': '6604511138619654149',
'uploader_url': 'https://www.tiktok.com/@MS4wLjABAAAAoShJqaw_5gvy48y3azFeFcT4jeyKWbB0VVYasOCt2tTLwjNFIaDcHAM4D-QGXFOP',
'channel_id': 'MS4wLjABAAAAoShJqaw_5gvy48y3azFeFcT4jeyKWbB0VVYasOCt2tTLwjNFIaDcHAM4D-QGXFOP',
'artist': 'nathan !',
'track': 'grahamscott canon',
'upload_date': '20220905',
'timestamp': 1662406249,
'view_count': int,
'like_count': int,
'repost_count': int,
'comment_count': int,
'thumbnail': r're:^https://.+\.webp',
},
}, {
# only available via web
'url': 'https://www.tiktok.com/@moxypatch/video/7206382937372134662',
'md5': '8d8c0be14127020cd9f5def4a2e6b411',
'info_dict': {
'id': '7206382937372134662',
'ext': 'mp4',
'title': 'md5:1d95c0b96560ca0e8a231af4172b2c0a',
'description': 'md5:1d95c0b96560ca0e8a231af4172b2c0a',
'creator': 'MoxyPatch',
'uploader': 'moxypatch',
'uploader_id': '7039142049363379205',
'uploader_url': 'https://www.tiktok.com/@MS4wLjABAAAAFhqKnngMHJSsifL0w1vFOP5kn3Ndo1ODp0XuIBkNMBCkALTvwILdpu12g3pTtL4V',
'channel_id': 'MS4wLjABAAAAFhqKnngMHJSsifL0w1vFOP5kn3Ndo1ODp0XuIBkNMBCkALTvwILdpu12g3pTtL4V',
'artist': 'your worst nightmare',
'track': 'original sound',
'upload_date': '20230303',
'timestamp': 1677866781,
'duration': 10,
'view_count': int,
'like_count': int,
'repost_count': int,
'comment_count': int,
'thumbnail': r're:^https://.+',
'thumbnails': 'count:3',
},
'expected_warnings': ['Unable to find video in feed'],
}, {
# Auto-captions available
'url': 'https://www.tiktok.com/@hankgreen1/video/7047596209028074758',
@ -563,7 +638,7 @@ def _real_extract(self, url):
self.report_warning(f'{e}; trying with webpage')
url = self._create_url(user_id, video_id)
webpage = self._download_webpage(url, video_id, headers={'User-Agent': 'User-Agent:Mozilla/5.0'})
webpage = self._download_webpage(url, video_id, headers={'User-Agent': 'Mozilla/5.0'})
next_data = self._search_nextjs_data(webpage, video_id, default='{}')
if next_data:
status = traverse_obj(next_data, ('props', 'pageProps', 'statusCode'), expected_type=int) or 0
@ -574,7 +649,7 @@ def _real_extract(self, url):
video_data = traverse_obj(sigi_data, ('ItemModule', video_id), expected_type=dict)
if status == 0:
return self._parse_aweme_video_web(video_data, url)
return self._parse_aweme_video_web(video_data, url, video_id)
elif status == 10216:
raise ExtractorError('This video is private', expected=True)
raise ExtractorError('Video not available', video_id=video_id)
@ -801,6 +876,7 @@ class DouyinIE(TikTokBaseIE):
'description': '#杨超越 小小水手带你去远航❤️',
'uploader_id': '110403406559',
'uploader_url': 'https://www.douyin.com/user/MS4wLjABAAAAEKnfa654JAJ_N5lgZDQluwsxmY0lhfmEYNQBBkwGG98',
'channel_id': 'MS4wLjABAAAAEKnfa654JAJ_N5lgZDQluwsxmY0lhfmEYNQBBkwGG98',
'creator': '杨超越',
'duration': 19782,
'timestamp': 1620905839,
@ -810,6 +886,7 @@ class DouyinIE(TikTokBaseIE):
'like_count': int,
'repost_count': int,
'comment_count': int,
'thumbnail': r're:https?://.+\.jpe?g',
},
}, {
'url': 'https://www.douyin.com/video/6982497745948921092',
@ -821,8 +898,9 @@ class DouyinIE(TikTokBaseIE):
'description': '这个夏日和小羊@杨超越 一起遇见白色幻想',
'uploader_id': '408654318141572',
'uploader_url': 'https://www.douyin.com/user/MS4wLjABAAAAZJpnglcjW2f_CMVcnqA_6oVBXKWMpH0F8LIHuUu8-lA',
'channel_id': 'MS4wLjABAAAAZJpnglcjW2f_CMVcnqA_6oVBXKWMpH0F8LIHuUu8-lA',
'creator': '杨超越工作室',
'duration': 42608,
'duration': 42479,
'timestamp': 1625739481,
'upload_date': '20210708',
'track': '@杨超越工作室创作的原声',
@ -830,6 +908,7 @@ class DouyinIE(TikTokBaseIE):
'like_count': int,
'repost_count': int,
'comment_count': int,
'thumbnail': r're:https?://.+\.jpe?g',
},
}, {
'url': 'https://www.douyin.com/video/6953975910773099811',
@ -841,8 +920,9 @@ class DouyinIE(TikTokBaseIE):
'description': '#一起看海 出现在你的夏日里',
'uploader_id': '110403406559',
'uploader_url': 'https://www.douyin.com/user/MS4wLjABAAAAEKnfa654JAJ_N5lgZDQluwsxmY0lhfmEYNQBBkwGG98',
'channel_id': 'MS4wLjABAAAAEKnfa654JAJ_N5lgZDQluwsxmY0lhfmEYNQBBkwGG98',
'creator': '杨超越',
'duration': 17228,
'duration': 17343,
'timestamp': 1619098692,
'upload_date': '20210422',
'track': '@杨超越创作的原声',
@ -850,6 +930,7 @@ class DouyinIE(TikTokBaseIE):
'like_count': int,
'repost_count': int,
'comment_count': int,
'thumbnail': r're:https?://.+\.jpe?g',
},
}, {
'url': 'https://www.douyin.com/video/6950251282489675042',
@ -878,6 +959,7 @@ class DouyinIE(TikTokBaseIE):
'description': '#哪个爱豆的105度最甜 换个角度看看我哈哈',
'uploader_id': '110403406559',
'uploader_url': 'https://www.douyin.com/user/MS4wLjABAAAAEKnfa654JAJ_N5lgZDQluwsxmY0lhfmEYNQBBkwGG98',
'channel_id': 'MS4wLjABAAAAEKnfa654JAJ_N5lgZDQluwsxmY0lhfmEYNQBBkwGG98',
'creator': '杨超越',
'duration': 15115,
'timestamp': 1621261163,
@ -887,6 +969,7 @@ class DouyinIE(TikTokBaseIE):
'like_count': int,
'repost_count': int,
'comment_count': int,
'thumbnail': r're:https?://.+\.jpe?g',
},
}]
_APP_VERSIONS = [('23.3.0', '230300')]
@ -918,7 +1001,7 @@ def _real_extract(self, url):
render_data = self._parse_json(
render_data_json, video_id, transform_source=compat_urllib_parse_unquote)
return self._parse_aweme_video_web(get_first(render_data, ('aweme', 'detail')), url)
return self._parse_aweme_video_web(get_first(render_data, ('aweme', 'detail')), url, video_id)
class TikTokVMIE(InfoExtractor):
@ -983,40 +1066,173 @@ def _real_extract(self, url):
return self.url_result(new_url)
class TikTokLiveIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?tiktok\.com/@(?P<id>[\w\.-]+)/live'
class TikTokLiveIE(TikTokBaseIE):
_VALID_URL = r'''(?x)https?://(?:
(?:www\.)?tiktok\.com/@(?P<uploader>[\w.-]+)/live|
m\.tiktok\.com/share/live/(?P<id>\d+)
)'''
IE_NAME = 'tiktok:live'
_TESTS = [{
'url': 'https://www.tiktok.com/@weathernewslive/live',
'info_dict': {
'id': '7210809319192726273',
'ext': 'mp4',
'title': r're:ウェザーニュースLiVE[\d\s:-]*',
'creator': 'ウェザーニュースLiVE',
'uploader': 'weathernewslive',
'uploader_id': '6621496731283095554',
'uploader_url': 'https://www.tiktok.com/@weathernewslive',
'live_status': 'is_live',
'concurrent_view_count': int,
},
'params': {'skip_download': 'm3u8'},
}, {
'url': 'https://www.tiktok.com/@pilarmagenta/live',
'info_dict': {
'id': '7209423610325322522',
'ext': 'mp4',
'title': str,
'creator': 'Pilarmagenta',
'uploader': 'pilarmagenta',
'uploader_id': '6624846890674683909',
'uploader_url': 'https://www.tiktok.com/@pilarmagenta',
'live_status': 'is_live',
'concurrent_view_count': int,
},
'skip': 'Livestream',
}, {
'url': 'https://m.tiktok.com/share/live/7209423610325322522/?language=en',
'only_matching': True,
}, {
'url': 'https://www.tiktok.com/@iris04201/live',
'only_matching': True,
}]
def _call_api(self, url, param, room_id, uploader, key=None):
    # Query a room-info endpoint and return its payload only while the room is live.
    # `param` is the name of the query parameter that carries the room id;
    # `key` selects the sub-object of the JSON response that holds the room info
    api_json = self._download_json(
        url, room_id, fatal=False, query={
            'aid': '1988',
            param: room_id,
        })
    room_info = traverse_obj(api_json, (key, {dict}), default={})

    # status == 2 if live else 4
    is_live = int_or_none(room_info.get('status')) == 2
    if is_live:
        return room_info

    if uploader:
        raise UserNotLive(video_id=uploader)
    # If room_id is obtained via mobile share URL and cannot be refreshed, do not wait for live
    raise ExtractorError('This livestream has ended', expected=True)
def _real_extract(self, url):
uploader = self._match_id(url)
webpage = self._download_webpage(url, uploader, headers={'User-Agent': 'User-Agent:Mozilla/5.0'})
room_id = self._html_search_regex(r'snssdk\d*://live\?room_id=(\d+)', webpage, 'room ID', default=None)
uploader, room_id = self._match_valid_url(url).group('uploader', 'id')
webpage = self._download_webpage(
url, uploader or room_id, headers={'User-Agent': 'Mozilla/5.0'}, fatal=not room_id)
if webpage:
data = try_call(lambda: self._get_sigi_state(webpage, uploader or room_id))
room_id = (traverse_obj(data, ('UserModule', 'users', ..., 'roomId', {str_or_none}), get_all=False)
or self._search_regex(r'snssdk\d*://live\?room_id=(\d+)', webpage, 'room ID', default=None)
or room_id)
uploader = uploader or traverse_obj(
data, ('LiveRoom', 'liveRoomUserInfo', 'user', 'uniqueId'),
('UserModule', 'users', ..., 'uniqueId'), get_all=False, expected_type=str)
if not room_id:
raise UserNotLive(video_id=uploader)
live_info = traverse_obj(self._download_json(
'https://www.tiktok.com/api/live/detail/', room_id, query={
'aid': '1988',
'roomID': room_id,
}), 'LiveRoomInfo', expected_type=dict, default={})
if 'status' not in live_info:
raise ExtractorError('Unexpected response from TikTok API')
# status = 2 if live else 4
if not int_or_none(live_info['status']) == 2:
raise UserNotLive(video_id=uploader)
formats = []
live_info = self._call_api(
'https://webcast.tiktok.com/webcast/room/info', 'room_id', room_id, uploader, key='data')
get_quality = qualities(('SD1', 'ld', 'SD2', 'sd', 'HD1', 'hd', 'FULL_HD1', 'uhd', 'ORIGION', 'origin'))
parse_inner = lambda x: self._parse_json(x, None)
for quality, stream in traverse_obj(live_info, (
'stream_url', 'live_core_sdk_data', 'pull_data', 'stream_data',
{parse_inner}, 'data', {dict}), default={}).items():
sdk_params = traverse_obj(stream, ('main', 'sdk_params', {parse_inner}, {
'vcodec': ('VCodec', {str}),
'tbr': ('vbitrate', {lambda x: int_or_none(x, 1000)}),
'resolution': ('resolution', {lambda x: re.match(r'(?i)\d+x\d+|\d+p', x).group().lower()}),
}))
flv_url = traverse_obj(stream, ('main', 'flv', {url_or_none}))
if flv_url:
formats.append({
'url': flv_url,
'ext': 'flv',
'format_id': f'flv-{quality}',
'quality': get_quality(quality),
**sdk_params,
})
hls_url = traverse_obj(stream, ('main', 'hls', {url_or_none}))
if hls_url:
formats.append({
'url': hls_url,
'ext': 'mp4',
'protocol': 'm3u8_native',
'format_id': f'hls-{quality}',
'quality': get_quality(quality),
**sdk_params,
})
def get_vcodec(*keys):
return traverse_obj(live_info, (
'stream_url', *keys, {parse_inner}, 'VCodec', {str}))
for stream in ('hls', 'rtmp'):
stream_url = traverse_obj(live_info, ('stream_url', f'{stream}_pull_url', {url_or_none}))
if stream_url:
formats.append({
'url': stream_url,
'ext': 'mp4' if stream == 'hls' else 'flv',
'protocol': 'm3u8_native' if stream == 'hls' else 'https',
'format_id': f'{stream}-pull',
'vcodec': get_vcodec(f'{stream}_pull_url_params'),
'quality': get_quality('ORIGION'),
})
for f_id, f_url in traverse_obj(live_info, ('stream_url', 'flv_pull_url', {dict}), default={}).items():
if not url_or_none(f_url):
continue
formats.append({
'url': f_url,
'ext': 'flv',
'format_id': f'flv-{f_id}'.lower(),
'vcodec': get_vcodec('flv_pull_url_params', f_id),
'quality': get_quality(f_id),
})
# If uploader is a guest on another's livestream, primary endpoint will not have m3u8 URLs
if not traverse_obj(formats, lambda _, v: v['ext'] == 'mp4'):
live_info = merge_dicts(live_info, self._call_api(
'https://www.tiktok.com/api/live/detail/', 'roomID', room_id, uploader, key='LiveRoomInfo'))
if url_or_none(live_info.get('liveUrl')):
formats.append({
'url': live_info['liveUrl'],
'ext': 'mp4',
'protocol': 'm3u8_native',
'format_id': 'hls-fallback',
'vcodec': 'h264',
'quality': get_quality('origin'),
})
uploader = uploader or traverse_obj(live_info, ('ownerInfo', 'uniqueId'), ('owner', 'display_id'))
return {
'id': room_id,
'title': live_info.get('title') or self._html_search_meta(['og:title', 'twitter:title'], webpage, default=''),
'uploader': uploader,
'uploader_id': traverse_obj(live_info, ('ownerInfo', 'id')),
'creator': traverse_obj(live_info, ('ownerInfo', 'nickname')),
'concurrent_view_count': traverse_obj(live_info, ('liveRoomStats', 'userCount'), expected_type=int),
'formats': self._extract_m3u8_formats(live_info['liveUrl'], room_id, 'mp4', live=True),
'uploader_url': format_field(uploader, None, self._UPLOADER_URL_FORMAT) or None,
'is_live': True,
'formats': formats,
'_format_sort_fields': ('quality', 'ext'),
**traverse_obj(live_info, {
'title': 'title',
'uploader_id': (('ownerInfo', 'owner'), 'id', {str_or_none}),
'creator': (('ownerInfo', 'owner'), 'nickname'),
'concurrent_view_count': (('user_count', ('liveRoomStats', 'userCount')), {int_or_none}),
}, get_all=False),
}

View file

@ -1,15 +1,21 @@
import itertools
import json
import re
from .common import InfoExtractor
from ..utils import (
ExtractorError,
HEADRequest,
UnsupportedError,
determine_ext,
int_or_none,
parse_resolution,
str_or_none,
traverse_obj,
unified_strdate,
unified_timestamp,
url_basename,
urljoin,
url_or_none,
)
@ -22,25 +28,22 @@ def _perform_login(self, username, password):
if self._API_HEADERS.get('Authorization'):
return
user_check = self._download_json(
headers = {**self._API_HEADERS, 'Content-Type': 'application/json'}
user_check = traverse_obj(self._download_json(
f'{self._API_BASE_URL}/api/user/is-valid-username', None, note='Checking username',
fatal=False, expected_status=400, headers={
'Content-Type': 'application/json',
'Origin': 'https://triller.co',
}, data=json.dumps({'username': username}, separators=(',', ':')).encode('utf-8'))
if user_check.get('status'): # endpoint returns "status":false if username exists
fatal=False, expected_status=400, headers=headers,
data=json.dumps({'username': username}, separators=(',', ':')).encode()), 'status')
if user_check: # endpoint returns `"status":false` if username exists
raise ExtractorError('Unable to login: Invalid username', expected=True)
credentials = {
login = self._download_json(
f'{self._API_BASE_URL}/user/auth', None, note='Logging in', fatal=False,
expected_status=400, headers=headers, data=json.dumps({
'username': username,
'password': password,
}
login = self._download_json(
f'{self._API_BASE_URL}/user/auth', None, note='Logging in',
fatal=False, expected_status=400, headers={
'Content-Type': 'application/json',
'Origin': 'https://triller.co',
}, data=json.dumps(credentials, separators=(',', ':')).encode('utf-8'))
}, separators=(',', ':')).encode()) or {}
if not login.get('auth_token'):
if login.get('error') == 1008:
raise ExtractorError('Unable to login: Incorrect password', expected=True)
@ -55,100 +58,100 @@ def _get_comments(self, video_id, limit=15):
headers=self._API_HEADERS, query={'limit': limit}) or {}
if not comment_info.get('comments'):
return
for comment_dict in comment_info['comments']:
yield {
'author': traverse_obj(comment_dict, ('author', 'username')),
'author_id': traverse_obj(comment_dict, ('author', 'user_id')),
'id': comment_dict.get('id'),
'text': comment_dict.get('body'),
'timestamp': unified_timestamp(comment_dict.get('timestamp')),
}
yield from traverse_obj(comment_info, ('comments', ..., {
'id': ('id', {str_or_none}),
'text': 'body',
'author': ('author', 'username'),
'author_id': ('author', 'user_id'),
'timestamp': ('timestamp', {unified_timestamp}),
}))
def _check_user_info(self, user_info):
if not user_info:
self.report_warning('Unable to extract user info')
elif user_info.get('private') and not user_info.get('followed_by_me'):
if user_info.get('private') and not user_info.get('followed_by_me'):
raise ExtractorError('This video is private', expected=True)
elif traverse_obj(user_info, 'blocked_by_user', 'blocking_user'):
raise ExtractorError('The author of the video is blocked', expected=True)
return user_info
def _parse_video_info(self, video_info, username, user_info=None):
video_uuid = video_info.get('video_uuid')
video_id = video_info.get('id')
def _parse_video_info(self, video_info, username, user_id, display_id=None):
video_id = str(video_info['id'])
display_id = display_id or video_info.get('video_uuid')
if traverse_obj(video_info, (
None, ('transcoded_url', 'video_url', 'stream_url', 'audio_url'),
{lambda x: re.search(r'/copyright/', x)}), get_all=False):
self.raise_no_formats('This video has been removed due to licensing restrictions', expected=True)
def format_info(url):
return {
'url': url,
'ext': determine_ext(url),
'format_id': url_basename(url).split('.')[0],
}
formats = []
video_url = traverse_obj(video_info, 'video_url', 'stream_url')
if video_url:
if determine_ext(video_info.get('transcoded_url')) == 'm3u8':
formats.extend(self._extract_m3u8_formats(
video_info['transcoded_url'], video_id, 'mp4', m3u8_id='hls', fatal=False))
for video in traverse_obj(video_info, ('video_set', lambda _, v: url_or_none(v['url']))):
formats.append({
'url': video_url,
'ext': 'mp4',
'vcodec': 'h264',
'width': video_info.get('width'),
'height': video_info.get('height'),
'format_id': url_basename(video_url).split('.')[0],
'filesize': video_info.get('filesize'),
})
video_set = video_info.get('video_set') or []
for video in video_set:
resolution = video.get('resolution') or ''
formats.append({
'url': video['url'],
'ext': 'mp4',
**format_info(video['url']),
**parse_resolution(video.get('resolution')),
'vcodec': video.get('codec'),
'vbr': int_or_none(video.get('bitrate'), 1000),
'width': int_or_none(resolution.split('x')[0]),
'height': int_or_none(resolution.split('x')[1]),
'format_id': url_basename(video['url']).split('.')[0],
})
audio_url = video_info.get('audio_url')
if audio_url:
video_url = traverse_obj(video_info, 'video_url', 'stream_url', expected_type=url_or_none)
if video_url:
formats.append({
'url': audio_url,
'ext': 'm4a',
'format_id': url_basename(audio_url).split('.')[0],
**format_info(video_url),
'vcodec': 'h264',
**traverse_obj(video_info, {
'width': 'width',
'height': 'height',
'filesize': 'filesize',
}, expected_type=int_or_none),
})
manifest_url = video_info.get('transcoded_url')
if manifest_url:
formats.extend(self._extract_m3u8_formats(
manifest_url, video_id, 'mp4', entry_protocol='m3u8_native',
m3u8_id='hls', fatal=False))
audio_url = url_or_none(video_info.get('audio_url'))
if audio_url:
formats.append(format_info(audio_url))
comment_count = int_or_none(video_info.get('comment_count'))
user_info = user_info or traverse_obj(video_info, 'user', default={})
comment_count = traverse_obj(video_info, ('comment_count', {int_or_none}))
return {
'id': str_or_none(video_id) or video_uuid,
'title': video_info.get('description') or f'Video by {username}',
'thumbnail': video_info.get('thumbnail_url'),
'description': video_info.get('description'),
'uploader': str_or_none(username),
'uploader_id': str_or_none(user_info.get('user_id')),
'creator': str_or_none(user_info.get('name')),
'timestamp': unified_timestamp(video_info.get('timestamp')),
'upload_date': unified_strdate(video_info.get('timestamp')),
'duration': int_or_none(video_info.get('duration')),
'view_count': int_or_none(video_info.get('play_count')),
'like_count': int_or_none(video_info.get('likes_count')),
'artist': str_or_none(video_info.get('song_artist')),
'track': str_or_none(video_info.get('song_title')),
'webpage_url': f'https://triller.co/@{username}/video/{video_uuid}',
'id': video_id,
'display_id': display_id,
'uploader': username,
'uploader_id': user_id or traverse_obj(video_info, ('user', 'user_id', {str_or_none})),
'webpage_url': urljoin(f'https://triller.co/@{username}/video/', display_id),
'uploader_url': f'https://triller.co/@{username}',
'extractor_key': TrillerIE.ie_key(),
'extractor': TrillerIE.IE_NAME,
'formats': formats,
'comment_count': comment_count,
'__post_extractor': self.extract_comments(video_id, comment_count),
**traverse_obj(video_info, {
'title': ('description', {lambda x: x.replace('\r\n', ' ')}),
'description': 'description',
'creator': ((('user'), ('users', lambda _, v: str(v['user_id']) == user_id)), 'name'),
'thumbnail': ('thumbnail_url', {url_or_none}),
'timestamp': ('timestamp', {unified_timestamp}),
'duration': ('duration', {int_or_none}),
'view_count': ('play_count', {int_or_none}),
'like_count': ('likes_count', {int_or_none}),
'artist': 'song_artist',
'track': 'song_title',
}, get_all=False),
}
class TrillerIE(TrillerBaseIE):
_VALID_URL = r'''(?x)
https?://(?:www\.)?triller\.co/
@(?P<username>[\w\._]+)/video/
(?P<id>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})
@(?P<username>[\w.]+)/video/(?P<id>[\da-f]{8}-(?:[\da-f]{4}-){3}[\da-f]{12})
'''
_TESTS = [{
'url': 'https://triller.co/@theestallion/video/2358fcd7-3df2-4c77-84c8-1d091610a6cf',
@ -165,16 +168,14 @@ class TrillerIE(TrillerBaseIE):
'timestamp': 1660598222,
'upload_date': '20220815',
'duration': 47,
'height': 3840,
'width': 2160,
'view_count': int,
'like_count': int,
'artist': 'Megan Thee Stallion',
'track': 'Her',
'webpage_url': 'https://triller.co/@theestallion/video/2358fcd7-3df2-4c77-84c8-1d091610a6cf',
'uploader_url': 'https://triller.co/@theestallion',
'comment_count': int,
}
},
'skip': 'This video has been removed due to licensing restrictions',
}, {
'url': 'https://triller.co/@charlidamelio/video/46c6fcfa-aa9e-4503-a50c-68444f44cddc',
'md5': '874055f462af5b0699b9dbb527a505a0',
@ -182,6 +183,7 @@ class TrillerIE(TrillerBaseIE):
'id': '71621339',
'ext': 'mp4',
'title': 'md5:4c91ea82760fe0fffb71b8c3aa7295fc',
'display_id': '46c6fcfa-aa9e-4503-a50c-68444f44cddc',
'thumbnail': r're:^https://uploads\.cdn\.triller\.co/.+\.jpg$',
'description': 'md5:4c91ea82760fe0fffb71b8c3aa7295fc',
'uploader': 'charlidamelio',
@ -190,59 +192,75 @@ class TrillerIE(TrillerBaseIE):
'timestamp': 1660773354,
'upload_date': '20220817',
'duration': 16,
'height': 1920,
'width': 1080,
'view_count': int,
'like_count': int,
'artist': 'Dixie',
'track': 'Someone to Blame',
'webpage_url': 'https://triller.co/@charlidamelio/video/46c6fcfa-aa9e-4503-a50c-68444f44cddc',
'uploader_url': 'https://triller.co/@charlidamelio',
'comment_count': int,
}
},
}, {
'url': 'https://triller.co/@theestallion/video/07f35f38-1f51-48e2-8c5f-f7a8e829988f',
'md5': 'af7b3553e4b8bfca507636471ee2eb41',
'info_dict': {
'id': '71837829',
'ext': 'mp4',
'title': 'UNGRATEFUL VIDEO OUT NOW 👏🏾👏🏾👏🏾 💙💙 link my bio #womeninhiphop',
'display_id': '07f35f38-1f51-48e2-8c5f-f7a8e829988f',
'thumbnail': r're:^https://uploads\.cdn\.triller\.co/.+\.jpg$',
'description': 'UNGRATEFUL VIDEO OUT NOW 👏🏾👏🏾👏🏾 💙💙 link my bio\r\n #womeninhiphop',
'uploader': 'theestallion',
'uploader_id': '18992236',
'creator': 'Megan Thee Stallion',
'timestamp': 1662486178,
'upload_date': '20220906',
'duration': 30,
'view_count': int,
'like_count': int,
'artist': 'Unknown',
'track': 'Unknown',
'uploader_url': 'https://triller.co/@theestallion',
'comment_count': int,
},
}]
def _real_extract(self, url):
username, video_uuid = self._match_valid_url(url).group('username', 'id')
username, display_id = self._match_valid_url(url).group('username', 'id')
video_info = traverse_obj(self._download_json(
f'{self._API_BASE_URL}/api/videos/{video_uuid}',
video_uuid, note='Downloading video info API JSON',
errnote='Unable to download video info API JSON',
headers=self._API_HEADERS), ('videos', 0))
if not video_info:
raise ExtractorError('No video info found in API response')
video_info = self._download_json(
f'{self._API_BASE_URL}/api/videos/{display_id}', display_id,
headers=self._API_HEADERS)['videos'][0]
user_info = self._check_user_info(video_info.get('user') or {})
return self._parse_video_info(video_info, username, user_info)
self._check_user_info(video_info.get('user') or {})
return self._parse_video_info(video_info, username, None, display_id)
class TrillerUserIE(TrillerBaseIE):
_VALID_URL = r'https?://(?:www\.)?triller\.co/@(?P<id>[\w\._]+)/?(?:$|[#?])'
_VALID_URL = r'https?://(?:www\.)?triller\.co/@(?P<id>[\w.]+)/?(?:$|[#?])'
_TESTS = [{
# first videos request only returns 2 videos
'url': 'https://triller.co/@theestallion',
'playlist_mincount': 9,
'playlist_mincount': 12,
'info_dict': {
'id': '18992236',
'title': 'theestallion',
'thumbnail': r're:^https://uploads\.cdn\.triller\.co/.+\.jpg$',
}
},
}, {
'url': 'https://triller.co/@charlidamelio',
'playlist_mincount': 25,
'playlist_mincount': 150,
'info_dict': {
'id': '1875551',
'title': 'charlidamelio',
'thumbnail': r're:^https://uploads\.cdn\.triller\.co/.+\.jpg$',
}
},
}]
def _real_initialize(self):
if not self._API_HEADERS.get('Authorization'):
guest = self._download_json(
f'{self._API_BASE_URL}/user/create_guest',
None, note='Creating guest session', data=b'', headers=self._API_HEADERS, query={
f'{self._API_BASE_URL}/user/create_guest', None,
note='Creating guest session', data=b'', headers=self._API_HEADERS, query={
'platform': 'Web',
'app_version': '',
})
@ -251,44 +269,65 @@ def _real_initialize(self):
self._API_HEADERS['Authorization'] = f'Bearer {guest["auth_token"]}'
def _extract_video_list(self, username, user_id, limit=6):
query = {
'limit': limit,
}
def _entries(self, username, user_id, limit=6):
query = {'limit': limit}
for page in itertools.count(1):
for retry in self.RetryManager():
try:
video_list = self._download_json(
videos = self._download_json(
f'{self._API_BASE_URL}/api/users/{user_id}/videos',
username, note=f'Downloading user video list page {page}',
errnote='Unable to download user video list', headers=self._API_HEADERS,
query=query)
except ExtractorError as e:
if isinstance(e.cause, json.JSONDecodeError) and e.cause.pos == 0:
retry.error = e
continue
raise
if not video_list.get('videos'):
break
yield from video_list['videos']
query['before_time'] = traverse_obj(video_list, ('videos', -1, 'timestamp'))
headers=self._API_HEADERS, query=query)
for video in traverse_obj(videos, ('videos', ...)):
yield self._parse_video_info(video, username, user_id)
query['before_time'] = traverse_obj(videos, ('videos', -1, 'timestamp'))
if not query['before_time']:
break
def _entries(self, videos, username, user_info):
for video in videos:
yield self._parse_video_info(video, username, user_info)
def _real_extract(self, url):
username = self._match_id(url)
user_info = self._check_user_info(self._download_json(
f'{self._API_BASE_URL}/api/users/by_username/{username}',
username, note='Downloading user info',
errnote='Failed to download user info', headers=self._API_HEADERS).get('user', {}))
username, note='Downloading user info', headers=self._API_HEADERS)['user'])
user_id = str_or_none(user_info.get('user_id'))
videos = self._extract_video_list(username, user_id)
thumbnail = user_info.get('avatar_url')
if not user_id:
raise ExtractorError('Unable to extract user ID')
return self.playlist_result(
self._entries(videos, username, user_info), user_id, username, thumbnail=thumbnail)
self._entries(username, user_id), user_id, username, thumbnail=user_info.get('avatar_url'))
class TrillerShortIE(InfoExtractor):
    # Resolves v.triller.co short links: the link is requested with HEAD and
    # whatever URL the response ends up at is handed on as a url_result.
    _VALID_URL = r'https?://v\.triller\.co/(?P<id>\w+)'
    _TESTS = [{
        'url': 'https://v.triller.co/WWZNWk',
        'md5': '5eb8dc2c971bd8cd794ec9e8d5e9d101',
        'info_dict': {
            'id': '66210052',
            'ext': 'mp4',
            'title': 'md5:2dfc89d154cd91a4a18cd9582ba03e16',
            'display_id': 'f4480e1f-fb4e-45b9-a44c-9e6c679ce7eb',
            'thumbnail': r're:^https://uploads\.cdn\.triller\.co/.+\.jpg$',
            'description': 'md5:2dfc89d154cd91a4a18cd9582ba03e16',
            'uploader': 'statefairent',
            'uploader_id': '487545193',
            'creator': 'Official Summer Fair of LA',
            'timestamp': 1629655457,
            'upload_date': '20210822',
            'duration': 19,
            'view_count': int,
            'like_count': int,
            'artist': 'Unknown',
            'track': 'Unknown',
            'uploader_url': 'https://triller.co/@statefairent',
            'comment_count': int,
        },
    }]

    def _real_extract(self, url):
        # Only the final URL of the HEAD response is needed, not its body
        head_request = HEADRequest(url)
        short_id = self._match_id(url)
        resolved_url = self._request_webpage(head_request, short_id).geturl()

        # If the resolved URL is still one this extractor matches, passing it
        # on would loop back here forever; bail out instead.
        if not self.suitable(resolved_url):
            return self.url_result(resolved_url)
        raise UnsupportedError(resolved_url)

View file

@ -482,21 +482,34 @@ def _real_extract(self, url):
class TVPVODBaseIE(InfoExtractor):
_API_BASE_URL = 'https://vod.tvp.pl/api/products'
def _call_api(self, resource, video_id, **kwargs):
return self._download_json(
def _call_api(self, resource, video_id, query={}, **kwargs):
is_valid = lambda x: 200 <= x < 300
document, urlh = self._download_json_handle(
f'{self._API_BASE_URL}/{resource}', video_id,
query={'lang': 'pl', 'platform': 'BROWSER'}, **kwargs)
query={'lang': 'pl', 'platform': 'BROWSER', **query},
expected_status=lambda x: is_valid(x) or 400 <= x < 500, **kwargs)
if is_valid(urlh.status):
return document
raise ExtractorError(f'Woronicza said: {document.get("code")} (HTTP {urlh.status})')
def _parse_video(self, video):
return {
def _parse_video(self, video, with_url=True):
info_dict = traverse_obj(video, {
'id': ('id', {str_or_none}),
'title': 'title',
'age_limit': ('rating', {int_or_none}),
'duration': ('duration', {int_or_none}),
'episode_number': ('number', {int_or_none}),
'series': ('season', 'serial', 'title', {str_or_none}),
'thumbnails': ('images', ..., ..., {'url': ('url', {url_or_none})}),
})
info_dict['description'] = clean_html(dict_get(video, ('lead', 'description')))
if with_url:
info_dict.update({
'_type': 'url',
'url': 'tvp:' + video['externalUid'],
'ie_key': TVPEmbedIE.ie_key(),
'title': video.get('title'),
'description': traverse_obj(video, ('lead', 'description')),
'age_limit': int_or_none(video.get('rating')),
'duration': int_or_none(video.get('duration')),
}
'url': video['webUrl'],
'ie_key': TVPVODVideoIE.ie_key(),
})
return info_dict
class TVPVODVideoIE(TVPVODBaseIE):
@ -506,37 +519,70 @@ class TVPVODVideoIE(TVPVODBaseIE):
_TESTS = [{
'url': 'https://vod.tvp.pl/dla-dzieci,24/laboratorium-alchemika-odcinki,309338/odcinek-24,S01E24,311357',
'info_dict': {
'id': '60468609',
'id': '311357',
'ext': 'mp4',
'title': 'Laboratorium alchemika, Tusze termiczne. Jak zobaczyć niewidoczne. Odcinek 24',
'title': 'Tusze termiczne. Jak zobaczyć niewidoczne. Odcinek 24',
'description': 'md5:1d4098d3e537092ccbac1abf49b7cd4c',
'duration': 300,
'episode_number': 24,
'episode': 'Episode 24',
'age_limit': 0,
'series': 'Laboratorium alchemika',
'thumbnail': 're:https://.+',
'thumbnail': 're:https?://.+',
},
'params': {'skip_download': 'm3u8'},
}, {
'url': 'https://vod.tvp.pl/filmy-dokumentalne,163/ukrainski-sluga-narodu,339667',
'info_dict': {
'id': '51640077',
'id': '339667',
'ext': 'mp4',
'title': 'Ukraiński sługa narodu, Ukraiński sługa narodu',
'series': 'Ukraiński sługa narodu',
'title': 'Ukraiński sługa narodu',
'description': 'md5:b7940c0a8e439b0c81653a986f544ef3',
'age_limit': 12,
'episode': 'Episode 0',
'episode_number': 0,
'duration': 3051,
'thumbnail': 're:https://.+',
'thumbnail': 're:https?://.+',
'subtitles': 'count:2',
},
'params': {'skip_download': 'm3u8'},
}, {
'note': 'embed fails with "payment required"',
'url': 'https://vod.tvp.pl/seriale,18/polowanie-na-cmy-odcinki,390116/odcinek-7,S01E07,398869',
'info_dict': {
'id': '398869',
'ext': 'mp4',
'title': 'odc. 7',
'description': 'md5:dd2bb33f023dc5c2fbaddfbe4cb5dba0',
'duration': 2750,
'age_limit': 16,
'series': 'Polowanie na ćmy',
'episode_number': 7,
'episode': 'Episode 7',
'thumbnail': 're:https?://.+',
},
'params': {'skip_download': 'm3u8'},
}]
def _real_extract(self, url):
video_id = self._match_id(url)
return self._parse_video(self._call_api(f'vods/{video_id}', video_id))
info_dict = self._parse_video(self._call_api(f'vods/{video_id}', video_id), with_url=False)
playlist = self._call_api(f'{video_id}/videos/playlist', video_id, query={'videoType': 'MOVIE'})
info_dict['formats'] = []
for manifest_url in traverse_obj(playlist, ('sources', 'HLS', ..., 'src')):
info_dict['formats'].extend(self._extract_m3u8_formats(manifest_url, video_id, fatal=False))
for manifest_url in traverse_obj(playlist, ('sources', 'DASH', ..., 'src')):
info_dict['formats'].extend(self._extract_mpd_formats(manifest_url, video_id, fatal=False))
info_dict['subtitles'] = {}
for sub in playlist.get('subtitles') or []:
info_dict['subtitles'].setdefault(sub.get('language') or 'und', []).append({
'url': sub['url'],
'ext': 'ttml',
})
return info_dict
class TVPVODSeriesIE(TVPVODBaseIE):
@ -551,7 +597,7 @@ class TVPVODSeriesIE(TVPVODBaseIE):
'age_limit': 12,
'categories': ['seriale'],
},
'playlist_count': 129,
'playlist_count': 130,
}, {
'url': 'https://vod.tvp.pl/programy,88/rolnik-szuka-zony-odcinki,284514',
'only_matching': True,

View file

@ -179,6 +179,14 @@ def _download_access_token(self, video_id, token_kind, param_name):
video_id, ops,
'Downloading %s access token GraphQL' % token_kind)['data'][method]
def _get_thumbnails(self, thumbnail):
return [{
'url': re.sub(r'\d+x\d+(\.\w+)($|(?=[?#]))', r'0x0\g<1>', thumbnail),
'preference': 1,
}, {
'url': thumbnail,
}] if thumbnail else None
class TwitchVodIE(TwitchBaseIE):
IE_NAME = 'twitch:vod'
@ -460,15 +468,13 @@ def _extract_info_gql(self, info, item_id):
is_live, thumbnail = True, None
else:
is_live = False
for p in ('width', 'height'):
thumbnail = thumbnail.replace('{%s}' % p, '0')
return {
'id': vod_id,
'title': info.get('title') or 'Untitled Broadcast',
'description': info.get('description'),
'duration': int_or_none(info.get('lengthSeconds')),
'thumbnail': thumbnail,
'thumbnails': self._get_thumbnails(thumbnail),
'uploader': try_get(info, lambda x: x['owner']['displayName'], compat_str),
'uploader_id': try_get(info, lambda x: x['owner']['login'], compat_str),
'timestamp': unified_timestamp(info.get('publishedAt')),
@ -1053,7 +1059,7 @@ def _real_extract(self, url):
'display_id': channel_name,
'title': title,
'description': description,
'thumbnail': thumbnail,
'thumbnails': self._get_thumbnails(thumbnail),
'uploader': uploader,
'uploader_id': channel_name,
'timestamp': timestamp,

View file

@ -1,6 +1,5 @@
import json
import re
import urllib.error
from .common import InfoExtractor
from .periscope import PeriscopeBaseIE, PeriscopeIE
@ -17,6 +16,7 @@
format_field,
int_or_none,
make_archive_id,
remove_end,
str_or_none,
strip_or_none,
traverse_obj,
@ -32,11 +32,9 @@
class TwitterBaseIE(InfoExtractor):
_API_BASE = 'https://api.twitter.com/1.1/'
_GRAPHQL_API_BASE = 'https://twitter.com/i/api/graphql/'
_TOKENS = {
'AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejRCOuH5E6I8xnZz4puTs%3D1Zv7ttfk8LF81IUq16cHjhLTvJu4FA33AGWWjCpTnA': None,
'AAAAAAAAAAAAAAAAAAAAAPYXBAAAAAAACLXUNDekMxqa8h%2F40K4moUkGsoc%3DTYfbDKbT3jJPCEVnMYqilB28NHfOPqkca3qaAxGfsyKCs0wRbw': None,
}
_BASE_REGEX = r'https?://(?:(?:www|m(?:obile)?)\.)?(?:twitter\.com|twitter3e4tixl4xyajtrzo62zg5vztmjuricljdp2c5kshju4avyoid\.onion)/'
_AUTH = {'Authorization': 'Bearer AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejRCOuH5E6I8xnZz4puTs%3D1Zv7ttfk8LF81IUq16cHjhLTvJu4FA33AGWWjCpTnA'}
_guest_token = None
def _extract_variant_formats(self, variant, video_id):
variant_url = variant.get('url')
@ -94,7 +92,7 @@ def is_logged_in(self):
def _call_api(self, path, video_id, query={}, graphql=False):
cookies = self._get_cookies(self._API_BASE)
headers = {}
headers = self._AUTH.copy()
csrf_cookie = cookies.get('ct0')
if csrf_cookie:
@ -107,52 +105,32 @@ def _call_api(self, path, video_id, query={}, graphql=False):
'x-twitter-active-user': 'yes',
})
last_error = None
for bearer_token in self._TOKENS:
for first_attempt in (True, False):
headers['Authorization'] = f'Bearer {bearer_token}'
if not self.is_logged_in:
if not self._TOKENS[bearer_token]:
if not self.is_logged_in and not self._guest_token:
headers.pop('x-guest-token', None)
guest_token_response = self._download_json(
self._API_BASE + 'guest/activate.json', video_id,
'Downloading guest token', data=b'', headers=headers)
self._TOKENS[bearer_token] = guest_token_response.get('guest_token')
if not self._TOKENS[bearer_token]:
self._guest_token = traverse_obj(self._download_json(
f'{self._API_BASE}guest/activate.json', video_id,
'Downloading guest token', data=b'', headers=headers), 'guest_token')
if self._guest_token:
headers['x-guest-token'] = self._guest_token
elif not self.is_logged_in:
raise ExtractorError('Could not retrieve guest token')
headers['x-guest-token'] = self._TOKENS[bearer_token]
try:
allowed_status = {400, 403, 404} if graphql else {403}
allowed_status = {400, 401, 403, 404} if graphql else {403}
result = self._download_json(
(self._GRAPHQL_API_BASE if graphql else self._API_BASE) + path,
video_id, headers=headers, query=query, expected_status=allowed_status)
except ExtractorError as e:
if last_error:
raise last_error
if not isinstance(e.cause, urllib.error.HTTPError) or e.cause.code != 404:
raise
last_error = e
self.report_warning(
'Twitter API gave 404 response, retrying with deprecated auth token. '
'Only one media item can be extracted')
break # continue outer loop with next bearer_token
video_id, headers=headers, query=query, expected_status=allowed_status,
note=f'Downloading {"GraphQL" if graphql else "legacy API"} JSON')
if result.get('errors'):
errors = traverse_obj(result, ('errors', ..., 'message'), expected_type=str)
if first_attempt and any('bad guest token' in error.lower() for error in errors):
errors = ', '.join(set(traverse_obj(result, ('errors', ..., 'message', {str}))))
if not self.is_logged_in and first_attempt and 'bad guest token' in errors.lower():
self.to_screen('Guest token has expired. Refreshing guest token')
self._TOKENS[bearer_token] = None
self._guest_token = None
continue
error_message = ', '.join(set(errors)) or 'Unknown error'
raise ExtractorError(f'Error(s) while querying API: {error_message}', expected=True)
raise ExtractorError(
f'Error(s) while querying API: {errors or "Unknown error"}', expected=True)
return result
@ -313,6 +291,7 @@ class TwitterIE(TwitterBaseIE):
'comment_count': int,
'repost_count': int,
'like_count': int,
'view_count': int,
'tags': [],
'age_limit': 18,
},
@ -391,6 +370,7 @@ class TwitterIE(TwitterBaseIE):
'comment_count': int,
'repost_count': int,
'like_count': int,
'view_count': int,
'tags': ['Damndaniel'],
'age_limit': 0,
},
@ -431,6 +411,7 @@ class TwitterIE(TwitterBaseIE):
'comment_count': int,
'repost_count': int,
'like_count': int,
'view_count': int,
'tags': [],
'age_limit': 0,
},
@ -480,6 +461,7 @@ class TwitterIE(TwitterBaseIE):
'comment_count': int,
'repost_count': int,
'like_count': int,
'view_count': int,
'tags': ['Maria'],
'age_limit': 0,
},
@ -505,6 +487,7 @@ class TwitterIE(TwitterBaseIE):
'comment_count': int,
'repost_count': int,
'like_count': int,
'view_count': int,
'tags': [],
'age_limit': 0,
},
@ -529,6 +512,7 @@ class TwitterIE(TwitterBaseIE):
'comment_count': int,
'repost_count': int,
'like_count': int,
'view_count': int,
'tags': [],
'age_limit': 0,
},
@ -589,6 +573,7 @@ class TwitterIE(TwitterBaseIE):
'comment_count': int,
'repost_count': int,
'like_count': int,
'view_count': int,
'tags': [],
'age_limit': 0,
},
@ -630,12 +615,12 @@ class TwitterIE(TwitterBaseIE):
'comment_count': int,
'repost_count': int,
'like_count': int,
'view_count': int,
'tags': ['HurricaneIan'],
'age_limit': 0,
},
}, {
# Adult content, uses old token
# Fails if not logged in (GraphQL)
# Adult content, fails if not logged in (GraphQL)
'url': 'https://twitter.com/Rizdraws/status/1575199173472927762',
'info_dict': {
'id': '1575199163847000068',
@ -655,9 +640,8 @@ class TwitterIE(TwitterBaseIE):
'age_limit': 18,
'tags': []
},
'expected_warnings': ['404'],
'skip': 'Requires authentication',
}, {
# Description is missing one https://t.co url (GraphQL)
'url': 'https://twitter.com/Srirachachau/status/1395079556562706435',
'playlist_mincount': 2,
'info_dict': {
@ -669,14 +653,13 @@ class TwitterIE(TwitterBaseIE):
'upload_date': '20210519',
'age_limit': 0,
'repost_count': int,
'description': 'Here it is! Finished my gothic western cartoon. Pretty proud of it. It\'s got some goofs and lots of splashy over the top violence, something for everyone, hope you like it https://t.co/fOsG5glUnw https://t.co/kbXZrozlY7',
'description': 'Here it is! Finished my gothic western cartoon. Pretty proud of it. It\'s got some goofs and lots of splashy over the top violence, something for everyone, hope you like it https://t.co/fOsG5glUnw',
'uploader_id': 'Srirachachau',
'comment_count': int,
'uploader_url': 'https://twitter.com/Srirachachau',
'timestamp': 1621447860,
},
}, {
# Description is missing one https://t.co url (GraphQL)
'url': 'https://twitter.com/DavidToons_/status/1578353380363501568',
'playlist_mincount': 2,
'info_dict': {
@ -688,7 +671,7 @@ class TwitterIE(TwitterBaseIE):
'uploader': str,
'timestamp': 1665143744,
'uploader_url': 'https://twitter.com/DavidToons_',
'description': 'Chris sounds like Linda from Bob\'s Burgers, so as an animator: this had to be done. https://t.co/glfQdgfFXH https://t.co/WgJauwIW1w',
'description': 'Chris sounds like Linda from Bob\'s Burgers, so as an animator: this had to be done. https://t.co/WgJauwIW1w',
'tags': [],
'comment_count': int,
'upload_date': '20221007',
@ -752,7 +735,7 @@ class TwitterIE(TwitterBaseIE):
'info_dict': {
'id': '1600649511827013632',
'ext': 'mp4',
'title': 'md5:dac4f4d4c591fcc4e88a253eba472dc3',
'title': 'md5:7662a0a27ce6faa3e5b160340f3cfab1',
'thumbnail': r're:^https?://.+\.jpg',
'timestamp': 1670459604.0,
'uploader_id': 'CTVJLaidlaw',
@ -764,6 +747,7 @@ class TwitterIE(TwitterBaseIE):
'uploader_url': 'https://twitter.com/CTVJLaidlaw',
'display_id': '1600649710662213632',
'like_count': int,
'view_count': int,
'description': 'md5:591c19ce66fadc2359725d5cd0d1052c',
'upload_date': '20221208',
'age_limit': 0,
@ -791,6 +775,7 @@ class TwitterIE(TwitterBaseIE):
'like_count': int,
'repost_count': int,
'comment_count': int,
'view_count': int,
},
}, {
'url': 'https://twitter.com/hlo_again/status/1599108751385972737/video/2',
@ -806,6 +791,7 @@ class TwitterIE(TwitterBaseIE):
'repost_count': int,
'duration': 9.531,
'comment_count': int,
'view_count': int,
'upload_date': '20221203',
'age_limit': 0,
'timestamp': 1670092210.0,
@ -815,7 +801,6 @@ class TwitterIE(TwitterBaseIE):
},
'params': {'noplaylist': True},
}, {
# Media view count is GraphQL only, force in test
'url': 'https://twitter.com/MunTheShinobi/status/1600009574919962625',
'info_dict': {
'id': '1600009362759733248',
@ -826,10 +811,10 @@ class TwitterIE(TwitterBaseIE):
'view_count': int,
'thumbnail': 'https://pbs.twimg.com/ext_tw_video_thumb/1600009362759733248/pu/img/XVhFQivj75H_YxxV.jpg?name=orig',
'age_limit': 0,
'uploader': 'Mün The Shinobi | BlaqBoi\'s Therapist',
'uploader': 'Mün The Shinobi',
'repost_count': int,
'upload_date': '20221206',
'title': 'Mün The Shinobi | BlaqBoi\'s Therapist - This is a genius ad by Apple. \U0001f525\U0001f525\U0001f525\U0001f525\U0001f525',
'title': 'Mün The Shinobi - This is a genius ad by Apple. \U0001f525\U0001f525\U0001f525\U0001f525\U0001f525',
'comment_count': int,
'like_count': int,
'tags': [],
@ -837,9 +822,8 @@ class TwitterIE(TwitterBaseIE):
'duration': 139.987,
'timestamp': 1670306984.0,
},
'params': {'extractor_args': {'twitter': {'force_graphql': ['']}}},
}, {
# url to retweet id
# url to retweet id, legacy API
'url': 'https://twitter.com/liberdalau/status/1623739803874349067',
'info_dict': {
'id': '1623274794488659969',
@ -860,6 +844,7 @@ class TwitterIE(TwitterBaseIE):
'repost_count': int,
'comment_count': int,
},
'params': {'extractor_args': {'twitter': {'legacy_api': ['']}}},
}, {
# onion route
'url': 'https://twitter3e4tixl4xyajtrzo62zg5vztmjuricljdp2c5kshju4avyoid.onion/TwitterBlue/status/1484226494708662273',
@ -905,11 +890,13 @@ def _graphql_to_legacy(self, data, twid):
'tweet_results', 'result', ('tweet', None),
), expected_type=dict, default={}, get_all=False)
if result.get('__typename') not in ('Tweet', None):
if result.get('__typename') not in ('Tweet', 'TweetTombstone', None):
self.report_warning(f'Unknown typename: {result.get("__typename")}', twid, only_once=True)
if 'tombstone' in result:
cause = traverse_obj(result, ('tombstone', 'text', 'text'), expected_type=str)
cause = remove_end(traverse_obj(result, ('tombstone', 'text', 'text', {str})), '. Learn more')
if cause and 'adult content' in cause:
self.raise_login_required(cause)
raise ExtractorError(f'Twitter API says: {cause or "Unknown error"}', expected=True)
status = result.get('legacy', {})
@ -922,7 +909,7 @@ def _graphql_to_legacy(self, data, twid):
# extra transformation is needed since result does not match legacy format
binding_values = {
binding_value.get('key'): binding_value.get('value')
for binding_value in traverse_obj(status, ('card', 'binding_values', ...), expected_type=dict)
for binding_value in traverse_obj(status, ('card', 'binding_values', ..., {dict}))
}
if binding_values:
status['card']['binding_values'] = binding_values
@ -965,12 +952,7 @@ def _build_graphql_query(self, media_id):
def _real_extract(self, url):
twid, selected_index = self._match_valid_url(url).group('id', 'index')
if self.is_logged_in or self._configuration_arg('force_graphql'):
self.write_debug(f'Using GraphQL API (Auth = {self.is_logged_in})')
result = self._call_graphql_api('zZXycP0V6H7m-2r0mOnFcA/TweetDetail', twid)
status = self._graphql_to_legacy(result, twid)
else:
if self._configuration_arg('legacy_api') and not self.is_logged_in:
status = traverse_obj(self._call_api(f'statuses/show/{twid}.json', twid, {
'cards_platform': 'Web-12',
'include_cards': 1,
@ -978,6 +960,9 @@ def _real_extract(self, url):
'include_user_entities': 0,
'tweet_mode': 'extended',
}), 'retweeted_status', None)
else:
result = self._call_graphql_api('zZXycP0V6H7m-2r0mOnFcA/TweetDetail', twid)
status = self._graphql_to_legacy(result, twid)
title = description = status['full_text'].replace('\n', ' ')
# strip 'https -_t.co_BJYgOjSeGA' junk from filenames
@ -1142,7 +1127,8 @@ def get_binding_value(k):
if not entries:
expanded_url = traverse_obj(status, ('entities', 'urls', 0, 'expanded_url'), expected_type=url_or_none)
if not expanded_url or expanded_url == url:
raise ExtractorError('No video could be found in this tweet', expected=True)
self.raise_no_formats('No video could be found in this tweet', expected=True)
return info
return self.url_result(expanded_url, display_id=twid, **info)

108
yt_dlp/extractor/wevidi.py Normal file
View file

@ -0,0 +1,108 @@
from .common import InfoExtractor
from ..utils import clean_html, float_or_none, get_element_by_class, js_to_json, traverse_obj
class WeVidiIE(InfoExtractor):
    # Matches watch pages whose video ID is exactly 11 word/dash characters
    _VALID_URL = r'https?://(?:www\.)?wevidi\.net/watch/(?P<id>[\w-]{11})'
    _TESTS = [{
        'url': 'https://wevidi.net/watch/2th7UO5F4KV',
        'md5': 'b913d1ff5bbad499e2c7ef4aa6d829d7',
        'info_dict': {
            'id': '2th7UO5F4KV',
            'ext': 'mp4',
            'title': 'YouTube Alternative: WeVidi - customizable channels & more',
            'thumbnail': r're:^https?://.*\.jpg$',
            'description': 'md5:73a27d0a87d49fbcc5584566326ebeed',
            'uploader': 'eclecRC',
            'duration': 932.098,
        },
    }, {
        'url': 'https://wevidi.net/watch/ievRuuQHbPS',
        'md5': 'ce8a94989a959bff9003fa27ee572935',
        'info_dict': {
            'id': 'ievRuuQHbPS',
            'ext': 'mp4',
            'title': 'WeVidi Playlists',
            'thumbnail': r're:^https?://.*\.jpg$',
            'description': 'md5:32cdfca272687390d9bd9b0c9c6153ee',
            'uploader': 'WeVidi',
            'duration': 36.1999,
        },
    }, {
        'url': 'https://wevidi.net/watch/PcMzDWaQSWb',
        'md5': '55ee0d3434be5d9e5cc76b83f2bb57ec',
        'info_dict': {
            'id': 'PcMzDWaQSWb',
            'ext': 'mp4',
            'title': 'Cat blep',
            'thumbnail': r're:^https?://.*\.jpg$',
            'description': 'md5:e2c9e2b54b8bb424cc64937c8fdc068f',
            'uploader': 'WeVidi',
            'duration': 41.972,
        },
    }, {
        'url': 'https://wevidi.net/watch/wJnRqDHNe_u',
        'md5': 'c8f263dd47e66cc17546b3abf47b5a77',
        'info_dict': {
            'id': 'wJnRqDHNe_u',
            'ext': 'mp4',
            'title': 'Gissy Talks: YouTube Alternatives',
            'thumbnail': r're:^https?://.*\.jpg$',
            'description': 'md5:e65036f0d4af80e0af191bd11af5195e',
            'uploader': 'GissyEva',
            'duration': 630.451,
        },
    }, {
        'url': 'https://wevidi.net/watch/4m1c4yJR_yc',
        'md5': 'c63ce5ca6990dce86855fc02ca5bc1ed',
        'info_dict': {
            'id': '4m1c4yJR_yc',
            'ext': 'mp4',
            'title': 'Enough of that! - Awesome Exilez Podcast',
            'thumbnail': r're:^https?://.*\.jpg$',
            'description': 'md5:96af99dd63468b2dfab3020560e3e9b2',
            'uploader': 'eclecRC',
            'duration': 6.804,
        },
    }]

    def _extract_formats(self, wvplayer_props):
        """Yield one MP4 format dict for every non-zero integer entry of ``resolutions``.

        ``wvplayer_props`` is the parsed argument object of the page's ``WVPlayer(...)`` call.
        It must contain ``srcVID``, ``srcUID`` and ``srcNAME`` (a ``KeyError`` is raised
        otherwise); these form the path of the playback URL.
        """
        # Resolution code -> frame height
        # Taken from WeVidi player JS: https://wevidi.net/layouts/default/static/player.min.js
        height_by_code = {
            1: 144,
            2: 240,
            3: 360,
            4: 480,
            5: 720,
            6: 1080
        }

        vid, uid, name = (wvplayer_props[key] for key in ('srcVID', 'srcUID', 'srcNAME'))
        src_path = f'{vid}/{uid}/{name}'

        # Keep only int entries and drop falsy ones (i.e. 0)
        resolution_codes = traverse_obj(
            wvplayer_props, ('resolutions', ..., {int}, {lambda x: x or None}))

        for code in resolution_codes:
            # ceil(code / 2) - 1, computed with integer arithmetic only
            stream_id = str(-(code // -2) - 1)
            yield {
                'acodec': 'mp4a.40.2',
                'ext': 'mp4',
                'format_id': stream_id,
                'height': height_by_code.get(code),
                'url': f'https://www.wevidi.net/videoplayback/{src_path}/{stream_id}',
                'vcodec': 'avc1.42E01E',
            }

    def _real_extract(self, url):
        """Download the watch page and build the info dict from its markup and player call."""
        video_id = self._match_id(url)
        webpage = self._download_webpage(url, video_id)

        def to_json(player_args):
            # Every `||` is replaced with `}` before the JS literal is converted to JSON
            # NOTE(review): presumably the object is followed by `|| ...` in the page - confirm
            return js_to_json(player_args.replace('||', '}'))

        def element_text(class_name):
            return clean_html(get_element_by_class(class_name, webpage))

        wvplayer_props = self._search_json(
            r'WVPlayer\(', webpage, 'player', video_id, transform_source=to_json)

        return {
            'id': video_id,
            'title': element_text('video_title'),
            'description': element_text('descr_long'),
            'uploader': element_text('username'),
            'formats': list(self._extract_formats(wvplayer_props)),
            'thumbnail': self._og_search_thumbnail(webpage),
            'duration': float_or_none(wvplayer_props.get('duration')),
        }

50
yt_dlp/extractor/whyp.py Normal file
View file

@ -0,0 +1,50 @@
from .common import InfoExtractor
from ..utils import (
float_or_none,
str_or_none,
traverse_obj,
url_or_none,
)
class WhypIE(InfoExtractor):
    # Only the numeric track ID is captured; a trailing slug is not part of the match
    _VALID_URL = r'https?://(?:www\.)?whyp\.it/tracks/(?P<id>\d+)'
    _TESTS = [{
        'url': 'https://www.whyp.it/tracks/18337/home-page-example-track-b4kq7',
        'md5': 'c1187b42ebf8605284e3dc92aeb33d16',
        'info_dict': {
            'url': 'https://cdn.whyp.it/50eb17cc-e9ff-4e18-b89b-dc9206a95cb1.mp3',
            'id': '18337',
            'title': 'Home Page Example Track',
            'description': 'md5:bd758000fb93f3159339c852b5b9133c',
            'ext': 'mp3',
            'duration': 52.82,
            'uploader': 'Brad',
            'uploader_id': '1',
            'thumbnail': 'https://cdn.whyp.it/a537bb36-3373-4c61-96c8-27fc1b2f427a.jpg',
        },
    }, {
        'url': 'https://www.whyp.it/tracks/18337',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Return the info dict for a track, read from the ``rawTrack`` entry of the page's Nuxt data.

        ``audio_url`` is mandatory (``KeyError`` if absent); all other metadata is optional.
        """
        unique_id = self._match_id(url)
        webpage = self._download_webpage(url, unique_id)
        track = self._search_nuxt_data(webpage, unique_id)['rawTrack']

        # Mandatory fields first, so that a missing `audio_url` fails before anything else
        info = {
            'url': track['audio_url'],
            'id': unique_id,
        }
        # Optional metadata
        info.update(traverse_obj(track, {
            'title': 'title',
            'description': 'description',
            'duration': ('duration', {float_or_none}),
            'uploader': ('user', 'username'),
            'uploader_id': ('user', 'id', {str_or_none}),
            'thumbnail': ('artwork_url', {url_or_none}),
        }))
        # Fixed values: audio-only MP3, and a Referer header attached to the result
        info.update({
            'ext': 'mp3',
            'vcodec': 'none',
            'http_headers': {'Referer': 'https://whyp.it/'},
        })
        return info

View file

@ -2,7 +2,6 @@
import itertools
import urllib.parse
from .brightcove import BrightcoveNewIE
from .common import InfoExtractor, SearchInfoExtractor
from .youtube import YoutubeIE
from ..utils import (
@ -11,7 +10,6 @@
int_or_none,
mimetype2ext,
parse_iso8601,
smuggle_url,
traverse_obj,
try_get,
url_or_none,
@ -337,121 +335,6 @@ def _search_results(self, query):
break
class YahooGyaOPlayerIE(InfoExtractor):
    IE_NAME = 'yahoo:gyao:player'
    # The ID is either a `<digits>/v<digits>/v<digits>` path or a UUID
    _VALID_URL = r'https?://(?:gyao\.yahoo\.co\.jp/(?:player|episode(?:/[^/]+)?)|streaming\.yahoo\.co\.jp/c/y)/(?P<id>\d+/v\d+/v\d+|[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})'
    _TESTS = [{
        'url': 'https://gyao.yahoo.co.jp/player/00998/v00818/v0000000000000008564/',
        'info_dict': {
            'id': '5993125228001',
            'ext': 'mp4',
            'title': 'フューリー 【字幕版】',
            'description': 'md5:21e691c798a15330eda4db17a8fe45a5',
            'uploader_id': '4235717419001',
            'upload_date': '20190124',
            'timestamp': 1548294365,
        },
        'params': {
            # m3u8 download
            'skip_download': True,
        },
    }, {
        'url': 'https://streaming.yahoo.co.jp/c/y/01034/v00133/v0000000000000000706/',
        'only_matching': True,
    }, {
        'url': 'https://gyao.yahoo.co.jp/episode/%E3%81%8D%E3%81%AE%E3%81%86%E4%BD%95%E9%A3%9F%E3%81%B9%E3%81%9F%EF%BC%9F%20%E7%AC%AC2%E8%A9%B1%202019%2F4%2F12%E6%94%BE%E9%80%81%E5%88%86/5cb02352-b725-409e-9f8d-88f947a9f682',
        'only_matching': True,
    }, {
        'url': 'https://gyao.yahoo.co.jp/episode/5fa1226c-ef8d-4e93-af7a-fd92f4e30597',
        'only_matching': True,
    }]
    _GEO_BYPASS = False

    def _real_extract(self, url):
        """Look the video up through the playback GraphQL endpoint and delegate to Brightcove.

        Raises a geo-restriction error (JP) when the API answers 'not in japan', and
        ExtractorError with the API's message for any other empty ``content`` response.
        """
        # Slashes of the path-style ID are turned into colons for the API
        video_id = self._match_id(url).replace('/', ':')

        request_headers = self.geo_verification_headers()
        request_headers['Accept'] = 'application/json'

        graphql_query = '''{
content(parameter: {contentId: "%s", logicaAgent: PC_WEB}) {
video {
delivery {
id
}
title
}
}
}''' % video_id

        resp = self._download_json(
            'https://gyao.yahoo.co.jp/apis/playback/graphql', video_id,
            query={
                'appId': 'dj00aiZpPUNJeDh2cU1RazU3UCZzPWNvbnN1bWVyc2VjcmV0Jng9NTk-',
                'query': graphql_query,
            }, headers=request_headers)

        content = resp['data']['content']
        if content:
            video = content['video']
            title = video['title']
            # The Brightcove extractor does the actual extraction; the JP geo hint is smuggled along
            brightcove_url = smuggle_url(
                'http://players.brightcove.net/4235717419001/SyG5P0gjb_default/index.html?videoId=' + video['delivery']['id'],
                {'geo_countries': ['JP']})
            return {
                '_type': 'url_transparent',
                'id': video_id,
                'title': title,
                'url': brightcove_url,
                'ie_key': BrightcoveNewIE.ie_key(),
            }

        # No content: report the first error message given by the API
        msg = resp['errors'][0]['message']
        if msg == 'not in japan':
            self.raise_geo_restricted(countries=['JP'])
        raise ExtractorError(msg)
class YahooGyaOIE(InfoExtractor):
    IE_NAME = 'yahoo:gyao'
    # The ID is either a `<digits>/v<digits>` path or a UUID
    _VALID_URL = r'https?://(?:gyao\.yahoo\.co\.jp/(?:p|title(?:/[^/]+)?)|streaming\.yahoo\.co\.jp/p/y)/(?P<id>\d+/v\d+|[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})'
    _TESTS = [{
        'url': 'https://gyao.yahoo.co.jp/title/%E3%82%BF%E3%82%A4%E3%83%A0%E3%83%9C%E3%82%AB%E3%83%B3%E3%82%B7%E3%83%AA%E3%83%BC%E3%82%BA%20%E3%83%A4%E3%83%83%E3%82%BF%E3%83%BC%E3%83%9E%E3%83%B3/5f60ceb3-6e5e-40ef-ba40-d68b598d067f',
        'info_dict': {
            'id': '5f60ceb3-6e5e-40ef-ba40-d68b598d067f',
        },
        'playlist_mincount': 80,
    }, {
        'url': 'https://gyao.yahoo.co.jp/p/00449/v03102/',
        'only_matching': True,
    }, {
        'url': 'https://streaming.yahoo.co.jp/p/y/01034/v00133/',
        'only_matching': True,
    }, {
        'url': 'https://gyao.yahoo.co.jp/title/%E3%81%97%E3%82%83%E3%81%B9%E3%81%8F%E3%82%8A007/5b025a49-b2e5-4dc7-945c-09c6634afacf',
        'only_matching': True,
    }, {
        'url': 'https://gyao.yahoo.co.jp/title/5b025a49-b2e5-4dc7-945c-09c6634afacf',
        'only_matching': True,
    }]

    def _entries(self, program_id):
        """Yield a url_result for each available video of the program, page by page.

        Paging stops on an empty response or once a page is flagged ``ended``.
        """
        for page in itertools.count(1):
            playlist = self._download_json(
                f'https://gyao.yahoo.co.jp/api/programs/{program_id}/videos?page={page}&serviceId=gy', program_id,
                note=f'Downloading JSON metadata page {page}')
            if not playlist:
                return

            for video in playlist['videos']:
                video_id = video.get('id')
                # Skip entries without an ID and those whose streaming availability is 'notYet'
                if not video_id or video.get('streamingAvailability') == 'notYet':
                    continue
                yield self.url_result(
                    'https://gyao.yahoo.co.jp/player/%s/' % video_id.replace(':', '/'),
                    YahooGyaOPlayerIE.ie_key(), video_id)

            if playlist.get('ended'):
                return

    def _real_extract(self, url):
        """Return a playlist of the program's videos; the ID uses colons instead of slashes."""
        program_id = self._match_id(url).replace('/', ':')
        entries = self._entries(program_id)
        return self.playlist_result(entries, program_id)
class YahooJapanNewsIE(InfoExtractor):
IE_NAME = 'yahoo:japannews'
IE_DESC = 'Yahoo! Japan News'

View file

@ -6,6 +6,7 @@
from .common import InfoExtractor
from ..utils import (
ExtractorError,
clean_html,
get_element_by_class,
js_to_json,
str_or_none,
@ -26,48 +27,8 @@ class YoukuIE(InfoExtractor):
'''
_TESTS = [{
# MD5 is unstable
'url': 'http://v.youku.com/v_show/id_XMTc1ODE5Njcy.html',
'info_dict': {
'id': 'XMTc1ODE5Njcy',
'title': '★Smile﹗♡ Git Fresh -Booty Music舞蹈.',
'ext': 'mp4',
'duration': 74.73,
'thumbnail': r're:^https?://.*',
'uploader': '。躲猫猫、',
'uploader_id': '36017967',
'uploader_url': 'http://i.youku.com/u/UMTQ0MDcxODY4',
'tags': list,
}
}, {
'url': 'http://player.youku.com/player.php/sid/XNDgyMDQ2NTQw/v.swf',
'only_matching': True,
}, {
'url': 'http://v.youku.com/v_show/id_XODgxNjg1Mzk2_ev_1.html',
'info_dict': {
'id': 'XODgxNjg1Mzk2',
'ext': 'mp4',
'title': '武媚娘传奇 85',
'duration': 1999.61,
'thumbnail': r're:^https?://.*',
'uploader': '疯狂豆花',
'uploader_id': '62583473',
'uploader_url': 'http://i.youku.com/u/UMjUwMzMzODky',
'tags': list,
},
}, {
'url': 'http://v.youku.com/v_show/id_XMTI1OTczNDM5Mg==.html',
'info_dict': {
'id': 'XMTI1OTczNDM5Mg',
'ext': 'mp4',
'title': '花千骨 04',
'duration': 2363,
'thumbnail': r're:^https?://.*',
'uploader': '放剧场-花千骨',
'uploader_id': '772849359',
'uploader_url': 'http://i.youku.com/u/UMzA5MTM5NzQzNg==',
'tags': list,
},
}, {
'url': 'http://v.youku.com/v_show/id_XNjA1NzA2Njgw.html',
'note': 'Video protected with password',
@ -81,6 +42,7 @@ class YoukuIE(InfoExtractor):
'uploader_id': '322014285',
'uploader_url': 'http://i.youku.com/u/UMTI4ODA1NzE0MA==',
'tags': list,
'skip': '404',
},
'params': {
'videopassword': '100600',
@ -192,7 +154,7 @@ def _real_extract(self, url):
else:
msg = 'Youku server reported error %i' % error.get('code')
if error_note is not None:
msg += ': ' + error_note
msg += ': ' + clean_html(error_note)
raise ExtractorError(msg)
# get video title

View file

@ -6,6 +6,7 @@
int_or_none,
merge_dicts,
str_to_int,
traverse_obj,
unified_strdate,
url_or_none,
)
@ -86,32 +87,31 @@ class YouPornIE(InfoExtractor):
}]
def _real_extract(self, url):
mobj = self._match_valid_url(url)
video_id = mobj.group('id')
display_id = mobj.group('display_id') or video_id
video_id, display_id = self._match_valid_url(url).group('id', 'display_id')
definitions = self._download_json(
'https://www.youporn.com/api/video/media_definitions/%s/' % video_id,
display_id)
f'https://www.youporn.com/api/video/media_definitions/{video_id}/', display_id or video_id)
def get_format_data(data, f):
return traverse_obj(data, lambda _, v: v['format'] == f and url_or_none(v['videoUrl']))
formats = []
for definition in definitions:
if not isinstance(definition, dict):
continue
video_url = url_or_none(definition.get('videoUrl'))
if not video_url:
continue
f = {
'url': video_url,
'filesize': int_or_none(definition.get('videoSize')),
}
# Try to extract only the actual master m3u8 first, avoiding the duplicate single resolution "master" m3u8s
for hls_url in traverse_obj(get_format_data(definitions, 'hls'), (
lambda _, v: not isinstance(v['defaultQuality'], bool), 'videoUrl'), (..., 'videoUrl')):
formats.extend(self._extract_m3u8_formats(hls_url, video_id, 'mp4', fatal=False, m3u8_id='hls'))
for definition in get_format_data(definitions, 'mp4'):
f = traverse_obj(definition, {
'url': 'videoUrl',
'filesize': ('videoSize', {int_or_none})
})
height = int_or_none(definition.get('quality'))
# Video URL's path looks like this:
# /201012/17/505835/720p_1500k_505835/YouPorn%20-%20Sex%20Ed%20Is%20It%20Safe%20To%20Masturbate%20Daily.mp4
# /201012/17/505835/vl_240p_240k_505835/YouPorn%20-%20Sex%20Ed%20Is%20It%20Safe%20To%20Masturbate%20Daily.mp4
# /videos/201703/11/109285532/1080P_4000K_109285532.mp4
# We will benefit from it by extracting some metadata
mobj = re.search(r'(?P<height>\d{3,4})[pP]_(?P<bitrate>\d+)[kK]_\d+', video_url)
mobj = re.search(r'(?P<height>\d{3,4})[pP]_(?P<bitrate>\d+)[kK]_\d+', definition['videoUrl'])
if mobj:
if not height:
height = int(mobj.group('height'))
@ -179,6 +179,7 @@ def extract_tag_box(regex, title):
'tags')
data = self._search_json_ld(webpage, video_id, expected_type='VideoObject', fatal=False)
data.pop('url', None)
return merge_dicts(data, {
'id': video_id,
'display_id': display_id,

File diff suppressed because it is too large Load diff

View file

@ -5,6 +5,7 @@
str_or_none,
js_to_json,
parse_filesize,
traverse_obj,
urlencode_postdata,
urljoin,
)
@ -12,8 +13,8 @@
class ZoomIE(InfoExtractor):
IE_NAME = 'zoom'
_VALID_URL = r'(?P<base_url>https?://(?:[^.]+\.)?zoom.us/)rec(?:ording)?/(?:play|share)/(?P<id>[A-Za-z0-9_.-]+)'
_TEST = {
_VALID_URL = r'(?P<base_url>https?://(?:[^.]+\.)?zoom.us/)rec(?:ording)?/(?P<type>play|share)/(?P<id>[A-Za-z0-9_.-]+)'
_TESTS = [{
'url': 'https://economist.zoom.us/rec/play/dUk_CNBETmZ5VA2BwEl-jjakPpJ3M1pcfVYAPRsoIbEByGsLjUZtaa4yCATQuOL3der8BlTwxQePl_j0.EImBkXzTIaPvdZO5',
'md5': 'ab445e8c911fddc4f9adc842c2c5d434',
'info_dict': {
@ -22,17 +23,37 @@ class ZoomIE(InfoExtractor):
'title': 'China\'s "two sessions" and the new five-year plan',
},
'skip': 'Recording requires email authentication to access',
}
}, {
# play URL
'url': 'https://ffgolf.zoom.us/rec/play/qhEhXbrxq1Zoucx8CMtHzq1Z_2YZRPVCqWK_K-2FkEGRsSLDeOX8Tu4P6jtjZcRry8QhIbvKZdtr4UNo.QcPn2debFskI9whJ',
'md5': '2c4b1c4e5213ebf9db293e88d9385bee',
'info_dict': {
'id': 'qhEhXbrxq1Zoucx8CMtHzq1Z_2YZRPVCqWK_K-2FkEGRsSLDeOX8Tu4P6jtjZcRry8QhIbvKZdtr4UNo.QcPn2debFskI9whJ',
'ext': 'mp4',
'title': 'Prépa AF2023 - Séance 5 du 11 avril - R20/VM/GO',
},
}, {
# share URL
'url': 'https://us02web.zoom.us/rec/share/hkUk5Zxcga0nkyNGhVCRfzkA2gX_mzgS3LpTxEEWJz9Y_QpIQ4mZFOUx7KZRZDQA.9LGQBdqmDAYgiZ_8',
'md5': '90fdc7cfcaee5d52d1c817fc03c43c9b',
'info_dict': {
'id': 'hkUk5Zxcga0nkyNGhVCRfzkA2gX_mzgS3LpTxEEWJz9Y_QpIQ4mZFOUx7KZRZDQA.9LGQBdqmDAYgiZ_8',
'ext': 'mp4',
'title': 'Timea Andrea Lelik\'s Personal Meeting Room',
},
}]
def _real_extract(self, url):
base_url, play_id = self._match_valid_url(url).groups()
webpage = self._download_webpage(url, play_id)
def _get_page_data(self, webpage, video_id):
return self._search_json(
r'window\.__data__\s*=', webpage, 'data', video_id, transform_source=js_to_json)
def _get_real_webpage(self, url, base_url, video_id, url_type):
webpage = self._download_webpage(url, video_id, note=f'Downloading {url_type} webpage')
try:
form = self._form_hidden_inputs('password_form', webpage)
except ExtractorError:
form = None
if form:
return webpage
password = self.get_param('videopassword')
if not password:
raise ExtractorError(
@ -40,18 +61,35 @@ def _real_extract(self, url):
is_meeting = form.get('useWhichPasswd') == 'meeting'
validation = self._download_json(
base_url + 'rec/validate%s_passwd' % ('_meet' if is_meeting else ''),
play_id, 'Validating passcode', 'Wrong passcode', data=urlencode_postdata({
video_id, 'Validating passcode', 'Wrong passcode', data=urlencode_postdata({
'id': form[('meet' if is_meeting else 'file') + 'Id'],
'passwd': password,
'action': form.get('action'),
}))
if not validation.get('status'):
raise ExtractorError(validation['errorMessage'], expected=True)
webpage = self._download_webpage(url, play_id)
return self._download_webpage(url, video_id, note=f'Re-downloading {url_type} webpage')
data = self._parse_json(self._search_regex(
r'(?s)window\.__data__\s*=\s*({.+?});',
webpage, 'data'), play_id, js_to_json)
def _real_extract(self, url):
base_url, url_type, video_id = self._match_valid_url(url).group('base_url', 'type', 'id')
if url_type == 'share':
webpage = self._get_real_webpage(url, base_url, video_id, 'share')
meeting_id = self._get_page_data(webpage, video_id)['meetingId']
redirect_path = self._download_json(
f'{base_url}nws/recording/1.0/play/share-info/{meeting_id}',
video_id, note='Downloading share info JSON')['result']['redirectUrl']
url = urljoin(base_url, redirect_path)
webpage = self._get_real_webpage(url, base_url, video_id, 'play')
file_id = self._get_page_data(webpage, video_id)['fileId']
if not file_id:
# When things go wrong, file_id can be empty string
raise ExtractorError('Unable to extract file ID')
data = self._download_json(
f'{base_url}nws/recording/1.0/play/info/{file_id}', video_id,
note='Downloading play info JSON')['result']
subtitles = {}
for _type in ('transcript', 'cc', 'chapter'):
@ -67,11 +105,11 @@ def _real_extract(self, url):
formats.append({
'format_note': 'Camera stream',
'url': str_or_none(data.get('viewMp4Url')),
'width': int_or_none(data.get('viewResolvtionsWidth')),
'height': int_or_none(data.get('viewResolvtionsHeight')),
'format_id': str_or_none(data.get('recordingId')),
'width': int_or_none(traverse_obj(data, ('viewResolvtions', 0))),
'height': int_or_none(traverse_obj(data, ('viewResolvtions', 1))),
'format_id': str_or_none(traverse_obj(data, ('recording', 'id'))),
'ext': 'mp4',
'filesize_approx': parse_filesize(data.get('fileSize')),
'filesize_approx': parse_filesize(str_or_none(traverse_obj(data, ('recording', 'fileSizeInMB')))),
'preference': 0
})
@ -79,16 +117,16 @@ def _real_extract(self, url):
formats.append({
'format_note': 'Screen share stream',
'url': str_or_none(data.get('shareMp4Url')),
'width': int_or_none(data.get('shareResolvtionsWidth')),
'height': int_or_none(data.get('shareResolvtionsHeight')),
'format_id': str_or_none(data.get('shareVideoId')),
'width': int_or_none(traverse_obj(data, ('shareResolvtions', 0))),
'height': int_or_none(traverse_obj(data, ('shareResolvtions', 1))),
'format_id': str_or_none(traverse_obj(data, ('shareVideo', 'id'))),
'ext': 'mp4',
'preference': -1
})
return {
'id': play_id,
'title': data.get('topic'),
'id': video_id,
'title': str_or_none(traverse_obj(data, ('meet', 'topic'))),
'subtitles': subtitles,
'formats': formats,
'http_headers': {

View file

@ -243,7 +243,7 @@ def _separate(expr, delim=',', max_split=None):
return
counters = {k: 0 for k in _MATCHING_PARENS.values()}
start, splits, pos, delim_len = 0, 0, 0, len(delim) - 1
in_quote, escaping, after_op, in_regex_char_group = None, False, True, False
in_quote, escaping, after_op, in_regex_char_group, in_unary_op = None, False, True, False, False
for idx, char in enumerate(expr):
if not in_quote and char in _MATCHING_PARENS:
counters[_MATCHING_PARENS[char]] += 1
@ -258,9 +258,11 @@ def _separate(expr, delim=',', max_split=None):
elif in_quote == '/' and char in '[]':
in_regex_char_group = char == '['
escaping = not escaping and in_quote and char == '\\'
after_op = not in_quote and char in OP_CHARS or (char.isspace() and after_op)
in_unary_op = (not in_quote and not in_regex_char_group
and after_op not in (True, False) and char in '-+')
after_op = char if (not in_quote and char in OP_CHARS) else (char.isspace() and after_op)
if char != delim[pos] or any(counters.values()) or in_quote:
if char != delim[pos] or any(counters.values()) or in_quote or in_unary_op:
pos = 0
continue
elif pos != delim_len:

View file

@ -243,7 +243,7 @@ def _dict_from_options_callback(
if multiple_keys:
allowed_keys = fr'({allowed_keys})(,({allowed_keys}))*'
mobj = re.match(
fr'(?i)(?P<keys>{allowed_keys}){delimiter}(?P<val>.*)$',
fr'(?is)(?P<keys>{allowed_keys}){delimiter}(?P<val>.*)$',
value[0] if multiple_args else value)
if mobj is not None:
keys, val = mobj.group('keys').split(','), mobj.group('val')
@ -526,22 +526,27 @@ def _alias_callback(option, opt_str, value, parser, opts, nargs):
'--cn-verification-proxy',
dest='cn_verification_proxy', default=None, metavar='URL',
help=optparse.SUPPRESS_HELP)
geo.add_option(
'--xff', metavar='VALUE',
dest='geo_bypass', default="default",
help=(
'How to fake X-Forwarded-For HTTP header to try bypassing geographic restriction. '
'One of "default" (Only when known to be useful), "never", '
'a two-letter ISO 3166-2 country code, or an IP block in CIDR notation'))
geo.add_option(
'--geo-bypass',
action='store_true', dest='geo_bypass', default=True,
help='Bypass geographic restriction via faking X-Forwarded-For HTTP header (default)')
action='store_const', dest='geo_bypass', const='default',
help=optparse.SUPPRESS_HELP)
geo.add_option(
'--no-geo-bypass',
action='store_false', dest='geo_bypass',
help='Do not bypass geographic restriction via faking X-Forwarded-For HTTP header')
action='store_const', dest='geo_bypass', const='never',
help=optparse.SUPPRESS_HELP)
geo.add_option(
'--geo-bypass-country', metavar='CODE',
dest='geo_bypass_country', default=None,
help='Force bypass geographic restriction with explicitly provided two-letter ISO 3166-2 country code')
'--geo-bypass-country', metavar='CODE', dest='geo_bypass',
help=optparse.SUPPRESS_HELP)
geo.add_option(
'--geo-bypass-ip-block', metavar='IP_BLOCK',
dest='geo_bypass_ip_block', default=None,
help='Force bypass geographic restriction with explicitly provided IP block in CIDR notation')
'--geo-bypass-ip-block', metavar='IP_BLOCK', dest='geo_bypass',
help=optparse.SUPPRESS_HELP)
selection = optparse.OptionGroup(parser, 'Video Selection')
selection.add_option(
@ -1086,8 +1091,12 @@ def _alias_callback(option, opt_str, value, parser, opts, nargs):
verbosity = optparse.OptionGroup(parser, 'Verbosity and Simulation Options')
verbosity.add_option(
'-q', '--quiet',
action='store_true', dest='quiet', default=False,
action='store_true', dest='quiet', default=None,
help='Activate quiet mode. If used with --verbose, print the log to stderr')
verbosity.add_option(
'--no-quiet',
action='store_false', dest='quiet',
help='Deactivate quiet mode. (Default)')
verbosity.add_option(
'--no-warnings',
dest='no_warnings', action='store_true', default=False,

View file

@ -107,7 +107,7 @@ def run(self, info):
options.extend(['-map', '-0:%d' % old_stream])
new_stream -= 1
options.extend([
'-attach', thumbnail_filename,
'-attach', self._ffmpeg_filename_argument(thumbnail_filename),
'-metadata:s:%d' % new_stream, 'mimetype=%s' % mimetype,
'-metadata:s:%d' % new_stream, 'filename=cover.%s' % thumbnail_ext])

View file

@ -809,7 +809,7 @@ def _get_infojson_opts(self, info, infofn):
new_stream -= 1
yield (
'-attach', infofn,
'-attach', self._ffmpeg_filename_argument(infofn),
f'-metadata:s:{new_stream}', 'mimetype=application/json',
f'-metadata:s:{new_stream}', 'filename=info.json',
)
@ -898,8 +898,11 @@ def _needs_fixup(self, info):
@PostProcessor._restrict_to(images=False)
def run(self, info):
if all(self._needs_fixup(info)):
args = ['-f', 'mp4']
if self.get_audio_codec(info['filepath']) == 'aac':
args.extend(['-bsf:a', 'aac_adtstoasc'])
self._fixup('Fixing MPEG-TS in MP4 container', info['filepath'], [
*self.stream_copy_opts(), '-f', 'mp4', '-bsf:a', 'aac_adtstoasc'])
*self.stream_copy_opts(), *args])
return [], info

View file

@ -2187,10 +2187,11 @@ def _lock_file(f, exclusive, block):
fcntl.lockf(f, flags)
def _unlock_file(f):
try:
fcntl.flock(f, fcntl.LOCK_UN)
except OSError:
fcntl.lockf(f, fcntl.LOCK_UN)
with contextlib.suppress(OSError):
return fcntl.flock(f, fcntl.LOCK_UN)
with contextlib.suppress(OSError):
return fcntl.lockf(f, fcntl.LOCK_UN) # AOSP does not have flock()
return fcntl.flock(f, fcntl.LOCK_UN | fcntl.LOCK_NB) # virtiofs needs LOCK_NB on unlocking
except ImportError:
@ -3278,8 +3279,14 @@ def multipart_encode(data, boundary=None):
return out, content_type
def variadic(x, allowed_types=(str, bytes, dict)):
return x if isinstance(x, collections.abc.Iterable) and not isinstance(x, allowed_types) else (x,)
def is_iterable_like(x, allowed_types=collections.abc.Iterable, blocked_types=NO_DEFAULT):
if blocked_types is NO_DEFAULT:
blocked_types = (str, bytes, collections.abc.Mapping)
return isinstance(x, allowed_types) and not isinstance(x, blocked_types)
def variadic(x, allowed_types=NO_DEFAULT):
return x if is_iterable_like(x, blocked_types=allowed_types) else (x,)
def dict_get(d, key_or_keys, default=None, skip_false_values=True):
@ -3371,7 +3378,7 @@ def strip_jsonp(code):
def js_to_json(code, vars={}, *, strict=False):
# vars is a dict of var, val pairs to substitute
STRING_QUOTES = '\'"'
STRING_QUOTES = '\'"`'
STRING_RE = '|'.join(rf'{q}(?:\\.|[^\\{q}])*{q}' for q in STRING_QUOTES)
COMMENT_RE = r'/\*(?:(?!\*/).)*?\*/|//[^\n]*\n'
SKIP_RE = fr'\s*(?:{COMMENT_RE})?\s*'
@ -3389,6 +3396,12 @@ def process_escape(match):
else '' if escape == '\n'
else escape)
def template_substitute(match):
evaluated = js_to_json(match.group(1), vars, strict=strict)
if evaluated[0] == '"':
return json.loads(evaluated)
return evaluated
def fix_kv(m):
v = m.group(0)
if v in ('true', 'false', 'null'):
@ -3399,7 +3412,8 @@ def fix_kv(m):
return ''
if v[0] in STRING_QUOTES:
escaped = re.sub(r'(?s)(")|\\(.)', process_escape, v[1:-1])
v = re.sub(r'(?s)\${([^}]+)}', template_substitute, v[1:-1]) if v[0] == '`' else v[1:-1]
escaped = re.sub(r'(?s)(")|\\(.)', process_escape, v)
return f'"{escaped}"'
for regex, base in INTEGER_TABLE:
@ -4091,6 +4105,10 @@ def data(self, data):
def close(self):
return self._out.strip()
# Fix UTF-8 encoded file wrongly marked as UTF-16. See https://github.com/yt-dlp/yt-dlp/issues/6543#issuecomment-1477169870
# This will not trigger false positives since only UTF-8 text is being replaced
dfxp_data = dfxp_data.replace(b'encoding=\'UTF-16\'', b'encoding=\'UTF-8\'')
def parse_node(node):
target = TTMLPElementParser()
parser = xml.etree.ElementTree.XMLParser(target=target)
@ -5461,7 +5479,7 @@ def traverse_obj(
obj, *paths, default=NO_DEFAULT, expected_type=None, get_all=True,
casesense=True, is_user_input=False, traverse_string=False):
"""
Safely traverse nested `dict`s and `Sequence`s
Safely traverse nested `dict`s and `Iterable`s
>>> obj = [{}, {"key": "value"}]
>>> traverse_obj(obj, (1, "key"))
@ -5469,7 +5487,7 @@ def traverse_obj(
Each of the provided `paths` is tested and the first producing a valid result will be returned.
The next path will also be tested if the path branched but no results could be found.
Supported values for traversal are `Mapping`, `Sequence` and `re.Match`.
Supported values for traversal are `Mapping`, `Iterable` and `re.Match`.
Unhelpful values (`{}`, `None`) are treated as the absence of a value and discarded.
The paths will be wrapped in `variadic`, so that `'key'` is conveniently the same as `('key', )`.
@ -5486,7 +5504,7 @@ def traverse_obj(
Read as: `[traverse_obj(obj, branch) for branch in branches]`.
- `function`: Branch out and return values filtered by the function.
Read as: `[value for key, value in obj if function(key, value)]`.
For `Sequence`s, `key` is the index of the value.
For `Iterable`s, `key` is the index of the value.
For `re.Match`es, `key` is the group number (0 = full match)
as well as additionally any group names, if given.
- `dict` Transform the current object and return a matching dict.
@ -5522,7 +5540,6 @@ def traverse_obj(
If no `default` is given and the last path branches, a `list` of results
is always returned. If a path ends on a `dict` that result will always be a `dict`.
"""
is_sequence = lambda x: isinstance(x, collections.abc.Sequence) and not isinstance(x, (str, bytes))
casefold = lambda k: k.casefold() if isinstance(k, str) else k
if isinstance(expected_type, type):
@ -5535,7 +5552,9 @@ def apply_key(key, obj, is_last):
result = None
if obj is None and traverse_string:
pass
if key is ... or callable(key) or isinstance(key, slice):
branching = True
result = ()
elif key is None:
result = obj
@ -5558,7 +5577,7 @@ def apply_key(key, obj, is_last):
branching = True
if isinstance(obj, collections.abc.Mapping):
result = obj.values()
elif is_sequence(obj):
elif is_iterable_like(obj):
result = obj
elif isinstance(obj, re.Match):
result = obj.groups()
@ -5572,7 +5591,7 @@ def apply_key(key, obj, is_last):
branching = True
if isinstance(obj, collections.abc.Mapping):
iter_obj = obj.items()
elif is_sequence(obj):
elif is_iterable_like(obj):
iter_obj = enumerate(obj)
elif isinstance(obj, re.Match):
iter_obj = itertools.chain(
@ -5596,7 +5615,7 @@ def apply_key(key, obj, is_last):
} or None
elif isinstance(obj, collections.abc.Mapping):
result = (obj.get(key) if casesense or (key in obj) else
result = (try_call(obj.get, args=(key,)) if casesense or try_call(obj.__contains__, args=(key,)) else
next((v for k, v in obj.items() if casefold(k) == key), None))
elif isinstance(obj, re.Match):
@ -5608,7 +5627,7 @@ def apply_key(key, obj, is_last):
result = next((v for k, v in obj.groupdict().items() if casefold(k) == key), None)
elif isinstance(key, (int, slice)):
if is_sequence(obj):
if is_iterable_like(obj, collections.abc.Sequence):
branching = isinstance(key, slice)
with contextlib.suppress(IndexError):
result = obj[key]