From ebab01bb7300f1bd45ddb1b40d1ddafeaa16570f Mon Sep 17 00:00:00 2001 From: NightMachinery Date: Tue, 6 Dec 2022 06:58:49 +0330 Subject: [PATCH 1/4] [underline] Add extractor (draft) --- yt_dlp/extractor/_extractors.py | 1 + yt_dlp/extractor/underline.py | 103 ++++++++++++++++++++++++++++++++ 2 files changed, 104 insertions(+) create mode 100644 yt_dlp/extractor/underline.py diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index 2fe15f6d2..96040b58c 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -2027,6 +2027,7 @@ from .dlive import ( ) from .drooble import DroobleIE from .umg import UMGDeIE +from .underline import UnderlineIE from .unistra import UnistraIE from .unity import UnityIE from .unscripted import UnscriptedNewsVideoIE diff --git a/yt_dlp/extractor/underline.py b/yt_dlp/extractor/underline.py new file mode 100644 index 000000000..6d3d1588b --- /dev/null +++ b/yt_dlp/extractor/underline.py @@ -0,0 +1,103 @@ +from .common import InfoExtractor + +DEBUG_P = False +if DEBUG_P: + import json + from icecream import ic + from IPython import embed + + +def gen_dict_extract(var, key): + if hasattr(var, "items"): + for k, v in var.items(): + if k == key: + yield v + if isinstance(v, dict): + for result in gen_dict_extract(v, key): + yield result + elif isinstance(v, list): + for d in v: + for result in gen_dict_extract(d, key): + yield result + + +class UnderlineIE(InfoExtractor): + _VALID_URL = r"https?://(?:www\.)?underline\.io/events/(?P[^?]+).*" + + _TESTS = [ + { + "params": { + "skip_download": True, + }, + "url": "https://underline.io/events/342/posters/12863/poster/66463-mbti-personality-prediction-approach-on-persian-twitter?tab=video", + "md5": "md5:eaa894161adaef6efd6008681e1cd2c5", + # md5 sum of the first 10241 bytes of the video file (use --test) + "info_dict": { + "id": "342/posters/12863/poster/66463-mbti-personality-prediction-approach-on-persian-twitter", + "ext": "mp4", + "title": "MBTI Personality Prediction Approach on Persian Twitter", + # * A value + # * MD5 checksum; start the string with md5: + # * A regular expression; start the string with re: + # * Any Python type, e.g. int or float + }, + } + ] + + def _real_extract(self, url): + # cookies = self._get_cookies(url) + # if DEBUG_P: + # ic(cookies) + + # if not cookies: + # self.raise_login_required('Cookies are needed to download from this website', method='cookies') + + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + webpage_info = self._search_json( + r'', + webpage, + "idk_what_this_arg_does", + video_id, + end_pattern=r"", + ) + + if DEBUG_P: + # ic(webpage_info) + with open("./tmp.json", "w") as f: + json.dump(webpage_info, f) + + # ic(webpage_info["props"]["pageProps"]["snapshot"]["models"][10]["title"]) + # embed() + + title = list(gen_dict_extract(webpage_info, "title")) + if DEBUG_P: + ic(title) + + if len(title) == 0: + title = None + else: + title = title[0] + + playlist_urls = list(gen_dict_extract(webpage_info, "playlist")) + if DEBUG_P: + ic(playlist_urls) + + if len(playlist_urls) == 0: + url = None + else: + url = playlist_urls[0] + + formats = [] + + m3u8_url = url + if m3u8_url: + formats.extend(self._extract_m3u8_formats( + m3u8_url, video_id, ext='mp4', entry_protocol='m3u8_native')) + + return { + "id": video_id, + "title": title, + "formats": formats, + } From 2cb546cf8e62e009a230571be16d0c1994a149c7 Mon Sep 17 00:00:00 2001 From: NightMachinery Date: Tue, 6 Dec 2022 07:00:41 +0330 Subject: [PATCH 2/4] [underline] Add extractor --- yt_dlp/extractor/underline.py | 24 ++++++------------------ 1 file changed, 6 insertions(+), 18 deletions(-) diff --git a/yt_dlp/extractor/underline.py b/yt_dlp/extractor/underline.py index 6d3d1588b..9593aa069 100644 --- a/yt_dlp/extractor/underline.py +++ b/yt_dlp/extractor/underline.py @@ -27,7 +27,7 @@ class UnderlineIE(InfoExtractor): _TESTS = [ { "params": { - "skip_download": True, + "skip_download": True, # needs cookies }, "url": "https://underline.io/events/342/posters/12863/poster/66463-mbti-personality-prediction-approach-on-persian-twitter?tab=video", "md5": "md5:eaa894161adaef6efd6008681e1cd2c5", @@ -36,22 +36,11 @@ class UnderlineIE(InfoExtractor): "id": "342/posters/12863/poster/66463-mbti-personality-prediction-approach-on-persian-twitter", "ext": "mp4", "title": "MBTI Personality Prediction Approach on Persian Twitter", - # * A value - # * MD5 checksum; start the string with md5: - # * A regular expression; start the string with re: - # * Any Python type, e.g. int or float }, } ] def _real_extract(self, url): - # cookies = self._get_cookies(url) - # if DEBUG_P: - # ic(cookies) - - # if not cookies: - # self.raise_login_required('Cookies are needed to download from this website', method='cookies') - video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) @@ -64,13 +53,9 @@ class UnderlineIE(InfoExtractor): ) if DEBUG_P: - # ic(webpage_info) with open("./tmp.json", "w") as f: json.dump(webpage_info, f) - # ic(webpage_info["props"]["pageProps"]["snapshot"]["models"][10]["title"]) - # embed() - title = list(gen_dict_extract(webpage_info, "title")) if DEBUG_P: ic(title) @@ -93,8 +78,11 @@ class UnderlineIE(InfoExtractor): m3u8_url = url if m3u8_url: - formats.extend(self._extract_m3u8_formats( - m3u8_url, video_id, ext='mp4', entry_protocol='m3u8_native')) + formats.extend( + self._extract_m3u8_formats( + m3u8_url, video_id, ext="mp4", entry_protocol="m3u8_native" + ) + ) return { "id": video_id, From 93a079d865112e31675ed165985549027737f26a Mon Sep 17 00:00:00 2001 From: NightMachinery Date: Tue, 6 Dec 2022 07:30:19 +0330 Subject: [PATCH 3/4] [underline] Added slide_info to info_dict --- yt_dlp/extractor/underline.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/yt_dlp/extractor/underline.py b/yt_dlp/extractor/underline.py index 9593aa069..475361787 100644 --- a/yt_dlp/extractor/underline.py +++ b/yt_dlp/extractor/underline.py @@ -36,6 +36,13 @@ class UnderlineIE(InfoExtractor): "id": "342/posters/12863/poster/66463-mbti-personality-prediction-approach-on-persian-twitter", "ext": "mp4", "title": "MBTI Personality Prediction Approach on Persian Twitter", + "slide_info": [ + { + "url": "https://assets.underline.io/lecture/66463/slideshow/b236b5cfb38966a761a5443bf47fbdf9.pdf", + "filename": "Personality-Prediction-WINLP-slides.pdf", + "size": 780319, + } + ], }, } ] @@ -84,8 +91,11 @@ class UnderlineIE(InfoExtractor): ) ) + slide_info = list(gen_dict_extract(webpage_info, "slideshow")) + return { "id": video_id, "title": title, "formats": formats, + "slide_info": slide_info, } From 4bc859b3368388d6ba2cff85f4202627587809a7 Mon Sep 17 00:00:00 2001 From: NightMachinery Date: Tue, 6 Dec 2022 14:06:20 +0330 Subject: [PATCH 4/4] [underline] Removed debugging code --- yt_dlp/extractor/underline.py | 23 +++++++---------------- 1 file changed, 7 insertions(+), 16 deletions(-) diff --git a/yt_dlp/extractor/underline.py b/yt_dlp/extractor/underline.py index 475361787..542c14cd4 100644 --- a/yt_dlp/extractor/underline.py +++ b/yt_dlp/extractor/underline.py @@ -1,11 +1,5 @@ from .common import InfoExtractor -DEBUG_P = False -if DEBUG_P: - import json - from icecream import ic - from IPython import embed - def gen_dict_extract(var, key): if hasattr(var, "items"): @@ -35,7 +29,9 @@ class UnderlineIE(InfoExtractor): "info_dict": { "id": "342/posters/12863/poster/66463-mbti-personality-prediction-approach-on-persian-twitter", "ext": "mp4", - "title": "MBTI Personality Prediction Approach on Persian Twitter", + "title": ( + "MBTI Personality Prediction Approach on Persian Twitter" + ), "slide_info": [ { "url": "https://assets.underline.io/lecture/66463/slideshow/b236b5cfb38966a761a5443bf47fbdf9.pdf", @@ -59,13 +55,7 @@ class UnderlineIE(InfoExtractor): end_pattern=r"", ) - if DEBUG_P: - with open("./tmp.json", "w") as f: - json.dump(webpage_info, f) - title = list(gen_dict_extract(webpage_info, "title")) - if DEBUG_P: - ic(title) if len(title) == 0: title = None @@ -73,8 +63,6 @@ class UnderlineIE(InfoExtractor): title = title[0] playlist_urls = list(gen_dict_extract(webpage_info, "playlist")) - if DEBUG_P: - ic(playlist_urls) if len(playlist_urls) == 0: url = None @@ -87,7 +75,10 @@ class UnderlineIE(InfoExtractor): if m3u8_url: formats.extend( self._extract_m3u8_formats( - m3u8_url, video_id, ext="mp4", entry_protocol="m3u8_native" + m3u8_url, + video_id, + ext="mp4", + entry_protocol="m3u8_native", ) )