From 03e4ca498a33db0f5a3287eab6df4b048a0143ff Mon Sep 17 00:00:00 2001 From: Jesse Millwood Date: Sat, 6 May 2023 06:52:13 -0400 Subject: [PATCH] [extractor/fosdem] Move parsing logic --- yt_dlp/extractor/fosdem.py | 51 ++++++++++++++++++-------------------- 1 file changed, 24 insertions(+), 27 deletions(-) diff --git a/yt_dlp/extractor/fosdem.py b/yt_dlp/extractor/fosdem.py index 6c710ddcec..3d166af0eb 100644 --- a/yt_dlp/extractor/fosdem.py +++ b/yt_dlp/extractor/fosdem.py @@ -44,8 +44,6 @@ def _real_extract(self, url): video_id = self._match_id(url) groups = self._match_valid_url(url).groupdict() webpage = self._download_webpage(url, video_id) - if groups['url_type'] == 'event': - print("This is an event url") elif groups['url_type'] == 'track': print("This is a track") # Download all videos on this page @@ -54,28 +52,27 @@ def _real_extract(self, url): year = groups['year'] title_rgx = r"
\n\s+

(.+?)

" title = self._html_search_regex(title_rgx, webpage, 'title') - print(f'TITLE: {title}') - evnt_blurb_rgx = r"
\n*(?P(
(

(.+?)

\n*)+
)+\n*(
(

(.+?)

\n*)*
))+\n*
" - evnt_blurb = self._html_search_regex(evnt_blurb_rgx, - webpage, - 'event blurb', - group='blurb', flags=re.DOTALL) - description = evnt_blurb - print(f"DESCRIPTION: {description}") - video_url_rgx = r"
  • " - video_url = self._html_search_regex(video_url_rgx, - webpage, - 'video url') - print(f"VIDEO URL: {video_url}") - print('\n\n___________________________') - return { - 'id': video_id, - 'title': title, - 'description': description, - 'uploader': 'FOSDEM', - 'url': video_url, - 'thumbnail': None, - # TODO more properties (see yt_dlp/extractor/common.py) - 'release_date': year, - # 'presenter/author - } + if groups['url_type'] == 'event': + evnt_blurb_rgx = r"
    \n*(?P(
    (

    (.+?)

    \n*)+
    )+\n*(
    (

    (.+?)

    \n*)*
    ))+\n*
    " + evnt_blurb = self._html_search_regex(evnt_blurb_rgx, + webpage, + 'event blurb', + group='blurb', flags=re.DOTALL) + description = evnt_blurb + video_url_rgx = r"
  • " + video_url = self._html_search_regex(video_url_rgx, + webpage, + 'video url') + cast_rgx = r"(?P\w+ \w+)" + cast = re.findall(cast_rgx, webpage, flags=re.UNICODE) + return { + 'id': video_id, + 'title': title, + 'description': description, + 'uploader': 'FOSDEM', + 'url': video_url, + 'thumbnail': None, + 'release_date': year, + 'cast': cast, + 'webpage_url': url, + }