From 04f3fd2c8948621612d852f8f68ef549a484bfb6 Mon Sep 17 00:00:00 2001 From: pukkandan Date: Mon, 4 Apr 2022 13:57:35 +0530 Subject: [PATCH] [cleanup] Use `_html_extract_title` --- CONTRIBUTING.md | 4 ++-- yt_dlp/extractor/adobeconnect.py | 2 +- yt_dlp/extractor/allocine.py | 6 ++---- yt_dlp/extractor/archiveorg.py | 3 +-- yt_dlp/extractor/asiancrush.py | 3 +-- yt_dlp/extractor/bbc.py | 5 ++--- yt_dlp/extractor/breitbart.py | 5 ++--- yt_dlp/extractor/callin.py | 2 +- yt_dlp/extractor/cbc.py | 6 +++--- yt_dlp/extractor/closertotruth.py | 3 +-- yt_dlp/extractor/common.py | 10 ++++------ yt_dlp/extractor/cspan.py | 2 +- yt_dlp/extractor/fivetv.py | 3 +-- yt_dlp/extractor/foxgay.py | 3 +-- yt_dlp/extractor/generic.py | 6 ++---- yt_dlp/extractor/glide.py | 4 +--- yt_dlp/extractor/hellporno.py | 3 +-- yt_dlp/extractor/huya.py | 3 +-- yt_dlp/extractor/imdb.py | 2 +- yt_dlp/extractor/infoq.py | 2 +- yt_dlp/extractor/iwara.py | 3 +-- yt_dlp/extractor/linkedin.py | 2 +- yt_dlp/extractor/miaopai.py | 3 +-- yt_dlp/extractor/mojvideo.py | 3 +-- yt_dlp/extractor/newgrounds.py | 6 ++---- yt_dlp/extractor/nhk.py | 4 +++- yt_dlp/extractor/playvid.py | 3 +-- yt_dlp/extractor/rule34video.py | 2 +- yt_dlp/extractor/senategov.py | 2 +- yt_dlp/extractor/sunporno.py | 3 +-- yt_dlp/extractor/thisav.py | 4 +--- yt_dlp/extractor/traileraddict.py | 3 +-- yt_dlp/extractor/varzesh3.py | 3 +-- yt_dlp/extractor/vshare.py | 3 +-- yt_dlp/extractor/vupload.py | 2 +- yt_dlp/extractor/weibo.py | 3 +-- yt_dlp/extractor/yahoo.py | 2 +- yt_dlp/extractor/youjizz.py | 3 +-- 38 files changed, 51 insertions(+), 80 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 1897f73e02..ea1893d15a 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -534,13 +534,13 @@ #### Example Correct: ```python -title = self._html_search_regex(r'([^<]+)', webpage, 'title') +title = self._html_search_regex(r'

<h1>([^<]+)</h1>

', webpage, 'title') ``` Incorrect: ```python -TITLE_RE = r'([^<]+)' +TITLE_RE = r'

<h1>([^<]+)</h1>

' # ...some lines of code... title = self._html_search_regex(TITLE_RE, webpage, 'title') ``` diff --git a/yt_dlp/extractor/adobeconnect.py b/yt_dlp/extractor/adobeconnect.py index e688dddcbb..e2e6f93f31 100644 --- a/yt_dlp/extractor/adobeconnect.py +++ b/yt_dlp/extractor/adobeconnect.py @@ -14,7 +14,7 @@ class AdobeConnectIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - title = self._html_search_regex(r'(.+?)', webpage, 'title') + title = self._html_extract_title(webpage) qs = compat_parse_qs(self._search_regex(r"swfUrl\s*=\s*'([^']+)'", webpage, 'swf url').split('?')[1]) is_live = qs.get('isLive', ['false'])[0] == 'true' formats = [] diff --git a/yt_dlp/extractor/allocine.py b/yt_dlp/extractor/allocine.py index cd533acfc7..403a277e97 100644 --- a/yt_dlp/extractor/allocine.py +++ b/yt_dlp/extractor/allocine.py @@ -7,6 +7,7 @@ int_or_none, qualities, remove_end, + strip_or_none, try_get, unified_timestamp, url_basename, @@ -102,10 +103,7 @@ def _real_extract(self, url): video_id = display_id media_data = self._download_json( 'http://www.allocine.fr/ws/AcVisiondataV5.ashx?media=%s' % video_id, display_id) - title = remove_end( - self._html_search_regex( - r'(?s)(.+?)', webpage, 'title').strip(), - ' - AlloCiné') + title = remove_end(strip_or_none(self._html_extract_title(webpage), ' - AlloCiné')) for key, value in media_data['video'].items(): if not key.endswith('Path'): continue diff --git a/yt_dlp/extractor/archiveorg.py b/yt_dlp/extractor/archiveorg.py index b06ac74aed..2ab3c1bebd 100644 --- a/yt_dlp/extractor/archiveorg.py +++ b/yt_dlp/extractor/archiveorg.py @@ -483,8 +483,7 @@ def _extract_yt_initial_variable(self, webpage, regex, video_id, name): regex), webpage, name, default='{}'), video_id, fatal=False) def _extract_webpage_title(self, webpage): - page_title = self._html_search_regex( - r'([^<]*)', webpage, 'title', default='') + page_title = self._html_extract_title(webpage, 
default='') # YouTube video pages appear to always have either 'YouTube -' as prefix or '- YouTube' as suffix. return self._html_search_regex( r'(?:YouTube\s*-\s*(.*)$)|(?:(.*)\s*-\s*YouTube$)', diff --git a/yt_dlp/extractor/asiancrush.py b/yt_dlp/extractor/asiancrush.py index 75a6329589..7f1940fcab 100644 --- a/yt_dlp/extractor/asiancrush.py +++ b/yt_dlp/extractor/asiancrush.py @@ -181,8 +181,7 @@ def _real_extract(self, url): 'title', default=None) or self._og_search_title( webpage, default=None) or self._html_search_meta( 'twitter:title', webpage, 'title', - default=None) or self._search_regex( - r'([^<]+)', webpage, 'title', fatal=False) + default=None) or self._html_extract_title(webpage) if title: title = re.sub(r'\s*\|\s*.+?$', '', title) diff --git a/yt_dlp/extractor/bbc.py b/yt_dlp/extractor/bbc.py index 8231557300..29ad7ded77 100644 --- a/yt_dlp/extractor/bbc.py +++ b/yt_dlp/extractor/bbc.py @@ -906,9 +906,8 @@ def _real_extract(self, url): playlist_title = json_ld_info.get('title') if not playlist_title: - playlist_title = self._og_search_title( - webpage, default=None) or self._html_search_regex( - r'(.+?)', webpage, 'playlist title', default=None) + playlist_title = (self._og_search_title(webpage, default=None) + or self._html_extract_title(webpage, 'playlist title', default=None)) if playlist_title: playlist_title = re.sub(r'(.+)\s*-\s*BBC.*?$', r'\1', playlist_title).strip() diff --git a/yt_dlp/extractor/breitbart.py b/yt_dlp/extractor/breitbart.py index f50f719dc2..e029aa627f 100644 --- a/yt_dlp/extractor/breitbart.py +++ b/yt_dlp/extractor/breitbart.py @@ -29,9 +29,8 @@ def _real_extract(self, url): self._sort_formats(formats) return { 'id': video_id, - 'title': self._og_search_title( - webpage, default=None) or self._html_search_regex( - r'(?s)(.*?)', webpage, 'video title'), + 'title': (self._og_search_title(webpage, default=None) + or self._html_extract_title(webpage, 'video title')), 'description': self._og_search_description(webpage), 
'thumbnail': self._og_search_thumbnail(webpage), 'age_limit': self._rta_search(webpage), diff --git a/yt_dlp/extractor/callin.py b/yt_dlp/extractor/callin.py index acf327ace6..1f3b7cfff9 100644 --- a/yt_dlp/extractor/callin.py +++ b/yt_dlp/extractor/callin.py @@ -54,7 +54,7 @@ def _real_extract(self, url): id = episode['id'] title = (episode.get('title') or self._og_search_title(webpage, fatal=False) - or self._html_search_regex('(.*?)', webpage, 'title')) + or self._html_extract_title(webpage)) url = episode['m3u8'] formats = self._extract_m3u8_formats(url, display_id, ext='ts') self._sort_formats(formats) diff --git a/yt_dlp/extractor/cbc.py b/yt_dlp/extractor/cbc.py index ac1272f7b5..fba8bf965f 100644 --- a/yt_dlp/extractor/cbc.py +++ b/yt_dlp/extractor/cbc.py @@ -127,9 +127,9 @@ def _extract_player_init(self, player_init, display_id): def _real_extract(self, url): display_id = self._match_id(url) webpage = self._download_webpage(url, display_id) - title = self._og_search_title(webpage, default=None) or self._html_search_meta( - 'twitter:title', webpage, 'title', default=None) or self._html_search_regex( - r'([^<]+)', webpage, 'title', fatal=False) + title = (self._og_search_title(webpage, default=None) + or self._html_search_meta('twitter:title', webpage, 'title', default=None) + or self._html_extract_title(webpage)) entries = [ self._extract_player_init(player_init, display_id) for player_init in re.findall(r'CBC\.APP\.Caffeine\.initInstance\(({.+?})\);', webpage)] diff --git a/yt_dlp/extractor/closertotruth.py b/yt_dlp/extractor/closertotruth.py index 26243d52d5..517e121e02 100644 --- a/yt_dlp/extractor/closertotruth.py +++ b/yt_dlp/extractor/closertotruth.py @@ -54,8 +54,7 @@ def _real_extract(self, url): r']+src=["\'].*?\b(?:partner_id|p)/(\d+)', webpage, 'kaltura partner_id') - title = self._search_regex( - r'(.+?)\s*\|\s*.+?', webpage, 'video title') + title = self._html_extract_title(webpage, 'video title') select = self._search_regex( 
r'(?s)]+id="select-version"[^>]*>(.+?)', diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py index af964c5278..81688eb547 100644 --- a/yt_dlp/extractor/common.py +++ b/yt_dlp/extractor/common.py @@ -1329,9 +1329,8 @@ def _og_search_thumbnail(self, html, **kargs): def _og_search_description(self, html, **kargs): return self._og_search_property('description', html, fatal=False, **kargs) - def _og_search_title(self, html, **kargs): - kargs.setdefault('fatal', False) - return self._og_search_property('title', html, **kargs) + def _og_search_title(self, html, *, fatal=False, **kargs): + return self._og_search_property('title', html, fatal=fatal, **kargs) def _og_search_video_url(self, html, name='video url', secure=True, **kargs): regexes = self._og_regexes('video') + self._og_regexes('video:url') @@ -1342,9 +1341,8 @@ def _og_search_video_url(self, html, name='video url', secure=True, **kargs): def _og_search_url(self, html, **kargs): return self._og_search_property('url', html, **kargs) - def _html_extract_title(self, html, name, **kwargs): - return self._html_search_regex( - r'(?s)(.*?)', html, name, **kwargs) + def _html_extract_title(self, html, name='title', *, fatal=False, **kwargs): + return self._html_search_regex(r'(?s)([^<]+)', html, name, fatal=fatal, **kwargs) def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs): name = variadic(name) diff --git a/yt_dlp/extractor/cspan.py b/yt_dlp/extractor/cspan.py index d29b58ba6f..f51159bbe6 100644 --- a/yt_dlp/extractor/cspan.py +++ b/yt_dlp/extractor/cspan.py @@ -278,7 +278,7 @@ def _real_extract(self, url): video_id, transform_source=js_to_json) title = (self._og_search_title(webpage, default=None) - or self._html_search_regex(r'(?s)(.*?)', webpage, 'video title')) + or self._html_extract_title(webpage, 'video title')) description = (self._og_search_description(webpage, default=None) or self._html_search_meta('description', webpage, 'description', default=None)) diff 
--git a/yt_dlp/extractor/fivetv.py b/yt_dlp/extractor/fivetv.py index be81fccb8f..d6bebd19bd 100644 --- a/yt_dlp/extractor/fivetv.py +++ b/yt_dlp/extractor/fivetv.py @@ -75,8 +75,7 @@ def _real_extract(self, url): r']+?href="([^"]+)"[^>]+?class="videoplayer"'], webpage, 'video url') - title = self._og_search_title(webpage, default=None) or self._search_regex( - r'([^<]+)', webpage, 'title') + title = self._og_search_title(webpage, default=None) or self._html_extract_title(webpage) duration = int_or_none(self._og_search_property( 'video:duration', webpage, 'duration', default=None)) diff --git a/yt_dlp/extractor/foxgay.py b/yt_dlp/extractor/foxgay.py index 512a106455..1c53e0642b 100644 --- a/yt_dlp/extractor/foxgay.py +++ b/yt_dlp/extractor/foxgay.py @@ -29,8 +29,7 @@ def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - title = remove_end(self._html_search_regex( - r'([^<]+)', webpage, 'title'), ' - Foxgay.com') + title = remove_end(self._html_extract_title(webpage), ' - Foxgay.com') description = get_element_by_id('inf_tit', webpage) # The default user-agent with foxgay cookies leads to pages without videos diff --git a/yt_dlp/extractor/generic.py b/yt_dlp/extractor/generic.py index 4a2e301580..65e803dd70 100644 --- a/yt_dlp/extractor/generic.py +++ b/yt_dlp/extractor/generic.py @@ -2873,10 +2873,8 @@ def _real_extract(self, url): # Site Name | Video Title # Video Title - Tagline | Site Name # and so on and so forth; it's just not practical - video_title = self._og_search_title( - webpage, default=None) or self._html_search_regex( - r'(?s)(.*?)', webpage, 'video title', - default='video') + video_title = (self._og_search_title(webpage, default=None) + or self._html_extract_title(webpage, 'video title', default='video')) # Try to detect age limit automatically age_limit = self._rta_search(webpage) diff --git a/yt_dlp/extractor/glide.py b/yt_dlp/extractor/glide.py index d94dfbf093..12af859be3 100644 --- 
a/yt_dlp/extractor/glide.py +++ b/yt_dlp/extractor/glide.py @@ -23,9 +23,7 @@ def _real_extract(self, url): webpage = self._download_webpage(url, video_id) - title = self._html_search_regex( - r'(.+?)', webpage, - 'title', default=None) or self._og_search_title(webpage) + title = self._html_extract_title(webpage, default=None) or self._og_search_title(webpage) video_url = self._proto_relative_url(self._search_regex( r']+src=(["\'])(?P.+?)\1', webpage, 'video URL', default=None, diff --git a/yt_dlp/extractor/hellporno.py b/yt_dlp/extractor/hellporno.py index fae4251034..92d32cdcc8 100644 --- a/yt_dlp/extractor/hellporno.py +++ b/yt_dlp/extractor/hellporno.py @@ -38,8 +38,7 @@ def _real_extract(self, url): webpage = self._download_webpage(url, display_id) - title = remove_end(self._html_search_regex( - r'([^<]+)', webpage, 'title'), ' - Hell Porno') + title = remove_end(self._html_extract_title(webpage), ' - Hell Porno') info = self._parse_html5_media_entries(url, webpage, display_id)[0] self._sort_formats(info['formats']) diff --git a/yt_dlp/extractor/huya.py b/yt_dlp/extractor/huya.py index b814396820..4e96f22faa 100644 --- a/yt_dlp/extractor/huya.py +++ b/yt_dlp/extractor/huya.py @@ -66,8 +66,7 @@ def _real_extract(self, url): room_info = try_get(stream_data, lambda x: x['data'][0]['gameLiveInfo']) if not room_info: raise ExtractorError('Can not extract the room info', expected=True) - title = room_info.get('roomName') or room_info.get('introduction') or self._html_search_regex( - r'([^<]+)', webpage, 'title') + title = room_info.get('roomName') or room_info.get('introduction') or self._html_extract_title(webpage) screen_type = room_info.get('screenType') live_source_type = room_info.get('liveSourceType') stream_info_list = stream_data['data'][0]['gameStreamInfoList'] diff --git a/yt_dlp/extractor/imdb.py b/yt_dlp/extractor/imdb.py index 7eb66d8216..96cee2e2fc 100644 --- a/yt_dlp/extractor/imdb.py +++ b/yt_dlp/extractor/imdb.py @@ -68,7 +68,7 @@ def 
_real_extract(self, url): video_info = traverse_obj(info, ('props', 'pageProps', 'videoPlaybackData', 'video'), default={}) title = (traverse_obj(video_info, ('name', 'value'), ('primaryTitle', 'titleText', 'text')) or self._html_search_meta(('og:title', 'twitter:title'), webpage, default=None) - or self._html_search_regex(r'(.+?)', webpage, 'title')) + or self._html_extract_title(webpage)) data = video_info.get('playbackURLs') or try_get(self._download_json( 'https://www.imdb.com/ve/data/VIDEO_PLAYBACK_DATA', video_id, query={ diff --git a/yt_dlp/extractor/infoq.py b/yt_dlp/extractor/infoq.py index 0a70a1fb44..347cc51544 100644 --- a/yt_dlp/extractor/infoq.py +++ b/yt_dlp/extractor/infoq.py @@ -115,7 +115,7 @@ def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - video_title = self._html_search_regex(r'(.*?)', webpage, 'title') + video_title = self._html_extract_title(webpage) video_description = self._html_search_meta('description', webpage, 'description') if '/cn/' in url: diff --git a/yt_dlp/extractor/iwara.py b/yt_dlp/extractor/iwara.py index 254d986923..c0e01e3522 100644 --- a/yt_dlp/extractor/iwara.py +++ b/yt_dlp/extractor/iwara.py @@ -76,8 +76,7 @@ def _real_extract(self, url): 'age_limit': age_limit, } - title = remove_end(self._html_search_regex( - r'([^<]+)', webpage, 'title'), ' | Iwara') + title = remove_end(self._html_extract_title(webpage), ' | Iwara') thumbnail = self._html_search_regex( r'poster=[\'"]([^\'"]+)', webpage, 'thumbnail', default=None) diff --git a/yt_dlp/extractor/linkedin.py b/yt_dlp/extractor/linkedin.py index bf549e1641..0f57bfa06f 100644 --- a/yt_dlp/extractor/linkedin.py +++ b/yt_dlp/extractor/linkedin.py @@ -102,7 +102,7 @@ def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - title = self._html_search_regex(r'([^<]+)', webpage, 'title') + title = self._html_extract_title(webpage) description = 
clean_html(get_element_by_class('share-update-card__update-text', webpage)) like_count = int_or_none(get_element_by_class('social-counts-reactions__social-counts-numRections', webpage)) creator = strip_or_none(clean_html(get_element_by_class('comment__actor-name', webpage))) diff --git a/yt_dlp/extractor/miaopai.py b/yt_dlp/extractor/miaopai.py index f9e35ac7f6..cf0610bdf4 100644 --- a/yt_dlp/extractor/miaopai.py +++ b/yt_dlp/extractor/miaopai.py @@ -24,8 +24,7 @@ def _real_extract(self, url): webpage = self._download_webpage( url, video_id, headers={'User-Agent': self._USER_AGENT_IPAD}) - title = self._html_search_regex( - r'([^<]+)', webpage, 'title') + title = self._html_extract_title(webpage) thumbnail = self._html_search_regex( r']+class=(?P[\'"]).*\bvideo_img\b.*(?P=q1)[^>]+data-url=(?P[\'"])(?P[^\'"]+)(?P=q2)', webpage, 'thumbnail', fatal=False, group='url') diff --git a/yt_dlp/extractor/mojvideo.py b/yt_dlp/extractor/mojvideo.py index 0421f3f447..16d94052bd 100644 --- a/yt_dlp/extractor/mojvideo.py +++ b/yt_dlp/extractor/mojvideo.py @@ -38,8 +38,7 @@ def _real_extract(self, url): r'([^<]*)', playerapi, 'error description', fatal=False) raise ExtractorError('%s said: %s' % (self.IE_NAME, error_desc), expected=True) - title = self._html_search_regex( - r'([^<]+)', playerapi, 'title') + title = self._html_extract_title(playerapi) video_url = self._html_search_regex( r'([^<]+)', playerapi, 'video URL') thumbnail = self._html_search_regex( diff --git a/yt_dlp/extractor/newgrounds.py b/yt_dlp/extractor/newgrounds.py index 1e1274ef05..6525a6d8a3 100644 --- a/yt_dlp/extractor/newgrounds.py +++ b/yt_dlp/extractor/newgrounds.py @@ -106,8 +106,7 @@ def _real_extract(self, url): uploader = None webpage = self._download_webpage(url, media_id) - title = self._html_search_regex( - r'(.+?)', webpage, 'title') + title = self._html_extract_title(webpage) media_url_string = self._search_regex( r'"url"\s*:\s*("[^"]+"),', webpage, 'media url', default=None) @@ -219,8 +218,7 @@ 
def _real_extract(self, url): webpage = self._download_webpage(url, playlist_id) - title = self._search_regex( - r'([^>]+)', webpage, 'title', default=None) + title = self._html_extract_title(webpage, default=None) # cut left menu webpage = self._search_regex( diff --git a/yt_dlp/extractor/nhk.py b/yt_dlp/extractor/nhk.py index 626c6379b3..3b8efc3e60 100644 --- a/yt_dlp/extractor/nhk.py +++ b/yt_dlp/extractor/nhk.py @@ -309,7 +309,9 @@ def _real_extract(self, url): webpage = self._download_webpage(f'https://www.nhk.or.jp/school/{program_id}/', program_id) - title = self._og_search_title(webpage, fatal=False) or self._html_extract_title(webpage, fatal=False) or self._html_search_regex(r'

<h3>([^<]+?)とは?\s*</h3>

', webpage, 'title', fatal=False) + title = (self._og_search_title(webpage) + or self._html_extract_title(webpage) + or self._html_search_regex(r'

<h3>([^<]+?)とは?\s*</h3>

', webpage, 'title', fatal=False)) title = re.sub(r'\s*\|\s*NHK\s+for\s+School\s*$', '', title) if title else None description = self._html_search_regex( r'(?s)\s*

<p>[^<]+</p>

', diff --git a/yt_dlp/extractor/playvid.py b/yt_dlp/extractor/playvid.py index 4aef186ea2..e1c406b6c2 100644 --- a/yt_dlp/extractor/playvid.py +++ b/yt_dlp/extractor/playvid.py @@ -85,8 +85,7 @@ def _real_extract(self, url): # Extract title - should be in the flashvars; if not, look elsewhere if video_title is None: - video_title = self._html_search_regex( - r'(.*?)</title', webpage, 'title') + video_title = self._html_extract_title(webpage) return { 'id': video_id, diff --git a/yt_dlp/extractor/rule34video.py b/yt_dlp/extractor/rule34video.py index 522d4ccd5b..a602a9f335 100644 --- a/yt_dlp/extractor/rule34video.py +++ b/yt_dlp/extractor/rule34video.py @@ -49,7 +49,7 @@ def _real_extract(self, url): 'quality': quality, }) - title = self._html_search_regex(r'<title>([^<]+)', webpage, 'title') + title = self._html_extract_title(webpage) thumbnail = self._html_search_regex(r'preview_url:\s+\'([^\']+)\'', webpage, 'thumbnail', default=None) duration = self._html_search_regex(r'"icon-clock">\s+((?:\d+:?)+)', webpage, 'duration', default=None) diff --git a/yt_dlp/extractor/senategov.py b/yt_dlp/extractor/senategov.py index 6f4240422a..b295184a19 100644 --- a/yt_dlp/extractor/senategov.py +++ b/yt_dlp/extractor/senategov.py @@ -112,7 +112,7 @@ def _real_extract(self, url): if smuggled_data.get('force_title'): title = smuggled_data['force_title'] else: - title = self._html_search_regex(r'([^<]+)', webpage, video_id) + title = self._html_extract_title(webpage) poster = qs.get('poster') thumbnail = poster[0] if poster else None diff --git a/yt_dlp/extractor/sunporno.py b/yt_dlp/extractor/sunporno.py index 68051169b9..59b77bf92f 100644 --- a/yt_dlp/extractor/sunporno.py +++ b/yt_dlp/extractor/sunporno.py @@ -36,8 +36,7 @@ def _real_extract(self, url): webpage = self._download_webpage( 'http://www.sunporno.com/videos/%s' % video_id, video_id) - title = self._html_search_regex( - r'([^<]+)', webpage, 'title') + title = self._html_extract_title(webpage) description = 
self._html_search_meta( 'description', webpage, 'description') thumbnail = self._html_search_regex( diff --git a/yt_dlp/extractor/thisav.py b/yt_dlp/extractor/thisav.py index 4af286e6d9..6bb00b3aba 100644 --- a/yt_dlp/extractor/thisav.py +++ b/yt_dlp/extractor/thisav.py @@ -37,9 +37,7 @@ def _real_extract(self, url): video_id = mobj.group('id') webpage = self._download_webpage(url, video_id) - title = remove_end(self._html_search_regex( - r'([^<]+)', webpage, 'title'), - ' - 視頻 - ThisAV.com-世界第一中文成人娛樂網站') + title = remove_end(self._html_extract_title(webpage), ' - 視頻 - ThisAV.com-世界第一中文成人娛樂網站') video_url = self._html_search_regex( r"addVariable\('file','([^']+)'\);", webpage, 'video url', default=None) if video_url: diff --git a/yt_dlp/extractor/traileraddict.py b/yt_dlp/extractor/traileraddict.py index 10100fbcf3..514f4793e6 100644 --- a/yt_dlp/extractor/traileraddict.py +++ b/yt_dlp/extractor/traileraddict.py @@ -24,8 +24,7 @@ def _real_extract(self, url): name = mobj.group('movie') + '/' + mobj.group('trailer_name') webpage = self._download_webpage(url, name) - title = self._search_regex(r'(.+?)', - webpage, 'video title').replace(' - Trailer Addict', '') + title = self._html_extract_title(webpage, 'video title').replace(' - Trailer Addict', '') view_count_str = self._search_regex( r'([0-9,.]+)', webpage, 'view count', fatal=False) diff --git a/yt_dlp/extractor/varzesh3.py b/yt_dlp/extractor/varzesh3.py index 81313dc9d7..32655b96de 100644 --- a/yt_dlp/extractor/varzesh3.py +++ b/yt_dlp/extractor/varzesh3.py @@ -42,8 +42,7 @@ def _real_extract(self, url): video_url = self._search_regex( r']+src="([^"]+)"', webpage, 'video url') - title = remove_start(self._html_search_regex( - r'([^<]+)', webpage, 'title'), 'ویدیو ورزش 3 | ') + title = remove_start(self._html_extract_title(webpage), 'ویدیو ورزش 3 | ') description = self._html_search_regex( r'(?s)
<div class="matn">(.+?)</div>
', diff --git a/yt_dlp/extractor/vshare.py b/yt_dlp/extractor/vshare.py index c631ac1faa..b4874ac390 100644 --- a/yt_dlp/extractor/vshare.py +++ b/yt_dlp/extractor/vshare.py @@ -50,8 +50,7 @@ def _real_extract(self, url): 'https://vshare.io/v/%s/width-650/height-430/1' % video_id, video_id, headers={'Referer': url}) - title = self._html_search_regex( - r'([^<]+)', webpage, 'title') + title = self._html_extract_title(webpage) title = title.split(' - ')[0] error = self._html_search_regex( diff --git a/yt_dlp/extractor/vupload.py b/yt_dlp/extractor/vupload.py index 2229a6591f..b561f63f73 100644 --- a/yt_dlp/extractor/vupload.py +++ b/yt_dlp/extractor/vupload.py @@ -28,7 +28,7 @@ def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - title = self._html_search_regex(r'(.+?)', webpage, 'title') + title = self._html_extract_title(webpage) video_json = self._parse_json(self._html_search_regex(r'sources:\s*(.+?]),', webpage, 'video'), video_id, transform_source=js_to_json) formats = [] for source in video_json: diff --git a/yt_dlp/extractor/weibo.py b/yt_dlp/extractor/weibo.py index 621df5b549..dafa2af3be 100644 --- a/yt_dlp/extractor/weibo.py +++ b/yt_dlp/extractor/weibo.py @@ -73,8 +73,7 @@ def _real_extract(self, url): webpage = self._download_webpage( url, video_id, note='Revisiting webpage') - title = self._html_search_regex( - r'(.+?)', webpage, 'title') + title = self._html_extract_title(webpage) video_formats = compat_parse_qs(self._search_regex( r'video-sources=\\\"(.+?)\"', webpage, 'video_sources')) diff --git a/yt_dlp/extractor/yahoo.py b/yt_dlp/extractor/yahoo.py index 6cf3b1de25..20504de2c0 100644 --- a/yt_dlp/extractor/yahoo.py +++ b/yt_dlp/extractor/yahoo.py @@ -533,7 +533,7 @@ def _real_extract(self, url): title = self._html_search_meta( ['og:title', 'twitter:title'], webpage, 'title', default=None - ) or self._html_search_regex('([^<]+)', webpage, 'title') + ) or self._html_extract_title(webpage) if 
display_id == host: # Headline page (w/ multiple BC playlists) ('news.yahoo.co.jp', 'headlines.yahoo.co.jp/videonews/', ...) diff --git a/yt_dlp/extractor/youjizz.py b/yt_dlp/extractor/youjizz.py index 5f5fbf21cf..111623ffe6 100644 --- a/yt_dlp/extractor/youjizz.py +++ b/yt_dlp/extractor/youjizz.py @@ -36,8 +36,7 @@ def _real_extract(self, url): webpage = self._download_webpage(url, video_id) - title = self._html_search_regex( - r'(.+?)', webpage, 'title') + title = self._html_extract_title(webpage) formats = []