From 65f91148fc6fcbce967d775527edb95b567db0cb Mon Sep 17 00:00:00 2001 From: Marcel Date: Tue, 29 Nov 2022 15:01:18 +0100 Subject: [PATCH] [parsing] search for case-insensitive tag names --- test/test_parsing.py | 4 ++++ yt_dlp/parsing.py | 4 ++-- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/test/test_parsing.py b/test/test_parsing.py index 1898ee8ab..8a36beda4 100644 --- a/test/test_parsing.py +++ b/test/test_parsing.py @@ -222,6 +222,10 @@ def test_get_element_text_and_html_by_tag_malformed(self): get_element_text_and_html_by_tag('orphan', f'{html}'), ('', '')) self.assertIsNone(get_element_text_and_html_by_tag('orphan', f'{html}')) + # ignore case on tags + ci_html = f'<SpAn>{html}</sPaN>' + self.assertEqual(get_element_text_and_html_by_tag('span', ci_html), (html, ci_html)) + def test_strict_html_parsing(self): class StrictTagParser(HTMLTagParser): STRICT = True diff --git a/yt_dlp/parsing.py b/yt_dlp/parsing.py index 1698591e3..1db6704dd 100644 --- a/yt_dlp/parsing.py +++ b/yt_dlp/parsing.py @@ -245,7 +245,7 @@ def matching_tag_regex(tag, attribute, value_regex, escape=True): value_regex = re.escape(value_regex) return rf'''(?x) - <(?:{tag}) + <(?i:{tag}) (?:\s(?:[^>"'\\]|"[^"\\]*"|'[^'\\]*')*)? \s{re.escape(attribute)}\s*=\s*(?P<_q>['"])(?-x:{value_regex})(?P=_q) ''' @@ -263,7 +263,7 @@ def tags_by_name(cls, tag, html): def matchfunc(tag_str, _attrs): return tag_str == tag - tag_regex = rf'''<\s*{re.escape(tag)}(?:\s(?:[^>"'\\]|"[^"\\]*"|'[^'\\]*')*)?>''' + tag_regex = rf'''<\s*(?i:{re.escape(tag)})(?:\s(?:[^>"'\\]|"[^"\\]*"|'[^'\\]*')*)?>''' yield from cls.iter_tags(tag_regex, html, matchfunc=matchfunc) @classmethod