[extractor/youtube] Improve description parsing performance (#7315)

* Parsing is skipped unless the description ends with "Auto-generated by YouTube."
* The regex now simulates atomic groups with lookaheads and backreferences to reduce catastrophic backtracking

Authored by: pukkandan, berkanteber
This commit is contained in:
Berkan Teber 2023-06-22 10:27:54 +03:00 committed by GitHub
parent 98cb1eda7a
commit 71dc18fa29
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23

View file

@ -4346,15 +4346,21 @@ def process_language(container, base_url, lang_code, sub_name, query):
info[d_k] = parse_duration(query[k][0]) info[d_k] = parse_duration(query[k][0])
# Youtube Music Auto-generated description # Youtube Music Auto-generated description
if video_description: if (video_description or '').strip().endswith('\nAuto-generated by YouTube.'):
# XXX: This regex causes catastrophic backtracking if the description has "·"
# E.g. https://www.youtube.com/watch?v=DoPaAxMQoiI
# Simulating atomic groups: (?P<a>[^xy]+)x => (?=(?P<a>[^xy]+))(?P=a)x
# reduces the backtracking, but does not fully fix it. https://regex101.com/r/8Ssf2h/2
mobj = re.search( mobj = re.search(
r'''(?xs) r'''(?xs)
(?P<track>[^·\n]+)·(?P<artist>[^\n]+)\n+ (?=(?P<track>[^\n·]+))(?P=track)·
(?P<album>[^\n]+) (?=(?P<artist>[^\n]+))(?P=artist)\n+
(?=(?P<album>[^\n]+))(?P=album)\n
(?:.+?\s*(?P<release_year>\d{4})(?!\d))? (?:.+?\s*(?P<release_year>\d{4})(?!\d))?
(?:.+?Released on\s*:\s*(?P<release_date>\d{4}-\d{2}-\d{2}))? (?:.+?Released on\s*:\s*(?P<release_date>\d{4}-\d{2}-\d{2}))?
(.+?\nArtist\s*:\s*(?P<clean_artist>[^\n]+))? (.+?\nArtist\s*:\s*
.+\nAuto-generated\ by\ YouTube\.\s*$ (?=(?P<clean_artist>[^\n]+))(?P=clean_artist)\n
)?.+\nAuto-generated\ by\ YouTube\.\s*$
''', video_description) ''', video_description)
if mobj: if mobj:
release_year = mobj.group('release_year') release_year = mobj.group('release_year')