mirror of
https://github.com/yt-dlp/yt-dlp.git
synced 2024-11-16 21:33:19 +00:00
[extractor/youtube] Improve description parsing performance (#7315)
* The parsing is skipped when not needed
* The regex is improved by simulating atomic groups with lookaheads

Authored by: pukkandan, berkanteber
This commit is contained in:
parent
98cb1eda7a
commit
71dc18fa29
|
@ -4346,15 +4346,21 @@ def process_language(container, base_url, lang_code, sub_name, query):
|
||||||
info[d_k] = parse_duration(query[k][0])
|
info[d_k] = parse_duration(query[k][0])
|
||||||
|
|
||||||
# Youtube Music Auto-generated description
|
# Youtube Music Auto-generated description
|
||||||
if video_description:
|
if (video_description or '').strip().endswith('\nAuto-generated by YouTube.'):
|
||||||
|
# XXX: Causes catastrophic backtracking if description has "·"
|
||||||
|
# E.g. https://www.youtube.com/watch?v=DoPaAxMQoiI
|
||||||
|
# Simulating atomic groups: (?P<a>[^xy]+)x => (?=(?P<a>[^xy]+))(?P=a)x
|
||||||
|
# reduces it, but does not fully fix it. https://regex101.com/r/8Ssf2h/2
|
||||||
mobj = re.search(
|
mobj = re.search(
|
||||||
r'''(?xs)
|
r'''(?xs)
|
||||||
(?P<track>[^·\n]+)·(?P<artist>[^\n]+)\n+
|
(?=(?P<track>[^\n·]+))(?P=track)·
|
||||||
(?P<album>[^\n]+)
|
(?=(?P<artist>[^\n]+))(?P=artist)\n+
|
||||||
|
(?=(?P<album>[^\n]+))(?P=album)\n
|
||||||
(?:.+?℗\s*(?P<release_year>\d{4})(?!\d))?
|
(?:.+?℗\s*(?P<release_year>\d{4})(?!\d))?
|
||||||
(?:.+?Released on\s*:\s*(?P<release_date>\d{4}-\d{2}-\d{2}))?
|
(?:.+?Released on\s*:\s*(?P<release_date>\d{4}-\d{2}-\d{2}))?
|
||||||
(.+?\nArtist\s*:\s*(?P<clean_artist>[^\n]+))?
|
(.+?\nArtist\s*:\s*
|
||||||
.+\nAuto-generated\ by\ YouTube\.\s*$
|
(?=(?P<clean_artist>[^\n]+))(?P=clean_artist)\n
|
||||||
|
)?.+\nAuto-generated\ by\ YouTube\.\s*$
|
||||||
''', video_description)
|
''', video_description)
|
||||||
if mobj:
|
if mobj:
|
||||||
release_year = mobj.group('release_year')
|
release_year = mobj.group('release_year')
|
||||||
|
|
Loading…
Reference in a new issue