Add option --use-extractors

Deprecates `--force-generic-extractor`

Closes #3234, Closes #2044

Related: #4307, #1791
This commit is contained in:
pukkandan 2022-08-24 05:42:16 +05:30
parent 5314b52192
commit fe7866d0ed
No known key found for this signature in database
GPG key ID: 7EEE9E1E817D0A39
5 changed files with 58 additions and 18 deletions

View file

@ -375,7 +375,13 @@ ## General Options:
--list-extractors List all supported extractors and exit --list-extractors List all supported extractors and exit
--extractor-descriptions Output descriptions of all supported --extractor-descriptions Output descriptions of all supported
extractors and exit extractors and exit
--force-generic-extractor Force extraction to use the generic extractor --use-extractors, --ies NAMES Extractor names to use separated by commas.
You can also use regexes, "all", "default"
and "end" (end URL matching); e.g. --ies
"holodex.*,end,youtube". Prefix the name
with a "-" to exclude it, e.g. --ies
default,-generic. Use --list-extractors for
a list of available extractor names
--default-search PREFIX Use this prefix for unqualified URLs. E.g. --default-search PREFIX Use this prefix for unqualified URLs. E.g.
"gvsearch2:python" downloads two videos from "gvsearch2:python" downloads two videos from
google videos for the search term "python". google videos for the search term "python".
@ -2058,6 +2064,7 @@ #### Redundant options
#### Not recommended #### Not recommended
While these options still work, their use is not recommended since there are other alternatives to achieve the same While these options still work, their use is not recommended since there are other alternatives to achieve the same
--force-generic-extractor --ies generic,default
--exec-before-download CMD --exec "before_dl:CMD" --exec-before-download CMD --exec "before_dl:CMD"
--no-exec-before-download --no-exec --no-exec-before-download --no-exec
--all-formats -f all --all-formats -f all

View file

@ -29,6 +29,7 @@
from .downloader import FFmpegFD, get_suitable_downloader, shorten_protocol_name from .downloader import FFmpegFD, get_suitable_downloader, shorten_protocol_name
from .downloader.rtmp import rtmpdump_version from .downloader.rtmp import rtmpdump_version
from .extractor import gen_extractor_classes, get_info_extractor from .extractor import gen_extractor_classes, get_info_extractor
from .extractor.common import UnsupportedURLIE
from .extractor.openload import PhantomJSwrapper from .extractor.openload import PhantomJSwrapper
from .minicurses import format_text from .minicurses import format_text
from .postprocessor import _PLUGIN_CLASSES as plugin_postprocessors from .postprocessor import _PLUGIN_CLASSES as plugin_postprocessors
@ -237,7 +238,7 @@ class YoutubeDL:
Default is 'only_download' for CLI, but False for API Default is 'only_download' for CLI, but False for API
skip_playlist_after_errors: Number of allowed failures until the rest of skip_playlist_after_errors: Number of allowed failures until the rest of
the playlist is skipped the playlist is skipped
force_generic_extractor: Force downloader to use the generic extractor allowed_extractors: List of regexes to match against extractor names that are allowed
overwrites: Overwrite all video and metadata files if True, overwrites: Overwrite all video and metadata files if True,
overwrite only non-video files if None overwrite only non-video files if None
and don't overwrite any file if False and don't overwrite any file if False
@ -477,6 +478,8 @@ class YoutubeDL:
The following options are deprecated and may be removed in the future: The following options are deprecated and may be removed in the future:
force_generic_extractor: Force downloader to use the generic extractor
- Use allowed_extractors = ['generic', 'default']
playliststart: - Use playlist_items playliststart: - Use playlist_items
Playlist item to start at. Playlist item to start at.
playlistend: - Use playlist_items playlistend: - Use playlist_items
@ -758,13 +761,6 @@ def add_info_extractor(self, ie):
self._ies_instances[ie_key] = ie self._ies_instances[ie_key] = ie
ie.set_downloader(self) ie.set_downloader(self)
def _get_info_extractor_class(self, ie_key):
ie = self._ies.get(ie_key)
if ie is None:
ie = get_info_extractor(ie_key)
self.add_info_extractor(ie)
return ie
def get_info_extractor(self, ie_key): def get_info_extractor(self, ie_key):
""" """
Get an instance of an IE with name ie_key, it will try to get one from Get an instance of an IE with name ie_key, it will try to get one from
@ -781,8 +777,19 @@ def add_default_info_extractors(self):
""" """
Add the InfoExtractors returned by gen_extractors to the end of the list Add the InfoExtractors returned by gen_extractors to the end of the list
""" """
for ie in gen_extractor_classes(): all_ies = {ie.IE_NAME.lower(): ie for ie in gen_extractor_classes()}
self.add_info_extractor(ie) all_ies['end'] = UnsupportedURLIE()
try:
ie_names = orderedSet_from_options(
self.params.get('allowed_extractors', ['default']), {
'all': list(all_ies),
'default': [name for name, ie in all_ies.items() if ie._ENABLED],
}, use_regex=True)
except re.error as e:
raise ValueError(f'Wrong regex for allowed_extractors: {e.pattern}')
for name in ie_names:
self.add_info_extractor(all_ies[name])
self.write_debug(f'Loaded {len(ie_names)} extractors')
def add_post_processor(self, pp, when='post_process'): def add_post_processor(self, pp, when='post_process'):
"""Add a PostProcessor object to the end of the chain.""" """Add a PostProcessor object to the end of the chain."""
@ -1413,11 +1420,11 @@ def extract_info(self, url, download=True, ie_key=None, extra_info=None,
ie_key = 'Generic' ie_key = 'Generic'
if ie_key: if ie_key:
ies = {ie_key: self._get_info_extractor_class(ie_key)} ies = {ie_key: self._ies[ie_key]} if ie_key in self._ies else {}
else: else:
ies = self._ies ies = self._ies
for ie_key, ie in ies.items(): for key, ie in ies.items():
if not ie.suitable(url): if not ie.suitable(url):
continue continue
@ -1426,14 +1433,16 @@ def extract_info(self, url, download=True, ie_key=None, extra_info=None,
'and will probably not work.') 'and will probably not work.')
temp_id = ie.get_temp_id(url) temp_id = ie.get_temp_id(url)
if temp_id is not None and self.in_download_archive({'id': temp_id, 'ie_key': ie_key}): if temp_id is not None and self.in_download_archive({'id': temp_id, 'ie_key': key}):
self.to_screen(f'[{ie_key}] {temp_id}: has already been recorded in the archive') self.to_screen(f'[{key}] {temp_id}: has already been recorded in the archive')
if self.params.get('break_on_existing', False): if self.params.get('break_on_existing', False):
raise ExistingVideoReached() raise ExistingVideoReached()
break break
return self.__extract_info(url, self.get_info_extractor(ie_key), download, extra_info, process) return self.__extract_info(url, self.get_info_extractor(key), download, extra_info, process)
else: else:
self.report_error('no suitable InfoExtractor for URL %s' % url) extractors_restricted = self.params.get('allowed_extractors') not in (None, ['default'])
self.report_error(f'No suitable extractor{format_field(ie_key, None, " (%s)")} found for URL {url}',
tb=False if extractors_restricted else None)
def _handle_extraction_exceptions(func): def _handle_extraction_exceptions(func):
@functools.wraps(func) @functools.wraps(func)

View file

@ -766,6 +766,7 @@ def parse_options(argv=None):
'windowsfilenames': opts.windowsfilenames, 'windowsfilenames': opts.windowsfilenames,
'ignoreerrors': opts.ignoreerrors, 'ignoreerrors': opts.ignoreerrors,
'force_generic_extractor': opts.force_generic_extractor, 'force_generic_extractor': opts.force_generic_extractor,
'allowed_extractors': opts.allowed_extractors or ['default'],
'ratelimit': opts.ratelimit, 'ratelimit': opts.ratelimit,
'throttledratelimit': opts.throttledratelimit, 'throttledratelimit': opts.throttledratelimit,
'overwrites': opts.overwrites, 'overwrites': opts.overwrites,

View file

@ -480,6 +480,9 @@ class InfoExtractor:
will be used by geo restriction bypass mechanism similarly will be used by geo restriction bypass mechanism similarly
to _GEO_COUNTRIES. to _GEO_COUNTRIES.
The _ENABLED attribute should be set to False for IEs that
are disabled by default and must be explicitly enabled.
The _WORKING attribute should be set to False for broken IEs The _WORKING attribute should be set to False for broken IEs
in order to warn the users and skip the tests. in order to warn the users and skip the tests.
""" """
@ -491,6 +494,7 @@ class InfoExtractor:
_GEO_COUNTRIES = None _GEO_COUNTRIES = None
_GEO_IP_BLOCKS = None _GEO_IP_BLOCKS = None
_WORKING = True _WORKING = True
_ENABLED = True
_NETRC_MACHINE = None _NETRC_MACHINE = None
IE_DESC = None IE_DESC = None
SEARCH_KEY = None SEARCH_KEY = None
@ -3941,3 +3945,12 @@ def _search_results(self, query):
@classproperty @classproperty
def SEARCH_KEY(cls): def SEARCH_KEY(cls):
return cls._SEARCH_KEY return cls._SEARCH_KEY
class UnsupportedURLIE(InfoExtractor):
    # Catch-all extractor that never extracts anything: its URL pattern
    # matches any string, and extraction always fails with UnsupportedError.
    # add_default_info_extractors registers an instance under the name "end",
    # which the --use-extractors help text describes as "end URL matching".

    # NOTE(review): presumably a False description keeps this IE out of the
    # extractor listings -- confirm against the InfoExtractor base class.
    IE_DESC = False

    # Disabled by default; it must be explicitly enabled to be loaded.
    _ENABLED = False

    # Regex that matches every possible URL.
    _VALID_URL = r'.*'

    def _real_extract(self, url):
        # Nothing to extract: report the URL as unsupported unconditionally.
        raise UnsupportedError(url)

View file

@ -353,10 +353,20 @@ def _alias_callback(option, opt_str, value, parser, opts, nargs):
'--extractor-descriptions', '--extractor-descriptions',
action='store_true', dest='list_extractor_descriptions', default=False, action='store_true', dest='list_extractor_descriptions', default=False,
help='Output descriptions of all supported extractors and exit') help='Output descriptions of all supported extractors and exit')
general.add_option(
'--use-extractors', '--ies',
action='callback', dest='allowed_extractors', metavar='NAMES', type='str',
default=[], callback=_list_from_options_callback,
help=(
'Extractor names to use separated by commas. '
'You can also use regexes, "all", "default" and "end" (end URL matching); '
'e.g. --ies "holodex.*,end,youtube". '
'Prefix the name with a "-" to exclude it, e.g. --ies default,-generic. '
'Use --list-extractors for a list of available extractor names'))
general.add_option( general.add_option(
'--force-generic-extractor', '--force-generic-extractor',
action='store_true', dest='force_generic_extractor', default=False, action='store_true', dest='force_generic_extractor', default=False,
help='Force extraction to use the generic extractor') help=optparse.SUPPRESS_HELP)
general.add_option( general.add_option(
'--default-search', '--default-search',
dest='default_search', metavar='PREFIX', dest='default_search', metavar='PREFIX',