[GoogleDriveFolderIE] improve code readability

- Combined `_extract_json_ds` and `_extract_json_hash` into one method (`_extract_json_meta`)
- Improved `item_url_getter`'s traversal path of item info
- Added inline comments to improve code readability
This commit is contained in:
grqx_wsl 2024-08-17 20:21:02 +12:00
parent 99d9105f33
commit 6e98d99dd5

View file

@ -303,7 +303,7 @@ class GoogleDriveFolderIE(InfoExtractor):
}, },
'playlist_count': 3, 'playlist_count': 3,
}, { }, {
# Contains various formats and a subfolder 'note': 'Contains various formats and a subfolder, folder name was formerly mismatched',
'url': 'https://drive.google.com/drive/folders/1CkqRsNlzZ0o3IL083j17s6sH5Q83DcGo', 'url': 'https://drive.google.com/drive/folders/1CkqRsNlzZ0o3IL083j17s6sH5Q83DcGo',
'info_dict': { 'info_dict': {
'id': '1CkqRsNlzZ0o3IL083j17s6sH5Q83DcGo', 'id': '1CkqRsNlzZ0o3IL083j17s6sH5Q83DcGo',
@ -311,45 +311,43 @@ class GoogleDriveFolderIE(InfoExtractor):
}, },
'playlist_count': 6, 'playlist_count': 6,
}] }]
_JSON_DS_RE = r'key\s*?:\s*?([\'"])ds:\s*?%d\1,[^}]*data:'
_JSON_HASH_RE = r'hash\s*?:\s*?([\'"])%d\1,[^}]*data:'
_ARRAY_RE = r'\[(?s:.+)\]'
def _extract_json_ds(self, dsval, webpage, video_id, **kwargs): def _extract_json_meta(self, webpage, video_id, dsval=None, hashval=None, name=None, **kwargs):
""" """
Searches for json with the 'ds' value(0~5) from the webpage with regex. Uses regex to search for json metadata with 'ds' value(0-5) or 'hash' value(1-6)
Folder info: ds=0; Folder items: ds=4. from the webpage.
Folder info: ds=0, hash=1; Folder items: ds=4, hash=6.
For example, if the webpage contains the line below, the empty data array For example, if the webpage contains the line below, the empty data array
can be got by passing dsval=3 to this function. can be got by passing dsval=3 or hashval=2 to this method.
AF_initDataCallback({key: 'ds:3', hash: '2', data:[], sideChannel: {}}); AF_initDataCallback({key: 'ds:3', hash: '2', data:[], sideChannel: {}});
""" """
return self._search_json(self._JSON_DS_RE % dsval, webpage, _ARRAY_RE = r'\[(?s:.+)\]'
f'webpage JSON ds:{dsval}', video_id, _META_END_RE = r', sideChannel: \{\}\}\);' # greedy match to deal with the 2nd test case
contains_pattern=self._ARRAY_RE, **kwargs) if dsval:
if not name:
def _extract_json_hash(self, hashval, webpage, video_id, **kwargs): name = f'webpage JSON metadata ds:{dsval}'
""" return self._search_json(
Searches for json with the 'hash' value(1~6) from the webpage with regex. rf'''key\s*?:\s*?(['"])ds:\s*?{dsval}\1,[^\[]*?data:''', webpage, name, video_id,
Folder info: hash=1; Folder items: hash=6. end_pattern=_META_END_RE, contains_pattern=_ARRAY_RE, **kwargs)
For example, if the webpage contains the line below, the empty data array elif hashval:
can be got by passing hashval=2 to this function. if not name:
AF_initDataCallback({key: 'ds:3', hash: '2', data:[], sideChannel: {}}); name = f'webpage JSON metadata hash:{hashval}'
""" return self._search_json(
return self._search_json(self._JSON_HASH_RE % hashval, webpage, rf'''hash\s*?:\s*?(['"]){hashval}\1,[^\[]*?data:''', webpage, name, video_id,
f'webpage JSON hash:{hashval}', video_id, end_pattern=_META_END_RE, contains_pattern=_ARRAY_RE, **kwargs)
contains_pattern=self._ARRAY_RE, **kwargs) return None
def _real_extract(self, url): def _real_extract(self, url):
def item_url_getter(item, video_id): def item_url_getter(item, video_id):
available_IEs = [GoogleDriveFolderIE, GoogleDriveIE] available_IEs = [GoogleDriveFolderIE, GoogleDriveIE] # subfolder or item
if 'application/vnd.google-apps.shortcut' in item: if 'application/vnd.google-apps.shortcut' in item: # extract real link
entry_url = traverse_obj( entry_url = traverse_obj(
item, (..., ..., lambda _, v: any(ie.suitable(v) for ie in available_IEs), item,
{str}, any)) (..., ..., lambda _, v: any(ie.suitable(v) for ie in available_IEs), any))
else: else:
entry_url = traverse_obj( entry_url = traverse_obj(
item, (lambda _, v: any(ie.suitable(v) for ie in available_IEs), item,
{str}, any)) (lambda _, v: any(ie.suitable(v) for ie in available_IEs), any))
if not entry_url: if not entry_url:
return None return None
return self.url_result(entry_url, video_id=video_id, video_title=item[2]) return self.url_result(entry_url, video_id=video_id, video_title=item[2])
@ -359,17 +357,17 @@ def item_url_getter(item, video_id):
webpage = self._download_webpage(url, folder_id, headers=headers) webpage = self._download_webpage(url, folder_id, headers=headers)
json_folder_info = ( json_folder_info = (
self._extract_json_ds(0, webpage, folder_id, default=None) self._extract_json_meta(webpage, folder_id, dsval=0, name='folder info', default=None)
or self._extract_json_hash(1, webpage, folder_id) or self._extract_json_meta(webpage, folder_id, hashval=1)
) )
json_items = ( json_items = (
self._extract_json_ds(4, webpage, folder_id, default=None) self._extract_json_meta(webpage, folder_id, dsval=4, name='folder items', default=None)
or self._extract_json_hash(6, webpage, folder_id) or self._extract_json_meta(webpage, folder_id, hashval=6)
) )
title = json_folder_info[1][2] title = json_folder_info[1][2]
items = json_items[-1] items = json_items[-1]
if not isinstance(items, list): if not isinstance(items, list): # empty folder
return self.playlist_result([], folder_id, title) return self.playlist_result([], folder_id, title)
return self.playlist_result( return self.playlist_result(