[minhateca] Fix duration parsing

This commit is contained in:
Philipp Hagemeister 2014-12-04 17:35:40 +01:00
parent ab07963b5c
commit e8df5cee12
3 changed files with 21 additions and 6 deletions

View file

@@ -220,6 +220,9 @@ def test_parse_duration(self):
self.assertEqual(parse_duration('0s'), 0) self.assertEqual(parse_duration('0s'), 0)
self.assertEqual(parse_duration('01:02:03.05'), 3723.05) self.assertEqual(parse_duration('01:02:03.05'), 3723.05)
self.assertEqual(parse_duration('T30M38S'), 1838) self.assertEqual(parse_duration('T30M38S'), 1838)
self.assertEqual(parse_duration('5 s'), 5)
self.assertEqual(parse_duration('3 min'), 180)
self.assertEqual(parse_duration('2.5 hours'), 9000)
def test_fix_xml_ampersands(self): def test_fix_xml_ampersands(self):
self.assertEqual( self.assertEqual(

View file

@@ -8,6 +8,7 @@
) )
from ..utils import ( from ..utils import (
int_or_none, int_or_none,
parse_duration,
parse_filesize, parse_filesize,
) )
@@ -52,8 +53,8 @@ def _real_extract(self, url):
filesize_approx = parse_filesize(self._html_search_regex( filesize_approx = parse_filesize(self._html_search_regex(
r'<p class="fileSize">(.*?)</p>', r'<p class="fileSize">(.*?)</p>',
webpage, 'file size approximation', fatal=False)) webpage, 'file size approximation', fatal=False))
duration = int_or_none(self._html_search_regex( duration = parse_duration(self._html_search_regex(
r'(?s)<p class="fileLeng[ht][th]">.*?([0-9]+)\s*s', r'(?s)<p class="fileLeng[ht][th]">.*?class="bold">(.*?)<',
webpage, 'duration', fatal=False)) webpage, 'duration', fatal=False))
view_count = int_or_none(self._html_search_regex( view_count = int_or_none(self._html_search_regex(
r'<p class="downloadsCounter">([0-9]+)</p>', r'<p class="downloadsCounter">([0-9]+)</p>',

View file

@@ -1206,18 +1206,29 @@ def parse_duration(s):
m = re.match( m = re.match(
r'''(?ix)T? r'''(?ix)T?
(?:
(?P<only_mins>[0-9.]+)\s*(?:mins?|minutes?)\s*|
(?P<only_hours>[0-9.]+)\s*(?:hours?)|
(?: (?:
(?:(?P<hours>[0-9]+)\s*(?:[:h]|hours?)\s*)? (?:(?P<hours>[0-9]+)\s*(?:[:h]|hours?)\s*)?
(?P<mins>[0-9]+)\s*(?:[:m]|mins?|minutes?)\s* (?P<mins>[0-9]+)\s*(?:[:m]|mins?|minutes?)\s*
)? )?
(?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*(?:s|secs?|seconds?)?$''', s) (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*(?:s|secs?|seconds?)?
)$''', s)
if not m: if not m:
return None return None
res = int(m.group('secs')) res = 0
if m.group('only_mins'):
return float_or_none(m.group('only_mins'), invscale=60)
if m.group('only_hours'):
return float_or_none(m.group('only_hours'), invscale=60 * 60)
if m.group('secs'):
res += int(m.group('secs'))
if m.group('mins'): if m.group('mins'):
res += int(m.group('mins')) * 60 res += int(m.group('mins')) * 60
if m.group('hours'): if m.group('hours'):
res += int(m.group('hours')) * 60 * 60 res += int(m.group('hours')) * 60 * 60
if m.group('ms'): if m.group('ms'):
res += float(m.group('ms')) res += float(m.group('ms'))
return res return res