Handle comments while getting char count

This commit is contained in:
Kovid Goyal 2019-10-27 12:52:12 +05:30
parent a687204ec3
commit 78a97a0e37
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C
2 changed files with 11 additions and 3 deletions

View File

@@ -202,8 +202,11 @@ def get_length(root):
strip_space = re.compile(r'\s+') strip_space = re.compile(r'\s+')
def count(elem): def count(elem):
tag = getattr(elem, 'tag', count)
if callable(tag):
return len(strip_space.sub('', getattr(elem, 'tail', None) or ''))
num = 0 num = 0
tname = elem.tag.rpartition('}')[-1].lower() tname = tag.rpartition('}')[-1].lower()
if elem.text and tname not in ignore_tags: if elem.text and tname not in ignore_tags:
num += len(strip_space.sub('', elem.text)) num += len(strip_space.sub('', elem.text))
if elem.tail: if elem.tail:
@@ -213,11 +216,14 @@ def get_length(root):
return num return num
else: else:
def count(elem): def count(elem):
return fast(elem.tag, elem.text, elem.tail) tag = getattr(elem, 'tag', count)
if callable(tag):
return fast('', None, getattr(elem, 'tail', None))
return fast(tag, elem.text, elem.tail)
for body in root.iterchildren(XHTML('body')): for body in root.iterchildren(XHTML('body')):
ans += count(body) ans += count(body)
for elem in body.iterdescendants('*'): for elem in body.iterdescendants():
ans += count(elem) ans += count(elem)
return ans return ans

View File

@@ -234,4 +234,6 @@ class ContentTest(LibraryBaseTest):
self.ae(get_length(root), 5) self.ae(get_length(root), 5)
root = html5_parse('<script>xyz</script>a<iMg>b') root = html5_parse('<script>xyz</script>a<iMg>b')
self.ae(get_length(root), 1002) self.ae(get_length(root), 1002)
root = html5_parse('<p><!-- abc -->m')
self.ae(get_length(root), 1)
# }}} # }}}