mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Fix unicode bug and make get_text a little more memory efficient
This commit is contained in:
parent
a569112e43
commit
3cf2b17af5
@ -483,17 +483,19 @@ class HTMLConverter(object):
|
|||||||
raise ConversionError, 'Could not parse ' + self.file_name
|
raise ConversionError, 'Could not parse ' + self.file_name
|
||||||
|
|
||||||
|
|
||||||
def get_text(self, tag):
|
def get_text(self, tag, limit=None):
|
||||||
css = self.tag_css(tag)
|
css = self.tag_css(tag)
|
||||||
if (css.has_key('display') and css['display'].lower() == 'none') or \
|
if (css.has_key('display') and css['display'].lower() == 'none') or \
|
||||||
(css.has_key('visibility') and css['visibility'].lower() == 'hidden'):
|
(css.has_key('visibility') and css['visibility'].lower() == 'hidden'):
|
||||||
return ''
|
return ''
|
||||||
text = ''
|
text = u''
|
||||||
for c in tag.contents:
|
for c in tag.contents:
|
||||||
|
if limit != None and len(text) > limit:
|
||||||
|
break
|
||||||
if isinstance(c, HTMLConverter.IGNORED_TAGS):
|
if isinstance(c, HTMLConverter.IGNORED_TAGS):
|
||||||
return ''
|
return u''
|
||||||
if isinstance(c, NavigableString):
|
if isinstance(c, NavigableString):
|
||||||
text += str(c)
|
text += unicode(c)
|
||||||
elif isinstance(c, Tag):
|
elif isinstance(c, Tag):
|
||||||
if c.name.lower() == 'img' and c.has_key('alt'):
|
if c.name.lower() == 'img' and c.has_key('alt'):
|
||||||
text += c['alt']
|
text += c['alt']
|
||||||
@ -1046,7 +1048,7 @@ class HTMLConverter(object):
|
|||||||
self.current_block = self.book.create_text_block(textStyle=pb.textStyle,
|
self.current_block = self.book.create_text_block(textStyle=pb.textStyle,
|
||||||
blockStyle=pb.blockStyle)
|
blockStyle=pb.blockStyle)
|
||||||
elif tagname in ['p', 'div', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6']:
|
elif tagname in ['p', 'div', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6']:
|
||||||
src = self.get_text(tag)
|
src = self.get_text(tag, limit=1000)
|
||||||
if self.chapter_detection and tagname.startswith('h'):
|
if self.chapter_detection and tagname.startswith('h'):
|
||||||
if self.chapter_regex.search(src):
|
if self.chapter_regex.search(src):
|
||||||
if self.verbose:
|
if self.verbose:
|
||||||
|
Loading…
x
Reference in New Issue
Block a user