From 3cf2b17af54d1ccc267a13500d3e7c593f95a485 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Fri, 29 Jun 2007 06:38:52 +0000 Subject: [PATCH] Fix unicode bug and make get_text a little more memory efficient --- src/libprs500/ebooks/lrf/html/convert_from.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/src/libprs500/ebooks/lrf/html/convert_from.py b/src/libprs500/ebooks/lrf/html/convert_from.py index eb8ec48a33..48de2309af 100644 --- a/src/libprs500/ebooks/lrf/html/convert_from.py +++ b/src/libprs500/ebooks/lrf/html/convert_from.py @@ -483,17 +483,19 @@ class HTMLConverter(object): raise ConversionError, 'Could not parse ' + self.file_name - def get_text(self, tag): + def get_text(self, tag, limit=None): css = self.tag_css(tag) if (css.has_key('display') and css['display'].lower() == 'none') or \ (css.has_key('visibility') and css['visibility'].lower() == 'hidden'): return '' - text = '' + text = u'' for c in tag.contents: + if limit != None and len(text) > limit: + break if isinstance(c, HTMLConverter.IGNORED_TAGS): - return '' + return u'' if isinstance(c, NavigableString): - text += str(c) + text += unicode(c) elif isinstance(c, Tag): if c.name.lower() == 'img' and c.has_key('alt'): text += c['alt'] @@ -1046,7 +1048,7 @@ class HTMLConverter(object): self.current_block = self.book.create_text_block(textStyle=pb.textStyle, blockStyle=pb.blockStyle) elif tagname in ['p', 'div', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6']: - src = self.get_text(tag) + src = self.get_text(tag, limit=1000) if self.chapter_detection and tagname.startswith('h'): if self.chapter_regex.search(src): if self.verbose: