From 3d007aacf1c358463560d65311092c64f8ec37e5 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Thu, 16 Aug 2007 07:09:05 +0000 Subject: [PATCH] Reduce memory usage of html2lrf. This commit could have introduced many regressions. --- src/libprs500/ebooks/lrf/html/convert_from.py | 257 ++++++++---------- src/libprs500/ebooks/lrf/pylrs/pylrs.py | 4 +- 2 files changed, 120 insertions(+), 141 deletions(-) diff --git a/src/libprs500/ebooks/lrf/html/convert_from.py b/src/libprs500/ebooks/lrf/html/convert_from.py index c5d3d7ae4e..0601bb7e83 100644 --- a/src/libprs500/ebooks/lrf/html/convert_from.py +++ b/src/libprs500/ebooks/lrf/html/convert_from.py @@ -299,13 +299,6 @@ class HTMLConverter(object): ] - class Link(object): - def __init__(self, para, tag): - self.para = para - self.tag = tag - - processed_files = {} #: Files that have been processed - def __hasattr__(self, attr): if hasattr(self.options, attr): return True @@ -322,9 +315,28 @@ class HTMLConverter(object): else: object.__setattr__(self, attr, val) - def __init__(self, book, fonts, path, options, logger, - link_level=0, is_root=True, - rotated_images={}, scaled_images={}, images={}, memory=[]): + CSS = { + 'h1' : {"font-size" : "xx-large", "font-weight":"bold", 'text-indent':'0pt'}, + 'h2' : {"font-size" : "x-large", "font-weight":"bold", 'text-indent':'0pt'}, + 'h3' : {"font-size" : "large", "font-weight":"bold", 'text-indent':'0pt'}, + 'h4' : {"font-size" : "large", 'text-indent':'0pt'}, + 'h5' : {"font-weight" : "bold", 'text-indent':'0pt'}, + 'b' : {"font-weight" : "bold"}, + 'strong' : {"font-weight" : "bold"}, + 'i' : {"font-style" : "italic"}, + 'cite' : {'font-style' : 'italic'}, + 'em' : {"font-style" : "italic"}, + 'small' : {'font-size' : 'small'}, + 'pre' : {'font-family' : 'monospace' }, + 'code' : {'font-family' : 'monospace' }, + 'tt' : {'font-family' : 'monospace'}, + 'center' : {'text-align' : 'center'}, + 'th' : {'font-size' : 'large', 'font-weight':'bold'}, + 'big' : {'font-size' : 'large', 
'font-weight':'bold'}, + '.libprs500_dropcaps' : {'font-size': 'xx-large'}, + } + + def __init__(self, book, fonts, options, logger, path): ''' Convert HTML file at C{path} and add it to C{book}. After creating the object, you must call L{self.process_links} on it to create the links and @@ -333,57 +345,43 @@ class HTMLConverter(object): @param book: The LRF book @type book: L{libprs500.lrf.pylrs.Book} @param fonts: dict specifying the font families to use - @param path: path to the HTML file to process - @type path: C{str} ''' # Defaults for various formatting tags object.__setattr__(self, 'options', options) - self.css = dict( - h1 = {"font-size" : "xx-large", "font-weight":"bold", 'text-indent':'0pt'}, - h2 = {"font-size" : "x-large", "font-weight":"bold", 'text-indent':'0pt'}, - h3 = {"font-size" : "large", "font-weight":"bold", 'text-indent':'0pt'}, - h4 = {"font-size" : "large", 'text-indent':'0pt'}, - h5 = {"font-weight" : "bold", 'text-indent':'0pt'}, - b = {"font-weight" : "bold"}, - strong = {"font-weight" : "bold"}, - i = {"font-style" : "italic"}, - cite = {'font-style' : 'italic'}, - em = {"font-style" : "italic"}, - small = {'font-size' : 'small'}, - pre = {'font-family' : 'monospace' }, - code = {'font-family' : 'monospace' }, - tt = {'font-family' : 'monospace'}, - center = {'text-align' : 'center'}, - th = {'font-size' : 'large', 'font-weight':'bold'}, - big = {'font-size' : 'large', 'font-weight':'bold'}, - ) - self.css['.libprs500_dropcaps'] = {'font-size': 'xx-large'} self.logger = logger - self.fonts = fonts #: dict specifting font families to use - self.scaled_images = scaled_images #: Temporary files with scaled version of images - self.rotated_images = rotated_images #: Temporary files with rotated version of images - self.link_level = link_level #: Current link level + self.fonts = fonts #: dict specifying font families to use + # Memory + self.scaled_images = {} #: Temporary files with scaled version of images + self.rotated_images = {} #: 
Temporary files with rotated version of images + self.text_styles = []#: Keep track of already used textstyles + self.block_styles = []#: Keep track of already used blockstyles + self.images = {} #: Images referenced in the HTML document + self.targets = {} #: <a name=...> and id elements + self.links = {} #: <a href=...> elements + self.processed_files = [] + self.link_level = 0 #: Current link level + self.memory = [] #: Used to ensure that duplicate CSS unhandled errors are not reported + self.tops = {} #: element representing the top of each HTML file in the LRF file + # Styles self.blockquote_style = book.create_block_style(sidemargin=60, topskip=20, footskip=20) self.unindented_style = book.create_text_style(parindent=0) - self.text_styles = []#: Keep track of already used textstyles - self.block_styles = []#: Keep track of already used blockstyles - self.images = images #: Images referenced in the HTML document - self.targets = {} #: <a name=...> elements - self.links = [] #: <a href=...> elements - self.files = {} #: links that point to other files - self.links_processed = False #: Whether links_processed has been called on this object + + # Set by table processing code so that any <a name> within the table # point to the previous element self.anchor_to_previous = None self.in_table = False + # List processing self.list_level = 0 self.list_indent = 20 self.list_counter = 1 - self.memory = memory #: Used to ensure that duplicate CSS unhandled erros are not reported + self.book = book #: The Book object representing a BBeB book - self.is_root = is_root #: Are we converting the root HTML file self.lstrip_toggle = False #: If true the next add_text call will do an lstrip + self.start_on_file(path, is_root=True) + + def start_on_file(self, path, is_root=True, link_level=0): path = os.path.abspath(path) os.chdir(os.path.dirname(path)) self.file_name = os.path.basename(path) @@ -398,20 +396,25 @@ class HTMLConverter(object): if self.pdftohtml: nmassage.extend(HTMLConverter.PDFTOHTML) raw = unicode(raw, 'utf8', 'replace') - 
self.soup = BeautifulSoup(raw, + soup = BeautifulSoup(raw, convertEntities=BeautifulSoup.HTML_ENTITIES, markupMassage=nmassage) - logger.info('\tConverting to BBeB...') + self.logger.info('\tConverting to BBeB...') sys.stdout.flush() self.current_page = None self.current_para = None self.current_style = {} self.page_break_found = False - match = self.PAGE_BREAK_PAT.search(unicode(self.soup)) + match = self.PAGE_BREAK_PAT.search(unicode(soup)) if match and not re.match('avoid', match.group(1), re.IGNORECASE): self.page_break_found = True - self.parse_file() - HTMLConverter.processed_files[path] = self + self.css = HTMLConverter.CSS.copy() + self.target_prefix = path + self.links[path] = [] + self.tops[path] = self.parse_file(soup, is_root) + self.processed_files.append(path) + self.process_links(is_root, path) + def parse_css(self, style): """ @@ -480,7 +483,7 @@ class HTMLConverter(object): prop.update(self.parse_style_properties(tag["style"])) return prop - def parse_file(self): + def parse_file(self, soup, is_root): def get_valid_block(page): for item in page.contents: if isinstance(item, (Canvas, TextBlock, ImageBlock, RuledLine)): @@ -489,11 +492,11 @@ class HTMLConverter(object): self.current_page = self.book.create_page() self.current_block = self.book.create_text_block() self.current_para = Paragraph() - if self.cover and self.is_root: + if self.cover and is_root: self.add_image_page(self.cover) - self.top = self.current_block + top = self.current_block - self.process_children(self.soup, {}) + self.process_children(soup, {}) if self.current_para and self.current_block: self.current_para.append_to(self.current_block) @@ -502,16 +505,16 @@ class HTMLConverter(object): if self.current_page and self.current_page.has_text(): self.book.append(self.current_page) - if not self.top.parent: + if not top.parent: if not previous: try: previous = self.book.pages()[0] except IndexError: raise ConversionError, self.file_name + ' does not seem to have any content' - 
self.top = get_valid_block(previous) - if not self.top or not self.top.parent: + top = get_valid_block(previous) + if not top or not top.parent: raise ConversionError, self.file_name + ' does not seem to have any content' - return + return top found = False for page in self.book.pages(): @@ -519,15 +522,30 @@ class HTMLConverter(object): found = True continue if found: - self.top = get_valid_block(page) - if not self.top: + top = get_valid_block(page) + if not top: continue break - if not self.top or not self.top.parent: + if not top or not top.parent: raise ConversionError, 'Could not parse ' + self.file_name + return top - + def create_link(self, para, tag): + text = self.get_text(tag, 1000) + if not text: + text = 'Link' + img = tag.find('img') + if img: + try: + text = img['alt'] + except KeyError: + pass + + url = urlparse(tag['href']) + return {'para':para, 'text':text, 'url':url} + + def get_text(self, tag, limit=None): css = self.tag_css(tag) if (css.has_key('display') and css['display'].lower() == 'none') or \ @@ -548,7 +566,7 @@ class HTMLConverter(object): text += self.get_text(c) return text - def process_links(self): + def process_links(self, is_root, selfpath, link_level=0): def add_toc_entry(text, target): # TextBlocks in Canvases have a None parent or an Objects Parent if target.parent != None and \ @@ -590,85 +608,39 @@ class HTMLConverter(object): page.contents.remove(bs) return ans - cwd = os.getcwd() - for link in self.links: - para, tag = link.para, link.tag - text = self.get_text(tag, 1000) - # Needed for TOC entries due to bug in LRF - ascii_text = text.encode('ascii', 'replace') - if not text: - text = 'Link' - img = tag.find('img') - if img: - try: - text = img['alt'] - except KeyError: - pass - purl = urlparse(link.tag['href']) - if purl[1]: # Not a link to a file on the local filesystem - continue - path, fragment = unquote(purl[2]), purl[5] - if not path or os.path.basename(path) == self.file_name: - if fragment in self.targets.keys(): - 
tb = get_target_block(fragment, self.targets) - if self.is_root: - add_toc_entry(ascii_text, tb) - sys.stdout.flush() - jb = JumpButton(tb) - self.book.append(jb) - cb = CharButton(jb, text=text) - para.contents = [] - para.append(cb) - elif self.link_level < self.link_levels: - try: # os.access raises Exceptions in path has null bytes - if not os.access(path.encode('utf8', 'replace'), os.R_OK): - continue - except Exception: - self.logger.exception('Skipping %s', link) + cwd = os.getcwd() + for link in self.links[selfpath]: + try: + para, text, purl = link['para'], link['text'], link['url'] + # Needed for TOC entries due to bug in LRF + ascii_text = text.encode('ascii', 'replace') + if purl[1]: # Not a link to a file on the local filesystem continue - path = os.path.abspath(path) - if not path in HTMLConverter.processed_files.keys(): - try: - self.files[path] = HTMLConverter( - self.book, self.fonts, path, self.options, - self.logger, - link_level = self.link_level+1, - is_root = False, - rotated_images=self.rotated_images, - scaled_images=self.scaled_images, - images=self.images, - memory=self.memory) - HTMLConverter.processed_files[path] = self.files[path] + basepath, fragment = unquote(purl[2]), purl[5] + if not basepath: + basepath = selfpath + path = os.path.abspath(basepath) + if link_level < self.link_levels and path not in self.processed_files: + try: + self.start_on_file(path, is_root=False, link_level=link_level+1) except Exception: self.logger.warning('Unable to process %s', path) if self.verbose: self.logger.exception(' ') continue finally: - os.chdir(cwd) + os.chdir(cwd) + if path+fragment in self.targets.keys(): + tb = get_target_block(path+fragment, self.targets) else: - self.files[path] = HTMLConverter.processed_files[path] - conv = self.files[path] - if fragment in conv.targets.keys(): - tb = get_target_block(fragment, conv.targets) - else: - tb = conv.top - if self.is_root: + tb = self.tops[path] + if is_root: add_toc_entry(ascii_text, tb) jb = 
JumpButton(tb) self.book.append(jb) cb = CharButton(jb, text=text) para.contents = [] - para.append(cb) - - self.links_processed = True - - for path in self.files.keys(): - if self.files[path].links_processed: - continue - try: - os.chdir(os.path.dirname(path)) - self.files[path].process_links() + para.append(cb) finally: os.chdir(cwd) @@ -704,13 +676,18 @@ class HTMLConverter(object): def process_children(self, ptag, pcss): """ Process the children of ptag """ - for c in ptag.contents: + # Need to make a copy of contents as when + # extract is called on a child, it will + # mess up the iteration. + contents = [i for i in ptag.contents] + for c in contents: if isinstance(c, HTMLConverter.IGNORED_TAGS): continue elif isinstance(c, Tag): self.parse_tag(c, pcss) elif isinstance(c, NavigableString): self.add_text(c, pcss) + ptag.extract() def process_alignment(self, css): ''' @@ -991,22 +968,22 @@ class HTMLConverter(object): if not text.strip(): text = "Link" self.add_text(text, tag_css) - self.links.append(HTMLConverter.Link(self.current_para.contents[-1], tag)) + self.links[self.target_prefix].append(self.create_link(self.current_para.contents[-1], tag)) if tag.has_key('id') or tag.has_key('name'): key = 'name' if tag.has_key('name') else 'id' - self.targets[tag[key]] = self.current_block + self.targets[self.target_prefix+tag[key]] = self.current_block elif tag.has_key('name') or tag.has_key('id'): key = 'name' if tag.has_key('name') else 'id' if self.anchor_to_previous: self.process_children(tag, tag_css) for c in self.anchor_to_previous.contents: if isinstance(c, (TextBlock, ImageBlock)): - self.targets[tag[key]] = c + self.targets[self.target_prefix+tag[key]] = c return tb = self.book.create_text_block() tb.Paragraph(" ") self.anchor_to_previous.append(tb) - self.targets[tag[key]] = tb + self.targets[self.target_prefix+tag[key]] = tb return previous = self.current_block self.process_children(tag, tag_css) @@ -1047,7 +1024,7 @@ class HTMLConverter(object): else: 
target = BlockSpace() self.current_page.append(target) - self.targets[tag[key]] = target + self.targets[self.target_prefix+tag[key]] = target elif tagname == 'img': if tag.has_key('src') and os.access(unquote(tag['src']), os.R_OK): path = os.path.abspath(unquote(tag['src'])) @@ -1202,7 +1179,7 @@ class HTMLConverter(object): if tag.has_key('id'): target = self.book.create_text_block(textStyle=self.current_block.textStyle, blockStyle=self.current_block.blockStyle) - self.targets[tag['id']] = target + self.targets[self.target_prefix+tag['id']] = target self.end_current_block() self.current_page.append(target) src = self.get_text(tag, limit=1000) @@ -1371,8 +1348,7 @@ def process_file(path, options, logger=None): fpba = ['$', '', '$'] options.force_page_break_attr = [re.compile(fpba[0], re.IGNORECASE), fpba[1], re.compile(fpba[2], re.IGNORECASE)] - conv = HTMLConverter(book, fonts, path, options, logger) - conv.process_links() + conv = HTMLConverter(book, fonts, options, logger, path) oname = options.output if not oname: suffix = '.lrs' if options.lrs else '.lrf' @@ -1438,7 +1414,7 @@ def option_parser(): return lrf_option_parser('''Usage: %prog [options] mybook.html\n\n''' '''%prog converts mybook.html to mybook.lrf''') -def main(args=sys.argv): +def main(args=sys.argv): try: parser = option_parser() options, args = parser.parse_args(args) @@ -1453,7 +1429,8 @@ def main(args=sys.argv): warnings.defaultaction = 'error' except Exception, err: print >> sys.stderr, err - return 1 + return 1 + process_file(src, options) return 0 diff --git a/src/libprs500/ebooks/lrf/pylrs/pylrs.py b/src/libprs500/ebooks/lrf/pylrs/pylrs.py index f30722c1d1..f476c32620 100644 --- a/src/libprs500/ebooks/lrf/pylrs/pylrs.py +++ b/src/libprs500/ebooks/lrf/pylrs/pylrs.py @@ -484,6 +484,8 @@ class Book(Delegator): self.applySettings(settings, testValid=True) self.allow_new_page = True #: If False L{create_page} raises an exception + self.gc_count = 0 + def create_text_style(self, **settings): ans 
= TextStyle(**self.defaultTextStyle.attrs.copy()) @@ -1448,7 +1450,7 @@ class TextBlock(LrsObject, LrsContainer): self.blockStyle = blockStyle # create a textStyle with our current text settings (for Span to find) - self.currentTextStyle = textStyle.copy() + self.currentTextStyle = textStyle.copy() if self.textSettings else textStyle self.currentTextStyle.attrs.update(self.textSettings)