TXT output: Optimize string manipulation.

This commit is contained in:
John Schember 2009-09-02 17:05:11 -04:00
parent b6c7517f86
commit 34e9857ab0
2 changed files with 34 additions and 19 deletions

View File

@@ -19,7 +19,7 @@ class TxtNewlines(object):
self.newline = self.NEWLINE_TYPES.get(newline_type.lower(), os.linesep) self.newline = self.NEWLINE_TYPES.get(newline_type.lower(), os.linesep)
def specified_newlines(newline, text): def specified_newlines(newline, text):
if newline == os.linesep: if newline == '\n':
return text return text
return text.replace(os.linesep, newline) return text.replace(os.linesep, newline)

View File

@@ -8,7 +8,8 @@ __docformat__ = 'restructuredtext en'
Transform OEB content into plain text Transform OEB content into plain text
''' '''
import os, re import os
import re
from lxml import etree from lxml import etree
@@ -43,15 +44,15 @@ class TXTMLizer(object):
return self.mlize_spine() return self.mlize_spine()
def mlize_spine(self): def mlize_spine(self):
output = u'' output = [u'']
output += self.get_toc() output.append(self.get_toc())
for item in self.oeb_book.spine: for item in self.oeb_book.spine:
self.log.debug('Converting %s to TXT...' % item.href) self.log.debug('Converting %s to TXT...' % item.href)
stylizer = Stylizer(item.data, item.href, self.oeb_book, self.opts.output_profile) stylizer = Stylizer(item.data, item.href, self.oeb_book, self.opts.output_profile)
content = unicode(etree.tostring(item.data.find(XHTML('body')), encoding=unicode)) content = unicode(etree.tostring(item.data.find(XHTML('body')), encoding=unicode))
content = self.remove_newlines(content) content = self.remove_newlines(content)
output += self.dump_text(etree.fromstring(content), stylizer) output.append(self.get_text(etree.fromstring(content), stylizer))
output = self.cleanup_text(output) output = self.cleanup_text(u''.join(output))
return output return output
@@ -64,13 +65,13 @@ class TXTMLizer(object):
return text return text
def get_toc(self): def get_toc(self):
toc = u'' toc = [u'']
if getattr(self.opts, 'inline_toc', None): if getattr(self.opts, 'inline_toc', None):
self.log.debug('Generating table of contents...') self.log.debug('Generating table of contents...')
toc += u'%s\n\n' % _(u'Table of Contents:') toc.append(u'%s\n\n' % _(u'Table of Contents:'))
for item in self.oeb_book.toc: for item in self.oeb_book.toc:
toc += u'* %s\n\n' % item.title toc.append(u'* %s\n\n' % item.title)
return toc return ''.join(toc)
def cleanup_text(self, text): def cleanup_text(self, text):
self.log.debug('\tClean up text...') self.log.debug('\tClean up text...')
@@ -99,6 +100,17 @@ class TXTMLizer(object):
return text return text
def get_text(self, elem, stylizer):
'''
@elem: The element in the etree that we are working on.
@stylizer: The style information attached to the element.
@end: The last two characters of the text from the previous element.
This is used to determine if a blank line is needed when starting
a new block element.
'''
return u''.join(self.dump_text(elem, stylizer))
def dump_text(self, elem, stylizer, end=''): def dump_text(self, elem, stylizer, end=''):
''' '''
@elem: The element in the etree that we are working on. @elem: The element in the etree that we are working on.
@@ -110,14 +122,14 @@ class TXTMLizer(object):
if not isinstance(elem.tag, basestring) \ if not isinstance(elem.tag, basestring) \
or namespace(elem.tag) != XHTML_NS: or namespace(elem.tag) != XHTML_NS:
return u'' return ['']
text = u'' text = ['']
style = stylizer.style(elem) style = stylizer.style(elem)
if style['display'] in ('none', 'oeb-page-head', 'oeb-page-foot') \ if style['display'] in ('none', 'oeb-page-head', 'oeb-page-foot') \
or style['visibility'] == 'hidden': or style['visibility'] == 'hidden':
return u'' return ['']
tag = barename(elem.tag) tag = barename(elem.tag)
in_block = False in_block = False
@@ -125,20 +137,23 @@ class TXTMLizer(object):
# Are we in a paragraph block? # Are we in a paragraph block?
if tag in BLOCK_TAGS or style['display'] in BLOCK_STYLES: if tag in BLOCK_TAGS or style['display'] in BLOCK_STYLES:
in_block = True in_block = True
if not end.endswith(os.linesep + os.linesep) and hasattr(elem, 'text') and elem.text != None and elem.text.strip() != '': if not end.endswith('\n\n') and hasattr(elem, 'text') and elem.text != None and elem.text.strip() != '':
text += os.linesep + os.linesep text.append('\n\n')
# Proccess tags that contain text. # Proccess tags that contain text.
if hasattr(elem, 'text') and elem.text != None and elem.text.strip() != '': if hasattr(elem, 'text') and elem.text != None and elem.text.strip() != '':
text += elem.text text.append(elem.text)
for item in elem: for item in elem:
text += self.dump_text(item, stylizer, text[-2:]) en = u''
if len(text) >= 2:
en = text[-1][-2:]
text += self.dump_text(item, stylizer, en)
if in_block: if in_block:
text += os.linesep + os.linesep text.append('\n\n')
if hasattr(elem, 'tail') and elem.tail != None and elem.tail.strip() != '': if hasattr(elem, 'tail') and elem.tail != None and elem.tail.strip() != '':
text += elem.tail text.append(elem.tail)
return text return text