diff --git a/src/calibre/ebooks/pml/pmlconverter.py b/src/calibre/ebooks/pml/pmlconverter.py index 2ca38176d5..a96adc5772 100644 --- a/src/calibre/ebooks/pml/pmlconverter.py +++ b/src/calibre/ebooks/pml/pmlconverter.py @@ -38,7 +38,7 @@ PML_HTML_RULES = [ (re.compile(r'\\U(?P\d+)'), lambda match: '%s' % unichr(int(match.group('num'), 16))), (re.compile(r'\\m="(?P.+?)"'), lambda match: '' % image_name(match.group('name')).strip('\x00')), (re.compile(r'\\q="(?P#.+?)"(?P.+?)\\q', re.DOTALL), lambda match: '%s' % (match.group('target'), match.group('text'))), - (re.compile(r'\\Q="(?P.+?)"'), lambda match: '
' % match.group('target')), + (re.compile(r'\\Q="(?P.+?)"'), lambda match: '' % match.group('target')), (re.compile(r'\\-'), lambda match: ''), (re.compile(r'\\Fn="(?P.+?)"(?P.+?)\\Fn'), lambda match: '%s' % (match.group('target'), match.group('text'))), (re.compile(r'\\Sd="(?P.+?)"(?P.+?)\\Sd'), lambda match: '%s' % (match.group('target'), match.group('text'))), diff --git a/src/calibre/ebooks/txt/output.py b/src/calibre/ebooks/txt/output.py index 6cb854df10..f1767700e0 100644 --- a/src/calibre/ebooks/txt/output.py +++ b/src/calibre/ebooks/txt/output.py @@ -32,8 +32,12 @@ class TXTOutput(OutputFormatPlugin): ]) def convert(self, oeb_book, output_path, input_plugin, opts, log): - writer = TxtWriter(TxtNewlines(opts.newline).newline, log) - txt = writer.dump(oeb_book.spine) +# writer = TxtWriter(TxtNewlines(opts.newline).newline, log) +# txt = writer.dump(oeb_book.spine) + + from calibre.ebooks.txt.txtml import TXTMLizer + writer = TXTMLizer(log) + txt = writer.extract_content(oeb_book, opts) close = False if not hasattr(output_path, 'write'): diff --git a/src/calibre/ebooks/txt/txtml.py b/src/calibre/ebooks/txt/txtml.py new file mode 100644 index 0000000000..5bc7ed45f8 --- /dev/null +++ b/src/calibre/ebooks/txt/txtml.py @@ -0,0 +1,98 @@ +# -*- coding: utf-8 -*- + +__license__ = 'GPL 3' +__copyright__ = '2009, John Schember ' +__docformat__ = 'restructuredtext en' + +''' +Transform OEB content into plain text +''' + +import os + +from lxml import etree + +from calibre.ebooks.oeb.base import XHTML, XHTML_NS, barename, namespace +from calibre.ebooks.oeb.stylizer import Stylizer + +BLOCK_TAGS = [ + 'div', + 'p', + 'h1', + 'h2', + 'h3', + 'h4', + 'h5', + 'h6', + 'li', +] + +BLOCK_STYLES = [ + 'block', +] + +class TXTMLizer(object): + def __init__(self, log): + self.log = log + + def extract_content(self, oeb_book, opts): + self.log.info('Converting XHTML to PML markup...') + self.oeb_book = oeb_book + self.opts = opts + return self.mlize_spine() + + def 
def mlize_spine(self):
    '''
    Convert every item in the spine of ``self.oeb_book`` to text.

    For each spine item the XHTML ``body`` element is serialized, has its
    newlines replaced by spaces, is parsed again and is then walked with
    :meth:`dump_text`.

    :return: The text of all spine items, concatenated in spine order.
    '''
    chunks = []
    for item in self.oeb_book.spine:
        self.log.debug('Converting %s to TXT...' % item.href)
        stylizer = Stylizer(item.data, item.href, self.oeb_book, self.opts.output_profile)
        content = unicode(etree.tostring(item.data.find(XHTML('body')), encoding=unicode))
        content = self.remove_newlines(content)
        # NOTE(review): the stylizer is built from item.data, but the tree
        # walked below is a re-parsed copy of the body -- confirm that
        # Stylizer.style() resolves the same styles for the copied elements.
        chunks.append(self.dump_text(etree.fromstring(content), stylizer))

    # Join once at the end instead of growing a string on every iteration.
    return u''.join(chunks)


def remove_newlines(self, text):
    '''
    Replace every line break in ``text`` with a single space.

    ``\\r\\n`` is handled first so that a Windows line ending becomes one
    space rather than two.

    :param text: The string to process.
    :return: ``text`` with ``\\r\\n``, ``\\n`` and ``\\r`` each replaced by
        a space.
    '''
    self.log.debug('\tRemove newlines for processing...')
    return text.replace('\r\n', ' ').replace('\n', ' ').replace('\r', ' ')


def dump_text(self, elem, stylizer):
    '''
    Recursively collect the text of ``elem`` and its descendants.

    An element contributes nothing when its tag is not a string, when it
    is outside the XHTML namespace, or when its style hides it (``display``
    of ``none``/``oeb-page-head``/``oeb-page-foot`` or ``visibility`` of
    ``hidden``). Block-level elements (tag in ``BLOCK_TAGS`` or ``display``
    in ``BLOCK_STYLES``) are followed by two ``os.linesep`` sequences.

    :param elem: The element to convert.
    :param stylizer: Object whose ``style(elem)`` result is indexed for
        the ``display`` and ``visibility`` values of ``elem``.
    :return: The collected text as a unicode string.
    '''
    if not isinstance(elem.tag, basestring) \
        or namespace(elem.tag) != XHTML_NS:
        return u''

    style = stylizer.style(elem)
    if style['display'] in ('none', 'oeb-page-head', 'oeb-page-foot') \
        or style['visibility'] == 'hidden':
        return u''

    # Are we in a paragraph block?
    in_block = barename(elem.tag) in BLOCK_TAGS \
        or style['display'] in BLOCK_STYLES

    parts = []

    # Process tags that contain text. Whitespace-only text is skipped.
    lead = getattr(elem, 'text', None)
    if lead and lead.strip():
        parts.append(lead)

    for child in elem:
        parts.append(self.dump_text(child, stylizer))

    if in_block:
        parts.append(os.linesep + os.linesep)

    # The tail is the text that follows this element's closing tag, so it
    # is emitted after the block separator. Whitespace-only tails are
    # skipped as well.
    trail = getattr(elem, 'tail', None)
    if trail and trail.strip():
        parts.append(trail)

    return u''.join(parts)