Start of new HTML to text parser.

2025-12-24 22:07:21 -05:00 · 2009-07-12 12:47:33 -04:00 · 2009-07-12 12:47:33 -04:00 · c354272030
commit c354272030
parent 1fbf2cee84
3 changed files with 105 additions and 3 deletions
--- a/src/calibre/ebooks/pml/pmlconverter.py
+++ b/src/calibre/ebooks/pml/pmlconverter.py
@ -38,7 +38,7 @@ PML_HTML_RULES = [
    (re.compile(r'\\U(?P<num>\d+)'), lambda match: '%s' % unichr(int(match.group('num'), 16))),
    (re.compile(r'\\m="(?P<name>.+?)"'), lambda match: '<img src="images/%s" />' % image_name(match.group('name')).strip('\x00')),
    (re.compile(r'\\q="(?P<target>#.+?)"(?P<text>.+?)\\q', re.DOTALL), lambda match: '<a href="%s">%s</a>' % (match.group('target'), match.group('text'))),
-    (re.compile(r'\\Q="(?P<target>.+?)"'), lambda match: '<div id="%s"></div>' % match.group('target')),
+    (re.compile(r'\\Q="(?P<target>.+?)"'), lambda match: '<span id="%s"></span>' % match.group('target')),
    (re.compile(r'\\-'), lambda match: ''),
    (re.compile(r'\\Fn="(?P<target>.+?)"(?P<text>.+?)\\Fn'), lambda match: '<a href="#footnote-%s">%s</a>' % (match.group('target'), match.group('text'))),
    (re.compile(r'\\Sd="(?P<target>.+?)"(?P<text>.+?)\\Sd'), lambda match: '<a href="#sidebar-%s">%s</a>' % (match.group('target'), match.group('text'))),
--- a/src/calibre/ebooks/txt/output.py
+++ b/src/calibre/ebooks/txt/output.py
@ -32,8 +32,12 @@ class TXTOutput(OutputFormatPlugin):
                 ])

    def convert(self, oeb_book, output_path, input_plugin, opts, log):
-        writer = TxtWriter(TxtNewlines(opts.newline).newline, log)
-        txt = writer.dump(oeb_book.spine)
+#        writer = TxtWriter(TxtNewlines(opts.newline).newline, log)
+#        txt = writer.dump(oeb_book.spine)
+
+        from calibre.ebooks.txt.txtml import TXTMLizer
+        writer = TXTMLizer(log)
+        txt = writer.extract_content(oeb_book, opts)

        close = False
        if not hasattr(output_path, 'write'):
--- a/src/calibre/ebooks/txt/txtml.py
+++ b/src/calibre/ebooks/txt/txtml.py
@ -0,0 +1,98 @@
+# -*- coding: utf-8 -*-
+
+__license__ = 'GPL 3'
+__copyright__ = '2009, John Schember <john@nachtimwald.com>'
+__docformat__ = 'restructuredtext en'
+
+'''
+Transform OEB content into plain text
+'''
+
+import os
+
+from lxml import etree
+
+from calibre.ebooks.oeb.base import XHTML, XHTML_NS, barename, namespace
+from calibre.ebooks.oeb.stylizer import Stylizer
+
+# Bare (namespace-less) XHTML tag names that TXTMLizer.dump_text treats as
+# paragraph-level blocks; a blank line is emitted after each such element.
+BLOCK_TAGS = [
+    'div',
+    'p',
+    'h1',
+    'h2',
+    'h3',
+    'h4',
+    'h5',
+    'h6',
+    'li',
+]
+
+# Values of the computed ``display`` style that also mark an element as a
+# block, whatever its tag name is.
+BLOCK_STYLES = [
+    'block',
+]
+
+class TXTMLizer(object):
+    '''
+    Convert the XHTML content of an OEB book's spine into plain text.
+
+    Typical use: ``TXTMLizer(log).extract_content(oeb_book, opts)``.
+    '''
+
+    def __init__(self, log):
+        '''
+        :param log: Logger object; its ``info`` and ``debug`` methods are
+                    called while converting.
+        '''
+        self.log = log
+
+    def extract_content(self, oeb_book, opts):
+        '''
+        Convert every item in the spine of ``oeb_book`` to text.
+
+        :param oeb_book: Book whose ``spine`` is iterated.
+        :param opts: Conversion options; ``opts.output_profile`` is handed
+                     to the Stylizer.
+        :return: The text of all spine items, concatenated in spine order.
+        '''
+        self.log.info('Converting XHTML to TXT...')
+        self.oeb_book = oeb_book
+        self.opts = opts
+        return self.mlize_spine()
+
+    def mlize_spine(self):
+        '''
+        Dump each spine item to text and return the concatenated result.
+        '''
+        parts = []
+        for item in self.oeb_book.spine:
+            self.log.debug('Converting %s to TXT...' % item.href)
+            stylizer = Stylizer(item.data, item.href, self.oeb_book, self.opts.output_profile)
+            # Serialise <body>, flatten the line breaks, then re-parse so
+            # that dump_text never sees source-formatting newlines.
+            # NOTE(review): the stylizer is built from item.data but
+            # dump_text walks the re-parsed copy -- confirm that
+            # Stylizer.style() resolves elements of the copy as intended.
+            content = etree.tostring(item.data.find(XHTML('body')), encoding=unicode)
+            content = self.remove_newlines(content)
+            parts.append(self.dump_text(etree.fromstring(content), stylizer))
+
+        return u''.join(parts)
+
+    def remove_newlines(self, text):
+        '''
+        Return ``text`` with every CRLF, LF and CR replaced by one space.
+        '''
+        self.log.debug('\tRemove newlines for processing...')
+        # CRLF must be handled first so it collapses to a single space.
+        text = text.replace('\r\n', ' ')
+        text = text.replace('\n', ' ')
+        text = text.replace('\r', ' ')
+
+        return text
+
+    def _tail_text(self, elem):
+        '''
+        Return the tail of ``elem`` (the text that follows its end tag), or
+        an empty string when there is no tail or it is only whitespace.
+        '''
+        tail = getattr(elem, 'tail', None)
+        if tail is not None and tail.strip() != '':
+            return tail
+        return u''
+
+    def dump_text(self, elem, stylizer):
+        '''
+        Recursively collect the text of ``elem`` and its children.
+
+        :param elem: Node of the parsed XHTML tree.
+        :param stylizer: Object whose ``style(elem)`` result is indexed for
+                         ``'display'`` and ``'visibility'``.
+        :return: The text of the element, followed by two line separators
+                 if it is a block, followed by its tail text.
+        '''
+        # Nodes whose tag is not a string (e.g. comments in lxml) and
+        # elements outside the XHTML namespace contribute no text of their
+        # own, but the tail that follows them belongs to the parent's
+        # visible content and must not be lost.
+        if not isinstance(elem.tag, basestring) \
+           or namespace(elem.tag) != XHTML_NS:
+            return self._tail_text(elem)
+
+        style = stylizer.style(elem)
+
+        # Hidden elements are skipped as a whole; their tail is still
+        # visible text of the parent.
+        if style['display'] in ('none', 'oeb-page-head', 'oeb-page-foot') \
+           or style['visibility'] == 'hidden':
+            return self._tail_text(elem)
+
+        # Are we in a paragraph block?
+        in_block = barename(elem.tag) in BLOCK_TAGS \
+            or style['display'] in BLOCK_STYLES
+
+        parts = []
+
+        # Process tags that contain text. Whitespace-only text is dropped.
+        lead = getattr(elem, 'text', None)
+        if lead is not None and lead.strip() != '':
+            parts.append(lead)
+
+        for item in elem:
+            parts.append(self.dump_text(item, stylizer))
+
+        # A blank line separates a block from whatever follows it.
+        if in_block:
+            parts.append(os.linesep + os.linesep)
+
+        parts.append(self._tail_text(elem))
+
+        return u''.join(parts)