New TXT output processor.

2025-07-09 03:04:10 -04:00 · 2009-07-12 20:22:19 -04:00 · 2009-07-12 20:22:19 -04:00 · e09193a48f
commit e09193a48f
parent 97c1b8a0c3
4 changed files with 72 additions and 157 deletions
--- a/src/calibre/ebooks/txt/newlines.py
+++ b/src/calibre/ebooks/txt/newlines.py
@@ -0,0 +1,25 @@
+# -*- coding: utf-8 -*-
+
+__license__ = 'GPL 3'
+__copyright__ = '2009, John Schember <john@nachtimwald.com>'
+__docformat__ = 'restructuredtext en'
+
+import os
+
+class TxtNewlines(object):  # Resolves a newline-type name to the character sequence it selects.
+
+    NEWLINE_TYPES = {  # Recognized type names mapped to their line terminator.
+                        'system'  : os.linesep,
+                        'unix'    : '\n',
+                        'old_mac' : '\r',
+                        'windows' : '\r\n'
+                     }
+
+    def __init__(self, newline_type):  # newline_type: one of the NEWLINE_TYPES keys, in any letter case.
+        self.newline = self.NEWLINE_TYPES.get(newline_type.lower(), os.linesep)  # Unrecognized names fall back to os.linesep.
+
+def specified_newlines(newline, text):  # Return text with every os.linesep replaced by newline.
+    if newline == os.linesep:  # Requested terminator equals the platform one, so the text is returned unchanged.
+        return text
+
+    return text.replace(os.linesep, newline)  # Only os.linesep occurrences are converted; other line-break sequences in text are left as they are.
--- a/src/calibre/ebooks/txt/output.py
+++ b/src/calibre/ebooks/txt/output.py
@@ -8,7 +8,8 @@ import os

 from calibre.customize.conversion import OutputFormatPlugin, \
    OptionRecommendation
-from calibre.ebooks.txt.writer import TxtWriter, TxtNewlines
+from calibre.ebooks.txt.txtml import TXTMLizer
+from calibre.ebooks.txt.newlines import TxtNewlines, specified_newlines

 class TXTOutput(OutputFormatPlugin):

@@ -32,13 +33,12 @@ class TXTOutput(OutputFormatPlugin):
                 ])

    def convert(self, oeb_book, output_path, input_plugin, opts, log):
-#        writer = TxtWriter(TxtNewlines(opts.newline).newline, log)
-#        txt = writer.dump(oeb_book.spine)
-
-        from calibre.ebooks.txt.txtml import TXTMLizer
        writer = TXTMLizer(log)
        txt = writer.extract_content(oeb_book, opts)
        
+        log.debug('\tReplacing newlines with selected type...')
+        txt = specified_newlines(TxtNewlines(opts.newline).newline, txt)
+
        close = False
        if not hasattr(output_path, 'write'):
            close = True
--- a/src/calibre/ebooks/txt/txtml.py
+++ b/src/calibre/ebooks/txt/txtml.py
@@ -8,7 +8,7 @@ __docformat__ = 'restructuredtext en'
 Transform OEB content into plain text
 '''

-import os
+import os, re

 from lxml import etree

@@ -32,6 +32,7 @@ BLOCK_STYLES = [
 ]

 class TXTMLizer(object):
+    
    def __init__(self, log):
        self.log = log

@@ -49,6 +50,7 @@ class TXTMLizer(object):
            content = unicode(etree.tostring(item.data.find(XHTML('body')), encoding=unicode))
            content = self.remove_newlines(content)
            output += self.dump_text(etree.fromstring(content), stylizer)
+        output = self.cleanup_text(output)

        return output

@@ -60,7 +62,42 @@ class TXTMLizer(object):

        return text

-    def dump_text(self, elem, stylizer):
+    def cleanup_text(self, text):  # Normalize characters and whitespace in text and return the cleaned string.
+        self.log.debug('\tClean up text...')
+        # Replace bad characters: drop U+00C2 and turn U+00A0 into a plain space.
+        text = text.replace(u'\xc2', '')
+        text = text.replace(u'\xa0', ' ')
+
+        # Replace tabs, vertical tabs and form feeds with single space.
+        text = text.replace('\t+', ' ')  # NOTE(review): str.replace is literal, so this matches only a tab followed by '+'; a regex was probably intended - confirm.
+        text = text.replace('\v+', ' ')  # NOTE(review): same literal-match caveat for vertical tab followed by '+'.
+        text = text.replace('\f+', ' ')  # NOTE(review): same literal-match caveat for form feed followed by '+'.
+
+        # Single line paragraph: an os.linesep with a non-newline character on each side becomes a space.
+        text = re.sub('(?<=.)%s(?=.)' % os.linesep, ' ', text)
+
+        # Collapse each run of spaces into a single space.
+        text = re.sub('[  ]+', ' ', text)
+
+        # Remove excessive newlines (both substitutions below are commented out, so this step does not run).
+        #text = re.sub('\n[ ]+\n', '\n\n', text)
+        #text = re.sub('\n{3,}', '\n\n', text)
+
+        # Strip spaces at the beginning and end of every line (the m flag makes ^ and $ match per line).
+        text = re.sub('(?imu)^[ ]+', '', text)
+        text = re.sub('(?imu)[ ]+$', '', text)
+
+        return text
+
+    def dump_text(self, elem, stylizer, end=''):
+        '''
+        @elem: The element in the etree that we are working on.
+        @stylizer: The style information attached to the element.
+        @end: The last two characters of the text from the previous element.
+              This is used to determine if a blank line is needed when starting
+              a new block element.
+        '''
+
        if not isinstance(elem.tag, basestring) \
           or namespace(elem.tag) != XHTML_NS:
            return u''
@@ -78,16 +115,15 @@ class TXTMLizer(object):
        # Are we in a paragraph block?
        if tag in BLOCK_TAGS or style['display'] in BLOCK_STYLES:
            in_block = True
-            #if not text.endswith(os.linesep) and hasattr(elem, 'text') and elem.text != None and elem.text.strip() != '':
-            #    print '"%s"' % text
-            #    text += os.linesep + os.linesep
+            if not end.endswith(os.linesep + os.linesep) and hasattr(elem, 'text') and elem.text != None and elem.text.strip() != '':
+                text += os.linesep + os.linesep

        # Proccess tags that contain text.
        if hasattr(elem, 'text') and elem.text != None and elem.text.strip() != '':
            text += elem.text

        for item in elem:
-            text += self.dump_text(item, stylizer)
+            text += self.dump_text(item, stylizer, text[-2:])

        if in_block:
            text += os.linesep + os.linesep
--- a/src/calibre/ebooks/txt/writer.py
+++ b/src/calibre/ebooks/txt/writer.py
@@ -1,146 +0,0 @@
-# -*- coding: utf-8 -*-
-
-__license__   = 'GPL v3'
-__copyright__ = '2009, John Schember <john@nachtimwald.com>'
-__docformat__ = 'restructuredtext en'
-
-'''
-Write content to TXT.
-'''
-
-import os
-import re
-
-from lxml import etree
-
-from calibre import entity_to_unicode
-from calibre.ebooks.oeb.base import XHTML
-
-class TxtWriter(object):
-    def __init__(self, newline, log):
-        self.newline = newline
-        self.log = log
-
-    def dump(self, spine):
-        out = u''
-        for item in spine:
-            self.log.debug('Processing %s...' % item.href)
-            content = unicode(etree.tostring(item.data.find(XHTML('body')), encoding=unicode))
-            content = self.remove_newlines(content)
-            content = self.strip_html(content)
-            content = self.replace_html_symbols(content)
-            content = self.cleanup_text(content)
-            content = self.specified_newlines(content)
-            out += content
-
-            # Put two blank lines at end of file
-            end = out[-3 * len(self.newline):]
-            for i in range(3 - end.count(self.newline)):
-                out += self.newline
-
-        return out
-
-    def strip_html(self, text):
-        self.log.debug('\tStripping html...')
-        stripped = u''
-
-        # Remove unnecessary tags
-        for tag in ['script', 'style']:
-            text = re.sub('(?imu)<[ ]*%s[ ]*.*?>(.*)</[ ]*%s[ ]*>' % (tag, tag), '', text)
-        text = re.sub('<!--.*-->', '', text)
-        text = re.sub('<\?.*?\?>', '', text)
-        text = re.sub('<@.*?@>', '', text)
-        text = re.sub('<%.*?%>', '', text)
-
-        # Headings usually indicate Chapters.
-        # We are going to use a marker to insert the proper number of
-        # newline characters at the end of cleanup_text because cleanup_text
-        # remove excessive (more than 2 newlines).
-        for tag in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']:
-            text = re.sub('(?imu)<[ ]*%s[ ]*.*?>' % tag, '-vzxedxy-', text)
-            text = re.sub('(?imu)</[ ]*%s[ ]*>' % tag, '-vlgzxey-', text)
-
-        # Separate content with space.
-        for tag in ['td']:
-            text = re.sub('(?imu)</[ ]*%s[ ]*>', ' ', text)
-
-        # Separate content with empty line.
-        for tag in ['p', 'div', 'pre', 'li', 'table', 'tr']:
-            text = re.sub('(?imu)</[ ]*%s[ ]*>' % tag, '\n\n', text)
-
-        for tag in ['hr', 'br']:
-            text = re.sub('(?imu)<[ ]*%s.*?>' % tag, '\n\n', text)
-
-        # Remove any tags that do not need special processing.
-        text = re.sub('<.*?>', '', text)
-
-        stripped = stripped + text
-
-        return stripped
-
-    def replace_html_symbols(self, content):
-        self.log.debug('\tReplacing entities with unicode...')
-        for entity in set(re.findall('&.+?;', content)):
-            mo = re.search('(%s)' % entity[1:-1], content)
-            content = content.replace(entity, entity_to_unicode(mo))
-
-        return content
-
-    def cleanup_text(self, text):
-        self.log.debug('\tClean up text...')
-        # Replace bad characters.
-        text = text.replace(u'\xc2', '')
-        text = text.replace(u'\xa0', ' ')
-
-        # Replace tabs, vertical tags and form feeds with single space.
-        text = text.replace('\t+', ' ')
-        text = text.replace('\v+', ' ')
-        text = text.replace('\f+', ' ')
-
-        # Single line paragraph.
-        text = re.sub('(?<=.)\n(?=.)', ' ', text)
-
-        # Remove multiple spaces.
-        text = re.sub('[  ]+', ' ', text)
-
-        # Remove excessive newlines.
-        text = re.sub('\n[ ]+\n', '\n\n', text)
-        text = re.sub('\n{3,}', '\n\n', text)
-
-        # Replace markers with the proper characters.
-        text = text.replace('-vzxedxy-', '\n\n\n\n\n')
-        text = text.replace('-vlgzxey-', '\n\n\n')
-
-        # Replace spaces at the beginning and end of lines
-        text = re.sub('(?imu)^[ ]+', '', text)
-        text = re.sub('(?imu)[ ]+$', '', text)
-
-        return text
-
-    def remove_newlines(self, text):
-        self.log.debug('\tRemove newlines for processing...')
-        text = text.replace('\r\n', ' ')
-        text = text.replace('\n', ' ')
-        text = text.replace('\r', ' ')
-
-        return text
-
-    def specified_newlines(self, text):
-        self.log.debug('\tReplacing newlines with selected type...')
-        if self.newline == '\n':
-            return text
-
-        return text.replace('\n', self.newline)
-
-
-class TxtNewlines(object):
-    NEWLINE_TYPES = {
-                        'system'  : os.linesep,
-                        'unix'    : '\n',
-                        'old_mac' : '\r',
-                        'windows' : '\r\n'
-                     }
-
-    def __init__(self, newline_type):
-        self.newline = self.NEWLINE_TYPES.get(newline_type.lower(), os.linesep)
-