Initial TXTZ format support

2025-08-30 23:00:21 -04:00 · 2011-02-07 15:43:50 -07:00 · 2011-02-07 15:43:50 -07:00 · b8ec84468d
commit b8ec84468d
parent 8a93808ccc ad32cd1d72
9 changed files with 143 additions and 21 deletions
--- a/src/calibre/customize/builtins.py
+++ b/src/calibre/customize/builtins.py
@ -325,6 +325,17 @@ class TXTMetadataReader(MetadataReaderPlugin):
        from calibre.ebooks.metadata.txt import get_metadata
        return get_metadata(stream)

+class TXTZMetadataReader(MetadataReaderPlugin):
+
+    # Metadata reader plugin declared for the 'txtz' file type.
+    name = 'Read TXTZ metadata'
+    author = 'John Schember'
+    description = _('Read metadata from %s files') % 'TXTZ'
+    file_types = set(['txtz'])
+
+    def get_metadata(self, stream, ftype):
+        # Imported at call time, as the neighbouring TXT reader does;
+        # ftype is not consulted because this plugin only handles txtz.
+        from calibre.ebooks.metadata import txtz
+        return txtz.get_metadata(stream)
+
 class ZipMetadataReader(MetadataReaderPlugin):

    name = 'Read ZIP metadata'
@ -412,6 +423,17 @@ class TOPAZMetadataWriter(MetadataWriterPlugin):
        from calibre.ebooks.metadata.topaz import set_metadata
        set_metadata(stream, mi)

+class TXTZMetadataWriter(MetadataWriterPlugin):
+
+    # Metadata writer plugin declared for the 'txtz' file type.
+    name        = 'Set TXTZ metadata'
+    file_types  = set(['txtz'])
+    # A writer stores metadata *in* the file; the earlier "from" wording
+    # appears to have been carried over from the reader's description.
+    description = _('Set metadata in %s files') % 'TXTZ'
+    author      = 'John Schember'
+
+    def set_metadata(self, stream, mi, type):
+        # The third argument is not used here: this plugin only handles
+        # txtz. Its name shadows the builtin but is kept as-is so callers
+        # passing it by keyword keep working.
+        from calibre.ebooks.metadata.txtz import set_metadata
+        set_metadata(stream, mi)
+
 # }}}

 from calibre.ebooks.comic.input import ComicInput
@ -446,6 +468,7 @@ from calibre.ebooks.rb.output import RBOutput
 from calibre.ebooks.rtf.output import RTFOutput
 from calibre.ebooks.tcr.output import TCROutput
 from calibre.ebooks.txt.output import TXTOutput
+from calibre.ebooks.txt.output import TXTZOutput
 from calibre.ebooks.html.output import HTMLOutput
 from calibre.ebooks.snb.output import SNBOutput

@ -531,6 +554,7 @@ plugins += [
    RTFOutput,
    TCROutput,
    TXTOutput,
+    TXTZOutput,
    HTMLOutput,
    SNBOutput,
 ]
--- a/src/calibre/ebooks/init.py
+++ b/src/calibre/ebooks/init.py
@ -25,7 +25,7 @@ class DRMError(ValueError):
 class ParserError(ValueError):
    pass

-BOOK_EXTENSIONS = ['lrf', 'rar', 'zip', 'rtf', 'lit', 'txt', 'htm', 'xhtm',
+BOOK_EXTENSIONS = ['lrf', 'rar', 'zip', 'rtf', 'lit', 'txt', 'txtz', 'htm', 'xhtm',
                   'html', 'xhtml', 'pdf', 'pdb', 'pdr', 'prc', 'mobi', 'azw', 'doc',
                   'epub', 'fb2', 'djvu', 'lrx', 'cbr', 'cbz', 'cbc', 'oebzip',
                   'rb', 'imp', 'odt', 'chm', 'tpz', 'azw1', 'pml', 'mbp', 'tan', 'snb']
--- a/src/calibre/ebooks/metadata/txt.py
+++ b/src/calibre/ebooks/metadata/txt.py
@ -1,16 +1,20 @@
-'''Read meta information from TXT files'''
-
-from __future__ import with_statement
+# -*- coding: utf-8 -*-

 __license__   = 'GPL v3'
 __copyright__ = '2009, John Schember <john@nachtimwald.com>'

+'''
+Read meta information from TXT files
+'''
+
 import re

 from calibre.ebooks.metadata import MetaInformation

 def get_metadata(stream, extract_cover=True):
-    """ Return metadata as a L{MetaInfo} object """
+    '''
+    Return metadata as a L{MetaInfo} object
+    '''
    mi = MetaInformation(_('Unknown'), [_('Unknown')])
    stream.seek(0)

--- a/src/calibre/ebooks/metadata/txtz.py
+++ b/src/calibre/ebooks/metadata/txtz.py
@ -0,0 +1,38 @@
+# -*- coding: utf-8 -*-
+
+__license__   = 'GPL v3'
+__copyright__ = '2011, John Schember <john@nachtimwald.com>'
+
+'''
+Read meta information from TXTZ files
+'''
+
+import os
+
+from cStringIO import StringIO
+
+from calibre.ebooks.metadata import MetaInformation
+from calibre.ebooks.metadata.opf2 import OPF, metadata_to_opf
+from calibre.ptempfile import TemporaryDirectory
+from calibre.utils.zipfile import ZipFile, safe_replace
+
+def get_metadata(stream, extract_cover=True):
+    '''
+    Return the metadata stored in the TXTZ archive read from stream.
+
+    The archive's metadata.opf entry is extracted into a temporary
+    directory and parsed with OPF. When that fails for any ordinary reason
+    (stream is not a readable archive, the entry is missing, the OPF cannot
+    be parsed) a placeholder MetaInformation with an 'Unknown' title and
+    author is returned instead.
+
+    extract_cover is accepted but not used by this function.
+    '''
+    mi = MetaInformation(_('Unknown'), [_('Unknown')])
+    stream.seek(0)
+
+    with TemporaryDirectory('_untxtz_mdata') as tdir:
+        try:
+            zf = ZipFile(stream)
+            zf.extract('metadata.opf', tdir)
+            with open(os.path.join(tdir, 'metadata.opf'), 'rb') as opff:
+                mi = OPF(opff).to_book_metadata()
+        except Exception:
+            # Best effort: keep the placeholder metadata. Catching Exception
+            # rather than using a bare except lets KeyboardInterrupt and
+            # SystemExit propagate instead of being silently discarded.
+            pass
+    return mi
+
+def set_metadata(stream, mi):
+    '''
+    Serialise mi with metadata_to_opf and hand it to safe_replace as the
+    new content for the 'metadata.opf' entry of the archive in stream.
+    '''
+    safe_replace(stream, 'metadata.opf', StringIO(metadata_to_opf(mi)))
--- a/src/calibre/ebooks/txt/input.py
+++ b/src/calibre/ebooks/txt/input.py
@ -4,8 +4,10 @@ __license__ = 'GPL 3'
 __copyright__ = '2009, John Schember <john@nachtimwald.com>'
 __docformat__ = 'restructuredtext en'

+import glob
 import os

+from calibre import _ent_pat, xml_entity_to_unicode
 from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation
 from calibre.ebooks.conversion.preprocess import DocAnalysis, Dehyphenator
 from calibre.ebooks.chardet import detect
@ -13,14 +15,15 @@ from calibre.ebooks.txt.processor import convert_basic, convert_markdown, \
    separate_paragraphs_single_line, separate_paragraphs_print_formatted, \
    preserve_spaces, detect_paragraph_type, detect_formatting_type, \
    normalize_line_endings, convert_textile, remove_indents, block_to_single_line
-from calibre import _ent_pat, xml_entity_to_unicode
+from calibre.ptempfile import TemporaryDirectory
+from calibre.utils.zipfile import ZipFile

 class TXTInput(InputFormatPlugin):

    name        = 'TXT Input'
    author      = 'John Schember'
    description = 'Convert TXT files to HTML'
-    file_types  = set(['txt'])
+    file_types  = set(['txt', 'txtz'])

    options = set([
        OptionRecommendation(name='paragraph_type', recommended_value='auto',
@ -57,10 +60,23 @@ class TXTInput(InputFormatPlugin):
    def convert(self, stream, options, file_ext, log,
                accelerators):
        self.log = log
+        txt = ''
        log.debug('Reading text from file...')
        length = 0

-        txt = stream.read()
+        # Extract content from zip archive.
+        if file_ext == 'txtz':
+            log.debug('De-compressing content to temporary directory...')
+            with TemporaryDirectory('_untxtz') as tdir:
+                zf = ZipFile(stream)
+                zf.extractall(tdir)
+
+                txts = glob.glob(os.path.join(tdir, '*.txt'))
+                for t in txts:
+                    with open(t, 'rb') as tf:
+                        txt += tf.read()
+        else:
+            txt = stream.read()

        # Get the encoding of the document.
        if options.input_encoding:
@ -175,4 +191,11 @@ class TXTInput(InputFormatPlugin):
                {})
        options.debug_pipeline = odi
        os.remove(htmlfile.name)
+        
+        # Set metadata from file.
+        from calibre.customize.ui import get_file_type_metadata
+        from calibre.ebooks.oeb.transforms.metadata import meta_info_to_oeb_metadata
+        mi = get_file_type_metadata(stream, file_ext)
+        meta_info_to_oeb_metadata(mi, oeb.metadata, log)
+        
        return oeb
--- a/src/calibre/ebooks/txt/markdownml.py
+++ b/src/calibre/ebooks/txt/markdownml.py
@ -35,11 +35,9 @@ class MarkdownMLizer(object):
            html = unicode(etree.tostring(item.data, encoding=unicode))
            
            if not self.opts.keep_links:
-                html = re.sub(r'<\s*a[^>]*>', '', html)
-                html = re.sub(r'<\s*/\s*a\s*>', '', html)
+                html = re.sub(r'<\s*/*\s*a[^>]*>', '', html)
            if not self.opts.keep_image_references:
-                html = re.sub(r'<\s*img[^>]*>', '', html)
-                html = re.sub(r'<\s*img\s*>', '', html)
+                html = re.sub(r'<\s*img[^>]*>', '', html)\
            
            text = html2text(html)
        
--- a/src/calibre/ebooks/txt/output.py
+++ b/src/calibre/ebooks/txt/output.py
@ -5,11 +5,18 @@ __copyright__ = '2009, John Schember <john@nachtimwald.com>'
 __docformat__ = 'restructuredtext en'

 import os
+import shutil
+
+from lxml import etree

 from calibre.customize.conversion import OutputFormatPlugin, \
    OptionRecommendation
+from calibre.ebooks.oeb.base import OEB_IMAGES 
 from calibre.ebooks.txt.txtml import TXTMLizer
 from calibre.ebooks.txt.newlines import TxtNewlines, specified_newlines
+from calibre.ptempfile import TemporaryDirectory, TemporaryFile
+from calibre.utils.cleantext import clean_ascii_chars
+from calibre.utils.zipfile import ZipFile

 class TXTOutput(OutputFormatPlugin):

@ -73,6 +80,7 @@ class TXTOutput(OutputFormatPlugin):
            writer = TXTMLizer(log)

        txt = writer.extract_content(oeb_book, opts)
+        txt = clean_ascii_chars(txt)

        log.debug('\tReplacing newlines with selected type...')
        txt = specified_newlines(TxtNewlines(opts.newline).newline, txt)
@ -93,3 +101,32 @@ class TXTOutput(OutputFormatPlugin):
        if close:
            out_stream.close()

+
+class TXTZOutput(TXTOutput):
+
+    # Output plugin for the 'txtz' file type: a zip archive holding the TXT
+    # conversion result (index.txt), the book's images and a metadata.opf.
+    name = 'TXTZ Output'
+    author = 'John Schember'
+    file_type = 'txtz'
+
+    def convert(self, oeb_book, output_path, input_plugin, opts, log):
+        '''
+        Stage the archive contents in a temporary directory, then zip that
+        directory to output_path.
+        '''
+        with TemporaryDirectory('_txtz_output') as tdir:
+            # TXT: reuse the parent conversion, then copy its output into
+            # the staging directory under a fixed name.
+            with TemporaryFile('index.txt') as tf:
+                TXTOutput.convert(self, oeb_book, tf, input_plugin, opts, log)
+                shutil.copy(tf, os.path.join(tdir, 'index.txt'))
+
+            # Images: written at the path given by their manifest href,
+            # creating intermediate directories as needed.
+            for item in oeb_book.manifest:
+                if item.media_type in OEB_IMAGES:
+                    path = os.path.join(tdir, os.path.dirname(item.href))
+                    if not os.path.exists(path):
+                        os.makedirs(path)
+                    with open(os.path.join(tdir, item.href), 'wb') as imgf:
+                        imgf.write(item.data)
+
+            # Metadata
+            with open(os.path.join(tdir, 'metadata.opf'), 'wb') as mdataf:
+                mdataf.write(etree.tostring(oeb_book.metadata.to_opf1()))
+
+            # Close the archive explicitly, while the staging directory
+            # still exists, instead of leaving finalisation of the zip to
+            # garbage collection.
+            txtz = ZipFile(output_path, 'w')
+            try:
+                txtz.add_dir(tdir)
+            finally:
+                txtz.close()
--- a/src/calibre/ebooks/txt/processor.py
+++ b/src/calibre/ebooks/txt/processor.py
@ -29,8 +29,7 @@ def clean_txt(txt):
    txt = '\n'.join([line.rstrip() for line in txt.splitlines()])
    
    # Replace whitespace at the beginning of the line with &nbsp;
-    txt = re.sub('(?m)(?P<space>^[ ]+)(?=.)', lambda mo: '&nbsp;' * mo.groups('space').count(' '), txt)
-    txt = re.sub('(?m)(?P<space>^[\t]+)(?=.)', lambda mo: '&nbsp;' * 4 * mo.groups('space').count('\t'), txt)
+    txt = re.sub('(?m)(?<=^)([ ]{2,}|\t+)(?=.)', '&nbsp;' * 4, txt)

    # Condense redundant spaces
    txt = re.sub('[ ]{2,}', ' ', txt)
@ -221,9 +220,9 @@ def detect_formatting_type(txt):
    markdown_count += len(re.findall('(?mu)^=+$', txt))
    markdown_count += len(re.findall('(?mu)^-+$', txt))
    # Images
-    markdown_count += len(re.findall('(?u)!\[.*?\]\(.+?\)', txt))
+    markdown_count += len(re.findall('(?u)!\[.*?\](\[|\()', txt))
    # Links
-    markdown_count += len(re.findall('(?u)(^|(?P<pre>[^!]))\[.*?\]\([^)]+\)', txt))
+    markdown_count += len(re.findall('(?u)^|[^!]\[.*?\](\[|\()', txt))

    # Check for textile
    # Headings
@ -231,9 +230,9 @@ def detect_formatting_type(txt):
    # Block quote.
    textile_count += len(re.findall(r'(?mu)^bq\.', txt))
    # Images
-    textile_count += len(re.findall(r'\![^\s]+(?=.*?/)(:[^\s]+)*', txt))
+    textile_count += len(re.findall(r'(?mu)(?<=\!)\S+(?=\!)', txt))
    # Links
-    textile_count += len(re.findall(r'"(?=".*?\()(\(.+?\))*[^\(]+?(\(.+?\))*":[^\s]+', txt))
+    textile_count += len(re.findall(r'"[^"]*":\S+', txt))

    # Decide if either markdown or textile is used in the text
    # based on the number of unique formatting elements found.
--- a/src/calibre/ebooks/txt/textileml.py
+++ b/src/calibre/ebooks/txt/textileml.py
@ -36,13 +36,12 @@ class TextileMLizer(object):
            html = unicode(etree.tostring(item.data.find(XHTML('body')), encoding=unicode))

            if not self.opts.keep_links:
-                html = re.sub(r'<\s*a[^>]*>', '', html)
-                html = re.sub(r'<\s*/\s*a\s*>', '', html)
+                html = re.sub(r'<\s*/*\s*a[^>]*>', '', html)
            if not self.opts.keep_image_references:
                html = re.sub(r'<\s*img[^>]*>', '', html)
-                html = re.sub(r'<\s*img\s*>', '', html)

            text = html2textile(html)
+            text = text.replace('%', '')

            # Ensure the section ends with at least two new line characters.
            # This is to prevent the last paragraph from a section being