TXTZ Input: Include images in manifest. They were not always being included by the HTML input plugin.

2025-11-21 05:53:03 -05:00 · 2011-02-12 17:34:29 -05:00 · 2011-02-12 17:34:29 -05:00 · 04b80eb9ee
commit 04b80eb9ee
parent 95892f204b
1 changed files with 24 additions and 7 deletions
--- a/src/calibre/ebooks/txt/input.py
+++ b/src/calibre/ebooks/txt/input.py
@ -4,13 +4,15 @@ __license__ = 'GPL 3'
 __copyright__ = '2009, John Schember <john@nachtimwald.com>'
 __docformat__ = 'restructuredtext en'

-import glob
+import mimetypes
 import os
+import shutil

-from calibre import _ent_pat, xml_entity_to_unicode
+from calibre import _ent_pat, walk, xml_entity_to_unicode
 from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation
 from calibre.ebooks.conversion.preprocess import DocAnalysis, Dehyphenator
 from calibre.ebooks.chardet import detect
+from calibre.ebooks.oeb.base import OEB_IMAGES
 from calibre.ebooks.txt.processor import convert_basic, convert_markdown, \
    separate_paragraphs_single_line, separate_paragraphs_print_formatted, \
    preserve_spaces, detect_paragraph_type, detect_formatting_type, \
@ -67,6 +69,8 @@ class TXTInput(InputFormatPlugin):
        txt = ''
        log.debug('Reading text from file...')
        length = 0
+        # [(u'path', mime),]
+        images = []

        # Extract content from zip archive.
        if file_ext == 'txtz':
@ -75,10 +79,19 @@ class TXTInput(InputFormatPlugin):
                zf = ZipFile(stream)
                zf.extractall(tdir)

-                txts = glob.glob(os.path.join(tdir, '*.txt'))
-                for t in txts:
-                    with open(t, 'rb') as tf:
-                        txt += tf.read()
+                for x in walk(tdir):
+                    if not os.path.isfile(x):
+                        continue
+                    if os.path.splitext(x)[1].lower() == '.txt':
+                        with open(x, 'rb') as tf:
+                            txt += tf.read() + '\n\n'
+                    if mimetypes.guess_type(x)[0] in OEB_IMAGES:
+                        path = os.path.relpath(x, tdir)
+                        dir = os.path.join(os.getcwd(), os.path.dirname(path))
+                        if not os.path.exists(dir):
+                            os.makedirs(dir)
+                        shutil.copy(x, os.path.join(os.getcwd(), path))
+                        images.append((path, mimetypes.guess_type(x)[0]))
        else:
            txt = stream.read()

@ -193,9 +206,13 @@ class TXTInput(InputFormatPlugin):
            htmlfile.write(html.encode('utf-8'))
        odi = options.debug_pipeline
        options.debug_pipeline = None
-        # Generate oeb from htl conversion.
+        # Generate oeb from html conversion.
        oeb = html_input.convert(open(htmlfile.name, 'rb'), options, 'html', log,
                {})
+        # Add images from from txtz archive to oeb.
+        for image, mime in images:
+            id, href = oeb.manifest.generate(id='image', href=image)
+            oeb.manifest.add(id, href, mime)
        options.debug_pipeline = odi
        os.remove(htmlfile.name)