TXT Input: Enhance formatting detection regexes. Add basic TXTZ input support.

2025-07-09 03:04:10 -04:00 · 2011-02-06 13:49:50 -05:00 · 2011-02-06 13:49:50 -05:00 · 52c0a1899b
commit 52c0a1899b
parent 39c57bba49
2 changed files with 27 additions and 8 deletions
--- a/src/calibre/ebooks/txt/input.py
+++ b/src/calibre/ebooks/txt/input.py
@@ -4,23 +4,29 @@ __license__ = 'GPL 3'
 __copyright__ = '2009, John Schember <john@nachtimwald.com>'
 __docformat__ = 'restructuredtext en'

+import glob
+import mimetypes
 import os
+import shutil

+from calibre import _ent_pat, xml_entity_to_unicode
 from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation
 from calibre.ebooks.conversion.preprocess import DocAnalysis, Dehyphenator
 from calibre.ebooks.chardet import detect
 from calibre.ebooks.txt.processor import convert_basic, convert_markdown, \
    separate_paragraphs_single_line, separate_paragraphs_print_formatted, \
    preserve_spaces, detect_paragraph_type, detect_formatting_type, \
-    normalize_line_endings, convert_textile, remove_indents, block_to_single_line
-from calibre import _ent_pat, xml_entity_to_unicode
+    normalize_line_endings, convert_textile, remove_indents, block_to_single_line, \
+    image_list
+from calibre.ptempfile import TemporaryDirectory
+from calibre.utils.zipfile import ZipFile

 class TXTInput(InputFormatPlugin):

    name        = 'TXT Input'
    author      = 'John Schember'
    description = 'Convert TXT files to HTML'
-    file_types  = set(['txt'])
+    file_types  = set(['txt', 'txtz'])

    options = set([
        OptionRecommendation(name='paragraph_type', recommended_value='auto',
@@ -57,10 +63,23 @@ class TXTInput(InputFormatPlugin):
    def convert(self, stream, options, file_ext, log,
                accelerators):
        self.log = log
+        txt = ''
        log.debug('Reading text from file...')
        length = 0

-        txt = stream.read()
+        # Extract content from zip archive.
+        if file_ext == 'txtz':
+            log.debug('De-compressing content to temporary directory...')
+            with TemporaryDirectory('_untxtz') as tdir:
+                zf = ZipFile(stream)
+                zf.extractall(tdir)
+
+                txts = glob.glob(os.path.join(tdir, '*.txt'))
+                for t in txts:
+                    with open(t, 'rb') as tf:
+                        txt += tf.read()
+        else:
+            txt = stream.read()

        # Get the encoding of the document.
        if options.input_encoding:
--- a/src/calibre/ebooks/txt/processor.py
+++ b/src/calibre/ebooks/txt/processor.py
@@ -221,9 +221,9 @@ def detect_formatting_type(txt):
    markdown_count += len(re.findall('(?mu)^=+$', txt))
    markdown_count += len(re.findall('(?mu)^-+$', txt))
    # Images
-    markdown_count += len(re.findall('(?u)!\[.*?\]\(.+?\)', txt))
+    markdown_count += len(re.findall('(?u)!\[.*?\](\[|\()', txt))
    # Links
-    markdown_count += len(re.findall('(?u)(^|(?P<pre>[^!]))\[.*?\]\([^)]+\)', txt))
+    markdown_count += len(re.findall('(?u)^|[^!]\[.*?\](\[|\()', txt))

    # Check for textile
    # Headings
@@ -231,9 +231,9 @@ def detect_formatting_type(txt):
    # Block quote.
    textile_count += len(re.findall(r'(?mu)^bq\.', txt))
    # Images
-    textile_count += len(re.findall(r'\![^\s]+(?=.*?/)(:[^\s]+)*', txt))
+    textile_count += len(re.findall(r'(?mu)(?<=\!)\S+(?=\!)', txt))
    # Links
-    textile_count += len(re.findall(r'"(?=".*?\()(\(.+?\))*[^\(]+?(\(.+?\))*":[^\s]+', txt))
+    textile_count += len(re.findall(r'"[^"]*":\S+', txt))

    # Decide if either markdown or textile is used in the text
    # based on the number of unique formatting elements found.