From 52c0a1899bbe52f0bbe9fa253c7f9503095781c1 Mon Sep 17 00:00:00 2001 From: John Schember Date: Sun, 6 Feb 2011 13:49:50 -0500 Subject: [PATCH 1/9] TXT Input: Enhance formatting detection regexes. Add basic TXTZ input support. --- src/calibre/ebooks/txt/input.py | 27 +++++++++++++++++++++++---- src/calibre/ebooks/txt/processor.py | 8 ++++---- 2 files changed, 27 insertions(+), 8 deletions(-) diff --git a/src/calibre/ebooks/txt/input.py b/src/calibre/ebooks/txt/input.py index e240205222..ed1597111d 100644 --- a/src/calibre/ebooks/txt/input.py +++ b/src/calibre/ebooks/txt/input.py @@ -4,23 +4,29 @@ __license__ = 'GPL 3' __copyright__ = '2009, John Schember ' __docformat__ = 'restructuredtext en' +import glob +import mimetypes import os +import shutil +from calibre import _ent_pat, xml_entity_to_unicode from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation from calibre.ebooks.conversion.preprocess import DocAnalysis, Dehyphenator from calibre.ebooks.chardet import detect from calibre.ebooks.txt.processor import convert_basic, convert_markdown, \ separate_paragraphs_single_line, separate_paragraphs_print_formatted, \ preserve_spaces, detect_paragraph_type, detect_formatting_type, \ - normalize_line_endings, convert_textile, remove_indents, block_to_single_line -from calibre import _ent_pat, xml_entity_to_unicode + normalize_line_endings, convert_textile, remove_indents, block_to_single_line, \ + image_list +from calibre.ptempfile import TemporaryDirectory +from calibre.utils.zipfile import ZipFile class TXTInput(InputFormatPlugin): name = 'TXT Input' author = 'John Schember' description = 'Convert TXT files to HTML' - file_types = set(['txt']) + file_types = set(['txt', 'txtz']) options = set([ OptionRecommendation(name='paragraph_type', recommended_value='auto', @@ -57,10 +63,23 @@ class TXTInput(InputFormatPlugin): def convert(self, stream, options, file_ext, log, accelerators): self.log = log + txt = '' log.debug('Reading text from file...') length = 0 - txt = stream.read() + # Extract content from zip archive. + if file_ext == 'txtz': + log.debug('De-compressing content to temporary directory...') + with TemporaryDirectory('_untxtz') as tdir: + zf = ZipFile(stream) + zf.extractall(tdir) + + txts = glob.glob(os.path.join(tdir, '*.txt')) + for t in txts: + with open(t, 'rb') as tf: + txt += tf.read() + else: + txt = stream.read() # Get the encoding of the document. if options.input_encoding: diff --git a/src/calibre/ebooks/txt/processor.py b/src/calibre/ebooks/txt/processor.py index f7b6cce234..e4ff2763e5 100644 --- a/src/calibre/ebooks/txt/processor.py +++ b/src/calibre/ebooks/txt/processor.py @@ -221,9 +221,9 @@ def detect_formatting_type(txt): markdown_count += len(re.findall('(?mu)^=+$', txt)) markdown_count += len(re.findall('(?mu)^-+$', txt)) # Images - markdown_count += len(re.findall('(?u)!\[.*?\]\(.+?\)', txt)) + markdown_count += len(re.findall('(?u)!\[.*?\](\[|\()', txt)) # Links - markdown_count += len(re.findall('(?u)(^|(?P
[^!]))\[.*?\]\([^)]+\)', txt))
+    markdown_count += len(re.findall('(?u)^|[^!]\[.*?\](\[|\()', txt))
 
     # Check for textile
     # Headings
@@ -231,9 +231,9 @@ def detect_formatting_type(txt):
     # Block quote.
     textile_count += len(re.findall(r'(?mu)^bq\.', txt))
     # Images
-    textile_count += len(re.findall(r'\![^\s]+(?=.*?/)(:[^\s]+)*', txt))
+    textile_count += len(re.findall(r'(?mu)(?<=\!)\S+(?=\!)', txt))
     # Links
-    textile_count += len(re.findall(r'"(?=".*?\()(\(.+?\))*[^\(]+?(\(.+?\))*":[^\s]+', txt))
+    textile_count += len(re.findall(r'"[^"]*":\S+', txt))
 
     # Decide if either markdown or textile is used in the text
     # based on the number of unique formatting elements found.

From b6ec16463ee3083530b7aa76975ac60c73ff3386 Mon Sep 17 00:00:00 2001
From: John Schember 
Date: Sun, 6 Feb 2011 13:52:00 -0500
Subject: [PATCH 2/9] ...

---
 src/calibre/ebooks/txt/input.py | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/src/calibre/ebooks/txt/input.py b/src/calibre/ebooks/txt/input.py
index ed1597111d..55d37da026 100644
--- a/src/calibre/ebooks/txt/input.py
+++ b/src/calibre/ebooks/txt/input.py
@@ -5,9 +5,7 @@ __copyright__ = '2009, John Schember '
 __docformat__ = 'restructuredtext en'
 
 import glob
-import mimetypes
 import os
-import shutil
 
 from calibre import _ent_pat, xml_entity_to_unicode
 from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation
@@ -16,8 +14,7 @@ from calibre.ebooks.chardet import detect
 from calibre.ebooks.txt.processor import convert_basic, convert_markdown, \
     separate_paragraphs_single_line, separate_paragraphs_print_formatted, \
     preserve_spaces, detect_paragraph_type, detect_formatting_type, \
-    normalize_line_endings, convert_textile, remove_indents, block_to_single_line, \
-    image_list
+    normalize_line_endings, convert_textile, remove_indents, block_to_single_line
 from calibre.ptempfile import TemporaryDirectory
 from calibre.utils.zipfile import ZipFile
 

From 0257fe4a0202ee94cd56030f338c5baca74da74d Mon Sep 17 00:00:00 2001
From: John Schember 
Date: Sun, 6 Feb 2011 14:09:14 -0500
Subject: [PATCH 3/9] TXT Output: Simplify remove image and link regexes

---
 src/calibre/ebooks/txt/markdownml.py | 6 ++----
 src/calibre/ebooks/txt/textileml.py  | 4 +---
 2 files changed, 3 insertions(+), 7 deletions(-)

diff --git a/src/calibre/ebooks/txt/markdownml.py b/src/calibre/ebooks/txt/markdownml.py
index 116561f355..c179378049 100644
--- a/src/calibre/ebooks/txt/markdownml.py
+++ b/src/calibre/ebooks/txt/markdownml.py
@@ -35,11 +35,9 @@ class MarkdownMLizer(object):
             html = unicode(etree.tostring(item.data, encoding=unicode))
             
             if not self.opts.keep_links:
-                html = re.sub(r'<\s*a[^>]*>', '', html)
-                html = re.sub(r'<\s*/\s*a\s*>', '', html)
+                html = re.sub(r'<\s*/*\s*a[^>]*>', '', html)
             if not self.opts.keep_image_references:
-                html = re.sub(r'<\s*img[^>]*>', '', html)
-                html = re.sub(r'<\s*img\s*>', '', html)
+                html = re.sub(r'<\s*img[^>]*>', '', html)\
             
             text = html2text(html)
         
diff --git a/src/calibre/ebooks/txt/textileml.py b/src/calibre/ebooks/txt/textileml.py
index 94834d8e79..d7e11695c5 100644
--- a/src/calibre/ebooks/txt/textileml.py
+++ b/src/calibre/ebooks/txt/textileml.py
@@ -36,11 +36,9 @@ class TextileMLizer(object):
             html = unicode(etree.tostring(item.data.find(XHTML('body')), encoding=unicode))
 
             if not self.opts.keep_links:
-                html = re.sub(r'<\s*a[^>]*>', '', html)
-                html = re.sub(r'<\s*/\s*a\s*>', '', html)
+                html = re.sub(r'<\s*/*\s*a[^>]*>', '', html)
             if not self.opts.keep_image_references:
                 html = re.sub(r'<\s*img[^>]*>', '', html)
-                html = re.sub(r'<\s*img\s*>', '', html)
 
             text = html2textile(html)
 

From 837bd4e548e000480eb4d104b66decefc41281c1 Mon Sep 17 00:00:00 2001
From: John Schember 
Date: Sun, 6 Feb 2011 14:37:02 -0500
Subject: [PATCH 4/9] TXTZ metadata reader.

---
 src/calibre/customize/builtins.py   | 11 ++++++++++
 src/calibre/ebooks/__init__.py      |  2 +-
 src/calibre/ebooks/metadata/txt.py  | 12 ++++++----
 src/calibre/ebooks/metadata/txtz.py | 34 +++++++++++++++++++++++++++++
 4 files changed, 54 insertions(+), 5 deletions(-)
 create mode 100644 src/calibre/ebooks/metadata/txtz.py

diff --git a/src/calibre/customize/builtins.py b/src/calibre/customize/builtins.py
index e0367515bc..04e880b714 100644
--- a/src/calibre/customize/builtins.py
+++ b/src/calibre/customize/builtins.py
@@ -325,6 +325,17 @@ class TXTMetadataReader(MetadataReaderPlugin):
         from calibre.ebooks.metadata.txt import get_metadata
         return get_metadata(stream)
 
+class TXTZMetadataReader(MetadataReaderPlugin):
+
+    name        = 'Read TXTZ metadata'
+    file_types  = set(['txtz'])
+    description = _('Read metadata from %s files') % 'TXTZ'
+    author      = 'John Schember'
+
+    def get_metadata(self, stream, ftype):
+        from calibre.ebooks.metadata.txtz import get_metadata
+        return get_metadata(stream)
+
 class ZipMetadataReader(MetadataReaderPlugin):
 
     name = 'Read ZIP metadata'
diff --git a/src/calibre/ebooks/__init__.py b/src/calibre/ebooks/__init__.py
index 4dc97f43ed..49604ae682 100644
--- a/src/calibre/ebooks/__init__.py
+++ b/src/calibre/ebooks/__init__.py
@@ -25,7 +25,7 @@ class DRMError(ValueError):
 class ParserError(ValueError):
     pass
 
-BOOK_EXTENSIONS = ['lrf', 'rar', 'zip', 'rtf', 'lit', 'txt', 'htm', 'xhtm',
+BOOK_EXTENSIONS = ['lrf', 'rar', 'zip', 'rtf', 'lit', 'txt', 'txtz', 'htm', 'xhtm',
                    'html', 'xhtml', 'pdf', 'pdb', 'pdr', 'prc', 'mobi', 'azw', 'doc',
                    'epub', 'fb2', 'djvu', 'lrx', 'cbr', 'cbz', 'cbc', 'oebzip',
                    'rb', 'imp', 'odt', 'chm', 'tpz', 'azw1', 'pml', 'mbp', 'tan', 'snb']
diff --git a/src/calibre/ebooks/metadata/txt.py b/src/calibre/ebooks/metadata/txt.py
index 79713774e3..70d3c72ae0 100644
--- a/src/calibre/ebooks/metadata/txt.py
+++ b/src/calibre/ebooks/metadata/txt.py
@@ -1,16 +1,20 @@
-'''Read meta information from TXT files'''
-
-from __future__ import with_statement
+# -*- coding: utf-8 -*-
 
 __license__   = 'GPL v3'
 __copyright__ = '2009, John Schember '
 
+'''
+Read meta information from TXT files
+'''
+
 import re
 
 from calibre.ebooks.metadata import MetaInformation
 
 def get_metadata(stream, extract_cover=True):
-    """ Return metadata as a L{MetaInfo} object """
+    '''
+    Return metadata as a L{MetaInfo} object
+    '''
     mi = MetaInformation(_('Unknown'), [_('Unknown')])
     stream.seek(0)
 
diff --git a/src/calibre/ebooks/metadata/txtz.py b/src/calibre/ebooks/metadata/txtz.py
new file mode 100644
index 0000000000..b9d607c63b
--- /dev/null
+++ b/src/calibre/ebooks/metadata/txtz.py
@@ -0,0 +1,34 @@
+# -*- coding: utf-8 -*-
+
+__license__   = 'GPL v3'
+__copyright__ = '2011, John Schember '
+
+'''
+Read meta information from TXT files
+'''
+
+import os
+
+from calibre.ebooks.metadata import MetaInformation
+from calibre.ptempfile import TemporaryDirectory
+from calibre.utils.zipfile import ZipFile
+
+def get_metadata(stream, extract_cover=True):
+    '''
+    Return metadata as a L{MetaInfo} object
+    '''
+    mi = MetaInformation(_('Unknown'), [_('Unknown')])
+    stream.seek(0)
+
+    with TemporaryDirectory('_untxtz_mdata') as tdir:
+        try:
+            zf = ZipFile(stream)
+            zf.extract('metadata.opf', tdir)
+            
+            from calibre.ebooks.metadata.opf2 import OPF
+            with open(os.path.join(tdir, 'metadata.opf'), 'rb') as opff:
+                mi = OPF(opff).to_book_metadata()
+        except:
+            return mi
+
+    return mi

From 0f86858c3ba06357a47dc88943ffe32818cd5329 Mon Sep 17 00:00:00 2001
From: John Schember 
Date: Sun, 6 Feb 2011 15:01:31 -0500
Subject: [PATCH 5/9] TXTZ metadata writer.

---
 src/calibre/customize/builtins.py   | 11 +++++++++++
 src/calibre/ebooks/metadata/txtz.py | 13 +++++++++----
 2 files changed, 20 insertions(+), 4 deletions(-)

diff --git a/src/calibre/customize/builtins.py b/src/calibre/customize/builtins.py
index 04e880b714..b56d015e54 100644
--- a/src/calibre/customize/builtins.py
+++ b/src/calibre/customize/builtins.py
@@ -423,6 +423,17 @@ class TOPAZMetadataWriter(MetadataWriterPlugin):
         from calibre.ebooks.metadata.topaz import set_metadata
         set_metadata(stream, mi)
 
+class TXTZMetadataWriter(MetadataWriterPlugin):
+
+    name        = 'Set TXTZ metadata'
+    file_types  = set(['txtz'])
+    description = _('Set metadata from %s files') % 'TXTZ'
+    author      = 'John Schember'
+
+    def set_metadata(self, stream, mi, type):
+        from calibre.ebooks.metadata.txtz import set_metadata
+        set_metadata(stream, mi)
+
 # }}}
 
 from calibre.ebooks.comic.input import ComicInput
diff --git a/src/calibre/ebooks/metadata/txtz.py b/src/calibre/ebooks/metadata/txtz.py
index b9d607c63b..ba0078328e 100644
--- a/src/calibre/ebooks/metadata/txtz.py
+++ b/src/calibre/ebooks/metadata/txtz.py
@@ -9,9 +9,12 @@ Read meta information from TXT files
 
 import os
 
+from cStringIO import StringIO
+
 from calibre.ebooks.metadata import MetaInformation
+from calibre.ebooks.metadata.opf2 import OPF, metadata_to_opf
 from calibre.ptempfile import TemporaryDirectory
-from calibre.utils.zipfile import ZipFile
+from calibre.utils.zipfile import ZipFile, safe_replace
 
 def get_metadata(stream, extract_cover=True):
     '''
@@ -24,11 +27,13 @@ def get_metadata(stream, extract_cover=True):
         try:
             zf = ZipFile(stream)
             zf.extract('metadata.opf', tdir)
-            
-            from calibre.ebooks.metadata.opf2 import OPF
             with open(os.path.join(tdir, 'metadata.opf'), 'rb') as opff:
                 mi = OPF(opff).to_book_metadata()
         except:
             return mi
-
     return mi
+
+def set_metadata(stream, mi):
+    stream.seek(0)
+    opf = StringIO(metadata_to_opf(mi))
+    safe_replace(stream, 'metadata.opf', opf)

From c481be879df7a438639b337ff0232b9abf4dc0fc Mon Sep 17 00:00:00 2001
From: John Schember 
Date: Sun, 6 Feb 2011 15:46:07 -0500
Subject: [PATCH 6/9] TXTZ Output support.

---
 src/calibre/customize/builtins.py   |  2 ++
 src/calibre/ebooks/metadata/txtz.py |  1 -
 src/calibre/ebooks/txt/output.py    | 35 +++++++++++++++++++++++++++++
 3 files changed, 37 insertions(+), 1 deletion(-)

diff --git a/src/calibre/customize/builtins.py b/src/calibre/customize/builtins.py
index b56d015e54..32c512fe39 100644
--- a/src/calibre/customize/builtins.py
+++ b/src/calibre/customize/builtins.py
@@ -468,6 +468,7 @@ from calibre.ebooks.rb.output import RBOutput
 from calibre.ebooks.rtf.output import RTFOutput
 from calibre.ebooks.tcr.output import TCROutput
 from calibre.ebooks.txt.output import TXTOutput
+from calibre.ebooks.txt.output import TXTZOutput
 from calibre.ebooks.html.output import HTMLOutput
 from calibre.ebooks.snb.output import SNBOutput
 
@@ -553,6 +554,7 @@ plugins += [
     RTFOutput,
     TCROutput,
     TXTOutput,
+    TXTZOutput,
     HTMLOutput,
     SNBOutput,
 ]
diff --git a/src/calibre/ebooks/metadata/txtz.py b/src/calibre/ebooks/metadata/txtz.py
index ba0078328e..ae6efb4838 100644
--- a/src/calibre/ebooks/metadata/txtz.py
+++ b/src/calibre/ebooks/metadata/txtz.py
@@ -34,6 +34,5 @@ def get_metadata(stream, extract_cover=True):
     return mi
 
 def set_metadata(stream, mi):
-    stream.seek(0)
     opf = StringIO(metadata_to_opf(mi))
     safe_replace(stream, 'metadata.opf', opf)
diff --git a/src/calibre/ebooks/txt/output.py b/src/calibre/ebooks/txt/output.py
index b73a6e8908..3905081a84 100644
--- a/src/calibre/ebooks/txt/output.py
+++ b/src/calibre/ebooks/txt/output.py
@@ -5,11 +5,17 @@ __copyright__ = '2009, John Schember '
 __docformat__ = 'restructuredtext en'
 
 import os
+import shutil
+
+from lxml import etree
 
 from calibre.customize.conversion import OutputFormatPlugin, \
     OptionRecommendation
+from calibre.ebooks.oeb.base import OEB_IMAGES 
 from calibre.ebooks.txt.txtml import TXTMLizer
 from calibre.ebooks.txt.newlines import TxtNewlines, specified_newlines
+from calibre.ptempfile import TemporaryDirectory, TemporaryFile
+from calibre.utils.zipfile import ZipFile
 
 class TXTOutput(OutputFormatPlugin):
 
@@ -93,3 +99,32 @@ class TXTOutput(OutputFormatPlugin):
         if close:
             out_stream.close()
 
+
+class TXTZOutput(TXTOutput):
+    
+    name = 'TXTZ Output'
+    author = 'John Schember'
+    file_type = 'txtz'
+
+    def convert(self, oeb_book, output_path, input_plugin, opts, log):
+        with TemporaryDirectory('_txtz_output') as tdir:
+            # TXT
+            with TemporaryFile('index.txt') as tf:
+                TXTOutput.convert(self, oeb_book, tf, input_plugin, opts, log)
+                shutil.copy(tf, os.path.join(tdir, 'index.txt'))
+
+            # Images
+            for item in oeb_book.manifest:
+                if item.media_type in OEB_IMAGES:
+                    path = os.path.join(tdir, os.path.dirname(item.href))
+                    if not os.path.exists(path):
+                        os.makedirs(path)
+                    with open(os.path.join(tdir, item.href), 'wb') as imgf:
+                        imgf.write(item.data)
+            
+            # Metadata
+            with open(os.path.join(tdir, 'metadata.opf'), 'wb') as mdataf: 
+                mdataf.write(etree.tostring(oeb_book.metadata.to_opf1()))
+            
+            txtz = ZipFile(output_path, 'w')
+            txtz.add_dir(tdir)

From 6548dbd33cf2c917aa72bc73b54ba139c4094d2d Mon Sep 17 00:00:00 2001
From: John Schember 
Date: Sun, 6 Feb 2011 16:18:25 -0500
Subject: [PATCH 7/9] TXT Input: Read and set metadata.

---
 src/calibre/ebooks/txt/input.py | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/src/calibre/ebooks/txt/input.py b/src/calibre/ebooks/txt/input.py
index 55d37da026..12f780913c 100644
--- a/src/calibre/ebooks/txt/input.py
+++ b/src/calibre/ebooks/txt/input.py
@@ -191,4 +191,11 @@ class TXTInput(InputFormatPlugin):
                 {})
         options.debug_pipeline = odi
         os.remove(htmlfile.name)
+        
+        # Set metadata from file.
+        from calibre.customize.ui import get_file_type_metadata
+        from calibre.ebooks.oeb.transforms.metadata import meta_info_to_oeb_metadata
+        mi = get_file_type_metadata(stream, file_ext)
+        meta_info_to_oeb_metadata(mi, oeb.metadata, log)
+        
         return oeb

From 1aa66f42fe809583a5fa462e26a9514042864db2 Mon Sep 17 00:00:00 2001
From: John Schember 
Date: Sun, 6 Feb 2011 19:45:39 -0500
Subject: [PATCH 8/9] TXT Output: clean ascii characters. Textile output remove
 span attributes.

---
 src/calibre/ebooks/txt/output.py    | 2 ++
 src/calibre/ebooks/txt/textileml.py | 1 +
 2 files changed, 3 insertions(+)

diff --git a/src/calibre/ebooks/txt/output.py b/src/calibre/ebooks/txt/output.py
index 3905081a84..d021cbbba6 100644
--- a/src/calibre/ebooks/txt/output.py
+++ b/src/calibre/ebooks/txt/output.py
@@ -15,6 +15,7 @@ from calibre.ebooks.oeb.base import OEB_IMAGES
 from calibre.ebooks.txt.txtml import TXTMLizer
 from calibre.ebooks.txt.newlines import TxtNewlines, specified_newlines
 from calibre.ptempfile import TemporaryDirectory, TemporaryFile
+from calibre.utils.cleantext import clean_ascii_chars
 from calibre.utils.zipfile import ZipFile
 
 class TXTOutput(OutputFormatPlugin):
@@ -79,6 +80,7 @@ class TXTOutput(OutputFormatPlugin):
             writer = TXTMLizer(log)
 
         txt = writer.extract_content(oeb_book, opts)
+        txt = clean_ascii_chars(txt)
 
         log.debug('\tReplacing newlines with selected type...')
         txt = specified_newlines(TxtNewlines(opts.newline).newline, txt)
diff --git a/src/calibre/ebooks/txt/textileml.py b/src/calibre/ebooks/txt/textileml.py
index d7e11695c5..284e4846d9 100644
--- a/src/calibre/ebooks/txt/textileml.py
+++ b/src/calibre/ebooks/txt/textileml.py
@@ -41,6 +41,7 @@ class TextileMLizer(object):
                 html = re.sub(r'<\s*img[^>]*>', '', html)
 
             text = html2textile(html)
+            text = text.replace('%', '')
 
             # Ensure the section ends with at least two new line characters.
             # This is to prevent the last paragraph from a section being

From ad32cd1d726c2704de10ad22b8bae7b75effdfca Mon Sep 17 00:00:00 2001
From: John Schember 
Date: Sun, 6 Feb 2011 20:41:12 -0500
Subject: [PATCH 9/9] TXT Output: simpilfy retain indent regex. Force 4 indents
 to ensure we don't get situations where and entire line is whitespace.

---
 src/calibre/ebooks/txt/processor.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/src/calibre/ebooks/txt/processor.py b/src/calibre/ebooks/txt/processor.py
index e4ff2763e5..685a7504b9 100644
--- a/src/calibre/ebooks/txt/processor.py
+++ b/src/calibre/ebooks/txt/processor.py
@@ -29,8 +29,7 @@ def clean_txt(txt):
     txt = '\n'.join([line.rstrip() for line in txt.splitlines()])
     
     # Replace whitespace at the beginning of the line with  
-    txt = re.sub('(?m)(?P^[ ]+)(?=.)', lambda mo: ' ' * mo.groups('space').count(' '), txt)
-    txt = re.sub('(?m)(?P^[\t]+)(?=.)', lambda mo: ' ' * 4 * mo.groups('space').count('\t'), txt)
+    txt = re.sub('(?m)(?<=^)([ ]{2,}|\t+)(?=.)', ' ' * 4, txt)
 
     # Condense redundant spaces
     txt = re.sub('[ ]{2,}', ' ', txt)