From c5d6f9665b5a83e088d3bbe5f5412c6a6bce9b6b Mon Sep 17 00:00:00 2001
From: John Schember <john@nachtimwald.com>
Date: Sat, 12 Feb 2011 14:33:32 -0500
Subject: [PATCH 1/5] TXT Input: Rename none formatting-type to plain to
 correspond to the output option.

---
 src/calibre/ebooks/txt/input.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/calibre/ebooks/txt/input.py b/src/calibre/ebooks/txt/input.py
index 8ab1524b02..7d218a36cf 100644
--- a/src/calibre/ebooks/txt/input.py
+++ b/src/calibre/ebooks/txt/input.py
@@ -38,10 +38,10 @@ class TXTInput(InputFormatPlugin):
                    'starts a paragraph.'
                    '* unformatted: Most lines have hard line breaks, few/no blank lines or indents.')),
         OptionRecommendation(name='formatting_type', recommended_value='auto',
-            choices=['auto', 'none', 'heuristic', 'textile', 'markdown'],
+            choices=['auto', 'plain', 'heuristic', 'textile', 'markdown'],
             help=_('Formatting used within the document.'
                    '* auto: Automatically decide which formatting processor to use.\n'
-                   '* none: Do not process the document formatting. Everything is a '
+                   '* plain: Do not process the document formatting. Everything is a '
                    'paragraph and no styling is applied.\n'
                    '* heuristic: Process using heuristics to determine formatting such '
                    'as chapter headings and italic text.\n'

From 95892f204b22f0ea2aef5a06eea99c5a6d18c82e Mon Sep 17 00:00:00 2001
From: John Schember <john@nachtimwald.com>
Date: Sat, 12 Feb 2011 14:38:55 -0500
Subject: [PATCH 2/5] TXT Input: New paragraph-type option (off) to disable
 modifying the paragraph structure.

---
 src/calibre/ebooks/txt/input.py | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/src/calibre/ebooks/txt/input.py b/src/calibre/ebooks/txt/input.py
index 7d218a36cf..6d958b0e7c 100644
--- a/src/calibre/ebooks/txt/input.py
+++ b/src/calibre/ebooks/txt/input.py
@@ -28,15 +28,18 @@ class TXTInput(InputFormatPlugin):
 
     options = set([
         OptionRecommendation(name='paragraph_type', recommended_value='auto',
-            choices=['auto', 'block', 'single', 'print', 'unformatted'],
+            choices=['auto', 'block', 'single', 'print', 'unformatted', 'off'],
             help=_('Paragraph structure.\n'
-                   'choices are [\'auto\', \'block\', \'single\', \'print\', \'unformatted\']\n'
+                   'choices are [\'auto\', \'block\', \'single\', \'print\', \'unformatted\', \'off\']\n'
                    '* auto: Try to auto detect paragraph type.\n'
                    '* block: Treat a blank line as a paragraph break.\n'
                    '* single: Assume every line is a paragraph.\n'
                    '* print:  Assume every line starting with 2+ spaces or a tab '
-                   'starts a paragraph.'
-                   '* unformatted: Most lines have hard line breaks, few/no blank lines or indents.')),
+                   'starts a paragraph.\n'
+                   '* unformatted: Most lines have hard line breaks, few/no blank lines or indents. '
+                   'Tries to determine structure and reformat the differentiate elements.\n'
+                   '* off: Don\'t modify the paragraph structure. This is useful when combined with '
+                   'Markdown or Textile formatting to ensure no formatting is lost.')),
         OptionRecommendation(name='formatting_type', recommended_value='auto',
             choices=['auto', 'plain', 'heuristic', 'textile', 'markdown'],
             help=_('Formatting used within the document.'
@@ -134,7 +137,7 @@ class TXTInput(InputFormatPlugin):
             preprocessor = HeuristicProcessor(options, log=getattr(self, 'log', None))
             txt = preprocessor.punctuation_unwrap(length, txt, 'txt')
             txt = separate_paragraphs_single_line(txt)
-        else:
+        elif options.paragraph_type == 'block':
             txt = separate_hard_scene_breaks(txt)
             txt = block_to_single_line(txt)
 

From 04b80eb9eef7452d3ff70ceece2144e0b718f976 Mon Sep 17 00:00:00 2001
From: John Schember <john@nachtimwald.com>
Date: Sat, 12 Feb 2011 17:34:29 -0500
Subject: [PATCH 3/5] TXTZ Input: Include images in manifest. They were not
 always being included by the HTML input plugin.

---
 src/calibre/ebooks/txt/input.py | 31 ++++++++++++++++++++++++-------
 1 file changed, 24 insertions(+), 7 deletions(-)

diff --git a/src/calibre/ebooks/txt/input.py b/src/calibre/ebooks/txt/input.py
index 6d958b0e7c..9952845fdf 100644
--- a/src/calibre/ebooks/txt/input.py
+++ b/src/calibre/ebooks/txt/input.py
@@ -4,13 +4,15 @@ __license__ = 'GPL 3'
 __copyright__ = '2009, John Schember <john@nachtimwald.com>'
 __docformat__ = 'restructuredtext en'
 
-import glob
+import mimetypes
 import os
+import shutil
 
-from calibre import _ent_pat, xml_entity_to_unicode
+from calibre import _ent_pat, walk, xml_entity_to_unicode
 from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation
 from calibre.ebooks.conversion.preprocess import DocAnalysis, Dehyphenator
 from calibre.ebooks.chardet import detect
+from calibre.ebooks.oeb.base import OEB_IMAGES
 from calibre.ebooks.txt.processor import convert_basic, convert_markdown, \
     separate_paragraphs_single_line, separate_paragraphs_print_formatted, \
     preserve_spaces, detect_paragraph_type, detect_formatting_type, \
@@ -67,6 +69,8 @@ class TXTInput(InputFormatPlugin):
         txt = ''
         log.debug('Reading text from file...')
         length = 0
+        # Images found in the txtz archive: [(relative path, mime type), ...]
+        images = []
 
         # Extract content from zip archive.
         if file_ext == 'txtz':
@@ -75,10 +79,19 @@ class TXTInput(InputFormatPlugin):
                 zf = ZipFile(stream)
                 zf.extractall(tdir)
 
-                txts = glob.glob(os.path.join(tdir, '*.txt'))
-                for t in txts:
-                    with open(t, 'rb') as tf:
-                        txt += tf.read()
+                for x in walk(tdir):
+                    if not os.path.isfile(x):
+                        continue
+                    if os.path.splitext(x)[1].lower() == '.txt':
+                        with open(x, 'rb') as tf:
+                            txt += tf.read() + '\n\n'
+                    if mimetypes.guess_type(x)[0] in OEB_IMAGES:
+                        path = os.path.relpath(x, tdir)
+                        dir = os.path.join(os.getcwd(), os.path.dirname(path))
+                        if not os.path.exists(dir):
+                            os.makedirs(dir)
+                        shutil.copy(x, os.path.join(os.getcwd(), path))
+                        images.append((path, mimetypes.guess_type(x)[0]))
         else:
             txt = stream.read()
 
@@ -193,9 +206,13 @@ class TXTInput(InputFormatPlugin):
             htmlfile.write(html.encode('utf-8'))
         odi = options.debug_pipeline
         options.debug_pipeline = None
-        # Generate oeb from htl conversion.
+        # Generate oeb from html conversion.
         oeb = html_input.convert(open(htmlfile.name, 'rb'), options, 'html', log,
                 {})
+        # Add images from the txtz archive to the oeb manifest.
+        for image, mime in images:
+            id, href = oeb.manifest.generate(id='image', href=image)
+            oeb.manifest.add(id, href, mime)
         options.debug_pipeline = odi
         os.remove(htmlfile.name)
         

From ffaed91cdcb10b88fb9c13787f4b347ff1fc9a22 Mon Sep 17 00:00:00 2001
From: John Schember <john@nachtimwald.com>
Date: Sun, 13 Feb 2011 14:23:03 -0500
Subject: [PATCH 4/5] TXT Input: Add _ to set of characters that can make up a
 hard scene break.

---
 src/calibre/ebooks/txt/processor.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/calibre/ebooks/txt/processor.py b/src/calibre/ebooks/txt/processor.py
index 55213381c9..7e161f63bd 100644
--- a/src/calibre/ebooks/txt/processor.py
+++ b/src/calibre/ebooks/txt/processor.py
@@ -126,7 +126,7 @@ def separate_hard_scene_breaks(txt):
             return '\n%s\n' % line
         else:
             return line
-    txt = re.sub(u'(?miu)^[ \t-=~\/]+$', lambda mo: sep_break(mo.group()), txt)
+    txt = re.sub(u'(?miu)^[ \t-=~\/_]+$', lambda mo: sep_break(mo.group()), txt)
     return txt
 
 def block_to_single_line(txt):

From aed26584c1c25172a219aacf7390ad0fb9150a79 Mon Sep 17 00:00:00 2001
From: John Schember <john@nachtimwald.com>
Date: Sun, 13 Feb 2011 14:29:46 -0500
Subject: [PATCH 5/5] Fix bug #8960: None inserted before ___ in text input.
 Tweak italicize common cases regex patterns.

---
 src/calibre/ebooks/conversion/utils.py | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py
index 95f832c76a..2e26f927f5 100644
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@@ -156,17 +156,17 @@ class HeuristicProcessor(object):
         ]
 
         ITALICIZE_STYLE_PATS = [
-            r'(?msu)(?<=[\s>])_(?P<words>[^_]+)?_',
-            r'(?msu)(?<=[\s>])/(?P<words>[^/]+)?/',
-            r'(?msu)(?<=[\s>])~~(?P<words>[^~]+)?~~',
-            r'(?msu)(?<=[\s>])\*(?P<words>[^\*]+)?\*',
-            r'(?msu)(?<=[\s>])~(?P<words>[^~]+)?~',
-            r'(?msu)(?<=[\s>])_/(?P<words>[^/_]+)?/_',
-            r'(?msu)(?<=[\s>])_\*(?P<words>[^\*_]+)?\*_',
-            r'(?msu)(?<=[\s>])\*/(?P<words>[^/\*]+)?/\*',
-            r'(?msu)(?<=[\s>])_\*/(?P<words>[^\*_]+)?/\*_',
-            r'(?msu)(?<=[\s>])/:(?P<words>[^:/]+)?:/',
-            r'(?msu)(?<=[\s>])\|:(?P<words>[^:\|]+)?:\|',
+            r'(?msu)(?<=[\s>])_(?P<words>[^_]+)_',
+            r'(?msu)(?<=[\s>])/(?P<words>[^/]+)/',
+            r'(?msu)(?<=[\s>])~~(?P<words>[^~]+)~~',
+            r'(?msu)(?<=[\s>])\*(?P<words>[^\*]+)\*',
+            r'(?msu)(?<=[\s>])~(?P<words>[^~]+)~',
+            r'(?msu)(?<=[\s>])_/(?P<words>[^/_]+)/_',
+            r'(?msu)(?<=[\s>])_\*(?P<words>[^\*_]+)\*_',
+            r'(?msu)(?<=[\s>])\*/(?P<words>[^/\*]+)/\*',
+            r'(?msu)(?<=[\s>])_\*/(?P<words>[^\*_]+)/\*_',
+            r'(?msu)(?<=[\s>])/:(?P<words>[^:/]+):/',
+            r'(?msu)(?<=[\s>])\|:(?P<words>[^:\|]+):\|',
         ]
 
         for word in ITALICIZE_WORDS: