From ed3b2866cfb846c7ccbb39e64711fb37d56e927c Mon Sep 17 00:00:00 2001
From: John Schember <john@nachtimwald.com>
Date: Sun, 6 Feb 2011 08:35:07 -0500
Subject: [PATCH 1/5] Sync ldolse preprocessing changes.

---
 src/calibre/ebooks/txt/input.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/src/calibre/ebooks/txt/input.py b/src/calibre/ebooks/txt/input.py
index 75bafc7cef..a07b423ebb 100644
--- a/src/calibre/ebooks/txt/input.py
+++ b/src/calibre/ebooks/txt/input.py
@@ -57,6 +57,7 @@ class TXTInput(InputFormatPlugin):
     def convert(self, stream, options, file_ext, log,
                 accelerators):
         self.log = log
+        length = None
         log.debug('Reading text from file...')
 
         txt = stream.read()
@@ -109,7 +110,7 @@ class TXTInput(InputFormatPlugin):
         # Reformat paragraphs to block formatting based on the detected type.
         # We don't check for block because the processor assumes block.
         # single and print at transformed to block for processing.
-        if options.paragraph_type == 'single' or options.paragraph_type == 'unformatted':
+        if options.paragraph_type == 'single':
             txt = separate_paragraphs_single_line(txt)
         elif options.paragraph_type == 'print':
             txt = separate_paragraphs_print_formatted(txt)
@@ -120,10 +121,12 @@ class TXTInput(InputFormatPlugin):
             length = docanalysis.line_length(.5)
             preprocessor = HeuristicProcessor(options, log=getattr(self, 'log', None))
             txt = preprocessor.punctuation_unwrap(length, txt, 'txt')
+            txt = separate_paragraphs_single_line(txt)
 
         if getattr(options, 'enable_heuristics', False) and getattr(options, 'dehyphenate', False):
             docanalysis = DocAnalysis('txt', txt)
-            length = docanalysis.line_length(.5)
+            if not length:
+                length = docanalysis.line_length(.5)
             dehyphenator = Dehyphenator(options.verbose, log=self.log)
             txt = dehyphenator(txt,'txt', length)
 

From 92ee46cdb9bc070aa9fa71df2c59bae77855b044 Mon Sep 17 00:00:00 2001
From: John Schember <john@nachtimwald.com>
Date: Sun, 6 Feb 2011 08:35:55 -0500
Subject: [PATCH 2/5] TXT Input: Retain indents with print formatted
 paragraphs. Move remove indents to keep print formatting working.

---
 src/calibre/ebooks/txt/input.py     | 17 +++++++++--------
 src/calibre/ebooks/txt/processor.py |  2 +-
 2 files changed, 10 insertions(+), 9 deletions(-)

diff --git a/src/calibre/ebooks/txt/input.py b/src/calibre/ebooks/txt/input.py
index a07b423ebb..7253596801 100644
--- a/src/calibre/ebooks/txt/input.py
+++ b/src/calibre/ebooks/txt/input.py
@@ -99,14 +99,6 @@ class TXTInput(InputFormatPlugin):
             setattr(options, 'enable_heuristics', True)
             setattr(options, 'unwrap_lines', False)
 
-        if options.txt_in_remove_indents:
-            txt = remove_indents(txt)
-
-        # Preserve spaces will replace multiple spaces to a space
-        # followed by the &nbsp; entity.
-        if options.preserve_spaces:
-            txt = preserve_spaces(txt)
-
         # Reformat paragraphs to block formatting based on the detected type.
         # We don't check for block because the processor assumes block.
         # single and print at transformed to block for processing.
@@ -130,6 +122,15 @@ class TXTInput(InputFormatPlugin):
             dehyphenator = Dehyphenator(options.verbose, log=self.log)
             txt = dehyphenator(txt,'txt', length)
 
+        # User-requested transformations on the text.
+        if options.txt_in_remove_indents:
+            txt = remove_indents(txt)
+
+        # Preserve spaces will replace multiple spaces with a space
+        # followed by the &nbsp; entity.
+        if options.preserve_spaces:
+            txt = preserve_spaces(txt)
+
         # Process the text using the appropriate text processor.
         html = ''
         if options.formatting_type == 'markdown':
diff --git a/src/calibre/ebooks/txt/processor.py b/src/calibre/ebooks/txt/processor.py
index 987d7cdc73..ebbdc9eb07 100644
--- a/src/calibre/ebooks/txt/processor.py
+++ b/src/calibre/ebooks/txt/processor.py
@@ -99,7 +99,7 @@ def separate_paragraphs_single_line(txt):
     return txt
 
 def separate_paragraphs_print_formatted(txt):
-    txt = re.sub(u'(?miu)^(\t+|[ ]{2,})(?=.)', '\n\t', txt)
+    txt = re.sub(u'(?miu)^(?P<indent>\t+|[ ]{2,})(?=.)', lambda mo: '%s\n\t' % mo.group('indent'), txt)
     return txt
 
 def preserve_spaces(txt):

From 2d4fc57ddcacb7b04d2ab4d89a1c08125d8e5df8 Mon Sep 17 00:00:00 2001
From: John Schember <john@nachtimwald.com>
Date: Sun, 6 Feb 2011 08:45:50 -0500
Subject: [PATCH 3/5] TXT Input: convert_basic changed at some point to require
 single line paragraphs. Add function to turn block formatted paragraphs to
 single so they are processed correctly.

---
 src/calibre/ebooks/txt/input.py     | 5 ++++-
 src/calibre/ebooks/txt/processor.py | 6 +++++-
 2 files changed, 9 insertions(+), 2 deletions(-)

diff --git a/src/calibre/ebooks/txt/input.py b/src/calibre/ebooks/txt/input.py
index 7253596801..60adf4bd7a 100644
--- a/src/calibre/ebooks/txt/input.py
+++ b/src/calibre/ebooks/txt/input.py
@@ -12,7 +12,7 @@ from calibre.ebooks.chardet import detect
 from calibre.ebooks.txt.processor import convert_basic, convert_markdown, \
     separate_paragraphs_single_line, separate_paragraphs_print_formatted, \
     preserve_spaces, detect_paragraph_type, detect_formatting_type, \
-    normalize_line_endings, convert_textile, remove_indents
+    normalize_line_endings, convert_textile, remove_indents, block_to_single_line
 from calibre import _ent_pat, xml_entity_to_unicode
 
 class TXTInput(InputFormatPlugin):
@@ -106,6 +106,7 @@ class TXTInput(InputFormatPlugin):
             txt = separate_paragraphs_single_line(txt)
         elif options.paragraph_type == 'print':
             txt = separate_paragraphs_print_formatted(txt)
+            txt = block_to_single_line(txt)
         elif options.paragraph_type == 'unformatted':
             from calibre.ebooks.conversion.utils import HeuristicProcessor
             # unwrap lines based on punctuation
@@ -114,6 +115,8 @@ class TXTInput(InputFormatPlugin):
             preprocessor = HeuristicProcessor(options, log=getattr(self, 'log', None))
             txt = preprocessor.punctuation_unwrap(length, txt, 'txt')
             txt = separate_paragraphs_single_line(txt)
+        else:
+            txt = block_to_single_line(txt)
 
         if getattr(options, 'enable_heuristics', False) and getattr(options, 'dehyphenate', False):
             docanalysis = DocAnalysis('txt', txt)
diff --git a/src/calibre/ebooks/txt/processor.py b/src/calibre/ebooks/txt/processor.py
index ebbdc9eb07..e4e7772ce7 100644
--- a/src/calibre/ebooks/txt/processor.py
+++ b/src/calibre/ebooks/txt/processor.py
@@ -99,7 +99,11 @@ def separate_paragraphs_single_line(txt):
     return txt
 
 def separate_paragraphs_print_formatted(txt):
-    txt = re.sub(u'(?miu)^(?P<indent>\t+|[ ]{2,})(?=.)', lambda mo: '%s\n\t' % mo.group('indent'), txt)
+    txt = re.sub(u'(?miu)^(?P<indent>\t+|[ ]{2,})(?=.)', lambda mo: '\n%s' % mo.group('indent'), txt)
+    return txt
+
+def block_to_single_line(txt):
+    txt = re.sub(r'(?<=.)\n(?=.)', ' ', txt)
     return txt
 
 def preserve_spaces(txt):

From 0916a9dc348a73f665eb7de6e0cba1f725f5f356 Mon Sep 17 00:00:00 2001
From: John Schember <john@nachtimwald.com>
Date: Sun, 6 Feb 2011 08:47:17 -0500
Subject: [PATCH 4/5] TXT Processing: Document that convert_basic requires single line paragraphs.

---
 src/calibre/ebooks/txt/processor.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/calibre/ebooks/txt/processor.py b/src/calibre/ebooks/txt/processor.py
index e4e7772ce7..b91191e9fe 100644
--- a/src/calibre/ebooks/txt/processor.py
+++ b/src/calibre/ebooks/txt/processor.py
@@ -59,6 +59,9 @@ def split_txt(txt, epub_split_size_kb=0):
     return txt
 
 def convert_basic(txt, title='', epub_split_size_kb=0):
+    '''
+    Requires paragraphs to be in single line format.
+    '''
     txt = clean_txt(txt)
     txt = split_txt(txt, epub_split_size_kb)
 

From b1b4e7bac58881c9970034048247e2bd8c288ce6 Mon Sep 17 00:00:00 2001
From: John Schember <john@nachtimwald.com>
Date: Sun, 6 Feb 2011 10:12:43 -0500
Subject: [PATCH 5/5] TXT Processing: Comments.

---
 src/calibre/ebooks/txt/processor.py | 41 +++++++++++++++++++++++++++--
 1 file changed, 39 insertions(+), 2 deletions(-)

diff --git a/src/calibre/ebooks/txt/processor.py b/src/calibre/ebooks/txt/processor.py
index b91191e9fe..f7b6cce234 100644
--- a/src/calibre/ebooks/txt/processor.py
+++ b/src/calibre/ebooks/txt/processor.py
@@ -18,6 +18,10 @@ from calibre.utils.cleantext import clean_ascii_chars
 HTML_TEMPLATE = u'<html><head><meta http-equiv="Content-Type" content="text/html; charset=utf-8"/><title>%s</title></head><body>\n%s\n</body></html>'
 
 def clean_txt(txt):
+    '''
+    Run transformations on the text to put it into
+    consistent state.
+    '''
     if isbytestring(txt):
         txt = txt.decode('utf-8', 'replace')
     # Strip whitespace from the end of the line. Also replace
@@ -42,6 +46,15 @@ def clean_txt(txt):
     return txt
 
 def split_txt(txt, epub_split_size_kb=0):
+    '''
+    Ensure there are split points for converting
+    to EPUB. A misdetected paragraph type can
+    result in the entire document being one giant
+    paragraph. In this case the EPUB parser will not
+    be able to determine where to split the file
+    to accommodate the EPUB file size limitation
+    and will fail.
+    '''
     #Takes care if there is no point to split
     if epub_split_size_kb > 0:
         if isinstance(txt, unicode):
@@ -60,6 +73,9 @@ def split_txt(txt, epub_split_size_kb=0):
 
 def convert_basic(txt, title='', epub_split_size_kb=0):
     '''
+    Converts plain text to html by putting all paragraphs in
+    <p> tags. It condenses and retains blank lines when necessary.
+    
     Requires paragraphs to be in single line format.
     '''
     txt = clean_txt(txt)
@@ -110,11 +126,17 @@ def block_to_single_line(txt):
     return txt
 
 def preserve_spaces(txt):
+    '''
+    Replaces runs of multiple spaces, and tabs, with &nbsp; entities.
+    '''
     txt = re.sub('(?P<space>[ ]{2,})', lambda mo: ' ' + ('&nbsp;' * (len(mo.group('space')) - 1)), txt)
     txt = txt.replace('\t', '&nbsp;&nbsp;&nbsp;&nbsp;')
     return txt
 
 def remove_indents(txt):
+    '''
+    Remove whitespace at the beginning of each line.
+    '''
     txt = re.sub('(?miu)^\s+', '', txt)
     return txt
 
@@ -125,7 +147,10 @@ def opf_writer(path, opf_name, manifest, spine, mi):
     with open(os.path.join(path, opf_name), 'wb') as opffile:
         opf.render(opffile)
 
-def split_string_separator(txt, size) :
+def split_string_separator(txt, size):
+    '''
+    Splits the text by inserting a blank line after the last period in each chunk of size characters.
+    '''
     if len(txt) > size:
         txt = ''.join([re.sub(u'\.(?P<ends>[^.]*)$', '.\n\n\g<ends>',
             txt[i:i+size], 1) for i in
@@ -134,7 +159,7 @@ def split_string_separator(txt, size) :
 
 def detect_paragraph_type(txt):
     '''
-    Tries to determine the formatting of the document.
+    Tries to determine the paragraph type of the document.
 
     block: Paragraphs are separated by a blank line.
     single: Each line is a paragraph.
@@ -177,6 +202,16 @@ def detect_paragraph_type(txt):
 
 
 def detect_formatting_type(txt):
+    '''
+    Tries to determine the formatting of the document.
+    
+    markdown: Markdown formatting is used.
+    textile: Textile formatting is used.
+    heuristic: When none of the above formatting types are
+               detected heuristic is returned.
+    '''
+    # Keep a count of the number of format-specific objects
+    # that are found in the text.
     markdown_count = 0
     textile_count = 0
 
@@ -200,6 +235,8 @@ def detect_formatting_type(txt):
     # Links
     textile_count += len(re.findall(r'"(?=".*?\()(\(.+?\))*[^\(]+?(\(.+?\))*":[^\s]+', txt))
 
+    # Decide if either markdown or textile is used in the text
+    # based on the number of unique formatting elements found.
     if markdown_count > 5 or textile_count > 5:
         if markdown_count > textile_count:
             return 'markdown'