From ed2b94ac9d98be1ed3564c36071b62e6335ea60d Mon Sep 17 00:00:00 2001
From: John Schember <john@nachtimwald.com>
Date: Sat, 5 Feb 2011 10:46:32 -0500
Subject: [PATCH 1/6] Heuristics: Tweak italicize patterns to make them more
 robust.

---
 src/calibre/ebooks/conversion/utils.py | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)
diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py
index 63eca10714..e8e2a82949 100644
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@@ -149,17 +149,17 @@ class HeuristicProcessor(object):
         ]
 
         ITALICIZE_STYLE_PATS = [
-            r'(?msu)(?<=\s)_(?P<words>\S[^_]{0,40}?\S)?_(?=[\s\.,\!\?])',
-            r'(?msu)(?<=\s)/(?P<words>\S[^/]{0,40}?\S)?/(?=[\s\.,\!\?])',
-            r'(?msu)(?<=\s)~~(?P<words>\S[^~]{0,40}?\S)?~~(?=[\s\.,\!\?])',
-            r'(?msu)(?<=\s)\*(?P<words>\S[^\*]{0,40}?\S)?\*(?=[\s\.,\!\?])',
-            r'(?msu)(?<=\s)~(?P<words>\S[^~]{0,40}?\S)?~(?=[\s\.,\!\?])',
-            r'(?msu)(?<=\s)_/(?P<words>\S[^/_]{0,40}?\S)?/_(?=[\s\.,\!\?])',
-            r'(?msu)(?<=\s)_\*(?P<words>\S[^\*_]{0,40}?\S)?\*_(?=[\s\.,\!\?])',
-            r'(?msu)(?<=\s)\*/(?P<words>\S[^/\*]{0,40}?\S)?/\*(?=[\s\.,\!\?])',
-            r'(?msu)(?<=\s)_\*/(?P<words>\S[^\*_]{0,40}?\S)?/\*_(?=[\s\.,\!\?])',
-            r'(?msu)(?<=\s)/:(?P<words>\S[^:/]{0,40}?\S)?:/(?=[\s\.,\!\?])',
-            r'(?msu)(?<=\s)\|:(?P<words>\S[^:\|]{0,40}?\S)?:\|(?=[\s\.,\!\?])',
+            r'(?msu)(?<=[\s>"])_(?P<words>[^_]+)?_',
+            r'(?msu)(?<=[\s>"])/(?P<words>[^/]+)?/',
+            r'(?msu)(?<=[\s>"])~~(?P<words>[^~]+)?~~',
+            r'(?msu)(?<=[\s>"])\*(?P<words>[^\*]+)?\*',
+            r'(?msu)(?<=[\s>"])~(?P<words>[^~]+)?~',
+            r'(?msu)(?<=[\s>"])_/(?P<words>[^/_]+)?/_',
+            r'(?msu)(?<=[\s>"])_\*(?P<words>[^\*_]+)?\*_',
+            r'(?msu)(?<=[\s>"])\*/(?P<words>[^/\*]+)?/\*',
+            r'(?msu)(?<=[\s>"])_\*/(?P<words>[^\*_]+)?/\*_',
+            r'(?msu)(?<=[\s>"])/:(?P<words>[^:/]+)?:/',
+            r'(?msu)(?<=[\s>"])\|:(?P<words>[^:\|]+)?:\|',
         ]
 
         for word in ITALICIZE_WORDS:

From deee20d8f85010059019acfdfd4d6c719711ec73 Mon Sep 17 00:00:00 2001
From: John Schember <john@nachtimwald.com>
Date: Sat, 5 Feb 2011 11:02:00 -0500
Subject: [PATCH 2/6] TXT Output: Fix inline toc not showing all items.

---
 src/calibre/ebooks/txt/txtml.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/calibre/ebooks/txt/txtml.py b/src/calibre/ebooks/txt/txtml.py
index c2ee3f37c5..fa7bfbb380 100644
--- a/src/calibre/ebooks/txt/txtml.py
+++ b/src/calibre/ebooks/txt/txtml.py
@@ -55,6 +55,7 @@ class TXTMLizer(object):
         self.log.info('Converting XHTML to TXT...')
         self.oeb_book = oeb_book
         self.opts = opts
+        self.toc_titles = []
         self.toc_ids = []
         self.last_was_heading = False
         
@@ -94,8 +95,8 @@ class TXTMLizer(object):
         if getattr(self.opts, 'inline_toc', None):
             self.log.debug('Generating table of contents...')
             toc.append(u'%s\n\n' % _(u'Table of Contents:'))
-            for item in self.oeb_book.toc:
-                toc.append(u'* %s\n\n' % item.title)
+            for item in self.toc_titles:
+                toc.append(u'* %s\n\n' % item)
         return ''.join(toc)
 
     def create_flat_toc(self, nodes):
@@ -103,6 +104,7 @@ class TXTMLizer(object):
         Turns a hierarchical list of TOC href's into a flat list.
         '''
         for item in nodes:
+            self.toc_titles.append(item.title)
             self.toc_ids.append(item.href)
             self.create_flat_toc(item.nodes)
 

From 9cdad92468b25f289f5531be56be0ec0ee32e01d Mon Sep 17 00:00:00 2001
From: John Schember <john@nachtimwald.com>
Date: Sat, 5 Feb 2011 12:51:47 -0500
Subject: [PATCH 3/6] TXT Input: Restructure to run dehyphenator when auto and
 heuristic formatting options are used. This causes textile and markdown to be
 dehyphenated.

---
 src/calibre/ebooks/txt/input.py | 47 ++++++++++++++++-----------------
 1 file changed, 23 insertions(+), 24 deletions(-)

diff --git a/src/calibre/ebooks/txt/input.py b/src/calibre/ebooks/txt/input.py
index e1392ef732..85bd781ff8 100644
--- a/src/calibre/ebooks/txt/input.py
+++ b/src/calibre/ebooks/txt/input.py
@@ -77,20 +77,6 @@ class TXTInput(InputFormatPlugin):
         # Normalize line endings
         txt = normalize_line_endings(txt)
 
-        # Detect formatting
-        if options.formatting_type == 'auto':
-            options.formatting_type = detect_formatting_type(txt)
-            log.debug('Auto detected formatting as %s' % options.formatting_type)
-
-        if options.formatting_type == 'heuristic':
-            setattr(options, 'enable_heuristics', True)
-            setattr(options, 'markup_chapter_headings', True)
-            setattr(options, 'italicize_common_cases', True)
-            setattr(options, 'fix_indents', True)
-            setattr(options, 'delete_blank_paragraphs', True)
-            setattr(options, 'format_scene_breaks', True)
-            setattr(options, 'dehyphenate', True)
-
         # Determine the paragraph type of the document.
         if options.paragraph_type == 'auto':
             options.paragraph_type = detect_paragraph_type(txt)
@@ -99,16 +85,27 @@ class TXTInput(InputFormatPlugin):
                 options.paragraph_type = 'block'
             else:
                 log.debug('Auto detected paragraph type as %s' % options.paragraph_type)
+                
+        dehyphenate = False
+        if options.formatting_type in ('auto', 'heuristic'):
+            # Set this here because we want it to run over all
+            # formatting types if auto is used.
+            dehyphenate = True
+
+        # Detect formatting
+        if options.formatting_type == 'auto':
+            options.formatting_type = detect_formatting_type(txt)
+            log.debug('Auto detected formatting as %s' % options.formatting_type)
+
+        if options.formatting_type == 'heuristic':
+            setattr(options, 'enable_heuristics', True)
+            setattr(options, 'unwrap_lines', False)
 
         # Preserve spaces will replace multiple spaces to a space
         # followed by the &nbsp; entity.
         if options.preserve_spaces:
             txt = preserve_spaces(txt)
 
-        # Get length for hyphen removal and punctuation unwrap
-        docanalysis = DocAnalysis('txt', txt)
-        length = docanalysis.line_length(.5)
-
         # Reformat paragraphs to block formatting based on the detected type.
         # We don't check for block because the processor assumes block.
         # single and print at transformed to block for processing.
@@ -119,9 +116,17 @@ class TXTInput(InputFormatPlugin):
         elif options.paragraph_type == 'unformatted':
             from calibre.ebooks.conversion.utils import HeuristicProcessor
             # unwrap lines based on punctuation
+            docanalysis = DocAnalysis('txt', txt)
+            length = docanalysis.line_length(.5)
             preprocessor = HeuristicProcessor(options, log=getattr(self, 'log', None))
             txt = preprocessor.punctuation_unwrap(length, txt, 'txt')
 
+        if dehyphenate:
+            docanalysis = DocAnalysis('txt', txt)
+            length = docanalysis.line_length(.5)
+            dehyphenator = Dehyphenator(options.verbose, log=self.log)
+            txt = dehyphenator(txt,'txt', length)
+
         # Process the text using the appropriate text processor.
         html = ''
         if options.formatting_type == 'markdown':
@@ -134,14 +139,8 @@ class TXTInput(InputFormatPlugin):
         elif options.formatting_type == 'textile':
             log.debug('Running text through textile conversion...')
             html = convert_textile(txt)
-
         else:
             log.debug('Running text through basic conversion...')
-            if options.formatting_type == 'heuristic':
-                # Dehyphenate
-                dehyphenator = Dehyphenator(options.verbose, log=self.log)
-                txt = dehyphenator(txt,'txt', length)
-
             flow_size = getattr(options, 'flow_size', 0)
             html = convert_basic(txt, epub_split_size_kb=flow_size)
 

From 2796960f420cf26ad621f137845e6db84bc3019d Mon Sep 17 00:00:00 2001
From: John Schember <john@nachtimwald.com>
Date: Sat, 5 Feb 2011 13:04:32 -0500
Subject: [PATCH 4/6] Heuristics: Fix issue with invalid markup from italicize
 patterns.

---
 src/calibre/ebooks/conversion/utils.py | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py
index e8e2a82949..c0c2ee8978 100644
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@@ -149,17 +149,17 @@ class HeuristicProcessor(object):
         ]
 
         ITALICIZE_STYLE_PATS = [
-            r'(?msu)(?<=[\s>"])_(?P<words>[^_]+)?_',
-            r'(?msu)(?<=[\s>"])/(?P<words>[^/]+)?/',
-            r'(?msu)(?<=[\s>"])~~(?P<words>[^~]+)?~~',
-            r'(?msu)(?<=[\s>"])\*(?P<words>[^\*]+)?\*',
-            r'(?msu)(?<=[\s>"])~(?P<words>[^~]+)?~',
-            r'(?msu)(?<=[\s>"])_/(?P<words>[^/_]+)?/_',
-            r'(?msu)(?<=[\s>"])_\*(?P<words>[^\*_]+)?\*_',
-            r'(?msu)(?<=[\s>"])\*/(?P<words>[^/\*]+)?/\*',
-            r'(?msu)(?<=[\s>"])_\*/(?P<words>[^\*_]+)?/\*_',
-            r'(?msu)(?<=[\s>"])/:(?P<words>[^:/]+)?:/',
-            r'(?msu)(?<=[\s>"])\|:(?P<words>[^:\|]+)?:\|',
+            r'(?msu)(?<=[\s>])_(?P<words>[^_]+)?_',
+            r'(?msu)(?<=[\s>])/(?P<words>[^/]+)?/',
+            r'(?msu)(?<=[\s>])~~(?P<words>[^~]+)?~~',
+            r'(?msu)(?<=[\s>])\*(?P<words>[^\*]+)?\*',
+            r'(?msu)(?<=[\s>])~(?P<words>[^~]+)?~',
+            r'(?msu)(?<=[\s>])_/(?P<words>[^/_]+)?/_',
+            r'(?msu)(?<=[\s>])_\*(?P<words>[^\*_]+)?\*_',
+            r'(?msu)(?<=[\s>])\*/(?P<words>[^/\*]+)?/\*',
+            r'(?msu)(?<=[\s>])_\*/(?P<words>[^\*_]+)?/\*_',
+            r'(?msu)(?<=[\s>])/:(?P<words>[^:/]+)?:/',
+            r'(?msu)(?<=[\s>])\|:(?P<words>[^:\|]+)?:\|',
         ]
 
         for word in ITALICIZE_WORDS:

From 1f708746d0e6e62234f9a2a9a96cd5bcf73bfc94 Mon Sep 17 00:00:00 2001
From: John Schember <john@nachtimwald.com>
Date: Sat, 5 Feb 2011 13:35:41 -0500
Subject: [PATCH 5/6] TXT Input: Fix bug where spaces were not retained
 properly. Fix bug where spaces were replaced with entities (this should only
 have happened at the beginning of lines). Add option to remove indents.

---
 src/calibre/ebooks/txt/input.py     |  8 +++++++-
 src/calibre/ebooks/txt/processor.py | 12 ++++++++----
 2 files changed, 15 insertions(+), 5 deletions(-)

diff --git a/src/calibre/ebooks/txt/input.py b/src/calibre/ebooks/txt/input.py
index 85bd781ff8..b1374bbeec 100644
--- a/src/calibre/ebooks/txt/input.py
+++ b/src/calibre/ebooks/txt/input.py
@@ -12,7 +12,7 @@ from calibre.ebooks.chardet import detect
 from calibre.ebooks.txt.processor import convert_basic, convert_markdown, \
     separate_paragraphs_single_line, separate_paragraphs_print_formatted, \
     preserve_spaces, detect_paragraph_type, detect_formatting_type, \
-    normalize_line_endings, convert_textile
+    normalize_line_endings, convert_textile, remove_indents
 from calibre import _ent_pat, xml_entity_to_unicode
 
 class TXTInput(InputFormatPlugin):
@@ -47,6 +47,9 @@ class TXTInput(InputFormatPlugin):
         OptionRecommendation(name='preserve_spaces', recommended_value=False,
             help=_('Normally extra spaces are condensed into a single space. '
                 'With this option all spaces will be displayed.')),
+        OptionRecommendation(name='txt_in_remove_indents', recommended_value=False,
+            help=_('Normally extra space at the beginning of lines is retained. '
+                   'With this option they will be removed.')),
         OptionRecommendation(name="markdown_disable_toc", recommended_value=False,
             help=_('Do not insert a Table of Contents into the output text.')),
     ])
@@ -101,6 +104,9 @@ class TXTInput(InputFormatPlugin):
             setattr(options, 'enable_heuristics', True)
             setattr(options, 'unwrap_lines', False)
 
+        if options.txt_in_remove_indents:
+            txt = remove_indents(txt)
+
         # Preserve spaces will replace multiple spaces to a space
         # followed by the &nbsp; entity.
         if options.preserve_spaces:
diff --git a/src/calibre/ebooks/txt/processor.py b/src/calibre/ebooks/txt/processor.py
index 546d3f1842..987d7cdc73 100644
--- a/src/calibre/ebooks/txt/processor.py
+++ b/src/calibre/ebooks/txt/processor.py
@@ -24,14 +24,14 @@ def clean_txt(txt):
     # all line breaks with \n.
     txt = '\n'.join([line.rstrip() for line in txt.splitlines()])
     
-    # Replace whitespace at the beginning of the list with &nbsp;
-    txt = re.sub('(?m)(?P<space>[ ]+)', lambda mo: '&nbsp;' * mo.groups('space').count(' '), txt)
-    txt = re.sub('(?m)(?P<space>[\t]+)', lambda mo: '&nbsp;' * 4 * mo.groups('space').count('\t'), txt)
+    # Replace whitespace at the beginning of the line with &nbsp;
+    txt = re.sub('(?m)(?P<space>^[ ]+)(?=.)', lambda mo: '&nbsp;' * mo.groups('space').count(' '), txt)
+    txt = re.sub('(?m)(?P<space>^[\t]+)(?=.)', lambda mo: '&nbsp;' * 4 * mo.groups('space').count('\t'), txt)
 
     # Condense redundant spaces
     txt = re.sub('[ ]{2,}', ' ', txt)
 
-    # Remove blank lines from the beginning and end of the document.
+    # Remove blank space from the beginning and end of the document.
     txt = re.sub('^\s+(?=.)', '', txt)
     txt = re.sub('(?<=.)\s+$', '', txt)
     # Remove excessive line breaks.
@@ -107,6 +107,10 @@ def preserve_spaces(txt):
     txt = txt.replace('\t', '&nbsp;&nbsp;&nbsp;&nbsp;')
     return txt
 
+def remove_indents(txt):
+    # Strip each line's leading non-newline whitespace; \s would also eat blank lines.
+    return re.sub(r'(?mu)^[^\S\n]+', '', txt)
+
 def opf_writer(path, opf_name, manifest, spine, mi):
     opf = OPFCreator(path, mi)
     opf.create_manifest(manifest)

From a0fd28d9660f56ed2a37abe7b185c53f65e1dff7 Mon Sep 17 00:00:00 2001
From: John Schember <john@nachtimwald.com>
Date: Sat, 5 Feb 2011 13:46:15 -0500
Subject: [PATCH 6/6] TXT Input GUI: Add remove indents option. Restructure
 options so they are grouped more cleanly.

---
 src/calibre/gui2/convert/txt_input.py |   3 +-
 src/calibre/gui2/convert/txt_input.ui | 127 ++++++++++++++++++--------
 2 files changed, 91 insertions(+), 39 deletions(-)

diff --git a/src/calibre/gui2/convert/txt_input.py b/src/calibre/gui2/convert/txt_input.py
index 62672cc0f9..acdf5f43c0 100644
--- a/src/calibre/gui2/convert/txt_input.py
+++ b/src/calibre/gui2/convert/txt_input.py
@@ -16,7 +16,8 @@ class PluginWidget(Widget, Ui_Form):
 
     def __init__(self, parent, get_option, get_help, db=None, book_id=None):
         Widget.__init__(self, parent,
-            ['paragraph_type', 'formatting_type', 'markdown_disable_toc', 'preserve_spaces'])
+            ['paragraph_type', 'formatting_type', 'markdown_disable_toc',
+             'preserve_spaces', 'txt_in_remove_indents'])
         self.db, self.book_id = db, book_id
         for x in get_option('paragraph_type').option.choices:
             self.opt_paragraph_type.addItem(x)
diff --git a/src/calibre/gui2/convert/txt_input.ui b/src/calibre/gui2/convert/txt_input.ui
index 6cbd68135f..211b03294a 100644
--- a/src/calibre/gui2/convert/txt_input.ui
+++ b/src/calibre/gui2/convert/txt_input.ui
@@ -7,57 +7,95 @@
     <x>0</x>
     <y>0</y>
     <width>518</width>
-    <height>300</height>
+    <height>353</height>
    </rect>
   </property>
   <property name="windowTitle">
    <string>Form</string>
   </property>
-  <layout class="QGridLayout" name="gridLayout">
-   <item row="0" column="0">
-    <widget class="QLabel" name="label_2">
-     <property name="text">
-      <string>Paragraph style:</string>
+  <layout class="QVBoxLayout" name="verticalLayout_3">
+   <item>
+    <widget class="QGroupBox" name="groupBox_3">
+     <property name="title">
+      <string>Structure</string>
      </property>
+     <layout class="QGridLayout" name="gridLayout">
+      <item row="0" column="0">
+       <widget class="QLabel" name="label_2">
+        <property name="sizePolicy">
+         <sizepolicy hsizetype="Minimum" vsizetype="Preferred">
+          <horstretch>0</horstretch>
+          <verstretch>0</verstretch>
+         </sizepolicy>
+        </property>
+        <property name="text">
+         <string>Paragraph style:</string>
+        </property>
+       </widget>
+      </item>
+      <item row="0" column="1">
+       <widget class="QComboBox" name="opt_paragraph_type">
+        <property name="sizePolicy">
+         <sizepolicy hsizetype="Expanding" vsizetype="Fixed">
+          <horstretch>0</horstretch>
+          <verstretch>0</verstretch>
+         </sizepolicy>
+        </property>
+       </widget>
+      </item>
+      <item row="1" column="0">
+       <widget class="QLabel" name="label_3">
+        <property name="sizePolicy">
+         <sizepolicy hsizetype="Minimum" vsizetype="Preferred">
+          <horstretch>0</horstretch>
+          <verstretch>0</verstretch>
+         </sizepolicy>
+        </property>
+        <property name="text">
+         <string>Formatting style:</string>
+        </property>
+       </widget>
+      </item>
+      <item row="1" column="1">
+       <widget class="QComboBox" name="opt_formatting_type">
+        <property name="sizePolicy">
+         <sizepolicy hsizetype="Expanding" vsizetype="Fixed">
+          <horstretch>0</horstretch>
+          <verstretch>0</verstretch>
+         </sizepolicy>
+        </property>
+       </widget>
+      </item>
+     </layout>
     </widget>
    </item>
-   <item row="0" column="1">
-    <widget class="QComboBox" name="opt_paragraph_type"/>
-   </item>
-   <item row="5" column="0" colspan="2">
-    <widget class="QCheckBox" name="opt_preserve_spaces">
-     <property name="text">
-      <string>Preserve &amp;spaces</string>
+   <item>
+    <widget class="QGroupBox" name="groupBox_2">
+     <property name="title">
+      <string>Common</string>
      </property>
+     <layout class="QVBoxLayout" name="verticalLayout_2">
+      <item>
+       <widget class="QCheckBox" name="opt_preserve_spaces">
+        <property name="text">
+         <string>Preserve &amp;spaces</string>
+        </property>
+       </widget>
+      </item>
+      <item>
+       <widget class="QCheckBox" name="opt_txt_in_remove_indents">
+        <property name="text">
+         <string>Remove indents at the beginning of lines</string>
+        </property>
+       </widget>
+      </item>
+     </layout>
     </widget>
    </item>
-   <item row="6" column="0" colspan="2">
-    <spacer name="verticalSpacer">
-     <property name="orientation">
-      <enum>Qt::Vertical</enum>
-     </property>
-     <property name="sizeHint" stdset="0">
-      <size>
-       <width>20</width>
-       <height>213</height>
-      </size>
-     </property>
-    </spacer>
-   </item>
-   <item row="1" column="1">
-    <widget class="QComboBox" name="opt_formatting_type"/>
-   </item>
-   <item row="1" column="0">
-    <widget class="QLabel" name="label_3">
-     <property name="text">
-      <string>Formatting style:</string>
-     </property>
-    </widget>
-   </item>
-   <item row="2" column="0" rowspan="2" colspan="2">
+   <item>
     <widget class="QGroupBox" name="groupBox">
      <property name="title">
-      <string>Markdown Options</string>
+      <string>Markdown</string>
      </property>
      <layout class="QVBoxLayout" name="verticalLayout">
       <item>
@@ -83,6 +121,19 @@
      </layout>
     </widget>
    </item>
+   <item>
+    <spacer name="verticalSpacer">
+     <property name="orientation">
+      <enum>Qt::Vertical</enum>
+     </property>
+     <property name="sizeHint" stdset="0">
+      <size>
+       <width>20</width>
+       <height>213</height>
+      </size>
+     </property>
+    </spacer>
+   </item>
   </layout>
  </widget>
  <resources/>