From b0a9c9659cda37d6cda41b22cd765713fb29f308 Mon Sep 17 00:00:00 2001
From: John Schember <john@nachtimwald.com>
Date: Thu, 13 Jan 2011 19:58:09 -0500
Subject: [PATCH 01/54] Add heuristic options. Remove options that they
 replace.

---
 src/calibre/ebooks/conversion/cli.py     |  23 ++++-
 src/calibre/ebooks/conversion/plumber.py | 117 ++++++++++++++---------
 2 files changed, 92 insertions(+), 48 deletions(-)

diff --git a/src/calibre/ebooks/conversion/cli.py b/src/calibre/ebooks/conversion/cli.py
index 3178fe1b43..f825776c9c 100644
--- a/src/calibre/ebooks/conversion/cli.py
+++ b/src/calibre/ebooks/conversion/cli.py
@@ -126,8 +126,21 @@ def add_pipeline_options(parser, plumber):
                       'margin_top', 'margin_left', 'margin_right',
                       'margin_bottom', 'change_justification',
                       'insert_blank_line', 'remove_paragraph_spacing','remove_paragraph_spacing_indent_size',
-                      'asciiize', 'remove_header', 'header_regex',
-                      'remove_footer', 'footer_regex',
+                      'asciiize',
+                  ]
+                  ),
+                  
+              'HEURISTICS' : (
+                  _('Modify the document text and strucutre using common patterns.'),
+                  [
+                      'enable_heuristics', 'markup_chapter_headings',
+                      'italicize_common_cases', 'fix_indents',
+                      'html_unwrap_factor', 'unwrap_lines',
+                      'delete_blank_paragraphs', 'format_scene_breaks',
+                      'dehyphenate',
+                      'sr1_search', 'sr1_replace',
+                      'sr2_search', 'sr2_replace',
+                      'sr3_search', 'sr3_replace',
                   ]
                   ),
 
@@ -137,7 +150,6 @@ def add_pipeline_options(parser, plumber):
                       'chapter', 'chapter_mark',
                       'prefer_metadata_cover', 'remove_first_image',
                       'insert_metadata', 'page_breaks_before',
-                      'preprocess_html', 'html_unwrap_factor',
                   ]
                   ),
 
@@ -164,8 +176,9 @@ def add_pipeline_options(parser, plumber):
 
               }
 
-    group_order = ['', 'LOOK AND FEEL', 'STRUCTURE DETECTION',
-            'TABLE OF CONTENTS', 'METADATA', 'DEBUG']
+    group_order = ['', 'LOOK AND FEEL', 'HEURISTICS',
+            'STRUCTURE DETECTION', 'TABLE OF CONTENTS',
+            'METADATA', 'DEBUG']
 
     for group in group_order:
         desc, options = groups[group]
diff --git a/src/calibre/ebooks/conversion/plumber.py b/src/calibre/ebooks/conversion/plumber.py
index 9b22fb46ec..3ec4e104f9 100644
--- a/src/calibre/ebooks/conversion/plumber.py
+++ b/src/calibre/ebooks/conversion/plumber.py
@@ -376,23 +376,6 @@ OptionRecommendation(name='insert_metadata',
             )
         ),
 
-OptionRecommendation(name='preprocess_html',
-        recommended_value=False, level=OptionRecommendation.LOW,
-        help=_('Attempt to detect and correct hard line breaks and other '
-            'problems in the source file. This may make things worse, so use '
-            'with care.'
-            )
-        ),
-
-OptionRecommendation(name='html_unwrap_factor',
-        recommended_value=0.40, level=OptionRecommendation.LOW,
-        help=_('Scale used to determine the length at which a line should '
-            'be unwrapped if preprocess is enabled. Valid values are a decimal between 0 and 1. The '
-            'default is 0.40, just below the median line length. This will unwrap typical books '
-            ' with hard line breaks, but should be reduced if the line length is variable.'
-            )
-        ),
-
 OptionRecommendation(name='smarten_punctuation',
         recommended_value=False, level=OptionRecommendation.LOW,
         help=_('Convert plain quotes, dashes and ellipsis to their '
@@ -401,32 +384,6 @@ OptionRecommendation(name='smarten_punctuation',
             )
         ),
 
-OptionRecommendation(name='remove_header',
-        recommended_value=False, level=OptionRecommendation.LOW,
-        help=_('Use a regular expression to try and remove the header.'
-            )
-        ),
-
-OptionRecommendation(name='header_regex',
-        recommended_value='(?i)(?<=<hr>)((\s*<a name=\d+></a>((<img.+?>)*<br>\s*)?\d+<br>\s*.*?\s*)|(\s*<a name=\d+></a>((<img.+?>)*<br>\s*)?.*?<br>\s*\d+))(?=<br>)',
-        level=OptionRecommendation.LOW,
-        help=_('The regular expression to use to remove the header.'
-            )
-        ),
-
-OptionRecommendation(name='remove_footer',
-        recommended_value=False, level=OptionRecommendation.LOW,
-        help=_('Use a regular expression to try and remove the footer.'
-            )
-        ),
-
-OptionRecommendation(name='footer_regex',
-        recommended_value='(?i)(?<=<hr>)((\s*<a name=\d+></a>((<img.+?>)*<br>\s*)?\d+<br>\s*.*?\s*)|(\s*<a name=\d+></a>((<img.+?>)*<br>\s*)?.*?<br>\s*\d+))(?=<br>)',
-        level=OptionRecommendation.LOW,
-        help=_('The regular expression to use to remove the footer.'
-            )
-        ),
-
 OptionRecommendation(name='read_metadata_from_opf',
             recommended_value=None, level=OptionRecommendation.LOW,
             short_switch='m',
@@ -526,7 +483,81 @@ OptionRecommendation(name='pubdate',
 OptionRecommendation(name='timestamp',
     recommended_value=None, level=OptionRecommendation.LOW,
     help=_('Set the book timestamp (used by the date column in calibre).')),
+    
+OptionRecommendation(name='enable_heuristics',
+    recommended_value=False, level=OptionRecommendation.LOW,
+    help=_('Enable heurisic processing. This option must be set for any '
+           'heuristic processing to take place.')),
 
+OptionRecommendation(name='markup_chapter_headings',
+    recommended_value=False, level=OptionRecommendation.LOW,
+    help=_('Detect chapter headings and sub headings. Change ' 
+           'them to h1 and h2 tags.')),
+           
+OptionRecommendation(name='italicize_common_cases',
+    recommended_value=False, level=OptionRecommendation.LOW,
+    help=_('Look for common words and patterns that denote '
+           'italics and italicize them.')),
+           
+OptionRecommendation(name='fix_indents',
+    recommended_value=True, level=OptionRecommendation.LOW,
+    help=_('Turn indentation created from multiple &nbsp; entities '
+           'into CSS indents.')),
+           
+OptionRecommendation(name='html_unwrap_factor',
+    recommended_value=0.40, level=OptionRecommendation.LOW,
+    help=_('Scale used to determine the length at which a line should '
+            'be unwrapped. Valid values are a decimal between 0 and 1. The '
+            'default is 0.4, just below the median line length.')),
+            
+OptionRecommendation(name='unwrap_lines',
+    recommended_value=False, level=OptionRecommendation.LOW,
+    help=_('Unwrap lines.')),
+    
+OptionRecommendation(name='delete_blank_paragraphs',
+    recommended_value=True, level=OptionRecommendation.LOW,
+    help=_('Remove empyt paragraphs from the document')),
+    
+OptionRecommendation(name='format_scene_breaks',
+    recommended_value=False, level=OptionRecommendation.LOW,
+    help=_('Replace soft scene breaks that use multiple blank lines '
+           'with horizontal rules.')),
+
+OptionRecommendation(name='dehyphenate',
+    recommended_value=True, level=OptionRecommendation.LOW,
+    help=_('Combine words that are separated by a hyphen. '
+           'This is for cases where a word is hyphenated across '
+           'two lines to denote the characters from a single word.')),
+    
+OptionRecommendation(name='sr1_search',
+    recommended_value='', level=OptionRecommendation.LOW,
+    help=_('Search pattern (regular expression) to be replaced with '
+           'sr1-replace.')),
+    
+OptionRecommendation(name='sr1_replace',
+    recommended_value='', level=OptionRecommendation.LOW,
+    help=_('Replace characters (can be lambda expression) to '
+           'replace the text found with sr1-search.')),
+
+OptionRecommendation(name='sr2_search',
+    recommended_value='', level=OptionRecommendation.LOW,
+    help=_('Search pattern (regular expression) to be replaced with '
+           'sr2-replace.')),
+
+OptionRecommendation(name='sr2_replace',
+    recommended_value='', level=OptionRecommendation.LOW,
+    help=_('Replace characters (can be lambda expression) to '
+           'replace the text found with sr2-search.')),
+
+OptionRecommendation(name='sr3_search',
+    recommended_value='', level=OptionRecommendation.LOW,
+    help=_('Search pattern (regular expression) to be replaced with '
+           'sr3-replace.')),
+
+OptionRecommendation(name='sr3_replace',
+    recommended_value='', level=OptionRecommendation.LOW,
+    help=_('Replace characters (can be lambda expression) to '
+           'replace the text found with sr3-search.')),
 ]
         # }}}
 

From 8676ddd30fba0df90eb62e7c1c84c3fd3dc13f39 Mon Sep 17 00:00:00 2001
From: ldolse <ldolse@yahoo.com>
Date: Fri, 14 Jan 2011 18:12:17 +0800
Subject: [PATCH 02/54] updated heuristics help messages

---
 src/calibre/ebooks/conversion/plumber.py | 24 ++++++++++++++----------
 1 file changed, 14 insertions(+), 10 deletions(-)

diff --git a/src/calibre/ebooks/conversion/plumber.py b/src/calibre/ebooks/conversion/plumber.py
index 3ec4e104f9..50d0646c7d 100644
--- a/src/calibre/ebooks/conversion/plumber.py
+++ b/src/calibre/ebooks/conversion/plumber.py
@@ -491,8 +491,8 @@ OptionRecommendation(name='enable_heuristics',
 
 OptionRecommendation(name='markup_chapter_headings',
     recommended_value=False, level=OptionRecommendation.LOW,
-    help=_('Detect chapter headings and sub headings. Change ' 
-           'them to h1 and h2 tags.')),
+    help=_('Detect unformatted chapter headings and sub headings. Change ' 
+           'them to h2 and h3 tags.')),
            
 OptionRecommendation(name='italicize_common_cases',
     recommended_value=False, level=OptionRecommendation.LOW,
@@ -508,26 +508,30 @@ OptionRecommendation(name='html_unwrap_factor',
     recommended_value=0.40, level=OptionRecommendation.LOW,
     help=_('Scale used to determine the length at which a line should '
             'be unwrapped. Valid values are a decimal between 0 and 1. The '
-            'default is 0.4, just below the median line length.')),
+            'default is 0.4, just below the median line length.  If only a '
+            'few lines in the document require unwrapping this value should '
+            'be reduced')),
             
 OptionRecommendation(name='unwrap_lines',
     recommended_value=False, level=OptionRecommendation.LOW,
-    help=_('Unwrap lines.')),
+    help=_('Unwrap lines using punctuation and other formatting clues.')),
     
 OptionRecommendation(name='delete_blank_paragraphs',
     recommended_value=True, level=OptionRecommendation.LOW,
-    help=_('Remove empyt paragraphs from the document')),
+    help=_('Remove empty paragraphs from the document when they exist between '
+           'every other paragraph')),
     
 OptionRecommendation(name='format_scene_breaks',
     recommended_value=False, level=OptionRecommendation.LOW,
-    help=_('Replace soft scene breaks that use multiple blank lines '
-           'with horizontal rules.')),
+    help=_('Detects left aligned scene break markers and center aligns them. '
+           'Replace soft scene breaks that use multiple blank lines with'
+           'horizontal rules.')),
 
 OptionRecommendation(name='dehyphenate',
     recommended_value=True, level=OptionRecommendation.LOW,
-    help=_('Combine words that are separated by a hyphen. '
-           'This is for cases where a word is hyphenated across '
-           'two lines to denote the characters from a single word.')),
+    help=_('Analyses hyphenated words throughout the document.  The '
+           'document itself is used as a dictionary to determine whether hyphens '
+           'should be retained or removed.')),
     
 OptionRecommendation(name='sr1_search',
     recommended_value='', level=OptionRecommendation.LOW,

From 90c978bb1076e7afb7843df96959cc365d17332d Mon Sep 17 00:00:00 2001
From: ldolse <ldolse@yahoo.com>
Date: Fri, 14 Jan 2011 21:33:47 +0800
Subject: [PATCH 03/54] tied enable heuristics to preprocess, moved various
 pieces to functions

---
 src/calibre/customize/conversion.py      |   2 +-
 src/calibre/ebooks/conversion/plumber.py |   4 +-
 src/calibre/ebooks/conversion/utils.py   | 117 ++++++++++++++---------
 src/calibre/ebooks/lit/input.py          |   2 +-
 4 files changed, 78 insertions(+), 47 deletions(-)

diff --git a/src/calibre/customize/conversion.py b/src/calibre/customize/conversion.py
index ec83600a49..a9e573ffa0 100644
--- a/src/calibre/customize/conversion.py
+++ b/src/calibre/customize/conversion.py
@@ -160,7 +160,7 @@ class InputFormatPlugin(Plugin):
         '''
         raise NotImplementedError()
 
-    def preprocess_html(self, opts, html):
+    def heuristics(self, opts, html):
         '''
         This method is called by the conversion pipeline on all HTML before it
         is parsed. It is meant to be used to do any required preprocessing on
diff --git a/src/calibre/ebooks/conversion/plumber.py b/src/calibre/ebooks/conversion/plumber.py
index 50d0646c7d..a40c17a743 100644
--- a/src/calibre/ebooks/conversion/plumber.py
+++ b/src/calibre/ebooks/conversion/plumber.py
@@ -1012,8 +1012,8 @@ def create_oebbook(log, path_or_stream, opts, input_plugin, reader=None,
     Create an OEBBook.
     '''
     from calibre.ebooks.oeb.base import OEBBook
-    html_preprocessor = HTMLPreProcessor(input_plugin.preprocess_html,
-            opts.preprocess_html, opts)
+    html_preprocessor = HTMLPreProcessor(input_plugin.heuristics,
+            opts.enable_heuristics, opts)
     if not encoding:
         encoding = None
     oeb = OEBBook(log, html_preprocessor,
diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py
index dac93fa2e2..44d4235b6c 100644
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@@ -113,6 +113,11 @@ class PreProcessor(object):
         return wordcount.words
 
     def markup_chapters(self, html, wordcount, blanks_between_paragraphs):
+        '''
+        Searches for common chapter headings throughout the document
+        attempts multiple patterns based on likelihood of a match
+        with minimum false positives.  Exits after finding a successful pattern
+        '''
         # Typical chapters are between 2000 and 7000 words, use the larger number to decide the
         # minimum of chapters to search for
         self.min_chapters = 1
@@ -185,6 +190,10 @@ class PreProcessor(object):
         return html
 
     def punctuation_unwrap(self, length, content, format):
+        '''
+        Unwraps lines based on line length and punctuation
+        supports range of potential html markup and text files
+        '''
         # define the pieces of the regex
         lookahead = "(?<=.{"+str(length)+"}([a-zäëïöüàèìòùáćéíóńśúâêîôûçąężıãõñæøþðß,:)\IA\u00DF]|(?<!\&\w{4});))" # (?<!\&\w{4});) is a semicolon not part of an entity
         line_ending = "\s*</(span|p|div)>\s*(</(p|span|div)>)?"
@@ -201,53 +210,38 @@ class PreProcessor(object):
         return content
 
 
-    def __call__(self, html):
-        self.log("*********  Preprocessing HTML  *********")
+    def text_process_pre(self, html):
+        pre = re.compile(r'<pre>', re.IGNORECASE)
+        if len(pre.findall(html)) == 1:
+            self.log("Running Text Processing")
+            from calibre.ebooks.txt.processor import convert_basic, preserve_spaces, \
+            separate_paragraphs_single_line
+            outerhtml = re.compile(r'.*?(?<=<pre>)(?P<text>.*)(?=</pre>).*', re.IGNORECASE|re.DOTALL)
+            html = outerhtml.sub('\g<text>', html)
+            html = separate_paragraphs_single_line(html)
+            html = preserve_spaces(html)
+            html = convert_basic(html, epub_split_size_kb=0)
+        else:
+            # Add markup naively
+            # TODO - find out if there are cases where there are more than one <pre> tag or
+            # other types of unmarked html and handle them in some better fashion
+            add_markup = re.compile('(?<!>)(\n)')
+            html = add_markup.sub('</p>\n<p>', html)
+        return html
 
-        # Count the words in the document to estimate how many chapters to look for and whether
-        # other types of processing are attempted
-        totalwords = 0
-        totalwords = self.get_word_count(html)
-
-        if totalwords < 50:
-            self.log("not enough text, not preprocessing")
-            return html
-
-        # Arrange line feeds and </p> tags so the line_length and no_markup functions work correctly
+    def arrange_htm_line_endings(self, html):
         html = re.sub(r"\s*</(?P<tag>p|div)>", "</"+"\g<tag>"+">\n", html)
         html = re.sub(r"\s*<(?P<tag>p|div)(?P<style>[^>]*)>\s*", "\n<"+"\g<tag>"+"\g<style>"+">", html)
+        return html
 
-        ###### Check Markup ######
-        #
-        # some lit files don't have any <p> tags or equivalent (generally just plain text between
-        # <pre> tags), check and  mark up line endings if required before proceeding
-        if self.no_markup(html, 0.1):
-            self.log("not enough paragraph markers, adding now")
-            # check if content is in pre tags, use txt processor to mark up if so
-            pre = re.compile(r'<pre>', re.IGNORECASE)
-            if len(pre.findall(html)) == 1:
-                self.log("Running Text Processing")
-                from calibre.ebooks.txt.processor import convert_basic, preserve_spaces, \
-                separate_paragraphs_single_line
-                outerhtml = re.compile(r'.*?(?<=<pre>)(?P<text>.*)(?=</pre>).*', re.IGNORECASE|re.DOTALL)
-                html = outerhtml.sub('\g<text>', html)
-                html = separate_paragraphs_single_line(html)
-                html = preserve_spaces(html)
-                html = convert_basic(html, epub_split_size_kb=0)
-            else:
-                # Add markup naively
-                # TODO - find out if there are cases where there are more than one <pre> tag or
-                # other types of unmarked html and handle them in some better fashion
-                add_markup = re.compile('(?<!>)(\n)')
-                html = add_markup.sub('</p>\n<p>', html)
-
-        ###### Mark Indents/Cleanup ######
-        #
-        # Replace series of non-breaking spaces with text-indent
+    def fix_nbsp_indents(self, html):
         txtindent = re.compile(ur'<p(?P<formatting>[^>]*)>\s*(?P<span>(<span[^>]*>\s*)+)?\s*(\u00a0){2,}', re.IGNORECASE)
         html = txtindent.sub(self.insert_indent, html)
         if self.found_indents > 1:
             self.log("replaced "+unicode(self.found_indents)+ " nbsp indents with inline styles")
+        return html
+
+    def cleanup_markup(self, html):
         # remove remaining non-breaking spaces
         html = re.sub(ur'\u00a0', ' ', html)
         # Get rid of various common microsoft specific tags which can cause issues later
@@ -259,27 +253,64 @@ class PreProcessor(object):
         html = re.sub(r"\s*<span[^>]*>\s*(<span[^>]*>\s*</span>){0,2}\s*</span>\s*", " ", html)
         html = re.sub(r"\s*<[ibu][^>]*>\s*(<[ibu][^>]*>\s*</[ibu]>\s*){0,2}\s*</[ibu]>", " ", html)
         html = re.sub(r"\s*<span[^>]*>\s*(<span[^>]>\s*</span>){0,2}\s*</span>\s*", " ", html)
+        return html
+
+
+    def __call__(self, html):
+        self.log("*********  Preprocessing HTML  *********")
+
+        # Count the words in the document to estimate how many chapters to look for and whether
+        # other types of processing are attempted
+        totalwords = 0
+        totalwords = self.get_word_count(html)
+
+        if totalwords < 50:
+            self.log("flow is too short, not running heuristics")
+            return html
+
+        # Arrange line feeds and </p> tags so the line_length and no_markup functions work correctly
+        html = self.arrange_htm_line_endings(html)
+
+
+        ###### Check Markup ######
+        #
+        # some lit files don't have any <p> tags or equivalent (generally just plain text between
+        # <pre> tags), check and  mark up line endings if required before proceeding
+        if self.no_markup(html, 0.1):
+            self.log("not enough paragraph markers, adding now")
+            # check if content is in pre tags, use txt processor to mark up if so
+            html = self.text_process_pre(html)
+
+        ###### Mark Indents/Cleanup ######
+        #
+        # Replace series of non-breaking spaces with text-indent
+        html = self.fix_nbsp_indents(html)
+        
+        html = self.cleanup_markup(html)
+
         # ADE doesn't render <br />, change to empty paragraphs
         #html = re.sub('<br[^>]*>', u'<p>\u00a0</p>', html)
 
-        # If more than 40% of the lines are empty paragraphs and the user has enabled remove
-        # paragraph spacing then delete blank lines to clean up spacing
+        # If more than 40% of the lines are empty paragraphs and the user has enabled delete
+        # blank paragraphs then delete blank lines to clean up spacing
         linereg = re.compile('(?<=<p).*?(?=</p>)', re.IGNORECASE|re.DOTALL)
         blankreg = re.compile(r'\s*(?P<openline><p[^>]*>)\s*(?P<closeline></p>)', re.IGNORECASE)
         #multi_blank = re.compile(r'(\s*<p[^>]*>\s*(<(b|i|u)>)?\s*(</(b|i|u)>)?\s*</p>){2,}', re.IGNORECASE)
         blanklines = blankreg.findall(html)
         lines = linereg.findall(html)
         blanks_between_paragraphs = False
+        if getattr(self.extra_opts, 'delete_blank_paragraphs', False):
+            print "configured to delete blank paragraphs"
         if len(lines) > 1:
             self.log("There are " + unicode(len(blanklines)) + " blank lines. " +
                     unicode(float(len(blanklines)) / float(len(lines))) + " percent blank")
             if float(len(blanklines)) / float(len(lines)) > 0.40 and getattr(self.extra_opts,
-            'remove_paragraph_spacing', False):
+            'delete_blank_paragraphs', False):
                 self.log("deleting blank lines")
                 html = blankreg.sub('', html)
             elif float(len(blanklines)) / float(len(lines)) > 0.40:
                 blanks_between_paragraphs = True
-                #print "blanks between paragraphs is marked True"
+                print "blanks between paragraphs is marked True"
             else:
                 blanks_between_paragraphs = False
 
diff --git a/src/calibre/ebooks/lit/input.py b/src/calibre/ebooks/lit/input.py
index 46a5e75977..d0ecf008b7 100644
--- a/src/calibre/ebooks/lit/input.py
+++ b/src/calibre/ebooks/lit/input.py
@@ -53,7 +53,7 @@ class LITInput(InputFormatPlugin):
                         pre.append(ne)
 
 
-    def preprocess_html(self, options, html):
+    def heuristics(self, options, html):
         self.options = options
         preprocessor = PreProcessor(self.options, log=getattr(self, 'log', None))
         return preprocessor(html)

From 4893fa5d3a5ff8f4a3e3ebf8915ff6611c9c3921 Mon Sep 17 00:00:00 2001
From: ldolse <ldolse@yahoo.com>
Date: Fri, 14 Jan 2011 22:18:44 +0800
Subject: [PATCH 04/54] fixed the other plugins using preprocess

---
 src/calibre/ebooks/html/input.py | 2 +-
 src/calibre/ebooks/lrf/input.py  | 2 +-
 src/calibre/ebooks/mobi/input.py | 2 +-
 src/calibre/ebooks/pdb/input.py  | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/calibre/ebooks/html/input.py b/src/calibre/ebooks/html/input.py
index 1f07f4ca41..479f852c77 100644
--- a/src/calibre/ebooks/html/input.py
+++ b/src/calibre/ebooks/html/input.py
@@ -486,7 +486,7 @@ class HTMLInput(InputFormatPlugin):
             return (None, None)
         return (None, raw)
 
-    def preprocess_html(self, options, html):
+    def heuristics(self, options, html):
         self.options = options
         preprocessor = PreProcessor(self.options, log=getattr(self, 'log', None))
         return preprocessor(html)
diff --git a/src/calibre/ebooks/lrf/input.py b/src/calibre/ebooks/lrf/input.py
index 70529c0a04..05c8731da5 100644
--- a/src/calibre/ebooks/lrf/input.py
+++ b/src/calibre/ebooks/lrf/input.py
@@ -420,7 +420,7 @@ class LRFInput(InputFormatPlugin):
         styles.write()
         return os.path.abspath('content.opf')
 
-    def preprocess_html(self, options, html):
+    def heuristics(self, options, html):
         self.options = options
         preprocessor = PreProcessor(self.options, log=getattr(self, 'log', None))
         return preprocessor(html)
diff --git a/src/calibre/ebooks/mobi/input.py b/src/calibre/ebooks/mobi/input.py
index 9ab7996a74..584be71fe4 100644
--- a/src/calibre/ebooks/mobi/input.py
+++ b/src/calibre/ebooks/mobi/input.py
@@ -39,7 +39,7 @@ class MOBIInput(InputFormatPlugin):
                 accelerators['pagebreaks'] = '//h:div[@class="mbp_pagebreak"]'
         return mr.created_opf_path
 
-    def preprocess_html(self, options, html):
+    def heuristics(self, options, html):
         # search for places where a first or second level heading is immediately followed by another
         # top level heading.  demote the second heading to h3 to prevent splitting between chapter
         # headings and titles, images, etc
diff --git a/src/calibre/ebooks/pdb/input.py b/src/calibre/ebooks/pdb/input.py
index 1b665bf94e..b0e7746c7e 100644
--- a/src/calibre/ebooks/pdb/input.py
+++ b/src/calibre/ebooks/pdb/input.py
@@ -33,7 +33,7 @@ class PDBInput(InputFormatPlugin):
 
         return opf
 
-    def preprocess_html(self, options, html):
+    def heuristics(self, options, html):
         self.options = options
         preprocessor = PreProcessor(self.options, log=getattr(self, 'log', None))
         return preprocessor(html)

From 80ed2e7d4ee1f94b6f6ffe1297b7058345a8d22a Mon Sep 17 00:00:00 2001
From: ldolse <ldolse@yahoo.com>
Date: Fri, 14 Jan 2011 22:32:03 +0800
Subject: [PATCH 05/54] ...

---
 src/calibre/ebooks/conversion/utils.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py
index 44d4235b6c..bfb23c45aa 100644
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@@ -299,8 +299,7 @@ class PreProcessor(object):
         blanklines = blankreg.findall(html)
         lines = linereg.findall(html)
         blanks_between_paragraphs = False
-        if getattr(self.extra_opts, 'delete_blank_paragraphs', False):
-            print "configured to delete blank paragraphs"
+        print "delete blank paragraphs is "+str(getattr(self.extra_opts, 'delete_blank_paragraphs', True))
         if len(lines) > 1:
             self.log("There are " + unicode(len(blanklines)) + " blank lines. " +
                     unicode(float(len(blanklines)) / float(len(lines))) + " percent blank")

From 65f9eff665042099269114f2905bfc30eef0a456 Mon Sep 17 00:00:00 2001
From: ldolse <ldolse@yahoo.com>
Date: Fri, 14 Jan 2011 23:43:38 +0800
Subject: [PATCH 06/54] remove heuristics from pdb input

---
 src/calibre/ebooks/conversion/utils.py | 2 +-
 src/calibre/ebooks/pdb/input.py        | 5 -----
 2 files changed, 1 insertion(+), 6 deletions(-)

diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py
index bfb23c45aa..286fad1aaa 100644
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@@ -299,7 +299,7 @@ class PreProcessor(object):
         blanklines = blankreg.findall(html)
         lines = linereg.findall(html)
         blanks_between_paragraphs = False
-        print "delete blank paragraphs is "+str(getattr(self.extra_opts, 'delete_blank_paragraphs', True))
+        print "delete blank paragraphs is "+str(getattr(self.extra_opts, 'delete_blank_paragraphs', False))
         if len(lines) > 1:
             self.log("There are " + unicode(len(blanklines)) + " blank lines. " +
                     unicode(float(len(blanklines)) / float(len(lines))) + " percent blank")
diff --git a/src/calibre/ebooks/pdb/input.py b/src/calibre/ebooks/pdb/input.py
index b0e7746c7e..de210e0a6d 100644
--- a/src/calibre/ebooks/pdb/input.py
+++ b/src/calibre/ebooks/pdb/input.py
@@ -32,8 +32,3 @@ class PDBInput(InputFormatPlugin):
         opf = reader.extract_content(os.getcwd())
 
         return opf
-
-    def heuristics(self, options, html):
-        self.options = options
-        preprocessor = PreProcessor(self.options, log=getattr(self, 'log', None))
-        return preprocessor(html)

From f46749863850242e58e92a2f337a6abb1be03486 Mon Sep 17 00:00:00 2001
From: ldolse <ldolse@yahoo.com>
Date: Sat, 15 Jan 2011 00:11:21 +0800
Subject: [PATCH 07/54] preserve soft breaks when deleting blank paragraphs

---
 src/calibre/ebooks/conversion/utils.py | 18 ++++++++++--------
 1 file changed, 10 insertions(+), 8 deletions(-)

diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py
index 286fad1aaa..96bd303933 100644
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@@ -294,8 +294,8 @@ class PreProcessor(object):
         # If more than 40% of the lines are empty paragraphs and the user has enabled delete
         # blank paragraphs then delete blank lines to clean up spacing
         linereg = re.compile('(?<=<p).*?(?=</p>)', re.IGNORECASE|re.DOTALL)
-        blankreg = re.compile(r'\s*(?P<openline><p[^>]*>)\s*(?P<closeline></p>)', re.IGNORECASE)
-        #multi_blank = re.compile(r'(\s*<p[^>]*>\s*(<(b|i|u)>)?\s*(</(b|i|u)>)?\s*</p>){2,}', re.IGNORECASE)
+        blankreg = re.compile(r'\s*(?P<openline><p(?!\sid=\"softbreak\")[^>]*>)\s*(?P<closeline></p>)', re.IGNORECASE)
+        multi_blank = re.compile(r'(\s*<p[^>]*>\s*</p>){2,}', re.IGNORECASE)
         blanklines = blankreg.findall(html)
         lines = linereg.findall(html)
         blanks_between_paragraphs = False
@@ -303,11 +303,8 @@ class PreProcessor(object):
         if len(lines) > 1:
             self.log("There are " + unicode(len(blanklines)) + " blank lines. " +
                     unicode(float(len(blanklines)) / float(len(lines))) + " percent blank")
-            if float(len(blanklines)) / float(len(lines)) > 0.40 and getattr(self.extra_opts,
-            'delete_blank_paragraphs', False):
-                self.log("deleting blank lines")
-                html = blankreg.sub('', html)
-            elif float(len(blanklines)) / float(len(lines)) > 0.40:
+                    
+            if float(len(blanklines)) / float(len(lines)) > 0.40:
                 blanks_between_paragraphs = True
                 print "blanks between paragraphs is marked True"
             else:
@@ -319,7 +316,12 @@ class PreProcessor(object):
 
         html = self.markup_chapters(html, totalwords, blanks_between_paragraphs)
 
-
+        if blanks_between_paragraphs and getattr(self.extra_opts,
+        'delete_blank_paragraphs', False):
+            self.log("deleting blank lines")
+            html = multi_blank.sub('\n<p id="softbreak" style="margin-top:1.5em; margin-bottom:1.5em"> </p>', html)
+            html = blankreg.sub('', html)
+            
         ###### Unwrap lines ######
         #
         # Some OCR sourced files have line breaks in the html using a combination of span & p tags

From 680de553652677aae6e532c2b4ece965454457db Mon Sep 17 00:00:00 2001
From: John Schember <john@nachtimwald.com>
Date: Fri, 14 Jan 2011 18:50:27 -0500
Subject: [PATCH 08/54] Beginnings of Heuristics GUI options widget.

---
 src/calibre/gui2/convert/heuristics.py        |  62 +++++
 src/calibre/gui2/convert/heuristics.ui        | 219 ++++++++++++++++++
 src/calibre/gui2/convert/single.py            |   4 +-
 .../gui2/convert/structure_detection.py       |  32 +--
 .../gui2/convert/structure_detection.ui       | 113 +--------
 src/calibre/gui2/preferences/conversion.py    |   5 +-
 6 files changed, 293 insertions(+), 142 deletions(-)
 create mode 100644 src/calibre/gui2/convert/heuristics.py
 create mode 100644 src/calibre/gui2/convert/heuristics.ui

diff --git a/src/calibre/gui2/convert/heuristics.py b/src/calibre/gui2/convert/heuristics.py
new file mode 100644
index 0000000000..132652701a
--- /dev/null
+++ b/src/calibre/gui2/convert/heuristics.py
@@ -0,0 +1,62 @@
+# -*- coding: utf-8 -*-
+
+__license__ = 'GPL 3'
+__copyright__ = '2011, John Schember <john@nachtimwald.com>'
+__docformat__ = 'restructuredtext en'
+
+import re
+
+from calibre.gui2.convert.heuristics_ui import Ui_Form
+from calibre.gui2.convert import Widget
+from calibre.gui2 import error_dialog
+
+class HeuristicsWidget(Widget, Ui_Form):
+
+    TITLE = _('Heuristics')
+    HELP  = _('')
+    COMMIT_NAME = 'heuristics'
+
+    def __init__(self, parent, get_option, get_help, db=None, book_id=None):
+        Widget.__init__(self, parent,
+                ['enable_heuristics', 'markup_chapter_headings',
+                 'italicize_common_cases', 'fix_indents',
+                 'html_unwrap_factor', 'unwrap_lines',
+                 'delete_blank_paragraphs', 'format_scene_breaks',
+                 'dehyphenate',
+                 'sr1_search', 'sr1_replace',
+                 'sr2_search', 'sr2_replace',
+                 'sr3_search', 'sr3_replace']
+                )
+        self.db, self.book_id = db, book_id
+        self.initialize_options(get_option, get_help, db, book_id)
+        self.opt_sr1_search.set_msg(_('Search regular expression 1:'))
+        self.opt_sr1_replace.set_msg(_('Replace regular expression 1:'))
+        self.opt_sr2_search.set_msg(_('Search regular expression 2:'))
+        self.opt_sr2_replace.set_msg(_('Replace regular expression 2:'))
+        self.opt_sr3_search.set_msg(_('Search regular expression 3:'))
+        self.opt_sr3_replace.set_msg(_('Replace regular expression 3:'))
+
+    def break_cycles(self):
+        Widget.break_cycles(self)
+        self.opt_sr1_search.break_cycles()
+        self.opt_sr1_replace.break_cycles()
+        self.opt_sr2_search.break_cycles()
+        self.opt_sr2_replace.break_cycles()
+        self.opt_sr3_search.break_cycles()
+        self.opt_sr3_replace.break_cycles()
+
+    def pre_commit_check(self):
+        for x in ('sr1-search', 'sr1-replace', 'sr2-search', 'sr2-replace', 'sr3-search', 'sr3-replace',):
+            x = getattr(self, 'opt_'+x)
+            try:
+                pat = unicode(x.regex)
+                re.compile(pat)
+            except Exception, err:
+                error_dialog(self, _('Invalid regular expression'),
+                             _('Invalid regular expression: %s')%err).exec_()
+                return False
+            
+    def set_value_handler(self, g, val):
+        if val is None and g is self.opt_html_unwrap_factor:
+            g.setValue(0.0)
+            return True
diff --git a/src/calibre/gui2/convert/heuristics.ui b/src/calibre/gui2/convert/heuristics.ui
new file mode 100644
index 0000000000..2c103ff5b6
--- /dev/null
+++ b/src/calibre/gui2/convert/heuristics.ui
@@ -0,0 +1,219 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<ui version="4.0">
+ <class>Form</class>
+ <widget class="QWidget" name="Form">
+  <property name="geometry">
+   <rect>
+    <x>0</x>
+    <y>0</y>
+    <width>657</width>
+    <height>479</height>
+   </rect>
+  </property>
+  <property name="windowTitle">
+   <string>Form</string>
+  </property>
+  <layout class="QVBoxLayout" name="verticalLayout">
+   <item>
+    <widget class="QCheckBox" name="opt_enable_heuristics">
+     <property name="text">
+      <string>&amp;Preprocess input file to possibly improve structure detection</string>
+     </property>
+    </widget>
+   </item>
+   <item>
+    <widget class="QGroupBox" name="groupBox">
+     <property name="title">
+      <string>Heuristics</string>
+     </property>
+     <layout class="QGridLayout" name="gridLayout_2">
+      <item row="0" column="0" colspan="2">
+       <widget class="QCheckBox" name="opt_unwrap_lines">
+        <property name="text">
+         <string>Unwrap lines</string>
+        </property>
+       </widget>
+      </item>
+      <item row="1" column="0">
+       <spacer name="horizontalSpacer">
+        <property name="orientation">
+         <enum>Qt::Horizontal</enum>
+        </property>
+        <property name="sizeHint" stdset="0">
+         <size>
+          <width>131</width>
+          <height>22</height>
+         </size>
+        </property>
+       </spacer>
+      </item>
+      <item row="1" column="1">
+       <widget class="QLabel" name="huf_label">
+        <property name="text">
+         <string>Line &amp;un-wrap factor during preprocess:</string>
+        </property>
+        <property name="buddy">
+         <cstring>opt_html_unwrap_factor</cstring>
+        </property>
+       </widget>
+      </item>
+      <item row="1" column="2">
+       <widget class="QDoubleSpinBox" name="opt_html_unwrap_factor">
+        <property name="toolTip">
+         <string/>
+        </property>
+        <property name="maximum">
+         <double>1.000000000000000</double>
+        </property>
+        <property name="singleStep">
+         <double>0.050000000000000</double>
+        </property>
+        <property name="value">
+         <double>0.400000000000000</double>
+        </property>
+       </widget>
+      </item>
+      <item row="2" column="0" colspan="3">
+       <widget class="QCheckBox" name="opt_markup_chapter_headings">
+        <property name="text">
+         <string>markup_chapter_headings</string>
+        </property>
+       </widget>
+      </item>
+      <item row="3" column="0" colspan="2">
+       <widget class="QCheckBox" name="opt_delete_blank_paragraphs">
+        <property name="text">
+         <string>Delete blank lines between paragraphs</string>
+        </property>
+       </widget>
+      </item>
+      <item row="4" column="0" colspan="2">
+       <widget class="QCheckBox" name="opt_format_scene_breaks">
+        <property name="text">
+         <string>format_scene_breaks</string>
+        </property>
+       </widget>
+      </item>
+      <item row="5" column="0" colspan="2">
+       <widget class="QCheckBox" name="opt_dehyphenate">
+        <property name="text">
+         <string>dehyphenate</string>
+        </property>
+       </widget>
+      </item>
+      <item row="6" column="0" colspan="2">
+       <widget class="QCheckBox" name="opt_italicize_common_cases">
+        <property name="text">
+         <string>italicize_common_cases</string>
+        </property>
+       </widget>
+      </item>
+      <item row="7" column="0" colspan="2">
+       <widget class="QCheckBox" name="opt_fix_indents">
+        <property name="text">
+         <string>fix_indents</string>
+        </property>
+       </widget>
+      </item>
+      <item row="8" column="0">
+       <spacer name="verticalSpacer">
+        <property name="orientation">
+         <enum>Qt::Vertical</enum>
+        </property>
+        <property name="sizeHint" stdset="0">
+         <size>
+          <width>131</width>
+          <height>95</height>
+         </size>
+        </property>
+       </spacer>
+      </item>
+      <item row="1" column="3">
+       <spacer name="horizontalSpacer_2">
+        <property name="orientation">
+         <enum>Qt::Horizontal</enum>
+        </property>
+        <property name="sizeHint" stdset="0">
+         <size>
+          <width>40</width>
+          <height>20</height>
+         </size>
+        </property>
+       </spacer>
+      </item>
+     </layout>
+    </widget>
+   </item>
+   <item>
+    <widget class="QGroupBox" name="groupBox_2">
+     <property name="title">
+      <string>Search and Replace</string>
+     </property>
+     <layout class="QGridLayout" name="gridLayout">
+      <item row="0" column="0">
+       <widget class="RegexEdit" name="opt_sr1_search" native="true"/>
+      </item>
+      <item row="0" column="1">
+       <widget class="RegexEdit" name="opt_sr1_replace" native="true"/>
+      </item>
+      <item row="1" column="0">
+       <widget class="RegexEdit" name="opt_sr2_search" native="true"/>
+      </item>
+      <item row="1" column="1">
+       <widget class="RegexEdit" name="opt_sr2_replace" native="true"/>
+      </item>
+      <item row="2" column="0">
+       <widget class="RegexEdit" name="opt_sr3_search" native="true"/>
+      </item>
+      <item row="2" column="1">
+       <widget class="RegexEdit" name="opt_sr3_replace" native="true"/>
+      </item>
+     </layout>
+    </widget>
+   </item>
+  </layout>
+ </widget>
+ <customwidgets>
+  <customwidget>
+   <class>RegexEdit</class>
+   <extends>QWidget</extends>
+   <header>regex_builder.h</header>
+   <container>1</container>
+  </customwidget>
+ </customwidgets>
+ <resources/>
+ <connections>
+  <connection>
+   <sender>opt_enable_heuristics</sender>
+   <signal>toggled(bool)</signal>
+   <receiver>opt_html_unwrap_factor</receiver>
+   <slot>setEnabled(bool)</slot>
+   <hints>
+    <hint type="sourcelabel">
+     <x>328</x>
+     <y>87</y>
+    </hint>
+    <hint type="destinationlabel">
+     <x>481</x>
+     <y>113</y>
+    </hint>
+   </hints>
+  </connection>
+  <connection>
+   <sender>opt_enable_heuristics</sender>
+   <signal>toggled(bool)</signal>
+   <receiver>huf_label</receiver>
+   <slot>setEnabled(bool)</slot>
+   <hints>
+    <hint type="sourcelabel">
+     <x>295</x>
+     <y>88</y>
+    </hint>
+    <hint type="destinationlabel">
+     <x>291</x>
+     <y>105</y>
+    </hint>
+   </hints>
+  </connection>
+ </connections>
+</ui>
diff --git a/src/calibre/gui2/convert/single.py b/src/calibre/gui2/convert/single.py
index 7fa8c29835..0337b779a0 100644
--- a/src/calibre/gui2/convert/single.py
+++ b/src/calibre/gui2/convert/single.py
@@ -16,6 +16,7 @@ from calibre.ebooks.conversion.config import GuiRecommendations, save_specifics,
 from calibre.gui2.convert.single_ui import Ui_Dialog
 from calibre.gui2.convert.metadata import MetadataWidget
 from calibre.gui2.convert.look_and_feel import LookAndFeelWidget
+from calibre.gui2.convert.heuristics import HeuristicsWidget
 from calibre.gui2.convert.page_setup import PageSetupWidget
 from calibre.gui2.convert.structure_detection import StructureDetectionWidget
 from calibre.gui2.convert.toc import TOCWidget
@@ -170,6 +171,7 @@ class Config(ResizableDialog, Ui_Dialog):
         self.mw = widget_factory(MetadataWidget)
         self.setWindowTitle(_('Convert')+ ' ' + unicode(self.mw.title.text()))
         lf = widget_factory(LookAndFeelWidget)
+        hw = widget_factory(HeuristicsWidget)
         ps = widget_factory(PageSetupWidget)
         sd = widget_factory(StructureDetectionWidget)
         toc = widget_factory(TOCWidget)
@@ -203,7 +205,7 @@ class Config(ResizableDialog, Ui_Dialog):
             if not c: break
             self.stack.removeWidget(c)
 
-        widgets = [self.mw, lf, ps, sd, toc]
+        widgets = [self.mw, lf, hw, ps, sd, toc]
         if input_widget is not None:
             widgets.append(input_widget)
         if output_widget is not None:
diff --git a/src/calibre/gui2/convert/structure_detection.py b/src/calibre/gui2/convert/structure_detection.py
index 3f350d4508..2c64303ee7 100644
--- a/src/calibre/gui2/convert/structure_detection.py
+++ b/src/calibre/gui2/convert/structure_detection.py
@@ -6,8 +6,6 @@ __license__   = 'GPL v3'
 __copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
 __docformat__ = 'restructuredtext en'
 
-import re
-
 from calibre.gui2.convert.structure_detection_ui import Ui_Form
 from calibre.gui2.convert import Widget
 from calibre.gui2 import error_dialog
@@ -24,12 +22,8 @@ class StructureDetectionWidget(Widget, Ui_Form):
         Widget.__init__(self, parent,
                 ['chapter', 'chapter_mark',
                 'remove_first_image',
-                'insert_metadata', 'page_breaks_before',
-                'preprocess_html', 'remove_header', 'header_regex',
-                'remove_footer', 'footer_regex','html_unwrap_factor']
+                'insert_metadata', 'page_breaks_before']
                 )
-        self.opt_html_unwrap_factor.setEnabled(False)
-        self.huf_label.setEnabled(False)
         self.db, self.book_id = db, book_id
         for x in ('pagebreak', 'rule', 'both', 'none'):
             self.opt_chapter_mark.addItem(x)
@@ -37,28 +31,11 @@ class StructureDetectionWidget(Widget, Ui_Form):
         self.opt_chapter.set_msg(_('Detect chapters at (XPath expression):'))
         self.opt_page_breaks_before.set_msg(_('Insert page breaks before '
             '(XPath expression):'))
-        self.opt_header_regex.set_msg(_('Header regular expression:'))
-        self.opt_header_regex.set_book_id(book_id)
-        self.opt_header_regex.set_db(db)
-        self.opt_footer_regex.set_msg(_('Footer regular expression:'))
-        self.opt_footer_regex.set_book_id(book_id)
-        self.opt_footer_regex.set_db(db)
-
+        
     def break_cycles(self):
         Widget.break_cycles(self)
-        self.opt_header_regex.break_cycles()
-        self.opt_footer_regex.break_cycles()
 
     def pre_commit_check(self):
-        for x in ('header_regex', 'footer_regex'):
-            x = getattr(self, 'opt_'+x)
-            try:
-                pat = unicode(x.regex)
-                re.compile(pat)
-            except Exception, err:
-                error_dialog(self, _('Invalid regular expression'),
-                             _('Invalid regular expression: %s')%err).exec_()
-                return False
         for x in ('chapter', 'page_breaks_before'):
             x = getattr(self, 'opt_'+x)
             if not x.check():
@@ -66,8 +43,3 @@ class StructureDetectionWidget(Widget, Ui_Form):
                 _('The XPath expression %s is invalid.')%x.text).exec_()
                 return False
         return True
-
-    def set_value_handler(self, g, val):
-        if val is None and g is self.opt_html_unwrap_factor:
-            g.setValue(0.0)
-            return True
diff --git a/src/calibre/gui2/convert/structure_detection.ui b/src/calibre/gui2/convert/structure_detection.ui
index 21fe365e99..b690a68b0a 100644
--- a/src/calibre/gui2/convert/structure_detection.ui
+++ b/src/calibre/gui2/convert/structure_detection.ui
@@ -41,17 +41,17 @@
      </property>
     </widget>
    </item>
-   <item row="5" column="0" colspan="2">
+   <item row="3" column="0" colspan="2">
     <widget class="QCheckBox" name="opt_insert_metadata">
      <property name="text">
       <string>Insert &amp;metadata as page at start of book</string>
      </property>
     </widget>
    </item>
-   <item row="11" column="0" colspan="3">
+   <item row="5" column="0" colspan="3">
     <widget class="XPathEdit" name="opt_page_breaks_before" native="true"/>
    </item>
-   <item row="12" column="0" colspan="3">
+   <item row="6" column="0" colspan="3">
     <spacer name="verticalSpacer">
      <property name="orientation">
       <enum>Qt::Vertical</enum>
@@ -64,72 +64,6 @@
      </property>
     </spacer>
    </item>
-   <item row="8" column="0" colspan="2">
-    <widget class="QCheckBox" name="opt_remove_footer">
-     <property name="text">
-      <string>Remove F&amp;ooter</string>
-     </property>
-    </widget>
-   </item>
-   <item row="6" column="0" colspan="2">
-    <widget class="QCheckBox" name="opt_remove_header">
-     <property name="text">
-      <string>Remove H&amp;eader</string>
-     </property>
-    </widget>
-   </item>
-   <item row="7" column="0" colspan="3">
-    <widget class="RegexEdit" name="opt_header_regex" native="true"/>
-   </item>
-   <item row="9" column="0" colspan="3">
-    <widget class="RegexEdit" name="opt_footer_regex" native="true"/>
-   </item>
-   <item row="4" column="1">
-    <widget class="QLabel" name="huf_label">
-     <property name="text">
-      <string>Line &amp;un-wrap factor during preprocess:</string>
-     </property>
-     <property name="buddy">
-      <cstring>opt_html_unwrap_factor</cstring>
-     </property>
-    </widget>
-   </item>
-   <item row="4" column="2">
-    <widget class="QDoubleSpinBox" name="opt_html_unwrap_factor">
-     <property name="toolTip">
-      <string/>
-     </property>
-     <property name="maximum">
-      <double>1.000000000000000</double>
-     </property>
-     <property name="singleStep">
-      <double>0.050000000000000</double>
-     </property>
-     <property name="value">
-      <double>0.400000000000000</double>
-     </property>
-    </widget>
-   </item>
-   <item row="4" column="0">
-    <spacer name="horizontalSpacer">
-     <property name="orientation">
-      <enum>Qt::Horizontal</enum>
-     </property>
-     <property name="sizeHint" stdset="0">
-      <size>
-       <width>40</width>
-       <height>20</height>
-      </size>
-     </property>
-    </spacer>
-   </item>
-   <item row="3" column="0" colspan="2">
-    <widget class="QCheckBox" name="opt_preprocess_html">
-     <property name="text">
-      <string>&amp;Preprocess input file to possibly improve structure detection</string>
-     </property>
-    </widget>
-   </item>
   </layout>
  </widget>
  <customwidgets>
@@ -139,46 +73,7 @@
    <header>convert/xpath_wizard.h</header>
    <container>1</container>
   </customwidget>
-  <customwidget>
-   <class>RegexEdit</class>
-   <extends>QWidget</extends>
-   <header>regex_builder.h</header>
-   <container>1</container>
-  </customwidget>
  </customwidgets>
  <resources/>
- <connections>
-  <connection>
-   <sender>opt_preprocess_html</sender>
-   <signal>toggled(bool)</signal>
-   <receiver>opt_html_unwrap_factor</receiver>
-   <slot>setEnabled(bool)</slot>
-   <hints>
-    <hint type="sourcelabel">
-     <x>328</x>
-     <y>87</y>
-    </hint>
-    <hint type="destinationlabel">
-     <x>481</x>
-     <y>113</y>
-    </hint>
-   </hints>
-  </connection>
-  <connection>
-   <sender>opt_preprocess_html</sender>
-   <signal>toggled(bool)</signal>
-   <receiver>huf_label</receiver>
-   <slot>setEnabled(bool)</slot>
-   <hints>
-    <hint type="sourcelabel">
-     <x>295</x>
-     <y>88</y>
-    </hint>
-    <hint type="destinationlabel">
-     <x>291</x>
-     <y>105</y>
-    </hint>
-   </hints>
-  </connection>
- </connections>
+ <connections/>
 </ui>
diff --git a/src/calibre/gui2/preferences/conversion.py b/src/calibre/gui2/preferences/conversion.py
index 0063d4a341..a20872cee0 100644
--- a/src/calibre/gui2/preferences/conversion.py
+++ b/src/calibre/gui2/preferences/conversion.py
@@ -12,6 +12,7 @@ from calibre.ebooks.conversion.plumber import Plumber
 from calibre.utils.logging import Log
 from calibre.gui2.preferences.conversion_ui import Ui_Form
 from calibre.gui2.convert.look_and_feel import LookAndFeelWidget
+from calibre.gui2.convert.heuristics import HeuristicsWidget
 from calibre.gui2.convert.page_setup import PageSetupWidget
 from calibre.gui2.convert.structure_detection import StructureDetectionWidget
 from calibre.gui2.convert.toc import TOCWidget
@@ -82,8 +83,8 @@ class Base(ConfigWidgetBase, Ui_Form):
 class CommonOptions(Base):
 
     def load_conversion_widgets(self):
-        self.conversion_widgets = [LookAndFeelWidget, PageSetupWidget,
-                StructureDetectionWidget, TOCWidget]
+        self.conversion_widgets = [LookAndFeelWidget, HeuristicsWidget,
+                PageSetupWidget, StructureDetectionWidget, TOCWidget]
 
 class InputOptions(Base):
 

From 7d75b065126f1bc5feade93c991db0b6fd261073 Mon Sep 17 00:00:00 2001
From: John Schember <john@nachtimwald.com>
Date: Fri, 14 Jan 2011 18:53:02 -0500
Subject: [PATCH 09/54] Change heuristic options to default False to maintain
 consistency.

---
 src/calibre/ebooks/conversion/plumber.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/calibre/ebooks/conversion/plumber.py b/src/calibre/ebooks/conversion/plumber.py
index a40c17a743..2e88baea4e 100644
--- a/src/calibre/ebooks/conversion/plumber.py
+++ b/src/calibre/ebooks/conversion/plumber.py
@@ -500,7 +500,7 @@ OptionRecommendation(name='italicize_common_cases',
            'italics and italicize them.')),
            
 OptionRecommendation(name='fix_indents',
-    recommended_value=True, level=OptionRecommendation.LOW,
+    recommended_value=False, level=OptionRecommendation.LOW,
     help=_('Turn indentation created from multiple &nbsp; entities '
            'into CSS indents.')),
            
@@ -517,7 +517,7 @@ OptionRecommendation(name='unwrap_lines',
     help=_('Unwrap lines using punctuation and other formatting clues.')),
     
 OptionRecommendation(name='delete_blank_paragraphs',
-    recommended_value=True, level=OptionRecommendation.LOW,
+    recommended_value=False, level=OptionRecommendation.LOW,
     help=_('Remove empty paragraphs from the document when they exist between '
            'every other paragraph')),
     
@@ -528,7 +528,7 @@ OptionRecommendation(name='format_scene_breaks',
            'horizontal rules.')),
 
 OptionRecommendation(name='dehyphenate',
-    recommended_value=True, level=OptionRecommendation.LOW,
+    recommended_value=False, level=OptionRecommendation.LOW,
     help=_('Analyses hyphenated words throughout the document.  The '
            'document itself is used as a dictionary to determine whether hyphens '
            'should be retained or removed.')),

From 8f4d60073f982185522bdc20cfbaf0aea0f9de0c Mon Sep 17 00:00:00 2001
From: John Schember <john@nachtimwald.com>
Date: Fri, 14 Jan 2011 19:24:31 -0500
Subject: [PATCH 10/54] Finish new GUI option widgets.

---
 src/calibre/ebooks/conversion/cli.py          | 16 +++--
 src/calibre/gui2/convert/bulk.py              |  6 +-
 src/calibre/gui2/convert/heuristics.py        | 68 ++++++++++---------
 src/calibre/gui2/convert/heuristics.ui        | 47 ++-----------
 .../gui2/convert/search_and_replace.py        | 57 ++++++++++++++++
 .../gui2/convert/search_and_replace.ui        | 47 +++++++++++++
 src/calibre/gui2/convert/single.py            |  4 +-
 .../gui2/convert/structure_detection.ui       |  6 +-
 src/calibre/gui2/preferences/conversion.py    |  4 +-
 9 files changed, 172 insertions(+), 83 deletions(-)
 create mode 100644 src/calibre/gui2/convert/search_and_replace.py
 create mode 100644 src/calibre/gui2/convert/search_and_replace.ui

diff --git a/src/calibre/ebooks/conversion/cli.py b/src/calibre/ebooks/conversion/cli.py
index f825776c9c..91f0f95348 100644
--- a/src/calibre/ebooks/conversion/cli.py
+++ b/src/calibre/ebooks/conversion/cli.py
@@ -131,18 +131,24 @@ def add_pipeline_options(parser, plumber):
                   ),
                   
               'HEURISTICS' : (
-                  _('Modify the document text and strucutre using common patterns.'),
+                  _('Modify the document text and structure using common patterns.'),
                   [
                       'enable_heuristics', 'markup_chapter_headings',
                       'italicize_common_cases', 'fix_indents',
                       'html_unwrap_factor', 'unwrap_lines',
                       'delete_blank_paragraphs', 'format_scene_breaks',
                       'dehyphenate',
+                  ]
+                  ),
+                  
+              'SEARCH AND REPLACE' : (
+                 _('Modify the document text and structure using user defined patterns.'),
+                 [
                       'sr1_search', 'sr1_replace',
                       'sr2_search', 'sr2_replace',
                       'sr3_search', 'sr3_replace',
-                  ]
-                  ),
+                 ]
+              ),
 
               'STRUCTURE DETECTION' : (
                   _('Control auto-detection of document structure.'),
@@ -177,8 +183,8 @@ def add_pipeline_options(parser, plumber):
               }
 
     group_order = ['', 'LOOK AND FEEL', 'HEURISTICS',
-            'STRUCTURE DETECTION', 'TABLE OF CONTENTS',
-            'METADATA', 'DEBUG']
+            'SEARCH AND REPLACE', 'STRUCTURE DETECTION',
+            'TABLE OF CONTENTS', 'METADATA', 'DEBUG']
 
     for group in group_order:
         desc, options = groups[group]
diff --git a/src/calibre/gui2/convert/bulk.py b/src/calibre/gui2/convert/bulk.py
index 198f6144e4..b97ab1a2dc 100644
--- a/src/calibre/gui2/convert/bulk.py
+++ b/src/calibre/gui2/convert/bulk.py
@@ -11,6 +11,8 @@ from calibre.gui2.convert.single import Config, sort_formats_by_preference, \
 from calibre.customize.ui import available_output_formats
 from calibre.gui2 import ResizableDialog
 from calibre.gui2.convert.look_and_feel import LookAndFeelWidget
+from calibre.gui2.convert.heuristics import HeuristicsWidget
+from calibre.gui2.convert.search_and_replace import SearchAndReplaceWidget
 from calibre.gui2.convert.page_setup import PageSetupWidget
 from calibre.gui2.convert.structure_detection import StructureDetectionWidget
 from calibre.gui2.convert.toc import TOCWidget
@@ -69,6 +71,8 @@ class BulkConfig(Config):
 
         self.setWindowTitle(_('Bulk Convert'))
         lf = widget_factory(LookAndFeelWidget)
+        hw = widget_factory(HeuristicsWidget)
+        sr = widget_factory(SearchAndReplaceWidget)
         ps = widget_factory(PageSetupWidget)
         sd = widget_factory(StructureDetectionWidget)
         toc = widget_factory(TOCWidget)
@@ -90,7 +94,7 @@ class BulkConfig(Config):
             if not c: break
             self.stack.removeWidget(c)
 
-        widgets = [lf, ps, sd, toc]
+        widgets = [lf, hw, sr, ps, sd, toc]
         if output_widget is not None:
             widgets.append(output_widget)
         for w in widgets:
diff --git a/src/calibre/gui2/convert/heuristics.py b/src/calibre/gui2/convert/heuristics.py
index 132652701a..2b9df50457 100644
--- a/src/calibre/gui2/convert/heuristics.py
+++ b/src/calibre/gui2/convert/heuristics.py
@@ -4,16 +4,15 @@ __license__ = 'GPL 3'
 __copyright__ = '2011, John Schember <john@nachtimwald.com>'
 __docformat__ = 'restructuredtext en'
 
-import re
+from PyQt4.Qt import Qt
 
 from calibre.gui2.convert.heuristics_ui import Ui_Form
 from calibre.gui2.convert import Widget
-from calibre.gui2 import error_dialog
 
 class HeuristicsWidget(Widget, Ui_Form):
 
     TITLE = _('Heuristics')
-    HELP  = _('')
+    HELP  = _('Modify the document text and structure using common patterns.')
     COMMIT_NAME = 'heuristics'
 
     def __init__(self, parent, get_option, get_help, db=None, book_id=None):
@@ -22,41 +21,48 @@ class HeuristicsWidget(Widget, Ui_Form):
                  'italicize_common_cases', 'fix_indents',
                  'html_unwrap_factor', 'unwrap_lines',
                  'delete_blank_paragraphs', 'format_scene_breaks',
-                 'dehyphenate',
-                 'sr1_search', 'sr1_replace',
-                 'sr2_search', 'sr2_replace',
-                 'sr3_search', 'sr3_replace']
+                 'dehyphenate']
                 )
         self.db, self.book_id = db, book_id
         self.initialize_options(get_option, get_help, db, book_id)
-        self.opt_sr1_search.set_msg(_('Search regular expression 1:'))
-        self.opt_sr1_replace.set_msg(_('Replace regular expression 1:'))
-        self.opt_sr2_search.set_msg(_('Search regular expression 2:'))
-        self.opt_sr2_replace.set_msg(_('Replace regular expression 2:'))
-        self.opt_sr3_search.set_msg(_('Search regular expression 3:'))
-        self.opt_sr3_replace.set_msg(_('Replace regular expression 3:'))
+        
+        self.opt_enable_heuristics.stateChanged.connect(self.enable_heuristics)
+        self.opt_unwrap_lines.stateChanged.connect(self.enable_unwrap)
+        
+        self.enable_heuristics(self.opt_enable_heuristics.checkState())
 
     def break_cycles(self):
         Widget.break_cycles(self)
-        self.opt_sr1_search.break_cycles()
-        self.opt_sr1_replace.break_cycles()
-        self.opt_sr2_search.break_cycles()
-        self.opt_sr2_replace.break_cycles()
-        self.opt_sr3_search.break_cycles()
-        self.opt_sr3_replace.break_cycles()
-
-    def pre_commit_check(self):
-        for x in ('sr1-search', 'sr1-replace', 'sr2-search', 'sr2-replace', 'sr3-search', 'sr3-replace',):
-            x = getattr(self, 'opt_'+x)
-            try:
-                pat = unicode(x.regex)
-                re.compile(pat)
-            except Exception, err:
-                error_dialog(self, _('Invalid regular expression'),
-                             _('Invalid regular expression: %s')%err).exec_()
-                return False
-            
+        
+        self.opt_enable_heuristics.stateChanged.disconnect()
+        self.opt_unwrap_lines.stateChanged.disconnect()
+        
     def set_value_handler(self, g, val):
         if val is None and g is self.opt_html_unwrap_factor:
             g.setValue(0.0)
             return True
+
+    def enable_heuristics(self, state):
+        if state == Qt.Checked:
+            state = True
+        else:
+            state = False
+        self.opt_markup_chapter_headings.setEnabled(state)
+        self.opt_italicize_common_cases.setEnabled(state)
+        self.opt_fix_indents.setEnabled(state)
+        self.opt_delete_blank_paragraphs.setEnabled(state)
+        self.opt_format_scene_breaks.setEnabled(state)
+        self.opt_dehyphenate.setEnabled(state)
+        
+        self.opt_unwrap_lines.setEnabled(state)
+        if state and self.opt_unwrap_lines.checkState() == Qt.Checked:
+            self.opt_html_unwrap_factor.setEnabled(True)
+        else:
+            self.opt_html_unwrap_factor.setEnabled(False)
+
+    def enable_unwrap(self, state):
+        if state == Qt.Checked:
+            state = True
+        else:
+            state = False
+        self.opt_html_unwrap_factor.setEnabled(state)
diff --git a/src/calibre/gui2/convert/heuristics.ui b/src/calibre/gui2/convert/heuristics.ui
index 2c103ff5b6..e64e79e1df 100644
--- a/src/calibre/gui2/convert/heuristics.ui
+++ b/src/calibre/gui2/convert/heuristics.ui
@@ -76,7 +76,7 @@
       <item row="2" column="0" colspan="3">
        <widget class="QCheckBox" name="opt_markup_chapter_headings">
         <property name="text">
-         <string>markup_chapter_headings</string>
+         <string>Detect and markup unformatted chapter headings and sub headings</string>
         </property>
        </widget>
       </item>
@@ -90,28 +90,28 @@
       <item row="4" column="0" colspan="2">
        <widget class="QCheckBox" name="opt_format_scene_breaks">
         <property name="text">
-         <string>format_scene_breaks</string>
+         <string>Ensure scene breaks are consistently formatted</string>
         </property>
        </widget>
       </item>
       <item row="5" column="0" colspan="2">
        <widget class="QCheckBox" name="opt_dehyphenate">
         <property name="text">
-         <string>dehyphenate</string>
+         <string>Remove unnecessary hyphens</string>
         </property>
        </widget>
       </item>
       <item row="6" column="0" colspan="2">
        <widget class="QCheckBox" name="opt_italicize_common_cases">
         <property name="text">
-         <string>italicize_common_cases</string>
+         <string>Italicize common words and patterns</string>
         </property>
        </widget>
       </item>
       <item row="7" column="0" colspan="2">
        <widget class="QCheckBox" name="opt_fix_indents">
         <property name="text">
-         <string>fix_indents</string>
+         <string>Replace entity indents with CSS indents</string>
         </property>
        </widget>
       </item>
@@ -123,7 +123,7 @@
         <property name="sizeHint" stdset="0">
          <size>
           <width>131</width>
-          <height>95</height>
+          <height>35</height>
          </size>
         </property>
        </spacer>
@@ -144,43 +144,8 @@
      </layout>
     </widget>
    </item>
-   <item>
-    <widget class="QGroupBox" name="groupBox_2">
-     <property name="title">
-      <string>Search and Replace</string>
-     </property>
-     <layout class="QGridLayout" name="gridLayout">
-      <item row="0" column="0">
-       <widget class="RegexEdit" name="opt_sr1_search" native="true"/>
-      </item>
-      <item row="0" column="1">
-       <widget class="RegexEdit" name="opt_sr1_replace" native="true"/>
-      </item>
-      <item row="1" column="0">
-       <widget class="RegexEdit" name="opt_sr2_search" native="true"/>
-      </item>
-      <item row="1" column="1">
-       <widget class="RegexEdit" name="opt_sr2_replace" native="true"/>
-      </item>
-      <item row="2" column="0">
-       <widget class="RegexEdit" name="opt_sr3_search" native="true"/>
-      </item>
-      <item row="2" column="1">
-       <widget class="RegexEdit" name="opt_sr3_replace" native="true"/>
-      </item>
-     </layout>
-    </widget>
-   </item>
   </layout>
  </widget>
- <customwidgets>
-  <customwidget>
-   <class>RegexEdit</class>
-   <extends>QWidget</extends>
-   <header>regex_builder.h</header>
-   <container>1</container>
-  </customwidget>
- </customwidgets>
  <resources/>
  <connections>
   <connection>
diff --git a/src/calibre/gui2/convert/search_and_replace.py b/src/calibre/gui2/convert/search_and_replace.py
new file mode 100644
index 0000000000..860cc11d4e
--- /dev/null
+++ b/src/calibre/gui2/convert/search_and_replace.py
@@ -0,0 +1,57 @@
+# -*- coding: utf-8 -*-
+
+__license__ = 'GPL 3'
+__copyright__ = '2011, John Schember <john@nachtimwald.com>'
+__docformat__ = 'restructuredtext en'
+
+import re
+
+from PyQt4.Qt import Qt
+
+from calibre.gui2.convert.search_and_replace_ui import Ui_Form
+from calibre.gui2.convert import Widget
+from calibre.gui2 import error_dialog
+
+class SearchAndReplaceWidget(Widget, Ui_Form):
+    '''Conversion options page for the three sr<N> search/replace pairs.'''
+
+    TITLE = _('Search and Replace')
+    HELP  = _('Modify the document text and structure using user defined patterns.')
+    COMMIT_NAME = 'search_and_replace'
+
+    def __init__(self, parent, get_option, get_help, db=None, book_id=None):
+        Widget.__init__(self, parent,
+                ['sr1_search', 'sr1_replace',
+                 'sr2_search', 'sr2_replace',
+                 'sr3_search', 'sr3_replace']
+                )
+        self.db, self.book_id = db, book_id
+        self.initialize_options(get_option, get_help, db, book_id)
+        self.opt_sr1_search.set_msg(_('Search regular expression 1:'))
+        self.opt_sr1_replace.set_msg(_('Replace regular expression 1:'))
+        self.opt_sr2_search.set_msg(_('Search regular expression 2:'))
+        self.opt_sr2_replace.set_msg(_('Replace regular expression 2:'))
+        self.opt_sr3_search.set_msg(_('Search regular expression 3:'))
+        self.opt_sr3_replace.set_msg(_('Replace regular expression 3:'))
+
+    def break_cycles(self):
+        '''Break the cycles of the base Widget and of every sr<N> editor.'''
+        Widget.break_cycles(self)
+        for name in ('sr1_search', 'sr1_replace', 'sr2_search', 'sr2_replace', 'sr3_search', 'sr3_replace'):
+            getattr(self, 'opt_'+name).break_cycles()
+
+    def pre_commit_check(self):
+        '''Return True if every search pattern compiles, else show an error
+        dialog for the first bad one and return False.  Replace fields hold
+        replacement text, not patterns, so they are not compiled.'''
+        for x in ('sr1_search', 'sr2_search', 'sr3_search'):
+            x = getattr(self, 'opt_'+x)
+            try:
+                pat = unicode(x.regex)
+                re.compile(pat)
+            except Exception, err:
+                error_dialog(self, _('Invalid regular expression'),
+                             _('Invalid regular expression: %s')%err).exec_()
+                return False
+        return True
+
diff --git a/src/calibre/gui2/convert/search_and_replace.ui b/src/calibre/gui2/convert/search_and_replace.ui
new file mode 100644
index 0000000000..5913f2c098
--- /dev/null
+++ b/src/calibre/gui2/convert/search_and_replace.ui
@@ -0,0 +1,47 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<ui version="4.0">
+ <class>Form</class>
+ <widget class="QWidget" name="Form">
+  <property name="geometry">
+   <rect>
+    <x>0</x>
+    <y>0</y>
+    <width>657</width>
+    <height>479</height>
+   </rect>
+  </property>
+  <property name="windowTitle">
+   <string>Form</string>
+  </property>
+  <layout class="QVBoxLayout" name="verticalLayout">
+   <item>
+    <widget class="RegexEdit" name="opt_sr1_search" native="true"/>
+   </item>
+   <item>
+    <widget class="RegexEdit" name="opt_sr1_replace" native="true"/>
+   </item>
+   <item>
+    <widget class="RegexEdit" name="opt_sr2_search" native="true"/>
+   </item>
+   <item>
+    <widget class="RegexEdit" name="opt_sr2_replace" native="true"/>
+   </item>
+   <item>
+    <widget class="RegexEdit" name="opt_sr3_search" native="true"/>
+   </item>
+   <item>
+    <widget class="RegexEdit" name="opt_sr3_replace" native="true"/>
+   </item>
+  </layout>
+ </widget>
+ <customwidgets>
+  <customwidget>
+   <class>RegexEdit</class>
+   <extends>QWidget</extends>
+   <header>regex_builder.h</header>
+   <container>1</container>
+  </customwidget>
+ </customwidgets>
+ <resources/>
+ <connections/>
+</ui>
diff --git a/src/calibre/gui2/convert/single.py b/src/calibre/gui2/convert/single.py
index 0337b779a0..8826d398f5 100644
--- a/src/calibre/gui2/convert/single.py
+++ b/src/calibre/gui2/convert/single.py
@@ -17,6 +17,7 @@ from calibre.gui2.convert.single_ui import Ui_Dialog
 from calibre.gui2.convert.metadata import MetadataWidget
 from calibre.gui2.convert.look_and_feel import LookAndFeelWidget
 from calibre.gui2.convert.heuristics import HeuristicsWidget
+from calibre.gui2.convert.search_and_replace import SearchAndReplaceWidget
 from calibre.gui2.convert.page_setup import PageSetupWidget
 from calibre.gui2.convert.structure_detection import StructureDetectionWidget
 from calibre.gui2.convert.toc import TOCWidget
@@ -172,6 +173,7 @@ class Config(ResizableDialog, Ui_Dialog):
         self.setWindowTitle(_('Convert')+ ' ' + unicode(self.mw.title.text()))
         lf = widget_factory(LookAndFeelWidget)
         hw = widget_factory(HeuristicsWidget)
+        sr = widget_factory(SearchAndReplaceWidget)
         ps = widget_factory(PageSetupWidget)
         sd = widget_factory(StructureDetectionWidget)
         toc = widget_factory(TOCWidget)
@@ -205,7 +207,7 @@ class Config(ResizableDialog, Ui_Dialog):
             if not c: break
             self.stack.removeWidget(c)
 
-        widgets = [self.mw, lf, hw, ps, sd, toc]
+        widgets = [self.mw, lf, hw, sr, ps, sd, toc]
         if input_widget is not None:
             widgets.append(input_widget)
         if output_widget is not None:
diff --git a/src/calibre/gui2/convert/structure_detection.ui b/src/calibre/gui2/convert/structure_detection.ui
index b690a68b0a..262894d42d 100644
--- a/src/calibre/gui2/convert/structure_detection.ui
+++ b/src/calibre/gui2/convert/structure_detection.ui
@@ -14,10 +14,10 @@
    <string>Form</string>
   </property>
   <layout class="QGridLayout" name="gridLayout">
-   <item row="0" column="1" colspan="2">
+   <item row="0" column="0" colspan="3">
     <widget class="XPathEdit" name="opt_chapter" native="true"/>
    </item>
-   <item row="1" column="0" colspan="2">
+   <item row="1" column="0">
     <widget class="QLabel" name="label">
      <property name="text">
       <string>Chapter &amp;mark:</string>
@@ -27,7 +27,7 @@
      </property>
     </widget>
    </item>
-   <item row="1" column="2">
+   <item row="1" column="1" colspan="2">
     <widget class="QComboBox" name="opt_chapter_mark">
      <property name="minimumContentsLength">
       <number>20</number>
diff --git a/src/calibre/gui2/preferences/conversion.py b/src/calibre/gui2/preferences/conversion.py
index a20872cee0..0a8fc375ea 100644
--- a/src/calibre/gui2/preferences/conversion.py
+++ b/src/calibre/gui2/preferences/conversion.py
@@ -13,6 +13,7 @@ from calibre.utils.logging import Log
 from calibre.gui2.preferences.conversion_ui import Ui_Form
 from calibre.gui2.convert.look_and_feel import LookAndFeelWidget
 from calibre.gui2.convert.heuristics import HeuristicsWidget
+from calibre.gui2.convert.search_and_replace import SearchAndReplaceWidget
 from calibre.gui2.convert.page_setup import PageSetupWidget
 from calibre.gui2.convert.structure_detection import StructureDetectionWidget
 from calibre.gui2.convert.toc import TOCWidget
@@ -84,7 +85,8 @@ class CommonOptions(Base):
 
     def load_conversion_widgets(self):
         self.conversion_widgets = [LookAndFeelWidget, HeuristicsWidget,
-                PageSetupWidget, StructureDetectionWidget, TOCWidget]
+                SearchAndReplaceWidget, PageSetupWidget,
+                StructureDetectionWidget, TOCWidget]
 
 class InputOptions(Base):
 

From 9d29e46a2cbe82879caa4ceeccb51cefbe153ed4 Mon Sep 17 00:00:00 2001
From: John Schember <john@nachtimwald.com>
Date: Fri, 14 Jan 2011 19:28:36 -0500
Subject: [PATCH 11/54] ...

---
 src/calibre/gui2/convert/search_and_replace.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/src/calibre/gui2/convert/search_and_replace.py b/src/calibre/gui2/convert/search_and_replace.py
index 860cc11d4e..de9033a46e 100644
--- a/src/calibre/gui2/convert/search_and_replace.py
+++ b/src/calibre/gui2/convert/search_and_replace.py
@@ -6,8 +6,6 @@ __docformat__ = 'restructuredtext en'
 
 import re
 
-from PyQt4.Qt import Qt
-
 from calibre.gui2.convert.search_and_replace_ui import Ui_Form
 from calibre.gui2.convert import Widget
 from calibre.gui2 import error_dialog

From e8133432fd1280468a98666e05532e8e1da6d5b3 Mon Sep 17 00:00:00 2001
From: ldolse <ldolse@yahoo.com>
Date: Sat, 15 Jan 2011 09:06:05 +0800
Subject: [PATCH 12/54] ...

---
 src/calibre/ebooks/conversion/utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py
index 96bd303933..417f3a1e5b 100644
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@@ -156,7 +156,7 @@ class PreProcessor(object):
         n_lookahead_open = "\s+(?!"
         n_lookahead_close = ")"
 
-        default_title = r"(<[ibu][^>]*>)?\s{0,3}([\w\'\"-]+\s{0,3}){1,5}?(</[ibu][^>]*>)?(?=<)"
+        default_title = r"(<[ibu][^>]*>)?\s{0,3}([\w\'’\"-]+\s{0,3}){1,5}?(</[ibu][^>]*>)?(?=<)"
 
         chapter_types = [
             [r"[^'\"]?(Introduction|Synopsis|Acknowledgements|Chapter|Kapitel|Epilogue|Volume\s|Prologue|Book\s|Part\s|Dedication|Preface)\s*([\d\w-]+\:?\'?\s*){0,5}", True, "Searching for common Chapter Headings"],

From f85ba4e3261b4e64c84722087471824fbf12278e Mon Sep 17 00:00:00 2001
From: John Schember <john@nachtimwald.com>
Date: Fri, 14 Jan 2011 21:15:34 -0500
Subject: [PATCH 13/54] Fix sr key. Change footer and header removal to generic
 search and replace options.

---
 src/calibre/ebooks/conversion/cli.py        |  2 +-
 src/calibre/ebooks/conversion/plumber.py    |  9 ++---
 src/calibre/ebooks/conversion/preprocess.py | 42 +++++++++++----------
 3 files changed, 26 insertions(+), 27 deletions(-)

diff --git a/src/calibre/ebooks/conversion/cli.py b/src/calibre/ebooks/conversion/cli.py
index 91f0f95348..db1ec0857d 100644
--- a/src/calibre/ebooks/conversion/cli.py
+++ b/src/calibre/ebooks/conversion/cli.py
@@ -183,7 +183,7 @@ def add_pipeline_options(parser, plumber):
               }
 
     group_order = ['', 'LOOK AND FEEL', 'HEURISTICS',
-            'SEARCH AND REPLACE' 'STRUCTURE DETECTION',
+            'SEARCH AND REPLACE', 'STRUCTURE DETECTION',
             'TABLE OF CONTENTS', 'METADATA', 'DEBUG']
 
     for group in group_order:
diff --git a/src/calibre/ebooks/conversion/plumber.py b/src/calibre/ebooks/conversion/plumber.py
index 2e88baea4e..a12dbd48e1 100644
--- a/src/calibre/ebooks/conversion/plumber.py
+++ b/src/calibre/ebooks/conversion/plumber.py
@@ -540,8 +540,7 @@ OptionRecommendation(name='sr1_search',
     
 OptionRecommendation(name='sr1_replace',
     recommended_value='', level=OptionRecommendation.LOW,
-    help=_('Replace characters (can be lambda expression) to '
-           'replace the text found with sr1-search.')),
+    help=_('Replace characters to replace the text found with sr1-search.')),
 
 OptionRecommendation(name='sr2_search',
     recommended_value='', level=OptionRecommendation.LOW,
@@ -550,8 +549,7 @@ OptionRecommendation(name='sr2_search',
 
 OptionRecommendation(name='sr2_replace',
     recommended_value='', level=OptionRecommendation.LOW,
-    help=_('Replace characters (can be lambda expression) to '
-           'replace the text found with sr2-search.')),
+    help=_('Replace characters to replace the text found with sr2-search.')),
 
 OptionRecommendation(name='sr3_search',
     recommended_value='', level=OptionRecommendation.LOW,
@@ -560,8 +558,7 @@ OptionRecommendation(name='sr3_search',
 
 OptionRecommendation(name='sr3_replace',
     recommended_value='', level=OptionRecommendation.LOW,
-    help=_('Replace characters (can be lambda expression) to '
-           'replace the text found with sr3-search.')),
+    help=_('Replace characters to replace the text found with sr3-search.')),
 ]
         # }}}
 
diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py
index 08a46cb8d9..35a311d58f 100644
--- a/src/calibre/ebooks/conversion/preprocess.py
+++ b/src/calibre/ebooks/conversion/preprocess.py
@@ -436,27 +436,29 @@ class HTMLPreProcessor(object):
         if not getattr(self.extra_opts, 'keep_ligatures', False):
             html = _ligpat.sub(lambda m:LIGATURES[m.group()], html)
 
+        if getattr(self.extra_opts, 'sr3_search', None):
+            try:
+                rules.insert(0,  (re.compile(self.extra_opts.sr3_search), self.extra_opts.sr3_replace))
+            except:
+                import traceback
+                print 'Failed to parse sr3-search regexp'
+                traceback.print_exc()
+        if getattr(self.extra_opts, 'sr2_search', None):
+            try:
+                rules.insert(0, (re.compile(self.extra_opts.sr2_search), self.extra_opts.sr2_replace))
+            except:
+                import traceback
+                print 'Failed to parse sr2-search regexp'
+                traceback.print_exc()
+        if getattr(self.extra_opts, 'sr1_search', None):
+            try:
+                rules.insert(0, (re.compile(self.extra_opts.sr1_search), self.extra_opts.sr1_replace))
+            except:
+                import traceback
+                print 'Failed to parse sr1-search regexp'
+                traceback.print_exc()
+
         end_rules = []
-        if getattr(self.extra_opts, 'remove_header', None):
-            try:
-                rules.insert(0,
-                    (re.compile(self.extra_opts.header_regex), lambda match : '')
-                )
-            except:
-                import traceback
-                print 'Failed to parse remove_header regexp'
-                traceback.print_exc()
-
-        if getattr(self.extra_opts, 'remove_footer', None):
-            try:
-                rules.insert(0,
-                    (re.compile(self.extra_opts.footer_regex), lambda match : '')
-                )
-            except:
-                import traceback
-                print 'Failed to parse remove_footer regexp'
-                traceback.print_exc()
-
         # delete soft hyphens - moved here so it's executed after header/footer removal
         if is_pdftohtml:
             # unwrap/delete soft hyphens

From 33793372759002f287f837a4a51cdc6767501035 Mon Sep 17 00:00:00 2001
From: ldolse <ldolse@yahoo.com>
Date: Sat, 15 Jan 2011 10:50:01 +0800
Subject: [PATCH 14/54] tied rtf input to heuristics, removed option to not
 include softbreaks, users can combine delete_blank_paragraphs and
 remove_paragraph_spacing to achieve desired results

---
 src/calibre/ebooks/rtf/input.py | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/src/calibre/ebooks/rtf/input.py b/src/calibre/ebooks/rtf/input.py
index 92ac8a2519..2f931d1d04 100644
--- a/src/calibre/ebooks/rtf/input.py
+++ b/src/calibre/ebooks/rtf/input.py
@@ -320,11 +320,10 @@ class RTFInput(InputFormatPlugin):
             res = transform.tostring(result)
             res = res[:100].replace('xmlns:html', 'xmlns') + res[100:]
             # Replace newlines inserted by the 'empty_paragraphs' option in rtf2xml with html blank lines
-            if not getattr(self.opts, 'remove_paragraph_spacing', False):
-                res = re.sub('\s*<body>', '<body>', res)
-                res = re.sub('(?<=\n)\n{2}',
-                        u'<p>\u00a0</p>\n'.encode('utf-8'), res)
-            if self.opts.preprocess_html:
+            res = re.sub('\s*<body>', '<body>', res)
+            res = re.sub('(?<=\n)\n{2}',
+                    u'<p>\u00a0</p>\n'.encode('utf-8'), res)
+            if self.opts.enable_heuristics:
                 preprocessor = PreProcessor(self.opts, log=getattr(self, 'log', None))
                 res = preprocessor(res.decode('utf-8')).encode('utf-8')
             f.write(res)

From 6f252bb1050a6a7d66dcad365fb3992088f9fe86 Mon Sep 17 00:00:00 2001
From: ldolse <ldolse@yahoo.com>
Date: Sat, 15 Jan 2011 13:34:35 +0800
Subject: [PATCH 15/54] tied all the new heuristics options to preprocess.utils

---
 src/calibre/ebooks/conversion/plumber.py |   2 +-
 src/calibre/ebooks/conversion/utils.py   | 177 ++++++++++++-----------
 2 files changed, 97 insertions(+), 82 deletions(-)

diff --git a/src/calibre/ebooks/conversion/plumber.py b/src/calibre/ebooks/conversion/plumber.py
index a12dbd48e1..48b965f624 100644
--- a/src/calibre/ebooks/conversion/plumber.py
+++ b/src/calibre/ebooks/conversion/plumber.py
@@ -523,7 +523,7 @@ OptionRecommendation(name='delete_blank_paragraphs',
     
 OptionRecommendation(name='format_scene_breaks',
     recommended_value=False, level=OptionRecommendation.LOW,
-    help=_('Detects left aligned scene break markers and center aligns them. '
+    help=_('left aligned scene break markers are center aligned. '
            'Replace soft scene breaks that use multiple blank lines with'
            'horizontal rules.')),
 
diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py
index 417f3a1e5b..68afc464a0 100644
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@@ -18,6 +18,11 @@ class PreProcessor(object):
         self.html_preprocess_sections = 0
         self.found_indents = 0
         self.extra_opts = extra_opts
+        self.deleted_nbsps = False
+        self.min_chapters = 1
+        self.linereg = re.compile('(?<=<p).*?(?=</p>)', re.IGNORECASE|re.DOTALL)
+        self.blankreg = re.compile(r'\s*(?P<openline><p(?!\sid=\"softbreak\")[^>]*>)\s*(?P<closeline></p>)', re.IGNORECASE)
+        self.multi_blank = re.compile(r'(\s*<p[^>]*>\s*</p>){2,}', re.IGNORECASE)
 
     def is_pdftohtml(self, src):
         return '<!-- created by calibre\'s pdftohtml -->' in src[:1000]
@@ -120,7 +125,6 @@ class PreProcessor(object):
         '''
         # Typical chapters are between 2000 and 7000 words, use the larger number to decide the
         # minimum of chapters to search for
-        self.min_chapters = 1
         if wordcount > 7000:
             self.min_chapters = int(ceil(wordcount / 7000.))
         #print "minimum chapters required are: "+str(self.min_chapters)
@@ -192,21 +196,28 @@ class PreProcessor(object):
     def punctuation_unwrap(self, length, content, format):
         '''
         Unwraps lines based on line length and punctuation
-        supports range of potential html markup and text files
+        supports a range of html markup and text files
         '''
         # define the pieces of the regex
         lookahead = "(?<=.{"+str(length)+"}([a-zäëïöüàèìòùáćéíóńśúâêîôûçąężıãõñæøþðß,:)\IA\u00DF]|(?<!\&\w{4});))" # (?<!\&\w{4});) is a semicolon not part of an entity
-        line_ending = "\s*</(span|p|div)>\s*(</(p|span|div)>)?"
+        em_en_lookahead = "(?<=.{"+str(length)+"}[\u2013\u2014])"
+        line_ending = "\s*</(span|[iubp]|div)>\s*(</(span|[iubp]|div)>)?"
         blanklines = "\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)</(span|p|div)>\s*){0,3}\s*"
-        line_opening = "<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*"
+        line_opening = "<(span|[iubp]|div)[^>]*>\s*(<(span|[iubp]|div)[^>]*>)?\s*"
         txt_line_wrap = u"((\u0020|\u0009)*\n){1,4}"
 
         unwrap_regex = lookahead+line_ending+blanklines+line_opening
+        em_en_unwrap_regex = em_en_lookahead+line_ending+blanklines+line_opening
+
         if format == 'txt':
             unwrap_regex = lookahead+txt_line_wrap
+            em_en_unwrap_regex = em_en_lookahead+txt_line_wrap
 
         unwrap = re.compile(u"%s" % unwrap_regex, re.UNICODE)
+        em_en_unwrap = re.compile(u"%s" % em_en_unwrap_regex, re.UNICODE)
+
         content = unwrap.sub(' ', content)
+        content = em_en_unwrap.sub('', content)
         return content
 
 
@@ -253,8 +264,38 @@ class PreProcessor(object):
         html = re.sub(r"\s*<span[^>]*>\s*(<span[^>]*>\s*</span>){0,2}\s*</span>\s*", " ", html)
         html = re.sub(r"\s*<[ibu][^>]*>\s*(<[ibu][^>]*>\s*</[ibu]>\s*){0,2}\s*</[ibu]>", " ", html)
         html = re.sub(r"\s*<span[^>]*>\s*(<span[^>]>\s*</span>){0,2}\s*</span>\s*", " ", html)
+        self.deleted_nbsps = True
         return html
 
+    def analyze_line_endings(self, html):
+        '''
+        Report which tag the document mostly uses to end its lines.
+
+        Counts opening <p> and <span> tags (case-insensitively) and returns
+        'spanned_html' when there is more than one span and fewer than 0.75
+        paragraphs per span, otherwise 'html'.  The result is the format
+        name handed to DocAnalysis, so call this before any docanalysis step.
+        '''
+        p_count = len(re.findall('<p[^>]*>', html, re.IGNORECASE))
+        span_count = len(re.findall('<span[^>]*>', html, re.IGNORECASE))
+        # A single span (or none) is never enough to call the document spanned
+        if span_count <= 1:
+            return 'html'
+        mostly_spans = float(p_count) / float(span_count) < 0.75
+        return 'spanned_html' if mostly_spans else 'html'
+
+    def analyze_blanks(self, html):
+        '''
+        Returns True when more than 40% of the paragraphs are blank, else False
+        '''
+        blanklines = self.blankreg.findall(html)
+        lines = self.linereg.findall(html)
+        if len(lines) <= 1:
+            return False    # too little content to judge; was an implicit None
+        ratio = float(len(blanklines)) / float(len(lines))
+        self.log("There are " + unicode(len(blanklines)) + " blank lines. " + unicode(ratio) + " percent blank")
+        return ratio > 0.40
+
 
     def __call__(self, html):
         self.log("*********  Preprocessing HTML  *********")
@@ -281,97 +322,69 @@ class PreProcessor(object):
             # check if content is in pre tags, use txt processor to mark up if so
             html = self.text_process_pre(html)
 
-        ###### Mark Indents/Cleanup ######
-        #
         # Replace series of non-breaking spaces with text-indent
-        html = self.fix_nbsp_indents(html)
+        if getattr(self.extra_opts, 'fix_indents', True):
+            html = self.fix_nbsp_indents(html)
         
         html = self.cleanup_markup(html)
 
         # ADE doesn't render <br />, change to empty paragraphs
         #html = re.sub('<br[^>]*>', u'<p>\u00a0</p>', html)
 
-        # If more than 40% of the lines are empty paragraphs and the user has enabled delete
-        # blank paragraphs then delete blank lines to clean up spacing
-        linereg = re.compile('(?<=<p).*?(?=</p>)', re.IGNORECASE|re.DOTALL)
-        blankreg = re.compile(r'\s*(?P<openline><p(?!\sid=\"softbreak\")[^>]*>)\s*(?P<closeline></p>)', re.IGNORECASE)
-        multi_blank = re.compile(r'(\s*<p[^>]*>\s*</p>){2,}', re.IGNORECASE)
-        blanklines = blankreg.findall(html)
-        lines = linereg.findall(html)
-        blanks_between_paragraphs = False
-        print "delete blank paragraphs is "+str(getattr(self.extra_opts, 'delete_blank_paragraphs', False))
-        if len(lines) > 1:
-            self.log("There are " + unicode(len(blanklines)) + " blank lines. " +
-                    unicode(float(len(blanklines)) / float(len(lines))) + " percent blank")
-                    
-            if float(len(blanklines)) / float(len(lines)) > 0.40:
-                blanks_between_paragraphs = True
-                print "blanks between paragraphs is marked True"
-            else:
-                blanks_between_paragraphs = False
+        # Determine whether the document uses interleaved blank lines
+        blanks_between_paragraphs = self.analyze_blanks(html)
 
         #self.dump(html, 'before_chapter_markup')
         # detect chapters/sections to match xpath or splitting logic
-        #
 
-        html = self.markup_chapters(html, totalwords, blanks_between_paragraphs)
+        if getattr(self.extra_opts, 'markup_chapter_headings', True):
+            html = self.markup_chapters(html, totalwords, blanks_between_paragraphs)
 
+        # If more than 40% of the lines are empty paragraphs and the user has enabled delete
+        # blank paragraphs then delete blank lines to clean up spacing
         if blanks_between_paragraphs and getattr(self.extra_opts,
         'delete_blank_paragraphs', False):
             self.log("deleting blank lines")
-            html = multi_blank.sub('\n<p id="softbreak" style="margin-top:1.5em; margin-bottom:1.5em"> </p>', html)
-            html = blankreg.sub('', html)
+            html = self.multi_blank.sub('\n<p id="softbreak" style="margin-top:1.5em; margin-bottom:1.5em"> </p>', html)
+            html = self.blankreg.sub('', html)
             
         ###### Unwrap lines ######
-        #
-        # Some OCR sourced files have line breaks in the html using a combination of span & p tags
-        # span are used for hard line breaks, p for new paragraphs.  Determine which is used so
-        # that lines can be un-wrapped across page boundaries
-        paras_reg = re.compile('<p[^>]*>', re.IGNORECASE)
-        spans_reg = re.compile('<span[^>]*>', re.IGNORECASE)
-        paras = len(paras_reg.findall(html))
-        spans = len(spans_reg.findall(html))
-        if spans > 1:
-            if float(paras) / float(spans) < 0.75:
-                format = 'spanned_html'
-            else:
-                format = 'html'
-        else:
-            format = 'html'
-        # Check Line histogram to determine if the document uses hard line breaks, If 50% or
-        # more of the lines break in the same region of the document then unwrapping is required
-        docanalysis = DocAnalysis(format, html)
-        hardbreaks = docanalysis.line_histogram(.50)
-        self.log("Hard line breaks check returned "+unicode(hardbreaks))
-        # Calculate Length
-        unwrap_factor = getattr(self.extra_opts, 'html_unwrap_factor', 0.4)
-        length = docanalysis.line_length(unwrap_factor)
-        self.log("Median line length is " + unicode(length) + ", calculated with " + format + " format")
-        # only go through unwrapping code if the histogram shows unwrapping is required or if the user decreased the default unwrap_factor
-        if hardbreaks or unwrap_factor < 0.4:
-            self.log("Unwrapping required, unwrapping Lines")
-            # Unwrap em/en dashes
-            html = re.sub(u'(?<=.{%i}[\u2013\u2014])\s*(?=<)(</span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*(?=[[a-z\d])' % length, '', html)
-            # Dehyphenate
-            self.log("Unwrapping/Removing hyphens")
-            dehyphenator = Dehyphenator()
-            html = dehyphenator(html,'html', length)
-            self.log("Done dehyphenating")
-            # Unwrap lines using punctation and line length
-            #unwrap_quotes = re.compile(u"(?<=.{%i}\"')\s*</(span|p|div)>\s*(</(p|span|div)>)?\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)</(span|p|div)>\s*){0,3}\s*<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*(?=[a-z])" % length, re.UNICODE)
-            html = self.punctuation_unwrap(length, html, 'html')
-            #check any remaining hyphens, but only unwrap if there is a match
-            dehyphenator = Dehyphenator()
-            html = dehyphenator(html,'html_cleanup', length)
-        else:
-            # dehyphenate in cleanup mode to fix anything previous conversions/editing missed
-            self.log("Cleaning up hyphenation")
-            dehyphenator = Dehyphenator()
-            html = dehyphenator(html,'html_cleanup', length)
-            self.log("Done dehyphenating")
+        if getattr(self.extra_opts, 'unwrap_lines', True):
+            # Determine line ending type
+            # Some OCR sourced files have line breaks in the html using a combination of span & p tags
+            # span are used for hard line breaks, p for new paragraphs.  Determine which is used so
+            # that lines can be un-wrapped across page boundaries
+            format = self.analyze_line_endings(html)
 
-        # delete soft hyphens
-        html = re.sub(u'\xad\s*(</span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*', '', html)
+            # Check Line histogram to determine if the document uses hard line breaks, If 50% or
+            # more of the lines break in the same region of the document then unwrapping is required
+            docanalysis = DocAnalysis(format, html)
+            hardbreaks = docanalysis.line_histogram(.50)
+            self.log("Hard line breaks check returned "+unicode(hardbreaks))
+
+            # Calculate Length
+            unwrap_factor = getattr(self.extra_opts, 'html_unwrap_factor', 0.4)
+            length = docanalysis.line_length(unwrap_factor)
+            self.log("Median line length is " + unicode(length) + ", calculated with " + format + " format")
+
+            # only go through unwrapping code if the histogram shows unwrapping is required or if the user decreased the default unwrap_factor
+            if hardbreaks or unwrap_factor < 0.4:
+                self.log("Unwrapping required, unwrapping Lines")
+                # Dehyphenate with line length limiters
+                dehyphenator = Dehyphenator()
+                html = dehyphenator(html,'html', length)
+                html = self.punctuation_unwrap(length, html, 'html')
+                #check any remaining hyphens, but only unwrap if there is a match
+                dehyphenator = Dehyphenator()
+                html = dehyphenator(html,'html_cleanup', length)
+
+        if getattr(self.extra_opts, 'dehyphenate', True):
+            # dehyphenate in cleanup mode to fix anything previous conversions/editing missed
+            self.log("Fixing hyphenated content")
+            dehyphenator = Dehyphenator()
+            html = dehyphenator(html,'html_cleanup', length)
+            # delete soft hyphens
+            html = re.sub(u'\xad\s*(</span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*', '', html)
 
         # If still no sections after unwrapping mark split points on lines with no punctuation
         if self.html_preprocess_sections < self.min_chapters:
@@ -385,10 +398,12 @@ class PreProcessor(object):
         doubleheading = re.compile(r'(?P<firsthead><h(1|2)[^>]*>.+?</h(1|2)>\s*(<(?!h\d)[^>]*>\s*)*)<h(1|2)(?P<secondhead>[^>]*>.+?)</h(1|2)>', re.IGNORECASE)
         html = doubleheading.sub('\g<firsthead>'+'\n<h3'+'\g<secondhead>'+'</h3>', html)
 
-        # put back non-breaking spaces in empty paragraphs to preserve original formatting
-        html = blankreg.sub('\n'+r'\g<openline>'+u'\u00a0'+r'\g<closeline>', html)
+        if getattr(self.extra_opts, 'dehyphenate', True):
+            # Center separator lines
+            html = re.sub(u'<(?P<outer>p|div)[^>]*>\s*(<(?P<inner1>font|span|[ibu])[^>]*>)?\s*(<(?P<inner2>font|span|[ibu])[^>]*>)?\s*(<(?P<inner3>font|span|[ibu])[^>]*>)?\s*(?P<break>([*#•]+\s*)+)\s*(</(?P=inner3)>)?\s*(</(?P=inner2)>)?\s*(</(?P=inner1)>)?\s*</(?P=outer)>', '<p style="text-align:center; margin-top:1.25em; margin-bottom:1.25em">' + '\g<break>' + '</p>', html)
 
-        # Center separator lines
-        html = re.sub(u'<(?P<outer>p|div)[^>]*>\s*(<(?P<inner1>font|span|[ibu])[^>]*>)?\s*(<(?P<inner2>font|span|[ibu])[^>]*>)?\s*(<(?P<inner3>font|span|[ibu])[^>]*>)?\s*(?P<break>([*#•]+\s*)+)\s*(</(?P=inner3)>)?\s*(</(?P=inner2)>)?\s*(</(?P=inner1)>)?\s*</(?P=outer)>', '<p style="text-align:center">' + '\g<break>' + '</p>', html)
+        if self.deleted_nbsps:
+            # put back non-breaking spaces in empty paragraphs to preserve original formatting
+            html = self.blankreg.sub('\n'+r'\g<openline>'+u'\u00a0'+r'\g<closeline>', html)
 
         return html

From e581c8c5dedf0e68fa5c3ca5a06d660546e9996c Mon Sep 17 00:00:00 2001
From: ldolse <ldolse@yahoo.com>
Date: Sat, 15 Jan 2011 15:40:59 +0800
Subject: [PATCH 16/54] created sub-functions for text processing, added soft
 hyphens to punctuation unwrap

---
 src/calibre/ebooks/conversion/utils.py | 43 +++++++++++++++-----------
 1 file changed, 25 insertions(+), 18 deletions(-)

diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py
index 68afc464a0..99685e90d1 100644
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@@ -19,6 +19,7 @@ class PreProcessor(object):
         self.found_indents = 0
         self.extra_opts = extra_opts
         self.deleted_nbsps = False
+        self.totalwords = 0
         self.min_chapters = 1
         self.linereg = re.compile('(?<=<p).*?(?=</p>)', re.IGNORECASE|re.DOTALL)
         self.blankreg = re.compile(r'\s*(?P<openline><p(?!\sid=\"softbreak\")[^>]*>)\s*(?P<closeline></p>)', re.IGNORECASE)
@@ -201,6 +202,7 @@ class PreProcessor(object):
         # define the pieces of the regex
         lookahead = "(?<=.{"+str(length)+"}([a-zäëïöüàèìòùáćéíóńśúâêîôûçąężıãõñæøþðß,:)\IA\u00DF]|(?<!\&\w{4});))" # (?<!\&\w{4});) is a semicolon not part of an entity
         em_en_lookahead = "(?<=.{"+str(length)+"}[\u2013\u2014])"
+        soft_hyphen = "\xad"
         line_ending = "\s*</(span|[iubp]|div)>\s*(</(span|[iubp]|div)>)?"
         blanklines = "\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)</(span|p|div)>\s*){0,3}\s*"
         line_opening = "<(span|[iubp]|div)[^>]*>\s*(<(span|[iubp]|div)[^>]*>)?\s*"
@@ -208,10 +210,12 @@ class PreProcessor(object):
 
         unwrap_regex = lookahead+line_ending+blanklines+line_opening
         em_en_unwrap_regex = em_en_lookahead+line_ending+blanklines+line_opening
+        shy_unwrap_regex = soft_hyphen+line_ending+blanklines+line_opening
 
         if format == 'txt':
             unwrap_regex = lookahead+txt_line_wrap
             em_en_unwrap_regex = em_en_lookahead+txt_line_wrap
+            shy_unwrap_regex = soft_hyphen+txt_line_wrap
 
         unwrap = re.compile(u"%s" % unwrap_regex, re.UNICODE)
         em_en_unwrap = re.compile(u"%s" % em_en_unwrap_regex, re.UNICODE)
@@ -220,18 +224,21 @@ class PreProcessor(object):
         content = em_en_unwrap.sub('', content)
         return content
 
+    def txt_process(self, match):
+        from calibre.ebooks.txt.processor import convert_basic, preserve_spaces, \
+        separate_paragraphs_single_line
+        content = match.group('text')
+        content = separate_paragraphs_single_line(content)
+        content = preserve_spaces(content)
+        content = convert_basic(content, epub_split_size_kb=0)
+        return content
 
-    def text_process_pre(self, html):
+    def markup_pre(self, html):
         pre = re.compile(r'<pre>', re.IGNORECASE)
-        if len(pre.findall(html)) == 1:
+        if len(pre.findall(html)) >= 1:
             self.log("Running Text Processing")
-            from calibre.ebooks.txt.processor import convert_basic, preserve_spaces, \
-            separate_paragraphs_single_line
             outerhtml = re.compile(r'.*?(?<=<pre>)(?P<text>.*)(?=</pre>).*', re.IGNORECASE|re.DOTALL)
-            html = outerhtml.sub('\g<text>', html)
-            html = separate_paragraphs_single_line(html)
-            html = preserve_spaces(html)
-            html = convert_basic(html, epub_split_size_kb=0)
+            html = outerhtml.sub(self.txt_process, html)
         else:
             # Add markup naively
             # TODO - find out if there are cases where there are more than one <pre> tag or
@@ -302,25 +309,26 @@ class PreProcessor(object):
 
         # Count the words in the document to estimate how many chapters to look for and whether
         # other types of processing are attempted
-        totalwords = 0
-        totalwords = self.get_word_count(html)
+        try:
+            self.totalwords = self.get_word_count(html)
+        except:
+            self.log("Can't get wordcount")
 
-        if totalwords < 50:
+        if 0 < self.totalwords < 50:
             self.log("flow is too short, not running heuristics")
             return html
 
         # Arrange line feeds and </p> tags so the line_length and no_markup functions work correctly
         html = self.arrange_htm_line_endings(html)
 
-
         ###### Check Markup ######
         #
         # some lit files don't have any <p> tags or equivalent (generally just plain text between
         # <pre> tags), check and  mark up line endings if required before proceeding
         if self.no_markup(html, 0.1):
             self.log("not enough paragraph markers, adding now")
-            # check if content is in pre tags, use txt processor to mark up if so
-            html = self.text_process_pre(html)
+            # markup using text processing
+            html = self.markup_pre(html)
 
         # Replace series of non-breaking spaces with text-indent
         if getattr(self.extra_opts, 'fix_indents', True):
@@ -338,7 +346,7 @@ class PreProcessor(object):
         # detect chapters/sections to match xpath or splitting logic
 
         if getattr(self.extra_opts, 'markup_chapter_headings', True):
-            html = self.markup_chapters(html, totalwords, blanks_between_paragraphs)
+            html = self.markup_chapters(html, self.totalwords, blanks_between_paragraphs)
 
         # If more than 40% of the lines are empty paragraphs and the user has enabled delete
         # blank paragraphs then delete blank lines to clean up spacing
@@ -383,8 +391,6 @@ class PreProcessor(object):
             self.log("Fixing hyphenated content")
             dehyphenator = Dehyphenator()
             html = dehyphenator(html,'html_cleanup', length)
-            # delete soft hyphens
-            html = re.sub(u'\xad\s*(</span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*', '', html)
 
         # If still no sections after unwrapping mark split points on lines with no punctuation
         if self.html_preprocess_sections < self.min_chapters:
@@ -392,13 +398,14 @@ class PreProcessor(object):
                     " currently have " + unicode(self.html_preprocess_sections))
             chapdetect3 = re.compile(r'<(?P<styles>(p|div)[^>]*)>\s*(?P<section>(<span[^>]*>)?\s*(?!([*#•]+\s*)+)(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*.?(?=[a-z#\-*\s]+<)([a-z#-*]+\s*){1,5}\s*\s*(</span>)?(</[ibu]>){0,2}\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</span>)?\s*</(p|div)>)', re.IGNORECASE)
             html = chapdetect3.sub(self.chapter_break, html)
+
         # search for places where a first or second level heading is immediately followed by another
         # top level heading.  demote the second heading to h3 to prevent splitting between chapter
         # headings and titles, images, etc
         doubleheading = re.compile(r'(?P<firsthead><h(1|2)[^>]*>.+?</h(1|2)>\s*(<(?!h\d)[^>]*>\s*)*)<h(1|2)(?P<secondhead>[^>]*>.+?)</h(1|2)>', re.IGNORECASE)
         html = doubleheading.sub('\g<firsthead>'+'\n<h3'+'\g<secondhead>'+'</h3>', html)
 
-        if getattr(self.extra_opts, 'dehyphenate', True):
+        if getattr(self.extra_opts, 'format_scene_breaks', True):
             # Center separator lines
             html = re.sub(u'<(?P<outer>p|div)[^>]*>\s*(<(?P<inner1>font|span|[ibu])[^>]*>)?\s*(<(?P<inner2>font|span|[ibu])[^>]*>)?\s*(<(?P<inner3>font|span|[ibu])[^>]*>)?\s*(?P<break>([*#•]+\s*)+)\s*(</(?P=inner3)>)?\s*(</(?P=inner2)>)?\s*(</(?P=inner1)>)?\s*</(?P=outer)>', '<p style="text-align:center; margin-top:1.25em; margin-bottom:1.25em">' + '\g<break>' + '</p>', html)
 

From a44d29e840acd0eb14b43093e0a4c178da4a69a6 Mon Sep 17 00:00:00 2001
From: ldolse <ldolse@yahoo.com>
Date: Sat, 15 Jan 2011 20:13:51 +0800
Subject: [PATCH 17/54] only run cleanup_markup when required, begin
 markup_chapters rewrite

---
 src/calibre/ebooks/conversion/utils.py | 35 +++++++++++++++-----------
 1 file changed, 21 insertions(+), 14 deletions(-)

diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py
index 99685e90d1..ec175061cc 100644
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@@ -25,6 +25,16 @@ class PreProcessor(object):
         self.blankreg = re.compile(r'\s*(?P<openline><p(?!\sid=\"softbreak\")[^>]*>)\s*(?P<closeline></p>)', re.IGNORECASE)
         self.multi_blank = re.compile(r'(\s*<p[^>]*>\s*</p>){2,}', re.IGNORECASE)
 
+        self.chapter_types = [
+            [r"[^'\"]?(Introduction|Synopsis|Acknowledgements|Chapter|Kapitel|Epilogue|Volume\s|Prologue|Book\s|Part\s|Dedication|Preface)\s*([\d\w-]+\:?\'?\s*){0,5}", True, "Searching for common Chapter Headings", 'common'],
+            [r"<b[^>]*>\s*(<span[^>]*>)?\s*(?!([*#•=]+\s*)+)(\s*(?=[\d.\w#\-*\s]+<)([\d.\w#-*]+\s*){1,5}\s*)(?!\.)(</span>)?\s*</b>", True, "Searching for emphasized lines", 'emphasized'], # Emphasized lines
+            [r"[^'\"]?(\d+(\.|:)|CHAPTER)\s*([\dA-Z\-\'\"#,]+\s*){0,7}\s*", True, "Searching for numeric chapter headings", 'numeric'],  # Numeric Chapters
+            [r"([A-Z]\s+){3,}\s*([\d\w-]+\s*){0,3}\s*", True, "Searching for letter spaced headings", 'letter_spaced'],  # Spaced Lettering
+            [r"[^'\"]?(\d+\.?\s+([\d\w-]+\:?\'?-?\s?){0,5})\s*", True, "Searching for numeric chapters with titles", 'numeric_title'], # Numeric Titles
+            [r"[^'\"]?(\d+|CHAPTER)\s*([\dA-Z\-\'\"\?!#,]+\s*){0,7}\s*", True, "Searching for simple numeric chapter headings", 'plain_number'],  # Numeric Chapters, no dot or colon
+            [r"\s*[^'\"]?([A-Z#]+(\s|-){0,3}){1,5}\s*", False, "Searching for chapters with Uppercase Characters", 'uppercase' ] # Uppercase Chapters
+            ]
+
     def is_pdftohtml(self, src):
         return '<!-- created by calibre\'s pdftohtml -->' in src[:1000]
 
@@ -163,18 +173,8 @@ class PreProcessor(object):
 
         default_title = r"(<[ibu][^>]*>)?\s{0,3}([\w\'’\"-]+\s{0,3}){1,5}?(</[ibu][^>]*>)?(?=<)"
 
-        chapter_types = [
-            [r"[^'\"]?(Introduction|Synopsis|Acknowledgements|Chapter|Kapitel|Epilogue|Volume\s|Prologue|Book\s|Part\s|Dedication|Preface)\s*([\d\w-]+\:?\'?\s*){0,5}", True, "Searching for common Chapter Headings"],
-            [r"<b[^>]*>\s*(<span[^>]*>)?\s*(?!([*#•]+\s*)+)(\s*(?=[\d.\w#\-*\s]+<)([\d.\w#-*]+\s*){1,5}\s*)(?!\.)(</span>)?\s*</b>", True, "Searching for emphasized lines"], # Emphasized lines
-            [r"[^'\"]?(\d+(\.|:)|CHAPTER)\s*([\dA-Z\-\'\"#,]+\s*){0,7}\s*", True, "Searching for numeric chapter headings"],  # Numeric Chapters
-            [r"([A-Z]\s+){3,}\s*([\d\w-]+\s*){0,3}\s*", True, "Searching for letter spaced headings"],  # Spaced Lettering
-            [r"[^'\"]?(\d+\.?\s+([\d\w-]+\:?\'?-?\s?){0,5})\s*", True, "Searching for numeric chapters with titles"], # Numeric Titles
-            [r"[^'\"]?(\d+|CHAPTER)\s*([\dA-Z\-\'\"\?!#,]+\s*){0,7}\s*", True, "Searching for simple numeric chapter headings"],  # Numeric Chapters, no dot or colon
-            [r"\s*[^'\"]?([A-Z#]+(\s|-){0,3}){1,5}\s*", False, "Searching for chapters with Uppercase Characters" ] # Uppercase Chapters
-            ]
-
         # Start with most typical chapter headings, get more aggressive until one works
-        for [chapter_type, lookahead_ignorecase, log_message] in chapter_types:
+        for [chapter_type, lookahead_ignorecase, log_message, type_name] in self.chapter_types:
             if self.html_preprocess_sections >= self.min_chapters:
                 break
             full_chapter_line = chapter_line_open+chapter_header_open+chapter_type+chapter_header_close+chapter_line_close
@@ -303,6 +303,12 @@ class PreProcessor(object):
             else:
                 return False
 
+    def cleanup_required(self):
+        for option in ['unwrap_lines', 'markup_chapter_headings', 'format_scene_breaks', 'delete_blank_paragraphs']:
+            if getattr(self.extra_opts, option, False):
+                return True
+        return False
+
 
     def __call__(self, html):
         self.log("*********  Preprocessing HTML  *********")
@@ -333,8 +339,9 @@ class PreProcessor(object):
         # Replace series of non-breaking spaces with text-indent
         if getattr(self.extra_opts, 'fix_indents', True):
             html = self.fix_nbsp_indents(html)
-        
-        html = self.cleanup_markup(html)
+
+        if self.cleanup_required():
+            html = self.cleanup_markup(html)
 
         # ADE doesn't render <br />, change to empty paragraphs
         #html = re.sub('<br[^>]*>', u'<p>\u00a0</p>', html)
@@ -393,7 +400,7 @@ class PreProcessor(object):
             html = dehyphenator(html,'html_cleanup', length)
 
         # If still no sections after unwrapping mark split points on lines with no punctuation
-        if self.html_preprocess_sections < self.min_chapters:
+        if self.html_preprocess_sections < self.min_chapters and getattr(self.extra_opts, 'markup_chapter_headings', False):
             self.log("Looking for more split points based on punctuation,"
                     " currently have " + unicode(self.html_preprocess_sections))
             chapdetect3 = re.compile(r'<(?P<styles>(p|div)[^>]*)>\s*(?P<section>(<span[^>]*>)?\s*(?!([*#•]+\s*)+)(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*.?(?=[a-z#\-*\s]+<)([a-z#-*]+\s*){1,5}\s*\s*(</span>)?(</[ibu]>){0,2}\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</span>)?\s*</(p|div)>)', re.IGNORECASE)

From 1301fe69d16e452644944efbd2447f61fd6fe4fb Mon Sep 17 00:00:00 2001
From: ldolse <ldolse@yahoo.com>
Date: Sat, 15 Jan 2011 20:53:16 +0800
Subject: [PATCH 18/54] started multi-pass chapter analysis

---
 src/calibre/ebooks/conversion/utils.py | 71 +++++++++++++++++---------
 1 file changed, 47 insertions(+), 24 deletions(-)

diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py
index ec175061cc..2a88d371cc 100644
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@@ -21,20 +21,12 @@ class PreProcessor(object):
         self.deleted_nbsps = False
         self.totalwords = 0
         self.min_chapters = 1
+        self.chapters_no_title = 0
+        self.chapters_with_title = 0
         self.linereg = re.compile('(?<=<p).*?(?=</p>)', re.IGNORECASE|re.DOTALL)
         self.blankreg = re.compile(r'\s*(?P<openline><p(?!\sid=\"softbreak\")[^>]*>)\s*(?P<closeline></p>)', re.IGNORECASE)
         self.multi_blank = re.compile(r'(\s*<p[^>]*>\s*</p>){2,}', re.IGNORECASE)
 
-        self.chapter_types = [
-            [r"[^'\"]?(Introduction|Synopsis|Acknowledgements|Chapter|Kapitel|Epilogue|Volume\s|Prologue|Book\s|Part\s|Dedication|Preface)\s*([\d\w-]+\:?\'?\s*){0,5}", True, "Searching for common Chapter Headings", 'common'],
-            [r"<b[^>]*>\s*(<span[^>]*>)?\s*(?!([*#•=]+\s*)+)(\s*(?=[\d.\w#\-*\s]+<)([\d.\w#-*]+\s*){1,5}\s*)(?!\.)(</span>)?\s*</b>", True, "Searching for emphasized lines", 'emphasized'], # Emphasized lines
-            [r"[^'\"]?(\d+(\.|:)|CHAPTER)\s*([\dA-Z\-\'\"#,]+\s*){0,7}\s*", True, "Searching for numeric chapter headings", 'numeric'],  # Numeric Chapters
-            [r"([A-Z]\s+){3,}\s*([\d\w-]+\s*){0,3}\s*", True, "Searching for letter spaced headings", 'letter_spaced'],  # Spaced Lettering
-            [r"[^'\"]?(\d+\.?\s+([\d\w-]+\:?\'?-?\s?){0,5})\s*", True, "Searching for numeric chapters with titles", 'numeric_title'], # Numeric Titles
-            [r"[^'\"]?(\d+|CHAPTER)\s*([\dA-Z\-\'\"\?!#,]+\s*){0,7}\s*", True, "Searching for simple numeric chapter headings", 'plain_number'],  # Numeric Chapters, no dot or colon
-            [r"\s*[^'\"]?([A-Z#]+(\s|-){0,3}){1,5}\s*", False, "Searching for chapters with Uppercase Characters", 'uppercase' ] # Uppercase Chapters
-            ]
-
     def is_pdftohtml(self, src):
         return '<!-- created by calibre\'s pdftohtml -->' in src[:1000]
 
@@ -60,6 +52,14 @@ class PreProcessor(object):
                 " section markers based on punctuation. - " + unicode(chap))
         return '<'+styles+' style="page-break-before:always">'+chap
 
+    def analyze_title_matches(self, match):
+        chap = match.group('chap')
+        title = match.group('title')
+        if not title:
+            self.chapters_no_title = self.chapters_no_title + 1
+        else:
+            self.chapters_with_title = self.chapters_with_title + 1
+
     def insert_indent(self, match):
         pstyle = match.group('formatting')
         span = match.group('span')
@@ -173,20 +173,43 @@ class PreProcessor(object):
 
         default_title = r"(<[ibu][^>]*>)?\s{0,3}([\w\'’\"-]+\s{0,3}){1,5}?(</[ibu][^>]*>)?(?=<)"
 
-        # Start with most typical chapter headings, get more aggressive until one works
-        for [chapter_type, lookahead_ignorecase, log_message, type_name] in self.chapter_types:
-            if self.html_preprocess_sections >= self.min_chapters:
-                break
-            full_chapter_line = chapter_line_open+chapter_header_open+chapter_type+chapter_header_close+chapter_line_close
-            n_lookahead = re.sub("(ou|in|cha)", "lookahead_", full_chapter_line)
-            self.log("Marked " + unicode(self.html_preprocess_sections) + " headings, " + log_message)
-            if lookahead_ignorecase:
-                chapter_marker = init_lookahead+full_chapter_line+blank_lines+n_lookahead_open+n_lookahead+n_lookahead_close+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close
-                chapdetect = re.compile(r'%s' % chapter_marker, re.IGNORECASE)
-            else:
-                chapter_marker = init_lookahead+full_chapter_line+blank_lines+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close+n_lookahead_open+n_lookahead+n_lookahead_close
-                chapdetect = re.compile(r'%s' % chapter_marker, re.UNICODE)
-            html = chapdetect.sub(self.chapter_head, html)
+        chapter_types = [
+            [r"[^'\"]?(CHAPTER|Kapitel)\s*([\dA-Z\-\'\"\?!#,]+\s*){0,7}\s*", True, "Searching for most common chapter headings", 'chapter'],  # Highest frequency headings which include titles
+            [r"[^'\"]?(Introduction|Synopsis|Acknowledgements|Epilogue|Volume\s|Prologue|Book\s|Part\s|Dedication|Preface)\s*([\d\w-]+\:?\'?\s*){0,5}", True, "Searching for common section headings", 'common'],
+            [r"<b[^>]*>\s*(<span[^>]*>)?\s*(?!([*#•=]+\s*)+)(\s*(?=[\d.\w#\-*\s]+<)([\d.\w#-*]+\s*){1,5}\s*)(?!\.)(</span>)?\s*</b>", True, "Searching for emphasized lines", 'emphasized'], # Emphasized lines
+            [r"[^'\"]?(\d+(\.|:))\s*([\dA-Z\-\'\"#,]+\s*){0,7}\s*", True, "Searching for numeric chapter headings", 'numeric'],  # Numeric Chapters
+            [r"([A-Z]\s+){3,}\s*([\d\w-]+\s*){0,3}\s*", True, "Searching for letter spaced headings", 'letter_spaced'],  # Spaced Lettering
+            [r"[^'\"]?(\d+\.?\s+([\d\w-]+\:?\'?-?\s?){0,5})\s*", True, "Searching for numeric chapters with titles", 'numeric_title'], # Numeric Titles
+            [r"[^'\"]?(\d+)\s*([\dA-Z\-\'\"\?!#,]+\s*){0,7}\s*", True, "Searching for simple numeric headings", 'plain_number'],  # Numeric Chapters, no dot or colon
+            [r"\s*[^'\"]?([A-Z#]+(\s|-){0,3}){1,5}\s*", False, "Searching for chapters with Uppercase Characters", 'uppercase' ] # Uppercase Chapters
+            ]
+
+        def recurse_patterns(html, analyze):
+            # Start with most typical chapter headings, get more aggressive until one works
+            for [chapter_type, lookahead_ignorecase, log_message, type_name] in chapter_types:
+                if self.html_preprocess_sections >= self.min_chapters:
+                    break
+                full_chapter_line = chapter_line_open+chapter_header_open+chapter_type+chapter_header_close+chapter_line_close
+                n_lookahead = re.sub("(ou|in|cha)", "lookahead_", full_chapter_line)
+                self.log("Marked " + unicode(self.html_preprocess_sections) + " headings, " + log_message)
+                if lookahead_ignorecase:
+                    chapter_marker = init_lookahead+full_chapter_line+blank_lines+n_lookahead_open+n_lookahead+n_lookahead_close+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close
+                    chapdetect = re.compile(r'%s' % chapter_marker, re.IGNORECASE)
+                else:
+                    chapter_marker = init_lookahead+full_chapter_line+blank_lines+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close+n_lookahead_open+n_lookahead+n_lookahead_close
+                    chapdetect = re.compile(r'%s' % chapter_marker, re.UNICODE)
+                if analyze:
+                    hits = len(chapdetect.findall(html))
+                    print unicode(type_name)+" had "+unicode(hits)+" hits"
+                    chapdetect.sub(self.analyze_title_matches, html)
+                    print unicode(self.chapters_no_title)+" chapters with no title"
+                    print unicode(self.chapters_with_title)+" chapters with titles"
+                else:
+                    html = chapdetect.sub(self.chapter_head, html)
+                    return html
+
+        recurse_patterns(html, True)
+        html = recurse_patterns(html, False)
 
         words_per_chptr = wordcount
         if words_per_chptr > 0 and self.html_preprocess_sections > 0:

From fabd4f5fdfdc56d98c96ffa2b8b726fcf60c340b Mon Sep 17 00:00:00 2001
From: John Schember <john@nachtimwald.com>
Date: Sat, 15 Jan 2011 08:36:15 -0500
Subject: [PATCH 19/54] Clean up search and replace GUI widget.

---
 .../gui2/convert/search_and_replace.py        | 14 ++--
 .../gui2/convert/search_and_replace.ui        | 68 ++++++++++++++++---
 2 files changed, 62 insertions(+), 20 deletions(-)

diff --git a/src/calibre/gui2/convert/search_and_replace.py b/src/calibre/gui2/convert/search_and_replace.py
index de9033a46e..36a496c520 100644
--- a/src/calibre/gui2/convert/search_and_replace.py
+++ b/src/calibre/gui2/convert/search_and_replace.py
@@ -24,25 +24,19 @@ class SearchAndReplaceWidget(Widget, Ui_Form):
                 )
         self.db, self.book_id = db, book_id
         self.initialize_options(get_option, get_help, db, book_id)
-        self.opt_sr1_search.set_msg(_('Search regular expression 1:'))
-        self.opt_sr1_replace.set_msg(_('Replace regular expression 1:'))
-        self.opt_sr2_search.set_msg(_('Search regular expression 2:'))
-        self.opt_sr2_replace.set_msg(_('Replace regular expression 2:'))
-        self.opt_sr3_search.set_msg(_('Search regular expression 3:'))
-        self.opt_sr3_replace.set_msg(_('Replace regular expression 3:'))
+        self.opt_sr1_search.set_msg(_('Regular Expression'))
+        self.opt_sr2_search.set_msg(_('Regular Expression'))
+        self.opt_sr3_search.set_msg(_('Regular Expression'))
         
     def break_cycles(self):
         Widget.break_cycles(self)
         
         self.opt_sr1_search.break_cycles()
-        self.opt_sr1_replace.break_cycles()
         self.opt_sr2_search.break_cycles()
-        self.opt_sr2_replace.break_cycles()
         self.opt_sr3_search.break_cycles()
-        self.opt_sr3_replace.break_cycles()
 
     def pre_commit_check(self):
-        for x in ('sr1-search', 'sr1-replace', 'sr2-search', 'sr2-replace', 'sr3-search', 'sr3-replace',):
+        for x in ('sr1-search', 'sr2-search', 'sr3-search'):
             x = getattr(self, 'opt_'+x)
             try:
                 pat = unicode(x.regex)
diff --git a/src/calibre/gui2/convert/search_and_replace.ui b/src/calibre/gui2/convert/search_and_replace.ui
index 5913f2c098..ed500a4dd0 100644
--- a/src/calibre/gui2/convert/search_and_replace.ui
+++ b/src/calibre/gui2/convert/search_and_replace.ui
@@ -13,24 +13,72 @@
   <property name="windowTitle">
    <string>Form</string>
   </property>
-  <layout class="QVBoxLayout" name="verticalLayout">
-   <item>
+  <layout class="QGridLayout" name="gridLayout">
+   <item row="0" column="1">
+    <widget class="QLabel" name="label_4">
+     <property name="text">
+      <string>Search</string>
+     </property>
+    </widget>
+   </item>
+   <item row="0" column="2">
+    <widget class="QLabel" name="label_5">
+     <property name="text">
+      <string>Replace</string>
+     </property>
+    </widget>
+   </item>
+   <item row="1" column="0">
+    <widget class="QLabel" name="label">
+     <property name="text">
+      <string>1.</string>
+     </property>
+    </widget>
+   </item>
+   <item row="1" column="1">
     <widget class="RegexEdit" name="opt_sr1_search" native="true"/>
    </item>
-   <item>
-    <widget class="RegexEdit" name="opt_sr1_replace" native="true"/>
+   <item row="1" column="2">
+    <widget class="QLineEdit" name="opt_sr1_replace"/>
    </item>
-   <item>
+   <item row="2" column="0">
+    <widget class="QLabel" name="label_2">
+     <property name="text">
+      <string>2.</string>
+     </property>
+    </widget>
+   </item>
+   <item row="2" column="1">
     <widget class="RegexEdit" name="opt_sr2_search" native="true"/>
    </item>
-   <item>
-    <widget class="RegexEdit" name="opt_sr2_replace" native="true"/>
+   <item row="2" column="2">
+    <widget class="QLineEdit" name="opt_sr2_replace"/>
    </item>
-   <item>
+   <item row="3" column="0">
+    <widget class="QLabel" name="label_3">
+     <property name="text">
+      <string>3.</string>
+     </property>
+    </widget>
+   </item>
+   <item row="3" column="1">
     <widget class="RegexEdit" name="opt_sr3_search" native="true"/>
    </item>
-   <item>
-    <widget class="RegexEdit" name="opt_sr3_replace" native="true"/>
+   <item row="3" column="2">
+    <widget class="QLineEdit" name="opt_sr3_replace"/>
+   </item>
+   <item row="4" column="1">
+    <spacer name="verticalSpacer">
+     <property name="orientation">
+      <enum>Qt::Vertical</enum>
+     </property>
+     <property name="sizeHint" stdset="0">
+      <size>
+       <width>20</width>
+       <height>330</height>
+      </size>
+     </property>
+    </spacer>
    </item>
   </layout>
  </widget>

From cfaa113f9557b9359208409a538302d9ec0af1d4 Mon Sep 17 00:00:00 2001
From: John Schember <john@nachtimwald.com>
Date: Sat, 15 Jan 2011 09:05:08 -0500
Subject: [PATCH 20/54] Move italic marking to preprocessor. Have TXT input use
 the preprocessor for heuristics. Change preprocessor getattr to default to
 False otherwise every option set to off will run.

---
 src/calibre/ebooks/conversion/utils.py       | 46 +++++++++++++---
 src/calibre/ebooks/txt/heuristicprocessor.py | 58 --------------------
 src/calibre/ebooks/txt/input.py              | 13 +++--
 src/calibre/ebooks/txt/processor.py          |  5 --
 4 files changed, 48 insertions(+), 74 deletions(-)
 delete mode 100644 src/calibre/ebooks/txt/heuristicprocessor.py

diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py
index 2a88d371cc..56d4339d8c 100644
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@@ -128,6 +128,36 @@ class PreProcessor(object):
         wordcount = get_wordcount_obj(word_count_text)
         return wordcount.words
 
+    def markup_italicis(self, html):
+        ITALICIZE_WORDS = [
+            'Etc.', 'etc.', 'viz.', 'ie.', 'i.e.', 'Ie.', 'I.e.', 'eg.',
+            'e.g.', 'Eg.', 'E.g.', 'et al.', 'et cetra', 'n.b.', 'N.b.',
+            'nota bene', 'Nota bene', 'Ste.', 'Mme.', 'Mdme.',
+            'Mlle.', 'Mons.', 'PS.', 'PPS.',
+        ]
+        
+        ITALICIZE_STYLE_PATS = [
+            r'(?msu)_(?P<words>.+?)_',
+            r'(?msu)/(?P<words>[^<>]+?)/',
+            r'(?msu)~~(?P<words>.+?)~~',
+            r'(?msu)\*(?P<words>.+?)\*',
+            r'(?msu)~(?P<words>.+?)~',
+            r'(?msu)_/(?P<words>[^<>]+?)/_',
+            r'(?msu)_\*(?P<words>.+?)\*_',
+            r'(?msu)\*/(?P<words>[^<>]+?)/\*',
+            r'(?msu)_\*/(?P<words>[^<>]+?)/\*_',
+            r'(?msu)/:(?P<words>[^<>]+?):/',
+            r'(?msu)\|:(?P<words>.+?):\|',
+        ]
+        
+        for word in ITALICIZE_WORDS:
+            html = html.replace(word, '<i>%s</i>' % word)
+
+        for pat in ITALICIZE_STYLE_PATS:
+            html = re.sub(pat, lambda mo: '<i>%s</i>' % mo.group('words'), html)
+
+        return html
+
     def markup_chapters(self, html, wordcount, blanks_between_paragraphs):
         '''
         Searches for common chapter headings throughout the document
@@ -360,7 +390,7 @@ class PreProcessor(object):
             html = self.markup_pre(html)
 
         # Replace series of non-breaking spaces with text-indent
-        if getattr(self.extra_opts, 'fix_indents', True):
+        if getattr(self.extra_opts, 'fix_indents', False):
             html = self.fix_nbsp_indents(html)
 
         if self.cleanup_required():
@@ -375,19 +405,21 @@ class PreProcessor(object):
         #self.dump(html, 'before_chapter_markup')
         # detect chapters/sections to match xpath or splitting logic
 
-        if getattr(self.extra_opts, 'markup_chapter_headings', True):
+        if getattr(self.extra_opts, 'markup_chapter_headings', False):
             html = self.markup_chapters(html, self.totalwords, blanks_between_paragraphs)
 
+        if getattr(self.extra_opts, 'italicize_common_cases', False): 
+            html = self.markup_italicis(html)
+
         # If more than 40% of the lines are empty paragraphs and the user has enabled delete
         # blank paragraphs then delete blank lines to clean up spacing
-        if blanks_between_paragraphs and getattr(self.extra_opts,
-        'delete_blank_paragraphs', False):
+        if blanks_between_paragraphs and getattr(self.extra_opts, 'delete_blank_paragraphs', False):
             self.log("deleting blank lines")
             html = self.multi_blank.sub('\n<p id="softbreak" style="margin-top:1.5em; margin-bottom:1.5em"> </p>', html)
             html = self.blankreg.sub('', html)
             
         ###### Unwrap lines ######
-        if getattr(self.extra_opts, 'unwrap_lines', True):
+        if getattr(self.extra_opts, 'unwrap_lines', False):
             # Determine line ending type
             # Some OCR sourced files have line breaks in the html using a combination of span & p tags
             # span are used for hard line breaks, p for new paragraphs.  Determine which is used so
@@ -416,7 +448,7 @@ class PreProcessor(object):
                 dehyphenator = Dehyphenator()
                 html = dehyphenator(html,'html_cleanup', length)
 
-        if getattr(self.extra_opts, 'dehyphenate', True):
+        if getattr(self.extra_opts, 'dehyphenate', False):
             # dehyphenate in cleanup mode to fix anything previous conversions/editing missed
             self.log("Fixing hyphenated content")
             dehyphenator = Dehyphenator()
@@ -435,7 +467,7 @@ class PreProcessor(object):
         doubleheading = re.compile(r'(?P<firsthead><h(1|2)[^>]*>.+?</h(1|2)>\s*(<(?!h\d)[^>]*>\s*)*)<h(1|2)(?P<secondhead>[^>]*>.+?)</h(1|2)>', re.IGNORECASE)
         html = doubleheading.sub('\g<firsthead>'+'\n<h3'+'\g<secondhead>'+'</h3>', html)
 
-        if getattr(self.extra_opts, 'format_scene_breaks', True):
+        if getattr(self.extra_opts, 'format_scene_breaks', False):
             # Center separator lines
             html = re.sub(u'<(?P<outer>p|div)[^>]*>\s*(<(?P<inner1>font|span|[ibu])[^>]*>)?\s*(<(?P<inner2>font|span|[ibu])[^>]*>)?\s*(<(?P<inner3>font|span|[ibu])[^>]*>)?\s*(?P<break>([*#•]+\s*)+)\s*(</(?P=inner3)>)?\s*(</(?P=inner2)>)?\s*(</(?P=inner1)>)?\s*</(?P=outer)>', '<p style="text-align:center; margin-top:1.25em; margin-bottom:1.25em">' + '\g<break>' + '</p>', html)
 
diff --git a/src/calibre/ebooks/txt/heuristicprocessor.py b/src/calibre/ebooks/txt/heuristicprocessor.py
deleted file mode 100644
index b9d18fd23a..0000000000
--- a/src/calibre/ebooks/txt/heuristicprocessor.py
+++ /dev/null
@@ -1,58 +0,0 @@
-# -*- coding: utf-8 -*-
-
-__license__ = 'GPL 3'
-__copyright__ = '2011, John Schember <john@nachtimwald.com>'
-__docformat__ = 'restructuredtext en'
-
-import re
-
-from calibre import prepare_string_for_xml
-
-class TXTHeuristicProcessor(object):
-
-    def __init__(self):
-        self.ITALICIZE_WORDS = [
-            'Etc.', 'etc.', 'viz.', 'ie.', 'i.e.', 'Ie.', 'I.e.', 'eg.',
-            'e.g.', 'Eg.', 'E.g.', 'et al.', 'et cetra', 'n.b.', 'N.b.',
-            'nota bene', 'Nota bene', 'Ste.', 'Mme.', 'Mdme.',
-            'Mlle.', 'Mons.', 'PS.', 'PPS.',
-        ]
-        self.ITALICIZE_STYLE_PATS = [
-            r'(?msu)_(?P<words>.+?)_',
-            r'(?msu)/(?P<words>[^<>]+?)/',
-            r'(?msu)~~(?P<words>.+?)~~',
-            r'(?msu)\*(?P<words>.+?)\*',
-            r'(?msu)~(?P<words>.+?)~',
-            r'(?msu)_/(?P<words>[^<>]+?)/_',
-            r'(?msu)_\*(?P<words>.+?)\*_',
-            r'(?msu)\*/(?P<words>[^<>]+?)/\*',
-            r'(?msu)_\*/(?P<words>[^<>]+?)/\*_',
-            r'(?msu)/:(?P<words>[^<>]+?):/',
-            r'(?msu)\|:(?P<words>.+?):\|',
-        ]
-
-    def process_paragraph(self, paragraph):
-        for word in self.ITALICIZE_WORDS:
-            paragraph = paragraph.replace(word, '<i>%s</i>' % word)
-        for pat in self.ITALICIZE_STYLE_PATS:
-            paragraph = re.sub(pat, lambda mo: '<i>%s</i>' % mo.group('words'), paragraph)
-        return paragraph
-
-    def convert(self, txt, title='', epub_split_size_kb=0):
-        from calibre.ebooks.txt.processor import clean_txt, split_txt, HTML_TEMPLATE
-        txt = clean_txt(txt)
-        txt = split_txt(txt, epub_split_size_kb)
-
-        processed = []
-        for line in txt.split('\n\n'):
-            processed.append(u'<p>%s</p>' % self.process_paragraph(prepare_string_for_xml(line.replace('\n', ' '))))
-
-        txt = u'\n'.join(processed)
-        txt = re.sub('[ ]{2,}', ' ', txt)
-        html = HTML_TEMPLATE % (title, txt)
-
-        from calibre.ebooks.conversion.utils import PreProcessor
-        pp = PreProcessor()
-        html = pp.markup_chapters(html, pp.get_word_count(html), False)
-
-        return html
diff --git a/src/calibre/ebooks/txt/input.py b/src/calibre/ebooks/txt/input.py
index 0b0bd6d570..5cffbafe21 100644
--- a/src/calibre/ebooks/txt/input.py
+++ b/src/calibre/ebooks/txt/input.py
@@ -12,7 +12,7 @@ from calibre.ebooks.chardet import detect
 from calibre.ebooks.txt.processor import convert_basic, convert_markdown, \
     separate_paragraphs_single_line, separate_paragraphs_print_formatted, \
     preserve_spaces, detect_paragraph_type, detect_formatting_type, \
-    convert_heuristic, normalize_line_endings, convert_textile
+    normalize_line_endings, convert_textile
 from calibre import _ent_pat, xml_entity_to_unicode
 
 class TXTInput(InputFormatPlugin):
@@ -126,11 +126,16 @@ class TXTInput(InputFormatPlugin):
                 txt = preprocessor.punctuation_unwrap(length, txt, 'txt')
 
             flow_size = getattr(options, 'flow_size', 0)
+            html = convert_basic(txt, epub_split_size_kb=flow_size)
 
             if options.formatting_type == 'heuristic':
-                html = convert_heuristic(txt, epub_split_size_kb=flow_size)
-            else:
-                html = convert_basic(txt, epub_split_size_kb=flow_size)
+                setattr(options, 'enable_heuristics', True)
+                setattr(options, 'markup_chapter_headings', True)
+                setattr(options, 'italicize_common_cases', True)
+                setattr(options, 'fix_indents', True)
+                setattr(options, 'delete_blank_paragraphs', True)
+                setattr(options, 'format_scene_breaks', True)
+                setattr(options, 'dehyphenate', True)
 
         # Dehyphenate in cleanup mode for missed txt and markdown conversion
         dehyphenator = Dehyphenator()
diff --git a/src/calibre/ebooks/txt/processor.py b/src/calibre/ebooks/txt/processor.py
index e1979063c0..9fd8af0d70 100644
--- a/src/calibre/ebooks/txt/processor.py
+++ b/src/calibre/ebooks/txt/processor.py
@@ -12,7 +12,6 @@ import os, re
 
 from calibre import prepare_string_for_xml, isbytestring
 from calibre.ebooks.metadata.opf2 import OPFCreator
-from calibre.ebooks.txt.heuristicprocessor import TXTHeuristicProcessor
 from calibre.ebooks.conversion.preprocess import DocAnalysis
 from calibre.utils.cleantext import clean_ascii_chars
 
@@ -67,10 +66,6 @@ def convert_basic(txt, title='', epub_split_size_kb=0):
 
     return HTML_TEMPLATE % (title, u'\n'.join(lines))
 
-def convert_heuristic(txt, title='', epub_split_size_kb=0):
-    tp = TXTHeuristicProcessor()
-    return tp.convert(txt, title, epub_split_size_kb)
-
 def convert_markdown(txt, title='', disable_toc=False):
     from calibre.ebooks.markdown import markdown
     md = markdown.Markdown(

From 946f1cf6c0e332898d34a7cf41680b6b2e3fce7b Mon Sep 17 00:00:00 2001
From: ldolse <ldolse@yahoo.com>
Date: Sat, 15 Jan 2011 22:07:20 +0800
Subject: [PATCH 21/54] added option for renumbering heading tags

---
 src/calibre/ebooks/conversion/cli.py     |  2 +-
 src/calibre/ebooks/conversion/plumber.py |  8 +++++++-
 src/calibre/ebooks/conversion/utils.py   | 15 ++++++++-------
 3 files changed, 16 insertions(+), 9 deletions(-)

diff --git a/src/calibre/ebooks/conversion/cli.py b/src/calibre/ebooks/conversion/cli.py
index db1ec0857d..c9612d97b9 100644
--- a/src/calibre/ebooks/conversion/cli.py
+++ b/src/calibre/ebooks/conversion/cli.py
@@ -137,7 +137,7 @@ def add_pipeline_options(parser, plumber):
                       'italicize_common_cases', 'fix_indents',
                       'html_unwrap_factor', 'unwrap_lines',
                       'delete_blank_paragraphs', 'format_scene_breaks',
-                      'dehyphenate',
+                      'dehyphenate', 'renumber_headings',
                   ]
                   ),
                   
diff --git a/src/calibre/ebooks/conversion/plumber.py b/src/calibre/ebooks/conversion/plumber.py
index 48b965f624..b8c45dfa14 100644
--- a/src/calibre/ebooks/conversion/plumber.py
+++ b/src/calibre/ebooks/conversion/plumber.py
@@ -532,7 +532,13 @@ OptionRecommendation(name='dehyphenate',
     help=_('Analyses hyphenated words throughout the document.  The '
            'document itself is used as a dictionary to determine whether hyphens '
            'should be retained or removed.')),
-    
+
+OptionRecommendation(name='renumber_headings',
+    recommended_value=False, level=OptionRecommendation.LOW,
+    help=_('Looks for occurences of sequential <h1> or <h2> tags. '
+           'The tags are renumbered to prevent splitting in the middle '
+           'of chapter headings.')),
+
 OptionRecommendation(name='sr1_search',
     recommended_value='', level=OptionRecommendation.LOW,
     help=_('Search pattern (regular expression) to be replaced with '
diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py
index 2a88d371cc..4c62d2c06f 100644
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@@ -416,7 +416,7 @@ class PreProcessor(object):
                 dehyphenator = Dehyphenator()
                 html = dehyphenator(html,'html_cleanup', length)
 
-        if getattr(self.extra_opts, 'dehyphenate', True):
+        if getattr(self.extra_opts, 'dehyphenate', False):
             # dehyphenate in cleanup mode to fix anything previous conversions/editing missed
             self.log("Fixing hyphenated content")
             dehyphenator = Dehyphenator()
@@ -429,13 +429,14 @@ class PreProcessor(object):
             chapdetect3 = re.compile(r'<(?P<styles>(p|div)[^>]*)>\s*(?P<section>(<span[^>]*>)?\s*(?!([*#•]+\s*)+)(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*.?(?=[a-z#\-*\s]+<)([a-z#-*]+\s*){1,5}\s*\s*(</span>)?(</[ibu]>){0,2}\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</span>)?\s*</(p|div)>)', re.IGNORECASE)
             html = chapdetect3.sub(self.chapter_break, html)
 
-        # search for places where a first or second level heading is immediately followed by another
-        # top level heading.  demote the second heading to h3 to prevent splitting between chapter
-        # headings and titles, images, etc
-        doubleheading = re.compile(r'(?P<firsthead><h(1|2)[^>]*>.+?</h(1|2)>\s*(<(?!h\d)[^>]*>\s*)*)<h(1|2)(?P<secondhead>[^>]*>.+?)</h(1|2)>', re.IGNORECASE)
-        html = doubleheading.sub('\g<firsthead>'+'\n<h3'+'\g<secondhead>'+'</h3>', html)
+        if getattr(self.extra_opts, 'renumber_headings', True):
+            # search for places where a first or second level heading is immediately followed by another
+            # top level heading.  demote the second heading to h3 to prevent splitting between chapter
+            # headings and titles, images, etc
+            doubleheading = re.compile(r'(?P<firsthead><h(1|2)[^>]*>.+?</h(1|2)>\s*(<(?!h\d)[^>]*>\s*)*)<h(1|2)(?P<secondhead>[^>]*>.+?)</h(1|2)>', re.IGNORECASE)
+            html = doubleheading.sub('\g<firsthead>'+'\n<h3'+'\g<secondhead>'+'</h3>', html)
 
-        if getattr(self.extra_opts, 'format_scene_breaks', True):
+        if getattr(self.extra_opts, 'format_scene_breaks', False):
             # Center separator lines
             html = re.sub(u'<(?P<outer>p|div)[^>]*>\s*(<(?P<inner1>font|span|[ibu])[^>]*>)?\s*(<(?P<inner2>font|span|[ibu])[^>]*>)?\s*(<(?P<inner3>font|span|[ibu])[^>]*>)?\s*(?P<break>([*#•]+\s*)+)\s*(</(?P=inner3)>)?\s*(</(?P=inner2)>)?\s*(</(?P=inner1)>)?\s*</(?P=outer)>', '<p style="text-align:center; margin-top:1.25em; margin-bottom:1.25em">' + '\g<break>' + '</p>', html)
 

From 81c365b3a9efd8b546fa096bbe4eb737e607b6ba Mon Sep 17 00:00:00 2001
From: ldolse <ldolse@yahoo.com>
Date: Sat, 15 Jan 2011 22:41:38 +0800
Subject: [PATCH 22/54] ...

---
 src/calibre/ebooks/conversion/utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py
index 96d386bf78..3693d11cee 100644
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@@ -461,7 +461,7 @@ class PreProcessor(object):
             chapdetect3 = re.compile(r'<(?P<styles>(p|div)[^>]*)>\s*(?P<section>(<span[^>]*>)?\s*(?!([*#•]+\s*)+)(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*.?(?=[a-z#\-*\s]+<)([a-z#-*]+\s*){1,5}\s*\s*(</span>)?(</[ibu]>){0,2}\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</span>)?\s*</(p|div)>)', re.IGNORECASE)
             html = chapdetect3.sub(self.chapter_break, html)
 
-        if getattr(self.extra_opts, 'renumber_headings', True):
+        if getattr(self.extra_opts, 'renumber_headings', False):
             # search for places where a first or second level heading is immediately followed by another
             # top level heading.  demote the second heading to h3 to prevent splitting between chapter
             # headings and titles, images, etc

From 0edf1e550ea35f4b63208138ecd07c3d5dcb6856 Mon Sep 17 00:00:00 2001
From: ldolse <ldolse@yahoo.com>
Date: Sat, 15 Jan 2011 22:47:51 +0800
Subject: [PATCH 23/54] ...

---
 src/calibre/ebooks/conversion/utils.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py
index 3693d11cee..305346d496 100644
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@@ -272,9 +272,11 @@ class PreProcessor(object):
 
         unwrap = re.compile(u"%s" % unwrap_regex, re.UNICODE)
         em_en_unwrap = re.compile(u"%s" % em_en_unwrap_regex, re.UNICODE)
+        shy_unwrap = re.compile(u"%s" % shy_unwrap_regex, re.UNICODE)
 
         content = unwrap.sub(' ', content)
         content = em_en_unwrap.sub('', content)
+        content = shy_unwrap.sub('', content)
         return content
 
     def txt_process(self, match):

From d6256ef452c130c471a184a9517b50e247e6f854 Mon Sep 17 00:00:00 2001
From: John Schember <john@nachtimwald.com>
Date: Sat, 15 Jan 2011 11:06:43 -0500
Subject: [PATCH 24/54] Add renumber_headings option to GUI.

---
 src/calibre/gui2/convert/heuristics.py |  3 ++-
 src/calibre/gui2/convert/heuristics.ui | 21 ++++++++++++++-------
 2 files changed, 16 insertions(+), 8 deletions(-)

diff --git a/src/calibre/gui2/convert/heuristics.py b/src/calibre/gui2/convert/heuristics.py
index 2b9df50457..904804f32e 100644
--- a/src/calibre/gui2/convert/heuristics.py
+++ b/src/calibre/gui2/convert/heuristics.py
@@ -21,7 +21,7 @@ class HeuristicsWidget(Widget, Ui_Form):
                  'italicize_common_cases', 'fix_indents',
                  'html_unwrap_factor', 'unwrap_lines',
                  'delete_blank_paragraphs', 'format_scene_breaks',
-                 'dehyphenate']
+                 'dehyphenate', 'renumber_headings']
                 )
         self.db, self.book_id = db, book_id
         self.initialize_options(get_option, get_help, db, book_id)
@@ -53,6 +53,7 @@ class HeuristicsWidget(Widget, Ui_Form):
         self.opt_delete_blank_paragraphs.setEnabled(state)
         self.opt_format_scene_breaks.setEnabled(state)
         self.opt_dehyphenate.setEnabled(state)
+        self.opt_renumber_headings(state)
         
         self.opt_unwrap_lines.setEnabled(state)
         if state and self.opt_unwrap_lines.checkState() == Qt.Checked:
diff --git a/src/calibre/gui2/convert/heuristics.ui b/src/calibre/gui2/convert/heuristics.ui
index e64e79e1df..c5f3c2cb3e 100644
--- a/src/calibre/gui2/convert/heuristics.ui
+++ b/src/calibre/gui2/convert/heuristics.ui
@@ -6,7 +6,7 @@
    <rect>
     <x>0</x>
     <y>0</y>
-    <width>657</width>
+    <width>811</width>
     <height>479</height>
    </rect>
   </property>
@@ -80,42 +80,42 @@
         </property>
        </widget>
       </item>
-      <item row="3" column="0" colspan="2">
+      <item row="4" column="0" colspan="2">
        <widget class="QCheckBox" name="opt_delete_blank_paragraphs">
         <property name="text">
          <string>Delete blank lines between paragraphs</string>
         </property>
        </widget>
       </item>
-      <item row="4" column="0" colspan="2">
+      <item row="5" column="0" colspan="2">
        <widget class="QCheckBox" name="opt_format_scene_breaks">
         <property name="text">
          <string>Ensure scene breaks are consistently formatted</string>
         </property>
        </widget>
       </item>
-      <item row="5" column="0" colspan="2">
+      <item row="6" column="0" colspan="2">
        <widget class="QCheckBox" name="opt_dehyphenate">
         <property name="text">
          <string>Remove unnecessary hyphens</string>
         </property>
        </widget>
       </item>
-      <item row="6" column="0" colspan="2">
+      <item row="7" column="0" colspan="2">
        <widget class="QCheckBox" name="opt_italicize_common_cases">
         <property name="text">
          <string>Italicize common words and patterns</string>
         </property>
        </widget>
       </item>
-      <item row="7" column="0" colspan="2">
+      <item row="8" column="0" colspan="2">
        <widget class="QCheckBox" name="opt_fix_indents">
         <property name="text">
          <string>Replace entity indents with CSS indents</string>
         </property>
        </widget>
       </item>
-      <item row="8" column="0">
+      <item row="9" column="0">
        <spacer name="verticalSpacer">
         <property name="orientation">
          <enum>Qt::Vertical</enum>
@@ -141,6 +141,13 @@
         </property>
        </spacer>
       </item>
+      <item row="3" column="0">
+       <widget class="QCheckBox" name="opt_renumber_headings">
+        <property name="text">
+         <string>Renumber sequences of &lt;h1&gt; or &lt;h2&gt; tags to prevent splitting</string>
+        </property>
+       </widget>
+      </item>
      </layout>
     </widget>
    </item>

From 64796696ae0bec276c798bcc12e8b6d10a878788 Mon Sep 17 00:00:00 2001
From: John Schember <john@nachtimwald.com>
Date: Sat, 15 Jan 2011 12:35:02 -0500
Subject: [PATCH 25/54] Enable heuristic processing over the entire conversion
 pipeline when option is enabled.

---
 src/calibre/customize/conversion.py         | 12 ------------
 src/calibre/ebooks/conversion/plumber.py    |  6 ++----
 src/calibre/ebooks/conversion/preprocess.py | 12 ++++++------
 src/calibre/ebooks/conversion/utils.py      |  4 ++--
 src/calibre/ebooks/html/input.py            |  7 -------
 src/calibre/ebooks/lit/input.py             |  9 +--------
 src/calibre/ebooks/lrf/input.py             |  9 ---------
 src/calibre/ebooks/pdb/input.py             |  1 -
 src/calibre/ebooks/rtf/input.py             |  1 -
 9 files changed, 11 insertions(+), 50 deletions(-)

diff --git a/src/calibre/customize/conversion.py b/src/calibre/customize/conversion.py
index a9e573ffa0..b77ac81587 100644
--- a/src/calibre/customize/conversion.py
+++ b/src/calibre/customize/conversion.py
@@ -160,18 +160,6 @@ class InputFormatPlugin(Plugin):
         '''
         raise NotImplementedError()
 
-    def heuristics(self, opts, html):
-        '''
-        This method is called by the conversion pipeline on all HTML before it
-        is parsed. It is meant to be used to do any required preprocessing on
-        the HTML, like removing hard line breaks, etc.
-
-        :param html: A unicode string
-        :return: A unicode string
-        '''
-        return html
-
-
     def convert(self, stream, options, file_ext, log, accelerators):
         '''
         This method must be implemented in sub-classes. It must return
diff --git a/src/calibre/ebooks/conversion/plumber.py b/src/calibre/ebooks/conversion/plumber.py
index b8c45dfa14..249f848661 100644
--- a/src/calibre/ebooks/conversion/plumber.py
+++ b/src/calibre/ebooks/conversion/plumber.py
@@ -899,7 +899,6 @@ OptionRecommendation(name='sr3_replace',
                 self.opts_to_mi(self.user_metadata)
             if not hasattr(self.oeb, 'manifest'):
                 self.oeb = create_oebbook(self.log, self.oeb, self.opts,
-                        self.input_plugin,
                         encoding=self.input_plugin.output_encoding)
             self.input_plugin.postprocess_book(self.oeb, self.opts, self.log)
             self.opts.is_image_collection = self.input_plugin.is_image_collection
@@ -1009,14 +1008,13 @@ OptionRecommendation(name='sr3_replace',
         self.log(self.output_fmt.upper(), 'output written to', self.output)
         self.flush()
 
-def create_oebbook(log, path_or_stream, opts, input_plugin, reader=None,
+def create_oebbook(log, path_or_stream, opts, reader=None,
         encoding='utf-8', populate=True):
     '''
     Create an OEBBook.
     '''
     from calibre.ebooks.oeb.base import OEBBook
-    html_preprocessor = HTMLPreProcessor(input_plugin.heuristics,
-            opts.enable_heuristics, opts)
+    html_preprocessor = HTMLPreProcessor(log, opts)
     if not encoding:
         encoding = None
     oeb = OEBBook(log, html_preprocessor,
diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py
index 35a311d58f..abaff77f33 100644
--- a/src/calibre/ebooks/conversion/preprocess.py
+++ b/src/calibre/ebooks/conversion/preprocess.py
@@ -397,10 +397,8 @@ class HTMLPreProcessor(object):
                      (re.compile('<span[^><]*?id=subtitle[^><]*?>(.*?)</span>', re.IGNORECASE|re.DOTALL),
                       lambda match : '<h3 class="subtitle">%s</h3>'%(match.group(1),)),
                      ]
-    def __init__(self, input_plugin_preprocess, plugin_preprocess,
-            extra_opts=None):
-        self.input_plugin_preprocess = input_plugin_preprocess
-        self.plugin_preprocess = plugin_preprocess
+    def __init__(self, log=None, extra_opts=None):
+        self.log = log
         self.extra_opts = extra_opts
 
     def is_baen(self, src):
@@ -542,8 +540,10 @@ class HTMLPreProcessor(object):
             unidecoder = Unidecoder()
             html = unidecoder.decode(html)
 
-        if self.plugin_preprocess:
-            html = self.input_plugin_preprocess(self.extra_opts, html)
+        if getattr(self.extra_opts, 'enable_heuristics', False):
+            from calibre.ebooks.conversion.utils import HeuristicProcessor
+            preprocessor = HeuristicProcessor(self.extra_opts, self.log)
+            html = preprocessor(html)
 
         if getattr(self.extra_opts, 'smarten_punctuation', False):
             html = self.smarten_punctuation(html)
diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py
index 305346d496..48806e78e7 100644
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@@ -11,7 +11,7 @@ from calibre.ebooks.conversion.preprocess import DocAnalysis, Dehyphenator
 from calibre.utils.logging import default_log
 from calibre.utils.wordcount import get_wordcount_obj
 
-class PreProcessor(object):
+class HeuristicProcessor(object):
 
     def __init__(self, extra_opts=None, log=None):
         self.log = default_log if log is None else log
@@ -366,7 +366,7 @@ class PreProcessor(object):
 
 
     def __call__(self, html):
-        self.log("*********  Preprocessing HTML  *********")
+        self.log("*********  Heuristic processing HTML  *********")
 
         # Count the words in the document to estimate how many chapters to look for and whether
         # other types of processing are attempted
diff --git a/src/calibre/ebooks/html/input.py b/src/calibre/ebooks/html/input.py
index 479f852c77..ed0bf7b3ef 100644
--- a/src/calibre/ebooks/html/input.py
+++ b/src/calibre/ebooks/html/input.py
@@ -24,7 +24,6 @@ from calibre.constants import islinux, isfreebsd, iswindows
 from calibre import unicode_path
 from calibre.utils.localization import get_lang
 from calibre.utils.filenames import ascii_filename
-from calibre.ebooks.conversion.utils import PreProcessor
 
 class Link(object):
     '''
@@ -485,9 +484,3 @@ class HTMLInput(InputFormatPlugin):
             self.log.exception('Failed to read CSS file: %r'%link)
             return (None, None)
         return (None, raw)
-
-    def heuristics(self, options, html):
-        self.options = options
-        preprocessor = PreProcessor(self.options, log=getattr(self, 'log', None))
-        return preprocessor(html)
-
diff --git a/src/calibre/ebooks/lit/input.py b/src/calibre/ebooks/lit/input.py
index d0ecf008b7..7b822b68a6 100644
--- a/src/calibre/ebooks/lit/input.py
+++ b/src/calibre/ebooks/lit/input.py
@@ -7,7 +7,7 @@ __copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
 __docformat__ = 'restructuredtext en'
 
 from calibre.customize.conversion import InputFormatPlugin
-from calibre.ebooks.conversion.utils import PreProcessor
+from calibre.ebooks.conversion.utils import HeuristicProcessor
 
 
 class LITInput(InputFormatPlugin):
@@ -51,10 +51,3 @@ class LITInput(InputFormatPlugin):
                     for elem in body:
                         ne = copy.deepcopy(elem)
                         pre.append(ne)
-
-
-    def heuristics(self, options, html):
-        self.options = options
-        preprocessor = PreProcessor(self.options, log=getattr(self, 'log', None))
-        return preprocessor(html)
-
diff --git a/src/calibre/ebooks/lrf/input.py b/src/calibre/ebooks/lrf/input.py
index 05c8731da5..70f3c3a15a 100644
--- a/src/calibre/ebooks/lrf/input.py
+++ b/src/calibre/ebooks/lrf/input.py
@@ -12,7 +12,6 @@ from copy import deepcopy
 from lxml import etree
 
 from calibre.customize.conversion import InputFormatPlugin
-from calibre.ebooks.conversion.utils import PreProcessor
 from calibre import guess_type
 
 class Canvas(etree.XSLTExtension):
@@ -419,11 +418,3 @@ class LRFInput(InputFormatPlugin):
             f.write(result)
         styles.write()
         return os.path.abspath('content.opf')
-
-    def heuristics(self, options, html):
-        self.options = options
-        preprocessor = PreProcessor(self.options, log=getattr(self, 'log', None))
-        return preprocessor(html)
-
-
-
diff --git a/src/calibre/ebooks/pdb/input.py b/src/calibre/ebooks/pdb/input.py
index de210e0a6d..cd861216af 100644
--- a/src/calibre/ebooks/pdb/input.py
+++ b/src/calibre/ebooks/pdb/input.py
@@ -9,7 +9,6 @@ import os
 from calibre.customize.conversion import InputFormatPlugin
 from calibre.ebooks.pdb.header import PdbHeaderReader
 from calibre.ebooks.pdb import PDBError, IDENTITY_TO_NAME, get_reader
-from calibre.ebooks.conversion.utils import PreProcessor
 
 class PDBInput(InputFormatPlugin):
 
diff --git a/src/calibre/ebooks/rtf/input.py b/src/calibre/ebooks/rtf/input.py
index 2f931d1d04..d3849bc5f5 100644
--- a/src/calibre/ebooks/rtf/input.py
+++ b/src/calibre/ebooks/rtf/input.py
@@ -7,7 +7,6 @@ import os, glob, re, textwrap
 from lxml import etree
 
 from calibre.customize.conversion import InputFormatPlugin
-from calibre.ebooks.conversion.utils import PreProcessor
 
 border_style_map = {
         'single' : 'solid',

From 60c50f39442b09872fb5aeb98a3be2bea3f4ec56 Mon Sep 17 00:00:00 2001
From: ldolse <ldolse@yahoo.com>
Date: Sun, 16 Jan 2011 01:46:04 +0800
Subject: [PATCH 26/54] tied mobi into preprocess

---
 src/calibre/ebooks/conversion/utils.py |  5 +++--
 src/calibre/ebooks/mobi/input.py       | 11 ++++-------
 2 files changed, 7 insertions(+), 9 deletions(-)

diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py
index 305346d496..9825585cbf 100644
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@@ -236,7 +236,7 @@ class PreProcessor(object):
                     print unicode(self.chapters_with_title)+" chapters with titles"
                 else:
                     html = chapdetect.sub(self.chapter_head, html)
-                    return html
+            return html
 
         recurse_patterns(html, True)
         html = recurse_patterns(html, False)
@@ -322,7 +322,8 @@ class PreProcessor(object):
         html = re.sub(ur'\s*<o:p>\s*</o:p>', ' ', html)
         # Delete microsoft 'smart' tags
         html = re.sub('(?i)</?st1:\w+>', '', html)
-        # Get rid of empty span, bold, & italics tags
+        # Get rid of empty span, bold, font, & italics tags
+        html = re.sub(r'\s*<font[^>]*>\s*</font>\s*', '', html)
         html = re.sub(r"\s*<span[^>]*>\s*(<span[^>]*>\s*</span>){0,2}\s*</span>\s*", " ", html)
         html = re.sub(r"\s*<[ibu][^>]*>\s*(<[ibu][^>]*>\s*</[ibu]>\s*){0,2}\s*</[ibu]>", " ", html)
         html = re.sub(r"\s*<span[^>]*>\s*(<span[^>]>\s*</span>){0,2}\s*</span>\s*", " ", html)
diff --git a/src/calibre/ebooks/mobi/input.py b/src/calibre/ebooks/mobi/input.py
index 584be71fe4..4f3a087065 100644
--- a/src/calibre/ebooks/mobi/input.py
+++ b/src/calibre/ebooks/mobi/input.py
@@ -5,6 +5,7 @@ __docformat__ = 'restructuredtext en'
 
 import re
 from calibre.customize.conversion import InputFormatPlugin
+from calibre.ebooks.conversion.utils import PreProcessor
 
 class MOBIInput(InputFormatPlugin):
 
@@ -40,10 +41,6 @@ class MOBIInput(InputFormatPlugin):
         return mr.created_opf_path
 
     def heuristics(self, options, html):
-        # search for places where a first or second level heading is immediately followed by another
-        # top level heading.  demote the second heading to h3 to prevent splitting between chapter
-        # headings and titles, images, etc
-        doubleheading = re.compile(r'(?P<firsthead><h(1|2)[^>]*>.+?</h(1|2)>\s*(<(?!h\d)[^>]*>\s*)*)<h(1|2)(?P<secondhead>[^>]*>.+?)</h(1|2)>', re.IGNORECASE)
-        html = doubleheading.sub('\g<firsthead>'+'\n<h3'+'\g<secondhead>'+'</h3>', html)
-        return html
-
+        self.options = options
+        preprocessor = PreProcessor(self.options, log=getattr(self, 'log', None))
+        return preprocessor(html)

From d354a085b8e06f3283231a18fecbf2ee775f52bd Mon Sep 17 00:00:00 2001
From: ldolse <ldolse@yahoo.com>
Date: Sun, 16 Jan 2011 01:53:49 +0800
Subject: [PATCH 27/54] ...

---
 src/calibre/ebooks/mobi/input.py | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/src/calibre/ebooks/mobi/input.py b/src/calibre/ebooks/mobi/input.py
index 4f3a087065..8188027e01 100644
--- a/src/calibre/ebooks/mobi/input.py
+++ b/src/calibre/ebooks/mobi/input.py
@@ -5,7 +5,6 @@ __docformat__ = 'restructuredtext en'
 
 import re
 from calibre.customize.conversion import InputFormatPlugin
-from calibre.ebooks.conversion.utils import PreProcessor
 
 class MOBIInput(InputFormatPlugin):
 
@@ -40,7 +39,3 @@ class MOBIInput(InputFormatPlugin):
                 accelerators['pagebreaks'] = '//h:div[@class="mbp_pagebreak"]'
         return mr.created_opf_path
 
-    def heuristics(self, options, html):
-        self.options = options
-        preprocessor = PreProcessor(self.options, log=getattr(self, 'log', None))
-        return preprocessor(html)

From 8f345212da4d3c0289e54babd2b01cd4bf4fd767 Mon Sep 17 00:00:00 2001
From: John Schember <john@nachtimwald.com>
Date: Sat, 15 Jan 2011 14:32:20 -0500
Subject: [PATCH 28/54] Fix issue with disabling checkbox.

---
 src/calibre/gui2/convert/heuristics.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/calibre/gui2/convert/heuristics.py b/src/calibre/gui2/convert/heuristics.py
index 904804f32e..525d5ba2f1 100644
--- a/src/calibre/gui2/convert/heuristics.py
+++ b/src/calibre/gui2/convert/heuristics.py
@@ -53,7 +53,7 @@ class HeuristicsWidget(Widget, Ui_Form):
         self.opt_delete_blank_paragraphs.setEnabled(state)
         self.opt_format_scene_breaks.setEnabled(state)
         self.opt_dehyphenate.setEnabled(state)
-        self.opt_renumber_headings(state)
+        self.opt_renumber_headings.setEnabled(state)
         
         self.opt_unwrap_lines.setEnabled(state)
         if state and self.opt_unwrap_lines.checkState() == Qt.Checked:

From 9134d51377c1813fadec856867646bce8b74d762 Mon Sep 17 00:00:00 2001
From: John Schember <john@nachtimwald.com>
Date: Sat, 15 Jan 2011 20:32:21 -0500
Subject: [PATCH 29/54] Clean up GUI option widgets.

---
 src/calibre/gui2/convert/heuristics.ui        |  65 +++---
 .../gui2/convert/search_and_replace.py        |   8 +-
 .../gui2/convert/search_and_replace.ui        | 206 +++++++++++++-----
 .../gui2/convert/structure_detection.ui       |  15 +-
 src/calibre/gui2/convert/xexp_edit.ui         |  21 +-
 5 files changed, 199 insertions(+), 116 deletions(-)

diff --git a/src/calibre/gui2/convert/heuristics.ui b/src/calibre/gui2/convert/heuristics.ui
index c5f3c2cb3e..1578b7146c 100644
--- a/src/calibre/gui2/convert/heuristics.ui
+++ b/src/calibre/gui2/convert/heuristics.ui
@@ -6,8 +6,8 @@
    <rect>
     <x>0</x>
     <y>0</y>
-    <width>811</width>
-    <height>479</height>
+    <width>938</width>
+    <height>470</height>
    </rect>
   </property>
   <property name="windowTitle">
@@ -26,7 +26,7 @@
      <property name="title">
       <string>Heuristics</string>
      </property>
-     <layout class="QGridLayout" name="gridLayout_2">
+     <layout class="QGridLayout" name="gridLayout">
       <item row="0" column="0" colspan="2">
        <widget class="QCheckBox" name="opt_unwrap_lines">
         <property name="text">
@@ -34,19 +34,6 @@
         </property>
        </widget>
       </item>
-      <item row="1" column="0">
-       <spacer name="horizontalSpacer">
-        <property name="orientation">
-         <enum>Qt::Horizontal</enum>
-        </property>
-        <property name="sizeHint" stdset="0">
-         <size>
-          <width>131</width>
-          <height>22</height>
-         </size>
-        </property>
-       </spacer>
-      </item>
       <item row="1" column="1">
        <widget class="QLabel" name="huf_label">
         <property name="text">
@@ -73,13 +60,33 @@
         </property>
        </widget>
       </item>
-      <item row="2" column="0" colspan="3">
+      <item row="1" column="3">
+       <spacer name="horizontalSpacer_2">
+        <property name="orientation">
+         <enum>Qt::Horizontal</enum>
+        </property>
+        <property name="sizeHint" stdset="0">
+         <size>
+          <width>40</width>
+          <height>20</height>
+         </size>
+        </property>
+       </spacer>
+      </item>
+      <item row="2" column="0" colspan="4">
        <widget class="QCheckBox" name="opt_markup_chapter_headings">
         <property name="text">
          <string>Detect and markup unformatted chapter headings and sub headings</string>
         </property>
        </widget>
       </item>
+      <item row="3" column="0" colspan="4">
+       <widget class="QCheckBox" name="opt_renumber_headings">
+        <property name="text">
+         <string>Renumber sequences of &lt;h1&gt; or &lt;h2&gt; tags to prevent splitting</string>
+        </property>
+       </widget>
+      </item>
       <item row="4" column="0" colspan="2">
        <widget class="QCheckBox" name="opt_delete_blank_paragraphs">
         <property name="text">
@@ -87,7 +94,7 @@
         </property>
        </widget>
       </item>
-      <item row="5" column="0" colspan="2">
+      <item row="5" column="0" colspan="3">
        <widget class="QCheckBox" name="opt_format_scene_breaks">
         <property name="text">
          <string>Ensure scene breaks are consistently formatted</string>
@@ -115,7 +122,7 @@
         </property>
        </widget>
       </item>
-      <item row="9" column="0">
+      <item row="9" column="0" colspan="2">
        <spacer name="verticalSpacer">
         <property name="orientation">
          <enum>Qt::Vertical</enum>
@@ -128,26 +135,6 @@
         </property>
        </spacer>
       </item>
-      <item row="1" column="3">
-       <spacer name="horizontalSpacer_2">
-        <property name="orientation">
-         <enum>Qt::Horizontal</enum>
-        </property>
-        <property name="sizeHint" stdset="0">
-         <size>
-          <width>40</width>
-          <height>20</height>
-         </size>
-        </property>
-       </spacer>
-      </item>
-      <item row="3" column="0">
-       <widget class="QCheckBox" name="opt_renumber_headings">
-        <property name="text">
-         <string>Renumber sequences of &lt;h1&gt; or &lt;h2&gt; tags to prevent splitting</string>
-        </property>
-       </widget>
-      </item>
      </layout>
     </widget>
    </item>
diff --git a/src/calibre/gui2/convert/search_and_replace.py b/src/calibre/gui2/convert/search_and_replace.py
index 36a496c520..fff75a29ba 100644
--- a/src/calibre/gui2/convert/search_and_replace.py
+++ b/src/calibre/gui2/convert/search_and_replace.py
@@ -12,7 +12,7 @@ from calibre.gui2 import error_dialog
 
 class SearchAndReplaceWidget(Widget, Ui_Form):
 
-    TITLE = _('Search and Replace')
+    TITLE = _('Search &\nReplace')
     HELP  = _('Modify the document text and structure using user defined patterns.')
     COMMIT_NAME = 'search_and_replace'
 
@@ -24,9 +24,9 @@ class SearchAndReplaceWidget(Widget, Ui_Form):
                 )
         self.db, self.book_id = db, book_id
         self.initialize_options(get_option, get_help, db, book_id)
-        self.opt_sr1_search.set_msg(_('Regular Expression'))
-        self.opt_sr2_search.set_msg(_('Regular Expression'))
-        self.opt_sr3_search.set_msg(_('Regular Expression'))
+        self.opt_sr1_search.set_msg(_('Search Regular Expression'))
+        self.opt_sr2_search.set_msg(_('Search Regular Expression'))
+        self.opt_sr3_search.set_msg(_('Search Regular Expression'))
         
     def break_cycles(self):
         Widget.break_cycles(self)
diff --git a/src/calibre/gui2/convert/search_and_replace.ui b/src/calibre/gui2/convert/search_and_replace.ui
index ed500a4dd0..e0e9570f8c 100644
--- a/src/calibre/gui2/convert/search_and_replace.ui
+++ b/src/calibre/gui2/convert/search_and_replace.ui
@@ -6,80 +6,176 @@
    <rect>
     <x>0</x>
     <y>0</y>
-    <width>657</width>
-    <height>479</height>
+    <width>198</width>
+    <height>350</height>
    </rect>
   </property>
+  <property name="sizePolicy">
+   <sizepolicy hsizetype="Minimum" vsizetype="Preferred">
+    <horstretch>0</horstretch>
+    <verstretch>0</verstretch>
+   </sizepolicy>
+  </property>
   <property name="windowTitle">
    <string>Form</string>
   </property>
-  <layout class="QGridLayout" name="gridLayout">
-   <item row="0" column="1">
-    <widget class="QLabel" name="label_4">
-     <property name="text">
-      <string>Search</string>
+  <layout class="QGridLayout" name="gridLayout_4">
+   <property name="sizeConstraint">
+    <enum>QLayout::SetDefaultConstraint</enum>
+   </property>
+   <item row="0" column="0">
+    <widget class="QGroupBox" name="groupBox">
+     <property name="sizePolicy">
+      <sizepolicy hsizetype="Minimum" vsizetype="Preferred">
+       <horstretch>0</horstretch>
+       <verstretch>0</verstretch>
+      </sizepolicy>
      </property>
-    </widget>
-   </item>
-   <item row="0" column="2">
-    <widget class="QLabel" name="label_5">
-     <property name="text">
-      <string>Replace</string>
+     <property name="title">
+      <string>1.</string>
      </property>
+     <layout class="QGridLayout" name="gridLayout_2">
+      <property name="sizeConstraint">
+       <enum>QLayout::SetMinimumSize</enum>
+      </property>
+      <item row="0" column="0">
+       <widget class="RegexEdit" name="opt_sr1_search" native="true">
+        <property name="sizePolicy">
+         <sizepolicy hsizetype="Minimum" vsizetype="Preferred">
+          <horstretch>0</horstretch>
+          <verstretch>0</verstretch>
+         </sizepolicy>
+        </property>
+       </widget>
+      </item>
+      <item row="1" column="0">
+       <widget class="QLabel" name="label_4">
+        <property name="sizePolicy">
+         <sizepolicy hsizetype="Minimum" vsizetype="Preferred">
+          <horstretch>0</horstretch>
+          <verstretch>0</verstretch>
+         </sizepolicy>
+        </property>
+        <property name="text">
+         <string>Replacement Text</string>
+        </property>
+       </widget>
+      </item>
+      <item row="2" column="0">
+       <widget class="QLineEdit" name="opt_sr1_replace">
+        <property name="sizePolicy">
+         <sizepolicy hsizetype="Minimum" vsizetype="Fixed">
+          <horstretch>0</horstretch>
+          <verstretch>0</verstretch>
+         </sizepolicy>
+        </property>
+       </widget>
+      </item>
+     </layout>
     </widget>
    </item>
    <item row="1" column="0">
-    <widget class="QLabel" name="label">
-     <property name="text">
-      <string>1.</string>
+    <widget class="QGroupBox" name="groupBox_2">
+     <property name="sizePolicy">
+      <sizepolicy hsizetype="Minimum" vsizetype="Preferred">
+       <horstretch>0</horstretch>
+       <verstretch>0</verstretch>
+      </sizepolicy>
      </property>
-    </widget>
-   </item>
-   <item row="1" column="1">
-    <widget class="RegexEdit" name="opt_sr1_search" native="true"/>
-   </item>
-   <item row="1" column="2">
-    <widget class="QLineEdit" name="opt_sr1_replace"/>
-   </item>
-   <item row="2" column="0">
-    <widget class="QLabel" name="label_2">
-     <property name="text">
+     <property name="title">
       <string>2.</string>
      </property>
+     <layout class="QGridLayout" name="gridLayout">
+      <property name="sizeConstraint">
+       <enum>QLayout::SetMinimumSize</enum>
+      </property>
+      <item row="0" column="0">
+       <widget class="RegexEdit" name="opt_sr2_search" native="true">
+        <property name="sizePolicy">
+         <sizepolicy hsizetype="Minimum" vsizetype="Preferred">
+          <horstretch>0</horstretch>
+          <verstretch>0</verstretch>
+         </sizepolicy>
+        </property>
+       </widget>
+      </item>
+      <item row="1" column="0">
+       <widget class="QLabel" name="label_5">
+        <property name="sizePolicy">
+         <sizepolicy hsizetype="Minimum" vsizetype="Preferred">
+          <horstretch>0</horstretch>
+          <verstretch>0</verstretch>
+         </sizepolicy>
+        </property>
+        <property name="text">
+         <string>Replacement Text</string>
+        </property>
+       </widget>
+      </item>
+      <item row="2" column="0">
+       <widget class="QLineEdit" name="opt_sr2_replace">
+        <property name="sizePolicy">
+         <sizepolicy hsizetype="Minimum" vsizetype="Fixed">
+          <horstretch>0</horstretch>
+          <verstretch>0</verstretch>
+         </sizepolicy>
+        </property>
+       </widget>
+      </item>
+     </layout>
     </widget>
    </item>
-   <item row="2" column="1">
-    <widget class="RegexEdit" name="opt_sr2_search" native="true"/>
-   </item>
-   <item row="2" column="2">
-    <widget class="QLineEdit" name="opt_sr2_replace"/>
-   </item>
-   <item row="3" column="0">
-    <widget class="QLabel" name="label_3">
-     <property name="text">
+   <item row="2" column="0">
+    <widget class="QGroupBox" name="groupBox_3">
+     <property name="sizePolicy">
+      <sizepolicy hsizetype="Minimum" vsizetype="Preferred">
+       <horstretch>0</horstretch>
+       <verstretch>0</verstretch>
+      </sizepolicy>
+     </property>
+     <property name="title">
       <string>3.</string>
      </property>
+     <layout class="QGridLayout" name="gridLayout_3">
+      <property name="sizeConstraint">
+       <enum>QLayout::SetMinimumSize</enum>
+      </property>
+      <item row="0" column="0">
+       <widget class="RegexEdit" name="opt_sr3_search" native="true">
+        <property name="sizePolicy">
+         <sizepolicy hsizetype="Minimum" vsizetype="Preferred">
+          <horstretch>0</horstretch>
+          <verstretch>0</verstretch>
+         </sizepolicy>
+        </property>
+       </widget>
+      </item>
+      <item row="1" column="0">
+       <widget class="QLabel" name="label_6">
+        <property name="sizePolicy">
+         <sizepolicy hsizetype="Minimum" vsizetype="Preferred">
+          <horstretch>0</horstretch>
+          <verstretch>0</verstretch>
+         </sizepolicy>
+        </property>
+        <property name="text">
+         <string>Replacement Text</string>
+        </property>
+       </widget>
+      </item>
+      <item row="2" column="0">
+       <widget class="QLineEdit" name="opt_sr3_replace">
+        <property name="sizePolicy">
+         <sizepolicy hsizetype="Minimum" vsizetype="Fixed">
+          <horstretch>0</horstretch>
+          <verstretch>0</verstretch>
+         </sizepolicy>
+        </property>
+       </widget>
+      </item>
+     </layout>
     </widget>
    </item>
-   <item row="3" column="1">
-    <widget class="RegexEdit" name="opt_sr3_search" native="true"/>
-   </item>
-   <item row="3" column="2">
-    <widget class="QLineEdit" name="opt_sr3_replace"/>
-   </item>
-   <item row="4" column="1">
-    <spacer name="verticalSpacer">
-     <property name="orientation">
-      <enum>Qt::Vertical</enum>
-     </property>
-     <property name="sizeHint" stdset="0">
-      <size>
-       <width>20</width>
-       <height>330</height>
-      </size>
-     </property>
-    </spacer>
-   </item>
   </layout>
  </widget>
  <customwidgets>
diff --git a/src/calibre/gui2/convert/structure_detection.ui b/src/calibre/gui2/convert/structure_detection.ui
index 262894d42d..ef0677a67c 100644
--- a/src/calibre/gui2/convert/structure_detection.ui
+++ b/src/calibre/gui2/convert/structure_detection.ui
@@ -27,7 +27,7 @@
      </property>
     </widget>
    </item>
-   <item row="1" column="1" colspan="2">
+   <item row="1" column="1">
     <widget class="QComboBox" name="opt_chapter_mark">
      <property name="minimumContentsLength">
       <number>20</number>
@@ -64,6 +64,19 @@
      </property>
     </spacer>
    </item>
+   <item row="1" column="2">
+    <spacer name="horizontalSpacer">
+     <property name="orientation">
+      <enum>Qt::Horizontal</enum>
+     </property>
+     <property name="sizeHint" stdset="0">
+      <size>
+       <width>40</width>
+       <height>20</height>
+      </size>
+     </property>
+    </spacer>
+   </item>
   </layout>
  </widget>
  <customwidgets>
diff --git a/src/calibre/gui2/convert/xexp_edit.ui b/src/calibre/gui2/convert/xexp_edit.ui
index 7e89ec5d43..4b26eb8dcf 100644
--- a/src/calibre/gui2/convert/xexp_edit.ui
+++ b/src/calibre/gui2/convert/xexp_edit.ui
@@ -6,8 +6,8 @@
    <rect>
     <x>0</x>
     <y>0</y>
-    <width>422</width>
-    <height>64</height>
+    <width>434</width>
+    <height>74</height>
    </rect>
   </property>
   <property name="windowTitle">
@@ -53,13 +53,13 @@
    <item row="0" column="1">
     <widget class="QToolButton" name="button">
      <property name="toolTip">
-      <string>Use a wizard to help construct the XPath expression</string>
+      <string>Use a wizard to help construct the Regular expression</string>
      </property>
      <property name="text">
       <string>...</string>
      </property>
      <property name="icon">
-      <iconset resource="../../../../resources/images.qrc">
+      <iconset>
        <normaloff>:/images/wizard.png</normaloff>:/images/wizard.png</iconset>
      </property>
      <property name="iconSize">
@@ -70,19 +70,6 @@
      </property>
     </widget>
    </item>
-   <item row="0" column="2">
-    <spacer name="horizontalSpacer">
-     <property name="orientation">
-      <enum>Qt::Horizontal</enum>
-     </property>
-     <property name="sizeHint" stdset="0">
-      <size>
-       <width>20</width>
-       <height>20</height>
-      </size>
-     </property>
-    </spacer>
-   </item>
   </layout>
  </widget>
  <customwidgets>

From 1272988089814321248ffe0c58232f1d061a67a3 Mon Sep 17 00:00:00 2001
From: ldolse <ldolse@yahoo.com>
Date: Sun, 16 Jan 2011 20:11:52 +0800
Subject: [PATCH 30/54] enabled hyphen removal across the entire document text,
 refactored logic to reduce false positives, added verbose debug output

---
 src/calibre/ebooks/conversion/preprocess.py | 47 +++++++++-----
 src/calibre/ebooks/conversion/utils.py      | 69 +++++++++++----------
 src/calibre/ebooks/txt/input.py             |  4 +-
 3 files changed, 72 insertions(+), 48 deletions(-)

diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py
index abaff77f33..9dedd05e33 100644
--- a/src/calibre/ebooks/conversion/preprocess.py
+++ b/src/calibre/ebooks/conversion/preprocess.py
@@ -174,13 +174,19 @@ class Dehyphenator(object):
     retain hyphens.
     '''
 
-    def __init__(self):
+    def __init__(self, verbose=0, log=None):
+        self.log = default_log if log is None else log
+        self.verbose = verbose
         # Add common suffixes to the regex below to increase the likelihood of a match -
         # don't add suffixes which are also complete words, such as 'able' or 'sex'
-        self.removesuffixes = re.compile(r"((ed)?ly|('e)?s|a?(t|s)?ion(s|al(ly)?)?|ings?|er|(i)?ous|(i|a)ty|(it)?ies|ive|gence|istic(ally)?|(e|a)nce|m?ents?|ism|ated|(e|u)ct(ed)?|ed|(i|ed)?ness|(e|a)ncy|ble|ier|al|ex|ian)$", re.IGNORECASE)
+        # only remove if it's not already the point of hyphenation
+        self.suffix_string = "((ed)?ly|'?e?s||a?(t|s)?ion(s|al(ly)?)?|ings?|er|(i)?ous|(i|a)ty|(it)?ies|ive|gence|istic(ally)?|(e|a)nce|m?ents?|ism|ated|(e|u)ct(ed)?|ed|(i|ed)?ness|(e|a)ncy|ble|ier|al|ex|ian)$"
+        self.suffixes = re.compile(r"^%s" % self.suffix_string, re.IGNORECASE)
+        self.removesuffixes = re.compile(r"%s" % self.suffix_string, re.IGNORECASE)
         # remove prefixes if the prefix was not already the point of hyphenation
-        self.prefixes = re.compile(r'^(dis|re|un|in|ex)$', re.IGNORECASE)
-        self.removeprefix = re.compile(r'^(dis|re|un|in|ex)', re.IGNORECASE)
+        self.prefix_string = '^(dis|re|un|in|ex)'
+        self.prefixes = re.compile(r'%s$' % self.prefix_string, re.IGNORECASE)
+        self.removeprefix = re.compile(r'%s' % self.prefix_string, re.IGNORECASE)
 
     def dehyphenate(self, match):
         firsthalf = match.group('firstpart')
@@ -191,31 +197,44 @@ class Dehyphenator(object):
             wraptags = ''
         hyphenated = unicode(firsthalf) + "-" + unicode(secondhalf)
         dehyphenated = unicode(firsthalf) + unicode(secondhalf)
-        lookupword = self.removesuffixes.sub('', dehyphenated)
-        if self.prefixes.match(firsthalf) is None:
+        if self.suffixes.match(secondhalf) is None:
+            lookupword = self.removesuffixes.sub('', dehyphenated)
+        else:
+            lookupword = dehyphenated
+        if len(firsthalf) > 3 and self.prefixes.match(firsthalf) is None:
             lookupword = self.removeprefix.sub('', lookupword)
-        #print "lookup word is: "+str(lookupword)+", orig is: " + str(hyphenated)
+        if self.verbose > 2:
+            self.log("lookup word is: "+str(lookupword)+", orig is: " + str(hyphenated))
         try:
             searchresult = self.html.find(lookupword.lower())
         except:
             return hyphenated
         if self.format == 'html_cleanup' or self.format == 'txt_cleanup':
             if self.html.find(lookupword) != -1 or searchresult != -1:
-                #print "Cleanup:returned dehyphenated word: " + str(dehyphenated)
+                if self.verbose > 2:
+                    self.log("    Cleanup:returned dehyphenated word: " + str(dehyphenated))
                 return dehyphenated
             elif self.html.find(hyphenated) != -1:
-                #print "Cleanup:returned hyphenated word: " + str(hyphenated)
+                if self.verbose > 2:
+                    self.log("        Cleanup:returned hyphenated word: " + str(hyphenated))
                 return hyphenated
             else:
-                #print "Cleanup:returning original text "+str(firsthalf)+" + linefeed "+str(secondhalf)
+                if self.verbose > 2:
+                    self.log("            Cleanup:returning original text "+str(firsthalf)+" + linefeed "+str(secondhalf))
                 return firsthalf+u'\u2014'+wraptags+secondhalf
 
         else:
+            if len(firsthalf) <= 2 and len(secondhalf) <= 2:
+                if self.verbose > 2:
+                    self.log("too short, returned hyphenated word: " + str(hyphenated))
+                return hyphenated
             if self.html.find(lookupword) != -1 or searchresult != -1:
-                #print "returned dehyphenated word: " + str(dehyphenated)
+                if self.verbose > 2:
+                    self.log("     returned dehyphenated word: " + str(dehyphenated))
                 return dehyphenated
             else:
-                #print "           returned hyphenated word: " + str(hyphenated)
+                if self.verbose > 2:
+                    self.log("          returned hyphenated word: " + str(hyphenated))
                 return hyphenated
 
     def __call__(self, html, format, length=1):
@@ -228,7 +247,7 @@ class Dehyphenator(object):
         elif format == 'txt':
             intextmatch = re.compile(u'(?<=.{%i})(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)(-|‐)(\u0020|\u0009)*(?P<wraptags>(\n(\u0020|\u0009)*)+)(?P<secondpart>[\w\d]+)'% length)
         elif format == 'individual_words':
-            intextmatch = re.compile(u'>[^<]*\b(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)"\s>]+)(-|‐)\u0020*(?P<secondpart>\w+)\b[^<]*<') # for later, not called anywhere yet
+            intextmatch = re.compile(u'(?!<)(?P<firstpart>\w+)(-|‐)\s*(?P<secondpart>\w+)(?![^<]*?>)') # for later, not called anywhere yet
         elif format == 'html_cleanup':
             intextmatch = re.compile(u'(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)(-|‐)\s*(?=<)(?P<wraptags></span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*(?P<secondpart>[\w\d]+)')
         elif format == 'txt_cleanup':
@@ -512,7 +531,7 @@ class HTMLPreProcessor(object):
 
         if is_pdftohtml and length > -1:
             # Dehyphenate
-            dehyphenator = Dehyphenator()
+            dehyphenator = Dehyphenator(self.extra_opts.verbose, self.log)
             html = dehyphenator(html,'html', length)
 
         if is_pdftohtml:
diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py
index 96a9a4783d..4a118d423c 100644
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@@ -322,11 +322,11 @@ class HeuristicProcessor(object):
         html = re.sub(ur'\s*<o:p>\s*</o:p>', ' ', html)
         # Delete microsoft 'smart' tags
         html = re.sub('(?i)</?st1:\w+>', '', html)
-        # Get rid of empty span, bold, font, & italics tags
-        html = re.sub(r'\s*<font[^>]*>\s*</font>\s*', '', html)
+        # Get rid of empty span, bold, font, em, & italics tags
         html = re.sub(r"\s*<span[^>]*>\s*(<span[^>]*>\s*</span>){0,2}\s*</span>\s*", " ", html)
-        html = re.sub(r"\s*<[ibu][^>]*>\s*(<[ibu][^>]*>\s*</[ibu]>\s*){0,2}\s*</[ibu]>", " ", html)
+        html = re.sub(r"\s*<(font|[ibu]|em)[^>]*>\s*(<(font|[ibu]|em)[^>]*>\s*</(font|[ibu]|em)>\s*){0,2}\s*</(font|[ibu]|em)>", " ", html)
         html = re.sub(r"\s*<span[^>]*>\s*(<span[^>]>\s*</span>){0,2}\s*</span>\s*", " ", html)
+        html = re.sub(r"\s*<(font|[ibu]|em)[^>]*>\s*(<(font|[ibu]|em)[^>]*>\s*</(font|[ibu]|em)>\s*){0,2}\s*</(font|[ibu]|em)>", " ", html)
         self.deleted_nbsps = True
         return html
 
@@ -376,27 +376,31 @@ class HeuristicProcessor(object):
         except:
             self.log("Can't get wordcount")
 
-        if 0 < self.totalwords < 50:
+        print "found "+unicode(self.totalwords)+" words in the flow"
+        if self.totalwords < 50:
             self.log("flow is too short, not running heuristics")
             return html
 
         # Arrange line feeds and </p> tags so the line_length and no_markup functions work correctly
         html = self.arrange_htm_line_endings(html)
 
-        ###### Check Markup ######
-        #
-        # some lit files don't have any <p> tags or equivalent (generally just plain text between
-        # <pre> tags), check and  mark up line endings if required before proceeding
-        if self.no_markup(html, 0.1):
-            self.log("not enough paragraph markers, adding now")
-            # markup using text processing
-            html = self.markup_pre(html)
+        if self.cleanup_required():
+            ###### Check Markup ######
+            #
+            # some lit files don't have any <p> tags or equivalent (generally just plain text between
+            # <pre> tags), check and  mark up line endings if required before proceeding
+            # fix indents must run after this step
+            if self.no_markup(html, 0.1):
+                self.log("not enough paragraph markers, adding now")
+                # markup using text processing
+                html = self.markup_pre(html)
 
         # Replace series of non-breaking spaces with text-indent
         if getattr(self.extra_opts, 'fix_indents', False):
             html = self.fix_nbsp_indents(html)
 
         if self.cleanup_required():
+            # fix indents must run before this step, as it removes non-breaking spaces
             html = self.cleanup_markup(html)
 
         # ADE doesn't render <br />, change to empty paragraphs
@@ -420,26 +424,26 @@ class HeuristicProcessor(object):
             self.log("deleting blank lines")
             html = self.multi_blank.sub('\n<p id="softbreak" style="margin-top:1.5em; margin-bottom:1.5em"> </p>', html)
             html = self.blankreg.sub('', html)
+
+        # Determine line ending type
+        # Some OCR sourced files have line breaks in the html using a combination of span & p tags
+        # span are used for hard line breaks, p for new paragraphs.  Determine which is used so
+        # that lines can be un-wrapped across page boundaries
+        format = self.analyze_line_endings(html)
+
+        # Check Line histogram to determine if the document uses hard line breaks, If 50% or
+        # more of the lines break in the same region of the document then unwrapping is required
+        docanalysis = DocAnalysis(format, html)
+        hardbreaks = docanalysis.line_histogram(.50)
+        self.log("Hard line breaks check returned "+unicode(hardbreaks))
+
+        # Calculate Length
+        unwrap_factor = getattr(self.extra_opts, 'html_unwrap_factor', 0.4)
+        length = docanalysis.line_length(unwrap_factor)
+        self.log("Median line length is " + unicode(length) + ", calculated with " + format + " format")
             
         ###### Unwrap lines ######
         if getattr(self.extra_opts, 'unwrap_lines', False):
-            # Determine line ending type
-            # Some OCR sourced files have line breaks in the html using a combination of span & p tags
-            # span are used for hard line breaks, p for new paragraphs.  Determine which is used so
-            # that lines can be un-wrapped across page boundaries
-            format = self.analyze_line_endings(html)
-
-            # Check Line histogram to determine if the document uses hard line breaks, If 50% or
-            # more of the lines break in the same region of the document then unwrapping is required
-            docanalysis = DocAnalysis(format, html)
-            hardbreaks = docanalysis.line_histogram(.50)
-            self.log("Hard line breaks check returned "+unicode(hardbreaks))
-
-            # Calculate Length
-            unwrap_factor = getattr(self.extra_opts, 'html_unwrap_factor', 0.4)
-            length = docanalysis.line_length(unwrap_factor)
-            self.log("Median line length is " + unicode(length) + ", calculated with " + format + " format")
-
             # only go through unwrapping code if the histogram shows unwrapping is required or if the user decreased the default unwrap_factor
             if hardbreaks or unwrap_factor < 0.4:
                 self.log("Unwrapping required, unwrapping Lines")
@@ -447,15 +451,16 @@ class HeuristicProcessor(object):
                 dehyphenator = Dehyphenator()
                 html = dehyphenator(html,'html', length)
                 html = self.punctuation_unwrap(length, html, 'html')
-                #check any remaining hyphens, but only unwrap if there is a match
-                dehyphenator = Dehyphenator()
+                # unwrap remaining hyphens based on line length, but only remove if there is a match
+                dehyphenator = Dehyphenator(self.extra_opts.verbose, self.log)
                 html = dehyphenator(html,'html_cleanup', length)
 
         if getattr(self.extra_opts, 'dehyphenate', False):
             # dehyphenate in cleanup mode to fix anything previous conversions/editing missed
             self.log("Fixing hyphenated content")
-            dehyphenator = Dehyphenator()
+            dehyphenator = Dehyphenator(self.extra_opts.verbose, self.log)
             html = dehyphenator(html,'html_cleanup', length)
+            html = dehyphenator(html, 'individual_words', length)
 
         # If still no sections after unwrapping mark split points on lines with no punctuation
         if self.html_preprocess_sections < self.min_chapters and getattr(self.extra_opts, 'markup_chapter_headings', False):
diff --git a/src/calibre/ebooks/txt/input.py b/src/calibre/ebooks/txt/input.py
index 5cffbafe21..8bf33c4837 100644
--- a/src/calibre/ebooks/txt/input.py
+++ b/src/calibre/ebooks/txt/input.py
@@ -106,7 +106,7 @@ class TXTInput(InputFormatPlugin):
                     log.debug('Auto detected paragraph type as %s' % options.paragraph_type)
 
             # Dehyphenate
-            dehyphenator = Dehyphenator()
+            dehyphenator = Dehyphenator(options.verbose, log=getattr(self, 'log', None))
             txt = dehyphenator(txt,'txt', length)
 
             # We don't check for block because the processor assumes block.
@@ -138,7 +138,7 @@ class TXTInput(InputFormatPlugin):
                 setattr(options, 'dehyphenate', True)
 
         # Dehyphenate in cleanup mode for missed txt and markdown conversion
-        dehyphenator = Dehyphenator()
+        dehyphenator = Dehyphenator(options.verbose, log=getattr(self, 'log', None))
         html = dehyphenator(html,'txt_cleanup', length)
         html = dehyphenator(html,'html_cleanup', length)
 

From 89dd86056e727de35ff844cf712051b96a96e712 Mon Sep 17 00:00:00 2001
From: ldolse <ldolse@yahoo.com>
Date: Sun, 16 Jan 2011 20:26:52 +0800
Subject: [PATCH 31/54] Drop stale "not called anywhere yet" comment from the individual_words regex

---
 src/calibre/ebooks/conversion/preprocess.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py
index 9dedd05e33..d1d275eb97 100644
--- a/src/calibre/ebooks/conversion/preprocess.py
+++ b/src/calibre/ebooks/conversion/preprocess.py
@@ -247,7 +247,7 @@ class Dehyphenator(object):
         elif format == 'txt':
             intextmatch = re.compile(u'(?<=.{%i})(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)(-|‐)(\u0020|\u0009)*(?P<wraptags>(\n(\u0020|\u0009)*)+)(?P<secondpart>[\w\d]+)'% length)
         elif format == 'individual_words':
-            intextmatch = re.compile(u'(?!<)(?P<firstpart>\w+)(-|‐)\s*(?P<secondpart>\w+)(?![^<]*?>)') # for later, not called anywhere yet
+            intextmatch = re.compile(u'(?!<)(?P<firstpart>\w+)(-|‐)\s*(?P<secondpart>\w+)(?![^<]*?>)')
         elif format == 'html_cleanup':
             intextmatch = re.compile(u'(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)(-|‐)\s*(?=<)(?P<wraptags></span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*(?P<secondpart>[\w\d]+)')
         elif format == 'txt_cleanup':

From e0d1de2ce8832eb55abacf85edbfdcb1fb5d549e Mon Sep 17 00:00:00 2001
From: ldolse <ldolse@yahoo.com>
Date: Sun, 16 Jan 2011 20:54:17 +0800
Subject: [PATCH 32/54] Remove dehyphenation from text input now that it is
 covered by the heuristics option

---
 src/calibre/ebooks/txt/input.py | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/src/calibre/ebooks/txt/input.py b/src/calibre/ebooks/txt/input.py
index 8bf33c4837..39bfb4b132 100644
--- a/src/calibre/ebooks/txt/input.py
+++ b/src/calibre/ebooks/txt/input.py
@@ -137,11 +137,6 @@ class TXTInput(InputFormatPlugin):
                 setattr(options, 'format_scene_breaks', True)
                 setattr(options, 'dehyphenate', True)
 
-        # Dehyphenate in cleanup mode for missed txt and markdown conversion
-        dehyphenator = Dehyphenator(options.verbose, log=getattr(self, 'log', None))
-        html = dehyphenator(html,'txt_cleanup', length)
-        html = dehyphenator(html,'html_cleanup', length)
-
         from calibre.customize.ui import plugin_for_input_format
         html_input = plugin_for_input_format('html')
         for opt in html_input.options:

From b2626bace330aba990a35e5caf15a33193a43652 Mon Sep 17 00:00:00 2001
From: John Schember <john@nachtimwald.com>
Date: Sun, 16 Jan 2011 08:55:26 -0500
Subject: [PATCH 33/54] Fix search_and_replace option names.

---
 src/calibre/gui2/convert/search_and_replace.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/calibre/gui2/convert/search_and_replace.py b/src/calibre/gui2/convert/search_and_replace.py
index fff75a29ba..34c6cdf1e9 100644
--- a/src/calibre/gui2/convert/search_and_replace.py
+++ b/src/calibre/gui2/convert/search_and_replace.py
@@ -36,7 +36,7 @@ class SearchAndReplaceWidget(Widget, Ui_Form):
         self.opt_sr3_search.break_cycles()
 
     def pre_commit_check(self):
-        for x in ('sr1-search', 'sr2-search', 'sr3-search'):
+        for x in ('sr1_search', 'sr2_search', 'sr3_search'):
             x = getattr(self, 'opt_'+x)
             try:
                 pat = unicode(x.regex)

From 68587e8679b70463c51bd66bdd78339ea9838a8a Mon Sep 17 00:00:00 2001
From: John Schember <john@nachtimwald.com>
Date: Sun, 16 Jan 2011 09:16:05 -0500
Subject: [PATCH 34/54] Fix GUI dialog errors preventing them from returning
 properly.

---
 src/calibre/gui2/convert/heuristics.py         | 7 +++++--
 src/calibre/gui2/convert/search_and_replace.py | 3 +--
 2 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/src/calibre/gui2/convert/heuristics.py b/src/calibre/gui2/convert/heuristics.py
index 525d5ba2f1..4735782f52 100644
--- a/src/calibre/gui2/convert/heuristics.py
+++ b/src/calibre/gui2/convert/heuristics.py
@@ -34,8 +34,11 @@ class HeuristicsWidget(Widget, Ui_Form):
     def break_cycles(self):
         Widget.break_cycles(self)
         
-        self.opt_enable_heuristics.stateChanged.disconnect()
-        self.opt_unwrap_lines.stateChanged.disconnect()
+        try:
+            self.opt_enable_heuristics.stateChanged.disconnect()
+            self.opt_unwrap_lines.stateChanged.disconnect()
+        except:
+            pass
         
     def set_value_handler(self, g, val):
         if val is None and g is self.opt_html_unwrap_factor:
diff --git a/src/calibre/gui2/convert/search_and_replace.py b/src/calibre/gui2/convert/search_and_replace.py
index 34c6cdf1e9..af944a74d1 100644
--- a/src/calibre/gui2/convert/search_and_replace.py
+++ b/src/calibre/gui2/convert/search_and_replace.py
@@ -45,5 +45,4 @@ class SearchAndReplaceWidget(Widget, Ui_Form):
                 error_dialog(self, _('Invalid regular expression'),
                              _('Invalid regular expression: %s')%err).exec_()
                 return False
-            
-
+        return True

From d271747cffafb7723ff15ba5628d6e0a96ac98c7 Mon Sep 17 00:00:00 2001
From: John Schember <john@nachtimwald.com>
Date: Sun, 16 Jan 2011 09:44:39 -0500
Subject: [PATCH 35/54] Simplify more GUI widgets.

---
 src/calibre/gui2/convert/pdb_output.py | 19 ++++-----------
 src/calibre/gui2/convert/pdf_output.py | 33 ++++++--------------------
 src/calibre/gui2/convert/txt_output.py | 18 ++++----------
 src/calibre/gui2/widgets.py            | 26 --------------------
 4 files changed, 15 insertions(+), 81 deletions(-)

diff --git a/src/calibre/gui2/convert/pdb_output.py b/src/calibre/gui2/convert/pdb_output.py
index ec6b7abb08..bf1d5048e2 100644
--- a/src/calibre/gui2/convert/pdb_output.py
+++ b/src/calibre/gui2/convert/pdb_output.py
@@ -6,8 +6,6 @@ __docformat__ = 'restructuredtext en'
 
 from calibre.gui2.convert.pdb_output_ui import Ui_Form
 from calibre.gui2.convert import Widget
-from calibre.ebooks.pdb import FORMAT_WRITERS
-from calibre.gui2.widgets import BasicComboModel
 
 format_model = None
 
@@ -21,17 +19,8 @@ class PluginWidget(Widget, Ui_Form):
     def __init__(self, parent, get_option, get_help, db=None, book_id=None):
         Widget.__init__(self, parent, ['format', 'inline_toc', 'pdb_output_encoding'])
         self.db, self.book_id = db, book_id
+
+        for x in get_option('format').option.choices:
+            self.opt_format.addItem(x)
+        
         self.initialize_options(get_option, get_help, db, book_id)
-
-        default = self.opt_format.currentText()
-
-        global format_model
-        if format_model is None:
-            format_model = BasicComboModel(FORMAT_WRITERS.keys())
-        self.format_model = format_model
-        self.opt_format.setModel(self.format_model)
-
-        default_index = self.opt_format.findText(default)
-        format_index = self.opt_format.findText('doc')
-        self.opt_format.setCurrentIndex(default_index if default_index != -1 else format_index if format_index != -1 else 0)
-
diff --git a/src/calibre/gui2/convert/pdf_output.py b/src/calibre/gui2/convert/pdf_output.py
index 5d6a595079..1c526939c2 100644
--- a/src/calibre/gui2/convert/pdf_output.py
+++ b/src/calibre/gui2/convert/pdf_output.py
@@ -6,8 +6,6 @@ __docformat__ = 'restructuredtext en'
 
 from calibre.gui2.convert.pdf_output_ui import Ui_Form
 from calibre.gui2.convert import Widget
-from calibre.ebooks.pdf.pageoptions import PAPER_SIZES, ORIENTATIONS
-from calibre.gui2.widgets import BasicComboModel
 
 paper_size_model = None
 orientation_model = None
@@ -23,28 +21,11 @@ class PluginWidget(Widget, Ui_Form):
         Widget.__init__(self, parent, ['paper_size',
             'orientation', 'preserve_cover_aspect_ratio'])
         self.db, self.book_id = db, book_id
+        
+        for x in get_option('paper_size').option.choices:
+            self.opt_paper_size.addItem(x)
+        for x in get_option('orientation').option.choices:
+            self.opt_orientation.addItem(x)
+        
         self.initialize_options(get_option, get_help, db, book_id)
-
-        default_paper_size = self.opt_paper_size.currentText()
-        default_orientation = self.opt_orientation.currentText()
-
-        global paper_size_model
-        if paper_size_model is None:
-            paper_size_model = BasicComboModel(PAPER_SIZES.keys())
-        self.paper_size_model = paper_size_model
-        self.opt_paper_size.setModel(self.paper_size_model)
-
-        default_paper_size_index = self.opt_paper_size.findText(default_paper_size)
-        letter_index = self.opt_paper_size.findText('letter')
-        self.opt_paper_size.setCurrentIndex(default_paper_size_index if default_paper_size_index != -1 else letter_index if letter_index != -1 else 0)
-
-        global orientation_model
-        if orientation_model is None:
-            orientation_model = BasicComboModel(ORIENTATIONS.keys())
-        self.orientation_model = orientation_model
-        self.opt_orientation.setModel(self.orientation_model)
-
-        default_orientation_index = self.opt_orientation.findText(default_orientation)
-        orientation_index = self.opt_orientation.findText('portrait')
-        self.opt_orientation.setCurrentIndex(default_orientation_index if default_orientation_index != -1 else orientation_index if orientation_index != -1 else 0)
-
+        
\ No newline at end of file
diff --git a/src/calibre/gui2/convert/txt_output.py b/src/calibre/gui2/convert/txt_output.py
index 9a228bd4cf..21a9e60bed 100644
--- a/src/calibre/gui2/convert/txt_output.py
+++ b/src/calibre/gui2/convert/txt_output.py
@@ -6,8 +6,6 @@ __docformat__ = 'restructuredtext en'
 
 from calibre.gui2.convert.txt_output_ui import Ui_Form
 from calibre.gui2.convert import Widget
-from calibre.ebooks.txt.newlines import TxtNewlines
-from calibre.gui2.widgets import BasicComboModel
 
 newline_model = None
 
@@ -24,16 +22,8 @@ class PluginWidget(Widget, Ui_Form):
         'inline_toc', 'markdown_format', 'keep_links', 'keep_image_references',
         'txt_output_encoding'])
         self.db, self.book_id = db, book_id
+        
+        for x in get_option('newline').option.choices:
+            self.opt_newline.addItem(x)
+        
         self.initialize_options(get_option, get_help, db, book_id)
-
-        default = self.opt_newline.currentText()
-
-        global newline_model
-        if newline_model is None:
-            newline_model = BasicComboModel(TxtNewlines.NEWLINE_TYPES.keys())
-        self.newline_model = newline_model
-        self.opt_newline.setModel(self.newline_model)
-
-        default_index = self.opt_newline.findText(default)
-        system_index = self.opt_newline.findText('system')
-        self.opt_newline.setCurrentIndex(default_index if default_index != -1 else system_index if system_index != -1 else 0)
diff --git a/src/calibre/gui2/widgets.py b/src/calibre/gui2/widgets.py
index f2ff783a76..28c5de4322 100644
--- a/src/calibre/gui2/widgets.py
+++ b/src/calibre/gui2/widgets.py
@@ -311,32 +311,6 @@ class FontFamilyModel(QAbstractListModel):
     def index_of(self, family):
         return self.families.index(family.strip())
 
-class BasicComboModel(QAbstractListModel):
-
-    def __init__(self, items, *args):
-        QAbstractListModel.__init__(self, *args)
-        self.items = [i for i in items]
-        self.items.sort()
-
-    def rowCount(self, *args):
-        return len(self.items)
-
-    def data(self, index, role):
-        try:
-            item = self.items[index.row()]
-        except:
-            traceback.print_exc()
-            return NONE
-        if role == Qt.DisplayRole:
-            return QVariant(item)
-        if role == Qt.FontRole:
-            return QVariant(QFont(item))
-        return NONE
-
-    def index_of(self, item):
-        return self.items.index(item.strip())
-
-
 class BasicListItem(QListWidgetItem):
 
     def __init__(self, text, user_data=None):

From f7650de369d1dec7e3ee82744b55f292870335f7 Mon Sep 17 00:00:00 2001
From: John Schember <john@nachtimwald.com>
Date: Sun, 16 Jan 2011 09:51:03 -0500
Subject: [PATCH 36/54] TXT Output GUI widget: Disable markdown options when
 markdown is not enabled.

---
 src/calibre/gui2/convert/txt_output.py | 28 ++++++++++++++++++++++----
 1 file changed, 24 insertions(+), 4 deletions(-)

diff --git a/src/calibre/gui2/convert/txt_output.py b/src/calibre/gui2/convert/txt_output.py
index 21a9e60bed..a16dd68014 100644
--- a/src/calibre/gui2/convert/txt_output.py
+++ b/src/calibre/gui2/convert/txt_output.py
@@ -4,6 +4,8 @@ __license__ = 'GPL 3'
 __copyright__ = '2009, John Schember <john@nachtimwald.com>'
 __docformat__ = 'restructuredtext en'
 
+from PyQt4.Qt import Qt
+
 from calibre.gui2.convert.txt_output_ui import Ui_Form
 from calibre.gui2.convert import Widget
 
@@ -21,9 +23,27 @@ class PluginWidget(Widget, Ui_Form):
         ['newline', 'max_line_length', 'force_max_line_length',
         'inline_toc', 'markdown_format', 'keep_links', 'keep_image_references',
         'txt_output_encoding'])
-        self.db, self.book_id = db, book_id
-        
+        self.db, self.book_id = db, book_id        
         for x in get_option('newline').option.choices:
-            self.opt_newline.addItem(x)
-        
+            self.opt_newline.addItem(x)        
         self.initialize_options(get_option, get_help, db, book_id)
+
+        self.opt_markdown_format.stateChanged.connect(self.enable_markdown_format)
+        self.enable_markdown_format(self.opt_markdown_format.checkState())
+
+    def break_cycles(self):
+        Widget.break_cycles(self)
+        
+        try:
+            self.opt_markdown_format.stateChanged.disconnect()
+        except:
+            pass
+        
+    def enable_markdown_format(self, state):
+        if state == Qt.Checked:
+            state = True
+        else:
+            state = False
+        self.opt_keep_links.setEnabled(state)
+        self.opt_keep_image_references.setEnabled(state)
+        
\ No newline at end of file

From 9148320a8bd9263f3495a28cd4d32f6cfa467c35 Mon Sep 17 00:00:00 2001
From: John Schember <john@nachtimwald.com>
Date: Sun, 16 Jan 2011 10:00:23 -0500
Subject: [PATCH 37/54] Heuristic class: use log.debug to reduce output during
 CLI conversion.

---
 src/calibre/ebooks/conversion/utils.py | 42 +++++++++++++-------------
 1 file changed, 21 insertions(+), 21 deletions(-)

diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py
index 4a118d423c..15522d25e6 100644
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@@ -35,12 +35,12 @@ class HeuristicProcessor(object):
         title = match.group('title')
         if not title:
             self.html_preprocess_sections = self.html_preprocess_sections + 1
-            self.log("marked " + unicode(self.html_preprocess_sections) +
+            self.log.debug("marked " + unicode(self.html_preprocess_sections) +
                     " chapters. - " + unicode(chap))
             return '<h2>'+chap+'</h2>\n'
         else:
             self.html_preprocess_sections = self.html_preprocess_sections + 1
-            self.log("marked " + unicode(self.html_preprocess_sections) +
+            self.log.debug("marked " + unicode(self.html_preprocess_sections) +
                     " chapters & titles. - " + unicode(chap) + ", " + unicode(title))
             return '<h2>'+chap+'</h2>\n<h3>'+title+'</h3>\n'
 
@@ -48,7 +48,7 @@ class HeuristicProcessor(object):
         chap = match.group('section')
         styles = match.group('styles')
         self.html_preprocess_sections = self.html_preprocess_sections + 1
-        self.log("marked " + unicode(self.html_preprocess_sections) +
+        self.log.debug("marked " + unicode(self.html_preprocess_sections) +
                 " section markers based on punctuation. - " + unicode(chap))
         return '<'+styles+' style="page-break-before:always">'+chap
 
@@ -91,7 +91,7 @@ class HeuristicProcessor(object):
         line_end = line_end_ere.findall(raw)
         tot_htm_ends = len(htm_end)
         tot_ln_fds = len(line_end)
-        self.log("There are " + unicode(tot_ln_fds) + " total Line feeds, and " +
+        self.log.debug("There are " + unicode(tot_ln_fds) + " total Line feeds, and " +
                 unicode(tot_htm_ends) + " marked up endings")
 
         if percent > 1:
@@ -100,7 +100,7 @@ class HeuristicProcessor(object):
             percent = 0
 
         min_lns = tot_ln_fds * percent
-        self.log("There must be fewer than " + unicode(min_lns) + " unmarked lines to add markup")
+        self.log.debug("There must be fewer than " + unicode(min_lns) + " unmarked lines to add markup")
         if min_lns > tot_htm_ends:
             return True
 
@@ -171,7 +171,7 @@ class HeuristicProcessor(object):
         #print "minimum chapters required are: "+str(self.min_chapters)
         heading = re.compile('<h[1-3][^>]*>', re.IGNORECASE)
         self.html_preprocess_sections = len(heading.findall(html))
-        self.log("found " + unicode(self.html_preprocess_sections) + " pre-existing headings")
+        self.log.debug("found " + unicode(self.html_preprocess_sections) + " pre-existing headings")
 
         # Build the Regular Expressions in pieces
         init_lookahead = "(?=<(p|div))"
@@ -221,7 +221,7 @@ class HeuristicProcessor(object):
                     break
                 full_chapter_line = chapter_line_open+chapter_header_open+chapter_type+chapter_header_close+chapter_line_close
                 n_lookahead = re.sub("(ou|in|cha)", "lookahead_", full_chapter_line)
-                self.log("Marked " + unicode(self.html_preprocess_sections) + " headings, " + log_message)
+                self.log.debug("Marked " + unicode(self.html_preprocess_sections) + " headings, " + log_message)
                 if lookahead_ignorecase:
                     chapter_marker = init_lookahead+full_chapter_line+blank_lines+n_lookahead_open+n_lookahead+n_lookahead_close+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close
                     chapdetect = re.compile(r'%s' % chapter_marker, re.IGNORECASE)
@@ -244,7 +244,7 @@ class HeuristicProcessor(object):
         words_per_chptr = wordcount
         if words_per_chptr > 0 and self.html_preprocess_sections > 0:
             words_per_chptr = wordcount / self.html_preprocess_sections
-        self.log("Total wordcount is: "+ str(wordcount)+", Average words per section is: "+str(words_per_chptr)+", Marked up "+str(self.html_preprocess_sections)+" chapters")
+        self.log.debug("Total wordcount is: "+ str(wordcount)+", Average words per section is: "+str(words_per_chptr)+", Marked up "+str(self.html_preprocess_sections)+" chapters")
         return html
 
     def punctuation_unwrap(self, length, content, format):
@@ -291,7 +291,7 @@ class HeuristicProcessor(object):
     def markup_pre(self, html):
         pre = re.compile(r'<pre>', re.IGNORECASE)
         if len(pre.findall(html)) >= 1:
-            self.log("Running Text Processing")
+            self.log.debug("Running Text Processing")
             outerhtml = re.compile(r'.*?(?<=<pre>)(?P<text>.*)(?=</pre>).*', re.IGNORECASE|re.DOTALL)
             html = outerhtml.sub(self.txt_process, html)
         else:
@@ -311,7 +311,7 @@ class HeuristicProcessor(object):
         txtindent = re.compile(ur'<p(?P<formatting>[^>]*)>\s*(?P<span>(<span[^>]*>\s*)+)?\s*(\u00a0){2,}', re.IGNORECASE)
         html = txtindent.sub(self.insert_indent, html)
         if self.found_indents > 1:
-            self.log("replaced "+unicode(self.found_indents)+ " nbsp indents with inline styles")
+            self.log.debug("replaced "+unicode(self.found_indents)+ " nbsp indents with inline styles")
         return html
 
     def cleanup_markup(self, html):
@@ -351,7 +351,7 @@ class HeuristicProcessor(object):
         blanklines = self.blankreg.findall(html)
         lines = self.linereg.findall(html)
         if len(lines) > 1:
-            self.log("There are " + unicode(len(blanklines)) + " blank lines. " +
+            self.log.debug("There are " + unicode(len(blanklines)) + " blank lines. " +
                     unicode(float(len(blanklines)) / float(len(lines))) + " percent blank")
                     
             if float(len(blanklines)) / float(len(lines)) > 0.40:
@@ -367,18 +367,18 @@ class HeuristicProcessor(object):
 
 
     def __call__(self, html):
-        self.log("*********  Heuristic processing HTML  *********")
+        self.log.debug("*********  Heuristic processing HTML  *********")
 
         # Count the words in the document to estimate how many chapters to look for and whether
         # other types of processing are attempted
         try:
             self.totalwords = self.get_word_count(html)
         except:
-            self.log("Can't get wordcount")
+            self.log.warn("Can't get wordcount")
 
         print "found "+unicode(self.totalwords)+" words in the flow"
         if self.totalwords < 50:
-            self.log("flow is too short, not running heuristics")
+            self.log.warn("flow is too short, not running heuristics")
             return html
 
         # Arrange line feeds and </p> tags so the line_length and no_markup functions work correctly
@@ -391,7 +391,7 @@ class HeuristicProcessor(object):
             # <pre> tags), check and  mark up line endings if required before proceeding
             # fix indents must run after this step
             if self.no_markup(html, 0.1):
-                self.log("not enough paragraph markers, adding now")
+                self.log.debug("not enough paragraph markers, adding now")
                 # markup using text processing
                 html = self.markup_pre(html)
 
@@ -421,7 +421,7 @@ class HeuristicProcessor(object):
         # If more than 40% of the lines are empty paragraphs and the user has enabled delete
         # blank paragraphs then delete blank lines to clean up spacing
         if blanks_between_paragraphs and getattr(self.extra_opts, 'delete_blank_paragraphs', False):
-            self.log("deleting blank lines")
+            self.log.debug("deleting blank lines")
             html = self.multi_blank.sub('\n<p id="softbreak" style="margin-top:1.5em; margin-bottom:1.5em"> </p>', html)
             html = self.blankreg.sub('', html)
 
@@ -435,18 +435,18 @@ class HeuristicProcessor(object):
         # more of the lines break in the same region of the document then unwrapping is required
         docanalysis = DocAnalysis(format, html)
         hardbreaks = docanalysis.line_histogram(.50)
-        self.log("Hard line breaks check returned "+unicode(hardbreaks))
+        self.log.debug("Hard line breaks check returned "+unicode(hardbreaks))
 
         # Calculate Length
         unwrap_factor = getattr(self.extra_opts, 'html_unwrap_factor', 0.4)
         length = docanalysis.line_length(unwrap_factor)
-        self.log("Median line length is " + unicode(length) + ", calculated with " + format + " format")
+        self.log.debug("Median line length is " + unicode(length) + ", calculated with " + format + " format")
             
         ###### Unwrap lines ######
         if getattr(self.extra_opts, 'unwrap_lines', False):
             # only go through unwrapping code if the histogram shows unwrapping is required or if the user decreased the default unwrap_factor
             if hardbreaks or unwrap_factor < 0.4:
-                self.log("Unwrapping required, unwrapping Lines")
+                self.log.debug("Unwrapping required, unwrapping Lines")
                 # Dehyphenate with line length limiters
                 dehyphenator = Dehyphenator()
                 html = dehyphenator(html,'html', length)
@@ -457,14 +457,14 @@ class HeuristicProcessor(object):
 
         if getattr(self.extra_opts, 'dehyphenate', False):
             # dehyphenate in cleanup mode to fix anything previous conversions/editing missed
-            self.log("Fixing hyphenated content")
+            self.log.debug("Fixing hyphenated content")
             dehyphenator = Dehyphenator(self.extra_opts.verbose, self.log)
             html = dehyphenator(html,'html_cleanup', length)
             html = dehyphenator(html, 'individual_words', length)
 
         # If still no sections after unwrapping mark split points on lines with no punctuation
         if self.html_preprocess_sections < self.min_chapters and getattr(self.extra_opts, 'markup_chapter_headings', False):
-            self.log("Looking for more split points based on punctuation,"
+            self.log.debug("Looking for more split points based on punctuation,"
                     " currently have " + unicode(self.html_preprocess_sections))
             chapdetect3 = re.compile(r'<(?P<styles>(p|div)[^>]*)>\s*(?P<section>(<span[^>]*>)?\s*(?!([*#•]+\s*)+)(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*.?(?=[a-z#\-*\s]+<)([a-z#-*]+\s*){1,5}\s*\s*(</span>)?(</[ibu]>){0,2}\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</span>)?\s*</(p|div)>)', re.IGNORECASE)
             html = chapdetect3.sub(self.chapter_break, html)

From f8971944fb6b72836f61f8989861db06c3ce415a Mon Sep 17 00:00:00 2001
From: ldolse <ldolse@yahoo.com>
Date: Sun, 16 Jan 2011 23:22:17 +0800
Subject: [PATCH 38/54] Make replace optional for users who just want the
 equivalent of the old feature; eliminate the requirement to populate the
 replace box

---
 src/calibre/ebooks/conversion/preprocess.py | 34 ++++++++-------------
 1 file changed, 13 insertions(+), 21 deletions(-)

diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py
index d1d275eb97..0ceed67bf9 100644
--- a/src/calibre/ebooks/conversion/preprocess.py
+++ b/src/calibre/ebooks/conversion/preprocess.py
@@ -453,27 +453,19 @@ class HTMLPreProcessor(object):
         if not getattr(self.extra_opts, 'keep_ligatures', False):
             html = _ligpat.sub(lambda m:LIGATURES[m.group()], html)
 
-        if getattr(self.extra_opts, 'sr3_search', None):
-            try:
-                rules.insert(0,  (re.compile(self.extra_opts.sr3_search), self.extra_opts.sr3_replace))
-            except:
-                import traceback
-                print 'Failed to parse sr3-search regexp'
-                traceback.print_exc()
-        if getattr(self.extra_opts, 'sr2_search', None):
-            try:
-                rules.insert(0, (re.compile(self.extra_opts.sr2_search), self.extra_opts.sr2_replace))
-            except:
-                import traceback
-                print 'Failed to parse sr2-search regexp'
-                traceback.print_exc()
-        if getattr(self.extra_opts, 'sr1_search', None):
-            try:
-                rules.insert(0, (re.compile(self.extra_opts.sr1_search), self.extra_opts.sr1_replace))
-            except:
-                import traceback
-                print 'Failed to parse sr1-search regexp'
-                traceback.print_exc()
+        for search, replace in [['sr3_search', 'sr3_replace'], ['sr2_search', 'sr2_replace'], ['sr1_search', 'sr1_replace']]:
+            replace_pattern = ''
+            if getattr(self.extra_opts, search, None):
+                search_pattern = getattr(self.extra_opts, search, None)
+                if getattr(self.extra_opts, replace, None):
+                    replace_pattern = getattr(self.extra_opts, replace, None)
+                try:
+                    rules.insert(0,  (re.compile(search_pattern), replace_pattern))
+                except:
+                    import traceback
+                    print 'Failed to parse sr3-search regexp'
+                    traceback.print_exc()
+
 
         end_rules = []
         # delete soft hyphens - moved here so it's executed after header/footer removal

From 73e60f8c7e70893294f32f594401dcb6f19aacf0 Mon Sep 17 00:00:00 2001
From: John Schember <john@nachtimwald.com>
Date: Sun, 16 Jan 2011 11:20:11 -0500
Subject: [PATCH 39/54] Fix search and replace.

---
 src/calibre/ebooks/conversion/preprocess.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py
index b35a163044..d2bdba4928 100644
--- a/src/calibre/ebooks/conversion/preprocess.py
+++ b/src/calibre/ebooks/conversion/preprocess.py
@@ -458,7 +458,10 @@ class HTMLPreProcessor(object):
             if search_pattern:
                 try:
                     search_re = re.compile(search_pattern)
-                    rules.insert(0,  (search_re, getattr(self.extra_opts, replace, '')))
+                    replace_txt = getattr(self.extra_opts, replace, '')
+                    if replace_txt == None:
+                        replace_txt = ''
+                    rules.insert(0, (search_re, replace_txt))
                 except Exception as e:
                     self.log.error('Failed to parse %s regexp because %s' % (search, e))
 

From d18910510ba19700690bf596b39025d3ea0cebde Mon Sep 17 00:00:00 2001
From: John Schember <john@nachtimwald.com>
Date: Sun, 16 Jan 2011 11:42:43 -0500
Subject: [PATCH 40/54] Fix Regex builder in search and replace.

---
 src/calibre/gui2/convert/search_and_replace.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/src/calibre/gui2/convert/search_and_replace.py b/src/calibre/gui2/convert/search_and_replace.py
index af944a74d1..c85e4fe414 100644
--- a/src/calibre/gui2/convert/search_and_replace.py
+++ b/src/calibre/gui2/convert/search_and_replace.py
@@ -25,8 +25,14 @@ class SearchAndReplaceWidget(Widget, Ui_Form):
         self.db, self.book_id = db, book_id
         self.initialize_options(get_option, get_help, db, book_id)
         self.opt_sr1_search.set_msg(_('Search Regular Expression'))
+        self.opt_sr1_search.set_book_id(book_id)
+        self.opt_sr1_search.set_db(db)
         self.opt_sr2_search.set_msg(_('Search Regular Expression'))
+        self.opt_sr2_search.set_book_id(book_id)
+        self.opt_sr2_search.set_db(db)
         self.opt_sr3_search.set_msg(_('Search Regular Expression'))
+        self.opt_sr3_search.set_book_id(book_id)
+        self.opt_sr3_search.set_db(db)
         
     def break_cycles(self):
         Widget.break_cycles(self)

From 2ebf94812e8ed82491b7579333f66cde7ce15096 Mon Sep 17 00:00:00 2001
From: ldolse <ldolse@yahoo.com>
Date: Mon, 17 Jan 2011 17:13:19 +0800
Subject: [PATCH 41/54] started updating manuals, fix lit postprocess to handle
 content in pre tags correctly

---
 src/calibre/ebooks/conversion/plumber.py |  6 ++-
 src/calibre/ebooks/lit/input.py          |  9 ++--
 src/calibre/manual/conversion.rst        | 59 ++++++++++++++++++++----
 3 files changed, 59 insertions(+), 15 deletions(-)

diff --git a/src/calibre/ebooks/conversion/plumber.py b/src/calibre/ebooks/conversion/plumber.py
index 249f848661..6fdf7ddc68 100644
--- a/src/calibre/ebooks/conversion/plumber.py
+++ b/src/calibre/ebooks/conversion/plumber.py
@@ -492,7 +492,9 @@ OptionRecommendation(name='enable_heuristics',
 OptionRecommendation(name='markup_chapter_headings',
     recommended_value=False, level=OptionRecommendation.LOW,
     help=_('Detect unformatted chapter headings and sub headings. Change ' 
-           'them to h2 and h3 tags.')),
+           'them to h2 and h3 tags.  This setting will not create a TOC, '
+           'but can be used in conjunction with structure detection to create '
+           'one.')),
            
 OptionRecommendation(name='italicize_common_cases',
     recommended_value=False, level=OptionRecommendation.LOW,
@@ -501,7 +503,7 @@ OptionRecommendation(name='italicize_common_cases',
            
 OptionRecommendation(name='fix_indents',
     recommended_value=False, level=OptionRecommendation.LOW,
-    help=_('Turn indentation created from multiple &nbsp; entities '
+    help=_('Turn indentation created from multiple non-breaking space entities '
            'into CSS indents.')),
            
 OptionRecommendation(name='html_unwrap_factor',
diff --git a/src/calibre/ebooks/lit/input.py b/src/calibre/ebooks/lit/input.py
index 7b822b68a6..ff8955939e 100644
--- a/src/calibre/ebooks/lit/input.py
+++ b/src/calibre/ebooks/lit/input.py
@@ -22,7 +22,7 @@ class LITInput(InputFormatPlugin):
         from calibre.ebooks.lit.reader import LitReader
         from calibre.ebooks.conversion.plumber import create_oebbook
         self.log = log
-        return create_oebbook(log, stream, options, self, reader=LitReader)
+        return create_oebbook(log, stream, options, reader=LitReader)
 
     def postprocess_book(self, oeb, opts, log):
         from calibre.ebooks.oeb.base import XHTML_NS, XPath, XHTML
@@ -39,10 +39,13 @@ class LITInput(InputFormatPlugin):
                 body = body[0]
                 if len(body) == 1 and body[0].tag == XHTML('pre'):
                     pre = body[0]
-                    from calibre.ebooks.txt.processor import convert_basic
+                    from calibre.ebooks.txt.processor import convert_basic, preserve_spaces, \
+                    separate_paragraphs_single_line
                     from lxml import etree
                     import copy
-                    html = convert_basic(pre.text).replace('<html>',
+                    html = separate_paragraphs_single_line(pre.text)
+                    html = preserve_spaces(html)
+                    html = convert_basic(html).replace('<html>',
                             '<html xmlns="%s">'%XHTML_NS)
                     root = etree.fromstring(html)
                     body = XPath('//h:body')(root)
diff --git a/src/calibre/manual/conversion.rst b/src/calibre/manual/conversion.rst
index 4b2b169d72..3383708b72 100644
--- a/src/calibre/manual/conversion.rst
+++ b/src/calibre/manual/conversion.rst
@@ -255,6 +255,46 @@ you are producing are meant for a particular device type, choose the correspondi
 
 The Output profile also controls the screen size. This will cause, for example, images to be auto-resized to be fit to the screen in some output formats. So choose a profile of a device that has a screen size similar to your device.
 
+.. _heuristic-processing:
+
+Heuristic Processing
+---------------------
+
+:guilabel:`Preprocess input`
+    This option activates various algorithms that try to detect and correct common cases of
+    badly formatted input documents. Things like hard line breaks, large blocks of text with no formatting, etc.
+    Turn this option on if your input document suffers from bad formatting. But be aware that in
+    some cases, this option can lead to worse results, so use with care.
+
+:guilabel:`Line-unwrap factor`
+    This option control the algorithm |app| uses to remove hard line breaks. For example, if the value of this
+    option is 0.4, that means calibre will remove hard line breaks from the end of lines whose lengths are less
+    than the length of 40% of all lines in the document. 
+
+:guilabel:`Unwrap lines`
+    Lorem ipsum
+    
+:guilabel:`Detect and markup unformatted chapter headings and sub headings`
+    Lorem ipsum
+
+:guilabel:`Renumber sequences of &lt;h1&gt; or &lt;h2&gt; tags to prevent splitting`
+    Lorem ipsum
+    
+:guilabel:`Delete blank lines between paragraphs`
+    Lorem ipsum
+
+:guilabel:`Ensure scene breaks are consistently formatted`
+    Lorem ipsum
+
+:guilabel:`Remove unnecessary hyphens`
+    Lorem ipsum
+
+:guilabel:`Italicize common words and patterns`
+    Lorem ipsum
+
+:guilabel:`Replace entity indents with CSS indents`
+    Lorem ipsum
+
 .. _structure-detection:
 
 Structure Detection
@@ -330,16 +370,6 @@ There are a few more options in this section.
     two covers. This option will simply remove the first image from the source document, thereby
     ensuring that the converted book has only one cover, the one specified in |app|.
 
-:guilabel:`Preprocess input`
-    This option activates various algorithms that try to detect and correct common cases of
-    badly formatted input documents. Things like hard line breaks, large blocks of text with no formatting, etc.
-    Turn this option on if your input document suffers from bad formatting. But be aware that in
-    some cases, this option can lead to worse results, so use with care.
-
-:guilabel:`Line-unwrap factor`
-    This option control the algorithm |app| uses to remove hard line breaks. For example, if the value of this
-    option is 0.4, that means calibre will remove hard line breaks from the end of lines whose lengths are less
-    than the length of 40% of all lines in the document. 
     
 Table of Contents
 ------------------
@@ -500,6 +530,9 @@ more blank lines are a paragraph boundary::
 
 TXT input supports a number of options to differentiate how paragraphs are detected.
 
+    :guilabel:`Auto`
+        Analyzes the text file and attempts to determine how paragraphs are defined.
+
     :guilabel:`Treat each line as a paragraph`
         Assumes that every line is a paragraph::
 
@@ -518,6 +551,12 @@ TXT input supports a number of options to differentiate how paragraphs are detec
             This is the
             third.
 
+    :guilabel:`Unformatted`
+        Assumes that the document has no formatting, but does use hard line breaks.  Punctuation
+        and median line length are used to attempt to re-create paragraphs.
+
+    :guilabel:`Process using Textile`
+
     :guilabel:`Process using markdown`
         |app| also supports running TXT input though a transformation preprocessor known as markdown. Markdown
         allows for basic formatting to be added to TXT documents, such as bold, italics, section headings, tables,

From 05730e1886c8562e819364c43a7fa58c172392d6 Mon Sep 17 00:00:00 2001
From: ldolse <ldolse@yahoo.com>
Date: Mon, 17 Jan 2011 19:00:14 +0800
Subject: [PATCH 42/54] insert horizontal rules for softbreaks when option is
 enabled

---
 src/calibre/ebooks/conversion/utils.py |  5 +++++
 src/calibre/manual/conversion.rst      | 15 +++++++++------
 2 files changed, 14 insertions(+), 6 deletions(-)

diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py
index 15522d25e6..d9e5246223 100644
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@@ -23,6 +23,7 @@ class HeuristicProcessor(object):
         self.min_chapters = 1
         self.chapters_no_title = 0
         self.chapters_with_title = 0
+        self.blanks_deleted = False
         self.linereg = re.compile('(?<=<p).*?(?=</p>)', re.IGNORECASE|re.DOTALL)
         self.blankreg = re.compile(r'\s*(?P<openline><p(?!\sid=\"softbreak\")[^>]*>)\s*(?P<closeline></p>)', re.IGNORECASE)
         self.multi_blank = re.compile(r'(\s*<p[^>]*>\s*</p>){2,}', re.IGNORECASE)
@@ -422,6 +423,7 @@ class HeuristicProcessor(object):
         # blank paragraphs then delete blank lines to clean up spacing
         if blanks_between_paragraphs and getattr(self.extra_opts, 'delete_blank_paragraphs', False):
             self.log.debug("deleting blank lines")
+            self.blanks_deleted = True
             html = self.multi_blank.sub('\n<p id="softbreak" style="margin-top:1.5em; margin-bottom:1.5em"> </p>', html)
             html = self.blankreg.sub('', html)
 
@@ -479,6 +481,9 @@ class HeuristicProcessor(object):
         if getattr(self.extra_opts, 'format_scene_breaks', False):
             # Center separator lines
             html = re.sub(u'<(?P<outer>p|div)[^>]*>\s*(<(?P<inner1>font|span|[ibu])[^>]*>)?\s*(<(?P<inner2>font|span|[ibu])[^>]*>)?\s*(<(?P<inner3>font|span|[ibu])[^>]*>)?\s*(?P<break>([*#•]+\s*)+)\s*(</(?P=inner3)>)?\s*(</(?P=inner2)>)?\s*(</(?P=inner1)>)?\s*</(?P=outer)>', '<p style="text-align:center; margin-top:1.25em; margin-bottom:1.25em">' + '\g<break>' + '</p>', html)
+            if not self.blanks_deleted:
+                html = self.multi_blank.sub('\n<p id="softbreak" style="margin-top:1.5em; margin-bottom:1.5em"> </p>', html)
+            html = re.sub('<p\s+id="softbreak"[^>]*>\s*</p>', '<div id="softbreak" style="margin-left: 45%; margin-right: 45%; margin-top:1.5em; margin-bottom:1.5em"><hr style="height: 3px; background:#505050" /></div>', html)
 
         if self.deleted_nbsps:
             # put back non-breaking spaces in empty paragraphs to preserve original formatting
diff --git a/src/calibre/manual/conversion.rst b/src/calibre/manual/conversion.rst
index 3383708b72..96a8e30e3c 100644
--- a/src/calibre/manual/conversion.rst
+++ b/src/calibre/manual/conversion.rst
@@ -530,17 +530,18 @@ more blank lines are a paragraph boundary::
 
 TXT input supports a number of options to differentiate how paragraphs are detected.
 
-    :guilabel:`Auto`
-        Analyzes the text file and attempts to determine how paragraphs are defined.
+    :guilabel:`Paragraph Style: Auto`
+        Analyzes the text file and attempts to automatically determine how paragraphs are defined.  This
+        option will generally work fine; if you get undesirable results, try one of the manual options.
 
-    :guilabel:`Treat each line as a paragraph`
+    :guilabel:`Paragraph Style: Single`
         Assumes that every line is a paragraph::
 
             This is the first.
             This is the second.
             This is the third.
         
-    :guilabel:`Assume print formatting`
+    :guilabel:`Paragraph Style: Print`
         Assumes that every paragraph starts with an indent (either a tab or 2+ spaces). Paragraphs end when
         the next line that starts with an indent is reached::
 
@@ -551,11 +552,13 @@ TXT input supports a number of options to differentiate how paragraphs are detec
             This is the
             third.
 
-    :guilabel:`Unformatted`
+    :guilabel:`Paragraph Style: Unformatted`
         Assumes that the document has no formatting, but does use hard line breaks.  Punctuation
         and median line length are used to attempt to re-create paragraphs.
 
-    :guilabel:`Process using Textile`
+    :guilabel:`Formatting Style: Auto`
+
+    :guilabel:`Formatting Style: Heuristic`
 
     :guilabel:`Process using markdown`
         |app| also supports running TXT input though a transformation preprocessor known as markdown. Markdown

From a0aa719bb0b8de97a12c96c41a4bff70f656b213 Mon Sep 17 00:00:00 2001
From: ldolse <ldolse@yahoo.com>
Date: Mon, 17 Jan 2011 23:53:27 +0800
Subject: [PATCH 43/54] implemented multi-pass analysis for chapter detection

---
 src/calibre/ebooks/conversion/utils.py | 93 +++++++++++++++++++-------
 1 file changed, 68 insertions(+), 25 deletions(-)

diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py
index d9e5246223..1a691b2e14 100644
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@@ -21,6 +21,7 @@ class HeuristicProcessor(object):
         self.deleted_nbsps = False
         self.totalwords = 0
         self.min_chapters = 1
+        self.max_chapters = 150
         self.chapters_no_title = 0
         self.chapters_with_title = 0
         self.blanks_deleted = False
@@ -132,7 +133,7 @@ class HeuristicProcessor(object):
     def markup_italicis(self, html):
         ITALICIZE_WORDS = [
             'Etc.', 'etc.', 'viz.', 'ie.', 'i.e.', 'Ie.', 'I.e.', 'eg.',
-            'e.g.', 'Eg.', 'E.g.', 'et al.', 'et cetra', 'n.b.', 'N.b.',
+            'e.g.', 'Eg.', 'E.g.', 'et al.', 'et cetera', 'n.b.', 'N.b.',
             'nota bene', 'Nota bene', 'Ste.', 'Mme.', 'Mdme.',
             'Mlle.', 'Mons.', 'PS.', 'PPS.',
         ]
@@ -166,9 +167,11 @@ class HeuristicProcessor(object):
         with minimum false positives.  Exits after finding a successful pattern
         '''
         # Typical chapters are between 2000 and 7000 words, use the larger number to decide the
-        # minimum of chapters to search for
+        # minimum of chapters to search for.  A max limit is calculated to prevent things like OCR
+        # or pdf page numbers from being treated as TOC markers
         if wordcount > 7000:
             self.min_chapters = int(ceil(wordcount / 7000.))
+            self.max_chapters = int(ceil(wordcount / 100.))
         #print "minimum chapters required are: "+str(self.min_chapters)
         heading = re.compile('<h[1-3][^>]*>', re.IGNORECASE)
         self.html_preprocess_sections = len(heading.findall(html))
@@ -202,44 +205,84 @@ class HeuristicProcessor(object):
         n_lookahead_open = "\s+(?!"
         n_lookahead_close = ")"
 
-        default_title = r"(<[ibu][^>]*>)?\s{0,3}([\w\'’\"-]+\s{0,3}){1,5}?(</[ibu][^>]*>)?(?=<)"
+        default_title = r"(<[ibu][^>]*>)?\s{0,3}(?!Chapter)([\w\'’\"-]+\s{0,3}){1,5}?(</[ibu][^>]*>)?(?=<)"
+        simple_title = r"(<[ibu][^>]*>)?\s{0,3}(?!Chapter).{0,50}?(</[ibu][^>]*>)?(?=<)"
+
+        analysis_result = []
 
         chapter_types = [
-            [r"[^'\"]?(CHAPTER|Kapitel)\s*([\dA-Z\-\'\"\?!#,]+\s*){0,7}\s*", True, "Searching for most common chapter headings", 'chapter'],  # Highest frequency headings which include titles
-            [r"[^'\"]?(Introduction|Synopsis|Acknowledgements|Epilogue|Volume\s|Prologue|Book\s|Part\s|Dedication|Preface)\s*([\d\w-]+\:?\'?\s*){0,5}", True, "Searching for common section headings", 'common'],
-            [r"<b[^>]*>\s*(<span[^>]*>)?\s*(?!([*#•=]+\s*)+)(\s*(?=[\d.\w#\-*\s]+<)([\d.\w#-*]+\s*){1,5}\s*)(?!\.)(</span>)?\s*</b>", True, "Searching for emphasized lines", 'emphasized'], # Emphasized lines
-            [r"[^'\"]?(\d+(\.|:))\s*([\dA-Z\-\'\"#,]+\s*){0,7}\s*", True, "Searching for numeric chapter headings", 'numeric'],  # Numeric Chapters
-            [r"([A-Z]\s+){3,}\s*([\d\w-]+\s*){0,3}\s*", True, "Searching for letter spaced headings", 'letter_spaced'],  # Spaced Lettering
-            [r"[^'\"]?(\d+\.?\s+([\d\w-]+\:?\'?-?\s?){0,5})\s*", True, "Searching for numeric chapters with titles", 'numeric_title'], # Numeric Titles
-            [r"[^'\"]?(\d+)\s*([\dA-Z\-\'\"\?!#,]+\s*){0,7}\s*", True, "Searching for simple numeric headings", 'plain_number'],  # Numeric Chapters, no dot or colon
-            [r"\s*[^'\"]?([A-Z#]+(\s|-){0,3}){1,5}\s*", False, "Searching for chapters with Uppercase Characters", 'uppercase' ] # Uppercase Chapters
+            [r"[^'\"]?(Introduction|Synopsis|Acknowledgements|Epilogue|CHAPTER|Kapitel|Volume\s|Prologue|Book\s|Part\s|Dedication|Preface)\s*([\d\w-]+\:?\'?\s*){0,5}", True, True, True, False, "Searching for common section headings", 'common'],
+            [r"[^'\"]?(CHAPTER|Kapitel)\s*([\dA-Z\-\'\"\?!#,]+\s*){0,7}\s*", True, True, True, False, "Searching for most common chapter headings", 'chapter'],  # Highest frequency headings which include titles
+            [r"<b[^>]*>\s*(<span[^>]*>)?\s*(?!([*#•=]+\s*)+)(\s*(?=[\d.\w#\-*\s]+<)([\d.\w#-*]+\s*){1,5}\s*)(?!\.)(</span>)?\s*</b>", True, True, True, False, "Searching for emphasized lines", 'emphasized'], # Emphasized lines
+            [r"[^'\"]?(\d+(\.|:))\s*([\dA-Z\-\'\"#,]+\s*){0,7}\s*", True, True, True, False, "Searching for numeric chapter headings", 'numeric'],  # Numeric Chapters
+            [r"([A-Z]\s+){3,}\s*([\d\w-]+\s*){0,3}\s*", True, True, True, False, "Searching for letter spaced headings", 'letter_spaced'],  # Spaced Lettering
+            [r"[^'\"]?(\d+\.?\s+([\d\w-]+\:?\'?-?\s?){0,5})\s*", True, True, True, False, "Searching for numeric chapters with titles", 'numeric_title'], # Numeric Titles
+            [r"[^'\"]?(\d+)\s*([\dA-Z\-\'\"\?!#,]+\s*){0,7}\s*", True, True, True, False, "Searching for simple numeric headings", 'plain_number'],  # Numeric Chapters, no dot or colon
+            [r"\s*[^'\"]?([A-Z#]+(\s|-){0,3}){1,5}\s*", False, True, False, False, "Searching for chapters with Uppercase Characters", 'uppercase' ] # Uppercase Chapters
             ]
 
         def recurse_patterns(html, analyze):
             # Start with most typical chapter headings, get more aggressive until one works
-            for [chapter_type, lookahead_ignorecase, log_message, type_name] in chapter_types:
+            for [chapter_type, n_lookahead_req, strict_title, ignorecase, title_req, log_message, type_name] in chapter_types:
+                n_lookahead = ''
+                hits = 0
+                self.chapters_no_title = 0
+                self.chapters_with_title = 0
+
+                if n_lookahead_req:
+                    lp_n_lookahead_open = n_lookahead_open
+                    lp_n_lookahead_close = n_lookahead_close
+                else:
+                    lp_n_lookahead_open = ''
+                    lp_n_lookahead_close = ''
+
+                if strict_title:
+                    lp_title = default_title
+                else:
+                    lp_title = simple_title
+                 
+                if ignorecase:
+                    arg_ignorecase = r'(?i)'
+                else:
+                    arg_ignorecase = ''
+
+                if title_req:
+                    lp_opt_title_open = ''
+                    lp_opt_title_close = ''        
+                else:
+                    lp_opt_title_open = opt_title_open
+                    lp_opt_title_close = opt_title_close
+
                 if self.html_preprocess_sections >= self.min_chapters:
                     break
                 full_chapter_line = chapter_line_open+chapter_header_open+chapter_type+chapter_header_close+chapter_line_close
-                n_lookahead = re.sub("(ou|in|cha)", "lookahead_", full_chapter_line)
-                self.log.debug("Marked " + unicode(self.html_preprocess_sections) + " headings, " + log_message)
-                if lookahead_ignorecase:
-                    chapter_marker = init_lookahead+full_chapter_line+blank_lines+n_lookahead_open+n_lookahead+n_lookahead_close+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close
-                    chapdetect = re.compile(r'%s' % chapter_marker, re.IGNORECASE)
-                else:
-                    chapter_marker = init_lookahead+full_chapter_line+blank_lines+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close+n_lookahead_open+n_lookahead+n_lookahead_close
-                    chapdetect = re.compile(r'%s' % chapter_marker, re.UNICODE)
+                if n_lookahead_req:
+                    n_lookahead = re.sub("(ou|in|cha)", "lookahead_", full_chapter_line)
+                if not analyze:
+                    self.log.debug("Marked " + unicode(self.html_preprocess_sections) + " headings, " + log_message)
+
+                chapter_marker = arg_ignorecase+init_lookahead+full_chapter_line+blank_lines+lp_n_lookahead_open+n_lookahead+lp_n_lookahead_close+lp_opt_title_open+title_line_open+title_header_open+lp_title+title_header_close+title_line_close+lp_opt_title_close
+                chapdetect = re.compile(r'%s' % chapter_marker)
+
                 if analyze:
                     hits = len(chapdetect.findall(html))
-                    print unicode(type_name)+" had "+unicode(hits)+" hits"
-                    chapdetect.sub(self.analyze_title_matches, html)
-                    print unicode(self.chapters_no_title)+" chapters with no title"
-                    print unicode(self.chapters_with_title)+" chapters with titles"
+                    if hits:
+                        chapdetect.sub(self.analyze_title_matches, html)
+                        if float(self.chapters_with_title) / float(hits) > .5:
+                            title_req = True
+                            strict_title = False
+                        self.log.debug(unicode(type_name)+" had "+unicode(hits)+" hits - "+unicode(self.chapters_no_title)+" chapters with no title, "+unicode(self.chapters_with_title)+" chapters with titles, "+unicode(float(self.chapters_with_title) / float(hits))+" percent. ")
+                        if type_name == 'common':
+                            analysis_result.append([chapter_type, n_lookahead_req, strict_title, ignorecase, title_req, log_message, type_name])
+                        elif self.min_chapters <= hits < self.max_chapters:
+                            analysis_result.append([chapter_type, n_lookahead_req, strict_title, ignorecase, title_req, log_message, type_name])
+                            break
                 else:
                     html = chapdetect.sub(self.chapter_head, html)
             return html
 
         recurse_patterns(html, True)
+        chapter_types = analysis_result
         html = recurse_patterns(html, False)
 
         words_per_chptr = wordcount
@@ -293,7 +336,7 @@ class HeuristicProcessor(object):
         pre = re.compile(r'<pre>', re.IGNORECASE)
         if len(pre.findall(html)) >= 1:
             self.log.debug("Running Text Processing")
-            outerhtml = re.compile(r'.*?(?<=<pre>)(?P<text>.*)(?=</pre>).*', re.IGNORECASE|re.DOTALL)
+            outerhtml = re.compile(r'.*?(?<=<pre>)(?P<text>.*?)</pre>', re.IGNORECASE|re.DOTALL)
             html = outerhtml.sub(self.txt_process, html)
         else:
             # Add markup naively

From 3ca18da2cfc48a1ce3a201245eeee8ed005f0541 Mon Sep 17 00:00:00 2001
From: ldolse <ldolse@yahoo.com>
Date: Tue, 18 Jan 2011 01:17:42 +0800
Subject: [PATCH 44/54] fix pdf preprocess call

---
 src/calibre/ebooks/conversion/preprocess.py | 13 +++----------
 src/calibre/ebooks/conversion/utils.py      |  7 ++++---
 2 files changed, 7 insertions(+), 13 deletions(-)

diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py
index d2bdba4928..54639df93c 100644
--- a/src/calibre/ebooks/conversion/preprocess.py
+++ b/src/calibre/ebooks/conversion/preprocess.py
@@ -473,12 +473,6 @@ class HTMLPreProcessor(object):
             # unwrap/delete soft hyphens with formatting
             end_rules.append((re.compile(u'[­]\s*(</(i|u|b)>)+(</p>\s*<p>\s*)+\s*(<(i|u|b)>)+\s*(?=[[a-z\d])'), lambda match: ''))
 
-        # Make the more aggressive chapter marking regex optional with the preprocess option to
-        # reduce false positives and move after header/footer removal
-        if getattr(self.extra_opts, 'preprocess_html', None):
-            if is_pdftohtml:
-                end_rules.append((re.compile(r'<p>\s*(?P<chap>(<[ibu]>){0,2}\s*([A-Z \'"!]{3,})\s*([\dA-Z:]+\s){0,4}\s*(</[ibu]>){0,2})\s*<p>\s*(?P<title>(<[ibu]>){0,2}(\s*\w+){1,4}\s*(</[ibu]>){0,2}\s*<p>)?'), chap_head),)
-
         length = -1
         if getattr(self.extra_opts, 'unwrap_factor', 0.0) > 0.01:
             docanalysis = DocAnalysis('pdf', html)
@@ -525,11 +519,10 @@ class HTMLPreProcessor(object):
             html = dehyphenator(html,'html', length)
 
         if is_pdftohtml:
-            from calibre.ebooks.conversion.utils import PreProcessor
-            pdf_markup = PreProcessor(self.extra_opts, None)
+            from calibre.ebooks.conversion.utils import HeuristicProcessor
+            pdf_markup = HeuristicProcessor(self.extra_opts, None)
             totalwords = 0
-            totalwords = pdf_markup.get_word_count(html)
-            if totalwords > 7000:
+            if pdf_markup.get_word_count(html) > 7000:
                 html = pdf_markup.markup_chapters(html, totalwords, True)
 
         #dump(html, 'post-preprocess')
diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py
index 1a691b2e14..888d24d791 100644
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@@ -170,9 +170,9 @@ class HeuristicProcessor(object):
         # minimum of chapters to search for.  A max limit is calculated to prevent things like OCR
         # or pdf page numbers from being treated as TOC markers
         if wordcount > 7000:
-            self.min_chapters = int(ceil(wordcount / 7000.))
-            self.max_chapters = int(ceil(wordcount / 100.))
-        #print "minimum chapters required are: "+str(self.min_chapters)
+            self.min_chapters = int(ceil(wordcount / 15000.))
+            self.max_chapters = int(ceil(wordcount / 1200.))
+        print "minimum chapters required are: "+str(self.min_chapters)
         heading = re.compile('<h[1-3][^>]*>', re.IGNORECASE)
         self.html_preprocess_sections = len(heading.findall(html))
         self.log.debug("found " + unicode(self.html_preprocess_sections) + " pre-existing headings")
@@ -272,6 +272,7 @@ class HeuristicProcessor(object):
                             title_req = True
                             strict_title = False
                         self.log.debug(unicode(type_name)+" had "+unicode(hits)+" hits - "+unicode(self.chapters_no_title)+" chapters with no title, "+unicode(self.chapters_with_title)+" chapters with titles, "+unicode(float(self.chapters_with_title) / float(hits))+" percent. ")
+                        print "max chapters is "+str(self.max_chapters)
                         if type_name == 'common':
                             analysis_result.append([chapter_type, n_lookahead_req, strict_title, ignorecase, title_req, log_message, type_name])
                         elif self.min_chapters <= hits < self.max_chapters:

From 1a20df3291576cb346c42584b14d044f80255bde Mon Sep 17 00:00:00 2001
From: John Schember <john@nachtimwald.com>
Date: Mon, 17 Jan 2011 13:17:33 -0500
Subject: [PATCH 45/54] Modify italicize patterns to reduce false positives.

---
 src/calibre/ebooks/conversion/utils.py | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py
index 888d24d791..bfb5f1c153 100644
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@@ -139,17 +139,17 @@ class HeuristicProcessor(object):
         ]
         
         ITALICIZE_STYLE_PATS = [
-            r'(?msu)_(?P<words>.+?)_',
-            r'(?msu)/(?P<words>[^<>]+?)/',
-            r'(?msu)~~(?P<words>.+?)~~',
-            r'(?msu)\*(?P<words>.+?)\*',
-            r'(?msu)~(?P<words>.+?)~',
-            r'(?msu)_/(?P<words>[^<>]+?)/_',
-            r'(?msu)_\*(?P<words>.+?)\*_',
-            r'(?msu)\*/(?P<words>[^<>]+?)/\*',
-            r'(?msu)_\*/(?P<words>[^<>]+?)/\*_',
-            r'(?msu)/:(?P<words>[^<>]+?):/',
-            r'(?msu)\|:(?P<words>.+?):\|',
+            r'(?msu)_(?P<words>[^\s][^<>_]+?[^\s])?_',
+            r'(?msu)/(?P<words>[^\s][^<>/]+?[^\s])?/',
+            r'(?msu)~~(?P<words>[^\s][^<>~]+?[^\s])?~~',
+            r'(?msu)\*(?P<words>[^\s][^<>\*]+?[^\s])?\*',
+            r'(?msu)~(?P<words>[^\s][^<>~]+?[^\s])?~',
+            r'(?msu)_/(?P<words>[^\s][^<>/_]+?[^\s])?/_',
+            r'(?msu)_\*(?P<words>[^\s][^<>\*_]+?[^\s])?\*_',
+            r'(?msu)\*/(?P<words>[^\s][^<>/\*]+?[^\s])?/\*',
+            r'(?msu)_\*/(?P<words>[^\s][^<>\*_]+?[^\s])?/\*_',
+            r'(?msu)/:(?P<words>[^\s][^<>:/]+?[^\s])?:/',
+            r'(?msu)\|:(?P<words>[^\s][^<>:\|]+?[^\s])?:\|',
         ]
         
         for word in ITALICIZE_WORDS:

From 539f24213d0410f413c4802dc3ae83bcd338c783 Mon Sep 17 00:00:00 2001
From: ldolse <ldolse@yahoo.com>
Date: Tue, 18 Jan 2011 12:51:55 +0800
Subject: [PATCH 46/54] tweaked chapter thresholds

---
 src/calibre/ebooks/conversion/utils.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py
index bfb5f1c153..4d017b7df4 100644
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@@ -21,7 +21,6 @@ class HeuristicProcessor(object):
         self.deleted_nbsps = False
         self.totalwords = 0
         self.min_chapters = 1
-        self.max_chapters = 150
         self.chapters_no_title = 0
         self.chapters_with_title = 0
         self.blanks_deleted = False
@@ -169,9 +168,12 @@ class HeuristicProcessor(object):
         # Typical chapters are between 2000 and 7000 words, use the larger number to decide the
         # minimum of chapters to search for.  A max limit is calculated to prevent things like OCR
         # or pdf page numbers from being treated as TOC markers
+        max_chapters = 150
+        typical_chapters = 7000.
         if wordcount > 7000:
-            self.min_chapters = int(ceil(wordcount / 15000.))
-            self.max_chapters = int(ceil(wordcount / 1200.))
+            if wordcount > 200000:
+                typical_chapters = 15000.
+            self.min_chapters = int(ceil(wordcount / typical_chapters))
         print "minimum chapters required are: "+str(self.min_chapters)
         heading = re.compile('<h[1-3][^>]*>', re.IGNORECASE)
         self.html_preprocess_sections = len(heading.findall(html))

From e72ceb5c59ef96a7d67b8aaa675b8b90a057a642 Mon Sep 17 00:00:00 2001
From: ldolse <ldolse@yahoo.com>
Date: Tue, 18 Jan 2011 16:35:40 +0800
Subject: [PATCH 47/54] updated docs and labels

---
 src/calibre/ebooks/conversion/cli.py   |   4 +-
 src/calibre/gui2/convert/heuristics.py |   2 +-
 src/calibre/gui2/convert/heuristics.ui |   2 +-
 src/calibre/manual/conversion.rst      | 100 ++++++++++++++++++-------
 4 files changed, 75 insertions(+), 33 deletions(-)

diff --git a/src/calibre/ebooks/conversion/cli.py b/src/calibre/ebooks/conversion/cli.py
index c9612d97b9..b5c057b0f9 100644
--- a/src/calibre/ebooks/conversion/cli.py
+++ b/src/calibre/ebooks/conversion/cli.py
@@ -130,7 +130,7 @@ def add_pipeline_options(parser, plumber):
                   ]
                   ),
                   
-              'HEURISTICS' : (
+              'HEURISTIC PROCESSING' : (
                   _('Modify the document text and structure using common patterns.'),
                   [
                       'enable_heuristics', 'markup_chapter_headings',
@@ -182,7 +182,7 @@ def add_pipeline_options(parser, plumber):
 
               }
 
-    group_order = ['', 'LOOK AND FEEL', 'HEURISTICS',
+    group_order = ['', 'LOOK AND FEEL', 'HEURISTIC PROCESSING',
             'SEARCH AND REPLACE', 'STRUCTURE DETECTION',
             'TABLE OF CONTENTS', 'METADATA', 'DEBUG']
 
diff --git a/src/calibre/gui2/convert/heuristics.py b/src/calibre/gui2/convert/heuristics.py
index 4735782f52..6739c199b7 100644
--- a/src/calibre/gui2/convert/heuristics.py
+++ b/src/calibre/gui2/convert/heuristics.py
@@ -11,7 +11,7 @@ from calibre.gui2.convert import Widget
 
 class HeuristicsWidget(Widget, Ui_Form):
 
-    TITLE = _('Heuristics')
+    TITLE = _('Heuristic Processing')
     HELP  = _('Modify the document text and structure using common patterns.')
     COMMIT_NAME = 'heuristics'
 
diff --git a/src/calibre/gui2/convert/heuristics.ui b/src/calibre/gui2/convert/heuristics.ui
index 1578b7146c..8048bef204 100644
--- a/src/calibre/gui2/convert/heuristics.ui
+++ b/src/calibre/gui2/convert/heuristics.ui
@@ -24,7 +24,7 @@
    <item>
     <widget class="QGroupBox" name="groupBox">
      <property name="title">
-      <string>Heuristics</string>
+      <string>Heuristic Processing</string>
      </property>
      <layout class="QGridLayout" name="gridLayout">
       <item row="0" column="0" colspan="2">
diff --git a/src/calibre/manual/conversion.rst b/src/calibre/manual/conversion.rst
index c392df9a5e..94a3a60721 100644
--- a/src/calibre/manual/conversion.rst
+++ b/src/calibre/manual/conversion.rst
@@ -260,40 +260,72 @@ The Output profile also controls the screen size. This will cause, for example,
 Heuristic Processing
 ---------------------
 
-:guilabel:`Preprocess input`
-    This option activates various algorithms that try to detect and correct common cases of
-    badly formatted input documents. Things like hard line breaks, large blocks of text with no formatting, etc.
-    Turn this option on if your input document suffers from bad formatting. But be aware that in
-    some cases, this option can lead to worse results, so use with care.
+Heuristic Processing provides a variety of functions that try to detect and correct 
+common problems in poorly formatted input documents.  Use these functions if your input document suffers 
+from bad formatting. Because these functions rely on common patterns, be aware that in some cases an 
+option may lead to worse results, so use with care.  As an example, several of these options will
+remove all non-breaking-space entities.
 
-:guilabel:`Line-unwrap factor`
-    This option control the algorithm |app| uses to remove hard line breaks. For example, if the value of this
-    option is 0.4, that means calibre will remove hard line breaks from the end of lines whose lengths are less
-    than the length of 40% of all lines in the document. 
+:guilabel:`Preprocess input`
+    This option activates |app|'s Heuristic Processing stage of the conversion pipeline.
+    This must be enabled in order for various sub-functions to be applied.
 
 :guilabel:`Unwrap lines`
-    Lorem ipsum
+    Enabling this option will cause |app| to attempt to detect and correct hard line breaks that exist 
+    within a document using punctuation clues and line length.  |app| will first attempt to detect whether 
+    hard line breaks exist, if they do not appear to exist |app| will not attempt to unwrap lines.  The 
+    line-unwrap factor can be reduced if you want to 'force' |app| to unwrap lines.
+
+:guilabel:`Line-unwrap factor`
+    This option controls the algorithm |app| uses to remove hard line breaks. For example, if the value of this
+    option is 0.4, that means calibre will remove hard line breaks from the end of lines whose lengths are less
+    than the length of 40% of all lines in the document.  If your document only has a few line breaks which need
+    correction, then this value should be reduced to somewhere between 0.1 and 0.2.
     
 :guilabel:`Detect and markup unformatted chapter headings and sub headings`
-    Lorem ipsum
+    If your document does not have Chapter Markers and titles formatted differently from the rest of the text,
+    |app| can use this option to attempt detection them and surround them with heading tags. &lt;h2&gt; tags are used 
+    for chapter headings; &lt;h3&gt; tags are used for any titles that are detected.  This function will 
+    not create a TOC, but in many cases it will cause |app|'s default chapter detection settings to correctly
+    detect chapters and build a TOC.  Adjust the Xpath under Structure Detection if a TOC is not automatically
+    created.  The inserted heading tags are not formatted, to apply formatting use the 'extra_css' option under
+    the Look and Feel conversion settings.  For example, to center heading tags, use the following::
+
+        h2, h3 { text-align: center }
+
+:guilabel:`Renumber sequences of &lt;h1&gt; or &lt;h2&gt; tags`
+    Some publishers format chapter headings using multiple &lt;h1&gt; or &lt;h2&gt; tags sequentially.  
+    |app|'s default conversion settings will cause such titles to be split into two pieces.  This option 
+    will re-number the heading tags to prevent splitting.
 
-:guilabel:`Renumber sequences of &lt;h1&gt; or &lt;h2&gt; tags to prevent splitting`
-    Lorem ipsum
-    
 :guilabel:`Delete blank lines between paragraphs`
-    Lorem ipsum
+    This option will cause |app| to analyze blank lines included within the document.  If every paragraph is interleaved
+    with a blank line, then |app| will remove all those blank paragraphs.  Sequences of multiple blank lines will be
+    considered scene breaks and retained as a single paragraph.  This option differs from the 'Remove Paragraph Spacing' 
+    option under 'Look and Feel' in that it actually modifies the HTML content, while the other option modifies the document
+    styles.  This option can also remove paragraphs which were inserted using |app|'s 'Insert blank line' option.
 
 :guilabel:`Ensure scene breaks are consistently formatted`
-    Lorem ipsum
+    With this option |app| will attempt to detect common scene-break markers and ensure that they are center aligned.  
+    It also attempts to detect scene breaks defined by white space and replace them with a horizontal rule 15% of the
+    page width.  Some readers may find this desirable as these 'soft' scene breaks often become page breaks on readers, and 
+    thus become difficult to distinguish.
 
 :guilabel:`Remove unnecessary hyphens`
-    Lorem ipsum
+    |app| will analyze all hyphenated content in the document when this option is enabled.  The document itself is used
+    as a dictionary for analysis.  This allows |app| to accurately remove hyphens for any words in the document in any language, 
+    along with made-up and obscure scientific words.  The primary drawback is words appearing only a single time in the document 
+    will not be changed.  Analysis happens in two passes: the first pass analyzes line endings.  Lines are only unwrapped if the 
+    word exists with or without a hyphen in the document.  The second pass analyzes all hyphenated words throughout the document; 
+    hyphens are removed if the word exists elsewhere in the document without a hyphen.
 
 :guilabel:`Italicize common words and patterns`
-    Lorem ipsum
+    When enabled, |app| will look for common words and patterns that denote italics and italicize them.  Examples are common text
+    conventions such as ~word~ or phrases that should generally be italicized, e.g. Latin phrases like 'etc.' or 'et cetera'.
 
 :guilabel:`Replace entity indents with CSS indents`
-    Lorem ipsum
+    Some documents use a convention of defining text indents using non-breaking space entities.  When this option is enabled |app| will
+    attempt to detect this sort of formatting and convert them to a 3% text indent using css.
 
 .. _structure-detection:
 
@@ -518,15 +550,10 @@ at `mobileread <http://www.mobileread.com/forums/showthread.php?t=28313>`_.
 Convert TXT documents
 ~~~~~~~~~~~~~~~~~~~~~~
 
-TXT documents have no well defined way to specify formatting like bold, italics, etc, or document structure like paragraphs, headings, sections and so on.
-Since TXT documents provide no way to explicitly mark parts of
-the text, by default |app| only groups lines in the input document into paragraphs. The default is to assume one or
-more blank lines are a paragraph boundary::
-
-    This is the first.
-    
-    This is the
-    second paragraph.
+TXT documents have no well defined way to specify formatting like bold, italics, etc, or document 
+structure like paragraphs, headings, sections and so on, but there are a variety of conventions commonly 
+used.  By default |app| attempts automatic detection of the correct formatting and markup based on those
+conventions.
 
 TXT input supports a number of options to differentiate how paragraphs are detected.
 
@@ -534,6 +561,14 @@ TXT input supports a number of options to differentiate how paragraphs are detec
         Analyzes the text file and attempts to automatically determine how paragraphs are defined.  This
         option will generally work fine, if you achieve undesirable results try one of the manual options.
 
+    :guilabel:`Paragraph Style: Block`
+        Assumes one or more blank lines are a paragraph boundary::
+        
+            This is the first.
+    
+            This is the
+            second paragraph.
+
     :guilabel:`Paragraph Style: Single`
         Assumes that every line is a paragraph::
 
@@ -557,16 +592,23 @@ TXT input supports a number of options to differentiate how paragraphs are detec
         and median line length are used to attempt to re-create paragraphs.
 
     :guilabel:`Formatting Style: Auto`
+        Attempts to detect the type of formatting markup being used.  If no markup is used then heuristic
+        formatting will be applied.
 
     :guilabel:`Formatting Style: Heuristic`
+        Analyses the document for common chapter headings, scene breaks, and italicized words and applies the
+        appropriate html markup during conversion.
 
-    :guilabel:`Process using markdown`
+    :guilabel:`Formatting Style: Markdown`
         |app| also supports running TXT input though a transformation preprocessor known as markdown. Markdown
         allows for basic formatting to be added to TXT documents, such as bold, italics, section headings, tables,
         lists, a Table of Contents, etc. Marking chapter headings with a leading # and setting the chapter XPath detection
         expression to "//h:h1" is the easiest way to have a proper table of contents generated from a TXT document.
         You can learn more about the markdown syntax at `daringfireball <http://daringfireball.net/projects/markdown/syntax>`_.
 
+    :guilabel:`Formatting Style: None`
+        Applies no special formatting to the text; the document is converted to HTML with no other changes.
+
 
 Convert PDF documents
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~

From cff26ebcbba92ba1bf9d65e7dcc4393b156677f2 Mon Sep 17 00:00:00 2001
From: John Schember <john@nachtimwald.com>
Date: Tue, 18 Jan 2011 06:45:53 -0500
Subject: [PATCH 48/54] Rework Italicize patterns to match less false
 positives.

---
 src/calibre/ebooks/conversion/utils.py | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py
index bfb5f1c153..5fc986b7d8 100644
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@@ -139,17 +139,17 @@ class HeuristicProcessor(object):
         ]
         
         ITALICIZE_STYLE_PATS = [
-            r'(?msu)_(?P<words>[^\s][^<>_]+?[^\s])?_',
-            r'(?msu)/(?P<words>[^\s][^<>/]+?[^\s])?/',
-            r'(?msu)~~(?P<words>[^\s][^<>~]+?[^\s])?~~',
-            r'(?msu)\*(?P<words>[^\s][^<>\*]+?[^\s])?\*',
-            r'(?msu)~(?P<words>[^\s][^<>~]+?[^\s])?~',
-            r'(?msu)_/(?P<words>[^\s][^<>/_]+?[^\s])?/_',
-            r'(?msu)_\*(?P<words>[^\s][^<>\*_]+?[^\s])?\*_',
-            r'(?msu)\*/(?P<words>[^\s][^<>/\*]+?[^\s])?/\*',
-            r'(?msu)_\*/(?P<words>[^\s][^<>\*_]+?[^\s])?/\*_',
-            r'(?msu)/:(?P<words>[^\s][^<>:/]+?[^\s])?:/',
-            r'(?msu)\|:(?P<words>[^\s][^<>:\|]+?[^\s])?:\|',
+            r'(?msu)(?<=\s)_(?P<words>\S[^_]{0,40}?\S)?_(?=\s)',
+            r'(?msu)(?<=\s)/(?P<words>\S[^/]{0,40}?\S)?/(?=\s)',
+            r'(?msu)(?<=\s)~~(?P<words>\S[^~]{0,40}?\S)?~~(?=\s)',
+            r'(?msu)(?<=\s)\*(?P<words>\S[^\*]{0,40}?\S)?\*(?=\s)',
+            r'(?msu)(?<=\s)~(?P<words>\S[^~]{0,40}?\S)?~(?=\s)',
+            r'(?msu)(?<=\s)_/(?P<words>\S[^/_]{0,40}?\S)?/_(?=\s)',
+            r'(?msu)(?<=\s)_\*(?P<words>\S[^\*_]{0,40}?\S)?\*_(?=\s)',
+            r'(?msu)(?<=\s)\*/(?P<words>\S[^/\*]{0,40}?\S)?/\*(?=\s)',
+            r'(?msu)(?<=\s)_\*/(?P<words>\S[^\*_]{0,40}?\S)?/\*_(?=\s)',
+            r'(?msu)(?<=\s)/:(?P<words>\S[^:/]{0,40}?\S)?:/(?=\s)',
+            r'(?msu)(?<=\s)\|:(?P<words>\S[^:\|]{0,40}?\S)?:\|(?=\s)',
         ]
         
         for word in ITALICIZE_WORDS:

From 4fd784a9c17bf6f286819656ce2baf82f9f9bada Mon Sep 17 00:00:00 2001
From: ldolse <ldolse@yahoo.com>
Date: Tue, 18 Jan 2011 20:28:53 +0800
Subject: [PATCH 49/54] ...

---
 src/calibre/ebooks/conversion/utils.py | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py
index 5984723aa3..bcc6f5a236 100644
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@@ -496,12 +496,9 @@ class HeuristicProcessor(object):
             if hardbreaks or unwrap_factor < 0.4:
                 self.log.debug("Unwrapping required, unwrapping Lines")
                 # Dehyphenate with line length limiters
-                dehyphenator = Dehyphenator()
+                dehyphenator = Dehyphenator(self.extra_opts.verbose, self.log)
                 html = dehyphenator(html,'html', length)
                 html = self.punctuation_unwrap(length, html, 'html')
-                # unwrap remaining hyphens based on line length, but only remove if there is a match
-                dehyphenator = Dehyphenator(self.extra_opts.verbose, self.log)
-                html = dehyphenator(html,'html_cleanup', length)
 
         if getattr(self.extra_opts, 'dehyphenate', False):
             # dehyphenate in cleanup mode to fix anything previous conversions/editing missed

From 80065cb443021536762bf0fdf8d479b1b06bbd0d Mon Sep 17 00:00:00 2001
From: ldolse <ldolse@yahoo.com>
Date: Wed, 19 Jan 2011 00:18:11 +0800
Subject: [PATCH 50/54] tweaked chapter_markup for false positives/negatives

---
 src/calibre/ebooks/conversion/utils.py | 10 ++++------
 src/calibre/manual/conversion.rst      | 12 ++++++++----
 2 files changed, 12 insertions(+), 10 deletions(-)

diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py
index bcc6f5a236..812a863717 100644
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@@ -174,7 +174,7 @@ class HeuristicProcessor(object):
             if wordcount > 200000:
                 typical_chapters = 15000.
             self.min_chapters = int(ceil(wordcount / typical_chapters))
-        print "minimum chapters required are: "+str(self.min_chapters)
+        self.log.debug("minimum chapters required are: "+str(self.min_chapters))
         heading = re.compile('<h[1-3][^>]*>', re.IGNORECASE)
         self.html_preprocess_sections = len(heading.findall(html))
         self.log.debug("found " + unicode(self.html_preprocess_sections) + " pre-existing headings")
@@ -208,12 +208,12 @@ class HeuristicProcessor(object):
         n_lookahead_close = ")"
 
         default_title = r"(<[ibu][^>]*>)?\s{0,3}(?!Chapter)([\w\:\'’\"-]+\s{0,3}){1,5}?(</[ibu][^>]*>)?(?=<)"
-        simple_title = r"(<[ibu][^>]*>)?\s{0,3}(?!Chapter).{0,50}?(</[ibu][^>]*>)?(?=<)"
+        simple_title = r"(<[ibu][^>]*>)?\s{0,3}(?!(Chapter|\s+<)).{0,65}?(</[ibu][^>]*>)?(?=<)"
 
         analysis_result = []
 
         chapter_types = [
-            [r"[^'\"]?(Introduction|Synopsis|Acknowledgements|Epilogue|CHAPTER|Kapitel|Volume\s|Prologue|Book\s|Part\s|Dedication|Preface)\s*([\d\w-]+\:?\'?\s*){0,5}", True, True, True, False, "Searching for common section headings", 'common'],
+            [r"[^'\"]?(Introduction|Synopsis|Acknowledgements|Epilogue|CHAPTER|Kapitel|Volume\b|Prologue|Book\b|Part\b|Dedication|Preface)\s*([\d\w-]+\:?\'?\s*){0,5}", True, True, True, False, "Searching for common section headings", 'common'],
             [r"[^'\"]?(CHAPTER|Kapitel)\s*([\dA-Z\-\'\"\?!#,]+\s*){0,7}\s*", True, True, True, False, "Searching for most common chapter headings", 'chapter'],  # Highest frequency headings which include titles
             [r"<b[^>]*>\s*(<span[^>]*>)?\s*(?!([*#•=]+\s*)+)(\s*(?=[\d.\w#\-*\s]+<)([\d.\w#-*]+\s*){1,5}\s*)(?!\.)(</span>)?\s*</b>", True, True, True, False, "Searching for emphasized lines", 'emphasized'], # Emphasized lines
             [r"[^'\"]?(\d+(\.|:))\s*([\dA-Z\-\'\"#,]+\s*){0,7}\s*", True, True, True, False, "Searching for numeric chapter headings", 'numeric'],  # Numeric Chapters
@@ -274,10 +274,9 @@ class HeuristicProcessor(object):
                             title_req = True
                             strict_title = False
                         self.log.debug(unicode(type_name)+" had "+unicode(hits)+" hits - "+unicode(self.chapters_no_title)+" chapters with no title, "+unicode(self.chapters_with_title)+" chapters with titles, "+unicode(float(self.chapters_with_title) / float(hits))+" percent. ")
-                        print "max chapters is "+str(self.max_chapters)
                         if type_name == 'common':
                             analysis_result.append([chapter_type, n_lookahead_req, strict_title, ignorecase, title_req, log_message, type_name])
-                        elif self.min_chapters <= hits < self.max_chapters:
+                        elif self.min_chapters <= hits < max_chapters:
                             analysis_result.append([chapter_type, n_lookahead_req, strict_title, ignorecase, title_req, log_message, type_name])
                             break
                 else:
@@ -423,7 +422,6 @@ class HeuristicProcessor(object):
         except:
             self.log.warn("Can't get wordcount")
 
-        print "found "+unicode(self.totalwords)+" words in the flow"
         if self.totalwords < 50:
             self.log.warn("flow is too short, not running heuristics")
             return html
diff --git a/src/calibre/manual/conversion.rst b/src/calibre/manual/conversion.rst
index 94a3a60721..e7c09a57a5 100644
--- a/src/calibre/manual/conversion.rst
+++ b/src/calibre/manual/conversion.rst
@@ -285,10 +285,14 @@ remove all non-breaking-space entities.
 :guilabel:`Detect and markup unformatted chapter headings and sub headings`
     If your document does not have Chapter Markers and titles formatted differently from the rest of the text,
     |app| can use this option to attempt detection them and surround them with heading tags. &lt;h2&gt; tags are used 
-    for chapter headings; &lt;h3&gt; tags are used for any titles that are detected.  This function will 
-    not create a TOC, but in many cases it will cause |app|'s default chapter detection settings to correctly
-    detect chapters and build a TOC.  Adjust the Xpath under Structure Detection if a TOC is not automatically
-    created.  The inserted heading tags are not formatted, to apply formatting use the 'extra_css' option under
+    for chapter headings; &lt;h3&gt; tags are used for any titles that are detected.  
+    
+    This function will not create a TOC, but in many cases it will cause |app|'s default chapter detection settings 
+    to correctly detect chapters and build a TOC.  Adjust the Xpath under Structure Detection if a TOC is not automatically
+    created.  If there are no other headings used in the document then setting "//h:h2" under Structure Detection would
+    be the easiest way to create a TOC for the document.
+    
+    The inserted headings are not formatted, to apply formatting use the 'extra_css' option under
     the Look and Feel conversion settings.  For example, to center heading tags, use the following::
 
         h2, h3 { text-align: center }

From b4c5cd0122b0afd540862c1eee7708a3be1d9baa Mon Sep 17 00:00:00 2001
From: ldolse <ldolse@yahoo.com>
Date: Wed, 19 Jan 2011 00:41:59 +0800
Subject: [PATCH 51/54] fix calls to create_oebbook, mark several strings as
 unicode

---
 src/calibre/ebooks/chm/input.py        | 2 +-
 src/calibre/ebooks/conversion/utils.py | 6 +++---
 src/calibre/ebooks/html/input.py       | 2 +-
 src/calibre/ebooks/snb/input.py        | 2 +-
 4 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/src/calibre/ebooks/chm/input.py b/src/calibre/ebooks/chm/input.py
index c4b124fe98..89efa2b4d1 100644
--- a/src/calibre/ebooks/chm/input.py
+++ b/src/calibre/ebooks/chm/input.py
@@ -75,7 +75,7 @@ class CHMInput(InputFormatPlugin):
     def _create_oebbook(self, hhcpath, basedir, opts, log, mi):
         from calibre.ebooks.conversion.plumber import create_oebbook
         from calibre.ebooks.oeb.base import DirContainer
-        oeb = create_oebbook(log, None, opts, self,
+        oeb = create_oebbook(log, None, opts,
                 encoding=opts.input_encoding, populate=False)
         self.oeb = oeb
 
diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py
index 812a863717..9ae8e5ab6f 100644
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@@ -299,9 +299,9 @@ class HeuristicProcessor(object):
         supports a range of html markup and text files
         '''
         # define the pieces of the regex
-        lookahead = "(?<=.{"+str(length)+"}([a-zäëïöüàèìòùáćéíóńśúâêîôûçąężıãõñæøþðßě,:)\IA\u00DF]|(?<!\&\w{4});))" # (?<!\&\w{4});) is a semicolon not part of an entity
-        em_en_lookahead = "(?<=.{"+str(length)+"}[\u2013\u2014])"
-        soft_hyphen = "\xad"
+        lookahead = "(?<=.{"+str(length)+u"}([a-zäëïöüàèìòùáćéíóńśúâêîôûçąężıãõñæøþðßě,:)\IA\u00DF]|(?<!\&\w{4});))" # (?<!\&\w{4});) is a semicolon not part of an entity
+        em_en_lookahead = "(?<=.{"+str(length)+u"}[\u2013\u2014])"
+        soft_hyphen = u"\xad"
         line_ending = "\s*</(span|[iubp]|div)>\s*(</(span|[iubp]|div)>)?"
         blanklines = "\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)</(span|p|div)>\s*){0,3}\s*"
         line_opening = "<(span|[iubp]|div)[^>]*>\s*(<(span|[iubp]|div)[^>]*>)?\s*"
diff --git a/src/calibre/ebooks/html/input.py b/src/calibre/ebooks/html/input.py
index ed0bf7b3ef..080faffae6 100644
--- a/src/calibre/ebooks/html/input.py
+++ b/src/calibre/ebooks/html/input.py
@@ -295,7 +295,7 @@ class HTMLInput(InputFormatPlugin):
             return oeb
 
         from calibre.ebooks.conversion.plumber import create_oebbook
-        return create_oebbook(log, stream.name, opts, self,
+        return create_oebbook(log, stream.name, opts,
                 encoding=opts.input_encoding)
 
     def is_case_sensitive(self, path):
diff --git a/src/calibre/ebooks/snb/input.py b/src/calibre/ebooks/snb/input.py
index d2acb257aa..100ac1447f 100755
--- a/src/calibre/ebooks/snb/input.py
+++ b/src/calibre/ebooks/snb/input.py
@@ -41,7 +41,7 @@ class SNBInput(InputFormatPlugin):
             raise ValueError("Invalid SNB file")
         log.debug("Handle meta data ...")
         from calibre.ebooks.conversion.plumber import create_oebbook
-        oeb = create_oebbook(log, None, options, self,
+        oeb = create_oebbook(log, None, options,
                 encoding=options.input_encoding, populate=False)
         meta = snbFile.GetFileStream('snbf/book.snbf')
         if meta != None:

From 01584b07841f27494b897b4b398576a0bbbb9746 Mon Sep 17 00:00:00 2001
From: ldolse <ldolse@yahoo.com>
Date: Wed, 19 Jan 2011 01:32:22 +0800
Subject: [PATCH 52/54] removed rtf preprocess call

---
 src/calibre/ebooks/rtf/input.py | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/src/calibre/ebooks/rtf/input.py b/src/calibre/ebooks/rtf/input.py
index d3d8c78dbd..ca6f2c7b95 100644
--- a/src/calibre/ebooks/rtf/input.py
+++ b/src/calibre/ebooks/rtf/input.py
@@ -321,9 +321,6 @@ class RTFInput(InputFormatPlugin):
             res = re.sub('\s*<body>', '<body>', res)
             res = re.sub('(?<=\n)\n{2}',
                     u'<p>\u00a0</p>\n'.encode('utf-8'), res)
-            if self.opts.enable_heuristics:
-                preprocessor = PreProcessor(self.opts, log=getattr(self, 'log', None))
-                res = preprocessor(res.decode('utf-8')).encode('utf-8')
             f.write(res)
         self.write_inline_css(inline_class, border_styles)
         stream.seek(0)

From 3c45dba7ccb24e6328236c65c04c43b2378d5d03 Mon Sep 17 00:00:00 2001
From: ldolse <ldolse@yahoo.com>
Date: Wed, 19 Jan 2011 02:01:40 +0800
Subject: [PATCH 53/54] ...

---
 src/calibre/ebooks/conversion/preprocess.py | 2 +-
 src/calibre/ebooks/conversion/utils.py      | 6 +++---
 src/calibre/ebooks/txt/input.py             | 4 ++--
 3 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py
index da20af6e8a..bbd71ede3a 100644
--- a/src/calibre/ebooks/conversion/preprocess.py
+++ b/src/calibre/ebooks/conversion/preprocess.py
@@ -175,7 +175,7 @@ class Dehyphenator(object):
     '''
 
     def __init__(self, verbose=0, log=None):
-        self.log = default_log if log is None else log
+        self.log = log
         self.verbose = verbose
         # Add common suffixes to the regex below to increase the likelihood of a match -
         # don't add suffixes which are also complete words, such as 'able' or 'sex'
diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py
index 9ae8e5ab6f..4663eeccdf 100644
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@@ -92,8 +92,8 @@ class HeuristicProcessor(object):
         line_end = line_end_ere.findall(raw)
         tot_htm_ends = len(htm_end)
         tot_ln_fds = len(line_end)
-        self.log.debug("There are " + unicode(tot_ln_fds) + " total Line feeds, and " +
-                unicode(tot_htm_ends) + " marked up endings")
+        #self.log.debug("There are " + unicode(tot_ln_fds) + " total Line feeds, and " +
+        #        unicode(tot_htm_ends) + " marked up endings")
 
         if percent > 1:
             percent = 1
@@ -101,7 +101,7 @@ class HeuristicProcessor(object):
             percent = 0
 
         min_lns = tot_ln_fds * percent
-        self.log.debug("There must be fewer than " + unicode(min_lns) + " unmarked lines to add markup")
+        #self.log.debug("There must be fewer than " + unicode(min_lns) + " unmarked lines to add markup")
         if min_lns > tot_htm_ends:
             return True
 
diff --git a/src/calibre/ebooks/txt/input.py b/src/calibre/ebooks/txt/input.py
index c918d145f4..dd14de2d20 100644
--- a/src/calibre/ebooks/txt/input.py
+++ b/src/calibre/ebooks/txt/input.py
@@ -118,11 +118,11 @@ class TXTInput(InputFormatPlugin):
                 txt = separate_paragraphs_print_formatted(txt)
 
             if options.paragraph_type == 'unformatted':
-                from calibre.ebooks.conversion.utils import PreProcessor
+                from calibre.ebooks.conversion.utils import HeuristicProcessor
                 # get length
 
                 # unwrap lines based on punctuation
-                preprocessor = PreProcessor(options, log=getattr(self, 'log', None))
+                preprocessor = HeuristicProcessor(options, log=getattr(self, 'log', None))
                 txt = preprocessor.punctuation_unwrap(length, txt, 'txt')
 
             flow_size = getattr(options, 'flow_size', 0)

From ca89710f65059c1148a6da1d44b040b47a4f8335 Mon Sep 17 00:00:00 2001
From: ldolse <ldolse@yahoo.com>
Date: Wed, 19 Jan 2011 02:18:49 +0800
Subject: [PATCH 54/54] doc update

---
 src/calibre/manual/conversion.rst | 31 ++++++++++++++++---------------
 1 file changed, 16 insertions(+), 15 deletions(-)

diff --git a/src/calibre/manual/conversion.rst b/src/calibre/manual/conversion.rst
index e7c09a57a5..2bc5687262 100644
--- a/src/calibre/manual/conversion.rst
+++ b/src/calibre/manual/conversion.rst
@@ -331,6 +331,22 @@ remove all non-breaking-space entities.
     Some documents use a convention of defining text indents using non-breaking space entities.  When this option is enabled |app| will
     attempt to detect this sort of formatting and convert them to a 3% text indent using css.
 
+.. _search-replace:
+
+Search & Replace
+---------------------
+
+These options are useful primarily for conversion of PDF documents. Often, the conversion leaves
+behind page headers and footers in the text. These options use regular expressions to try and detect
+the headers and footers and remove them. Remember that they operate on the intermediate XHTML produced
+by the conversion pipeline. There is also a wizard to help you customize the regular expressions for
+your document.  These options can also be used for generic search and replace of any content by additionally 
+specifying a replacement expression.
+
+The search works by using a python regular expression. All matched text is simply removed from
+the document or replaced using the replacement pattern. You can learn more about regular expressions and 
+their syntax at http://docs.python.org/library/re.html.
+
 .. _structure-detection:
 
 Structure Detection
@@ -374,21 +390,6 @@ which means that |app| will insert page breaks before every `<h1>` and `<h2>` ta
     
     The default expressions may change depending on the input format you are converting.
 
-Removing headers and footers
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-These options are useful primarily for conversion of PDF documents. Often, the conversion leaves
-behind page headers and footers in the text. These options use regular expressions to try and detect
-the headers and footers and remove them. Remember that they operate on the intermediate XHTML produced
-by the conversion pipeline. There is also a wizard to help you customize the regular expressions for
-your document.
-
-The header and footer regular expressions are used in conjunction with the remove header and footer options.
-If the remove option is not enabled the regular expression will not be applied to remove the matched text.
-The removal works by using a python regular expression. All matched text is simply removed from
-the document. You can learn more about regular expressions and their syntax at
-http://docs.python.org/library/re.html.
-
 Miscellaneous
 ~~~~~~~~~~~~~~