Kovid Goyal 2011-01-18 14:15:04 -07:00
commit ed5bb2390a
32 changed files with 1200 additions and 665 deletions

View File

@@ -160,18 +160,6 @@ class InputFormatPlugin(Plugin):
        '''
        raise NotImplementedError()

-    def preprocess_html(self, opts, html):
-        '''
-        This method is called by the conversion pipeline on all HTML before it
-        is parsed. It is meant to be used to do any required preprocessing on
-        the HTML, like removing hard line breaks, etc.
-
-        :param html: A unicode string
-        :return: A unicode string
-        '''
-        return html
-
    def convert(self, stream, options, file_ext, log, accelerators):
        '''
        This method must be implemented in sub-classes. It must return

View File

@@ -75,7 +75,7 @@ class CHMInput(InputFormatPlugin):
    def _create_oebbook(self, hhcpath, basedir, opts, log, mi):
        from calibre.ebooks.conversion.plumber import create_oebbook
        from calibre.ebooks.oeb.base import DirContainer
-        oeb = create_oebbook(log, None, opts, self,
+        oeb = create_oebbook(log, None, opts,
                encoding=opts.input_encoding, populate=False)
        self.oeb = oeb

View File

@@ -126,8 +126,27 @@ def add_pipeline_options(parser, plumber):
                'margin_top', 'margin_left', 'margin_right',
                'margin_bottom', 'change_justification',
                'insert_blank_line', 'remove_paragraph_spacing','remove_paragraph_spacing_indent_size',
-                'asciiize', 'remove_header', 'header_regex',
-                'remove_footer', 'footer_regex',
+                'asciiize',
+                ]
+                ),
+
+'HEURISTIC PROCESSING' : (
+              _('Modify the document text and structure using common patterns.'),
+              [
+                'enable_heuristics', 'markup_chapter_headings',
+                'italicize_common_cases', 'fix_indents',
+                'html_unwrap_factor', 'unwrap_lines',
+                'delete_blank_paragraphs', 'format_scene_breaks',
+                'dehyphenate', 'renumber_headings',
+              ]
+              ),
+
+'SEARCH AND REPLACE' : (
+              _('Modify the document text and structure using user defined patterns.'),
+              [
+                'sr1_search', 'sr1_replace',
+                'sr2_search', 'sr2_replace',
+                'sr3_search', 'sr3_replace',
                ]
                ),

@@ -137,7 +156,6 @@ def add_pipeline_options(parser, plumber):
                'chapter', 'chapter_mark',
                'prefer_metadata_cover', 'remove_first_image',
                'insert_metadata', 'page_breaks_before',
-                'preprocess_html', 'html_unwrap_factor',
                ]
                ),

@@ -164,7 +182,8 @@ def add_pipeline_options(parser, plumber):
    }

-    group_order = ['', 'LOOK AND FEEL', 'STRUCTURE DETECTION',
+    group_order = ['', 'LOOK AND FEEL', 'HEURISTIC PROCESSING',
+            'SEARCH AND REPLACE', 'STRUCTURE DETECTION',
            'TABLE OF CONTENTS', 'METADATA', 'DEBUG']

    for group in group_order:

View File

@@ -376,23 +376,6 @@ OptionRecommendation(name='insert_metadata',
            )
        ),

-OptionRecommendation(name='preprocess_html',
-        recommended_value=False, level=OptionRecommendation.LOW,
-        help=_('Attempt to detect and correct hard line breaks and other '
-            'problems in the source file. This may make things worse, so use '
-            'with care.'
-            )
-        ),
-
-OptionRecommendation(name='html_unwrap_factor',
-        recommended_value=0.40, level=OptionRecommendation.LOW,
-        help=_('Scale used to determine the length at which a line should '
-            'be unwrapped if preprocess is enabled. Valid values are a decimal between 0 and 1. The '
-            'default is 0.40, just below the median line length. This will unwrap typical books '
-            ' with hard line breaks, but should be reduced if the line length is variable.'
-            )
-        ),
-
OptionRecommendation(name='smarten_punctuation',
        recommended_value=False, level=OptionRecommendation.LOW,
        help=_('Convert plain quotes, dashes and ellipsis to their '

@@ -401,32 +384,6 @@ OptionRecommendation(name='smarten_punctuation',
            )
        ),

-OptionRecommendation(name='remove_header',
-        recommended_value=False, level=OptionRecommendation.LOW,
-        help=_('Use a regular expression to try and remove the header.'
-            )
-        ),
-
-OptionRecommendation(name='header_regex',
-        recommended_value='(?i)(?<=<hr>)((\s*<a name=\d+></a>((<img.+?>)*<br>\s*)?\d+<br>\s*.*?\s*)|(\s*<a name=\d+></a>((<img.+?>)*<br>\s*)?.*?<br>\s*\d+))(?=<br>)',
-        level=OptionRecommendation.LOW,
-        help=_('The regular expression to use to remove the header.'
-            )
-        ),
-
-OptionRecommendation(name='remove_footer',
-        recommended_value=False, level=OptionRecommendation.LOW,
-        help=_('Use a regular expression to try and remove the footer.'
-            )
-        ),
-
-OptionRecommendation(name='footer_regex',
-        recommended_value='(?i)(?<=<hr>)((\s*<a name=\d+></a>((<img.+?>)*<br>\s*)?\d+<br>\s*.*?\s*)|(\s*<a name=\d+></a>((<img.+?>)*<br>\s*)?.*?<br>\s*\d+))(?=<br>)',
-        level=OptionRecommendation.LOW,
-        help=_('The regular expression to use to remove the footer.'
-            )
-        ),
-
OptionRecommendation(name='read_metadata_from_opf',
        recommended_value=None, level=OptionRecommendation.LOW,
        short_switch='m',
@@ -527,6 +484,89 @@ OptionRecommendation(name='timestamp',
        recommended_value=None, level=OptionRecommendation.LOW,
        help=_('Set the book timestamp (used by the date column in calibre).')),
OptionRecommendation(name='enable_heuristics',
recommended_value=False, level=OptionRecommendation.LOW,
help=_('Enable heuristic processing. This option must be set for any '
'heuristic processing to take place.')),
OptionRecommendation(name='markup_chapter_headings',
recommended_value=False, level=OptionRecommendation.LOW,
help=_('Detect unformatted chapter headings and sub headings. Change '
'them to h2 and h3 tags. This setting will not create a TOC, '
'but can be used in conjunction with structure detection to create '
'one.')),
OptionRecommendation(name='italicize_common_cases',
recommended_value=False, level=OptionRecommendation.LOW,
help=_('Look for common words and patterns that denote '
'italics and italicize them.')),
OptionRecommendation(name='fix_indents',
recommended_value=False, level=OptionRecommendation.LOW,
help=_('Turn indentation created from multiple non-breaking space entities '
'into CSS indents.')),
OptionRecommendation(name='html_unwrap_factor',
recommended_value=0.40, level=OptionRecommendation.LOW,
help=_('Scale used to determine the length at which a line should '
'be unwrapped. Valid values are a decimal between 0 and 1. The '
'default is 0.4, just below the median line length. If only a '
'few lines in the document require unwrapping this value should '
'be reduced')),
OptionRecommendation(name='unwrap_lines',
recommended_value=False, level=OptionRecommendation.LOW,
help=_('Unwrap lines using punctuation and other formatting clues.')),
OptionRecommendation(name='delete_blank_paragraphs',
recommended_value=False, level=OptionRecommendation.LOW,
help=_('Remove empty paragraphs from the document when they exist between '
'every other paragraph')),
OptionRecommendation(name='format_scene_breaks',
recommended_value=False, level=OptionRecommendation.LOW,
help=_('Left aligned scene break markers are center aligned. '
'Replace soft scene breaks that use multiple blank lines with '
'horizontal rules.')),
OptionRecommendation(name='dehyphenate',
recommended_value=False, level=OptionRecommendation.LOW,
help=_('Analyses hyphenated words throughout the document. The '
'document itself is used as a dictionary to determine whether hyphens '
'should be retained or removed.')),
OptionRecommendation(name='renumber_headings',
recommended_value=False, level=OptionRecommendation.LOW,
help=_('Looks for occurrences of sequential <h1> or <h2> tags. '
'The tags are renumbered to prevent splitting in the middle '
'of chapter headings.')),
OptionRecommendation(name='sr1_search',
recommended_value='', level=OptionRecommendation.LOW,
help=_('Search pattern (regular expression) to be replaced with '
'sr1-replace.')),
OptionRecommendation(name='sr1_replace',
recommended_value='', level=OptionRecommendation.LOW,
help=_('Replace characters to replace the text found with sr1-search.')),
OptionRecommendation(name='sr2_search',
recommended_value='', level=OptionRecommendation.LOW,
help=_('Search pattern (regular expression) to be replaced with '
'sr2-replace.')),
OptionRecommendation(name='sr2_replace',
recommended_value='', level=OptionRecommendation.LOW,
help=_('Replace characters to replace the text found with sr2-search.')),
OptionRecommendation(name='sr3_search',
recommended_value='', level=OptionRecommendation.LOW,
help=_('Search pattern (regular expression) to be replaced with '
'sr3-replace.')),
OptionRecommendation(name='sr3_replace',
recommended_value='', level=OptionRecommendation.LOW,
help=_('Replace characters to replace the text found with sr3-search.')),
]
# }}}
@@ -861,7 +901,6 @@ OptionRecommendation(name='timestamp',
            self.opts_to_mi(self.user_metadata)
            if not hasattr(self.oeb, 'manifest'):
                self.oeb = create_oebbook(self.log, self.oeb, self.opts,
-                        self.input_plugin,
                        encoding=self.input_plugin.output_encoding)
            self.input_plugin.postprocess_book(self.oeb, self.opts, self.log)
            self.opts.is_image_collection = self.input_plugin.is_image_collection
@@ -971,14 +1010,13 @@ OptionRecommendation(name='timestamp',
        self.log(self.output_fmt.upper(), 'output written to', self.output)
        self.flush()

-def create_oebbook(log, path_or_stream, opts, input_plugin, reader=None,
+def create_oebbook(log, path_or_stream, opts, reader=None,
        encoding='utf-8', populate=True):
    '''
    Create an OEBBook.
    '''
    from calibre.ebooks.oeb.base import OEBBook
-    html_preprocessor = HTMLPreProcessor(input_plugin.preprocess_html,
-            opts.preprocess_html, opts)
+    html_preprocessor = HTMLPreProcessor(log, opts)
    if not encoding:
        encoding = None
    oeb = OEBBook(log, html_preprocessor,
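With this change the preprocessor is built from just the log and the conversion options rather than the two plugin callbacks it used to receive. A minimal sketch of the new wiring, with SimpleNamespace standing in for calibre's real options object (the attribute names are the OptionRecommendation names added above; everything else is read with getattr and a default):

from types import SimpleNamespace
from calibre.ebooks.conversion.preprocess import HTMLPreProcessor

# Stand-in options object; only a few of the new heuristic/search-replace
# options are shown here.
opts = SimpleNamespace(
    enable_heuristics=True,
    sr1_search=r'\bteh\b', sr1_replace='the',
    sr2_search='', sr2_replace='',
    sr3_search='', sr3_replace='',
    keep_ligatures=False,
)

# create_oebbook() above now builds the preprocessor essentially like this:
html_preprocessor = HTMLPreProcessor(log=None, extra_opts=opts)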

View File

@@ -174,13 +174,19 @@ class Dehyphenator(object):
    retain hyphens.
    '''

-    def __init__(self):
+    def __init__(self, verbose=0, log=None):
+        self.log = log
+        self.verbose = verbose
        # Add common suffixes to the regex below to increase the likelihood of a match -
        # don't add suffixes which are also complete words, such as 'able' or 'sex'
-        self.removesuffixes = re.compile(r"((ed)?ly|('e)?s|a?(t|s)?ion(s|al(ly)?)?|ings?|er|(i)?ous|(i|a)ty|(it)?ies|ive|gence|istic(ally)?|(e|a)nce|m?ents?|ism|ated|(e|u)ct(ed)?|ed|(i|ed)?ness|(e|a)ncy|ble|ier|al|ex|ian)$", re.IGNORECASE)
+        # only remove if it's not already the point of hyphenation
+        self.suffix_string = "((ed)?ly|'?e?s||a?(t|s)?ion(s|al(ly)?)?|ings?|er|(i)?ous|(i|a)ty|(it)?ies|ive|gence|istic(ally)?|(e|a)nce|m?ents?|ism|ated|(e|u)ct(ed)?|ed|(i|ed)?ness|(e|a)ncy|ble|ier|al|ex|ian)$"
+        self.suffixes = re.compile(r"^%s" % self.suffix_string, re.IGNORECASE)
+        self.removesuffixes = re.compile(r"%s" % self.suffix_string, re.IGNORECASE)
        # remove prefixes if the prefix was not already the point of hyphenation
-        self.prefixes = re.compile(r'^(dis|re|un|in|ex)$', re.IGNORECASE)
-        self.removeprefix = re.compile(r'^(dis|re|un|in|ex)', re.IGNORECASE)
+        self.prefix_string = '^(dis|re|un|in|ex)'
+        self.prefixes = re.compile(r'%s$' % self.prefix_string, re.IGNORECASE)
+        self.removeprefix = re.compile(r'%s' % self.prefix_string, re.IGNORECASE)

    def dehyphenate(self, match):
        firsthalf = match.group('firstpart')
@@ -191,31 +197,44 @@ class Dehyphenator(object):
            wraptags = ''
        hyphenated = unicode(firsthalf) + "-" + unicode(secondhalf)
        dehyphenated = unicode(firsthalf) + unicode(secondhalf)
-        lookupword = self.removesuffixes.sub('', dehyphenated)
-        if self.prefixes.match(firsthalf) is None:
+        if self.suffixes.match(secondhalf) is None:
+            lookupword = self.removesuffixes.sub('', dehyphenated)
+        else:
+            lookupword = dehyphenated
+        if len(firsthalf) > 3 and self.prefixes.match(firsthalf) is None:
            lookupword = self.removeprefix.sub('', lookupword)
-        #print "lookup word is: "+str(lookupword)+", orig is: " + str(hyphenated)
+        if self.verbose > 2:
+            self.log("lookup word is: "+str(lookupword)+", orig is: " + str(hyphenated))
        try:
            searchresult = self.html.find(lookupword.lower())
        except:
            return hyphenated
        if self.format == 'html_cleanup' or self.format == 'txt_cleanup':
            if self.html.find(lookupword) != -1 or searchresult != -1:
-                #print "Cleanup:returned dehyphenated word: " + str(dehyphenated)
+                if self.verbose > 2:
+                    self.log("  Cleanup:returned dehyphenated word: " + str(dehyphenated))
                return dehyphenated
            elif self.html.find(hyphenated) != -1:
-                #print "Cleanup:returned hyphenated word: " + str(hyphenated)
+                if self.verbose > 2:
+                    self.log("  Cleanup:returned hyphenated word: " + str(hyphenated))
                return hyphenated
            else:
-                #print "Cleanup:returning original text "+str(firsthalf)+" + linefeed "+str(secondhalf)
+                if self.verbose > 2:
+                    self.log("  Cleanup:returning original text "+str(firsthalf)+" + linefeed "+str(secondhalf))
                return firsthalf+u'\u2014'+wraptags+secondhalf
        else:
+            if len(firsthalf) <= 2 and len(secondhalf) <= 2:
+                if self.verbose > 2:
+                    self.log("too short, returned hyphenated word: " + str(hyphenated))
+                return hyphenated
            if self.html.find(lookupword) != -1 or searchresult != -1:
-                #print "returned dehyphenated word: " + str(dehyphenated)
+                if self.verbose > 2:
+                    self.log("  returned dehyphenated word: " + str(dehyphenated))
                return dehyphenated
            else:
-                #print "      returned hyphenated word: " + str(hyphenated)
+                if self.verbose > 2:
+                    self.log("  returned hyphenated word: " + str(hyphenated))
                return hyphenated

    def __call__(self, html, format, length=1):
@@ -228,7 +247,7 @@ class Dehyphenator(object):
        elif format == 'txt':
            intextmatch = re.compile(u'(?<=.{%i})(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)(-|)(\u0020|\u0009)*(?P<wraptags>(\n(\u0020|\u0009)*)+)(?P<secondpart>[\w\d]+)'% length)
        elif format == 'individual_words':
-            intextmatch = re.compile(u'>[^<]*\b(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)"\s>]+)(-|)\u0020*(?P<secondpart>\w+)\b[^<]*<') # for later, not called anywhere yet
+            intextmatch = re.compile(u'(?!<)(?P<firstpart>\w+)(-|)\s*(?P<secondpart>\w+)(?![^<]*?>)')
        elif format == 'html_cleanup':
            intextmatch = re.compile(u'(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)(-|)\s*(?=<)(?P<wraptags></span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*(?P<secondpart>[\w\d]+)')
        elif format == 'txt_cleanup':
@@ -397,10 +416,8 @@ class HTMLPreProcessor(object):
                  (re.compile('<span[^><]*?id=subtitle[^><]*?>(.*?)</span>', re.IGNORECASE|re.DOTALL),
                   lambda match : '<h3 class="subtitle">%s</h3>'%(match.group(1),)),
                 ]

-    def __init__(self, input_plugin_preprocess, plugin_preprocess,
-            extra_opts=None):
-        self.input_plugin_preprocess = input_plugin_preprocess
-        self.plugin_preprocess = plugin_preprocess
+    def __init__(self, log=None, extra_opts=None):
+        self.log = log
        self.extra_opts = extra_opts

    def is_baen(self, src):
@@ -436,27 +453,19 @@ class HTMLPreProcessor(object):
        if not getattr(self.extra_opts, 'keep_ligatures', False):
            html = _ligpat.sub(lambda m:LIGATURES[m.group()], html)

+        for search, replace in [['sr3_search', 'sr3_replace'], ['sr2_search', 'sr2_replace'], ['sr1_search', 'sr1_replace']]:
+            search_pattern = getattr(self.extra_opts, search, '')
+            if search_pattern:
+                try:
+                    search_re = re.compile(search_pattern)
+                    replace_txt = getattr(self.extra_opts, replace, '')
+                    if replace_txt == None:
+                        replace_txt = ''
+                    rules.insert(0, (search_re, replace_txt))
+                except Exception as e:
+                    self.log.error('Failed to parse %s regexp because %s' % (search, e))
+
        end_rules = []
-        if getattr(self.extra_opts, 'remove_header', None):
-            try:
-                rules.insert(0,
-                    (re.compile(self.extra_opts.header_regex), lambda match : '')
-                )
-            except:
-                import traceback
-                print 'Failed to parse remove_header regexp'
-                traceback.print_exc()
-
-        if getattr(self.extra_opts, 'remove_footer', None):
-            try:
-                rules.insert(0,
-                    (re.compile(self.extra_opts.footer_regex), lambda match : '')
-                )
-            except:
-                import traceback
-                print 'Failed to parse remove_footer regexp'
-                traceback.print_exc()
-
        # delete soft hyphens - moved here so it's executed after header/footer removal
        if is_pdftohtml:
            # unwrap/delete soft hyphens
@@ -464,12 +473,6 @@ class HTMLPreProcessor(object):
            # unwrap/delete soft hyphens with formatting
            end_rules.append((re.compile(u'[­]\s*(</(i|u|b)>)+(</p>\s*<p>\s*)+\s*(<(i|u|b)>)+\s*(?=[[a-z\d])'), lambda match: ''))

-        # Make the more aggressive chapter marking regex optional with the preprocess option to
-        # reduce false positives and move after header/footer removal
-        if getattr(self.extra_opts, 'preprocess_html', None):
-            if is_pdftohtml:
-                end_rules.append((re.compile(r'<p>\s*(?P<chap>(<[ibu]>){0,2}\s*([A-Z \'"!]{3,})\s*([\dA-Z:]+\s){0,4}\s*(</[ibu]>){0,2})\s*<p>\s*(?P<title>(<[ibu]>){0,2}(\s*\w+){1,4}\s*(</[ibu]>){0,2}\s*<p>)?'), chap_head),)
-
        length = -1
        if getattr(self.extra_opts, 'unwrap_factor', 0.0) > 0.01:
            docanalysis = DocAnalysis('pdf', html)
@@ -512,15 +515,14 @@ class HTMLPreProcessor(object):

        if is_pdftohtml and length > -1:
            # Dehyphenate
-            dehyphenator = Dehyphenator()
+            dehyphenator = Dehyphenator(self.extra_opts.verbose, self.log)
            html = dehyphenator(html,'html', length)

        if is_pdftohtml:
-            from calibre.ebooks.conversion.utils import PreProcessor
-            pdf_markup = PreProcessor(self.extra_opts, None)
+            from calibre.ebooks.conversion.utils import HeuristicProcessor
+            pdf_markup = HeuristicProcessor(self.extra_opts, None)
            totalwords = 0
-            totalwords = pdf_markup.get_word_count(html)
-            if totalwords > 7000:
+            if pdf_markup.get_word_count(html) > 7000:
                html = pdf_markup.markup_chapters(html, totalwords, True)

        #dump(html, 'post-preprocess')
@@ -540,8 +542,10 @@ class HTMLPreProcessor(object):
            unidecoder = Unidecoder()
            html = unidecoder.decode(html)

-        if self.plugin_preprocess:
-            html = self.input_plugin_preprocess(self.extra_opts, html)
+        if getattr(self.extra_opts, 'enable_heuristics', False):
+            from calibre.ebooks.conversion.utils import HeuristicProcessor
+            preprocessor = HeuristicProcessor(self.extra_opts, self.log)
+            html = preprocessor(html)

        if getattr(self.extra_opts, 'smarten_punctuation', False):
            html = self.smarten_punctuation(html)
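Note the ordering in the search-and-replace loop above: the user patterns are compiled in the order sr3, sr2, sr1 and each is inserted at the front of the rule list, so sr1 ends up running first. A small standalone sketch of that behaviour with plain re (hypothetical patterns, no calibre objects):

import re

# Hypothetical user patterns; the keys mirror the sr*_search/sr*_replace options.
opts = {'sr1_search': r'\bteh\b', 'sr1_replace': 'the',
        'sr2_search': r'\s+(?=</p>)', 'sr2_replace': '',
        'sr3_search': '', 'sr3_replace': ''}

rules = []
for search, replace in [['sr3_search', 'sr3_replace'],
                        ['sr2_search', 'sr2_replace'],
                        ['sr1_search', 'sr1_replace']]:
    pattern = opts.get(search, '')
    if pattern:
        # insert(0, ...) means the rule compiled last (sr1) is applied first
        rules.insert(0, (re.compile(pattern), opts.get(replace) or ''))

html = '<p>teh cat sat </p>'
for regex, replacement in rules:
    html = regex.sub(replacement, html)
print(html)   # <p>the cat sat</p>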

View File

@@ -11,13 +11,22 @@ from calibre.ebooks.conversion.preprocess import DocAnalysis, Dehyphenator
from calibre.utils.logging import default_log
from calibre.utils.wordcount import get_wordcount_obj

-class PreProcessor(object):
+class HeuristicProcessor(object):

    def __init__(self, extra_opts=None, log=None):
        self.log = default_log if log is None else log
        self.html_preprocess_sections = 0
        self.found_indents = 0
        self.extra_opts = extra_opts
+        self.deleted_nbsps = False
+        self.totalwords = 0
+        self.min_chapters = 1
+        self.chapters_no_title = 0
+        self.chapters_with_title = 0
+        self.blanks_deleted = False
+        self.linereg = re.compile('(?<=<p).*?(?=</p>)', re.IGNORECASE|re.DOTALL)
+        self.blankreg = re.compile(r'\s*(?P<openline><p(?!\sid=\"softbreak\")[^>]*>)\s*(?P<closeline></p>)', re.IGNORECASE)
+        self.multi_blank = re.compile(r'(\s*<p[^>]*>\s*</p>){2,}', re.IGNORECASE)

    def is_pdftohtml(self, src):
        return '<!-- created by calibre\'s pdftohtml -->' in src[:1000]
@@ -27,12 +36,12 @@ class PreProcessor(object):
        title = match.group('title')
        if not title:
            self.html_preprocess_sections = self.html_preprocess_sections + 1
-            self.log("marked " + unicode(self.html_preprocess_sections) +
+            self.log.debug("marked " + unicode(self.html_preprocess_sections) +
                    " chapters. - " + unicode(chap))
            return '<h2>'+chap+'</h2>\n'
        else:
            self.html_preprocess_sections = self.html_preprocess_sections + 1
-            self.log("marked " + unicode(self.html_preprocess_sections) +
+            self.log.debug("marked " + unicode(self.html_preprocess_sections) +
                    " chapters & titles. - " + unicode(chap) + ", " + unicode(title))
            return '<h2>'+chap+'</h2>\n<h3>'+title+'</h3>\n'
@@ -40,10 +49,18 @@ class PreProcessor(object):
        chap = match.group('section')
        styles = match.group('styles')
        self.html_preprocess_sections = self.html_preprocess_sections + 1
-        self.log("marked " + unicode(self.html_preprocess_sections) +
+        self.log.debug("marked " + unicode(self.html_preprocess_sections) +
                " section markers based on punctuation. - " + unicode(chap))
        return '<'+styles+' style="page-break-before:always">'+chap

+    def analyze_title_matches(self, match):
+        chap = match.group('chap')
+        title = match.group('title')
+        if not title:
+            self.chapters_no_title = self.chapters_no_title + 1
+        else:
+            self.chapters_with_title = self.chapters_with_title + 1
+
    def insert_indent(self, match):
        pstyle = match.group('formatting')
        span = match.group('span')
@@ -75,8 +92,8 @@ class PreProcessor(object):
        line_end = line_end_ere.findall(raw)
        tot_htm_ends = len(htm_end)
        tot_ln_fds = len(line_end)
-        self.log("There are " + unicode(tot_ln_fds) + " total Line feeds, and " +
-                unicode(tot_htm_ends) + " marked up endings")
+        #self.log.debug("There are " + unicode(tot_ln_fds) + " total Line feeds, and " +
+        #        unicode(tot_htm_ends) + " marked up endings")

        if percent > 1:
            percent = 1
@@ -84,7 +101,7 @@ class PreProcessor(object):
            percent = 0

        min_lns = tot_ln_fds * percent
-        self.log("There must be fewer than " + unicode(min_lns) + " unmarked lines to add markup")
+        #self.log.debug("There must be fewer than " + unicode(min_lns) + " unmarked lines to add markup")
        if min_lns > tot_htm_ends:
            return True
@@ -112,16 +129,55 @@ class PreProcessor(object):
        wordcount = get_wordcount_obj(word_count_text)
        return wordcount.words
def markup_italicis(self, html):
ITALICIZE_WORDS = [
'Etc.', 'etc.', 'viz.', 'ie.', 'i.e.', 'Ie.', 'I.e.', 'eg.',
'e.g.', 'Eg.', 'E.g.', 'et al.', 'et cetera', 'n.b.', 'N.b.',
'nota bene', 'Nota bene', 'Ste.', 'Mme.', 'Mdme.',
'Mlle.', 'Mons.', 'PS.', 'PPS.',
]
ITALICIZE_STYLE_PATS = [
r'(?msu)(?<=\s)_(?P<words>\S[^_]{0,40}?\S)?_(?=\s)',
r'(?msu)(?<=\s)/(?P<words>\S[^/]{0,40}?\S)?/(?=\s)',
r'(?msu)(?<=\s)~~(?P<words>\S[^~]{0,40}?\S)?~~(?=\s)',
r'(?msu)(?<=\s)\*(?P<words>\S[^\*]{0,40}?\S)?\*(?=\s)',
r'(?msu)(?<=\s)~(?P<words>\S[^~]{0,40}?\S)?~(?=\s)',
r'(?msu)(?<=\s)_/(?P<words>\S[^/_]{0,40}?\S)?/_(?=\s)',
r'(?msu)(?<=\s)_\*(?P<words>\S[^\*_]{0,40}?\S)?\*_(?=\s)',
r'(?msu)(?<=\s)\*/(?P<words>\S[^/\*]{0,40}?\S)?/\*(?=\s)',
r'(?msu)(?<=\s)_\*/(?P<words>\S[^\*_]{0,40}?\S)?/\*_(?=\s)',
r'(?msu)(?<=\s)/:(?P<words>\S[^:/]{0,40}?\S)?:/(?=\s)',
r'(?msu)(?<=\s)\|:(?P<words>\S[^:\|]{0,40}?\S)?:\|(?=\s)',
]
for word in ITALICIZE_WORDS:
html = html.replace(word, '<i>%s</i>' % word)
for pat in ITALICIZE_STYLE_PATS:
html = re.sub(pat, lambda mo: '<i>%s</i>' % mo.group('words'), html)
return html
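The style patterns are easiest to read against a concrete string; a quick standalone check of the first entry in ITALICIZE_STYLE_PATS above (the _underscore_ form), using plain re outside the class:

import re

pat = r'(?msu)(?<=\s)_(?P<words>\S[^_]{0,40}?\S)?_(?=\s)'
sample = u'He said it was _very important_ to check the manual.'
print(re.sub(pat, lambda mo: '<i>%s</i>' % mo.group('words'), sample))
# He said it was <i>very important</i> to check the manual.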
    def markup_chapters(self, html, wordcount, blanks_between_paragraphs):
+        '''
+        Searches for common chapter headings throughout the document
+        attempts multiple patterns based on likelihood of a match
+        with minimum false positives. Exits after finding a successful pattern
+        '''
        # Typical chapters are between 2000 and 7000 words, use the larger number to decide the
-        # minimum of chapters to search for
-        self.min_chapters = 1
+        # minimum of chapters to search for. A max limit is calculated to prevent things like OCR
+        # or pdf page numbers from being treated as TOC markers
+        max_chapters = 150
+        typical_chapters = 7000.
        if wordcount > 7000:
-            self.min_chapters = int(ceil(wordcount / 7000.))
-            #print "minimum chapters required are: "+str(self.min_chapters)
+            if wordcount > 200000:
+                typical_chapters = 15000.
+            self.min_chapters = int(ceil(wordcount / typical_chapters))
+        self.log.debug("minimum chapters required are: "+str(self.min_chapters))
        heading = re.compile('<h[1-3][^>]*>', re.IGNORECASE)
        self.html_preprocess_sections = len(heading.findall(html))
-        self.log("found " + unicode(self.html_preprocess_sections) + " pre-existing headings")
+        self.log.debug("found " + unicode(self.html_preprocess_sections) + " pre-existing headings")

        # Build the Regular Expressions in pieces
        init_lookahead = "(?=<(p|div))"
@@ -151,103 +207,160 @@ class PreProcessor(object):
        n_lookahead_open = "\s+(?!"
        n_lookahead_close = ")"

-        default_title = r"(<[ibu][^>]*>)?\s{0,3}([\w\:\'\"-]+\s{0,3}){1,5}?(</[ibu][^>]*>)?(?=<)"
+        default_title = r"(<[ibu][^>]*>)?\s{0,3}(?!Chapter)([\w\:\'\"-]+\s{0,3}){1,5}?(</[ibu][^>]*>)?(?=<)"
+        simple_title = r"(<[ibu][^>]*>)?\s{0,3}(?!(Chapter|\s+<)).{0,65}?(</[ibu][^>]*>)?(?=<)"
+
+        analysis_result = []

        chapter_types = [
-            [r"[^'\"]?(Introduction|Synopsis|Acknowledgements|Chapter|Kapitel|Epilogue|Volume\s|Prologue|Book\s|Part\s|Dedication|Preface)\s*([\d\w-]+\:?\'?\s*){0,5}", True, "Searching for common Chapter Headings"],
-            [r"([A-Z-]\s+){3,}\s*([\d\w-]+\s*){0,3}\s*", True, "Searching for letter spaced headings"], # Spaced Lettering
-            [r"<b[^>]*>\s*(<span[^>]*>)?\s*(?!([*#•]+\s*)+)(\s*(?=[\d.\w#\-*\s]+<)([\d.\w#-*]+\s*){1,5}\s*)(?!\.)(</span>)?\s*</b>", True, "Searching for emphasized lines"], # Emphasized lines
-            [r"[^'\"]?(\d+(\.|:)|CHAPTER)\s*([\dA-Z\-\'\"#,]+\s*){0,7}\s*", True, "Searching for numeric chapter headings"], # Numeric Chapters
-            [r"[^'\"]?(\d+\.?\s+([\d\w-]+\:?\'?-?\s?){0,5})\s*", True, "Searching for numeric chapters with titles"], # Numeric Titles
-            [r"[^'\"]?(\d+|CHAPTER)\s*([\dA-Z\-\'\"\?!#,]+\s*){0,7}\s*", True, "Searching for simple numeric chapter headings"], # Numeric Chapters, no dot or colon
-            [r"\s*[^'\"]?([A-Z#]+(\s|-){0,3}){1,5}\s*", False, "Searching for chapters with Uppercase Characters" ] # Uppercase Chapters
+            [r"[^'\"]?(Introduction|Synopsis|Acknowledgements|Epilogue|CHAPTER|Kapitel|Volume\b|Prologue|Book\b|Part\b|Dedication|Preface)\s*([\d\w-]+\:?\'?\s*){0,5}", True, True, True, False, "Searching for common section headings", 'common'],
+            [r"[^'\"]?(CHAPTER|Kapitel)\s*([\dA-Z\-\'\"\?!#,]+\s*){0,7}\s*", True, True, True, False, "Searching for most common chapter headings", 'chapter'], # Highest frequency headings which include titles
+            [r"<b[^>]*>\s*(<span[^>]*>)?\s*(?!([*#•=]+\s*)+)(\s*(?=[\d.\w#\-*\s]+<)([\d.\w#-*]+\s*){1,5}\s*)(?!\.)(</span>)?\s*</b>", True, True, True, False, "Searching for emphasized lines", 'emphasized'], # Emphasized lines
+            [r"[^'\"]?(\d+(\.|:))\s*([\dA-Z\-\'\"#,]+\s*){0,7}\s*", True, True, True, False, "Searching for numeric chapter headings", 'numeric'], # Numeric Chapters
+            [r"([A-Z]\s+){3,}\s*([\d\w-]+\s*){0,3}\s*", True, True, True, False, "Searching for letter spaced headings", 'letter_spaced'], # Spaced Lettering
+            [r"[^'\"]?(\d+\.?\s+([\d\w-]+\:?\'?-?\s?){0,5})\s*", True, True, True, False, "Searching for numeric chapters with titles", 'numeric_title'], # Numeric Titles
+            [r"[^'\"]?(\d+)\s*([\dA-Z\-\'\"\?!#,]+\s*){0,7}\s*", True, True, True, False, "Searching for simple numeric headings", 'plain_number'], # Numeric Chapters, no dot or colon
+            [r"\s*[^'\"]?([A-Z#]+(\s|-){0,3}){1,5}\s*", False, True, False, False, "Searching for chapters with Uppercase Characters", 'uppercase' ] # Uppercase Chapters
        ]
+        def recurse_patterns(html, analyze):
            # Start with most typical chapter headings, get more aggressive until one works
-        for [chapter_type, lookahead_ignorecase, log_message] in chapter_types:
+            for [chapter_type, n_lookahead_req, strict_title, ignorecase, title_req, log_message, type_name] in chapter_types:
+                n_lookahead = ''
+                hits = 0
+                self.chapters_no_title = 0
+                self.chapters_with_title = 0
+                if n_lookahead_req:
+                    lp_n_lookahead_open = n_lookahead_open
+                    lp_n_lookahead_close = n_lookahead_close
+                else:
+                    lp_n_lookahead_open = ''
+                    lp_n_lookahead_close = ''
+
+                if strict_title:
+                    lp_title = default_title
+                else:
+                    lp_title = simple_title
+
+                if ignorecase:
+                    arg_ignorecase = r'(?i)'
+                else:
+                    arg_ignorecase = ''
+
+                if title_req:
+                    lp_opt_title_open = ''
+                    lp_opt_title_close = ''
+                else:
+                    lp_opt_title_open = opt_title_open
+                    lp_opt_title_close = opt_title_close
+
                if self.html_preprocess_sections >= self.min_chapters:
                    break
                full_chapter_line = chapter_line_open+chapter_header_open+chapter_type+chapter_header_close+chapter_line_close
+                if n_lookahead_req:
                    n_lookahead = re.sub("(ou|in|cha)", "lookahead_", full_chapter_line)
-            self.log("Marked " + unicode(self.html_preprocess_sections) + " headings, " + log_message)
-            if lookahead_ignorecase:
-                chapter_marker = init_lookahead+full_chapter_line+blank_lines+n_lookahead_open+n_lookahead+n_lookahead_close+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close
-                chapdetect = re.compile(r'%s' % chapter_marker, re.IGNORECASE)
+                if not analyze:
+                    self.log.debug("Marked " + unicode(self.html_preprocess_sections) + " headings, " + log_message)
+
+                chapter_marker = arg_ignorecase+init_lookahead+full_chapter_line+blank_lines+lp_n_lookahead_open+n_lookahead+lp_n_lookahead_close+lp_opt_title_open+title_line_open+title_header_open+lp_title+title_header_close+title_line_close+lp_opt_title_close
+                chapdetect = re.compile(r'%s' % chapter_marker)
+
+                if analyze:
+                    hits = len(chapdetect.findall(html))
+                    if hits:
+                        chapdetect.sub(self.analyze_title_matches, html)
+                        if float(self.chapters_with_title) / float(hits) > .5:
+                            title_req = True
+                            strict_title = False
+                        self.log.debug(unicode(type_name)+" had "+unicode(hits)+" hits - "+unicode(self.chapters_no_title)+" chapters with no title, "+unicode(self.chapters_with_title)+" chapters with titles, "+unicode(float(self.chapters_with_title) / float(hits))+" percent. ")
+                        if type_name == 'common':
+                            analysis_result.append([chapter_type, n_lookahead_req, strict_title, ignorecase, title_req, log_message, type_name])
+                        elif self.min_chapters <= hits < max_chapters:
+                            analysis_result.append([chapter_type, n_lookahead_req, strict_title, ignorecase, title_req, log_message, type_name])
+                            break
                else:
-                chapter_marker = init_lookahead+full_chapter_line+blank_lines+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close+n_lookahead_open+n_lookahead+n_lookahead_close
-                chapdetect = re.compile(r'%s' % chapter_marker, re.UNICODE)
                    html = chapdetect.sub(self.chapter_head, html)
+            return html
+
+        recurse_patterns(html, True)
+        chapter_types = analysis_result
+        html = recurse_patterns(html, False)

        words_per_chptr = wordcount
        if words_per_chptr > 0 and self.html_preprocess_sections > 0:
            words_per_chptr = wordcount / self.html_preprocess_sections
-        self.log("Total wordcount is: "+ str(wordcount)+", Average words per section is: "+str(words_per_chptr)+", Marked up "+str(self.html_preprocess_sections)+" chapters")
+        self.log.debug("Total wordcount is: "+ str(wordcount)+", Average words per section is: "+str(words_per_chptr)+", Marked up "+str(self.html_preprocess_sections)+" chapters")
        return html
    def punctuation_unwrap(self, length, content, format):
+        '''
+        Unwraps lines based on line length and punctuation
+        supports a range of html markup and text files
+        '''
        # define the pieces of the regex
-        lookahead = "(?<=.{"+str(length)+"}([a-zäëïöüàèìòùáćéíóńśúâêîôûçąężıãõñæøþðßě,:)\IA\u00DF]|(?<!\&\w{4});))" # (?<!\&\w{4});) is a semicolon not part of an entity
-        line_ending = "\s*</(span|p|div)>\s*(</(p|span|div)>)?"
+        lookahead = "(?<=.{"+str(length)+u"}([a-zäëïöüàèìòùáćéíóńśúâêîôûçąężıãõñæøþðßě,:)\IA\u00DF]|(?<!\&\w{4});))" # (?<!\&\w{4});) is a semicolon not part of an entity
+        em_en_lookahead = "(?<=.{"+str(length)+u"}[\u2013\u2014])"
+        soft_hyphen = u"\xad"
+        line_ending = "\s*</(span|[iubp]|div)>\s*(</(span|[iubp]|div)>)?"
        blanklines = "\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)</(span|p|div)>\s*){0,3}\s*"
-        line_opening = "<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*"
+        line_opening = "<(span|[iubp]|div)[^>]*>\s*(<(span|[iubp]|div)[^>]*>)?\s*"
        txt_line_wrap = u"((\u0020|\u0009)*\n){1,4}"

        unwrap_regex = lookahead+line_ending+blanklines+line_opening
+        em_en_unwrap_regex = em_en_lookahead+line_ending+blanklines+line_opening
+        shy_unwrap_regex = soft_hyphen+line_ending+blanklines+line_opening
+
        if format == 'txt':
            unwrap_regex = lookahead+txt_line_wrap
+            em_en_unwrap_regex = em_en_lookahead+txt_line_wrap
+            shy_unwrap_regex = soft_hyphen+txt_line_wrap
+
        unwrap = re.compile(u"%s" % unwrap_regex, re.UNICODE)
+        em_en_unwrap = re.compile(u"%s" % em_en_unwrap_regex, re.UNICODE)
+        shy_unwrap = re.compile(u"%s" % shy_unwrap_regex, re.UNICODE)
+
        content = unwrap.sub(' ', content)
+        content = em_en_unwrap.sub('', content)
+        content = shy_unwrap.sub('', content)
        return content
-    def __call__(self, html):
-        self.log("********* Preprocessing HTML *********")
-        # Count the words in the document to estimate how many chapters to look for and whether
-        # other types of processing are attempted
-        totalwords = 0
-        totalwords = self.get_word_count(html)
-        if totalwords < 50:
-            self.log("not enough text, not preprocessing")
-            return html
-        # Arrange line feeds and </p> tags so the line_length and no_markup functions work correctly
-        html = re.sub(r"\s*</(?P<tag>p|div)>", "</"+"\g<tag>"+">\n", html)
-        html = re.sub(r"\s*<(?P<tag>p|div)(?P<style>[^>]*)>\s*", "\n<"+"\g<tag>"+"\g<style>"+">", html)
-
-        ###### Check Markup ######
-        #
-        # some lit files don't have any <p> tags or equivalent (generally just plain text between
-        # <pre> tags), check and mark up line endings if required before proceeding
-        if self.no_markup(html, 0.1):
-            self.log("not enough paragraph markers, adding now")
-            # check if content is in pre tags, use txt processor to mark up if so
-            pre = re.compile(r'<pre>', re.IGNORECASE)
-            if len(pre.findall(html)) == 1:
-                self.log("Running Text Processing")
+    def txt_process(self, match):
        from calibre.ebooks.txt.processor import convert_basic, preserve_spaces, \
            separate_paragraphs_single_line
-                outerhtml = re.compile(r'.*?(?<=<pre>)(?P<text>.*)(?=</pre>).*', re.IGNORECASE|re.DOTALL)
-                html = outerhtml.sub('\g<text>', html)
-                html = separate_paragraphs_single_line(html)
-                html = preserve_spaces(html)
-                html = convert_basic(html, epub_split_size_kb=0)
+        content = match.group('text')
+        content = separate_paragraphs_single_line(content)
+        content = preserve_spaces(content)
+        content = convert_basic(content, epub_split_size_kb=0)
+        return content
+
+    def markup_pre(self, html):
+        pre = re.compile(r'<pre>', re.IGNORECASE)
+        if len(pre.findall(html)) >= 1:
+            self.log.debug("Running Text Processing")
+            outerhtml = re.compile(r'.*?(?<=<pre>)(?P<text>.*?)</pre>', re.IGNORECASE|re.DOTALL)
+            html = outerhtml.sub(self.txt_process, html)
        else:
            # Add markup naively
            # TODO - find out if there are cases where there are more than one <pre> tag or
            # other types of unmarked html and handle them in some better fashion
            add_markup = re.compile('(?<!>)(\n)')
            html = add_markup.sub('</p>\n<p>', html)
+        return html

-        ###### Mark Indents/Cleanup ######
-        #
-        # Replace series of non-breaking spaces with text-indent
+    def arrange_htm_line_endings(self, html):
+        html = re.sub(r"\s*</(?P<tag>p|div)>", "</"+"\g<tag>"+">\n", html)
+        html = re.sub(r"\s*<(?P<tag>p|div)(?P<style>[^>]*)>\s*", "\n<"+"\g<tag>"+"\g<style>"+">", html)
+        return html
+
+    def fix_nbsp_indents(self, html):
        txtindent = re.compile(ur'<p(?P<formatting>[^>]*)>\s*(?P<span>(<span[^>]*>\s*)+)?\s*(\u00a0){2,}', re.IGNORECASE)
        html = txtindent.sub(self.insert_indent, html)
        if self.found_indents > 1:
-            self.log("replaced "+unicode(self.found_indents)+ " nbsp indents with inline styles")
+            self.log.debug("replaced "+unicode(self.found_indents)+ " nbsp indents with inline styles")
+        return html

+    def cleanup_markup(self, html):
        # remove remaining non-breaking spaces
        html = re.sub(ur'\u00a0', ' ', html)
        # Get rid of various common microsoft specific tags which can cause issues later
@@ -255,108 +368,166 @@ class PreProcessor(object):
        html = re.sub(ur'\s*<o:p>\s*</o:p>', ' ', html)
        # Delete microsoft 'smart' tags
        html = re.sub('(?i)</?st1:\w+>', '', html)
-        # Get rid of empty span, bold, & italics tags
+        # Get rid of empty span, bold, font, em, & italics tags
        html = re.sub(r"\s*<span[^>]*>\s*(<span[^>]*>\s*</span>){0,2}\s*</span>\s*", " ", html)
-        html = re.sub(r"\s*<[ibu][^>]*>\s*(<[ibu][^>]*>\s*</[ibu]>\s*){0,2}\s*</[ibu]>", " ", html)
+        html = re.sub(r"\s*<(font|[ibu]|em)[^>]*>\s*(<(font|[ibu]|em)[^>]*>\s*</(font|[ibu]|em)>\s*){0,2}\s*</(font|[ibu]|em)>", " ", html)
        html = re.sub(r"\s*<span[^>]*>\s*(<span[^>]>\s*</span>){0,2}\s*</span>\s*", " ", html)
-        # ADE doesn't render <br />, change to empty paragraphs
-        #html = re.sub('<br[^>]*>', u'<p>\u00a0</p>', html)
+        html = re.sub(r"\s*<(font|[ibu]|em)[^>]*>\s*(<(font|[ibu]|em)[^>]*>\s*</(font|[ibu]|em)>\s*){0,2}\s*</(font|[ibu]|em)>", " ", html)
+        self.deleted_nbsps = True
+        return html

-        # If more than 40% of the lines are empty paragraphs and the user has enabled remove
-        # paragraph spacing then delete blank lines to clean up spacing
-        linereg = re.compile('(?<=<p).*?(?=</p>)', re.IGNORECASE|re.DOTALL)
-        blankreg = re.compile(r'\s*(?P<openline><p[^>]*>)\s*(?P<closeline></p>)', re.IGNORECASE)
-        #multi_blank = re.compile(r'(\s*<p[^>]*>\s*(<(b|i|u)>)?\s*(</(b|i|u)>)?\s*</p>){2,}', re.IGNORECASE)
-        blanklines = blankreg.findall(html)
-        lines = linereg.findall(html)
-        blanks_between_paragraphs = False
-        if len(lines) > 1:
-            self.log("There are " + unicode(len(blanklines)) + " blank lines. " +
-                    unicode(float(len(blanklines)) / float(len(lines))) + " percent blank")
-            if float(len(blanklines)) / float(len(lines)) > 0.40 and getattr(self.extra_opts,
-                    'remove_paragraph_spacing', False):
-                self.log("deleting blank lines")
-                html = blankreg.sub('', html)
-            elif float(len(blanklines)) / float(len(lines)) > 0.40:
-                blanks_between_paragraphs = True
-                #print "blanks between paragraphs is marked True"
-            else:
-                blanks_between_paragraphs = False
-        #self.dump(html, 'before_chapter_markup')
-        # detect chapters/sections to match xpath or splitting logic
-        #
-        html = self.markup_chapters(html, totalwords, blanks_between_paragraphs)
-
-        ###### Unwrap lines ######
-        #
-        # Some OCR sourced files have line breaks in the html using a combination of span & p tags
-        # span are used for hard line breaks, p for new paragraphs. Determine which is used so
-        # that lines can be un-wrapped across page boundaries
+    def analyze_line_endings(self, html):
+        '''
+        determines the type of html line ending used most commonly in a document
+        use before calling docanalysis functions
+        '''
        paras_reg = re.compile('<p[^>]*>', re.IGNORECASE)
        spans_reg = re.compile('<span[^>]*>', re.IGNORECASE)
        paras = len(paras_reg.findall(html))
        spans = len(spans_reg.findall(html))
        if spans > 1:
            if float(paras) / float(spans) < 0.75:
-                format = 'spanned_html'
+                return 'spanned_html'
            else:
-                format = 'html'
+                return 'html'
        else:
-            format = 'html'
+            return 'html'
def analyze_blanks(self, html):
blanklines = self.blankreg.findall(html)
lines = self.linereg.findall(html)
if len(lines) > 1:
self.log.debug("There are " + unicode(len(blanklines)) + " blank lines. " +
unicode(float(len(blanklines)) / float(len(lines))) + " percent blank")
if float(len(blanklines)) / float(len(lines)) > 0.40:
return True
else:
return False
def cleanup_required(self):
for option in ['unwrap_lines', 'markup_chapter_headings', 'format_scene_breaks', 'delete_blank_paragraphs']:
if getattr(self.extra_opts, option, False):
return True
return False
def __call__(self, html):
self.log.debug("********* Heuristic processing HTML *********")
# Count the words in the document to estimate how many chapters to look for and whether
# other types of processing are attempted
try:
self.totalwords = self.get_word_count(html)
except:
self.log.warn("Can't get wordcount")
if self.totalwords < 50:
self.log.warn("flow is too short, not running heuristics")
return html
# Arrange line feeds and </p> tags so the line_length and no_markup functions work correctly
html = self.arrange_htm_line_endings(html)
if self.cleanup_required():
###### Check Markup ######
#
# some lit files don't have any <p> tags or equivalent (generally just plain text between
# <pre> tags), check and mark up line endings if required before proceeding
# fix indents must run after this step
if self.no_markup(html, 0.1):
self.log.debug("not enough paragraph markers, adding now")
# markup using text processing
html = self.markup_pre(html)
# Replace series of non-breaking spaces with text-indent
if getattr(self.extra_opts, 'fix_indents', False):
html = self.fix_nbsp_indents(html)
if self.cleanup_required():
# fix indents must run before this step, as it removes non-breaking spaces
html = self.cleanup_markup(html)
# ADE doesn't render <br />, change to empty paragraphs
#html = re.sub('<br[^>]*>', u'<p>\u00a0</p>', html)
# Determine whether the document uses interleaved blank lines
blanks_between_paragraphs = self.analyze_blanks(html)
#self.dump(html, 'before_chapter_markup')
# detect chapters/sections to match xpath or splitting logic
if getattr(self.extra_opts, 'markup_chapter_headings', False):
html = self.markup_chapters(html, self.totalwords, blanks_between_paragraphs)
if getattr(self.extra_opts, 'italicize_common_cases', False):
html = self.markup_italicis(html)
# If more than 40% of the lines are empty paragraphs and the user has enabled delete
# blank paragraphs then delete blank lines to clean up spacing
if blanks_between_paragraphs and getattr(self.extra_opts, 'delete_blank_paragraphs', False):
self.log.debug("deleting blank lines")
self.blanks_deleted = True
html = self.multi_blank.sub('\n<p id="softbreak" style="margin-top:1.5em; margin-bottom:1.5em"> </p>', html)
html = self.blankreg.sub('', html)
# Determine line ending type
# Some OCR sourced files have line breaks in the html using a combination of span & p tags
# span are used for hard line breaks, p for new paragraphs. Determine which is used so
# that lines can be un-wrapped across page boundaries
format = self.analyze_line_endings(html)
        # Check Line histogram to determine if the document uses hard line breaks, If 50% or
        # more of the lines break in the same region of the document then unwrapping is required
        docanalysis = DocAnalysis(format, html)
        hardbreaks = docanalysis.line_histogram(.50)
-        self.log("Hard line breaks check returned "+unicode(hardbreaks))
+        self.log.debug("Hard line breaks check returned "+unicode(hardbreaks))
        # Calculate Length
        unwrap_factor = getattr(self.extra_opts, 'html_unwrap_factor', 0.4)
        length = docanalysis.line_length(unwrap_factor)
-        self.log("Median line length is " + unicode(length) + ", calculated with " + format + " format")
+        self.log.debug("Median line length is " + unicode(length) + ", calculated with " + format + " format")

+        ###### Unwrap lines ######
+        if getattr(self.extra_opts, 'unwrap_lines', False):
            # only go through unwrapping code if the histogram shows unwrapping is required or if the user decreased the default unwrap_factor
            if hardbreaks or unwrap_factor < 0.4:
-                self.log("Unwrapping required, unwrapping Lines")
-                # Unwrap em/en dashes
-                html = re.sub(u'(?<=.{%i}[\u2013\u2014])\s*(?=<)(</span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*(?=[[a-z\d])' % length, '', html)
-                # Dehyphenate
-                self.log("Unwrapping/Removing hyphens")
-                dehyphenator = Dehyphenator()
+                self.log.debug("Unwrapping required, unwrapping Lines")
+                # Dehyphenate with line length limiters
+                dehyphenator = Dehyphenator(self.extra_opts.verbose, self.log)
                html = dehyphenator(html,'html', length)
-                self.log("Done dehyphenating")
-                # Unwrap lines using punctation and line length
-                #unwrap_quotes = re.compile(u"(?<=.{%i}\"')\s*</(span|p|div)>\s*(</(p|span|div)>)?\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)</(span|p|div)>\s*){0,3}\s*<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*(?=[a-z])" % length, re.UNICODE)
                html = self.punctuation_unwrap(length, html, 'html')
-                #check any remaining hyphens, but only unwrap if there is a match
-                dehyphenator = Dehyphenator()
-                html = dehyphenator(html,'html_cleanup', length)
-            else:
-                # dehyphenate in cleanup mode to fix anything previous conversions/editing missed
-                self.log("Cleaning up hyphenation")
-                dehyphenator = Dehyphenator()
-                html = dehyphenator(html,'html_cleanup', length)
-                self.log("Done dehyphenating")

-            # delete soft hyphens
-            html = re.sub(u'\xad\s*(</span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*', '', html)
+        if getattr(self.extra_opts, 'dehyphenate', False):
+            # dehyphenate in cleanup mode to fix anything previous conversions/editing missed
+            self.log.debug("Fixing hyphenated content")
+            dehyphenator = Dehyphenator(self.extra_opts.verbose, self.log)
+            html = dehyphenator(html,'html_cleanup', length)
+            html = dehyphenator(html, 'individual_words', length)

        # If still no sections after unwrapping mark split points on lines with no punctuation
-        if self.html_preprocess_sections < self.min_chapters:
-            self.log("Looking for more split points based on punctuation,"
+        if self.html_preprocess_sections < self.min_chapters and getattr(self.extra_opts, 'markup_chapter_headings', False):
+            self.log.debug("Looking for more split points based on punctuation,"
                    " currently have " + unicode(self.html_preprocess_sections))
            chapdetect3 = re.compile(r'<(?P<styles>(p|div)[^>]*)>\s*(?P<section>(<span[^>]*>)?\s*(?!([*#•]+\s*)+)(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*.?(?=[a-z#\-*\s]+<)([a-z#-*]+\s*){1,5}\s*\s*(</span>)?(</[ibu]>){0,2}\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</span>)?\s*</(p|div)>)', re.IGNORECASE)
            html = chapdetect3.sub(self.chapter_break, html)
+        if getattr(self.extra_opts, 'renumber_headings', False):
            # search for places where a first or second level heading is immediately followed by another
            # top level heading. demote the second heading to h3 to prevent splitting between chapter
            # headings and titles, images, etc
            doubleheading = re.compile(r'(?P<firsthead><h(1|2)[^>]*>.+?</h(1|2)>\s*(<(?!h\d)[^>]*>\s*)*)<h(1|2)(?P<secondhead>[^>]*>.+?)</h(1|2)>', re.IGNORECASE)
            html = doubleheading.sub('\g<firsthead>'+'\n<h3'+'\g<secondhead>'+'</h3>', html)

-        # put back non-breaking spaces in empty paragraphs to preserve original formatting
-        html = blankreg.sub('\n'+r'\g<openline>'+u'\u00a0'+r'\g<closeline>', html)
-
+        if getattr(self.extra_opts, 'format_scene_breaks', False):
            # Center separator lines
-        html = re.sub(u'<(?P<outer>p|div)[^>]*>\s*(<(?P<inner1>font|span|[ibu])[^>]*>)?\s*(<(?P<inner2>font|span|[ibu])[^>]*>)?\s*(<(?P<inner3>font|span|[ibu])[^>]*>)?\s*(?P<break>([*#•=✦]+\s*)+)\s*(</(?P=inner3)>)?\s*(</(?P=inner2)>)?\s*(</(?P=inner1)>)?\s*</(?P=outer)>', '<p style="text-align:center">' + '\g<break>' + '</p>', html)
+            html = re.sub(u'<(?P<outer>p|div)[^>]*>\s*(<(?P<inner1>font|span|[ibu])[^>]*>)?\s*(<(?P<inner2>font|span|[ibu])[^>]*>)?\s*(<(?P<inner3>font|span|[ibu])[^>]*>)?\s*(?P<break>([*#•=✦]+\s*)+)\s*(</(?P=inner3)>)?\s*(</(?P=inner2)>)?\s*(</(?P=inner1)>)?\s*</(?P=outer)>', '<p style="text-align:center; margin-top:1.25em; margin-bottom:1.25em">' + '\g<break>' + '</p>', html)
+            if not self.blanks_deleted:
+                html = self.multi_blank.sub('\n<p id="softbreak" style="margin-top:1.5em; margin-bottom:1.5em"> </p>', html)
+            html = re.sub('<p\s+id="softbreak"[^>]*>\s*</p>', '<div id="softbreak" style="margin-left: 45%; margin-right: 45%; margin-top:1.5em; margin-bottom:1.5em"><hr style="height: 3px; background:#505050" /></div>', html)

+        if self.deleted_nbsps:
+            # put back non-breaking spaces in empty paragraphs to preserve original formatting
+            html = self.blankreg.sub('\n'+r'\g<openline>'+u'\u00a0'+r'\g<closeline>', html)

        return html
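For testing outside the conversion pipeline the class can be driven directly; a minimal sketch, with SimpleNamespace standing in for calibre's options object and the attribute names taken from the per-step flags added in plumber.py above (flows under roughly 50 words are returned untouched by the wordcount guard):

from types import SimpleNamespace
from calibre.ebooks.conversion.utils import HeuristicProcessor
from calibre.utils.logging import default_log

opts = SimpleNamespace(
    markup_chapter_headings=True,   # each step is gated by its own flag
    italicize_common_cases=True,
    fix_indents=True,
    delete_blank_paragraphs=True,
    format_scene_breaks=True,
    renumber_headings=True,
    unwrap_lines=False,             # length-based unwrapping left off in this sketch
    dehyphenate=False,
    html_unwrap_factor=0.4,
    verbose=0,
)

processor = HeuristicProcessor(extra_opts=opts, log=default_log)
with open('flow.html', 'rb') as f:      # hypothetical input flow
    html = f.read().decode('utf-8')
html = processor(html)                  # only the enabled heuristics are applied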

View File

@@ -24,7 +24,6 @@ from calibre.constants import islinux, isfreebsd, iswindows
from calibre import unicode_path, as_unicode
from calibre.utils.localization import get_lang
from calibre.utils.filenames import ascii_filename
-from calibre.ebooks.conversion.utils import PreProcessor

class Link(object):
    '''

@@ -296,7 +295,7 @@ class HTMLInput(InputFormatPlugin):
            return oeb

        from calibre.ebooks.conversion.plumber import create_oebbook
-        return create_oebbook(log, stream.name, opts, self,
+        return create_oebbook(log, stream.name, opts,
                encoding=opts.input_encoding)

    def is_case_sensitive(self, path):

@@ -485,9 +484,3 @@ class HTMLInput(InputFormatPlugin):
                self.log.exception('Failed to read CSS file: %r'%link)
                return (None, None)
        return (None, raw)
-
-    def preprocess_html(self, options, html):
-        self.options = options
-        preprocessor = PreProcessor(self.options, log=getattr(self, 'log', None))
-        return preprocessor(html)

View File

@ -7,7 +7,7 @@ __copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en' __docformat__ = 'restructuredtext en'
from calibre.customize.conversion import InputFormatPlugin from calibre.customize.conversion import InputFormatPlugin
from calibre.ebooks.conversion.utils import PreProcessor from calibre.ebooks.conversion.utils import HeuristicProcessor
class LITInput(InputFormatPlugin): class LITInput(InputFormatPlugin):
@ -22,7 +22,7 @@ class LITInput(InputFormatPlugin):
from calibre.ebooks.lit.reader import LitReader from calibre.ebooks.lit.reader import LitReader
from calibre.ebooks.conversion.plumber import create_oebbook from calibre.ebooks.conversion.plumber import create_oebbook
self.log = log self.log = log
return create_oebbook(log, stream, options, self, reader=LitReader) return create_oebbook(log, stream, options, reader=LitReader)
def postprocess_book(self, oeb, opts, log): def postprocess_book(self, oeb, opts, log):
from calibre.ebooks.oeb.base import XHTML_NS, XPath, XHTML from calibre.ebooks.oeb.base import XHTML_NS, XPath, XHTML
@ -39,10 +39,13 @@ class LITInput(InputFormatPlugin):
body = body[0] body = body[0]
if len(body) == 1 and body[0].tag == XHTML('pre'): if len(body) == 1 and body[0].tag == XHTML('pre'):
pre = body[0] pre = body[0]
from calibre.ebooks.txt.processor import convert_basic from calibre.ebooks.txt.processor import convert_basic, preserve_spaces, \
separate_paragraphs_single_line
from lxml import etree from lxml import etree
import copy import copy
html = convert_basic(pre.text).replace('<html>', html = separate_paragraphs_single_line(pre.text)
html = preserve_spaces(html)
html = convert_basic(html).replace('<html>',
'<html xmlns="%s">'%XHTML_NS) '<html xmlns="%s">'%XHTML_NS)
root = etree.fromstring(html) root = etree.fromstring(html)
body = XPath('//h:body')(root) body = XPath('//h:body')(root)
@ -51,10 +54,3 @@ class LITInput(InputFormatPlugin):
for elem in body: for elem in body:
ne = copy.deepcopy(elem) ne = copy.deepcopy(elem)
pre.append(ne) pre.append(ne)
def preprocess_html(self, options, html):
self.options = options
preprocessor = PreProcessor(self.options, log=getattr(self, 'log', None))
return preprocessor(html)

View File

@ -12,7 +12,6 @@ from copy import deepcopy
from lxml import etree from lxml import etree
from calibre.customize.conversion import InputFormatPlugin from calibre.customize.conversion import InputFormatPlugin
from calibre.ebooks.conversion.utils import PreProcessor
from calibre import guess_type from calibre import guess_type
class Canvas(etree.XSLTExtension): class Canvas(etree.XSLTExtension):
@ -419,11 +418,3 @@ class LRFInput(InputFormatPlugin):
f.write(result) f.write(result)
styles.write() styles.write()
return os.path.abspath('content.opf') return os.path.abspath('content.opf')
def preprocess_html(self, options, html):
self.options = options
preprocessor = PreProcessor(self.options, log=getattr(self, 'log', None))
return preprocessor(html)

View File

@ -39,11 +39,3 @@ class MOBIInput(InputFormatPlugin):
accelerators['pagebreaks'] = '//h:div[@class="mbp_pagebreak"]' accelerators['pagebreaks'] = '//h:div[@class="mbp_pagebreak"]'
return mr.created_opf_path return mr.created_opf_path
def preprocess_html(self, options, html):
# search for places where a first or second level heading is immediately followed by another
# top level heading. demote the second heading to h3 to prevent splitting between chapter
# headings and titles, images, etc
doubleheading = re.compile(r'(?P<firsthead><h(1|2)[^>]*>.+?</h(1|2)>\s*(<(?!h\d)[^>]*>\s*)*)<h(1|2)(?P<secondhead>[^>]*>.+?)</h(1|2)>', re.IGNORECASE)
html = doubleheading.sub('\g<firsthead>'+'\n<h3'+'\g<secondhead>'+'</h3>', html)
return html

View File

@ -9,7 +9,6 @@ import os
from calibre.customize.conversion import InputFormatPlugin from calibre.customize.conversion import InputFormatPlugin
from calibre.ebooks.pdb.header import PdbHeaderReader from calibre.ebooks.pdb.header import PdbHeaderReader
from calibre.ebooks.pdb import PDBError, IDENTITY_TO_NAME, get_reader from calibre.ebooks.pdb import PDBError, IDENTITY_TO_NAME, get_reader
from calibre.ebooks.conversion.utils import PreProcessor
class PDBInput(InputFormatPlugin): class PDBInput(InputFormatPlugin):
@ -32,8 +31,3 @@ class PDBInput(InputFormatPlugin):
opf = reader.extract_content(os.getcwd()) opf = reader.extract_content(os.getcwd())
return opf return opf
def preprocess_html(self, options, html):
self.options = options
preprocessor = PreProcessor(self.options, log=getattr(self, 'log', None))
return preprocessor(html)

View File

@ -7,7 +7,6 @@ import os, glob, re, textwrap
from lxml import etree from lxml import etree
from calibre.customize.conversion import InputFormatPlugin from calibre.customize.conversion import InputFormatPlugin
from calibre.ebooks.conversion.utils import PreProcessor
border_style_map = { border_style_map = {
'single' : 'solid', 'single' : 'solid',
@ -319,13 +318,9 @@ class RTFInput(InputFormatPlugin):
res = transform.tostring(result) res = transform.tostring(result)
res = res[:100].replace('xmlns:html', 'xmlns') + res[100:] res = res[:100].replace('xmlns:html', 'xmlns') + res[100:]
# Replace newlines inserted by the 'empty_paragraphs' option in rtf2xml with html blank lines # Replace newlines inserted by the 'empty_paragraphs' option in rtf2xml with html blank lines
if not getattr(self.opts, 'remove_paragraph_spacing', False):
res = re.sub('\s*<body>', '<body>', res) res = re.sub('\s*<body>', '<body>', res)
res = re.sub('(?<=\n)\n{2}', res = re.sub('(?<=\n)\n{2}',
u'<p>\u00a0</p>\n'.encode('utf-8'), res) u'<p>\u00a0</p>\n'.encode('utf-8'), res)
if self.opts.preprocess_html:
preprocessor = PreProcessor(self.opts, log=getattr(self, 'log', None))
res = preprocessor(res.decode('utf-8')).encode('utf-8')
f.write(res) f.write(res)
self.write_inline_css(inline_class, border_styles) self.write_inline_css(inline_class, border_styles)
stream.seek(0) stream.seek(0)

View File

@ -41,7 +41,7 @@ class SNBInput(InputFormatPlugin):
raise ValueError("Invalid SNB file") raise ValueError("Invalid SNB file")
log.debug("Handle meta data ...") log.debug("Handle meta data ...")
from calibre.ebooks.conversion.plumber import create_oebbook from calibre.ebooks.conversion.plumber import create_oebbook
oeb = create_oebbook(log, None, options, self, oeb = create_oebbook(log, None, options,
encoding=options.input_encoding, populate=False) encoding=options.input_encoding, populate=False)
meta = snbFile.GetFileStream('snbf/book.snbf') meta = snbFile.GetFileStream('snbf/book.snbf')
if meta != None: if meta != None:

View File

@ -1,58 +0,0 @@
# -*- coding: utf-8 -*-
__license__ = 'GPL 3'
__copyright__ = '2011, John Schember <john@nachtimwald.com>'
__docformat__ = 'restructuredtext en'
import re
from calibre import prepare_string_for_xml
class TXTHeuristicProcessor(object):
def __init__(self):
self.ITALICIZE_WORDS = [
'Etc.', 'etc.', 'viz.', 'ie.', 'i.e.', 'Ie.', 'I.e.', 'eg.',
'e.g.', 'Eg.', 'E.g.', 'et al.', 'et cetra', 'n.b.', 'N.b.',
'nota bene', 'Nota bene', 'Ste.', 'Mme.', 'Mdme.',
'Mlle.', 'Mons.', 'PS.', 'PPS.',
]
self.ITALICIZE_STYLE_PATS = [
r'(?msu)_(?P<words>.+?)_',
r'(?msu)/(?P<words>[^<>]+?)/',
r'(?msu)~~(?P<words>.+?)~~',
r'(?msu)\*(?P<words>.+?)\*',
r'(?msu)~(?P<words>.+?)~',
r'(?msu)_/(?P<words>[^<>]+?)/_',
r'(?msu)_\*(?P<words>.+?)\*_',
r'(?msu)\*/(?P<words>[^<>]+?)/\*',
r'(?msu)_\*/(?P<words>[^<>]+?)/\*_',
r'(?msu)/:(?P<words>[^<>]+?):/',
r'(?msu)\|:(?P<words>.+?):\|',
]
def process_paragraph(self, paragraph):
for word in self.ITALICIZE_WORDS:
paragraph = paragraph.replace(word, '<i>%s</i>' % word)
for pat in self.ITALICIZE_STYLE_PATS:
paragraph = re.sub(pat, lambda mo: '<i>%s</i>' % mo.group('words'), paragraph)
return paragraph
def convert(self, txt, title='', epub_split_size_kb=0):
from calibre.ebooks.txt.processor import clean_txt, split_txt, HTML_TEMPLATE
txt = clean_txt(txt)
txt = split_txt(txt, epub_split_size_kb)
processed = []
for line in txt.split('\n\n'):
processed.append(u'<p>%s</p>' % self.process_paragraph(prepare_string_for_xml(line.replace('\n', ' '))))
txt = u'\n'.join(processed)
txt = re.sub('[ ]{2,}', ' ', txt)
html = HTML_TEMPLATE % (title, txt)
from calibre.ebooks.conversion.utils import PreProcessor
pp = PreProcessor()
html = pp.markup_chapters(html, pp.get_word_count(html), False)
return html

View File

@ -12,7 +12,7 @@ from calibre.ebooks.chardet import detect
from calibre.ebooks.txt.processor import convert_basic, convert_markdown, \ from calibre.ebooks.txt.processor import convert_basic, convert_markdown, \
separate_paragraphs_single_line, separate_paragraphs_print_formatted, \ separate_paragraphs_single_line, separate_paragraphs_print_formatted, \
preserve_spaces, detect_paragraph_type, detect_formatting_type, \ preserve_spaces, detect_paragraph_type, detect_formatting_type, \
convert_heuristic, normalize_line_endings, convert_textile normalize_line_endings, convert_textile
from calibre import _ent_pat, xml_entity_to_unicode from calibre import _ent_pat, xml_entity_to_unicode
class TXTInput(InputFormatPlugin): class TXTInput(InputFormatPlugin):
@ -106,7 +106,7 @@ class TXTInput(InputFormatPlugin):
log.debug('Auto detected paragraph type as %s' % options.paragraph_type) log.debug('Auto detected paragraph type as %s' % options.paragraph_type)
# Dehyphenate # Dehyphenate
dehyphenator = Dehyphenator() dehyphenator = Dehyphenator(options.verbose, log=getattr(self, 'log', None))
txt = dehyphenator(txt,'txt', length) txt = dehyphenator(txt,'txt', length)
# We don't check for block because the processor assumes block. # We don't check for block because the processor assumes block.
@ -118,24 +118,24 @@ class TXTInput(InputFormatPlugin):
txt = separate_paragraphs_print_formatted(txt) txt = separate_paragraphs_print_formatted(txt)
if options.paragraph_type == 'unformatted': if options.paragraph_type == 'unformatted':
from calibre.ebooks.conversion.utils import PreProcessor from calibre.ebooks.conversion.utils import HeuristicProcessor
# get length # get length
# unwrap lines based on punctuation # unwrap lines based on punctuation
preprocessor = PreProcessor(options, log=getattr(self, 'log', None)) preprocessor = HeuristicProcessor(options, log=getattr(self, 'log', None))
txt = preprocessor.punctuation_unwrap(length, txt, 'txt') txt = preprocessor.punctuation_unwrap(length, txt, 'txt')
flow_size = getattr(options, 'flow_size', 0) flow_size = getattr(options, 'flow_size', 0)
if options.formatting_type == 'heuristic':
html = convert_heuristic(txt, epub_split_size_kb=flow_size)
else:
html = convert_basic(txt, epub_split_size_kb=flow_size) html = convert_basic(txt, epub_split_size_kb=flow_size)
# Dehyphenate in cleanup mode for missed txt and markdown conversion if options.formatting_type == 'heuristic':
dehyphenator = Dehyphenator() setattr(options, 'enable_heuristics', True)
html = dehyphenator(html,'txt_cleanup', length) setattr(options, 'markup_chapter_headings', True)
html = dehyphenator(html,'html_cleanup', length) setattr(options, 'italicize_common_cases', True)
setattr(options, 'fix_indents', True)
setattr(options, 'delete_blank_paragraphs', True)
setattr(options, 'format_scene_breaks', True)
setattr(options, 'dehyphenate', True)
from calibre.customize.ui import plugin_for_input_format from calibre.customize.ui import plugin_for_input_format
html_input = plugin_for_input_format('html') html_input = plugin_for_input_format('html')

View File

@ -12,7 +12,6 @@ import os, re
from calibre import prepare_string_for_xml, isbytestring from calibre import prepare_string_for_xml, isbytestring
from calibre.ebooks.metadata.opf2 import OPFCreator from calibre.ebooks.metadata.opf2 import OPFCreator
from calibre.ebooks.txt.heuristicprocessor import TXTHeuristicProcessor
from calibre.ebooks.conversion.preprocess import DocAnalysis from calibre.ebooks.conversion.preprocess import DocAnalysis
from calibre.utils.cleantext import clean_ascii_chars from calibre.utils.cleantext import clean_ascii_chars
@ -67,10 +66,6 @@ def convert_basic(txt, title='', epub_split_size_kb=0):
return HTML_TEMPLATE % (title, u'\n'.join(lines)) return HTML_TEMPLATE % (title, u'\n'.join(lines))
def convert_heuristic(txt, title='', epub_split_size_kb=0):
tp = TXTHeuristicProcessor()
return tp.convert(txt, title, epub_split_size_kb)
def convert_markdown(txt, title='', disable_toc=False): def convert_markdown(txt, title='', disable_toc=False):
from calibre.ebooks.markdown import markdown from calibre.ebooks.markdown import markdown
md = markdown.Markdown( md = markdown.Markdown(

View File

@ -11,6 +11,8 @@ from calibre.gui2.convert.single import Config, sort_formats_by_preference, \
from calibre.customize.ui import available_output_formats from calibre.customize.ui import available_output_formats
from calibre.gui2 import ResizableDialog from calibre.gui2 import ResizableDialog
from calibre.gui2.convert.look_and_feel import LookAndFeelWidget from calibre.gui2.convert.look_and_feel import LookAndFeelWidget
from calibre.gui2.convert.heuristics import HeuristicsWidget
from calibre.gui2.convert.search_and_replace import SearchAndReplaceWidget
from calibre.gui2.convert.page_setup import PageSetupWidget from calibre.gui2.convert.page_setup import PageSetupWidget
from calibre.gui2.convert.structure_detection import StructureDetectionWidget from calibre.gui2.convert.structure_detection import StructureDetectionWidget
from calibre.gui2.convert.toc import TOCWidget from calibre.gui2.convert.toc import TOCWidget
@ -69,6 +71,8 @@ class BulkConfig(Config):
self.setWindowTitle(_('Bulk Convert')) self.setWindowTitle(_('Bulk Convert'))
lf = widget_factory(LookAndFeelWidget) lf = widget_factory(LookAndFeelWidget)
hw = widget_factory(HeuristicsWidget)
sr = widget_factory(SearchAndReplaceWidget)
ps = widget_factory(PageSetupWidget) ps = widget_factory(PageSetupWidget)
sd = widget_factory(StructureDetectionWidget) sd = widget_factory(StructureDetectionWidget)
toc = widget_factory(TOCWidget) toc = widget_factory(TOCWidget)
@ -90,7 +94,7 @@ class BulkConfig(Config):
if not c: break if not c: break
self.stack.removeWidget(c) self.stack.removeWidget(c)
widgets = [lf, ps, sd, toc] widgets = [lf, hw, sr, ps, sd, toc]
if output_widget is not None: if output_widget is not None:
widgets.append(output_widget) widgets.append(output_widget)
for w in widgets: for w in widgets:

View File

@ -0,0 +1,72 @@
# -*- coding: utf-8 -*-
__license__ = 'GPL 3'
__copyright__ = '2011, John Schember <john@nachtimwald.com>'
__docformat__ = 'restructuredtext en'
from PyQt4.Qt import Qt
from calibre.gui2.convert.heuristics_ui import Ui_Form
from calibre.gui2.convert import Widget
class HeuristicsWidget(Widget, Ui_Form):
TITLE = _('Heuristic Processing')
HELP = _('Modify the document text and structure using common patterns.')
COMMIT_NAME = 'heuristics'
def __init__(self, parent, get_option, get_help, db=None, book_id=None):
Widget.__init__(self, parent,
['enable_heuristics', 'markup_chapter_headings',
'italicize_common_cases', 'fix_indents',
'html_unwrap_factor', 'unwrap_lines',
'delete_blank_paragraphs', 'format_scene_breaks',
'dehyphenate', 'renumber_headings']
)
self.db, self.book_id = db, book_id
self.initialize_options(get_option, get_help, db, book_id)
self.opt_enable_heuristics.stateChanged.connect(self.enable_heuristics)
self.opt_unwrap_lines.stateChanged.connect(self.enable_unwrap)
self.enable_heuristics(self.opt_enable_heuristics.checkState())
def break_cycles(self):
Widget.break_cycles(self)
try:
self.opt_enable_heuristics.stateChanged.disconnect()
self.opt_unwrap_lines.stateChanged.disconnect()
except:
pass
def set_value_handler(self, g, val):
if val is None and g is self.opt_html_unwrap_factor:
g.setValue(0.0)
return True
def enable_heuristics(self, state):
if state == Qt.Checked:
state = True
else:
state = False
self.opt_markup_chapter_headings.setEnabled(state)
self.opt_italicize_common_cases.setEnabled(state)
self.opt_fix_indents.setEnabled(state)
self.opt_delete_blank_paragraphs.setEnabled(state)
self.opt_format_scene_breaks.setEnabled(state)
self.opt_dehyphenate.setEnabled(state)
self.opt_renumber_headings.setEnabled(state)
self.opt_unwrap_lines.setEnabled(state)
if state and self.opt_unwrap_lines.checkState() == Qt.Checked:
self.opt_html_unwrap_factor.setEnabled(True)
else:
self.opt_html_unwrap_factor.setEnabled(False)
def enable_unwrap(self, state):
if state == Qt.Checked:
state = True
else:
state = False
self.opt_html_unwrap_factor.setEnabled(state)

View File

@ -0,0 +1,178 @@
<?xml version="1.0" encoding="UTF-8"?>
<ui version="4.0">
<class>Form</class>
<widget class="QWidget" name="Form">
<property name="geometry">
<rect>
<x>0</x>
<y>0</y>
<width>938</width>
<height>470</height>
</rect>
</property>
<property name="windowTitle">
<string>Form</string>
</property>
<layout class="QVBoxLayout" name="verticalLayout">
<item>
<widget class="QCheckBox" name="opt_enable_heuristics">
<property name="text">
<string>&amp;Preprocess input file to possibly improve structure detection</string>
</property>
</widget>
</item>
<item>
<widget class="QGroupBox" name="groupBox">
<property name="title">
<string>Heuristic Processing</string>
</property>
<layout class="QGridLayout" name="gridLayout">
<item row="0" column="0" colspan="2">
<widget class="QCheckBox" name="opt_unwrap_lines">
<property name="text">
<string>Unwrap lines</string>
</property>
</widget>
</item>
<item row="1" column="1">
<widget class="QLabel" name="huf_label">
<property name="text">
<string>Line &amp;un-wrap factor during preprocess:</string>
</property>
<property name="buddy">
<cstring>opt_html_unwrap_factor</cstring>
</property>
</widget>
</item>
<item row="1" column="2">
<widget class="QDoubleSpinBox" name="opt_html_unwrap_factor">
<property name="toolTip">
<string/>
</property>
<property name="maximum">
<double>1.000000000000000</double>
</property>
<property name="singleStep">
<double>0.050000000000000</double>
</property>
<property name="value">
<double>0.400000000000000</double>
</property>
</widget>
</item>
<item row="1" column="3">
<spacer name="horizontalSpacer_2">
<property name="orientation">
<enum>Qt::Horizontal</enum>
</property>
<property name="sizeHint" stdset="0">
<size>
<width>40</width>
<height>20</height>
</size>
</property>
</spacer>
</item>
<item row="2" column="0" colspan="4">
<widget class="QCheckBox" name="opt_markup_chapter_headings">
<property name="text">
<string>Detect and markup unformatted chapter headings and sub headings</string>
</property>
</widget>
</item>
<item row="3" column="0" colspan="4">
<widget class="QCheckBox" name="opt_renumber_headings">
<property name="text">
<string>Renumber sequences of &lt;h1&gt; or &lt;h2&gt; tags to prevent splitting</string>
</property>
</widget>
</item>
<item row="4" column="0" colspan="2">
<widget class="QCheckBox" name="opt_delete_blank_paragraphs">
<property name="text">
<string>Delete blank lines between paragraphs</string>
</property>
</widget>
</item>
<item row="5" column="0" colspan="3">
<widget class="QCheckBox" name="opt_format_scene_breaks">
<property name="text">
<string>Ensure scene breaks are consistently formatted</string>
</property>
</widget>
</item>
<item row="6" column="0" colspan="2">
<widget class="QCheckBox" name="opt_dehyphenate">
<property name="text">
<string>Remove unnecessary hyphens</string>
</property>
</widget>
</item>
<item row="7" column="0" colspan="2">
<widget class="QCheckBox" name="opt_italicize_common_cases">
<property name="text">
<string>Italicize common words and patterns</string>
</property>
</widget>
</item>
<item row="8" column="0" colspan="2">
<widget class="QCheckBox" name="opt_fix_indents">
<property name="text">
<string>Replace entity indents with CSS indents</string>
</property>
</widget>
</item>
<item row="9" column="0" colspan="2">
<spacer name="verticalSpacer">
<property name="orientation">
<enum>Qt::Vertical</enum>
</property>
<property name="sizeHint" stdset="0">
<size>
<width>131</width>
<height>35</height>
</size>
</property>
</spacer>
</item>
</layout>
</widget>
</item>
</layout>
</widget>
<resources/>
<connections>
<connection>
<sender>opt_enable_heuristics</sender>
<signal>toggled(bool)</signal>
<receiver>opt_html_unwrap_factor</receiver>
<slot>setEnabled(bool)</slot>
<hints>
<hint type="sourcelabel">
<x>328</x>
<y>87</y>
</hint>
<hint type="destinationlabel">
<x>481</x>
<y>113</y>
</hint>
</hints>
</connection>
<connection>
<sender>opt_enable_heuristics</sender>
<signal>toggled(bool)</signal>
<receiver>huf_label</receiver>
<slot>setEnabled(bool)</slot>
<hints>
<hint type="sourcelabel">
<x>295</x>
<y>88</y>
</hint>
<hint type="destinationlabel">
<x>291</x>
<y>105</y>
</hint>
</hints>
</connection>
</connections>
</ui>

View File

@ -6,8 +6,6 @@ __docformat__ = 'restructuredtext en'
from calibre.gui2.convert.pdb_output_ui import Ui_Form from calibre.gui2.convert.pdb_output_ui import Ui_Form
from calibre.gui2.convert import Widget from calibre.gui2.convert import Widget
from calibre.ebooks.pdb import FORMAT_WRITERS
from calibre.gui2.widgets import BasicComboModel
format_model = None format_model = None
@ -21,17 +19,8 @@ class PluginWidget(Widget, Ui_Form):
def __init__(self, parent, get_option, get_help, db=None, book_id=None): def __init__(self, parent, get_option, get_help, db=None, book_id=None):
Widget.__init__(self, parent, ['format', 'inline_toc', 'pdb_output_encoding']) Widget.__init__(self, parent, ['format', 'inline_toc', 'pdb_output_encoding'])
self.db, self.book_id = db, book_id self.db, self.book_id = db, book_id
for x in get_option('format').option.choices:
self.opt_format.addItem(x)
self.initialize_options(get_option, get_help, db, book_id) self.initialize_options(get_option, get_help, db, book_id)
default = self.opt_format.currentText()
global format_model
if format_model is None:
format_model = BasicComboModel(FORMAT_WRITERS.keys())
self.format_model = format_model
self.opt_format.setModel(self.format_model)
default_index = self.opt_format.findText(default)
format_index = self.opt_format.findText('doc')
self.opt_format.setCurrentIndex(default_index if default_index != -1 else format_index if format_index != -1 else 0)

View File

@ -6,8 +6,6 @@ __docformat__ = 'restructuredtext en'
from calibre.gui2.convert.pdf_output_ui import Ui_Form from calibre.gui2.convert.pdf_output_ui import Ui_Form
from calibre.gui2.convert import Widget from calibre.gui2.convert import Widget
from calibre.ebooks.pdf.pageoptions import PAPER_SIZES, ORIENTATIONS
from calibre.gui2.widgets import BasicComboModel
paper_size_model = None paper_size_model = None
orientation_model = None orientation_model = None
@ -23,28 +21,11 @@ class PluginWidget(Widget, Ui_Form):
Widget.__init__(self, parent, ['paper_size', Widget.__init__(self, parent, ['paper_size',
'orientation', 'preserve_cover_aspect_ratio']) 'orientation', 'preserve_cover_aspect_ratio'])
self.db, self.book_id = db, book_id self.db, self.book_id = db, book_id
for x in get_option('paper_size').option.choices:
self.opt_paper_size.addItem(x)
for x in get_option('orientation').option.choices:
self.opt_orientation.addItem(x)
self.initialize_options(get_option, get_help, db, book_id) self.initialize_options(get_option, get_help, db, book_id)
default_paper_size = self.opt_paper_size.currentText()
default_orientation = self.opt_orientation.currentText()
global paper_size_model
if paper_size_model is None:
paper_size_model = BasicComboModel(PAPER_SIZES.keys())
self.paper_size_model = paper_size_model
self.opt_paper_size.setModel(self.paper_size_model)
default_paper_size_index = self.opt_paper_size.findText(default_paper_size)
letter_index = self.opt_paper_size.findText('letter')
self.opt_paper_size.setCurrentIndex(default_paper_size_index if default_paper_size_index != -1 else letter_index if letter_index != -1 else 0)
global orientation_model
if orientation_model is None:
orientation_model = BasicComboModel(ORIENTATIONS.keys())
self.orientation_model = orientation_model
self.opt_orientation.setModel(self.orientation_model)
default_orientation_index = self.opt_orientation.findText(default_orientation)
orientation_index = self.opt_orientation.findText('portrait')
self.opt_orientation.setCurrentIndex(default_orientation_index if default_orientation_index != -1 else orientation_index if orientation_index != -1 else 0)

View File

@ -0,0 +1,54 @@
# -*- coding: utf-8 -*-
__license__ = 'GPL 3'
__copyright__ = '2011, John Schember <john@nachtimwald.com>'
__docformat__ = 'restructuredtext en'
import re
from calibre.gui2.convert.search_and_replace_ui import Ui_Form
from calibre.gui2.convert import Widget
from calibre.gui2 import error_dialog
class SearchAndReplaceWidget(Widget, Ui_Form):
TITLE = _('Search &\nReplace')
HELP = _('Modify the document text and structure using user defined patterns.')
COMMIT_NAME = 'search_and_replace'
def __init__(self, parent, get_option, get_help, db=None, book_id=None):
Widget.__init__(self, parent,
['sr1_search', 'sr1_replace',
'sr2_search', 'sr2_replace',
'sr3_search', 'sr3_replace']
)
self.db, self.book_id = db, book_id
self.initialize_options(get_option, get_help, db, book_id)
self.opt_sr1_search.set_msg(_('Search Regular Expression'))
self.opt_sr1_search.set_book_id(book_id)
self.opt_sr1_search.set_db(db)
self.opt_sr2_search.set_msg(_('Search Regular Expression'))
self.opt_sr2_search.set_book_id(book_id)
self.opt_sr2_search.set_db(db)
self.opt_sr3_search.set_msg(_('Search Regular Expression'))
self.opt_sr3_search.set_book_id(book_id)
self.opt_sr3_search.set_db(db)
def break_cycles(self):
Widget.break_cycles(self)
self.opt_sr1_search.break_cycles()
self.opt_sr2_search.break_cycles()
self.opt_sr3_search.break_cycles()
def pre_commit_check(self):
for x in ('sr1_search', 'sr2_search', 'sr3_search'):
x = getattr(self, 'opt_'+x)
try:
pat = unicode(x.regex)
re.compile(pat)
except Exception, err:
error_dialog(self, _('Invalid regular expression'),
_('Invalid regular expression: %s')%err).exec_()
return False
return True

View File

@ -0,0 +1,191 @@
<?xml version="1.0" encoding="UTF-8"?>
<ui version="4.0">
<class>Form</class>
<widget class="QWidget" name="Form">
<property name="geometry">
<rect>
<x>0</x>
<y>0</y>
<width>198</width>
<height>350</height>
</rect>
</property>
<property name="sizePolicy">
<sizepolicy hsizetype="Minimum" vsizetype="Preferred">
<horstretch>0</horstretch>
<verstretch>0</verstretch>
</sizepolicy>
</property>
<property name="windowTitle">
<string>Form</string>
</property>
<layout class="QGridLayout" name="gridLayout_4">
<property name="sizeConstraint">
<enum>QLayout::SetDefaultConstraint</enum>
</property>
<item row="0" column="0">
<widget class="QGroupBox" name="groupBox">
<property name="sizePolicy">
<sizepolicy hsizetype="Minimum" vsizetype="Preferred">
<horstretch>0</horstretch>
<verstretch>0</verstretch>
</sizepolicy>
</property>
<property name="title">
<string>1.</string>
</property>
<layout class="QGridLayout" name="gridLayout_2">
<property name="sizeConstraint">
<enum>QLayout::SetMinimumSize</enum>
</property>
<item row="0" column="0">
<widget class="RegexEdit" name="opt_sr1_search" native="true">
<property name="sizePolicy">
<sizepolicy hsizetype="Minimum" vsizetype="Preferred">
<horstretch>0</horstretch>
<verstretch>0</verstretch>
</sizepolicy>
</property>
</widget>
</item>
<item row="1" column="0">
<widget class="QLabel" name="label_4">
<property name="sizePolicy">
<sizepolicy hsizetype="Minimum" vsizetype="Preferred">
<horstretch>0</horstretch>
<verstretch>0</verstretch>
</sizepolicy>
</property>
<property name="text">
<string>Replacement Text</string>
</property>
</widget>
</item>
<item row="2" column="0">
<widget class="QLineEdit" name="opt_sr1_replace">
<property name="sizePolicy">
<sizepolicy hsizetype="Minimum" vsizetype="Fixed">
<horstretch>0</horstretch>
<verstretch>0</verstretch>
</sizepolicy>
</property>
</widget>
</item>
</layout>
</widget>
</item>
<item row="1" column="0">
<widget class="QGroupBox" name="groupBox_2">
<property name="sizePolicy">
<sizepolicy hsizetype="Minimum" vsizetype="Preferred">
<horstretch>0</horstretch>
<verstretch>0</verstretch>
</sizepolicy>
</property>
<property name="title">
<string>2.</string>
</property>
<layout class="QGridLayout" name="gridLayout">
<property name="sizeConstraint">
<enum>QLayout::SetMinimumSize</enum>
</property>
<item row="0" column="0">
<widget class="RegexEdit" name="opt_sr2_search" native="true">
<property name="sizePolicy">
<sizepolicy hsizetype="Minimum" vsizetype="Preferred">
<horstretch>0</horstretch>
<verstretch>0</verstretch>
</sizepolicy>
</property>
</widget>
</item>
<item row="1" column="0">
<widget class="QLabel" name="label_5">
<property name="sizePolicy">
<sizepolicy hsizetype="Minimum" vsizetype="Preferred">
<horstretch>0</horstretch>
<verstretch>0</verstretch>
</sizepolicy>
</property>
<property name="text">
<string>Replacement Text</string>
</property>
</widget>
</item>
<item row="2" column="0">
<widget class="QLineEdit" name="opt_sr2_replace">
<property name="sizePolicy">
<sizepolicy hsizetype="Minimum" vsizetype="Fixed">
<horstretch>0</horstretch>
<verstretch>0</verstretch>
</sizepolicy>
</property>
</widget>
</item>
</layout>
</widget>
</item>
<item row="2" column="0">
<widget class="QGroupBox" name="groupBox_3">
<property name="sizePolicy">
<sizepolicy hsizetype="Minimum" vsizetype="Preferred">
<horstretch>0</horstretch>
<verstretch>0</verstretch>
</sizepolicy>
</property>
<property name="title">
<string>3.</string>
</property>
<layout class="QGridLayout" name="gridLayout_3">
<property name="sizeConstraint">
<enum>QLayout::SetMinimumSize</enum>
</property>
<item row="0" column="0">
<widget class="RegexEdit" name="opt_sr3_search" native="true">
<property name="sizePolicy">
<sizepolicy hsizetype="Minimum" vsizetype="Preferred">
<horstretch>0</horstretch>
<verstretch>0</verstretch>
</sizepolicy>
</property>
</widget>
</item>
<item row="1" column="0">
<widget class="QLabel" name="label_6">
<property name="sizePolicy">
<sizepolicy hsizetype="Minimum" vsizetype="Preferred">
<horstretch>0</horstretch>
<verstretch>0</verstretch>
</sizepolicy>
</property>
<property name="text">
<string>Replacement Text</string>
</property>
</widget>
</item>
<item row="2" column="0">
<widget class="QLineEdit" name="opt_sr3_replace">
<property name="sizePolicy">
<sizepolicy hsizetype="Minimum" vsizetype="Fixed">
<horstretch>0</horstretch>
<verstretch>0</verstretch>
</sizepolicy>
</property>
</widget>
</item>
</layout>
</widget>
</item>
</layout>
</widget>
<customwidgets>
<customwidget>
<class>RegexEdit</class>
<extends>QWidget</extends>
<header>regex_builder.h</header>
<container>1</container>
</customwidget>
</customwidgets>
<resources/>
<connections/>
</ui>

View File

@ -16,6 +16,8 @@ from calibre.ebooks.conversion.config import GuiRecommendations, save_specifics,
from calibre.gui2.convert.single_ui import Ui_Dialog from calibre.gui2.convert.single_ui import Ui_Dialog
from calibre.gui2.convert.metadata import MetadataWidget from calibre.gui2.convert.metadata import MetadataWidget
from calibre.gui2.convert.look_and_feel import LookAndFeelWidget from calibre.gui2.convert.look_and_feel import LookAndFeelWidget
from calibre.gui2.convert.heuristics import HeuristicsWidget
from calibre.gui2.convert.search_and_replace import SearchAndReplaceWidget
from calibre.gui2.convert.page_setup import PageSetupWidget from calibre.gui2.convert.page_setup import PageSetupWidget
from calibre.gui2.convert.structure_detection import StructureDetectionWidget from calibre.gui2.convert.structure_detection import StructureDetectionWidget
from calibre.gui2.convert.toc import TOCWidget from calibre.gui2.convert.toc import TOCWidget
@ -170,6 +172,8 @@ class Config(ResizableDialog, Ui_Dialog):
self.mw = widget_factory(MetadataWidget) self.mw = widget_factory(MetadataWidget)
self.setWindowTitle(_('Convert')+ ' ' + unicode(self.mw.title.text())) self.setWindowTitle(_('Convert')+ ' ' + unicode(self.mw.title.text()))
lf = widget_factory(LookAndFeelWidget) lf = widget_factory(LookAndFeelWidget)
hw = widget_factory(HeuristicsWidget)
sr = widget_factory(SearchAndReplaceWidget)
ps = widget_factory(PageSetupWidget) ps = widget_factory(PageSetupWidget)
sd = widget_factory(StructureDetectionWidget) sd = widget_factory(StructureDetectionWidget)
toc = widget_factory(TOCWidget) toc = widget_factory(TOCWidget)
@ -203,7 +207,7 @@ class Config(ResizableDialog, Ui_Dialog):
if not c: break if not c: break
self.stack.removeWidget(c) self.stack.removeWidget(c)
widgets = [self.mw, lf, ps, sd, toc] widgets = [self.mw, lf, hw, sr, ps, sd, toc]
if input_widget is not None: if input_widget is not None:
widgets.append(input_widget) widgets.append(input_widget)
if output_widget is not None: if output_widget is not None:

View File

@ -6,8 +6,6 @@ __license__ = 'GPL v3'
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>' __copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en' __docformat__ = 'restructuredtext en'
import re
from calibre.gui2.convert.structure_detection_ui import Ui_Form from calibre.gui2.convert.structure_detection_ui import Ui_Form
from calibre.gui2.convert import Widget from calibre.gui2.convert import Widget
from calibre.gui2 import error_dialog from calibre.gui2 import error_dialog
@ -24,12 +22,8 @@ class StructureDetectionWidget(Widget, Ui_Form):
Widget.__init__(self, parent, Widget.__init__(self, parent,
['chapter', 'chapter_mark', ['chapter', 'chapter_mark',
'remove_first_image', 'remove_first_image',
'insert_metadata', 'page_breaks_before', 'insert_metadata', 'page_breaks_before']
'preprocess_html', 'remove_header', 'header_regex',
'remove_footer', 'footer_regex','html_unwrap_factor']
) )
self.opt_html_unwrap_factor.setEnabled(False)
self.huf_label.setEnabled(False)
self.db, self.book_id = db, book_id self.db, self.book_id = db, book_id
for x in ('pagebreak', 'rule', 'both', 'none'): for x in ('pagebreak', 'rule', 'both', 'none'):
self.opt_chapter_mark.addItem(x) self.opt_chapter_mark.addItem(x)
@ -37,28 +31,11 @@ class StructureDetectionWidget(Widget, Ui_Form):
self.opt_chapter.set_msg(_('Detect chapters at (XPath expression):')) self.opt_chapter.set_msg(_('Detect chapters at (XPath expression):'))
self.opt_page_breaks_before.set_msg(_('Insert page breaks before ' self.opt_page_breaks_before.set_msg(_('Insert page breaks before '
'(XPath expression):')) '(XPath expression):'))
self.opt_header_regex.set_msg(_('Header regular expression:'))
self.opt_header_regex.set_book_id(book_id)
self.opt_header_regex.set_db(db)
self.opt_footer_regex.set_msg(_('Footer regular expression:'))
self.opt_footer_regex.set_book_id(book_id)
self.opt_footer_regex.set_db(db)
def break_cycles(self): def break_cycles(self):
Widget.break_cycles(self) Widget.break_cycles(self)
self.opt_header_regex.break_cycles()
self.opt_footer_regex.break_cycles()
def pre_commit_check(self): def pre_commit_check(self):
for x in ('header_regex', 'footer_regex'):
x = getattr(self, 'opt_'+x)
try:
pat = unicode(x.regex)
re.compile(pat)
except Exception, err:
error_dialog(self, _('Invalid regular expression'),
_('Invalid regular expression: %s')%err).exec_()
return False
for x in ('chapter', 'page_breaks_before'): for x in ('chapter', 'page_breaks_before'):
x = getattr(self, 'opt_'+x) x = getattr(self, 'opt_'+x)
if not x.check(): if not x.check():
@ -66,8 +43,3 @@ class StructureDetectionWidget(Widget, Ui_Form):
_('The XPath expression %s is invalid.')%x.text).exec_() _('The XPath expression %s is invalid.')%x.text).exec_()
return False return False
return True return True
def set_value_handler(self, g, val):
if val is None and g is self.opt_html_unwrap_factor:
g.setValue(0.0)
return True

View File

@ -14,10 +14,10 @@
<string>Form</string> <string>Form</string>
</property> </property>
<layout class="QGridLayout" name="gridLayout"> <layout class="QGridLayout" name="gridLayout">
<item row="0" column="1" colspan="2"> <item row="0" column="0" colspan="3">
<widget class="XPathEdit" name="opt_chapter" native="true"/> <widget class="XPathEdit" name="opt_chapter" native="true"/>
</item> </item>
<item row="1" column="0" colspan="2"> <item row="1" column="0">
<widget class="QLabel" name="label"> <widget class="QLabel" name="label">
<property name="text"> <property name="text">
<string>Chapter &amp;mark:</string> <string>Chapter &amp;mark:</string>
@ -27,7 +27,7 @@
</property> </property>
</widget> </widget>
</item> </item>
<item row="1" column="2"> <item row="1" column="1">
<widget class="QComboBox" name="opt_chapter_mark"> <widget class="QComboBox" name="opt_chapter_mark">
<property name="minimumContentsLength"> <property name="minimumContentsLength">
<number>20</number> <number>20</number>
@ -41,17 +41,17 @@
</property> </property>
</widget> </widget>
</item> </item>
<item row="5" column="0" colspan="2"> <item row="3" column="0" colspan="2">
<widget class="QCheckBox" name="opt_insert_metadata"> <widget class="QCheckBox" name="opt_insert_metadata">
<property name="text"> <property name="text">
<string>Insert &amp;metadata as page at start of book</string> <string>Insert &amp;metadata as page at start of book</string>
</property> </property>
</widget> </widget>
</item> </item>
<item row="11" column="0" colspan="3"> <item row="5" column="0" colspan="3">
<widget class="XPathEdit" name="opt_page_breaks_before" native="true"/> <widget class="XPathEdit" name="opt_page_breaks_before" native="true"/>
</item> </item>
<item row="12" column="0" colspan="3"> <item row="6" column="0" colspan="3">
<spacer name="verticalSpacer"> <spacer name="verticalSpacer">
<property name="orientation"> <property name="orientation">
<enum>Qt::Vertical</enum> <enum>Qt::Vertical</enum>
@ -64,53 +64,7 @@
</property> </property>
</spacer> </spacer>
</item> </item>
<item row="8" column="0" colspan="2"> <item row="1" column="2">
<widget class="QCheckBox" name="opt_remove_footer">
<property name="text">
<string>Remove F&amp;ooter</string>
</property>
</widget>
</item>
<item row="6" column="0" colspan="2">
<widget class="QCheckBox" name="opt_remove_header">
<property name="text">
<string>Remove H&amp;eader</string>
</property>
</widget>
</item>
<item row="7" column="0" colspan="3">
<widget class="RegexEdit" name="opt_header_regex" native="true"/>
</item>
<item row="9" column="0" colspan="3">
<widget class="RegexEdit" name="opt_footer_regex" native="true"/>
</item>
<item row="4" column="1">
<widget class="QLabel" name="huf_label">
<property name="text">
<string>Line &amp;un-wrap factor during preprocess:</string>
</property>
<property name="buddy">
<cstring>opt_html_unwrap_factor</cstring>
</property>
</widget>
</item>
<item row="4" column="2">
<widget class="QDoubleSpinBox" name="opt_html_unwrap_factor">
<property name="toolTip">
<string/>
</property>
<property name="maximum">
<double>1.000000000000000</double>
</property>
<property name="singleStep">
<double>0.050000000000000</double>
</property>
<property name="value">
<double>0.400000000000000</double>
</property>
</widget>
</item>
<item row="4" column="0">
<spacer name="horizontalSpacer"> <spacer name="horizontalSpacer">
<property name="orientation"> <property name="orientation">
<enum>Qt::Horizontal</enum> <enum>Qt::Horizontal</enum>
@ -123,13 +77,6 @@
</property> </property>
</spacer> </spacer>
</item> </item>
<item row="3" column="0" colspan="2">
<widget class="QCheckBox" name="opt_preprocess_html">
<property name="text">
<string>&amp;Preprocess input file to possibly improve structure detection</string>
</property>
</widget>
</item>
</layout> </layout>
</widget> </widget>
<customwidgets> <customwidgets>
@ -139,46 +86,7 @@
<header>convert/xpath_wizard.h</header> <header>convert/xpath_wizard.h</header>
<container>1</container> <container>1</container>
</customwidget> </customwidget>
<customwidget>
<class>RegexEdit</class>
<extends>QWidget</extends>
<header>regex_builder.h</header>
<container>1</container>
</customwidget>
</customwidgets> </customwidgets>
<resources/> <resources/>
<connections> <connections/>
<connection>
<sender>opt_preprocess_html</sender>
<signal>toggled(bool)</signal>
<receiver>opt_html_unwrap_factor</receiver>
<slot>setEnabled(bool)</slot>
<hints>
<hint type="sourcelabel">
<x>328</x>
<y>87</y>
</hint>
<hint type="destinationlabel">
<x>481</x>
<y>113</y>
</hint>
</hints>
</connection>
<connection>
<sender>opt_preprocess_html</sender>
<signal>toggled(bool)</signal>
<receiver>huf_label</receiver>
<slot>setEnabled(bool)</slot>
<hints>
<hint type="sourcelabel">
<x>295</x>
<y>88</y>
</hint>
<hint type="destinationlabel">
<x>291</x>
<y>105</y>
</hint>
</hints>
</connection>
</connections>
</ui> </ui>

View File

@ -4,10 +4,10 @@ __license__ = 'GPL 3'
__copyright__ = '2009, John Schember <john@nachtimwald.com>' __copyright__ = '2009, John Schember <john@nachtimwald.com>'
__docformat__ = 'restructuredtext en' __docformat__ = 'restructuredtext en'
from PyQt4.Qt import Qt
from calibre.gui2.convert.txt_output_ui import Ui_Form from calibre.gui2.convert.txt_output_ui import Ui_Form
from calibre.gui2.convert import Widget from calibre.gui2.convert import Widget
from calibre.ebooks.txt.newlines import TxtNewlines
from calibre.gui2.widgets import BasicComboModel
newline_model = None newline_model = None
@ -24,16 +24,26 @@ class PluginWidget(Widget, Ui_Form):
'inline_toc', 'markdown_format', 'keep_links', 'keep_image_references', 'inline_toc', 'markdown_format', 'keep_links', 'keep_image_references',
'txt_output_encoding']) 'txt_output_encoding'])
self.db, self.book_id = db, book_id self.db, self.book_id = db, book_id
for x in get_option('newline').option.choices:
self.opt_newline.addItem(x)
self.initialize_options(get_option, get_help, db, book_id) self.initialize_options(get_option, get_help, db, book_id)
default = self.opt_newline.currentText() self.opt_markdown_format.stateChanged.connect(self.enable_markdown_format)
self.enable_markdown_format(self.opt_markdown_format.checkState())
global newline_model def break_cycles(self):
if newline_model is None: Widget.break_cycles(self)
newline_model = BasicComboModel(TxtNewlines.NEWLINE_TYPES.keys())
self.newline_model = newline_model try:
self.opt_newline.setModel(self.newline_model) self.opt_markdown_format.stateChanged.disconnect()
except:
pass
def enable_markdown_format(self, state):
if state == Qt.Checked:
state = True
else:
state = False
self.opt_keep_links.setEnabled(state)
self.opt_keep_image_references.setEnabled(state)
default_index = self.opt_newline.findText(default)
system_index = self.opt_newline.findText('system')
self.opt_newline.setCurrentIndex(default_index if default_index != -1 else system_index if system_index != -1 else 0)

View File

@ -6,8 +6,8 @@
<rect> <rect>
<x>0</x> <x>0</x>
<y>0</y> <y>0</y>
<width>422</width> <width>434</width>
<height>64</height> <height>74</height>
</rect> </rect>
</property> </property>
<property name="windowTitle"> <property name="windowTitle">
@ -53,13 +53,13 @@
<item row="0" column="1"> <item row="0" column="1">
<widget class="QToolButton" name="button"> <widget class="QToolButton" name="button">
<property name="toolTip"> <property name="toolTip">
<string>Use a wizard to help construct the XPath expression</string> <string>Use a wizard to help construct the Regular expression</string>
</property> </property>
<property name="text"> <property name="text">
<string>...</string> <string>...</string>
</property> </property>
<property name="icon"> <property name="icon">
<iconset resource="../../../../resources/images.qrc"> <iconset>
<normaloff>:/images/wizard.png</normaloff>:/images/wizard.png</iconset> <normaloff>:/images/wizard.png</normaloff>:/images/wizard.png</iconset>
</property> </property>
<property name="iconSize"> <property name="iconSize">
@ -70,19 +70,6 @@
</property> </property>
</widget> </widget>
</item> </item>
<item row="0" column="2">
<spacer name="horizontalSpacer">
<property name="orientation">
<enum>Qt::Horizontal</enum>
</property>
<property name="sizeHint" stdset="0">
<size>
<width>20</width>
<height>20</height>
</size>
</property>
</spacer>
</item>
</layout> </layout>
</widget> </widget>
<customwidgets> <customwidgets>

View File

@ -12,6 +12,8 @@ from calibre.ebooks.conversion.plumber import Plumber
from calibre.utils.logging import Log from calibre.utils.logging import Log
from calibre.gui2.preferences.conversion_ui import Ui_Form from calibre.gui2.preferences.conversion_ui import Ui_Form
from calibre.gui2.convert.look_and_feel import LookAndFeelWidget from calibre.gui2.convert.look_and_feel import LookAndFeelWidget
from calibre.gui2.convert.heuristics import HeuristicsWidget
from calibre.gui2.convert.search_and_replace import SearchAndReplaceWidget
from calibre.gui2.convert.page_setup import PageSetupWidget from calibre.gui2.convert.page_setup import PageSetupWidget
from calibre.gui2.convert.structure_detection import StructureDetectionWidget from calibre.gui2.convert.structure_detection import StructureDetectionWidget
from calibre.gui2.convert.toc import TOCWidget from calibre.gui2.convert.toc import TOCWidget
@ -82,7 +84,8 @@ class Base(ConfigWidgetBase, Ui_Form):
class CommonOptions(Base): class CommonOptions(Base):
def load_conversion_widgets(self): def load_conversion_widgets(self):
self.conversion_widgets = [LookAndFeelWidget, PageSetupWidget, self.conversion_widgets = [LookAndFeelWidget, HeuristicsWidget,
SearchAndReplaceWidget, PageSetupWidget,
StructureDetectionWidget, TOCWidget] StructureDetectionWidget, TOCWidget]
class InputOptions(Base): class InputOptions(Base):

View File

@ -311,32 +311,6 @@ class FontFamilyModel(QAbstractListModel):
def index_of(self, family): def index_of(self, family):
return self.families.index(family.strip()) return self.families.index(family.strip())
class BasicComboModel(QAbstractListModel):
def __init__(self, items, *args):
QAbstractListModel.__init__(self, *args)
self.items = [i for i in items]
self.items.sort()
def rowCount(self, *args):
return len(self.items)
def data(self, index, role):
try:
item = self.items[index.row()]
except:
traceback.print_exc()
return NONE
if role == Qt.DisplayRole:
return QVariant(item)
if role == Qt.FontRole:
return QVariant(QFont(item))
return NONE
def index_of(self, item):
return self.items.index(item.strip())
class BasicListItem(QListWidgetItem): class BasicListItem(QListWidgetItem):
def __init__(self, text, user_data=None): def __init__(self, text, user_data=None):

View File

@ -255,6 +255,98 @@ you are producing are meant for a particular device type, choose the correspondi
The Output profile also controls the screen size. This will cause, for example, images to be auto-resized to be fit to the screen in some output formats. So choose a profile of a device that has a screen size similar to your device. The Output profile also controls the screen size. This will cause, for example, images to be auto-resized to be fit to the screen in some output formats. So choose a profile of a device that has a screen size similar to your device.
.. _heuristic-processing:
Heuristic Processing
---------------------
Heuristic Processing provides a variety of functions that try to detect and correct
common problems in poorly formatted input documents. Use these functions if your input document suffers
from bad formatting. Because these functions rely on common patterns, be aware that in some cases an
option may lead to worse results, so use them with care. As an example, several of these options will
remove all non-breaking-space entities.
:guilabel:`Preprocess input`
This option activates |app|'s Heuristic Processing stage of the conversion pipeline.
It must be enabled in order for any of the sub-functions below to be applied.
:guilabel:`Unwrap lines`
Enabling this option will cause |app| to attempt to detect and correct hard line breaks that exist
within a document, using punctuation clues and line length. |app| will first attempt to detect whether
hard line breaks exist; if they do not appear to exist, |app| will not attempt to unwrap lines. The
line-unwrap factor can be reduced if you want to 'force' |app| to unwrap lines.
:guilabel:`Line-unwrap factor`
This option controls the algorithm |app| uses to remove hard line breaks. For example, if the value of this
option is 0.4, that means calibre will remove hard line breaks from the end of lines whose lengths are less
than the length of 40% of all lines in the document. If your document only has a few line breaks which need
correction, then this value should be reduced to somewhere between 0.1 and 0.2.
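A toy illustration of the punctuation clue (the function name is illustrative, not |app| API; the real algorithm also weighs the line-length statistics controlled by this factor)::

    import re

    def unwrap(text):
        # Join a hard-wrapped line onto the next one when it does not end in
        # sentence-terminating punctuation. Toy version only: the actual unwrap
        # also uses the line-length analysis controlled by the unwrap factor.
        return re.sub(r'(?<=[^\n.!?:"\'])\n(?=\S)', ' ', text)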
:guilabel:`Detect and markup unformatted chapter headings and sub headings`
If your document does not have chapter markers and titles formatted differently from the rest of the text,
|app| can use this option to attempt to detect them and surround them with heading tags. <h2> tags are used
for chapter headings; <h3> tags are used for any titles that are detected.
This function will not create a TOC, but in many cases it will cause |app|'s default chapter detection settings
to correctly detect chapters and build a TOC. Adjust the XPath under Structure Detection if a TOC is not automatically
created. If there are no other headings used in the document, then setting "//h:h2" under Structure Detection would
be the easiest way to create a TOC for the document.
The inserted headings are not formatted; to apply formatting, use the 'extra_css' option under
the Look and Feel conversion settings. For example, to center heading tags, use the following::
h2, h3 { text-align: center }
:guilabel:`Renumber sequences of <h1> or <h2> tags`
Some publishers format chapter headings using multiple <h1> or <h2> tags sequentially.
|app|'s default conversion settings will cause such titles to be split into two pieces. This option
will re-number the heading tags to prevent splitting.
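Under the hood this is a regular expression substitution; the pattern used by the heuristic processor, shown here as a standalone sketch, demotes the second of two consecutive top-level headings to an <h3>::

    import re

    doubleheading = re.compile(
        r'(?P<firsthead><h(1|2)[^>]*>.+?</h(1|2)>\s*(<(?!h\d)[^>]*>\s*)*)'
        r'<h(1|2)(?P<secondhead>[^>]*>.+?)</h(1|2)>', re.IGNORECASE)

    def renumber_headings(html):
        # Demote the second of two consecutive <h1>/<h2> headings to <h3>, so a
        # chapter number and its title are not split apart by chapter detection.
        return doubleheading.sub(
            r'\g<firsthead>' + '\n<h3' + r'\g<secondhead>' + '</h3>', html)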
:guilabel:`Delete blank lines between paragraphs`
This option will cause |app| to analyze blank lines included within the document. If every paragraph is interleaved
with a blank line, then |app| will remove all those blank paragraphs. Sequences of multiple blank lines will be
considered scene breaks and retained as a single paragraph. This option differs from the 'Remove Paragraph Spacing'
option under 'Look and Feel' in that it actually modifies the HTML content, while the other option modifies the document
styles. This option can also remove paragraphs which were inserted using |app|'s 'Insert blank line' option.
:guilabel:`Ensure scene breaks are consistently formatted`
With this option |app| will attempt to detect common scene-break markers and ensure that they are center aligned.
It also attempts to detect scene breaks defined by white space and replace them with a horizontal rule 15% of the
page width. Some readers may find this desirable as these 'soft' scene breaks often become page breaks on readers, and
thus become difficult to distinguish.
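As a simplified sketch of the first part (the actual substitution also copes with nested font, span and italic tags, and adds margins; the function name is illustrative), a paragraph consisting only of scene-break characters can be centred like this::

    import re

    def center_scene_breaks(html):
        # Centre paragraphs that contain nothing but scene-break characters,
        # e.g. '* * *'. Simplified: nested tags and margins are not handled here.
        return re.sub(r'<p[^>]*>\s*((?:[*#=]+\s*)+)</p>',
                      r'<p style="text-align:center">\1</p>', html)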
:guilabel:`Remove unnecessary hyphens`
|app| will analyze all hyphenated content in the document when this option is enabled. The document itself is used
as a dictionary for the analysis. This allows |app| to accurately remove hyphens from words in the document in any language,
along with made-up and obscure scientific terms. The primary drawback is that words appearing only a single time in the document
will not be changed. Analysis happens in two passes: the first pass analyzes line endings, and lines are only unwrapped if the
word exists with or without a hyphen elsewhere in the document. The second pass analyzes all hyphenated words throughout the document;
hyphens are removed if the word also occurs elsewhere in the document without the hyphen.
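A minimal sketch of the second pass, using the document itself as the dictionary (illustrative only, not the actual Dehyphenator)::

    import re

    def dehyphenate(text):
        # Collect every word form that occurs in the document, then drop the
        # hyphen from a hyphenated word only when the joined form also appears
        # somewhere in the document on its own.
        words = set(re.findall(r'[a-zA-Z]+', text.lower()))

        def join(match):
            joined = match.group(1) + match.group(2)
            return joined if joined.lower() in words else match.group(0)

        return re.sub(r'\b([a-zA-Z]+)-([a-zA-Z]+)\b', join, text)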
:guilabel:`Italicize common words and patterns`
When enabled, |app| will look for common words and patterns that denote italics and italicize them. Examples are common text
conventions such as ~word~ or phrases that should generally be italicized, e.g. Latin phrases like 'etc.' or 'et cetera'.
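For illustration, a few of the style patterns involved (adapted from the TXT heuristic processor that this option supersedes; the function name is illustrative)::

    import re

    ITALICIZE_STYLE_PATS = [
        r'(?msu)_(?P<words>.+?)_',    # _word_
        r'(?msu)\*(?P<words>.+?)\*',  # *word*
        r'(?msu)~(?P<words>.+?)~',    # ~word~
    ]

    def italicize(paragraph):
        # Wrap common plain-text emphasis conventions in <i> tags.
        for pat in ITALICIZE_STYLE_PATS:
            paragraph = re.sub(pat,
                lambda mo: '<i>%s</i>' % mo.group('words'), paragraph)
        return paragraph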
:guilabel:`Replace entity indents with CSS indents`
Some documents use a convention of defining text indents using non-breaking space entities. When this option is enabled, |app| will
attempt to detect this sort of formatting and convert it to a 3% text indent using CSS.
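For example, a paragraph that begins with a run of non-breaking-space entities might be rewritten along these lines (a simplified sketch; the function name is illustrative)::

    import re

    def fix_indents(html):
        # Replace a leading run of &nbsp; entities at the start of a simple
        # paragraph with a CSS text-indent. Simplified: real paragraphs may
        # carry attributes and use literal no-break space characters instead.
        return re.sub(r'<p>(?:&nbsp;\s*){2,}',
                      '<p style="text-indent:3%">', html)

    # fix_indents('<p>&nbsp;&nbsp;&nbsp;Some text</p>')
    # -> '<p style="text-indent:3%">Some text</p>'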
.. _search-replace:
Search & Replace
---------------------
These options are useful primarily for conversion of PDF documents. Often, the conversion leaves
behind page headers and footers in the text. These options use regular expressions to try and detect
the headers and footers and remove them. Remember that they operate on the intermediate XHTML produced
by the conversion pipeline. There is also a wizard to help you customize the regular expressions for
your document. These options can also be used for generic search and replace of any content by additionally
specifying a replacement expression.
The search works by using a Python regular expression. All matched text is simply removed from
the document or replaced using the replacement pattern. You can learn more about regular expressions and
their syntax at http://docs.python.org/library/re.html.
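In plain Python terms each search/replace pair behaves like a regular expression substitution over the intermediate XHTML; the pattern and function name below are only an illustration::

    import re

    def apply_search_replace(html, search, replace=''):
        # A search expression with an empty replacement simply removes the
        # matched text; a non-empty replacement rewrites it.
        return re.sub(search, replace, html)

    # Strip a recurring page header left behind by a PDF conversion
    cleaned = apply_search_replace(
        '<p>MY BOOK TITLE 17</p>\n<p>Real text.</p>',
        r'<p>MY BOOK TITLE \d+</p>\s*')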
.. _structure-detection: .. _structure-detection:
Structure Detection Structure Detection
@ -298,21 +390,6 @@ which means that |app| will insert page breaks before every `<h1>` and `<h2>` ta
The default expressions may change depending on the input format you are converting. The default expressions may change depending on the input format you are converting.
Removing headers and footers
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
These options are useful primarily for conversion of PDF documents. Often, the conversion leaves
behind page headers and footers in the text. These options use regular expressions to try and detect
the headers and footers and remove them. Remember that they operate on the intermediate XHTML produced
by the conversion pipeline. There is also a wizard to help you customize the regular expressions for
your document.
The header and footer regular expressions are used in conjunction with the remove header and footer options.
If the remove option is not enabled the regular expression will not be applied to remove the matched text.
The removal works by using a python regular expression. All matched text is simply removed from
the document. You can learn more about regular expressions and their syntax at
http://docs.python.org/library/re.html.
Miscellaneous Miscellaneous
~~~~~~~~~~~~~~ ~~~~~~~~~~~~~~
@ -330,16 +407,6 @@ There are a few more options in this section.
two covers. This option will simply remove the first image from the source document, thereby two covers. This option will simply remove the first image from the source document, thereby
ensuring that the converted book has only one cover, the one specified in |app|. ensuring that the converted book has only one cover, the one specified in |app|.
:guilabel:`Preprocess input`
This option activates various algorithms that try to detect and correct common cases of
badly formatted input documents. Things like hard line breaks, large blocks of text with no formatting, etc.
Turn this option on if your input document suffers from bad formatting. But be aware that in
some cases, this option can lead to worse results, so use with care.
:guilabel:`Line-unwrap factor`
This option control the algorithm |app| uses to remove hard line breaks. For example, if the value of this
option is 0.4, that means calibre will remove hard line breaks from the end of lines whose lengths are less
than the length of 40% of all lines in the document.
Table of Contents Table of Contents
------------------ ------------------
@ -488,26 +555,33 @@ at `mobileread <http://www.mobileread.com/forums/showthread.php?t=28313>`_.
Convert TXT documents Convert TXT documents
~~~~~~~~~~~~~~~~~~~~~~ ~~~~~~~~~~~~~~~~~~~~~~
TXT documents have no well defined way to specify formatting like bold, italics, etc, or document structure like paragraphs, headings, sections and so on. TXT documents have no well defined way to specify formatting like bold, italics, etc, or document
Since TXT documents provide no way to explicitly mark parts of structure like paragraphs, headings, sections and so on, but there are a variety of conventions commonly
the text, by default |app| only groups lines in the input document into paragraphs. The default is to assume one or used. By default |app| attempts automatic detection of the correct formatting and markup based on those
more blank lines are a paragraph boundary:: conventions.
TXT input supports a number of options to differentiate how paragraphs are detected.
:guilabel:`Paragraph Style: Auto`
Analyzes the text file and attempts to automatically determine how paragraphs are defined. This
option will generally work fine; if it produces undesirable results, try one of the manual options.
:guilabel:`Paragraph Style: Block`
Assumes one or more blank lines are a paragraph boundary::
This is the first. This is the first.
This is the This is the
second paragraph. second paragraph.
TXT input supports a number of options to differentiate how paragraphs are detected. :guilabel:`Paragraph Style: Single`
:guilabel:`Treat each line as a paragraph`
Assumes that every line is a paragraph:: Assumes that every line is a paragraph::
This is the first. This is the first.
This is the second. This is the second.
This is the third. This is the third.
:guilabel:`Assume print formatting` :guilabel:`Paragraph Style: Print`
Assumes that every paragraph starts with an indent (either a tab or 2+ spaces). Paragraphs end when Assumes that every paragraph starts with an indent (either a tab or 2+ spaces). Paragraphs end when
the next line that starts with an indent is reached:: the next line that starts with an indent is reached::
@ -518,13 +592,28 @@ TXT input supports a number of options to differentiate how paragraphs are detec
This is the This is the
third. third.
:guilabel:`Process using markdown` :guilabel:`Paragraph Style: Unformatted`
Assumes that the document has no formatting, but does use hard line breaks. Punctuation
and median line length are used to attempt to re-create paragraphs.
:guilabel:`Formatting Style: Auto`
Attempts to detect the type of formatting markup being used. If no markup is used, heuristic
formatting will be applied.
:guilabel:`Formatting Style: Heuristic`
Analyzes the document for common chapter headings, scene breaks, and italicized words, and applies the
appropriate html markup during conversion.
:guilabel:`Formatting Style: Markdown`
|app| also supports running TXT input through a transformation preprocessor known as markdown. Markdown |app| also supports running TXT input through a transformation preprocessor known as markdown. Markdown
allows for basic formatting to be added to TXT documents, such as bold, italics, section headings, tables, allows for basic formatting to be added to TXT documents, such as bold, italics, section headings, tables,
lists, a Table of Contents, etc. Marking chapter headings with a leading # and setting the chapter XPath detection lists, a Table of Contents, etc. Marking chapter headings with a leading # and setting the chapter XPath detection
expression to "//h:h1" is the easiest way to have a proper table of contents generated from a TXT document. expression to "//h:h1" is the easiest way to have a proper table of contents generated from a TXT document.
You can learn more about the markdown syntax at `daringfireball <http://daringfireball.net/projects/markdown/syntax>`_. You can learn more about the markdown syntax at `daringfireball <http://daringfireball.net/projects/markdown/syntax>`_.
:guilabel:`Formatting Style: None`
Applies no special formatting to the text; the document is converted to html with no other changes.
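As an illustration of the markdown formatting style described above, a TXT file marked up like the
following made-up sample is converted with `<h1>` chapter headings (produced by the leading #) as
well as italics and bold, which the "//h:h1" chapter detection expression can then pick up for the
Table of Contents::

    # Chapter One

    It was a dark and stormy night. The wind *howled* around the eaves.

    # Chapter Two

    The next morning dawned **bright** and clear.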
Convert PDF documents Convert PDF documents
~~~~~~~~~~~~~~~~~~~~~~~~~~~ ~~~~~~~~~~~~~~~~~~~~~~~~~~~
View File
@ -52,9 +52,10 @@ def is_date_undefined(qt_or_dt):
return True return True
if hasattr(d, 'toString'): if hasattr(d, 'toString'):
d = datetime(d.year(), d.month(), d.day(), tzinfo=utc_tz) d = datetime(d.year(), d.month(), d.day(), tzinfo=utc_tz)
return d.year <= UNDEFINED_DATE.year and \ return d.year < UNDEFINED_DATE.year or (
d.month == UNDEFINED_DATE.month and \ d.year == UNDEFINED_DATE.year and
d.day == UNDEFINED_DATE.day d.month == UNDEFINED_DATE.month and
d.day == UNDEFINED_DATE.day)
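# A standalone sketch (not part of this patch) of the behaviour change made
# above: the old predicate only reported a date as undefined when its month
# and day matched UNDEFINED_DATE, even if its year was earlier; the new
# predicate treats any date from an earlier year as undefined. The
# UNDEFINED_DATE value used here is a stand-in, chosen purely for illustration.
from datetime import datetime

UNDEFINED_DATE = datetime(101, 1, 1)  # stand-in value, for illustration only

def is_undefined_old(d):
    # old check: misses dates before the undefined year unless month/day match
    return (d.year <= UNDEFINED_DATE.year and
            d.month == UNDEFINED_DATE.month and
            d.day == UNDEFINED_DATE.day)

def is_undefined_new(d):
    # new check: any date earlier than the undefined year counts as undefined
    return d.year < UNDEFINED_DATE.year or (
        d.year == UNDEFINED_DATE.year and
        d.month == UNDEFINED_DATE.month and
        d.day == UNDEFINED_DATE.day)

d = datetime(100, 6, 15)    # earlier year, but month and day do not match
print(is_undefined_old(d))  # False -- slipped through the old check
print(is_undefined_new(d))  # True  -- caught by the new check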
def parse_date(date_string, assume_utc=False, as_utc=True, default=None): def parse_date(date_string, assume_utc=False, as_utc=True, default=None):
''' '''