From 2ebf94812e8ed82491b7579333f66cde7ce15096 Mon Sep 17 00:00:00 2001 From: ldolse Date: Mon, 17 Jan 2011 17:13:19 +0800 Subject: [PATCH 1/4] started updating manuals, fix lit postprocess to handle content in pre tags correctly --- src/calibre/ebooks/conversion/plumber.py | 6 ++- src/calibre/ebooks/lit/input.py | 9 ++-- src/calibre/manual/conversion.rst | 59 ++++++++++++++++++++---- 3 files changed, 59 insertions(+), 15 deletions(-) diff --git a/src/calibre/ebooks/conversion/plumber.py b/src/calibre/ebooks/conversion/plumber.py index 249f848661..6fdf7ddc68 100644 --- a/src/calibre/ebooks/conversion/plumber.py +++ b/src/calibre/ebooks/conversion/plumber.py @@ -492,7 +492,9 @@ OptionRecommendation(name='enable_heuristics', OptionRecommendation(name='markup_chapter_headings', recommended_value=False, level=OptionRecommendation.LOW, help=_('Detect unformatted chapter headings and sub headings. Change ' - 'them to h2 and h3 tags.')), + 'them to h2 and h3 tags. This setting will not create a TOC, ' + 'but can be used in conjunction with structure detection to create ' + 'one.')), OptionRecommendation(name='italicize_common_cases', recommended_value=False, level=OptionRecommendation.LOW, @@ -501,7 +503,7 @@ OptionRecommendation(name='italicize_common_cases', OptionRecommendation(name='fix_indents', recommended_value=False, level=OptionRecommendation.LOW, - help=_('Turn indentation created from multiple   entities ' + help=_('Turn indentation created from multiple non-breaking space entities ' 'into CSS indents.')), OptionRecommendation(name='html_unwrap_factor', diff --git a/src/calibre/ebooks/lit/input.py b/src/calibre/ebooks/lit/input.py index 7b822b68a6..ff8955939e 100644 --- a/src/calibre/ebooks/lit/input.py +++ b/src/calibre/ebooks/lit/input.py @@ -22,7 +22,7 @@ class LITInput(InputFormatPlugin): from calibre.ebooks.lit.reader import LitReader from calibre.ebooks.conversion.plumber import create_oebbook self.log = log - return create_oebbook(log, stream, options, self, reader=LitReader) + return create_oebbook(log, stream, options, reader=LitReader) def postprocess_book(self, oeb, opts, log): from calibre.ebooks.oeb.base import XHTML_NS, XPath, XHTML @@ -39,10 +39,13 @@ class LITInput(InputFormatPlugin): body = body[0] if len(body) == 1 and body[0].tag == XHTML('pre'): pre = body[0] - from calibre.ebooks.txt.processor import convert_basic + from calibre.ebooks.txt.processor import convert_basic, preserve_spaces, \ + separate_paragraphs_single_line from lxml import etree import copy - html = convert_basic(pre.text).replace('', + html = separate_paragraphs_single_line(pre.text) + html = preserve_spaces(html) + html = convert_basic(html).replace('', ''%XHTML_NS) root = etree.fromstring(html) body = XPath('//h:body')(root) diff --git a/src/calibre/manual/conversion.rst b/src/calibre/manual/conversion.rst index 4b2b169d72..3383708b72 100644 --- a/src/calibre/manual/conversion.rst +++ b/src/calibre/manual/conversion.rst @@ -255,6 +255,46 @@ you are producing are meant for a particular device type, choose the correspondi The Output profile also controls the screen size. This will cause, for example, images to be auto-resized to be fit to the screen in some output formats. So choose a profile of a device that has a screen size similar to your device. +.. _heuristic-processing: + +Heuristic Processing +--------------------- + +:guilabel:`Preprocess input` + This option activates various algorithms that try to detect and correct common cases of + badly formatted input documents. 
Things like hard line breaks, large blocks of text with no formatting, etc. + Turn this option on if your input document suffers from bad formatting. But be aware that in + some cases, this option can lead to worse results, so use with care. + +:guilabel:`Line-unwrap factor` + This option control the algorithm |app| uses to remove hard line breaks. For example, if the value of this + option is 0.4, that means calibre will remove hard line breaks from the end of lines whose lengths are less + than the length of 40% of all lines in the document. + +:guilabel:`Unwrap lines` + Lorem ipsum + +:guilabel:`Detect and markup unformatted chapter headings and sub headings` + Lorem ipsum + +:guilabel:`Renumber sequences of <h1> or <h2> tags to prevent splitting` + Lorem ipsum + +:guilabel:`Delete blank lines between paragraphs` + Lorem ipsum + +:guilabel:`Ensure scene breaks are consistently formatted` + Lorem ipsum + +:guilabel:`Remove unnecessary hyphens` + Lorem ipsum + +:guilabel:`Italicize common words and patterns` + Lorem ipsum + +:guilabel:`Replace entity indents with CSS indents` + Lorem ipsum + .. _structure-detection: Structure Detection @@ -330,16 +370,6 @@ There are a few more options in this section. two covers. This option will simply remove the first image from the source document, thereby ensuring that the converted book has only one cover, the one specified in |app|. -:guilabel:`Preprocess input` - This option activates various algorithms that try to detect and correct common cases of - badly formatted input documents. Things like hard line breaks, large blocks of text with no formatting, etc. - Turn this option on if your input document suffers from bad formatting. But be aware that in - some cases, this option can lead to worse results, so use with care. - -:guilabel:`Line-unwrap factor` - This option control the algorithm |app| uses to remove hard line breaks. For example, if the value of this - option is 0.4, that means calibre will remove hard line breaks from the end of lines whose lengths are less - than the length of 40% of all lines in the document. Table of Contents ------------------ @@ -500,6 +530,9 @@ more blank lines are a paragraph boundary:: TXT input supports a number of options to differentiate how paragraphs are detected. + :guilabel:`Auto` + Analyzes the text file and attempts to determine how paragraphs are defined. + :guilabel:`Treat each line as a paragraph` Assumes that every line is a paragraph:: @@ -518,6 +551,12 @@ TXT input supports a number of options to differentiate how paragraphs are detec This is the third. + :guilabel:`Unformatted` + Assumes that the document has no formatting, but does use hard line breaks. Punctuation + and median line length are used to attempt to re-create paragraphs. + + :guilabel:`Process using Textile` + :guilabel:`Process using markdown` |app| also supports running TXT input though a transformation preprocessor known as markdown. 
Markdown allows for basic formatting to be added to TXT documents, such as bold, italics, section headings, tables, From 05730e1886c8562e819364c43a7fa58c172392d6 Mon Sep 17 00:00:00 2001 From: ldolse Date: Mon, 17 Jan 2011 19:00:14 +0800 Subject: [PATCH 2/4] insert horizontal rules for softbreaks when option is enabled --- src/calibre/ebooks/conversion/utils.py | 5 +++++ src/calibre/manual/conversion.rst | 15 +++++++++------ 2 files changed, 14 insertions(+), 6 deletions(-) diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py index 15522d25e6..d9e5246223 100644 --- a/src/calibre/ebooks/conversion/utils.py +++ b/src/calibre/ebooks/conversion/utils.py @@ -23,6 +23,7 @@ class HeuristicProcessor(object): self.min_chapters = 1 self.chapters_no_title = 0 self.chapters_with_title = 0 + self.blanks_deleted = False self.linereg = re.compile('(?<=)', re.IGNORECASE|re.DOTALL) self.blankreg = re.compile(r'\s*(?P]*>)\s*(?P

)', re.IGNORECASE)
         self.multi_blank = re.compile(r'(\s*]*>\s*

){2,}', re.IGNORECASE)
@@ -422,6 +423,7 @@ class HeuristicProcessor(object):
         # blank paragraphs then delete blank lines to clean up spacing
         if blanks_between_paragraphs and getattr(self.extra_opts, 'delete_blank_paragraphs', False):
             self.log.debug("deleting blank lines")
+            self.blanks_deleted = True
             html = self.multi_blank.sub('\n

', html)
             html = self.blankreg.sub('', html)
@@ -479,6 +481,9 @@ class HeuristicProcessor(object):
         if getattr(self.extra_opts, 'format_scene_breaks', False):
             # Center separator lines
             html = re.sub(u'<(?Pp|div)[^>]*>\s*(<(?Pfont|span|[ibu])[^>]*>)?\s*(<(?Pfont|span|[ibu])[^>]*>)?\s*(<(?Pfont|span|[ibu])[^>]*>)?\s*(?P([*#•]+\s*)+)\s*()?\s*()?\s*()?\s*', '

' + '\g' + '

', html)
+            if not self.blanks_deleted:
+                html = self.multi_blank.sub('\n

', html)
+            html = re.sub(']*>\s*

', '

', html) if self.deleted_nbsps: # put back non-breaking spaces in empty paragraphs to preserve original formatting diff --git a/src/calibre/manual/conversion.rst b/src/calibre/manual/conversion.rst index 3383708b72..96a8e30e3c 100644 --- a/src/calibre/manual/conversion.rst +++ b/src/calibre/manual/conversion.rst @@ -530,17 +530,18 @@ more blank lines are a paragraph boundary:: TXT input supports a number of options to differentiate how paragraphs are detected. - :guilabel:`Auto` - Analyzes the text file and attempts to determine how paragraphs are defined. + :guilabel:`Paragraph Style: Auto` + Analyzes the text file and attempts to automatically determine how paragraphs are defined. This + option will generally work fine, if you achieve undesirable results try one of the manual options. - :guilabel:`Treat each line as a paragraph` + :guilabel:`Paragraph Style: Single` Assumes that every line is a paragraph:: This is the first. This is the second. This is the third. - :guilabel:`Assume print formatting` + :guilabel:`Paragraph Style: Print` Assumes that every paragraph starts with an indent (either a tab or 2+ spaces). Paragraphs end when the next line that starts with an indent is reached:: @@ -551,11 +552,13 @@ TXT input supports a number of options to differentiate how paragraphs are detec This is the third. - :guilabel:`Unformatted` + :guilabel:`Paragraph Style: Unformatted` Assumes that the document has no formatting, but does use hard line breaks. Punctuation and median line length are used to attempt to re-create paragraphs. - :guilabel:`Process using Textile` + :guilabel:`Formatting Style: Auto` + + :guilabel:`Formatting Style: Heuristic` :guilabel:`Process using markdown` |app| also supports running TXT input though a transformation preprocessor known as markdown. Markdown From a0aa719bb0b8de97a12c96c41a4bff70f656b213 Mon Sep 17 00:00:00 2001 From: ldolse Date: Mon, 17 Jan 2011 23:53:27 +0800 Subject: [PATCH 3/4] implemented multi-pass analysis for chapter detection --- src/calibre/ebooks/conversion/utils.py | 93 +++++++++++++++++++------- 1 file changed, 68 insertions(+), 25 deletions(-) diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py index d9e5246223..1a691b2e14 100644 --- a/src/calibre/ebooks/conversion/utils.py +++ b/src/calibre/ebooks/conversion/utils.py @@ -21,6 +21,7 @@ class HeuristicProcessor(object): self.deleted_nbsps = False self.totalwords = 0 self.min_chapters = 1 + self.max_chapters = 150 self.chapters_no_title = 0 self.chapters_with_title = 0 self.blanks_deleted = False @@ -132,7 +133,7 @@ class HeuristicProcessor(object): def markup_italicis(self, html): ITALICIZE_WORDS = [ 'Etc.', 'etc.', 'viz.', 'ie.', 'i.e.', 'Ie.', 'I.e.', 'eg.', - 'e.g.', 'Eg.', 'E.g.', 'et al.', 'et cetra', 'n.b.', 'N.b.', + 'e.g.', 'Eg.', 'E.g.', 'et al.', 'et cetera', 'n.b.', 'N.b.', 'nota bene', 'Nota bene', 'Ste.', 'Mme.', 'Mdme.', 'Mlle.', 'Mons.', 'PS.', 'PPS.', ] @@ -166,9 +167,11 @@ class HeuristicProcessor(object): with minimum false positives. Exits after finding a successful pattern ''' # Typical chapters are between 2000 and 7000 words, use the larger number to decide the - # minimum of chapters to search for + # minimum of chapters to search for. 
A max limit is calculated to prevent things like OCR + # or pdf page numbers from being treated as TOC markers if wordcount > 7000: self.min_chapters = int(ceil(wordcount / 7000.)) + self.max_chapters = int(ceil(wordcount / 100.)) #print "minimum chapters required are: "+str(self.min_chapters) heading = re.compile(']*>', re.IGNORECASE) self.html_preprocess_sections = len(heading.findall(html)) @@ -202,44 +205,84 @@ class HeuristicProcessor(object): n_lookahead_open = "\s+(?!" n_lookahead_close = ")" - default_title = r"(<[ibu][^>]*>)?\s{0,3}([\w\'’\"-]+\s{0,3}){1,5}?(]*>)?(?=<)" + default_title = r"(<[ibu][^>]*>)?\s{0,3}(?!Chapter)([\w\'’\"-]+\s{0,3}){1,5}?(]*>)?(?=<)" + simple_title = r"(<[ibu][^>]*>)?\s{0,3}(?!Chapter).{0,50}?(]*>)?(?=<)" + + analysis_result = [] chapter_types = [ - [r"[^'\"]?(CHAPTER|Kapitel)\s*([\dA-Z\-\'\"\?!#,]+\s*){0,7}\s*", True, "Searching for most common chapter headings", 'chapter'], # Highest frequency headings which include titles - [r"[^'\"]?(Introduction|Synopsis|Acknowledgements|Epilogue|Volume\s|Prologue|Book\s|Part\s|Dedication|Preface)\s*([\d\w-]+\:?\'?\s*){0,5}", True, "Searching for common section headings", 'common'], - [r"]*>\s*(]*>)?\s*(?!([*#•=]+\s*)+)(\s*(?=[\d.\w#\-*\s]+<)([\d.\w#-*]+\s*){1,5}\s*)(?!\.)()?\s*", True, "Searching for emphasized lines", 'emphasized'], # Emphasized lines - [r"[^'\"]?(\d+(\.|:))\s*([\dA-Z\-\'\"#,]+\s*){0,7}\s*", True, "Searching for numeric chapter headings", 'numeric'], # Numeric Chapters - [r"([A-Z]\s+){3,}\s*([\d\w-]+\s*){0,3}\s*", True, "Searching for letter spaced headings", 'letter_spaced'], # Spaced Lettering - [r"[^'\"]?(\d+\.?\s+([\d\w-]+\:?\'?-?\s?){0,5})\s*", True, "Searching for numeric chapters with titles", 'numeric_title'], # Numeric Titles - [r"[^'\"]?(\d+)\s*([\dA-Z\-\'\"\?!#,]+\s*){0,7}\s*", True, "Searching for simple numeric headings", 'plain_number'], # Numeric Chapters, no dot or colon - [r"\s*[^'\"]?([A-Z#]+(\s|-){0,3}){1,5}\s*", False, "Searching for chapters with Uppercase Characters", 'uppercase' ] # Uppercase Chapters + [r"[^'\"]?(Introduction|Synopsis|Acknowledgements|Epilogue|CHAPTER|Kapitel|Volume\s|Prologue|Book\s|Part\s|Dedication|Preface)\s*([\d\w-]+\:?\'?\s*){0,5}", True, True, True, False, "Searching for common section headings", 'common'], + [r"[^'\"]?(CHAPTER|Kapitel)\s*([\dA-Z\-\'\"\?!#,]+\s*){0,7}\s*", True, True, True, False, "Searching for most common chapter headings", 'chapter'], # Highest frequency headings which include titles + [r"]*>\s*(]*>)?\s*(?!([*#•=]+\s*)+)(\s*(?=[\d.\w#\-*\s]+<)([\d.\w#-*]+\s*){1,5}\s*)(?!\.)()?\s*", True, True, True, False, "Searching for emphasized lines", 'emphasized'], # Emphasized lines + [r"[^'\"]?(\d+(\.|:))\s*([\dA-Z\-\'\"#,]+\s*){0,7}\s*", True, True, True, False, "Searching for numeric chapter headings", 'numeric'], # Numeric Chapters + [r"([A-Z]\s+){3,}\s*([\d\w-]+\s*){0,3}\s*", True, True, True, False, "Searching for letter spaced headings", 'letter_spaced'], # Spaced Lettering + [r"[^'\"]?(\d+\.?\s+([\d\w-]+\:?\'?-?\s?){0,5})\s*", True, True, True, False, "Searching for numeric chapters with titles", 'numeric_title'], # Numeric Titles + [r"[^'\"]?(\d+)\s*([\dA-Z\-\'\"\?!#,]+\s*){0,7}\s*", True, True, True, False, "Searching for simple numeric headings", 'plain_number'], # Numeric Chapters, no dot or colon + [r"\s*[^'\"]?([A-Z#]+(\s|-){0,3}){1,5}\s*", False, True, False, False, "Searching for chapters with Uppercase Characters", 'uppercase' ] # Uppercase Chapters ] def recurse_patterns(html, analyze): # Start with most typical chapter 
headings, get more aggressive until one works - for [chapter_type, lookahead_ignorecase, log_message, type_name] in chapter_types: + for [chapter_type, n_lookahead_req, strict_title, ignorecase, title_req, log_message, type_name] in chapter_types: + n_lookahead = '' + hits = 0 + self.chapters_no_title = 0 + self.chapters_with_title = 0 + + if n_lookahead_req: + lp_n_lookahead_open = n_lookahead_open + lp_n_lookahead_close = n_lookahead_close + else: + lp_n_lookahead_open = '' + lp_n_lookahead_close = '' + + if strict_title: + lp_title = default_title + else: + lp_title = simple_title + + if ignorecase: + arg_ignorecase = r'(?i)' + else: + arg_ignorecase = '' + + if title_req: + lp_opt_title_open = '' + lp_opt_title_close = '' + else: + lp_opt_title_open = opt_title_open + lp_opt_title_close = opt_title_close + if self.html_preprocess_sections >= self.min_chapters: break full_chapter_line = chapter_line_open+chapter_header_open+chapter_type+chapter_header_close+chapter_line_close - n_lookahead = re.sub("(ou|in|cha)", "lookahead_", full_chapter_line) - self.log.debug("Marked " + unicode(self.html_preprocess_sections) + " headings, " + log_message) - if lookahead_ignorecase: - chapter_marker = init_lookahead+full_chapter_line+blank_lines+n_lookahead_open+n_lookahead+n_lookahead_close+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close - chapdetect = re.compile(r'%s' % chapter_marker, re.IGNORECASE) - else: - chapter_marker = init_lookahead+full_chapter_line+blank_lines+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close+n_lookahead_open+n_lookahead+n_lookahead_close - chapdetect = re.compile(r'%s' % chapter_marker, re.UNICODE) + if n_lookahead_req: + n_lookahead = re.sub("(ou|in|cha)", "lookahead_", full_chapter_line) + if not analyze: + self.log.debug("Marked " + unicode(self.html_preprocess_sections) + " headings, " + log_message) + + chapter_marker = arg_ignorecase+init_lookahead+full_chapter_line+blank_lines+lp_n_lookahead_open+n_lookahead+lp_n_lookahead_close+lp_opt_title_open+title_line_open+title_header_open+lp_title+title_header_close+title_line_close+lp_opt_title_close + chapdetect = re.compile(r'%s' % chapter_marker) + if analyze: hits = len(chapdetect.findall(html)) - print unicode(type_name)+" had "+unicode(hits)+" hits" - chapdetect.sub(self.analyze_title_matches, html) - print unicode(self.chapters_no_title)+" chapters with no title" - print unicode(self.chapters_with_title)+" chapters with titles" + if hits: + chapdetect.sub(self.analyze_title_matches, html) + if float(self.chapters_with_title) / float(hits) > .5: + title_req = True + strict_title = False + self.log.debug(unicode(type_name)+" had "+unicode(hits)+" hits - "+unicode(self.chapters_no_title)+" chapters with no title, "+unicode(self.chapters_with_title)+" chapters with titles, "+unicode(float(self.chapters_with_title) / float(hits))+" percent. 
") + if type_name == 'common': + analysis_result.append([chapter_type, n_lookahead_req, strict_title, ignorecase, title_req, log_message, type_name]) + elif self.min_chapters <= hits < self.max_chapters: + analysis_result.append([chapter_type, n_lookahead_req, strict_title, ignorecase, title_req, log_message, type_name]) + break else: html = chapdetect.sub(self.chapter_head, html) return html recurse_patterns(html, True) + chapter_types = analysis_result html = recurse_patterns(html, False) words_per_chptr = wordcount @@ -293,7 +336,7 @@ class HeuristicProcessor(object): pre = re.compile(r'
<pre>', re.IGNORECASE)
         if len(pre.findall(html)) >= 1:
             self.log.debug("Running Text Processing")
-            outerhtml = re.compile(r'.*?(?<=<pre>)(?P<text>.*)(?=</pre>).*', re.IGNORECASE|re.DOTALL)
+            outerhtml = re.compile(r'.*?(?<=<pre>)(?P<text>.*?)</pre>
', re.IGNORECASE|re.DOTALL) html = outerhtml.sub(self.txt_process, html) else: # Add markup naively From 3ca18da2cfc48a1ce3a201245eeee8ed005f0541 Mon Sep 17 00:00:00 2001 From: ldolse Date: Tue, 18 Jan 2011 01:17:42 +0800 Subject: [PATCH 4/4] fix pdf preprocess call --- src/calibre/ebooks/conversion/preprocess.py | 13 +++---------- src/calibre/ebooks/conversion/utils.py | 7 ++++--- 2 files changed, 7 insertions(+), 13 deletions(-) diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py index d2bdba4928..54639df93c 100644 --- a/src/calibre/ebooks/conversion/preprocess.py +++ b/src/calibre/ebooks/conversion/preprocess.py @@ -473,12 +473,6 @@ class HTMLPreProcessor(object): # unwrap/delete soft hyphens with formatting end_rules.append((re.compile(u'[­]\s*()+(

\s*

\s*)+\s*(<(i|u|b)>)+\s*(?=[[a-z\d])'), lambda match: '')) - # Make the more aggressive chapter marking regex optional with the preprocess option to - # reduce false positives and move after header/footer removal - if getattr(self.extra_opts, 'preprocess_html', None): - if is_pdftohtml: - end_rules.append((re.compile(r'

\s*(?P<chap>(<[ibu]>){0,2}\s*([A-Z \'"!]{3,})\s*([\dA-Z:]+\s){0,4}\s*(</[ibu]>){0,2})\s*

\s*(?P(<[ibu]>){0,2}(\s*\w+){1,4}\s*(</[ibu]>){0,2}\s*<p>)?'), chap_head),) - length = -1 if getattr(self.extra_opts, 'unwrap_factor', 0.0) > 0.01: docanalysis = DocAnalysis('pdf', html) @@ -525,11 +519,10 @@ class HTMLPreProcessor(object): html = dehyphenator(html,'html', length) if is_pdftohtml: - from calibre.ebooks.conversion.utils import PreProcessor - pdf_markup = PreProcessor(self.extra_opts, None) + from calibre.ebooks.conversion.utils import HeuristicProcessor + pdf_markup = HeuristicProcessor(self.extra_opts, None) totalwords = 0 - totalwords = pdf_markup.get_word_count(html) - if totalwords > 7000: + if pdf_markup.get_word_count(html) > 7000: html = pdf_markup.markup_chapters(html, totalwords, True) #dump(html, 'post-preprocess') diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py index 1a691b2e14..888d24d791 100644 --- a/src/calibre/ebooks/conversion/utils.py +++ b/src/calibre/ebooks/conversion/utils.py @@ -170,9 +170,9 @@ class HeuristicProcessor(object): # minimum of chapters to search for. A max limit is calculated to prevent things like OCR # or pdf page numbers from being treated as TOC markers if wordcount > 7000: - self.min_chapters = int(ceil(wordcount / 7000.)) - self.max_chapters = int(ceil(wordcount / 100.)) - #print "minimum chapters required are: "+str(self.min_chapters) + self.min_chapters = int(ceil(wordcount / 15000.)) + self.max_chapters = int(ceil(wordcount / 1200.)) + print "minimum chapters required are: "+str(self.min_chapters) heading = re.compile('<h[1-3][^>]*>', re.IGNORECASE) self.html_preprocess_sections = len(heading.findall(html)) self.log.debug("found " + unicode(self.html_preprocess_sections) + " pre-existing headings") @@ -272,6 +272,7 @@ class HeuristicProcessor(object): title_req = True strict_title = False self.log.debug(unicode(type_name)+" had "+unicode(hits)+" hits - "+unicode(self.chapters_no_title)+" chapters with no title, "+unicode(self.chapters_with_title)+" chapters with titles, "+unicode(float(self.chapters_with_title) / float(hits))+" percent. ") + print "max chapters is "+str(self.max_chapters) if type_name == 'common': analysis_result.append([chapter_type, n_lookahead_req, strict_title, ignorecase, title_req, log_message, type_name]) elif self.min_chapters <= hits < self.max_chapters: