From 80065cb443021536762bf0fdf8d479b1b06bbd0d Mon Sep 17 00:00:00 2001
From: ldolse <ldolse@yahoo.com>
Date: Wed, 19 Jan 2011 00:18:11 +0800
Subject: [PATCH] tweaked chapter_markup for false positives/negatives

---
 src/calibre/ebooks/conversion/utils.py | 10 ++++------
 src/calibre/manual/conversion.rst      | 12 ++++++++----
 2 files changed, 12 insertions(+), 10 deletions(-)
diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py
index bcc6f5a236..812a863717 100644
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@@ -174,7 +174,7 @@ class HeuristicProcessor(object):
             if wordcount > 200000:
                 typical_chapters = 15000.
             self.min_chapters = int(ceil(wordcount / typical_chapters))
-        print "minimum chapters required are: "+str(self.min_chapters)
+        self.log.debug("minimum chapters required are: "+str(self.min_chapters))
         heading = re.compile('<h[1-3][^>]*>', re.IGNORECASE)
         self.html_preprocess_sections = len(heading.findall(html))
         self.log.debug("found " + unicode(self.html_preprocess_sections) + " pre-existing headings")
@@ -208,12 +208,12 @@ class HeuristicProcessor(object):
         n_lookahead_close = ")"
 
         default_title = r"(<[ibu][^>]*>)?\s{0,3}(?!Chapter)([\w\:\'’\"-]+\s{0,3}){1,5}?(</[ibu][^>]*>)?(?=<)"
-        simple_title = r"(<[ibu][^>]*>)?\s{0,3}(?!Chapter).{0,50}?(</[ibu][^>]*>)?(?=<)"
+        simple_title = r"(<[ibu][^>]*>)?\s{0,3}(?!(Chapter|\s+<)).{0,65}?(</[ibu][^>]*>)?(?=<)"
 
         analysis_result = []
 
         chapter_types = [
-            [r"[^'\"]?(Introduction|Synopsis|Acknowledgements|Epilogue|CHAPTER|Kapitel|Volume\s|Prologue|Book\s|Part\s|Dedication|Preface)\s*([\d\w-]+\:?\'?\s*){0,5}", True, True, True, False, "Searching for common section headings", 'common'],
+            [r"[^'\"]?(Introduction|Synopsis|Acknowledgements|Epilogue|CHAPTER|Kapitel|Volume\b|Prologue|Book\b|Part\b|Dedication|Preface)\s*([\d\w-]+\:?\'?\s*){0,5}", True, True, True, False, "Searching for common section headings", 'common'],
             [r"[^'\"]?(CHAPTER|Kapitel)\s*([\dA-Z\-\'\"\?!#,]+\s*){0,7}\s*", True, True, True, False, "Searching for most common chapter headings", 'chapter'],  # Highest frequency headings which include titles
             [r"<b[^>]*>\s*(<span[^>]*>)?\s*(?!([*#•=]+\s*)+)(\s*(?=[\d.\w#\-*\s]+<)([\d.\w#-*]+\s*){1,5}\s*)(?!\.)(</span>)?\s*</b>", True, True, True, False, "Searching for emphasized lines", 'emphasized'], # Emphasized lines
             [r"[^'\"]?(\d+(\.|:))\s*([\dA-Z\-\'\"#,]+\s*){0,7}\s*", True, True, True, False, "Searching for numeric chapter headings", 'numeric'],  # Numeric Chapters
@@ -274,10 +274,9 @@ class HeuristicProcessor(object):
                             title_req = True
                             strict_title = False
                         self.log.debug(unicode(type_name)+" had "+unicode(hits)+" hits - "+unicode(self.chapters_no_title)+" chapters with no title, "+unicode(self.chapters_with_title)+" chapters with titles, "+unicode(float(self.chapters_with_title) / float(hits))+" percent. ")
-                        print "max chapters is "+str(self.max_chapters)
                         if type_name == 'common':
                             analysis_result.append([chapter_type, n_lookahead_req, strict_title, ignorecase, title_req, log_message, type_name])
-                        elif self.min_chapters <= hits < self.max_chapters:
+                        elif self.min_chapters <= hits < max_chapters:
                             analysis_result.append([chapter_type, n_lookahead_req, strict_title, ignorecase, title_req, log_message, type_name])
                             break
                 else:
@@ -423,7 +422,6 @@ class HeuristicProcessor(object):
         except:
             self.log.warn("Can't get wordcount")
 
-        print "found "+unicode(self.totalwords)+" words in the flow"
         if self.totalwords < 50:
             self.log.warn("flow is too short, not running heuristics")
             return html
diff --git a/src/calibre/manual/conversion.rst b/src/calibre/manual/conversion.rst
index 94a3a60721..e7c09a57a5 100644
--- a/src/calibre/manual/conversion.rst
+++ b/src/calibre/manual/conversion.rst
@@ -285,10 +285,14 @@ remove all non-breaking-space entities.
 :guilabel:`Detect and markup unformatted chapter headings and sub headings`
     If your document does not have Chapter Markers and titles formatted differently from the rest of the text,
     |app| can use this option to attempt detection them and surround them with heading tags. &lt;h2&gt; tags are used 
-    for chapter headings; &lt;h3&gt; tags are used for any titles that are detected.  This function will 
-    not create a TOC, but in many cases it will cause |app|'s default chapter detection settings to correctly
-    detect chapters and build a TOC.  Adjust the Xpath under Structure Detection if a TOC is not automatically
-    created.  The inserted heading tags are not formatted, to apply formatting use the 'extra_css' option under
+    for chapter headings; &lt;h3&gt; tags are used for any titles that are detected.  
+    
+    This function will not create a TOC, but in many cases it will cause |app|'s default chapter detection settings 
+    to correctly detect chapters and build a TOC.  Adjust the XPath under Structure Detection if a TOC is not automatically
+    created.  If there are no other headings used in the document, then setting "//h:h2" under Structure Detection would
+    be the easiest way to create a TOC for the document.
+    
+    The inserted headings are not formatted; to apply formatting, use the 'extra_css' option under
     the Look and Feel conversion settings.  For example, to center heading tags, use the following::
 
         h2, h3 { text-align: center }