From 6e64f5ec4e0cb66cc0bf722ecf050f72fb9120dc Mon Sep 17 00:00:00 2001
From: ldolse <ldolse@yahoo.com>
Date: Mon, 24 Jan 2011 12:20:50 +0800
Subject: [PATCH 1/6] doc tweaks, delete empty paragraphs during Heuristics

---
 src/calibre/ebooks/conversion/utils.py | 2 ++
 src/calibre/manual/conversion.rst      | 8 ++++----
 2 files changed, 6 insertions(+), 4 deletions(-)
diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py
index aabb1b8bc4..14eca46b07 100644
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@@ -367,6 +367,8 @@ class HeuristicProcessor(object):
         html = re.sub(ur'\s*<o:p>\s*</o:p>', ' ', html)
         # Delete microsoft 'smart' tags
         html = re.sub('(?i)</?st1:\w+>', '', html)
+        # Delete self closing paragraph tags
+        html = re.sub('<p\s?/>', '', html)
         # Get rid of empty span, bold, font, em, & italics tags
         html = re.sub(r"\s*<span[^>]*>\s*(<span[^>]*>\s*</span>){0,2}\s*</span>\s*", " ", html)
         html = re.sub(r"\s*<(font|[ibu]|em)[^>]*>\s*(<(font|[ibu]|em)[^>]*>\s*</(font|[ibu]|em)>\s*){0,2}\s*</(font|[ibu]|em)>", " ", html)
diff --git a/src/calibre/manual/conversion.rst b/src/calibre/manual/conversion.rst
index 6ec986f26a..7f3ff21fe0 100644
--- a/src/calibre/manual/conversion.rst
+++ b/src/calibre/manual/conversion.rst
@@ -587,11 +587,11 @@ TXT input supports a number of options to differentiate how paragraphs are detec
         Assumes that every paragraph starts with an indent (either a tab or 2+ spaces). Paragraphs end when
         the next line that starts with an indent is reached::
 
-            This is the
+              This is the
             first.
-            This is the second.
+              This is the second.
             
-            This is the
+              This is the
             third.
 
     :guilabel:`Paragraph Style: Unformatted`
@@ -603,7 +603,7 @@ TXT input supports a number of options to differentiate how paragraphs are detec
         formatting will be applied.
 
     :guilabel:`Formatting Style: Heuristic`
-        Analyses the document for common chapter headings, scene breaks, and italicized words and applies the
+        Analyzes the document for common chapter headings, scene breaks, and italicized words and applies the
         appropriate html markup during conversion.
 
     :guilabel:`Formatting Style: Markdown`

From 1e675af91aa665334261dafdc50f89724d2bf4f4 Mon Sep 17 00:00:00 2001
From: ldolse <ldolse@yahoo.com>
Date: Mon, 24 Jan 2011 12:22:42 +0800
Subject: [PATCH 2/6] Always apply title case to downloaded metadata - tired of
 crap metadata sources with 99% lowercase characters

---
 src/calibre/ebooks/metadata/fetch.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/calibre/ebooks/metadata/fetch.py b/src/calibre/ebooks/metadata/fetch.py
index 390f288d8e..8018f42b13 100644
--- a/src/calibre/ebooks/metadata/fetch.py
+++ b/src/calibre/ebooks/metadata/fetch.py
@@ -411,7 +411,7 @@ def search(title=None, author=None, publisher=None, isbn=None, isbndb_key=None,
                     r.pubdate = pubdate
 
     def fix_case(x):
-        if x and x.isupper():
+        if x:
             x = titlecase(x)
         return x
 

From a20cd3336dc5a309543dd5134a4f85bf563b9b09 Mon Sep 17 00:00:00 2001
From: ldolse <ldolse@yahoo.com>
Date: Mon, 24 Jan 2011 13:47:53 +0800
Subject: [PATCH 3/6] improved negative lookaheads in heuristics

---
 src/calibre/ebooks/conversion/utils.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py
index e8c2acb9d6..22639801ff 100644
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@@ -203,8 +203,8 @@ class HeuristicProcessor(object):
             blank_lines = ""
         opt_title_open = "("
         opt_title_close = ")?"
-        n_lookahead_open = "\s+(?!"
-        n_lookahead_close = ")"
+        n_lookahead_open = "(?!\s*"
+        n_lookahead_close = ")\s*"
 
         default_title = r"(<[ibu][^>]*>)?\s{0,3}(?!Chapter)([\w\:\'’\"-]+\s{0,3}){1,5}?(</[ibu][^>]*>)?(?=<)"
         simple_title = r"(<[ibu][^>]*>)?\s{0,3}(?!(Chapter|\s+<)).{0,65}?(</[ibu][^>]*>)?(?=<)"

From d03ae9e001165d6edc82fbdbf884b89cd0b35ed9 Mon Sep 17 00:00:00 2001
From: ldolse <ldolse@yahoo.com>
Date: Mon, 24 Jan 2011 14:20:48 +0800
Subject: [PATCH 4/6] fixed a regression in chapter markup logic which was
 causing false negatives

---
 src/calibre/ebooks/conversion/utils.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py
index 22639801ff..1eb063cdd8 100644
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@@ -215,7 +215,7 @@ class HeuristicProcessor(object):
             [r"[^'\"]?(Introduction|Synopsis|Acknowledgements|Epilogue|CHAPTER|Kapitel|Volume\b|Prologue|Book\b|Part\b|Dedication|Preface)\s*([\d\w-]+\:?\'?\s*){0,5}", True, True, True, False, "Searching for common section headings", 'common'],
             [r"[^'\"]?(CHAPTER|Kapitel)\s*([\dA-Z\-\'\"\?!#,]+\s*){0,7}\s*", True, True, True, False, "Searching for most common chapter headings", 'chapter'],  # Highest frequency headings which include titles
             [r"<b[^>]*>\s*(<span[^>]*>)?\s*(?!([*#•=]+\s*)+)(\s*(?=[\d.\w#\-*\s]+<)([\d.\w#-*]+\s*){1,5}\s*)(?!\.)(</span>)?\s*</b>", True, True, True, False, "Searching for emphasized lines", 'emphasized'], # Emphasized lines
-            [r"[^'\"]?(\d+(\.|:))\s*([\dA-Z\-\'\"#,]+\s*){0,7}\s*", True, True, True, False, "Searching for numeric chapter headings", 'numeric'],  # Numeric Chapters
+            [r"[^'\"]?(\d+(\.|:))\s*([\w\-\'\"#,]+\s*){0,7}\s*", True, True, True, False, "Searching for numeric chapter headings", 'numeric'],  # Numeric Chapters
             [r"([A-Z]\s+){3,}\s*([\d\w-]+\s*){0,3}\s*", True, True, True, False, "Searching for letter spaced headings", 'letter_spaced'],  # Spaced Lettering
             [r"[^'\"]?(\d+\.?\s+([\d\w-]+\:?\'?-?\s?){0,5})\s*", True, True, True, False, "Searching for numeric chapters with titles", 'numeric_title'], # Numeric Titles
             [r"[^'\"]?(\d+)\s*([\dA-Z\-\'\"\?!#,]+\s*){0,7}\s*", True, True, True, False, "Searching for simple numeric headings", 'plain_number'],  # Numeric Chapters, no dot or colon
@@ -275,7 +275,7 @@ class HeuristicProcessor(object):
                         self.log.debug(unicode(type_name)+" had "+unicode(hits)+" hits - "+unicode(self.chapters_no_title)+" chapters with no title, "+unicode(self.chapters_with_title)+" chapters with titles, "+unicode(float(self.chapters_with_title) / float(hits))+" percent. ")
                         if type_name == 'common':
                             analysis_result.append([chapter_type, n_lookahead_req, strict_title, ignorecase, title_req, log_message, type_name])
-                        elif self.min_chapters <= hits < max_chapters:
+                        elif self.min_chapters <= hits < max_chapters or self.min_chapters < 3 > hits:
                             analysis_result.append([chapter_type, n_lookahead_req, strict_title, ignorecase, title_req, log_message, type_name])
                             break
                 else:

From 670fc644edb235b4a1fed74ded7924a2b8c13ec4 Mon Sep 17 00:00:00 2001
From: ldolse <ldolse@yahoo.com>
Date: Mon, 24 Jan 2011 16:00:05 +0800
Subject: [PATCH 5/6] made chapter markup routine more Sigil friendly

---
 src/calibre/ebooks/conversion/utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py
index 1eb063cdd8..d9350e6adb 100644
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@@ -43,7 +43,7 @@ class HeuristicProcessor(object):
             self.html_preprocess_sections = self.html_preprocess_sections + 1
             self.log.debug("marked " + unicode(self.html_preprocess_sections) +
                     " chapters & titles. - " + unicode(chap) + ", " + unicode(title))
-            return '<h2>'+chap+'</h2>\n<h3>'+title+'</h3>\n'
+            return '<h2 title="'+chap+', '+title+'">'+chap+'</h2>\n<h3 class="sigilNotInTOC">'+title+'</h3>\n'
 
     def chapter_break(self, match):
         chap = match.group('section')

From 7485c9a5e200c20285ead3795025781f4d9ef31e Mon Sep 17 00:00:00 2001
From: ldolse <ldolse@yahoo.com>
Date: Mon, 24 Jan 2011 16:30:29 +0800
Subject: [PATCH 6/6] fixed sigil integration to strip html from chapter
 titles, fixed softbreak handling and enabled integration with extra_css

---
 src/calibre/ebooks/conversion/utils.py | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py
index d9350e6adb..f6e259b6f9 100644
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@@ -25,13 +25,15 @@ class HeuristicProcessor(object):
         self.chapters_with_title = 0
         self.blanks_deleted = False
         self.linereg = re.compile('(?<=<p).*?(?=</p>)', re.IGNORECASE|re.DOTALL)
-        self.blankreg = re.compile(r'\s*(?P<openline><p(?!\sid=\"softbreak\")[^>]*>)\s*(?P<closeline></p>)', re.IGNORECASE)
+        self.blankreg = re.compile(r'\s*(?P<openline><p(?!\sclass=\"softbreak\")[^>]*>)\s*(?P<closeline></p>)', re.IGNORECASE)
+        self.softbreak = re.compile(r'\s*(?P<openline><p(?=\sclass=\"softbreak\")[^>]*>)\s*(?P<closeline></p>)', re.IGNORECASE)
         self.multi_blank = re.compile(r'(\s*<p[^>]*>\s*</p>){2,}', re.IGNORECASE)
 
     def is_pdftohtml(self, src):
         return '<!-- created by calibre\'s pdftohtml -->' in src[:1000]
 
     def chapter_head(self, match):
+        from calibre.utils.html2text import html2text
         chap = match.group('chap')
         title = match.group('title')
         if not title:
@@ -40,10 +42,12 @@ class HeuristicProcessor(object):
                     " chapters. - " + unicode(chap))
             return '<h2>'+chap+'</h2>\n'
         else:
+            txt_chap = html2text(chap)
+            txt_title = html2text(title)
             self.html_preprocess_sections = self.html_preprocess_sections + 1
             self.log.debug("marked " + unicode(self.html_preprocess_sections) +
                     " chapters & titles. - " + unicode(chap) + ", " + unicode(title))
-            return '<h2 title="'+chap+', '+title+'">'+chap+'</h2>\n<h3 class="sigilNotInTOC">'+title+'</h3>\n'
+            return '<h2 title="'+txt_chap+', '+txt_title+'">'+chap+'</h2>\n<h3 class="sigilNotInTOC">'+title+'</h3>\n'
 
     def chapter_break(self, match):
         chap = match.group('section')
@@ -469,7 +473,7 @@ class HeuristicProcessor(object):
         if blanks_between_paragraphs and getattr(self.extra_opts, 'delete_blank_paragraphs', False):
             self.log.debug("deleting blank lines")
             self.blanks_deleted = True
-            html = self.multi_blank.sub('\n<p id="softbreak" style="margin-top:1.5em; margin-bottom:1.5em"> </p>', html)
+            html = self.multi_blank.sub('\n<p class="softbreak" style="margin-top:1.5em; margin-bottom:1.5em"> </p>', html)
             html = self.blankreg.sub('', html)
 
         # Determine line ending type
@@ -524,11 +528,11 @@ class HeuristicProcessor(object):
             # Center separator lines
             html = re.sub(u'<(?P<outer>p|div)[^>]*>\s*(<(?P<inner1>font|span|[ibu])[^>]*>)?\s*(<(?P<inner2>font|span|[ibu])[^>]*>)?\s*(<(?P<inner3>font|span|[ibu])[^>]*>)?\s*(?P<break>([*#•=✦]+\s*)+)\s*(</(?P=inner3)>)?\s*(</(?P=inner2)>)?\s*(</(?P=inner1)>)?\s*</(?P=outer)>', '<p style="text-align:center; margin-top:1.25em; margin-bottom:1.25em">' + '\g<break>' + '</p>', html)
             if not self.blanks_deleted:
-                html = self.multi_blank.sub('\n<p id="softbreak" style="margin-top:1.5em; margin-bottom:1.5em"> </p>', html)
-            html = re.sub('<p\s+id="softbreak"[^>]*>\s*</p>', '<div id="softbreak" style="margin-left: 45%; margin-right: 45%; margin-top:1.5em; margin-bottom:1.5em"><hr style="height: 3px; background:#505050" /></div>', html)
+                html = self.multi_blank.sub('\n<p class="softbreak" style="margin-top:1.5em; margin-bottom:1.5em"> </p>', html)
+            html = re.sub('<p\s+class="softbreak"[^>]*>\s*</p>', '<div id="softbreak" style="margin-left: 45%; margin-right: 45%; margin-top:1.5em; margin-bottom:1.5em"><hr style="height: 3px; background:#505050" /></div>', html)
 
         if self.deleted_nbsps:
             # put back non-breaking spaces in empty paragraphs to preserve original formatting
             html = self.blankreg.sub('\n'+r'\g<openline>'+u'\u00a0'+r'\g<closeline>', html)
-
+            html = self.softbreak.sub('\n'+r'\g<openline>'+u'\u00a0'+r'\g<closeline>', html)
         return html