From 0ed2f3fceb3cc4cc06319d199109b7647e5c9af4 Mon Sep 17 00:00:00 2001
From: ldolse <ldolse@yahoo.com>
Date: Fri, 1 Oct 2010 17:45:47 +0800
Subject: [PATCH 01/28] partial/potential fix for mobi problem

---
 src/calibre/ebooks/mobi/mobiml.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/calibre/ebooks/mobi/mobiml.py b/src/calibre/ebooks/mobi/mobiml.py
index 231ad51eee..31b1ac5834 100644
--- a/src/calibre/ebooks/mobi/mobiml.py
+++ b/src/calibre/ebooks/mobi/mobiml.py
@@ -188,7 +188,7 @@ class MobiMLizer(object):
                 para = wrapper
                 emleft = int(round(left / self.profile.fbase)) - 1
                 emleft = min((emleft, 10))
-                while emleft > 0:
+                while emleft > 1:
                     para = etree.SubElement(para, XHTML('blockquote'))
                     emleft -= 1
             else:

From 3900216da0896ff1d372714f074ea71e39885054 Mon Sep 17 00:00:00 2001
From: ldolse <ldolse@yahoo.com>
Date: Fri, 1 Oct 2010 19:17:32 +0800
Subject: [PATCH 02/28] revert mobi change

---
 src/calibre/ebooks/mobi/mobiml.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/calibre/ebooks/mobi/mobiml.py b/src/calibre/ebooks/mobi/mobiml.py
index 31b1ac5834..231ad51eee 100644
--- a/src/calibre/ebooks/mobi/mobiml.py
+++ b/src/calibre/ebooks/mobi/mobiml.py
@@ -188,7 +188,7 @@ class MobiMLizer(object):
                 para = wrapper
                 emleft = int(round(left / self.profile.fbase)) - 1
                 emleft = min((emleft, 10))
-                while emleft > 1:
+                while emleft > 0:
                     para = etree.SubElement(para, XHTML('blockquote'))
                     emleft -= 1
             else:

From 4a044b8e9d6b5f0168ef4e65d6a3e9aa47f182b4 Mon Sep 17 00:00:00 2001
From: ldolse <ldolse@yahoo.com>
Date: Mon, 4 Oct 2010 16:16:33 +0800
Subject: [PATCH 03/28] small tweak

---
 src/calibre/ebooks/conversion/utils.py | 8 +++++---
 src/calibre/ebooks/mobi/mobiml.py      | 4 ++--
 2 files changed, 7 insertions(+), 5 deletions(-)

diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py
index 2faec27b68..976ed6a8f4 100644
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@@ -146,7 +146,7 @@ class PreProcessor(object):
                #print "blanks between paragraphs is marked True"
             else:
                 blanks_between_paragraphs = False
-        #self.log("\n\n\n\n\n\n\n\n\n\n\n"+html+"\n\n\n\n\n\n\n\n\n\n\n\n\n")
+        self.log("\n\n\n\n\n\n\n\n\n\n\n"+html+"\n\n\n\n\n\n\n\n\n\n\n\n\n")
         # detect chapters/sections to match xpath or splitting logic
         #
         # Build the Regular Expressions in pieces
@@ -166,13 +166,13 @@ class PreProcessor(object):
         title_line_close = "(</(?P=inner6)>)?\s*(</(?P=inner5)>)?\s*(</(?P=inner4)\s[^>]*>)?\s*</(?P=outer2)>"
         opt_title_close = ")?"
 
-        default_title = r"(\s*[\w\'\"-]+){1,5}(?!<)"
+        default_title = r"(\s*[\w\'\"-]+){1,5}?(?=<)"
         typical_chapters = r".?(Introduction|Synopsis|Acknowledgements|Chapter|Kapitel|Epilogue|Volume\s|Prologue|Book\s|Part\s|Dedication)\s*([\d\w-]+\:?\s*){0,4}"
         numeric_chapters = r".?(\d+\.?|(CHAPTER\s*([\dA-Z\-\'\"\?\.!#,]+\s*){1,10}))\s*"
         uppercase_chapters = r"\s*.?([A-Z#]+(\s|-){0,3}){1,5}\s*"
 
         chapter_marker = lookahead+chapter_line_open+chapter_header_open+typical_chapters+chapter_header_close+chapter_line_close+blank_lines+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close
-        #print chapter_marker
+        print chapter_marker
         heading = re.compile('<h[1-3][^>]*>', re.IGNORECASE)
         self.html_preprocess_sections = len(heading.findall(html))
         self.log("found " + unicode(self.html_preprocess_sections) + " pre-existing headings")
@@ -184,12 +184,14 @@ class PreProcessor(object):
         if self.html_preprocess_sections < 10:
             self.log("not enough chapters, only " + unicode(self.html_preprocess_sections) + ", trying numeric chapters")
             chapter_marker = lookahead+chapter_line_open+chapter_header_open+numeric_chapters+chapter_header_close+chapter_line_close+blank_lines+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close
+            print chapter_marker
             chapdetect2 = re.compile(r'%s' % chapter_marker, re.IGNORECASE)
             html = chapdetect2.sub(self.chapter_head, html)
 
         if self.html_preprocess_sections < 10:
             self.log("not enough chapters, only " + unicode(self.html_preprocess_sections) + ", trying with uppercase words")
             chapter_marker = lookahead+chapter_line_open+chapter_header_open+uppercase_chapters+chapter_header_close+chapter_line_close+blank_lines+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close
+            print chapter_marker
             chapdetect2 = re.compile(r'%s' % chapter_marker,  re.UNICODE)
             html = chapdetect2.sub(self.chapter_head, html)
         ###### Unwrap lines ######
diff --git a/src/calibre/ebooks/mobi/mobiml.py b/src/calibre/ebooks/mobi/mobiml.py
index 231ad51eee..d4801e637e 100644
--- a/src/calibre/ebooks/mobi/mobiml.py
+++ b/src/calibre/ebooks/mobi/mobiml.py
@@ -184,12 +184,12 @@ class MobiMLizer(object):
             elif tag in NESTABLE_TAGS and istate.rendered:
                 para = wrapper = bstate.nested[-1]
             elif left > 0 and indent >= 0:
-                para = wrapper = etree.SubElement(parent, XHTML('blockquote'))
+                para = wrapper = etree.SubElement(parent, XHTML('div'))
                 para = wrapper
                 emleft = int(round(left / self.profile.fbase)) - 1
                 emleft = min((emleft, 10))
                 while emleft > 0:
-                    para = etree.SubElement(para, XHTML('blockquote'))
+                    para = etree.SubElement(para, XHTML('div'))
                     emleft -= 1
             else:
                 para = wrapper = etree.SubElement(parent, XHTML('p'))

From b45dc837830b0e4f61b9cc19dfcc5f214589eb83 Mon Sep 17 00:00:00 2001
From: ldolse <ldolse@yahoo.com>
Date: Mon, 11 Oct 2010 00:14:00 +1000
Subject: [PATCH 04/28] preprocessing tweaks, fixed division by zero error in
 line_histogram

---
 src/calibre/ebooks/conversion/preprocess.py | 2 ++
 src/calibre/ebooks/conversion/utils.py      | 6 +++++-
 src/calibre/ebooks/pdb/input.py             | 6 ++++++
 3 files changed, 13 insertions(+), 1 deletion(-)

diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py
index c5ebae4bba..de01188829 100644
--- a/src/calibre/ebooks/conversion/preprocess.py
+++ b/src/calibre/ebooks/conversion/preprocess.py
@@ -144,6 +144,8 @@ class DocAnalysis(object):
 
         # Normalize the histogram into percents
         totalLines = len(self.lines)
+        if totalLines == 0:
+            return False
         h = [ float(count)/totalLines for count in hRaw ]
         #print "\nhRaw histogram lengths are: "+str(hRaw)
         #print "              percents are: "+str(h)+"\n"
diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py
index 976ed6a8f4..a01c29f2fb 100644
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@@ -146,7 +146,7 @@ class PreProcessor(object):
                #print "blanks between paragraphs is marked True"
             else:
                 blanks_between_paragraphs = False
-        self.log("\n\n\n\n\n\n\n\n\n\n\n"+html+"\n\n\n\n\n\n\n\n\n\n\n\n\n")
+        #self.log("\n\n\n\n\n\n\n\n\n\n\n"+html+"\n\n\n\n\n\n\n\n\n\n\n\n\n")
         # detect chapters/sections to match xpath or splitting logic
         #
         # Build the Regular Expressions in pieces
@@ -230,6 +230,7 @@ class PreProcessor(object):
             html = dehyphenator(html,'html', length)
             self.log("Done dehyphenating")
             # Unwrap lines using punctation and line length
+            unwrap_quotes = re.compile(u"(?<=.{%i}\"')\s*</(span|p|div)>\s*(</(p|span|div)>)?\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)</(span|p|div)>\s*){0,3}\s*<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*(?=[a-z])" % length, re.UNICODE)
             unwrap = re.compile(u"(?<=.{%i}([a-z,:)\IA\u00DF]|(?<!\&\w{4});))\s*</(span|p|div)>\s*(</(p|span|div)>)?\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)</(span|p|div)>\s*){0,3}\s*<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*" % length, re.UNICODE)
             html = unwrap.sub(' ', html)
             #check any remaining hyphens, but only unwrap if there is a match
@@ -259,5 +260,8 @@ class PreProcessor(object):
 
         # put back non-breaking spaces in empty paragraphs to preserve original formatting
         html = blankreg.sub('\n'+r'\g<openline>'+u'\u00a0'+r'\g<closeline>', html)
+        
+        # Center separator lines
+        html = re.sub(u'<p>\s*(?P<break>([*#•]+\s*)+)\s*</p>', '<p style="text-align:center">' + '\g<break>' + '</p>', html)
 
         return html
diff --git a/src/calibre/ebooks/pdb/input.py b/src/calibre/ebooks/pdb/input.py
index dfe5b653dd..6850c48b16 100644
--- a/src/calibre/ebooks/pdb/input.py
+++ b/src/calibre/ebooks/pdb/input.py
@@ -9,6 +9,7 @@ import os
 from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation
 from calibre.ebooks.pdb.header import PdbHeaderReader
 from calibre.ebooks.pdb import PDBError, IDENTITY_TO_NAME, get_reader
+from calibre.ebooks.conversion.utils import PreProcessor
 
 class PDBInput(InputFormatPlugin):
 
@@ -44,3 +45,8 @@ class PDBInput(InputFormatPlugin):
         opf = reader.extract_content(os.getcwd())
 
         return opf
+
+    def preprocess_html(self, options, html):
+        self.options = options
+        preprocessor = PreProcessor(self.options, log=getattr(self, 'log', None))
+        return preprocessor(html)
\ No newline at end of file

From 87b615f81f694f2dffa23a07afe6e87d6e90497f Mon Sep 17 00:00:00 2001
From: ldolse <ldolse@yahoo.com>
Date: Mon, 18 Oct 2010 04:55:53 +0800
Subject: [PATCH 05/28] added new chapter heading type

---
 src/calibre/ebooks/conversion/utils.py | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py
index a01c29f2fb..6002509013 100644
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@@ -154,7 +154,7 @@ class PreProcessor(object):
         chapter_line_open = "<(?P<outer>p|div)[^>]*>\s*(<(?P<inner1>font|span|[ibu])[^>]*>)?\s*(<(?P<inner2>font|span|[ibu])[^>]*>)?\s*(<(?P<inner3>font|span|[ibu])[^>]*>)?\s*"
         chapter_header_open = r"(?P<chap>"
         chapter_header_close = ")\s*"
-        chapter_line_close = "(</(?P=inner3)>)?\s*(</(?P=inner2)>)?\s*(</(?P=inner1)\s[^>]*>)?\s*</(?P=outer)>\s*"
+        chapter_line_close = "(</(?P=inner3)>)?\s*(</(?P=inner2)>)?\s*(</(?P=inner1)>)?\s*</(?P=outer)>\s*"
         if blanks_between_paragraphs:
             blank_lines = "(\s*<p[^>]*>\s*</p>){0,2}\s*"
         else:
@@ -170,6 +170,7 @@ class PreProcessor(object):
         typical_chapters = r".?(Introduction|Synopsis|Acknowledgements|Chapter|Kapitel|Epilogue|Volume\s|Prologue|Book\s|Part\s|Dedication)\s*([\d\w-]+\:?\s*){0,4}"
         numeric_chapters = r".?(\d+\.?|(CHAPTER\s*([\dA-Z\-\'\"\?\.!#,]+\s*){1,10}))\s*"
         uppercase_chapters = r"\s*.?([A-Z#]+(\s|-){0,3}){1,5}\s*"
+        numeric_titles = r".?(\d+\.?\s+([\d\w-]+\:?\'?-?\s?){0,5})\s*"
 
         chapter_marker = lookahead+chapter_line_open+chapter_header_open+typical_chapters+chapter_header_close+chapter_line_close+blank_lines+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close
         print chapter_marker
@@ -194,6 +195,14 @@ class PreProcessor(object):
             print chapter_marker
             chapdetect2 = re.compile(r'%s' % chapter_marker,  re.UNICODE)
             html = chapdetect2.sub(self.chapter_head, html)
+
+        if self.html_preprocess_sections < 10:
+            self.log("not enough chapters, only " + unicode(self.html_preprocess_sections) + ", trying numeric chapters with titles")
+            chapter_marker = lookahead+chapter_line_open+chapter_header_open+numeric_titles+chapter_header_close+chapter_line_close+blank_lines+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close
+            print chapter_marker
+            chapdetect2 = re.compile(r'%s' % chapter_marker, re.IGNORECASE)
+            html = chapdetect2.sub(self.chapter_head, html)
+
         ###### Unwrap lines ######
         #
         # Some OCR sourced files have line breaks in the html using a combination of span & p tags

From ccb683ef09ded0d708f8eeda5269c2be6a1b3ba7 Mon Sep 17 00:00:00 2001
From: ldolse <ldolse@yahoo.com>
Date: Mon, 18 Oct 2010 09:02:07 +0800
Subject: [PATCH 06/28] added unicode hyphens to dehyphenation function

---
 src/calibre/ebooks/conversion/preprocess.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py
index de01188829..4a77f58df4 100644
--- a/src/calibre/ebooks/conversion/preprocess.py
+++ b/src/calibre/ebooks/conversion/preprocess.py
@@ -219,13 +219,13 @@ class Dehyphenator(object):
         self.html = html
         self.format = format
         if format == 'html':
-            intextmatch = re.compile(u'(?<=.{%i})(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)-\s*(?=<)(?P<wraptags></span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*(?P<secondpart>[\w\d]+)' % length)
+            intextmatch = re.compile(u'(?<=.{%i})(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)(-|‐)\s*(?=<)(?P<wraptags></span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*(?P<secondpart>[\w\d]+)' % length)
         elif format == 'pdf':
-            intextmatch = re.compile(u'(?<=.{%i})(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)-\s*(?P<wraptags><p>|</[iub]>\s*<p>\s*<[iub]>)\s*(?P<secondpart>[\w\d]+)'% length)
+            intextmatch = re.compile(u'(?<=.{%i})(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)(-|‐)\s*(?P<wraptags><p>|</[iub]>\s*<p>\s*<[iub]>)\s*(?P<secondpart>[\w\d]+)'% length)
         elif format == 'individual_words':
-            intextmatch = re.compile(u'>[^<]*\b(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)"\s>]+)-(?P<secondpart)\w+)\b[^<]*<') # for later, not called anywhere yet
+            intextmatch = re.compile(u'>[^<]*\b(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)"\s>]+)(-|‐)(?P<secondpart)\w+)\b[^<]*<') # for later, not called anywhere yet
         elif format == 'html_cleanup':
-            intextmatch = re.compile(u'(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)-\s*(?=<)(?P<wraptags></span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*(?P<secondpart>[\w\d]+)')
+            intextmatch = re.compile(u'(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)(-|‐)\s*(?=<)(?P<wraptags></span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*(?P<secondpart>[\w\d]+)')
 
         html = intextmatch.sub(self.dehyphenate, html)
         return html

From f31a055135908f2d79efa4fd3c176a5a5e9e9a52 Mon Sep 17 00:00:00 2001
From: ldolse <ldolse@yahoo.com>
Date: Tue, 16 Nov 2010 14:39:53 +0800
Subject: [PATCH 07/28] work in progress on chapter detection improvements

---
 src/calibre/ebooks/conversion/utils.py | 26 ++++++++++++++++++++++----
 1 file changed, 22 insertions(+), 4 deletions(-)

diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py
index 6002509013..724428fec0 100644
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@@ -165,6 +165,8 @@ class PreProcessor(object):
         title_header_close = ")\s*"
         title_line_close = "(</(?P=inner6)>)?\s*(</(?P=inner5)>)?\s*(</(?P=inner4)\s[^>]*>)?\s*</(?P=outer2)>"
         opt_title_close = ")?"
+        n_lookahead_open = "(?!="
+        n_lookahead_close = ")"
 
         default_title = r"(\s*[\w\'\"-]+){1,5}?(?=<)"
         typical_chapters = r".?(Introduction|Synopsis|Acknowledgements|Chapter|Kapitel|Epilogue|Volume\s|Prologue|Book\s|Part\s|Dedication)\s*([\d\w-]+\:?\s*){0,4}"
@@ -172,7 +174,11 @@ class PreProcessor(object):
         uppercase_chapters = r"\s*.?([A-Z#]+(\s|-){0,3}){1,5}\s*"
         numeric_titles = r".?(\d+\.?\s+([\d\w-]+\:?\'?-?\s?){0,5})\s*"
 
-        chapter_marker = lookahead+chapter_line_open+chapter_header_open+typical_chapters+chapter_header_close+chapter_line_close+blank_lines+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close
+        full_chapter_line = chapter_line_open+chapter_header_open+typical_chapters+chapter_header_close+chapter_line_close
+        n_lookahead = re.sub("(out|inn|cha)", "l", full_chapter_line)
+        print "n_lookahead is " + n_lookahead
+        print "Chapter line is " + full_chapter_line + "\n\n"
+        chapter_marker = lookahead+full_chapter_line+blank_lines+n_lookahead_open+n_lookahead+n_lookahead_close+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close
         print chapter_marker
         heading = re.compile('<h[1-3][^>]*>', re.IGNORECASE)
         self.html_preprocess_sections = len(heading.findall(html))
@@ -184,21 +190,33 @@ class PreProcessor(object):
             html = chapdetect.sub(self.chapter_head, html)
         if self.html_preprocess_sections < 10:
             self.log("not enough chapters, only " + unicode(self.html_preprocess_sections) + ", trying numeric chapters")
-            chapter_marker = lookahead+chapter_line_open+chapter_header_open+numeric_chapters+chapter_header_close+chapter_line_close+blank_lines+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close
+            full_chapter_line = chapter_line_open+chapter_header_open+numeric_chapters+chapter_header_close+chapter_line_close
+            n_lookahead = re.sub("(out|inn|cha)", "l", full_chapter_line)
+            print "n_lookahead is " + n_lookahead
+            print "Chapter line is " + full_chapter_line + "\n\n"
+            chapter_marker = lookahead+full_chapter_line+blank_lines+n_lookahead_open+n_lookahead+n_lookahead_close+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close
             print chapter_marker
             chapdetect2 = re.compile(r'%s' % chapter_marker, re.IGNORECASE)
             html = chapdetect2.sub(self.chapter_head, html)
 
         if self.html_preprocess_sections < 10:
             self.log("not enough chapters, only " + unicode(self.html_preprocess_sections) + ", trying with uppercase words")
-            chapter_marker = lookahead+chapter_line_open+chapter_header_open+uppercase_chapters+chapter_header_close+chapter_line_close+blank_lines+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close
+            full_chapter_line = chapter_line_open+chapter_header_open+uppercase_chapters+chapter_header_close+chapter_line_close
+            n_lookahead = re.sub("(out|inn|cha)", "l", full_chapter_line)
+            print "n_lookahead is " + n_lookahead
+            print "Chapter line is " + full_chapter_line + "\n\n"
+            chapter_marker = lookahead+full_chapter_line+blank_lines+n_lookahead_open+n_lookahead+n_lookahead_close+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close
             print chapter_marker
             chapdetect2 = re.compile(r'%s' % chapter_marker,  re.UNICODE)
             html = chapdetect2.sub(self.chapter_head, html)
 
         if self.html_preprocess_sections < 10:
             self.log("not enough chapters, only " + unicode(self.html_preprocess_sections) + ", trying numeric chapters with titles")
-            chapter_marker = lookahead+chapter_line_open+chapter_header_open+numeric_titles+chapter_header_close+chapter_line_close+blank_lines+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close
+            full_chapter_line = chapter_line_open+chapter_header_open+numeric_titles+chapter_header_close+chapter_line_close
+            n_lookahead = re.sub("(out|inn|cha)", "l", full_chapter_line)
+            print "n_lookahead is " + n_lookahead
+            print "Chapter line is " + full_chapter_line + "\n\n"
+            chapter_marker = lookahead+full_chapter_line+blank_lines+n_lookahead_open+n_lookahead+n_lookahead_close+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close
             print chapter_marker
             chapdetect2 = re.compile(r'%s' % chapter_marker, re.IGNORECASE)
             html = chapdetect2.sub(self.chapter_head, html)

From f7e99d2e86fbcae5fbdae3428905836256fae687 Mon Sep 17 00:00:00 2001
From: ldolse <ldolse@yahoo.com>
Date: Tue, 16 Nov 2010 15:54:49 +0800
Subject: [PATCH 08/28] added accented characters to the line unwrap patterns,
 since they're not covered under a-z character classes using the unicode
 option.

---
 src/calibre/ebooks/conversion/preprocess.py | 2 +-
 src/calibre/ebooks/conversion/utils.py      | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py
index e1d73dcfd9..ef092f7954 100644
--- a/src/calibre/ebooks/conversion/preprocess.py
+++ b/src/calibre/ebooks/conversion/preprocess.py
@@ -475,7 +475,7 @@ class HTMLPreProcessor(object):
                 end_rules.append((re.compile(u'(?<=.{%i}[–—])\s*<p>\s*(?=[[a-z\d])' % length), lambda match: ''))
                 end_rules.append(
                     # Un wrap using punctuation
-                    (re.compile(u'(?<=.{%i}([a-z,:)\IA\u00DF]|(?<!\&\w{4});))\s*(?P<ital></(i|b|u)>)?\s*(<p.*?>\s*)+\s*(?=(<(i|b|u)>)?\s*[\w\d$(])' % length, re.UNICODE), wrap_lines),
+                    (re.compile(u'(?<=.{%i}([a-zäëïöüàèìòùáćéíóńśúâêîôûçąężı,:)\IA\u00DF]|(?<!\&\w{4});))\s*(?P<ital></(i|b|u)>)?\s*(<p.*?>\s*)+\s*(?=(<(i|b|u)>)?\s*[\w\d$(])' % length, re.UNICODE), wrap_lines),
                 )
 
         for rule in self.PREPROCESS + start_rules:
diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py
index 724428fec0..143ece4b79 100644
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@@ -258,7 +258,7 @@ class PreProcessor(object):
             self.log("Done dehyphenating")
             # Unwrap lines using punctation and line length
             unwrap_quotes = re.compile(u"(?<=.{%i}\"')\s*</(span|p|div)>\s*(</(p|span|div)>)?\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)</(span|p|div)>\s*){0,3}\s*<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*(?=[a-z])" % length, re.UNICODE)
-            unwrap = re.compile(u"(?<=.{%i}([a-z,:)\IA\u00DF]|(?<!\&\w{4});))\s*</(span|p|div)>\s*(</(p|span|div)>)?\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)</(span|p|div)>\s*){0,3}\s*<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*" % length, re.UNICODE)
+            unwrap = re.compile(u"(?<=.{%i}([a-zäëïöüàèìòùáćéíóńśúâêîôûçąężı,:)\IA\u00DF]|(?<!\&\w{4});))\s*</(span|p|div)>\s*(</(p|span|div)>)?\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)</(span|p|div)>\s*){0,3}\s*<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*" % length, re.UNICODE)
             html = unwrap.sub(' ', html)
             #check any remaining hyphens, but only unwrap if there is a match
             dehyphenator = Dehyphenator()

From 4526ced6d1257a34b9f3c093f02f373291657ab8 Mon Sep 17 00:00:00 2001
From: ldolse <ldolse@yahoo.com>
Date: Tue, 16 Nov 2010 18:34:21 +0800
Subject: [PATCH 09/28] made conversion of nbsp to indent a bit smarter

---
 src/calibre/ebooks/conversion/utils.py | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py
index 143ece4b79..51139d3a18 100644
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@@ -44,10 +44,14 @@ class PreProcessor(object):
         span = match.group('span')
         self.found_indents = self.found_indents + 1
         if pstyle:
-            if not span:
-                return '<p '+pstyle+' style="text-indent:3%">'
+            if pstyle.lower().find('style'):
+                pstyle = re.sub(r'"$', '; text-indent:3%"', pstyle)
             else:
-                return '<p '+pstyle+' style="text-indent:3%">'+span
+                pstyle = pstyle+' style="text-indent:3%"'
+            if not span:
+                return '<p '+pstyle+'>'
+            else:
+                return '<p '+pstyle+'>'+span
         else:
             if not span:
                 return '<p style="text-indent:3%">'

From 26ba75f76cc1db12439fb6f3a7c6bc9fbd049507 Mon Sep 17 00:00:00 2001
From: ldolse <ldolse@yahoo.com>
Date: Wed, 17 Nov 2010 10:25:51 +0800
Subject: [PATCH 10/28] added a search for emphasized lines during chapter
 markup

---
 src/calibre/ebooks/conversion/utils.py | 20 ++++++++++++++++----
 1 file changed, 16 insertions(+), 4 deletions(-)

diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py
index 51139d3a18..bec15924d6 100644
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@@ -177,9 +177,10 @@ class PreProcessor(object):
         numeric_chapters = r".?(\d+\.?|(CHAPTER\s*([\dA-Z\-\'\"\?\.!#,]+\s*){1,10}))\s*"
         uppercase_chapters = r"\s*.?([A-Z#]+(\s|-){0,3}){1,5}\s*"
         numeric_titles = r".?(\d+\.?\s+([\d\w-]+\:?\'?-?\s?){0,5})\s*"
+        emphasized_lines = r"<b[^>]*>\s*(<span[^>]*>)?\s*(\s*(?=[\w#\-*\s]+<)([\w#-*]+\s*){1,5}\s*)(</span>)?\s*</b>"
 
         full_chapter_line = chapter_line_open+chapter_header_open+typical_chapters+chapter_header_close+chapter_line_close
-        n_lookahead = re.sub("(out|inn|cha)", "l", full_chapter_line)
+        n_lookahead = re.sub("(ou|in|cha)", "lookahead_", full_chapter_line)
         print "n_lookahead is " + n_lookahead
         print "Chapter line is " + full_chapter_line + "\n\n"
         chapter_marker = lookahead+full_chapter_line+blank_lines+n_lookahead_open+n_lookahead+n_lookahead_close+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close
@@ -195,7 +196,7 @@ class PreProcessor(object):
         if self.html_preprocess_sections < 10:
             self.log("not enough chapters, only " + unicode(self.html_preprocess_sections) + ", trying numeric chapters")
             full_chapter_line = chapter_line_open+chapter_header_open+numeric_chapters+chapter_header_close+chapter_line_close
-            n_lookahead = re.sub("(out|inn|cha)", "l", full_chapter_line)
+            n_lookahead = re.sub("(ou|in|cha)", "lookahead_", full_chapter_line)
             print "n_lookahead is " + n_lookahead
             print "Chapter line is " + full_chapter_line + "\n\n"
             chapter_marker = lookahead+full_chapter_line+blank_lines+n_lookahead_open+n_lookahead+n_lookahead_close+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close
@@ -203,10 +204,21 @@ class PreProcessor(object):
             chapdetect2 = re.compile(r'%s' % chapter_marker, re.IGNORECASE)
             html = chapdetect2.sub(self.chapter_head, html)
 
+        if self.html_preprocess_sections < 10:
+            self.log("not enough chapters, only " + unicode(self.html_preprocess_sections) + ", trying emphazised lines")
+            full_chapter_line = chapter_line_open+chapter_header_open+emphasized_lines+chapter_header_close+chapter_line_close
+            n_lookahead = re.sub("(ou|in|cha)", "lookahead_", full_chapter_line)
+            print "n_lookahead is " + n_lookahead
+            print "Chapter line is " + full_chapter_line + "\n\n"
+            chapter_marker = lookahead+full_chapter_line+blank_lines+n_lookahead_open+n_lookahead+n_lookahead_close+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close
+            print chapter_marker
+            chapdetect2 = re.compile(r'%s' % chapter_marker, re.IGNORECASE)
+            html = chapdetect2.sub(self.chapter_head, html)            
+
         if self.html_preprocess_sections < 10:
             self.log("not enough chapters, only " + unicode(self.html_preprocess_sections) + ", trying with uppercase words")
             full_chapter_line = chapter_line_open+chapter_header_open+uppercase_chapters+chapter_header_close+chapter_line_close
-            n_lookahead = re.sub("(out|inn|cha)", "l", full_chapter_line)
+            n_lookahead = re.sub("(ou|in|cha)", "lookahead_", full_chapter_line)
             print "n_lookahead is " + n_lookahead
             print "Chapter line is " + full_chapter_line + "\n\n"
             chapter_marker = lookahead+full_chapter_line+blank_lines+n_lookahead_open+n_lookahead+n_lookahead_close+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close
@@ -217,7 +229,7 @@ class PreProcessor(object):
         if self.html_preprocess_sections < 10:
             self.log("not enough chapters, only " + unicode(self.html_preprocess_sections) + ", trying numeric chapters with titles")
             full_chapter_line = chapter_line_open+chapter_header_open+numeric_titles+chapter_header_close+chapter_line_close
-            n_lookahead = re.sub("(out|inn|cha)", "l", full_chapter_line)
+            n_lookahead = re.sub("(ou|in|cha)", "lookahead_", full_chapter_line)
             print "n_lookahead is " + n_lookahead
             print "Chapter line is " + full_chapter_line + "\n\n"
             chapter_marker = lookahead+full_chapter_line+blank_lines+n_lookahead_open+n_lookahead+n_lookahead_close+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close

From a55b4dbbac65ef083d3af4943243f9b6e092d227 Mon Sep 17 00:00:00 2001
From: ldolse <ldolse@yahoo.com>
Date: Wed, 17 Nov 2010 13:49:12 +0800
Subject: [PATCH 11/28] remove extra line feeds from html comments when
 sanitizing

---
 src/calibre/library/comments.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/calibre/library/comments.py b/src/calibre/library/comments.py
index 83eec89abe..00a6ef55ae 100644
--- a/src/calibre/library/comments.py
+++ b/src/calibre/library/comments.py
@@ -131,7 +131,8 @@ def comments_to_html(comments):
 def sanitize_comments_html(html):
     text = html2text(html)
     md = markdown.Markdown(safe_mode=True)
-    return md.convert(text)
+    cleansed = re.sub('\n+', '', md.convert(text))
+    return cleansed
 
 def test():
     for pat, val in [

From b03b8023943417dc544f70bd470ba5f61c59d848 Mon Sep 17 00:00:00 2001
From: ldolse <ldolse@yahoo.com>
Date: Wed, 17 Nov 2010 14:12:14 +0800
Subject: [PATCH 12/28] adjusted css to compact the comments display

---
 src/calibre/gui2/book_details.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/calibre/gui2/book_details.py b/src/calibre/gui2/book_details.py
index e193fe10b2..8cc2965171 100644
--- a/src/calibre/gui2/book_details.py
+++ b/src/calibre/gui2/book_details.py
@@ -221,6 +221,8 @@ class BookInfo(QWebView):
             <style type="text/css">
                 body, td {background-color: %s; font-size: %dpx; color: %s }
                 a { text-decoration: none; color: blue }
+                p { margin-top: .2em }
+                h3 { margin-bottom: .2em }
             </style>
             </head>
             <body>

From fb124c50a767956abcadec577fe10ad1e0e4ae80 Mon Sep 17 00:00:00 2001
From: ldolse <ldolse@yahoo.com>
Date: Wed, 17 Nov 2010 17:55:50 +0800
Subject: [PATCH 13/28] added negative lookahead to reduce false positive
 matches during chapter marking

---
 src/calibre/ebooks/conversion/utils.py | 54 +++++++++++++++++---------
 1 file changed, 36 insertions(+), 18 deletions(-)

diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py
index bec15924d6..ac38a0097d 100644
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@@ -83,6 +83,24 @@ class PreProcessor(object):
         if min_lns > tot_htm_ends:
             return True
 
+    def dump(self, raw, where):
+        import os
+        dp = getattr(self.extra_opts, 'debug_pipeline', None)
+        if dp and os.path.exists(dp):
+            odir = os.path.join(dp, 'preprocess')
+            if not os.path.exists(odir):
+                    os.makedirs(odir)
+            if os.path.exists(odir):
+                odir = os.path.join(odir, where)
+                if not os.path.exists(odir):
+                    os.makedirs(odir)
+                name, i = None, 0
+                while not name or os.path.exists(os.path.join(odir, name)):
+                    i += 1
+                    name = '%04d.html'%i
+                with open(os.path.join(odir, name), 'wb') as f:
+                    f.write(raw.encode('utf-8'))
+
     def __call__(self, html):
         self.log("*********  Preprocessing HTML  *********")
 
@@ -150,7 +168,7 @@ class PreProcessor(object):
                #print "blanks between paragraphs is marked True"
             else:
                 blanks_between_paragraphs = False
-        #self.log("\n\n\n\n\n\n\n\n\n\n\n"+html+"\n\n\n\n\n\n\n\n\n\n\n\n\n")
+        #self.dump(html, 'before_chapter_markup')
         # detect chapters/sections to match xpath or splitting logic
         #
         # Build the Regular Expressions in pieces
@@ -158,7 +176,7 @@ class PreProcessor(object):
         chapter_line_open = "<(?P<outer>p|div)[^>]*>\s*(<(?P<inner1>font|span|[ibu])[^>]*>)?\s*(<(?P<inner2>font|span|[ibu])[^>]*>)?\s*(<(?P<inner3>font|span|[ibu])[^>]*>)?\s*"
         chapter_header_open = r"(?P<chap>"
         chapter_header_close = ")\s*"
-        chapter_line_close = "(</(?P=inner3)>)?\s*(</(?P=inner2)>)?\s*(</(?P=inner1)>)?\s*</(?P=outer)>\s*"
+        chapter_line_close = "(</(?P=inner3)>)?\s*(</(?P=inner2)>)?\s*(</(?P=inner1)>)?\s*</(?P=outer)>"
         if blanks_between_paragraphs:
             blank_lines = "(\s*<p[^>]*>\s*</p>){0,2}\s*"
         else:
@@ -169,7 +187,7 @@ class PreProcessor(object):
         title_header_close = ")\s*"
         title_line_close = "(</(?P=inner6)>)?\s*(</(?P=inner5)>)?\s*(</(?P=inner4)\s[^>]*>)?\s*</(?P=outer2)>"
         opt_title_close = ")?"
-        n_lookahead_open = "(?!="
+        n_lookahead_open = "\s+(?!"
         n_lookahead_close = ")"
 
         default_title = r"(\s*[\w\'\"-]+){1,5}?(?=<)"
@@ -181,10 +199,10 @@ class PreProcessor(object):
 
         full_chapter_line = chapter_line_open+chapter_header_open+typical_chapters+chapter_header_close+chapter_line_close
         n_lookahead = re.sub("(ou|in|cha)", "lookahead_", full_chapter_line)
-        print "n_lookahead is " + n_lookahead
-        print "Chapter line is " + full_chapter_line + "\n\n"
+        #print "n_lookahead is:\n" + n_lookahead + "\n\n"
+        #print "'normal' Chapter line - no title - is:\n" + full_chapter_line + "\n\n"
         chapter_marker = lookahead+full_chapter_line+blank_lines+n_lookahead_open+n_lookahead+n_lookahead_close+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close
-        print chapter_marker
+        #print "full chapter regex with lookahead is:\n" + chapter_marker + "\n\n"
         heading = re.compile('<h[1-3][^>]*>', re.IGNORECASE)
         self.html_preprocess_sections = len(heading.findall(html))
         self.log("found " + unicode(self.html_preprocess_sections) + " pre-existing headings")
@@ -197,10 +215,10 @@ class PreProcessor(object):
             self.log("not enough chapters, only " + unicode(self.html_preprocess_sections) + ", trying numeric chapters")
             full_chapter_line = chapter_line_open+chapter_header_open+numeric_chapters+chapter_header_close+chapter_line_close
             n_lookahead = re.sub("(ou|in|cha)", "lookahead_", full_chapter_line)
-            print "n_lookahead is " + n_lookahead
-            print "Chapter line is " + full_chapter_line + "\n\n"
+            #print "n_lookahead is " + n_lookahead
+            #print "Chapter line is " + full_chapter_line + "\n\n"
             chapter_marker = lookahead+full_chapter_line+blank_lines+n_lookahead_open+n_lookahead+n_lookahead_close+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close
-            print chapter_marker
+            #print chapter_marker
             chapdetect2 = re.compile(r'%s' % chapter_marker, re.IGNORECASE)
             html = chapdetect2.sub(self.chapter_head, html)
 
@@ -208,10 +226,10 @@ class PreProcessor(object):
             self.log("not enough chapters, only " + unicode(self.html_preprocess_sections) + ", trying emphazised lines")
             full_chapter_line = chapter_line_open+chapter_header_open+emphasized_lines+chapter_header_close+chapter_line_close
             n_lookahead = re.sub("(ou|in|cha)", "lookahead_", full_chapter_line)
-            print "n_lookahead is " + n_lookahead
-            print "Chapter line is " + full_chapter_line + "\n\n"
+            #print "n_lookahead is " + n_lookahead
+            #print "Chapter line is " + full_chapter_line + "\n\n"
             chapter_marker = lookahead+full_chapter_line+blank_lines+n_lookahead_open+n_lookahead+n_lookahead_close+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close
-            print chapter_marker
+            #print chapter_marker
             chapdetect2 = re.compile(r'%s' % chapter_marker, re.IGNORECASE)
             html = chapdetect2.sub(self.chapter_head, html)            
 
@@ -219,10 +237,10 @@ class PreProcessor(object):
             self.log("not enough chapters, only " + unicode(self.html_preprocess_sections) + ", trying with uppercase words")
             full_chapter_line = chapter_line_open+chapter_header_open+uppercase_chapters+chapter_header_close+chapter_line_close
             n_lookahead = re.sub("(ou|in|cha)", "lookahead_", full_chapter_line)
-            print "n_lookahead is " + n_lookahead
-            print "Chapter line is " + full_chapter_line + "\n\n"
+            #print "n_lookahead is " + n_lookahead
+            #print "Chapter line is " + full_chapter_line + "\n\n"
             chapter_marker = lookahead+full_chapter_line+blank_lines+n_lookahead_open+n_lookahead+n_lookahead_close+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close
-            print chapter_marker
+            #print chapter_marker
             chapdetect2 = re.compile(r'%s' % chapter_marker,  re.UNICODE)
             html = chapdetect2.sub(self.chapter_head, html)
 
@@ -230,10 +248,10 @@ class PreProcessor(object):
             self.log("not enough chapters, only " + unicode(self.html_preprocess_sections) + ", trying numeric chapters with titles")
             full_chapter_line = chapter_line_open+chapter_header_open+numeric_titles+chapter_header_close+chapter_line_close
             n_lookahead = re.sub("(ou|in|cha)", "lookahead_", full_chapter_line)
-            print "n_lookahead is " + n_lookahead
-            print "Chapter line is " + full_chapter_line + "\n\n"
+            #print "n_lookahead is " + n_lookahead
+            #print "Chapter line is " + full_chapter_line + "\n\n"
             chapter_marker = lookahead+full_chapter_line+blank_lines+n_lookahead_open+n_lookahead+n_lookahead_close+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close
-            print chapter_marker
+            #print chapter_marker
             chapdetect2 = re.compile(r'%s' % chapter_marker, re.IGNORECASE)
             html = chapdetect2.sub(self.chapter_head, html)
 

From b238903ba81af83b3a7246cdf5d4d839a48f0d9b Mon Sep 17 00:00:00 2001
From: ldolse <ldolse@yahoo.com>
Date: Wed, 17 Nov 2010 19:27:51 +0800
Subject: [PATCH 14/28] minor tweaks to chapter marking

---
 src/calibre/ebooks/conversion/utils.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py
index ac38a0097d..fffb0d75d4 100644
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@@ -195,7 +195,7 @@ class PreProcessor(object):
         numeric_chapters = r".?(\d+\.?|(CHAPTER\s*([\dA-Z\-\'\"\?\.!#,]+\s*){1,10}))\s*"
         uppercase_chapters = r"\s*.?([A-Z#]+(\s|-){0,3}){1,5}\s*"
         numeric_titles = r".?(\d+\.?\s+([\d\w-]+\:?\'?-?\s?){0,5})\s*"
-        emphasized_lines = r"<b[^>]*>\s*(<span[^>]*>)?\s*(\s*(?=[\w#\-*\s]+<)([\w#-*]+\s*){1,5}\s*)(</span>)?\s*</b>"
+        emphasized_lines = r"<b[^>]*>\s*(<span[^>]*>)?\s*(?!([*#•]+\s*)+)(\s*(?=[\w#\-*\s]+<)([\w#-*]+\s*){1,5}\s*)(</span>)?\s*</b>"
 
         full_chapter_line = chapter_line_open+chapter_header_open+typical_chapters+chapter_header_close+chapter_line_close
         n_lookahead = re.sub("(ou|in|cha)", "lookahead_", full_chapter_line)
@@ -308,10 +308,10 @@ class PreProcessor(object):
         html = re.sub(u'\xad\s*(</span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*', '', html)
 
         # If still no sections after unwrapping mark split points on lines with no punctuation
-        if self.html_preprocess_sections < 10:
+        if self.html_preprocess_sections < 5:
             self.log("Looking for more split points based on punctuation,"
                     " currently have " + unicode(self.html_preprocess_sections))
-            chapdetect3 = re.compile(r'<(?P<styles>(p|div)[^>]*)>\s*(?P<section>(<span[^>]*>)?\s*(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*.?(?=[a-z#\-*\s]+<)([a-z#-*]+\s*){1,5}\s*\s*(</span>)?(</[ibu]>){0,2}\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</span>)?\s*</(p|div)>)', re.IGNORECASE)
+            chapdetect3 = re.compile(r'<(?P<styles>(p|div)[^>]*)>\s*(?P<section>(<span[^>]*>)?\s*(?!([*#•]+\s*)+)(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*.?(?=[a-z#\-*\s]+<)([a-z#-*]+\s*){1,5}\s*\s*(</span>)?(</[ibu]>){0,2}\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</span>)?\s*</(p|div)>)', re.IGNORECASE)
             html = chapdetect3.sub(self.chapter_break, html)
         # search for places where a first or second level heading is immediately followed by another
         # top level heading.  demote the second heading to h3 to prevent splitting between chapter

From 2b888a4add647821774fbf92ea7807bbdf435af9 Mon Sep 17 00:00:00 2001
From: ldolse <ldolse@yahoo.com>
Date: Fri, 19 Nov 2010 10:03:56 +0800
Subject: [PATCH 15/28] fix a problem with pdf unwrap_factor getting set to 0.0

---
 src/calibre/gui2/convert/pdf_input.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/calibre/gui2/convert/pdf_input.py b/src/calibre/gui2/convert/pdf_input.py
index 967a0fe234..f1ef7d24ee 100644
--- a/src/calibre/gui2/convert/pdf_input.py
+++ b/src/calibre/gui2/convert/pdf_input.py
@@ -22,5 +22,5 @@ class PluginWidget(Widget, Ui_Form):
 
     def set_value_handler(self, g, val):
         if val is None and isinstance(g, QDoubleSpinBox):
-            g.setValue(0.0)
+            g.setValue(0.45)
             return True

From 2a40afbd8e819e8fee0261e1f35ba54af235be8d Mon Sep 17 00:00:00 2001
From: ldolse <ldolse@yahoo.com>
Date: Fri, 19 Nov 2010 12:54:25 +0800
Subject: [PATCH 16/28] blanklines are preserved in rtf2xml, then converted to
 empty html paragraphs to preserve softbreaks

---
 src/calibre/ebooks/rtf/input.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/src/calibre/ebooks/rtf/input.py b/src/calibre/ebooks/rtf/input.py
index 078b30627f..d7619d471a 100644
--- a/src/calibre/ebooks/rtf/input.py
+++ b/src/calibre/ebooks/rtf/input.py
@@ -84,7 +84,7 @@ class RTFInput(InputFormatPlugin):
             group_borders = 1,
 
             # Write or do not write paragraphs. Default is 0.
-            empty_paragraphs = 0,
+            empty_paragraphs = 1,
         )
         parser.parse_rtf()
         ans = open('out.xml').read()
@@ -228,6 +228,10 @@ class RTFInput(InputFormatPlugin):
         with open(html, 'wb') as f:
             res = transform.tostring(result)
             res = res[:100].replace('xmlns:html', 'xmlns') + res[100:]
+            # Replace newlines inserted by the 'empty_paragraphs' option in rtf2xml with html blank lines
+            if not getattr(self.options, 'remove_paragraph_spacing', False):
+                res = re.sub('\s*<body>', '<body>', res)
+                res = re.sub('\n{4}', u'\n<p>\u00a0</p>\n', res)
             if self.options.preprocess_html:
                 preprocessor = PreProcessor(self.options, log=getattr(self, 'log', None))
                 res = preprocessor(res)

From f8f908ecd670f63ac07573d9ea330abfbca4ff3a Mon Sep 17 00:00:00 2001
From: ldolse <ldolse@yahoo.com>
Date: Fri, 19 Nov 2010 13:23:32 +0800
Subject: [PATCH 17/28] use a lookbehind when converting rtf blank lines to empty paragraphs

---
 src/calibre/ebooks/rtf/input.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/calibre/ebooks/rtf/input.py b/src/calibre/ebooks/rtf/input.py
index d7619d471a..d0ef19ecd9 100644
--- a/src/calibre/ebooks/rtf/input.py
+++ b/src/calibre/ebooks/rtf/input.py
@@ -231,7 +231,7 @@ class RTFInput(InputFormatPlugin):
             # Replace newlines inserted by the 'empty_paragraphs' option in rtf2xml with html blank lines
             if not getattr(self.options, 'remove_paragraph_spacing', False):
                 res = re.sub('\s*<body>', '<body>', res)
-                res = re.sub('\n{4}', u'\n<p>\u00a0</p>\n', res)
+                res = re.sub('(?<=\n)\n{2}', u'<p>\u00a0</p>\n', res)
             if self.options.preprocess_html:
                 preprocessor = PreProcessor(self.options, log=getattr(self, 'log', None))
                 res = preprocessor(res)

From 25c93421fb38455a4b57eb4e84bb9c55eb507299 Mon Sep 17 00:00:00 2001
From: ldolse <ldolse@yahoo.com>
Date: Sat, 20 Nov 2010 12:25:56 +0800
Subject: [PATCH 18/28] merge from trunk

---
 src/calibre/ebooks/conversion/plumber.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/calibre/ebooks/conversion/plumber.py b/src/calibre/ebooks/conversion/plumber.py
index 9a863d7e66..d0e9aa2e99 100644
--- a/src/calibre/ebooks/conversion/plumber.py
+++ b/src/calibre/ebooks/conversion/plumber.py
@@ -957,6 +957,8 @@ def create_oebbook(log, path_or_stream, opts, input_plugin, reader=None,
     '''
     Create an OEBBook.
     '''
+    if input_plugin == 'LITInput':
+        print "***\n\n*** Input plugin is: "+str(input_plugin)+"\n\n****"
     from calibre.ebooks.oeb.base import OEBBook
     html_preprocessor = HTMLPreProcessor(input_plugin.preprocess_html,
             opts.preprocess_html, opts)

From 9c2dcfd5aff2b6e521677bf8afeac68fb81c7816 Mon Sep 17 00:00:00 2001
From: ldolse <ldolse@yahoo.com>
Date: Sat, 20 Nov 2010 12:26:57 +0800
Subject: [PATCH 19/28] remove LITInput debug print from create_oebbook

---
 src/calibre/ebooks/conversion/plumber.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/src/calibre/ebooks/conversion/plumber.py b/src/calibre/ebooks/conversion/plumber.py
index d0e9aa2e99..9a863d7e66 100644
--- a/src/calibre/ebooks/conversion/plumber.py
+++ b/src/calibre/ebooks/conversion/plumber.py
@@ -957,8 +957,6 @@ def create_oebbook(log, path_or_stream, opts, input_plugin, reader=None,
     '''
     Create an OEBBook.
     '''
-    if input_plugin == 'LITInput':
-        print "***\n\n*** Input plugin is: "+str(input_plugin)+"\n\n****"
     from calibre.ebooks.oeb.base import OEBBook
     html_preprocessor = HTMLPreProcessor(input_plugin.preprocess_html,
             opts.preprocess_html, opts)

From 267eebb9aa489cc443e57e90a9353730345af0c3 Mon Sep 17 00:00:00 2001
From: ldolse <ldolse@yahoo.com>
Date: Sun, 21 Nov 2010 15:38:18 +0800
Subject: [PATCH 20/28] adjusted preprocessing regexes for hyphen removal and
 chapter marking

---
 src/calibre/ebooks/conversion/preprocess.py |  2 +-
 src/calibre/ebooks/conversion/utils.py      | 10 +++++-----
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py
index ef092f7954..3ff816b3bf 100644
--- a/src/calibre/ebooks/conversion/preprocess.py
+++ b/src/calibre/ebooks/conversion/preprocess.py
@@ -220,7 +220,7 @@ class Dehyphenator(object):
         self.html = html
         self.format = format
         if format == 'html':
-            intextmatch = re.compile(u'(?<=.{%i})(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)(-|‐)\s*(?=<)(?P<wraptags></span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*(?P<secondpart>[\w\d]+)' % length)
+            intextmatch = re.compile(u'(?<=.{%i})(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)(-|‐)\s*(?=<)(?P<wraptags>(</span>)?\s*(</[iubp]>\s*){1,2}(?P<up2threeblanks><(p|div)[^>]*>\s*(<p[^>]*>\s*</p>\s*)?</(p|div)>\s+){0,3}\s*(<[iubp][^>]*>\s*){1,2}(<span[^>]*>)?)\s*(?P<secondpart>[\w\d]+)' % length)
         elif format == 'pdf':
             intextmatch = re.compile(u'(?<=.{%i})(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)(-|‐)\s*(?P<wraptags><p>|</[iub]>\s*<p>\s*<[iub]>)\s*(?P<secondpart>[\w\d]+)'% length)
         elif format == 'individual_words':
diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py
index 20689c6950..feb74324e8 100644
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@@ -107,7 +107,7 @@ class PreProcessor(object):
         # Arrange line feeds and </p> tags so the line_length and no_markup functions work correctly
         html = re.sub(r"\s*</p>", "</p>\n", html)
         html = re.sub(r"\s*<p>\s*", "\n<p>", html)
-
+        
         ###### Check Markup ######
         #
         # some lit files don't have any <p> tags or equivalent (generally just plain text between
@@ -191,10 +191,10 @@ class PreProcessor(object):
         n_lookahead_close = ")"
 
         default_title = r"(\s*[\w\'\"-]+){1,5}?(?=<)"
-        typical_chapters = r".?(Introduction|Synopsis|Acknowledgements|Chapter|Kapitel|Epilogue|Volume\s|Prologue|Book\s|Part\s|Dedication)\s*([\d\w-]+\:?\s*){0,4}"
-        numeric_chapters = r".?(\d+\.?|(CHAPTER\s*([\dA-Z\-\'\"\?\.!#,]+\s*){1,10}))\s*"
-        uppercase_chapters = r"\s*.?([A-Z#]+(\s|-){0,3}){1,5}\s*"
-        numeric_titles = r".?(\d+\.?\s+([\d\w-]+\:?\'?-?\s?){0,5})\s*"
+        typical_chapters = r"[^'\"]?(Introduction|Synopsis|Acknowledgements|Chapter|Kapitel|Epilogue|Volume\s|Prologue|Book\s|Part\s|Dedication)\s*([\d\w-]+\:?\s*){0,4}"
+        numeric_chapters = r"[^'\"]?(\d+\.?|CHAPTER)\s*([\dA-Z\-\'\"\?\.!#,]+\s*){1,10}\s*"
+        uppercase_chapters = r"\s*[^'\"]?([A-Z#]+(\s|-){0,3}){1,5}\s*"
+        numeric_titles = r"[^'\"]?(\d+\.?\s+([\d\w-]+\:?\'?-?\s?){0,5})\s*"
         emphasized_lines = r"<b[^>]*>\s*(<span[^>]*>)?\s*(?!([*#•]+\s*)+)(\s*(?=[\w#\-*\s]+<)([\w#-*]+\s*){1,5}\s*)(</span>)?\s*</b>"
 
         full_chapter_line = chapter_line_open+chapter_header_open+typical_chapters+chapter_header_close+chapter_line_close

From 73278a8cd65dc780155154712ecdb77048fbacb0 Mon Sep 17 00:00:00 2001
From: ldolse <ldolse@yahoo.com>
Date: Sun, 21 Nov 2010 15:40:02 +0800
Subject: [PATCH 21/28] revert pdf_input spin box fallback value to 0.0

---
 src/calibre/gui2/convert/pdf_input.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/calibre/gui2/convert/pdf_input.py b/src/calibre/gui2/convert/pdf_input.py
index f1ef7d24ee..967a0fe234 100644
--- a/src/calibre/gui2/convert/pdf_input.py
+++ b/src/calibre/gui2/convert/pdf_input.py
@@ -22,5 +22,5 @@ class PluginWidget(Widget, Ui_Form):
 
     def set_value_handler(self, g, val):
         if val is None and isinstance(g, QDoubleSpinBox):
-            g.setValue(0.45)
+            g.setValue(0.0)
             return True

From c378a90a927bd9e1d075699226353ac05ccd9422 Mon Sep 17 00:00:00 2001
From: ldolse <ldolse@yahoo.com>
Date: Sun, 21 Nov 2010 16:43:06 +0800
Subject: [PATCH 22/28] reworked chapter marking code

---
 src/calibre/ebooks/conversion/utils.py | 86 ++++++++------------------
 1 file changed, 26 insertions(+), 60 deletions(-)

diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py
index feb74324e8..acd8d3f02a 100644
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@@ -172,7 +172,7 @@ class PreProcessor(object):
         # detect chapters/sections to match xpath or splitting logic
         #
         # Build the Regular Expressions in pieces
-        lookahead = "(?=<(p|div))"
+        init_lookahead = "(?=<(p|div))"
         chapter_line_open = "<(?P<outer>p|div)[^>]*>\s*(<(?P<inner1>font|span|[ibu])[^>]*>)?\s*(<(?P<inner2>font|span|[ibu])[^>]*>)?\s*(<(?P<inner3>font|span|[ibu])[^>]*>)?\s*"
         chapter_header_open = r"(?P<chap>"
         chapter_header_close = ")\s*"
@@ -191,69 +191,35 @@ class PreProcessor(object):
         n_lookahead_close = ")"
 
         default_title = r"(\s*[\w\'\"-]+){1,5}?(?=<)"
-        typical_chapters = r"[^'\"]?(Introduction|Synopsis|Acknowledgements|Chapter|Kapitel|Epilogue|Volume\s|Prologue|Book\s|Part\s|Dedication)\s*([\d\w-]+\:?\s*){0,4}"
-        numeric_chapters = r"[^'\"]?(\d+\.?|CHAPTER)\s*([\dA-Z\-\'\"\?\.!#,]+\s*){1,10}\s*"
-        uppercase_chapters = r"\s*[^'\"]?([A-Z#]+(\s|-){0,3}){1,5}\s*"
-        numeric_titles = r"[^'\"]?(\d+\.?\s+([\d\w-]+\:?\'?-?\s?){0,5})\s*"
-        emphasized_lines = r"<b[^>]*>\s*(<span[^>]*>)?\s*(?!([*#•]+\s*)+)(\s*(?=[\w#\-*\s]+<)([\w#-*]+\s*){1,5}\s*)(</span>)?\s*</b>"
-
-        full_chapter_line = chapter_line_open+chapter_header_open+typical_chapters+chapter_header_close+chapter_line_close
-        n_lookahead = re.sub("(ou|in|cha)", "lookahead_", full_chapter_line)
-        #print "n_lookahead is:\n" + n_lookahead + "\n\n"
-        #print "'normal' Chapter line - no title - is:\n" + full_chapter_line + "\n\n"
-        chapter_marker = lookahead+full_chapter_line+blank_lines+n_lookahead_open+n_lookahead+n_lookahead_close+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close
-        #print "full chapter regex with lookahead is:\n" + chapter_marker + "\n\n"
+        
+        min_chapters = 10
         heading = re.compile('<h[1-3][^>]*>', re.IGNORECASE)
         self.html_preprocess_sections = len(heading.findall(html))
         self.log("found " + unicode(self.html_preprocess_sections) + " pre-existing headings")
-        #
-        # Start with most typical chapter headings, get more aggressive until one works
-        if self.html_preprocess_sections < 10:
-            chapdetect = re.compile(r'%s' % chapter_marker, re.IGNORECASE)
+        
+        chapter_types = [
+            [r"[^'\"]?(Introduction|Synopsis|Acknowledgements|Chapter|Kapitel|Epilogue|Volume\s|Prologue|Book\s|Part\s|Dedication)\s*([\d\w-]+\:?\s*){0,4}", True, "Searching for common Chapter Headings"],
+            [r"[^'\"]?(\d+\.?|CHAPTER)\s*([\dA-Z\-\'\"\?\.!#,]+\s*){1,10}\s*", True, "Searching for numeric chapter headings"],  # Numeric Chapters
+            [r"<b[^>]*>\s*(<span[^>]*>)?\s*(?!([*#•]+\s*)+)(\s*(?=[\w#\-*\s]+<)([\w#-*]+\s*){1,5}\s*)(</span>)?\s*</b>", True, "Searching for emphasized lines"], # Emphasized lines
+            [r"[^'\"]?(\d+\.?\s+([\d\w-]+\:?\'?-?\s?){0,5})\s*", True, "Searching for numeric chapters with titles"], # Numeric Titles
+            [r"\s*[^'\"]?([A-Z#]+(\s|-){0,3}){1,5}\s*", False, "Searching for chapters with Uppercase Characters" ] # Uppercase Chapters
+            ]
+        
+        for [chapter_type, lookahead_ignorecase, log_message] in chapter_types:
+            if self.html_preprocess_sections >= min_chapters:
+                break
+            full_chapter_line = chapter_line_open+chapter_header_open+chapter_type+chapter_header_close+chapter_line_close
+            n_lookahead = re.sub("(ou|in|cha)", "lookahead_", full_chapter_line)
+            self.log("Marked " + unicode(self.html_preprocess_sections) + " headings, " + log_message)
+            if lookahead_ignorecase:
+                chapter_marker = init_lookahead+full_chapter_line+blank_lines+n_lookahead_open+n_lookahead+n_lookahead_close+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close
+                chapdetect = re.compile(r'%s' % chapter_marker, re.IGNORECASE)
+            else:
+                chapter_marker = init_lookahead+full_chapter_line+blank_lines+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close+n_lookahead_open+n_lookahead+n_lookahead_close
+                chapdetect = re.compile(r'%s' % chapter_marker, re.UNICODE)
+                
             html = chapdetect.sub(self.chapter_head, html)
-        if self.html_preprocess_sections < 10:
-            self.log("not enough chapters, only " + unicode(self.html_preprocess_sections) + ", trying numeric chapters")
-            full_chapter_line = chapter_line_open+chapter_header_open+numeric_chapters+chapter_header_close+chapter_line_close
-            n_lookahead = re.sub("(ou|in|cha)", "lookahead_", full_chapter_line)
-            #print "n_lookahead is " + n_lookahead
-            #print "Chapter line is " + full_chapter_line + "\n\n"
-            chapter_marker = lookahead+full_chapter_line+blank_lines+n_lookahead_open+n_lookahead+n_lookahead_close+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close
-            #print chapter_marker
-            chapdetect2 = re.compile(r'%s' % chapter_marker, re.IGNORECASE)
-            html = chapdetect2.sub(self.chapter_head, html)
-
-        if self.html_preprocess_sections < 10:
-            self.log("not enough chapters, only " + unicode(self.html_preprocess_sections) + ", trying emphazised lines")
-            full_chapter_line = chapter_line_open+chapter_header_open+emphasized_lines+chapter_header_close+chapter_line_close
-            n_lookahead = re.sub("(ou|in|cha)", "lookahead_", full_chapter_line)
-            #print "n_lookahead is " + n_lookahead
-            #print "Chapter line is " + full_chapter_line + "\n\n"
-            chapter_marker = lookahead+full_chapter_line+blank_lines+n_lookahead_open+n_lookahead+n_lookahead_close+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close
-            #print chapter_marker
-            chapdetect2 = re.compile(r'%s' % chapter_marker, re.IGNORECASE)
-            html = chapdetect2.sub(self.chapter_head, html)            
-
-        if self.html_preprocess_sections < 10:
-            self.log("not enough chapters, only " + unicode(self.html_preprocess_sections) + ", trying with uppercase words")
-            full_chapter_line = chapter_line_open+chapter_header_open+uppercase_chapters+chapter_header_close+chapter_line_close
-            n_lookahead = re.sub("(ou|in|cha)", "lookahead_", full_chapter_line)
-            #print "n_lookahead is " + n_lookahead
-            #print "Chapter line is " + full_chapter_line + "\n\n"
-            chapter_marker = lookahead+full_chapter_line+blank_lines+n_lookahead_open+n_lookahead+n_lookahead_close+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close
-            #print chapter_marker
-            chapdetect2 = re.compile(r'%s' % chapter_marker,  re.UNICODE)
-            html = chapdetect2.sub(self.chapter_head, html)
-
-        if self.html_preprocess_sections < 10:
-            self.log("not enough chapters, only " + unicode(self.html_preprocess_sections) + ", trying numeric chapters with titles")
-            full_chapter_line = chapter_line_open+chapter_header_open+numeric_titles+chapter_header_close+chapter_line_close
-            n_lookahead = re.sub("(ou|in|cha)", "lookahead_", full_chapter_line)
-            #print "n_lookahead is " + n_lookahead
-            #print "Chapter line is " + full_chapter_line + "\n\n"
-            chapter_marker = lookahead+full_chapter_line+blank_lines+n_lookahead_open+n_lookahead+n_lookahead_close+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close
-            #print chapter_marker
-            chapdetect2 = re.compile(r'%s' % chapter_marker, re.IGNORECASE)
-            html = chapdetect2.sub(self.chapter_head, html)
+            
 
         ###### Unwrap lines ######
         #

From fae3252d50f3316458dad2606a3362e2345f5326 Mon Sep 17 00:00:00 2001
From: ldolse <ldolse@yahoo.com>
Date: Sun, 21 Nov 2010 17:37:56 +0800
Subject: [PATCH 23/28] further cleanup to chapter markup

---
 src/calibre/ebooks/conversion/utils.py | 18 +++++++++++-------
 1 file changed, 11 insertions(+), 7 deletions(-)

diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py
index acd8d3f02a..af3d83da4a 100644
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@@ -168,29 +168,30 @@ class PreProcessor(object):
                #print "blanks between paragraphs is marked True"
             else:
                 blanks_between_paragraphs = False
-        #self.dump(html, 'before_chapter_markup')
+        self.dump(html, 'before_chapter_markup')
         # detect chapters/sections to match xpath or splitting logic
         #
         # Build the Regular Expressions in pieces
         init_lookahead = "(?=<(p|div))"
         chapter_line_open = "<(?P<outer>p|div)[^>]*>\s*(<(?P<inner1>font|span|[ibu])[^>]*>)?\s*(<(?P<inner2>font|span|[ibu])[^>]*>)?\s*(<(?P<inner3>font|span|[ibu])[^>]*>)?\s*"
+        title_line_open = "<(?P<outer2>p|div)[^>]*>\s*(<(?P<inner4>font|span|[ibu])[^>]*>)?\s*(<(?P<inner5>font|span|[ibu])[^>]*>)?\s*(<(?P<inner6>font|span|[ibu])[^>]*>)?\s*"
         chapter_header_open = r"(?P<chap>"
+        title_header_open = r"(?P<title>"
         chapter_header_close = ")\s*"
+        title_header_close = ")"
         chapter_line_close = "(</(?P=inner3)>)?\s*(</(?P=inner2)>)?\s*(</(?P=inner1)>)?\s*</(?P=outer)>"
+        title_line_close = "(</(?P=inner6)>)?\s*(</(?P=inner5)>)?\s*(</(?P=inner4)>)?\s*</(?P=outer2)>"
+
         if blanks_between_paragraphs:
             blank_lines = "(\s*<p[^>]*>\s*</p>){0,2}\s*"
         else:
             blank_lines = ""
         opt_title_open = "("
-        title_line_open = "<(?P<outer2>p|div)[^>]*>\s*(<(?P<inner4>font|span|[ibu])[^>]*>)?\s*(<(?P<inner5>font|span|[ibu])[^>]*>)?\s*(<(?P<inner6>font|span|[ibu])[^>]*>)?\s*"
-        title_header_open = "(?P<title>"
-        title_header_close = ")\s*"
-        title_line_close = "(</(?P=inner6)>)?\s*(</(?P=inner5)>)?\s*(</(?P=inner4)\s[^>]*>)?\s*</(?P=outer2)>"
         opt_title_close = ")?"
         n_lookahead_open = "\s+(?!"
         n_lookahead_close = ")"
 
-        default_title = r"(\s*[\w\'\"-]+){1,5}?(?=<)"
+        default_title = r"\s{0,3}([\w\'\"-]+\s{0,3}){1,5}?(?=<)"
         
         min_chapters = 10
         heading = re.compile('<h[1-3][^>]*>', re.IGNORECASE)
@@ -204,7 +205,8 @@ class PreProcessor(object):
             [r"[^'\"]?(\d+\.?\s+([\d\w-]+\:?\'?-?\s?){0,5})\s*", True, "Searching for numeric chapters with titles"], # Numeric Titles
             [r"\s*[^'\"]?([A-Z#]+(\s|-){0,3}){1,5}\s*", False, "Searching for chapters with Uppercase Characters" ] # Uppercase Chapters
             ]
-        
+
+        # Start with most typical chapter headings, get more aggressive until one works
         for [chapter_type, lookahead_ignorecase, log_message] in chapter_types:
             if self.html_preprocess_sections >= min_chapters:
                 break
@@ -215,7 +217,9 @@ class PreProcessor(object):
                 chapter_marker = init_lookahead+full_chapter_line+blank_lines+n_lookahead_open+n_lookahead+n_lookahead_close+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close
                 chapdetect = re.compile(r'%s' % chapter_marker, re.IGNORECASE)
             else:
+                print "Chapter line is:\n"+full_chapter_line
                 chapter_marker = init_lookahead+full_chapter_line+blank_lines+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close+n_lookahead_open+n_lookahead+n_lookahead_close
+                print "\nFull regex is:\n"+chapter_marker
                 chapdetect = re.compile(r'%s' % chapter_marker, re.UNICODE)
                 
             html = chapdetect.sub(self.chapter_head, html)

From cf4f9e41c273fe5f63db22a33120cd3a380bc730 Mon Sep 17 00:00:00 2001
From: ldolse <ldolse@yahoo.com>
Date: Sun, 21 Nov 2010 17:49:46 +0800
Subject: [PATCH 24/28] ...

---
 src/calibre/ebooks/conversion/utils.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py
index af3d83da4a..26c8d23e0c 100644
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@@ -217,9 +217,7 @@ class PreProcessor(object):
                 chapter_marker = init_lookahead+full_chapter_line+blank_lines+n_lookahead_open+n_lookahead+n_lookahead_close+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close
                 chapdetect = re.compile(r'%s' % chapter_marker, re.IGNORECASE)
             else:
-                print "Chapter line is:\n"+full_chapter_line
                 chapter_marker = init_lookahead+full_chapter_line+blank_lines+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close+n_lookahead_open+n_lookahead+n_lookahead_close
-                print "\nFull regex is:\n"+chapter_marker
                 chapdetect = re.compile(r'%s' % chapter_marker, re.UNICODE)
                 
             html = chapdetect.sub(self.chapter_head, html)

From e1602dc31a2ac0b3f8f4367fd5d881369906c7e1 Mon Sep 17 00:00:00 2001
From: ldolse <ldolse@yahoo.com>
Date: Sun, 21 Nov 2010 18:03:25 +0800
Subject: [PATCH 25/28] ...

---
 src/calibre/ebooks/conversion/utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py
index 26c8d23e0c..ea78808d08 100644
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@@ -168,7 +168,7 @@ class PreProcessor(object):
                #print "blanks between paragraphs is marked True"
             else:
                 blanks_between_paragraphs = False
-        self.dump(html, 'before_chapter_markup')
+        #self.dump(html, 'before_chapter_markup')
         # detect chapters/sections to match xpath or splitting logic
         #
         # Build the Regular Expressions in pieces

From caabf1d814a419c79aafb1a78a372afc894420de Mon Sep 17 00:00:00 2001
From: ldolse <ldolse@yahoo.com>
Date: Sun, 21 Nov 2010 18:29:59 +0800
Subject: [PATCH 26/28] ...

---
 src/calibre/ebooks/conversion/utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py
index ea78808d08..4f3e2ed90a 100644
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@@ -200,7 +200,7 @@ class PreProcessor(object):
         
         chapter_types = [
             [r"[^'\"]?(Introduction|Synopsis|Acknowledgements|Chapter|Kapitel|Epilogue|Volume\s|Prologue|Book\s|Part\s|Dedication)\s*([\d\w-]+\:?\s*){0,4}", True, "Searching for common Chapter Headings"],
-            [r"[^'\"]?(\d+\.?|CHAPTER)\s*([\dA-Z\-\'\"\?\.!#,]+\s*){1,10}\s*", True, "Searching for numeric chapter headings"],  # Numeric Chapters
+            [r"[^'\"]?(\d+\.?|CHAPTER)\s*([\dA-Z\-\'\"\?\.!#,]+\s*){0,10}\s*", True, "Searching for numeric chapter headings"],  # Numeric Chapters
             [r"<b[^>]*>\s*(<span[^>]*>)?\s*(?!([*#•]+\s*)+)(\s*(?=[\w#\-*\s]+<)([\w#-*]+\s*){1,5}\s*)(</span>)?\s*</b>", True, "Searching for emphasized lines"], # Emphasized lines
             [r"[^'\"]?(\d+\.?\s+([\d\w-]+\:?\'?-?\s?){0,5})\s*", True, "Searching for numeric chapters with titles"], # Numeric Titles
             [r"\s*[^'\"]?([A-Z#]+(\s|-){0,3}){1,5}\s*", False, "Searching for chapters with Uppercase Characters" ] # Uppercase Chapters

From 7b35480ce2acf9a947193f504fe26ac78fb8ca94 Mon Sep 17 00:00:00 2001
From: ldolse <ldolse@yahoo.com>
Date: Sun, 21 Nov 2010 19:40:17 +0800
Subject: [PATCH 27/28] fixed a problem with some formats and line unwrapping

---
 src/calibre/ebooks/conversion/utils.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py
index 4f3e2ed90a..2039a545ca 100644
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@@ -22,12 +22,12 @@ class PreProcessor(object):
         title = match.group('title')
         if not title:
             self.html_preprocess_sections = self.html_preprocess_sections + 1
-            self.log("found " + unicode(self.html_preprocess_sections) +
+            self.log("marked " + unicode(self.html_preprocess_sections) +
                     " chapters. - " + unicode(chap))
             return '<h2>'+chap+'</h2>\n'
         else:
             self.html_preprocess_sections = self.html_preprocess_sections + 1
-            self.log("found " + unicode(self.html_preprocess_sections) +
+            self.log("marked " + unicode(self.html_preprocess_sections) +
                     " chapters & titles. - " + unicode(chap) + ", " + unicode(title))
             return '<h2>'+chap+'</h2>\n<h3>'+title+'</h3>\n'
 
@@ -106,7 +106,7 @@ class PreProcessor(object):
 
         # Arrange line feeds and </p> tags so the line_length and no_markup functions work correctly
         html = re.sub(r"\s*</p>", "</p>\n", html)
-        html = re.sub(r"\s*<p>\s*", "\n<p>", html)
+        html = re.sub(r"\s*<p(?P<style>[^>]*)>\s*", "\n<p"+"\g<style>"+">", html)
         
         ###### Check Markup ######
         #
@@ -200,7 +200,7 @@ class PreProcessor(object):
         
         chapter_types = [
             [r"[^'\"]?(Introduction|Synopsis|Acknowledgements|Chapter|Kapitel|Epilogue|Volume\s|Prologue|Book\s|Part\s|Dedication)\s*([\d\w-]+\:?\s*){0,4}", True, "Searching for common Chapter Headings"],
-            [r"[^'\"]?(\d+\.?|CHAPTER)\s*([\dA-Z\-\'\"\?\.!#,]+\s*){0,10}\s*", True, "Searching for numeric chapter headings"],  # Numeric Chapters
+            [r"[^'\"]?(\d+\.?|CHAPTER)\s*([\dA-Z\-\'\"\?\.!#,]+\s*){0,7}\s*", True, "Searching for numeric chapter headings"],  # Numeric Chapters
             [r"<b[^>]*>\s*(<span[^>]*>)?\s*(?!([*#•]+\s*)+)(\s*(?=[\w#\-*\s]+<)([\w#-*]+\s*){1,5}\s*)(</span>)?\s*</b>", True, "Searching for emphasized lines"], # Emphasized lines
             [r"[^'\"]?(\d+\.?\s+([\d\w-]+\:?\'?-?\s?){0,5})\s*", True, "Searching for numeric chapters with titles"], # Numeric Titles
             [r"\s*[^'\"]?([A-Z#]+(\s|-){0,3}){1,5}\s*", False, "Searching for chapters with Uppercase Characters" ] # Uppercase Chapters
@@ -241,6 +241,7 @@ class PreProcessor(object):
             format = 'html'
         # Check Line histogram to determine if the document uses hard line breaks, If 50% or
         # more of the lines break in the same region of the document then unwrapping is required
+        self.dump(html, 'before_doc_analysis_zipped_http')
         docanalysis = DocAnalysis(format, html)
         hardbreaks = docanalysis.line_histogram(.50)
         self.log("Hard line breaks check returned "+unicode(hardbreaks))

From 6c3fc5280f33434366ad38fc0c1128fda8810d3e Mon Sep 17 00:00:00 2001
From: ldolse <ldolse@yahoo.com>
Date: Sun, 21 Nov 2010 19:52:55 +0800
Subject: [PATCH 28/28] ...

---
 src/calibre/ebooks/conversion/utils.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py
index 2039a545ca..cca3679d14 100644
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@@ -241,7 +241,6 @@ class PreProcessor(object):
             format = 'html'
         # Check Line histogram to determine if the document uses hard line breaks, If 50% or
         # more of the lines break in the same region of the document then unwrapping is required
-        self.dump(html, 'before_doc_analysis_zipped_http')
         docanalysis = DocAnalysis(format, html)
         hardbreaks = docanalysis.line_histogram(.50)
         self.log("Hard line breaks check returned "+unicode(hardbreaks))