From 0ad1f3c088f2ff0872de49171fd99a91a50a031a Mon Sep 17 00:00:00 2001
From: ldolse <ldolse@yahoo.com>
Date: Wed, 25 Aug 2010 10:49:42 +1000
Subject: [PATCH 01/12] preprocessing regex tweaks

---
 src/calibre/ebooks/conversion/preprocess.py | 2 +-
 src/calibre/ebooks/rtf/input.py             | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)
diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py
index da652c1a38..940c27344b 100644
--- a/src/calibre/ebooks/conversion/preprocess.py
+++ b/src/calibre/ebooks/conversion/preprocess.py
@@ -209,7 +209,7 @@ class HTMLPreProcessor(object):
                   (re.compile(ur'\u00a0'), lambda match : ' '),
 
                   # Detect Chapters to match default XPATH in GUI
-                  (re.compile(r'(?=<(/?br|p))(<(/?br|p)[^>]*)?>\s*(?P<chap>(<(i|b)>(<(i|b)>)?)?(.?Chapter|Epilogue|Prologue|Book|Part)\s*([\d\w-]+(\s\w+)?)?(</(i|b)>(</(i|b)>)?)?)</?(br|p)[^>]*>\s*(?P<title>(<(i|b)>)?\s*\w+(\s*\w+)?\s*(</(i|b)>)?\s*(</?(br|p)[^>]*>))?', re.IGNORECASE), chap_head),
+                  (re.compile(r'(?=<(/?br|p))(<(/?br|p)[^>]*)?>\s*(?P<chap>(<(i|b)>(<(i|b)>)?)?(.?Chapter|Epilogue|Prologue|Book|Part|Dedication)\s*([\d\w-]+(\s\w+)?)?(</(i|b)>(</(i|b)>)?)?)</?(br|p)[^>]*>\s*(?P<title>(<(i|b)>)?\s*\w+(\s*\w+)?\s*(</(i|b)>)?\s*(</?(br|p)[^>]*>))?', re.IGNORECASE), chap_head),
                   (re.compile(r'(?=<(/?br|p))(<(/?br|p)[^>]*)?>\s*(?P<chap>([A-Z \'"!]{5,})\s*(\d+|\w+)?)(</?p[^>]*>|<br[^>]*>)\n?((?=(<i>)?\s*\w+(\s+\w+)?(</i>)?(<br[^>]*>|</?p[^>]*>))((?P<title>.*)(<br[^>]*>|</?p[^>]*>)))?'), chap_head),
 
                   # Have paragraphs show better
diff --git a/src/calibre/ebooks/rtf/input.py b/src/calibre/ebooks/rtf/input.py
index dcffbe68ca..eaba28e429 100644
--- a/src/calibre/ebooks/rtf/input.py
+++ b/src/calibre/ebooks/rtf/input.py
@@ -231,12 +231,12 @@ class RTFInput(InputFormatPlugin):
             if self.options.preprocess_html:
                 print "*********  Preprocessing HTML  *********\n"
                 # Detect Chapters to match the xpath in the GUI
-                chapdetect = re.compile(r'<p[^>]*>\s*<span[^>]*>\s*(?P<chap>(<(i|b)><(i|b)>|<(i|b)>)?(.?Chapter|Epilogue|Prologue|Book|Part|Dedication)\s*([\d\w-]+(\s\w+)?)?(</(i|b)></(i|b)>|</(i|b)>)?)\s*</span>\s*</p>', re.IGNORECASE)
+                chapdetect = re.compile(r'<p[^>]*>\s*<span[^>]*>\s*(?P<chap>(<(i|b)>(<(i|b)>)?)?(.?Chapter|Epilogue|Prologue|Book|Part|Dedication)\s*([\d\w-]+(\s\w+)?)?(</(i|b)>(</(i|b)>)?)?)\s*</span>\s*</p>', re.IGNORECASE)
                 res = chapdetect.sub('<h2>'+'\g<chap>'+'</h2>\n', res)
                 # Unwrap lines using punctation if the median length of all lines is less than 150
                 length = line_length('html', res, 0.4)
                 print "*** Median length is " + str(length) + " ***\n"
-                unwrap = re.compile(r"(?<=.{%i}[a-z,;:\IA])\s*</span>\s*(</p>)?\s*(?P<up2threeblanks><p[^>]*>\s*(<span[^>]*>\s*</span>\s*)</p>\s*){0,3}\s*<p[^>]*>\s*(<span[^>]*>)?\s*" % length, re.UNICODE)
+                unwrap = re.compile(r"(?<=.{%i}[a-z,;:\IA])\s*</span>\s*</p>\s*(?P<up2threeblanks><p[^>]*>\s*(<span[^>]*>\s*</span>\s*)</p>\s*){0,3}\s*<p[^>]*>\s*<span[^>]*>\s*" % length, re.UNICODE)
                 if length < 150:
                     res = unwrap.sub(' ', res)
             f.write(res)

From 5c951fb9628617133f17ead6d1393ea84b7c6412 Mon Sep 17 00:00:00 2001
From: ldolse <ldolse@yahoo.com>
Date: Sat, 4 Sep 2010 15:12:29 +1000
Subject: [PATCH 02/12] Preprocessing Updates

---
 src/calibre/ebooks/conversion/preprocess.py |  26 +++--
 src/calibre/ebooks/html/input.py            |   2 +-
 src/calibre/ebooks/lit/input.py             | 104 ++++++++++++++++++--
 src/calibre/ebooks/mobi/input.py            |  10 ++
 src/calibre/ebooks/pdf/reflow.py            |   4 +
 5 files changed, 132 insertions(+), 14 deletions(-)

diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py
index 957418f1fd..2954fd7c26 100644
--- a/src/calibre/ebooks/conversion/preprocess.py
+++ b/src/calibre/ebooks/conversion/preprocess.py
@@ -62,6 +62,7 @@ def wrap_lines(match):
     else:
                return ital+' '
 
+
 def line_length(format, raw, percent):
     '''
     raw is the raw text to find the line length to use for wrapping.
@@ -191,32 +192,36 @@ class HTMLPreProcessor(object):
                   (re.compile(u'¸\s*(<br.*?>)*\s*c', re.UNICODE), lambda match: u'ç'),
                   (re.compile(u'¸\s*(<br.*?>)*\s*C', re.UNICODE), lambda match: u'Ç'),
 
+                  # If pdf printed from a browser then the header/footer has a reliable pattern
+                  (re.compile(r'((?<=</a>)\s*file:////?[A-Z].*<br>|file:////?[A-Z].*<br>(?=\s*<hr>))', re.IGNORECASE), lambda match: ''),
+
+                  # Center separator lines
+                  (re.compile(u'<br>\s*(?P<break>([*#•]+\s*)+)\s*<br>'), lambda match: '<p>\n<p style="text-align:center">' + match.group(1) + '</p>'),
+
                   # Remove page links
                   (re.compile(r'<a name=\d+></a>', re.IGNORECASE), lambda match: ''),
                   # Remove <hr> tags
                   (re.compile(r'<hr.*?>', re.IGNORECASE), lambda match: '<br />'),
                   # Replace <br><br> with <p>
-                  (re.compile(r'<br.*?>\s*<br.*?>', re.IGNORECASE), lambda match: '<p>'),
+                  # (re.compile(r'<br>\s*<br>', re.IGNORECASE), lambda match: '\n<p>'),
 
-                  # Remove hyphenation
-                  (re.compile(r'-<br.*?>\n\r?'), lambda match: ''),
+                  # unwrap hyphenation - don't delete the hyphen (often doesn't split words)
+                  (re.compile(r'(?<=[-–])\s*<br>\s*(?=[[a-z\d])'), lambda match: ''),
 
                   # Remove gray background
                   (re.compile(r'<BODY[^<>]+>'), lambda match : '<BODY>'),
 
                   # Detect Chapters to match default XPATH in GUI
-                  (re.compile(r'(?=<(/?br|p))(<(/?br|p)[^>]*)?>\s*(?P<chap>(<(i|b)>(<(i|b)>)?)?(.?Chapter|Epilogue|Prologue|Book|Part|Dedication)\s*([\d\w-]+(\s\w+)?)?(</(i|b)>(</(i|b)>)?)?)</?(br|p)[^>]*>\s*(?P<title>(<(i|b)>)?\s*\w+(\s*\w+)?\s*(</(i|b)>)?\s*(</?(br|p)[^>]*>))?', re.IGNORECASE), chap_head),
-                  (re.compile(r'(?=<(/?br|p))(<(/?br|p)[^>]*)?>\s*(?P<chap>([A-Z \'"!]{5,})\s*(\d+|\w+)?)(</?p[^>]*>|<br[^>]*>)\n?((?=(<i>)?\s*\w+(\s+\w+)?(</i>)?(<br[^>]*>|</?p[^>]*>))((?P<title>.*)(<br[^>]*>|</?p[^>]*>)))?'), chap_head),
+                  (re.compile(r'(?=<(/?br|p))(<(/?br|p)[^>]*)?>\s*(?P<chap>(<(i|b)>(<(i|b)>)?)?(.?Chapter|Epilogue|Prologue|Book|Part|Dedication|Volume|Preface|Acknowledgments)\s*([\d\w-]+(\s\w+)?)?\s*(</(i|b)>(</(i|b)>)?)?)\s*(</?(br|p)[^>]*>\s*){1,3}\s*(?P<title>(<(i|b)>)?(\s*\w+){1,4}\s*(</(i|b)>)?\s*(</?(br|p)[^>]*>))?', re.IGNORECASE), chap_head),
 
                   # Have paragraphs show better
                   (re.compile(r'<br.*?>'), lambda match : '<p>'),
                   # Clean up spaces
                   (re.compile(u'(?<=[\.,;\?!”"\'])[\s^ ]*(?=<)'), lambda match: ' '),
-                  # Connect paragraphs split by -
-                  (re.compile(u'(?<=[^\s][-–])[\s]*(</p>)*[\s]*(<p>)*\s*(?=[^\s])'), lambda match: ''),
                   # Add space before and after italics
                   (re.compile(u'(?<!“)<i>'), lambda match: ' <i>'),
                   (re.compile(r'</i>(?=\w)'), lambda match: '</i> '),
+                                   
                  ]
 
     # Fix Book Designer markup
@@ -293,6 +298,13 @@ class HTMLPreProcessor(object):
                 import traceback
                 print 'Failed to parse remove_footer regexp'
                 traceback.print_exc()
+        
+        # Make the more aggressive chapter marking regex optional with the preprocess option to reduce false positives
+        if getattr(self.extra_opts, 'preprocess_html', None):
+            if is_pdftohtml:
+                end_rules.append(
+                    (re.compile(r'(?=<(/?br|p|hr))(<(/?br|p|hr)[^>]*)?>\s*(<(i|b)>(<(i|b)>)?)?\s*(?P<chap>([A-Z-\'"!]{3,})\s*(\d+|[A-Z]+(\s*[A-Z]+)?)?)\s*(</(i|b)>(</(i|b)>)?)?\s*(</?p[^>]*>|<br[^>]*>)\n?((?=(<i>)?\s*\w+(\s+\w+)?(</i>)?(<br[^>]*>|</?p[^>]*>))((?P<title>.*)(<br[^>]*>|</?p[^>]*>)))?'), chap_head),
+                )
 
         if getattr(self.extra_opts, 'unwrap_factor', 0.0) > 0.01:
             length = line_length('pdf', html, getattr(self.extra_opts, 'unwrap_factor'))
diff --git a/src/calibre/ebooks/html/input.py b/src/calibre/ebooks/html/input.py
index d57bfddd3e..35a8a1a9bc 100644
--- a/src/calibre/ebooks/html/input.py
+++ b/src/calibre/ebooks/html/input.py
@@ -494,7 +494,7 @@ class HTMLInput(InputFormatPlugin):
         if not hasattr(self, 'log'):
             from calibre.utils.logging import default_log
             self.log = default_log
-		self.log("*********  Preprocessing HTML  *********")
+		self.log("*********  Preprocessing HTML - HTML Input plugin *********")
 		# Detect Chapters to match the xpath in the GUI
 		chapdetect = re.compile(r'(?=</?(br|p|span))(</?(br|p|span)[^>]*>)?\s*(?P<chap>(<(i|b)><(i|b)>|<(i|b)>)?(.?Chapter|Epilogue|Prologue|Book|Part|Dedication)\s*([\d\w-]+(\s\w+)?)?(</(i|b)></(i|b)>|</(i|b)>)?)(</?(p|br|span)[^>]*>)', re.IGNORECASE)
 		html = chapdetect.sub('<h2>'+'\g<chap>'+'</h2>\n', html)
diff --git a/src/calibre/ebooks/lit/input.py b/src/calibre/ebooks/lit/input.py
index 9bf20fb1d4..f7bb0fbfd9 100644
--- a/src/calibre/ebooks/lit/input.py
+++ b/src/calibre/ebooks/lit/input.py
@@ -11,12 +11,14 @@ import re
 from calibre.customize.conversion import InputFormatPlugin
 from calibre.ebooks.conversion.preprocess import line_length
 
+
 class LITInput(InputFormatPlugin):
 
     name        = 'LIT Input'
     author      = 'Marshall T. Vandegrift'
     description = 'Convert LIT files to HTML'
     file_types  = set(['lit'])
+    html_preprocess_sections = 0
 
     def convert(self, stream, options, file_ext, log,
                 accelerators):
@@ -55,14 +57,104 @@ class LITInput(InputFormatPlugin):
 
 
 	def preprocess_html(self, html):
+
+        def chapter_head(match):
+            chap = match.group('chap')
+            title = match.group('title')
+            if not title:
+                       self.html_preprocess_sections = self.html_preprocess_sections + 1
+                       self.log("marked " + str(self.html_preprocess_sections) + " chapters. - " + str(chap))
+                       return '<h2>'+chap+'</h2>\n'
+            else:
+                       self.html_preprocess_sections = self.html_preprocess_sections + 1
+                       self.log("marked " + str(self.html_preprocess_sections) + " chapters & titles. - " + str(chap) + ", " + str(title))
+                       return '<h2>'+chap+'</h2>\n<h3>'+title+'</h3>\n'
+
+        def chapter_link(match):
+            chap = match.group('sectionlink')
+            if not chap:
+                       self.html_preprocess_sections = self.html_preprocess_sections + 1
+                       self.log("marked " + str(self.html_preprocess_sections) + " section markers based on links")
+                       return '<br style="page-break-before:always">'
+            else:
+                       self.html_preprocess_sections = self.html_preprocess_sections + 1
+                       self.log("marked " + str(self.html_preprocess_sections) + " section markers based on links. - " + str(chap))
+                       return '<br clear="all" style="page-break-before:always">\n<h2>'+chap+'</h2>'
+
+
+        def no_markup(raw, percent):
+            '''
+            Detects total marked up line endings in the file. raw is the text to 
+            inspect.  Percent is the minimum percent of line endings which should 
+            be marked up to return true.
+            '''
+            htm_end_ere = re.compile('</p>', re.DOTALL)
+            line_end_ere = re.compile('(\r\n|\n|\r)', re.DOTALL)
+            htm_end = htm_end_ere.findall(raw)
+            line_end = line_end_ere.findall(raw)
+            tot_htm_ends = len(htm_end)
+            tot_ln_fds = len(line_end)
+            self.log("*** There are " + str(tot_ln_fds) + " total Line feeds, and " + str(tot_htm_ends) + " marked endings***")
+    
+            if percent > 1:
+                percent = 1
+            if percent < 0:
+                percent = 0    
+    
+            min_lns = tot_ln_fds * percent
+            self.log("There must be more than " + str(min_lns) + " unmarked lines to be true")
+            if min_lns > tot_htm_ends:
+                return True
+                
 		self.log("*********  Preprocessing HTML  *********")
-		# Detect Chapters to match the xpath in the GUI
-		chapdetect = re.compile(r'(?=</?(br|p|span))(</?(br|p|span)[^>]*>)?\s*(?P<chap>(<(i|b)><(i|b)>|<(i|b)>)?(.?Chapter|Epilogue|Prologue|Book|Part|Dedication)\s*([\d\w-]+(\s\w+)?)?(</(i|b)></(i|b)>|</(i|b)>)?)(</?(p|br|span)[^>]*>)', re.IGNORECASE)
-		html = chapdetect.sub('<h2>'+'\g<chap>'+'</h2>\n', html)
-		# Unwrap lines using punctation if the median length of all lines is less than 150
+		# remove non-breaking spaces
+		html = re.sub(ur'\u00a0', ' ', html)
+		# Get rid of empty <o:p> tags to simplify other processing
+		html = re.sub(ur'\s*<o:p>\s*</o:p>', ' ', html)
+		# Get rid of empty span tags
+        html = re.sub(r"\s*<span[^>]*>\s*</span>", " ", html)
+        
+        # If more than 40% of the lines are empty paragraphs then delete them to clean up spacing
+		linereg = re.compile('(?<=<p).*?(?=</p>)', re.IGNORECASE)
+        blankreg = re.compile(r'\s*<p[^>]*>\s*(<(b|i|u)>)?\s*(</(b|i|u)>)?\s*</p>', re.IGNORECASE)
+        blanklines = blankreg.findall(html)
+        lines = linereg.findall(html)
+        if len(lines) > 1:
+            self.log("There are " + str(len(blanklines)) + " blank lines. " + str(float(len(blanklines)) / float(len(lines))) + " percent blank")
+            if float(len(blanklines)) / float(len(lines)) > 0.40:
+                self.log("deleting blank lines")
+                html = blankreg.sub('', html)
+		# Arrange line feeds and </p> tags so the line_length and no_markup functions work correctly
+		html = re.sub(r"\s*</p>", "</p>\n", html)
+		
+		# some lit files don't have any <p> tags or equivalent, check and 
+		# mark up line endings if required before proceeding
+		if no_markup(html, 0.1):
+		     self.log("not enough paragraph markers, adding now")
+             add_markup = re.compile('(?<!>)(\n)')
+             html = add_markup.sub('</p>\n<p>', html)
+        
+		# detect chapters/sections to match xpath or splitting logic
 		#
-		# Insert extra line feeds so the line length regex functions properly
-		html = re.sub(r"</p>", "</p>\n", html)
+		# Mark split points based on embedded links
+		chaplink = re.compile(r'<a\sname[^>]*>\s*(<(i|b|u)>){0,2}\s*(<span[^>]*>)?\s*(?P<sectionlink>[^\s<]+(\s*[^\s<]+){0,4})?\s*(</span>)?\s*(</(i|b|u)>){0,2}\s*</a>', re.IGNORECASE)
+        html = chaplink.sub(chapter_link, html)
+        # Continue with alternate patterns, start with most typical chapter headings
+		if self.html_preprocess_sections < 10:        
+            chapdetect = re.compile(r'(?=</?(br|p))(<(/?br|p)[^>]*>)\s*(<(i|b|u)>){0,2}\s*(<span[^>]*>)?\s*(?P<chap>(<(i|b|u)>){0,2}.?(\d+\.?|Chapter|Epilogue|Volume|Prologue|Book\s|Part\s|Dedication)\s*([\d\w-]+\s*){0,4}\s*(</(i|b|u)>){0,2})\s*(</span>)?\s*(</(i|b|u)>){0,2}\s*(</(p|/?br)>)\s*(<(/?br|p)[^>]*>\s*(<(i|b|u)>){0,2}\s*(<span[^>]*>)?\s*(?P<title>(<(i|b|u)>){0,2}(\s*[\w\'\"-]+){1,5}\s*(</(i|b|u)>){0,2})\s*(</span>)?\s*(</(i|b|u)>){0,2}\s*(</(br|p)>))?', re.IGNORECASE)
+            html = chapdetect.sub(chapter_head, html)
+		if self.html_preprocess_sections < 10:
+		    self.log("not enough chapters, only " + str(self.html_preprocess_sections) + ", trying a more aggressive pattern")
+            chapdetect2 = re.compile(r'(?=</?(br|p))(<(/?br|p)[^>]*>)\s*(<(i|b|u)>){0,2}\s*(<span[^>]*>)?\s*(?P<chap>(<(i|b|u)>){0,2}\s*.?(([A-Z#]+\s*){1,9}|(CHAPTER\s*([\dA-Z\-\'\"\?\.!#,]+\s*){1,10}))\s*(</(i|b|u)>){0,2})\s*(</span>)?\s*(</(i|b|u)>){0,2}\s*(</(p|/?br)>)\s*(<(/?br|p)[^>]*>\s*(<(i|b|u)>){0,2}\s*(<span[^>]*>)?\s*(?P<title>(<(i|b|u)>){0,2}(\s*[\w\'\"-]+){1,5}\s*(</(i|b|u)>){0,2})\s*(</span>)?\s*(</(i|b|u)>){0,2}\s*(</(br|p)>))?', re.UNICODE)
+		    html = chapdetect2.sub(chapter_head, html)
+		    
+        # search for places where a first or second level heading is immediately followed by another
+        # top level heading.  demote the second heading to h3 to prevent splitting between chapter
+        # headings and titles, images, etc
+        doubleheading = re.compile(r'(?P<firsthead><h(1|2)[^>]*>.+?</h(1|2)>\s*(<(?!h\d)[^>]*>\s*)*)<h(1|2)(?P<secondhead>[^>]*>.+?)</h(1|2)>', re.IGNORECASE)
+        html = doubleheading.sub('\g<firsthead>'+'<h3'+'\g<secondhead>'+'</h3>', html)
+		#    
+		# Unwrap lines using punctation if the median length of all lines is less than 150		
 		length = line_length('html', html, 0.4)
 		self.log("*** Median length is " + str(length) + " ***")
 		unwrap = re.compile(r"(?<=.{%i}[a-z,;:\IA])\s*</(span|p|div)>\s*(</(p|span|div)>)?\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)</(span|p|div)>\s*){0,3}\s*<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*" % length, re.UNICODE)
diff --git a/src/calibre/ebooks/mobi/input.py b/src/calibre/ebooks/mobi/input.py
index 487e70c04f..b8dc7a9560 100644
--- a/src/calibre/ebooks/mobi/input.py
+++ b/src/calibre/ebooks/mobi/input.py
@@ -3,6 +3,7 @@ __license__ = 'GPL 3'
 __copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
 __docformat__ = 'restructuredtext en'
 
+import re
 from calibre.customize.conversion import InputFormatPlugin
 
 class MOBIInput(InputFormatPlugin):
@@ -37,3 +38,12 @@ class MOBIInput(InputFormatPlugin):
                     include_meta_content_type=False))
                 accelerators['pagebreaks'] = '//h:div[@class="mbp_pagebreak"]'
         return mr.created_opf_path
+
+    def preprocess_html(self, html):
+        # search for places where a first or second level heading is immediately followed by another
+        # top level heading.  demote the second heading to h3 to prevent splitting between chapter
+        # headings and titles, images, etc
+        doubleheading = re.compile(r'(?P<firsthead><h(1|2)[^>]*>.+?</h(1|2)>\s*(<(?!h\d)[^>]*>\s*)*)<h(1|2)(?P<secondhead>[^>]*>.+?)</h(1|2)>', re.IGNORECASE)
+        html = doubleheading.sub('\g<firsthead>'+'<h3'+'\g<secondhead>'+'</h3>', html)
+        return html
+
diff --git a/src/calibre/ebooks/pdf/reflow.py b/src/calibre/ebooks/pdf/reflow.py
index 584d631d0b..36848ddb8b 100644
--- a/src/calibre/ebooks/pdf/reflow.py
+++ b/src/calibre/ebooks/pdf/reflow.py
@@ -408,6 +408,10 @@ class Page(object):
     # Fraction of text height that two strings' bottoms can differ by
     # for them to be considered to be part of the same text fragment
     LINE_FACTOR = 0.4
+    
+    # Percentage of the page height which should be considered header
+    # or footer to be discarded from reflow considerations
+    HEAD_FOOTER_MARGIN = 0.1  # NOTE(review): original line had no value (NameError at class creation); 0.1 is an assumed fraction - confirm
 
     # Multiplies the average line height when determining row height
     # of a particular element to detect columns.

From 4c7373026b9ee8a618dccf8602740d6a7d578aa2 Mon Sep 17 00:00:00 2001
From: ldolse <ldolse@yahoo.com>
Date: Sat, 11 Sep 2010 12:10:49 +1000
Subject: [PATCH 03/12] preprocessing changes for lit & pdf, added utils.py,
 changed default unwrap_factor

---
 src/calibre/ebooks/conversion/preprocess.py | 15 ++++++++---
 src/calibre/ebooks/conversion/utils.py      |  6 +++++
 src/calibre/ebooks/lit/input.py             | 29 +++++++++++++--------
 src/calibre/ebooks/pdf/input.py             |  4 +--
 4 files changed, 37 insertions(+), 17 deletions(-)
 create mode 100644 src/calibre/ebooks/conversion/utils.py

diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py
index 2954fd7c26..452a322d95 100644
--- a/src/calibre/ebooks/conversion/preprocess.py
+++ b/src/calibre/ebooks/conversion/preprocess.py
@@ -77,6 +77,7 @@ def line_length(format, raw, percent):
     elif format == 'pdf':
         linere = re.compile('(?<=<br>).*?(?=<br>)', re.DOTALL)
     lines = linere.findall(raw)
+    print "percent is " + str(percent)
 
     lengths = []
     for line in lines:
@@ -165,6 +166,11 @@ class HTMLPreProcessor(object):
                   (re.compile(u'`\s*(<br.*?>)*\s*I', re.UNICODE), lambda match: u'Ì'),
                   (re.compile(u'`\s*(<br.*?>)*\s*a', re.UNICODE), lambda match: u'à'),
                   (re.compile(u'`\s*(<br.*?>)*\s*A', re.UNICODE), lambda match: u'À'),
+                  
+                  #(re.compile(u'a\s*(<br.*?>)*\s*`', re.UNICODE), lambda match: u'à'),
+                  #(re.compile(u'A\s*(<br.*?>)*\s*`', re.UNICODE), lambda match: u'À'),
+                  #(re.compile(u'o\s*(<br.*?>)*\s*`', re.UNICODE), lambda match: u'ò'),
+                  #(re.compile(u'O\s*(<br.*?>)*\s*`', re.UNICODE), lambda match: u'Ò'),
 
                   (re.compile(u'´\s*(<br.*?>)*\s*o', re.UNICODE), lambda match: u'ó'),
                   (re.compile(u'´\s*(<br.*?>)*\s*O', re.UNICODE), lambda match: u'Ó'),
@@ -206,13 +212,13 @@ class HTMLPreProcessor(object):
                   # (re.compile(r'<br>\s*<br>', re.IGNORECASE), lambda match: '\n<p>'),
 
                   # unwrap hyphenation - don't delete the hyphen (often doesn't split words)
-                  (re.compile(r'(?<=[-–])\s*<br>\s*(?=[[a-z\d])'), lambda match: ''),
+                  (re.compile(u'(?<=[-–—])\s*<br>\s*(?=[[a-z\d])'), lambda match: ''),
 
                   # Remove gray background
                   (re.compile(r'<BODY[^<>]+>'), lambda match : '<BODY>'),
 
                   # Detect Chapters to match default XPATH in GUI
-                  (re.compile(r'(?=<(/?br|p))(<(/?br|p)[^>]*)?>\s*(?P<chap>(<(i|b)>(<(i|b)>)?)?(.?Chapter|Epilogue|Prologue|Book|Part|Dedication|Volume|Preface|Acknowledgments)\s*([\d\w-]+(\s\w+)?)?\s*(</(i|b)>(</(i|b)>)?)?)\s*(</?(br|p)[^>]*>\s*){1,3}\s*(?P<title>(<(i|b)>)?(\s*\w+){1,4}\s*(</(i|b)>)?\s*(</?(br|p)[^>]*>))?', re.IGNORECASE), chap_head),
+                  (re.compile(r'(?=<(/?br|p))(<(/?br|p)[^>]*)?>\s*(?P<chap>(<(i|b)>(<(i|b)>)?)?.?(Introduction|Chapter|Epilogue|Prologue|Book|Part|Dedication|Volume|Preface|Acknowledgments)\s*([\d\w-]+\s*){0,3}\s*(</(i|b)>(</(i|b)>)?)?)\s*(</?(br|p)[^>]*>\s*){1,3}\s*(?P<title>(<(i|b)>)?(\s*\w+){1,4}\s*(</(i|b)>)?\s*(</?(br|p)[^>]*>))?', re.IGNORECASE), chap_head),
 
                   # Have paragraphs show better
                   (re.compile(r'<br.*?>'), lambda match : '<p>'),
@@ -303,15 +309,16 @@ class HTMLPreProcessor(object):
         if getattr(self.extra_opts, 'preprocess_html', None):
             if is_pdftohtml:
                 end_rules.append(
-                    (re.compile(r'(?=<(/?br|p|hr))(<(/?br|p|hr)[^>]*)?>\s*(<(i|b)>(<(i|b)>)?)?\s*(?P<chap>([A-Z-\'"!]{3,})\s*(\d+|[A-Z]+(\s*[A-Z]+)?)?)\s*(</(i|b)>(</(i|b)>)?)?\s*(</?p[^>]*>|<br[^>]*>)\n?((?=(<i>)?\s*\w+(\s+\w+)?(</i>)?(<br[^>]*>|</?p[^>]*>))((?P<title>.*)(<br[^>]*>|</?p[^>]*>)))?'), chap_head),
+                    (re.compile(r'(?=<(/?br|p|hr))(<(/?br|p|hr)[^>]*)?>\s*(<(i|b)>(<(i|b)>)?)?\s*(?P<chap>([A-Z-\'"!]{3,})\s*(\d+|[A-Z]+(\s*[A-Z]+)?)?|\d+\.?\s*([\d\w-]+\s*){0,4}\s*)\s*(</(i|b)>(</(i|b)>)?)?\s*(</?p[^>]*>|<br[^>]*>)\n?((?=(<i>)?\s*\w+(\s+\w+)?(</i>)?(<br[^>]*>|</?p[^>]*>))((?P<title>.*)(<br[^>]*>|</?p[^>]*>)))?'), chap_head),
                 )
 
         if getattr(self.extra_opts, 'unwrap_factor', 0.0) > 0.01:
             length = line_length('pdf', html, getattr(self.extra_opts, 'unwrap_factor'))
             if length:
+                print "The pdf line length returned is " + str(length)
                 end_rules.append(
                     # Un wrap using punctuation
-                    (re.compile(r'(?<=.{%i}[a-z\.,;:)\-IA])\s*(?P<ital></(i|b|u)>)?\s*(<p.*?>)\s*(?=(<(i|b|u)>)?\s*[\w\d(])' % length, re.UNICODE), wrap_lines),
+                    (re.compile(r'(?<=.{%i}[a-z,;:)\-IA])\s*(?P<ital></(i|b|u)>)?\s*(<p.*?>)\s*(?=(<(i|b|u)>)?\s*[\w\d(])' % length, re.UNICODE), wrap_lines),
                 )
 
         for rule in self.PREPROCESS + start_rules:
diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py
new file mode 100644
index 0000000000..52be473372
--- /dev/null
+++ b/src/calibre/ebooks/conversion/utils.py
@@ -0,0 +1,6 @@
+#!/usr/bin/env python
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
+
+__license__   = 'GPL v3'
+__copyright__ = '2010, Kovid Goyal <kovid@kovidgoyal.net>'
+__docformat__ = 'restructuredtext en'
\ No newline at end of file
diff --git a/src/calibre/ebooks/lit/input.py b/src/calibre/ebooks/lit/input.py
index f7bb0fbfd9..35dad501be 100644
--- a/src/calibre/ebooks/lit/input.py
+++ b/src/calibre/ebooks/lit/input.py
@@ -102,7 +102,7 @@ class LITInput(InputFormatPlugin):
                 percent = 0    
     
             min_lns = tot_ln_fds * percent
-            self.log("There must be more than " + str(min_lns) + " unmarked lines to be true")
+            self.log("There must be more than " + str(min_lns) + " unmarked lines to return true")
             if min_lns > tot_htm_ends:
                 return True
                 
@@ -141,24 +141,31 @@ class LITInput(InputFormatPlugin):
         html = chaplink.sub(chapter_link, html)
         # Continue with alternate patterns, start with most typical chapter headings
 		if self.html_preprocess_sections < 10:        
-            chapdetect = re.compile(r'(?=</?(br|p))(<(/?br|p)[^>]*>)\s*(<(i|b|u)>){0,2}\s*(<span[^>]*>)?\s*(?P<chap>(<(i|b|u)>){0,2}.?(\d+\.?|Chapter|Epilogue|Volume|Prologue|Book\s|Part\s|Dedication)\s*([\d\w-]+\s*){0,4}\s*(</(i|b|u)>){0,2})\s*(</span>)?\s*(</(i|b|u)>){0,2}\s*(</(p|/?br)>)\s*(<(/?br|p)[^>]*>\s*(<(i|b|u)>){0,2}\s*(<span[^>]*>)?\s*(?P<title>(<(i|b|u)>){0,2}(\s*[\w\'\"-]+){1,5}\s*(</(i|b|u)>){0,2})\s*(</span>)?\s*(</(i|b|u)>){0,2}\s*(</(br|p)>))?', re.IGNORECASE)
+            chapdetect = re.compile(r'(?=</?(br|p))(<(/?br|p)[^>]*>)\s*(<(i|b|u)>){0,2}\s*(<span[^>]*>)?\s*(?P<chap>(<(i|b|u)>){0,2}\s*(<span[^>]*>)?\s*.?(\d+\.?|Introduction|Acknowledgements|Chapter|Epilogue|Volume|Prologue|Book\s|Part\s|Dedication)\s*([\d\w-]+\:?\s*){0,8}\s*(</(i|b|u)>){0,2})\s*(</span>)?\s*(</span>)?\s*(</(i|b|u)>){0,2}\s*(</(p|/?br)>)\s*(<(/?br|p)[^>]*>\s*(<(i|b|u)>){0,2}\s*(<span[^>]*>)?\s*(?P<title>(<(i|b|u)>){0,2}(\s*[\w\'\"-]+){1,5}\s*(</(i|b|u)>){0,2})\s*(</span>)?\s*(</(i|b|u)>){0,2}\s*(</(br|p)>))?', re.IGNORECASE)
             html = chapdetect.sub(chapter_head, html)
 		if self.html_preprocess_sections < 10:
 		    self.log("not enough chapters, only " + str(self.html_preprocess_sections) + ", trying a more aggressive pattern")
             chapdetect2 = re.compile(r'(?=</?(br|p))(<(/?br|p)[^>]*>)\s*(<(i|b|u)>){0,2}\s*(<span[^>]*>)?\s*(?P<chap>(<(i|b|u)>){0,2}\s*.?(([A-Z#]+\s*){1,9}|(CHAPTER\s*([\dA-Z\-\'\"\?\.!#,]+\s*){1,10}))\s*(</(i|b|u)>){0,2})\s*(</span>)?\s*(</(i|b|u)>){0,2}\s*(</(p|/?br)>)\s*(<(/?br|p)[^>]*>\s*(<(i|b|u)>){0,2}\s*(<span[^>]*>)?\s*(?P<title>(<(i|b|u)>){0,2}(\s*[\w\'\"-]+){1,5}\s*(</(i|b|u)>){0,2})\s*(</span>)?\s*(</(i|b|u)>){0,2}\s*(</(br|p)>))?', re.UNICODE)
-		    html = chapdetect2.sub(chapter_head, html)
-		    
+		    html = chapdetect2.sub(chapter_head, html)    
+        #    
+		# Unwrap lines using punctuation if the median length of all lines is less than 150		
+		length = line_length('html', html, 0.4)
+		self.log("*** Median line length is " + str(length) + " ***")
+		unwrap = re.compile(r"(?<=.{%i}[a-z,;:\IA])\s*</(span|p|div)>\s*(</(p|span|div)>)?\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)</(span|p|div)>\s*){0,3}\s*<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*" % length, re.UNICODE)
+		if length < 150:
+		    self.log("Unwrapping Lines")
+			html = unwrap.sub(' ', html)		
+		# If still no sections after unwrapping lines break on lines with no punctuation
+		if self.html_preprocess_sections < 10:
+		    self.log("not enough chapters, only " + str(self.html_preprocess_sections) + ", splitting based on punctuation")
+		    #self.log(html)
+            chapdetect3 = re.compile(r'(<p[^>]*>)\s*(<(i|b|u)>){0,2}\s*(<span[^>]*>)?\s*(?P<chap>(<(i|b|u)>){0,2}\s*.?([a-z]+\s*){1,5}\s*(</(i|b|u)>){0,2})\s*(</span>)?\s*(</(i|b|u)>){0,2}\s*(</p>)(?P<title>)?', re.IGNORECASE)
+            html = chapdetect3.sub(chapter_head, html)    	
         # search for places where a first or second level heading is immediately followed by another
         # top level heading.  demote the second heading to h3 to prevent splitting between chapter
         # headings and titles, images, etc
         doubleheading = re.compile(r'(?P<firsthead><h(1|2)[^>]*>.+?</h(1|2)>\s*(<(?!h\d)[^>]*>\s*)*)<h(1|2)(?P<secondhead>[^>]*>.+?)</h(1|2)>', re.IGNORECASE)
         html = doubleheading.sub('\g<firsthead>'+'<h3'+'\g<secondhead>'+'</h3>', html)
-		#    
-		# Unwrap lines using punctation if the median length of all lines is less than 150		
-		length = line_length('html', html, 0.4)
-		self.log("*** Median length is " + str(length) + " ***")
-		unwrap = re.compile(r"(?<=.{%i}[a-z,;:\IA])\s*</(span|p|div)>\s*(</(p|span|div)>)?\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)</(span|p|div)>\s*){0,3}\s*<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*" % length, re.UNICODE)
-		if length < 150:
-			html = unwrap.sub(' ', html)
+
         return html
 
diff --git a/src/calibre/ebooks/pdf/input.py b/src/calibre/ebooks/pdf/input.py
index 64a089281e..113c3d99d8 100644
--- a/src/calibre/ebooks/pdf/input.py
+++ b/src/calibre/ebooks/pdf/input.py
@@ -22,10 +22,10 @@ class PDFInput(InputFormatPlugin):
     options = set([
         OptionRecommendation(name='no_images', recommended_value=False,
             help=_('Do not extract images from the document')),
-        OptionRecommendation(name='unwrap_factor', recommended_value=0.5,
+        OptionRecommendation(name='unwrap_factor', recommended_value=0.45,
             help=_('Scale used to determine the length at which a line should '
             'be unwrapped. Valid values are a decimal between 0 and 1. The '
-            'default is 0.5, this is the median line length.')),
+            'default is 0.45, just below the median line length.')),
         OptionRecommendation(name='new_pdf_engine', recommended_value=False,
             help=_('Use the new PDF conversion engine.'))
     ])

From faf15b2f3d611594352721d4d06407025fea1320 Mon Sep 17 00:00:00 2001
From: ldolse <ldolse@yahoo.com>
Date: Sat, 11 Sep 2010 13:09:23 +1000
Subject: [PATCH 04/12] preprocess merge gone wrong, fixing

---
 src/calibre/ebooks/conversion/preprocess.py | 25 ++++++---------------
 1 file changed, 7 insertions(+), 18 deletions(-)

diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py
index e2364d961f..24a389e65c 100644
--- a/src/calibre/ebooks/conversion/preprocess.py
+++ b/src/calibre/ebooks/conversion/preprocess.py
@@ -168,7 +168,6 @@ class HTMLPreProcessor(object):
                   (re.compile(u'`\s*(<br.*?>)*\s*O', re.UNICODE), lambda match: u'Ò'),
                   (re.compile(u'`\s*(<br.*?>)*\s*u', re.UNICODE), lambda match: u'ù'),
                   (re.compile(u'`\s*(<br.*?>)*\s*U', re.UNICODE), lambda match: u'Ù'),
-<<<<<<< TREE
                   (re.compile(u'`\s*(<br.*?>)*\s*e', re.UNICODE), lambda match: u'è'),
                   (re.compile(u'`\s*(<br.*?>)*\s*E', re.UNICODE), lambda match: u'È'),
                   (re.compile(u'`\s*(<br.*?>)*\s*i', re.UNICODE), lambda match: u'ì'),
@@ -176,13 +175,6 @@ class HTMLPreProcessor(object):
                   (re.compile(u'`\s*(<br.*?>)*\s*a', re.UNICODE), lambda match: u'à'),
                   (re.compile(u'`\s*(<br.*?>)*\s*A', re.UNICODE), lambda match: u'À'),
                   
-                  #(re.compile(u'a\s*(<br.*?>)*\s*`', re.UNICODE), lambda match: u'à'),
-                  #(re.compile(u'A\s*(<br.*?>)*\s*`', re.UNICODE), lambda match: u'À'),
-                  #(re.compile(u'o\s*(<br.*?>)*\s*`', re.UNICODE), lambda match: u'ò'),
-                  #(re.compile(u'O\s*(<br.*?>)*\s*`', re.UNICODE), lambda match: u'Ò'),
-=======
->>>>>>> MERGE-SOURCE
-
                   # ´
                   (re.compile(u'´\s*(<br.*?>)*\s*a', re.UNICODE), lambda match: u'á'),
                   (re.compile(u'´\s*(<br.*?>)*\s*A', re.UNICODE), lambda match: u'Á'),
@@ -218,14 +210,7 @@ class HTMLPreProcessor(object):
                   # ¸
                   (re.compile(u'¸\s*(<br.*?>)*\s*c', re.UNICODE), lambda match: u'ç'),
                   (re.compile(u'¸\s*(<br.*?>)*\s*C', re.UNICODE), lambda match: u'Ç'),
-
-<<<<<<< TREE
-                  # If pdf printed from a browser then the header/footer has a reliable pattern
-                  (re.compile(r'((?<=</a>)\s*file:////?[A-Z].*<br>|file:////?[A-Z].*<br>(?=\s*<hr>))', re.IGNORECASE), lambda match: ''),
-
-                  # Center separator lines
-                  (re.compile(u'<br>\s*(?P<break>([*#•]+\s*)+)\s*<br>'), lambda match: '<p>\n<p style="text-align:center">' + match.group(1) + '</p>'),
-=======
+                  
                   # ˛
                   (re.compile(u'˛\s*(<br.*?>)*\s*a', re.UNICODE), lambda match: u'ą'),
                   (re.compile(u'˛\s*(<br.*?>)*\s*A', re.UNICODE), lambda match: u'Ą'),
@@ -235,8 +220,12 @@ class HTMLPreProcessor(object):
                   # ˙
                   (re.compile(u'˙\s*(<br.*?>)*\s*z', re.UNICODE), lambda match: u'ż'),
                   (re.compile(u'˙\s*(<br.*?>)*\s*Z', re.UNICODE), lambda match: u'Ż'),
-                  
->>>>>>> MERGE-SOURCE
+
+                  # If pdf printed from a browser then the header/footer has a reliable pattern
+                  (re.compile(r'((?<=</a>)\s*file:////?[A-Z].*<br>|file:////?[A-Z].*<br>(?=\s*<hr>))', re.IGNORECASE), lambda match: ''),
+
+                  # Center separator lines
+                  (re.compile(u'<br>\s*(?P<break>([*#•]+\s*)+)\s*<br>'), lambda match: '<p>\n<p style="text-align:center">' + match.group(1) + '</p>'),
 
                   # Remove page links
                   (re.compile(r'<a name=\d+></a>', re.IGNORECASE), lambda match: ''),

From 2a906184ad4c56d3018806c03bf2647bd8ecc242 Mon Sep 17 00:00:00 2001
From: ldolse <ldolse@yahoo.com>
Date: Sat, 11 Sep 2010 13:17:21 +1000
Subject: [PATCH 05/12] preprocess merge gone wrong, merged original accent
 code back

---
 src/calibre/ebooks/conversion/preprocess.py | 11 +++--------
 1 file changed, 3 insertions(+), 8 deletions(-)

diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py
index 24a389e65c..f2b19efa9b 100644
--- a/src/calibre/ebooks/conversion/preprocess.py
+++ b/src/calibre/ebooks/conversion/preprocess.py
@@ -168,13 +168,7 @@ class HTMLPreProcessor(object):
                   (re.compile(u'`\s*(<br.*?>)*\s*O', re.UNICODE), lambda match: u'Ò'),
                   (re.compile(u'`\s*(<br.*?>)*\s*u', re.UNICODE), lambda match: u'ù'),
                   (re.compile(u'`\s*(<br.*?>)*\s*U', re.UNICODE), lambda match: u'Ù'),
-                  (re.compile(u'`\s*(<br.*?>)*\s*e', re.UNICODE), lambda match: u'è'),
-                  (re.compile(u'`\s*(<br.*?>)*\s*E', re.UNICODE), lambda match: u'È'),
-                  (re.compile(u'`\s*(<br.*?>)*\s*i', re.UNICODE), lambda match: u'ì'),
-                  (re.compile(u'`\s*(<br.*?>)*\s*I', re.UNICODE), lambda match: u'Ì'),
-                  (re.compile(u'`\s*(<br.*?>)*\s*a', re.UNICODE), lambda match: u'à'),
-                  (re.compile(u'`\s*(<br.*?>)*\s*A', re.UNICODE), lambda match: u'À'),
-                  
+
                   # ´
                   (re.compile(u'´\s*(<br.*?>)*\s*a', re.UNICODE), lambda match: u'á'),
                   (re.compile(u'´\s*(<br.*?>)*\s*A', re.UNICODE), lambda match: u'Á'),
@@ -210,7 +204,7 @@ class HTMLPreProcessor(object):
                   # ¸
                   (re.compile(u'¸\s*(<br.*?>)*\s*c', re.UNICODE), lambda match: u'ç'),
                   (re.compile(u'¸\s*(<br.*?>)*\s*C', re.UNICODE), lambda match: u'Ç'),
-                  
+
                   # ˛
                   (re.compile(u'˛\s*(<br.*?>)*\s*a', re.UNICODE), lambda match: u'ą'),
                   (re.compile(u'˛\s*(<br.*?>)*\s*A', re.UNICODE), lambda match: u'Ą'),
@@ -221,6 +215,7 @@ class HTMLPreProcessor(object):
                   (re.compile(u'˙\s*(<br.*?>)*\s*z', re.UNICODE), lambda match: u'ż'),
                   (re.compile(u'˙\s*(<br.*?>)*\s*Z', re.UNICODE), lambda match: u'Ż'),
 
+
                   # If pdf printed from a browser then the header/footer has a reliable pattern
                   (re.compile(r'((?<=</a>)\s*file:////?[A-Z].*<br>|file:////?[A-Z].*<br>(?=\s*<hr>))', re.IGNORECASE), lambda match: ''),
 

From 480eccb0b0c3921fd356d329e6d601b9207c2d26 Mon Sep 17 00:00:00 2001
From: ldolse <ldolse@yahoo.com>
Date: Sat, 11 Sep 2010 15:33:10 +1000
Subject: [PATCH 06/12] Fixed unwrapping for various hyphen and dash types,
 other minor tweaks to pdf

---
 src/calibre/ebooks/conversion/preprocess.py | 28 ++++++++++++++-------
 1 file changed, 19 insertions(+), 9 deletions(-)

diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py
index f2b19efa9b..c120f0a560 100644
--- a/src/calibre/ebooks/conversion/preprocess.py
+++ b/src/calibre/ebooks/conversion/preprocess.py
@@ -77,7 +77,6 @@ def line_length(format, raw, percent):
     elif format == 'pdf':
         linere = re.compile('(?<=<br>).*?(?=<br>)', re.DOTALL)
     lines = linere.findall(raw)
-    print "percent is " + str(percent)
 
     lengths = []
     for line in lines:
@@ -230,14 +229,17 @@ class HTMLPreProcessor(object):
                   # (re.compile(r'<br>\s*<br>', re.IGNORECASE), lambda match: '\n<p>'),
 
                   # unwrap hyphenation - don't delete the hyphen (often doesn't split words)
-                  (re.compile(u'(?<=[-–—])\s*<br>\s*(?=[[a-z\d])'), lambda match: ''),
+                  #(re.compile(u'(?<=[-–—])\s*<br>\s*(?=[[a-z\d])'), lambda match: ''),
+                  # unwrap/delete soft hyphens
+                  #(re.compile(u'[­]\s*<br>\s*(?=[[a-z\d])'), lambda match: ''),
 
                   # Remove gray background
                   (re.compile(r'<BODY[^<>]+>'), lambda match : '<BODY>'),
 
                   # Detect Chapters to match default XPATH in GUI
                   (re.compile(r'(?=<(/?br|p))(<(/?br|p)[^>]*)?>\s*(?P<chap>(<(i|b)>(<(i|b)>)?)?.?(Introduction|Chapter|Epilogue|Prologue|Book|Part|Dedication|Volume|Preface|Acknowledgments)\s*([\d\w-]+\s*){0,3}\s*(</(i|b)>(</(i|b)>)?)?)\s*(</?(br|p)[^>]*>\s*){1,3}\s*(?P<title>(<(i|b)>)?(\s*\w+){1,4}\s*(</(i|b)>)?\s*(</?(br|p)[^>]*>))?', re.IGNORECASE), chap_head),
-
+                  (re.compile(r'<br\s*/?>\s*(?P<chap>([A-Z]\s+){4,}\s*([\d\w-]+\s*){0,3}\s*)\s*(<br>\s*){1,3}\s*(?P<title>(<(i|b)>)?(\s*\w+){1,4}\s*(</(i|b)>)?\s*(</?(br|p)[^>]*>))?'), chap_head),
+                  
                   # Have paragraphs show better
                   (re.compile(r'<br.*?>'), lambda match : '<p>'),
                   # Clean up spaces
@@ -322,21 +324,29 @@ class HTMLPreProcessor(object):
                 import traceback
                 print 'Failed to parse remove_footer regexp'
                 traceback.print_exc()
+      
+        # unwrap hyphenation - moved here so it's executed after header/footer removal
+        if is_pdftohtml:
+            # unwrap visible dashes and hyphens - don't delete as 50% or more of the time these
+            # hyphens are for compound words, formatting, etc
+            end_rules.append((re.compile(u'(?<=[-–—])\s*<p>\s*(?=[[a-z\d])'), lambda match: ''))
+            # unwrap/delete soft hyphens
+            end_rules.append((re.compile(u'[­](\s*<p>)+\s*(?=[[a-z\d])'), lambda match: ''))
+            # unwrap/delete soft hyphens with formatting
+            end_rules.append((re.compile(u'[­]\s*(</(i|u|b)>)+(\s*<p>)+\s*(<(i|u|b)>)+\s*(?=[[a-z\d])'), lambda match: ''))
         
         # Make the more aggressive chapter marking regex optional with the preprocess option to reduce false positives
         if getattr(self.extra_opts, 'preprocess_html', None):
             if is_pdftohtml:
-                end_rules.append(
-                    (re.compile(r'(?=<(/?br|p|hr))(<(/?br|p|hr)[^>]*)?>\s*(<(i|b)>(<(i|b)>)?)?\s*(?P<chap>([A-Z-\'"!]{3,})\s*(\d+|[A-Z]+(\s*[A-Z]+)?)?|\d+\.?\s*([\d\w-]+\s*){0,4}\s*)\s*(</(i|b)>(</(i|b)>)?)?\s*(</?p[^>]*>|<br[^>]*>)\n?((?=(<i>)?\s*\w+(\s+\w+)?(</i>)?(<br[^>]*>|</?p[^>]*>))((?P<title>.*)(<br[^>]*>|</?p[^>]*>)))?'), chap_head),
-                )
-
+                end_rules.append((re.compile(r'(?=<(/?br|p|hr))(<(/?br|p|hr)[^>]*)?>\s*(<(i|b)>(<(i|b)>)?)?\s*(?P<chap>([A-Z-\'"!]{3,})\s*(\d+|[A-Z]+(\s*[A-Z]+)?)?|\d+\.?\s*([\d\w-]+\s*){0,4}\s*)\s*(</(i|b)>(</(i|b)>)?)?\s*(</?p[^>]*>|<br[^>]*>)\n?((?=(<i>)?\s*\w+(\s+\w+)?(</i>)?(<br[^>]*>|</?p[^>]*>))((?P<title>.*)(<br[^>]*>|</?p[^>]*>)))?'), chap_head))
+                
         if getattr(self.extra_opts, 'unwrap_factor', 0.0) > 0.01:
             length = line_length('pdf', html, getattr(self.extra_opts, 'unwrap_factor'))
             if length:
-                print "The pdf line length returned is " + str(length)
+                # print "The pdf line length returned is " + str(length)
                 end_rules.append(
                     # Un wrap using punctuation
-                    (re.compile(r'(?<=.{%i}[a-z,;:)\-IA])\s*(?P<ital></(i|b|u)>)?\s*(<p.*?>)\s*(?=(<(i|b|u)>)?\s*[\w\d(])' % length, re.UNICODE), wrap_lines),
+                    (re.compile(r'(?<=.{%i}[a-z,;:)\-IA])\s*(?P<ital></(i|b|u)>)?\s*(<p.*?>\s*)+\s*(?=(<(i|b|u)>)?\s*[\w\d(])' % length, re.UNICODE), wrap_lines),
                 )
 
         for rule in self.PREPROCESS + start_rules:

From cf7cc4de4d9b9fa5e4b22c5ce2cb63c099165589 Mon Sep 17 00:00:00 2001
From: ldolse <ldolse@yahoo.com>
Date: Sat, 11 Sep 2010 21:02:44 +1000
Subject: [PATCH 07/12] preprocess updates for lit, html, and pdf

---
 src/calibre/ebooks/conversion/preprocess.py |   8 --
 src/calibre/ebooks/conversion/utils.py      | 122 +++++++++++++++++++-
 src/calibre/ebooks/html/input.py            |  20 +---
 src/calibre/ebooks/lit/input.py             | 117 +------------------
 src/calibre/ebooks/pdb/pdf/reader.py        |   2 +-
 src/calibre/ebooks/pdf/input.py             |   2 +-
 6 files changed, 129 insertions(+), 142 deletions(-)

diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py
index c120f0a560..6123577191 100644
--- a/src/calibre/ebooks/conversion/preprocess.py
+++ b/src/calibre/ebooks/conversion/preprocess.py
@@ -214,7 +214,6 @@ class HTMLPreProcessor(object):
                   (re.compile(u'˙\s*(<br.*?>)*\s*z', re.UNICODE), lambda match: u'ż'),
                   (re.compile(u'˙\s*(<br.*?>)*\s*Z', re.UNICODE), lambda match: u'Ż'),
 
-
                   # If pdf printed from a browser then the header/footer has a reliable pattern
                   (re.compile(r'((?<=</a>)\s*file:////?[A-Z].*<br>|file:////?[A-Z].*<br>(?=\s*<hr>))', re.IGNORECASE), lambda match: ''),
 
@@ -225,13 +224,6 @@ class HTMLPreProcessor(object):
                   (re.compile(r'<a name=\d+></a>', re.IGNORECASE), lambda match: ''),
                   # Remove <hr> tags
                   (re.compile(r'<hr.*?>', re.IGNORECASE), lambda match: '<br />'),
-                  # Replace <br><br> with <p>
-                  # (re.compile(r'<br>\s*<br>', re.IGNORECASE), lambda match: '\n<p>'),
-
-                  # unwrap hyphenation - don't delete the hyphen (often doesn't split words)
-                  #(re.compile(u'(?<=[-–—])\s*<br>\s*(?=[[a-z\d])'), lambda match: ''),
-                  # unwrap/delete soft hyphens
-                  #(re.compile(u'[­]\s*<br>\s*(?=[[a-z\d])'), lambda match: ''),
 
                   # Remove gray background
                   (re.compile(r'<BODY[^<>]+>'), lambda match : '<BODY>'),
diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py
index 52be473372..68cebb3a11 100644
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@@ -3,4 +3,124 @@
 
 __license__   = 'GPL v3'
 __copyright__ = '2010, Kovid Goyal <kovid@kovidgoyal.net>'
-__docformat__ = 'restructuredtext en'
\ No newline at end of file
+__docformat__ = 'restructuredtext en'
+
+import re
+from calibre.ebooks.conversion.preprocess import line_length
+from calibre.utils.logging import default_log
+from lxml import etree
+
+class PreProcessor(object):
+    html_preprocess_sections = 0
+
+    def __init__(self, args):
+        self.args = args
+        self.log = default_log
+   
+    def chapter_head(self, match):
+        chap = match.group('chap')
+        title = match.group('title')
+        if not title:
+                   self.html_preprocess_sections = self.html_preprocess_sections + 1
+                   self.log("marked " + str(self.html_preprocess_sections) + " chapters. - " + str(chap))
+                   return '<h2>'+chap+'</h2>\n'
+        else:
+                   self.html_preprocess_sections = self.html_preprocess_sections + 1
+                   self.log("marked " + str(self.html_preprocess_sections) + " chapters & titles. - " + str(chap) + ", " + str(title))
+                   return '<h2>'+chap+'</h2>\n<h3>'+title+'</h3>\n'
+
+    def chapter_link(self, match):
+        chap = match.group('sectionlink')
+        if not chap:
+                   self.html_preprocess_sections = self.html_preprocess_sections + 1
+                   self.log("marked " + str(self.html_preprocess_sections) + " section markers based on links")
+                   return '<br style="page-break-before:always">'
+        else:
+                   self.html_preprocess_sections = self.html_preprocess_sections + 1
+                   self.log("marked " + str(self.html_preprocess_sections) + " section markers based on links. - " + str(chap))
+                   return '<br clear="all" style="page-break-before:always">\n<h2>'+chap+'</h2>'
+
+    def no_markup(self, raw, percent):
+        '''
+        Detects total marked up line endings in the file. raw is the text to 
+        inspect.  Percent is the minimum percent of line endings which should 
+        be marked up to return true.
+        '''
+        htm_end_ere = re.compile('</p>', re.DOTALL)
+        line_end_ere = re.compile('(\n|\r|\r\n)', re.DOTALL)
+        htm_end = htm_end_ere.findall(raw)
+        line_end = line_end_ere.findall(raw)
+        tot_htm_ends = len(htm_end)
+        tot_ln_fds = len(line_end)
+        self.log("*** There are " + str(tot_ln_fds) + " total Line feeds, and " + str(tot_htm_ends) + " marked endings***")
+
+        if percent > 1:
+            percent = 1
+        if percent < 0:
+            percent = 0    
+    
+        min_lns = tot_ln_fds * percent
+        self.log("There must be fewer than " + str(min_lns) + " unmarked lines to return true")
+        if min_lns > tot_htm_ends:
+            return True
+            
+    def __call__(self, html):
+        self.log("*********  Preprocessing HTML  *********")
+        # remove non-breaking spaces
+        html = re.sub(ur'\u00a0', ' ', html)
+        # Get rid of empty <o:p> tags to simplify other processing
+        html = re.sub(ur'\s*<o:p>\s*</o:p>', ' ', html)
+        # Get rid of empty span tags
+        html = re.sub(r"\s*<span[^>]*>\s*</span>", " ", html)
+        
+        # If more than 40% of the lines are empty paragraphs then delete them to clean up spacing
+        linereg = re.compile('(?<=<p).*?(?=</p>)', re.IGNORECASE)
+        blankreg = re.compile(r'\s*<p[^>]*>\s*(<(b|i|u)>)?\s*(</(b|i|u)>)?\s*</p>', re.IGNORECASE)
+        blanklines = blankreg.findall(html)
+        lines = linereg.findall(html)
+        if len(lines) > 1:
+            self.log("There are " + str(len(blanklines)) + " blank lines. " + str(float(len(blanklines)) / float(len(lines))) + " percent blank")
+            if float(len(blanklines)) / float(len(lines)) > 0.40:
+                self.log("deleting blank lines")
+                html = blankreg.sub('', html)
+        # Arrange line feeds and </p> tags so the line_length and no_markup functions work correctly
+        html = re.sub(r"\s*</p>", "</p>\n", html)
+        html = re.sub(r"\s*<p>\s*", "\n<p>", html)
+        
+        # some lit files don't have any <p> tags or equivalent, check and 
+        # mark up line endings if required before proceeding
+        if self.no_markup(html, 0.1):
+             self.log("not enough paragraph markers, adding now")
+             add_markup = re.compile('(?<!>)(\n)')
+             html = add_markup.sub('</p>\n<p>', html)
+        
+        # detect chapters/sections to match xpath or splitting logic
+        # 
+        # Start with most typical chapter headings       
+        chapdetect = re.compile(r'(?=</?(br|p))(<(/?br|p)[^>]*>)\s*(<(i|b|u)>){0,2}\s*(<span[^>]*>)?\s*(?P<chap>(<(i|b|u)>){0,2}s*(<span[^>]*>)?\s*.?(Introduction|Acknowledgements|Chapter|Epilogue|Volume|Prologue|Book\s|Part\s|Dedication)\s*([\d\w-]+\:?\s*){0,8}\s*(</(i|b|u)>){0,2})\s*(</span>)?s*(</span>)?\s*(</(i|b|u)>){0,2}\s*(</(p|/?br)>)\s*(<(/?br|p)[^>]*>\s*(<(i|b|u)>){0,2}\s*(<span[^>]*>)?\s*(?P<title>(<(i|b|u)>){0,2}(\s*[\w\'\"-]+){1,5}\s*(</(i|b|u)>){0,2})\s*(</span>)?\s*(</(i|b|u)>){0,2}\s*(</(br|p)>))?', re.IGNORECASE)
+        html = chapdetect.sub(self.chapter_head, html)
+        if self.html_preprocess_sections < 10:
+            self.log("not enough chapters, only " + str(self.html_preprocess_sections) + ", trying a more aggressive pattern")
+            chapdetect2 = re.compile(r'(?=</?(br|p))(<(/?br|p)[^>]*>)\s*(<(i|b|u)>){0,2}\s*(<span[^>]*>)?\s*(?P<chap>(<(i|b|u)>){0,2}\s*.?(([A-Z#-]+\s*){1,9}|\d+\.?|(CHAPTER\s*([\dA-Z\-\'\"\?\.!#,]+\s*){1,10}))\s*(</(i|b|u)>){0,2})\s*(</span>)?\s*(</(i|b|u)>){0,2}\s*(</(p|/?br)>)\s*(<(/?br|p)[^>]*>\s*(<(i|b|u)>){0,2}\s*(<span[^>]*>)?\s*(?P<title>(<(i|b|u)>){0,2}(\s*[\w\'\"-]+){1,5}\s*(</(i|b|u)>){0,2})\s*(</span>)?\s*(</(i|b|u)>){0,2}\s*(</(br|p)>))?', re.UNICODE)
+            html = chapdetect2.sub(self.chapter_head, html)    
+        #    
+        # Unwrap lines using punctation if the median length of all lines is less than 200        
+        length = line_length('html', html, 0.4)
+        self.log("*** Median line length is " + str(length) + " ***")
+        unwrap = re.compile(r"(?<=.{%i}[a-z,;:\IA])\s*</(span|p|div)>\s*(</(p|span|div)>)?\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)</(span|p|div)>\s*){0,3}\s*<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*" % length, re.UNICODE)
+        if length < 200:
+            self.log("Unwrapping Lines")
+            html = unwrap.sub(' ', html)        
+        # If still no sections after unwrapping lines break on lines with no punctuation
+        if self.html_preprocess_sections < 10:
+            self.log("not enough chapters, only " + str(self.html_preprocess_sections) + ", splitting based on punctuation")
+            #self.log(html)
+            chapdetect3 = re.compile(r'(<p[^>]*>)\s*(<(i|b|u)>){0,2}\s*(<span[^>]*>)?\s*(?P<chap>(<(i|b|u)>){0,2}\s*.?([a-z]+\s*){1,5}\s*(</(i|b|u)>){0,2})\s*(</span>)?\s*(</(i|b|u)>){0,2}\s*(</p>)(?P<title>)?', re.IGNORECASE)
+            html = chapdetect3.sub(self.chapter_head, html)        
+        # search for places where a first or second level heading is immediately followed by another
+        # top level heading.  demote the second heading to h3 to prevent splitting between chapter
+        # headings and titles, images, etc
+        doubleheading = re.compile(r'(?P<firsthead><h(1|2)[^>]*>.+?</h(1|2)>\s*(<(?!h\d)[^>]*>\s*)*)<h(1|2)(?P<secondhead>[^>]*>.+?)</h(1|2)>', re.IGNORECASE)
+        html = doubleheading.sub('\g<firsthead>'+'<h3'+'\g<secondhead>'+'</h3>', html)
+        
+        return html
\ No newline at end of file
diff --git a/src/calibre/ebooks/html/input.py b/src/calibre/ebooks/html/input.py
index 35a8a1a9bc..e83216ae1f 100644
--- a/src/calibre/ebooks/html/input.py
+++ b/src/calibre/ebooks/html/input.py
@@ -24,7 +24,7 @@ from calibre.constants import islinux, isfreebsd, iswindows
 from calibre import unicode_path
 from calibre.utils.localization import get_lang
 from calibre.utils.filenames import ascii_filename
-from calibre.ebooks.conversion.preprocess import line_length
+from calibre.ebooks.conversion.utils import PreProcessor
 
 class Link(object):
     '''
@@ -491,20 +491,6 @@ class HTMLInput(InputFormatPlugin):
         return (None, raw)
 
 	def preprocess_html(self, html):
-        if not hasattr(self, 'log'):
-            from calibre.utils.logging import default_log
-            self.log = default_log
-		self.log("*********  Preprocessing HTML - HTML Input plugin *********")
-		# Detect Chapters to match the xpath in the GUI
-		chapdetect = re.compile(r'(?=</?(br|p|span))(</?(br|p|span)[^>]*>)?\s*(?P<chap>(<(i|b)><(i|b)>|<(i|b)>)?(.?Chapter|Epilogue|Prologue|Book|Part|Dedication)\s*([\d\w-]+(\s\w+)?)?(</(i|b)></(i|b)>|</(i|b)>)?)(</?(p|br|span)[^>]*>)', re.IGNORECASE)
-		html = chapdetect.sub('<h2>'+'\g<chap>'+'</h2>\n', html)
-		# Unwrap lines using punctation if the median length of all lines is less than 150
-		#
-		# Insert extra line feeds so the line length regex functions properly
-		html = re.sub(r"</p>", "</p>\n", html)
-		length = line_length('html', html, 0.4)
-		self.log.debug("*** Median length is " + str(length) + " ***")
-		unwrap = re.compile(r"(?<=.{%i}[a-z,;:\IA])\s*</(span|p|div)>\s*(</(p|span|div)>)?\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)</(span|p|div)>\s*){0,3}\s*<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*" % length, re.UNICODE)
-		if length < 150:
-			html = unwrap.sub(' ', html)
+        preprocessor = PreProcessor(html)
+        html = preprocessor(html)
         return html
diff --git a/src/calibre/ebooks/lit/input.py b/src/calibre/ebooks/lit/input.py
index 35dad501be..58e7bc84bf 100644
--- a/src/calibre/ebooks/lit/input.py
+++ b/src/calibre/ebooks/lit/input.py
@@ -6,10 +6,8 @@ __license__   = 'GPL v3'
 __copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
 __docformat__ = 'restructuredtext en'
 
-import re
-
 from calibre.customize.conversion import InputFormatPlugin
-from calibre.ebooks.conversion.preprocess import line_length
+from calibre.ebooks.conversion.utils import PreProcessor
 
 
 class LITInput(InputFormatPlugin):
@@ -18,7 +16,6 @@ class LITInput(InputFormatPlugin):
     author      = 'Marshall T. Vandegrift'
     description = 'Convert LIT files to HTML'
     file_types  = set(['lit'])
-    html_preprocess_sections = 0
 
     def convert(self, stream, options, file_ext, log,
                 accelerators):
@@ -57,115 +54,7 @@ class LITInput(InputFormatPlugin):
 
 
 	def preprocess_html(self, html):
-
-        def chapter_head(match):
-            chap = match.group('chap')
-            title = match.group('title')
-            if not title:
-                       self.html_preprocess_sections = self.html_preprocess_sections + 1
-                       self.log("marked " + str(self.html_preprocess_sections) + " chapters. - " + str(chap))
-                       return '<h2>'+chap+'</h2>\n'
-            else:
-                       self.html_preprocess_sections = self.html_preprocess_sections + 1
-                       self.log("marked " + str(self.html_preprocess_sections) + " chapters & titles. - " + str(chap) + ", " + str(title))
-                       return '<h2>'+chap+'</h2>\n<h3>'+title+'</h3>\n'
-
-        def chapter_link(match):
-            chap = match.group('sectionlink')
-            if not chap:
-                       self.html_preprocess_sections = self.html_preprocess_sections + 1
-                       self.log("marked " + str(self.html_preprocess_sections) + " section markers based on links")
-                       return '<br style="page-break-before:always">'
-            else:
-                       self.html_preprocess_sections = self.html_preprocess_sections + 1
-                       self.log("marked " + str(self.html_preprocess_sections) + " section markers based on links. - " + str(chap))
-                       return '<br clear="all" style="page-break-before:always">\n<h2>'+chap+'</h2>'
-
-
-        def no_markup(raw, percent):
-            '''
-            Detects total marked up line endings in the file. raw is the text to 
-            inspect.  Percent is the minimum percent of line endings which should 
-            be marked up to return true.
-            '''
-            htm_end_ere = re.compile('</p>', re.DOTALL)
-            line_end_ere = re.compile('(\n|\r|\r\n)', re.DOTALL)
-            htm_end = htm_end_ere.findall(raw)
-            line_end = line_end_ere.findall(raw)
-            tot_htm_ends = len(htm_end)
-            tot_ln_fds = len(line_end)
-            self.log("*** There are " + str(tot_ln_fds) + " total Line feeds, and " + str(tot_htm_ends) + " marked endings***")
-    
-            if percent > 1:
-                percent = 1
-            if percent < 0:
-                percent = 0    
-    
-            min_lns = tot_ln_fds * percent
-            self.log("There must be more than " + str(min_lns) + " unmarked lines to return true")
-            if min_lns > tot_htm_ends:
-                return True
-                
-		self.log("*********  Preprocessing HTML  *********")
-		# remove non-breaking spaces
-		html = re.sub(ur'\u00a0', ' ', html)
-		# Get rid of empty <o:p> tags to simplify other processing
-		html = re.sub(ur'\s*<o:p>\s*</o:p>', ' ', html)
-		# Get rid of empty span tags
-        html = re.sub(r"\s*<span[^>]*>\s*</span>", " ", html)
-        
-        # If more than 40% of the lines are empty paragraphs then delete them to clean up spacing
-		linereg = re.compile('(?<=<p).*?(?=</p>)', re.IGNORECASE)
-        blankreg = re.compile(r'\s*<p[^>]*>\s*(<(b|i|u)>)?\s*(</(b|i|u)>)?\s*</p>', re.IGNORECASE)
-        blanklines = blankreg.findall(html)
-        lines = linereg.findall(html)
-        if len(lines) > 1:
-            self.log("There are " + str(len(blanklines)) + " blank lines. " + str(float(len(blanklines)) / float(len(lines))) + " percent blank")
-            if float(len(blanklines)) / float(len(lines)) > 0.40:
-                self.log("deleting blank lines")
-                html = blankreg.sub('', html)
-		# Arrange line feeds and </p> tags so the line_length and no_markup functions work correctly
-		html = re.sub(r"\s*</p>", "</p>\n", html)
-		
-		# some lit files don't have any <p> tags or equivalent, check and 
-		# mark up line endings if required before proceeding
-		if no_markup(html, 0.1):
-		     self.log("not enough paragraph markers, adding now")
-             add_markup = re.compile('(?<!>)(\n)')
-             html = add_markup.sub('</p>\n<p>', html)
-        
-		# detect chapters/sections to match xpath or splitting logic
-		#
-		# Mark split points based on embedded links
-		chaplink = re.compile(r'<a\sname[^>]*>\s*(<(i|b|u)>){0,2}\s*(<span[^>]*>)?\s*(?P<sectionlink>[^\s<]+(\s*[^\s<]+){0,4})?\s*(</span>)?\s*(</(i|b|u)>){0,2}\s*</a>', re.IGNORECASE)
-        html = chaplink.sub(chapter_link, html)
-        # Continue with alternate patterns, start with most typical chapter headings
-		if self.html_preprocess_sections < 10:        
-            chapdetect = re.compile(r'(?=</?(br|p))(<(/?br|p)[^>]*>)\s*(<(i|b|u)>){0,2}\s*(<span[^>]*>)?\s*(?P<chap>(<(i|b|u)>){0,2}s*(<span[^>]*>)?\s*.?(\d+\.?|Introduction|Acknowledgements|Chapter|Epilogue|Volume|Prologue|Book\s|Part\s|Dedication)\s*([\d\w-]+\:?\s*){0,8}\s*(</(i|b|u)>){0,2})\s*(</span>)?s*(</span>)?\s*(</(i|b|u)>){0,2}\s*(</(p|/?br)>)\s*(<(/?br|p)[^>]*>\s*(<(i|b|u)>){0,2}\s*(<span[^>]*>)?\s*(?P<title>(<(i|b|u)>){0,2}(\s*[\w\'\"-]+){1,5}\s*(</(i|b|u)>){0,2})\s*(</span>)?\s*(</(i|b|u)>){0,2}\s*(</(br|p)>))?', re.IGNORECASE)
-            html = chapdetect.sub(chapter_head, html)
-		if self.html_preprocess_sections < 10:
-		    self.log("not enough chapters, only " + str(self.html_preprocess_sections) + ", trying a more aggressive pattern")
-            chapdetect2 = re.compile(r'(?=</?(br|p))(<(/?br|p)[^>]*>)\s*(<(i|b|u)>){0,2}\s*(<span[^>]*>)?\s*(?P<chap>(<(i|b|u)>){0,2}\s*.?(([A-Z#]+\s*){1,9}|(CHAPTER\s*([\dA-Z\-\'\"\?\.!#,]+\s*){1,10}))\s*(</(i|b|u)>){0,2})\s*(</span>)?\s*(</(i|b|u)>){0,2}\s*(</(p|/?br)>)\s*(<(/?br|p)[^>]*>\s*(<(i|b|u)>){0,2}\s*(<span[^>]*>)?\s*(?P<title>(<(i|b|u)>){0,2}(\s*[\w\'\"-]+){1,5}\s*(</(i|b|u)>){0,2})\s*(</span>)?\s*(</(i|b|u)>){0,2}\s*(</(br|p)>))?', re.UNICODE)
-		    html = chapdetect2.sub(chapter_head, html)    
-        #    
-		# Unwrap lines using punctation if the median length of all lines is less than 150		
-		length = line_length('html', html, 0.4)
-		self.log("*** Median line length is " + str(length) + " ***")
-		unwrap = re.compile(r"(?<=.{%i}[a-z,;:\IA])\s*</(span|p|div)>\s*(</(p|span|div)>)?\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)</(span|p|div)>\s*){0,3}\s*<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*" % length, re.UNICODE)
-		if length < 150:
-		    self.log("Unwrapping Lines")
-			html = unwrap.sub(' ', html)		
-		# If still no sections after unwrapping lines break on lines with no punctuation
-		if self.html_preprocess_sections < 10:
-		    self.log("not enough chapters, only " + str(self.html_preprocess_sections) + ", splitting based on punctuation")
-		    #self.log(html)
-            chapdetect3 = re.compile(r'(<p[^>]*>)\s*(<(i|b|u)>){0,2}\s*(<span[^>]*>)?\s*(?P<chap>(<(i|b|u)>){0,2}\s*.?([a-z]+\s*){1,5}\s*(</(i|b|u)>){0,2})\s*(</span>)?\s*(</(i|b|u)>){0,2}\s*(</p>)(?P<title>)?', re.IGNORECASE)
-            html = chapdetect3.sub(chapter_head, html)    	
-        # search for places where a first or second level heading is immediately followed by another
-        # top level heading.  demote the second heading to h3 to prevent splitting between chapter
-        # headings and titles, images, etc
-        doubleheading = re.compile(r'(?P<firsthead><h(1|2)[^>]*>.+?</h(1|2)>\s*(<(?!h\d)[^>]*>\s*)*)<h(1|2)(?P<secondhead>[^>]*>.+?)</h(1|2)>', re.IGNORECASE)
-        html = doubleheading.sub('\g<firsthead>'+'<h3'+'\g<secondhead>'+'</h3>', html)
-
+        preprocessor = PreProcessor(html)
+        html = preprocessor(html)
         return html
 
diff --git a/src/calibre/ebooks/pdb/pdf/reader.py b/src/calibre/ebooks/pdb/pdf/reader.py
index 3ae9f8ccca..c151551866 100644
--- a/src/calibre/ebooks/pdb/pdf/reader.py
+++ b/src/calibre/ebooks/pdb/pdf/reader.py
@@ -21,7 +21,7 @@ class Reader(FormatReader):
         self.options = options
         setattr(self.options, 'new_pdf_engine', False)
         setattr(self.options, 'no_images', False)
-        setattr(self.options, 'unwrap_factor', 0.5)
+        setattr(self.options, 'unwrap_factor', 0.45)
 
     def extract_content(self, output_dir):
         self.log.info('Extracting PDF...')
diff --git a/src/calibre/ebooks/pdf/input.py b/src/calibre/ebooks/pdf/input.py
index 113c3d99d8..14b3552b04 100644
--- a/src/calibre/ebooks/pdf/input.py
+++ b/src/calibre/ebooks/pdf/input.py
@@ -25,7 +25,7 @@ class PDFInput(InputFormatPlugin):
         OptionRecommendation(name='unwrap_factor', recommended_value=0.45,
             help=_('Scale used to determine the length at which a line should '
             'be unwrapped. Valid values are a decimal between 0 and 1. The '
-            'default is 0.45, this is the median line length.')),
+            'default is 0.45, just below the median line length.')),
         OptionRecommendation(name='new_pdf_engine', recommended_value=False,
             help=_('Use the new PDF conversion engine.'))
     ])

From f6de0bef13d7d1001b951d465cff3135aad616ed Mon Sep 17 00:00:00 2001
From: ldolse <ldolse@yahoo.com>
Date: Sat, 11 Sep 2010 22:15:09 +1000
Subject: [PATCH 08/12] replaced messed up rtf file

---
 src/calibre/ebooks/rtf/preprocess.py | 624 +++++++++++++--------------
 1 file changed, 289 insertions(+), 335 deletions(-)

diff --git a/src/calibre/ebooks/rtf/preprocess.py b/src/calibre/ebooks/rtf/preprocess.py
index ee45da697f..a3076651fd 100644
--- a/src/calibre/ebooks/rtf/preprocess.py
+++ b/src/calibre/ebooks/rtf/preprocess.py
@@ -1,390 +1,344 @@
 #!/usr/bin/env python
 # vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
+from __future__ import with_statement
 
 __license__   = 'GPL v3'
-__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
+__copyright__ = '2010, Gerendi Sandor Attila'
 __docformat__ = 'restructuredtext en'
 
-import functools, re
+"""
+RTF tokenizer and token parser. v.1.0 (1/17/2010)
+Author: Gerendi Sandor Attila
 
-from calibre import entity_to_unicode
+At this point this will tokenize an RTF file then rebuild it from the tokens.
+In the process the UTF8 tokens are altered to be supported by the RTF2XML and also remain RTF specification compliant.
+"""
 
-XMLDECL_RE    = re.compile(r'^\s*<[?]xml.*?[?]>')
-SVG_NS       = 'http://www.w3.org/2000/svg'
-XLINK_NS     = 'http://www.w3.org/1999/xlink'
+class tokenDelimitatorStart():
+    def __init__(self):
+        pass
+    def toRTF(self):
+        return b'{'
+    def __repr__(self):
+        return '{'
 
-convert_entities = functools.partial(entity_to_unicode,
-        result_exceptions = {
-            u'<' : '&lt;',
-            u'>' : '&gt;',
-            u"'" : '&apos;',
-            u'"' : '&quot;',
-            u'&' : '&amp;',
-        })
-_span_pat = re.compile('<span.*?</span>', re.DOTALL|re.IGNORECASE)
+class tokenDelimitatorEnd():
+    def __init__(self):
+        pass
+    def toRTF(self):
+        return b'}'
+    def __repr__(self):
+        return '}'
 
-LIGATURES = {
-#        u'\u00c6': u'AE',
-#        u'\u00e6': u'ae',
-#        u'\u0152': u'OE',
-#        u'\u0153': u'oe',
-#        u'\u0132': u'IJ',
-#        u'\u0133': u'ij',
-#        u'\u1D6B': u'ue',
-        u'\uFB00': u'ff',
-        u'\uFB01': u'fi',
-        u'\uFB02': u'fl',
-        u'\uFB03': u'ffi',
-        u'\uFB04': u'ffl',
-        u'\uFB05': u'ft',
-        u'\uFB06': u'st',
-        }
+class tokenControlWord():
+    def __init__(self, name, separator = ''):
+        self.name = name
+        self.separator = separator
+    def toRTF(self):
+        return self.name + self.separator
+    def __repr__(self):
+        return self.name + self.separator
 
-_ligpat = re.compile(u'|'.join(LIGATURES))
+class tokenControlWordWithNumericArgument():
+    def __init__(self, name, argument, separator = ''):
+        self.name = name
+        self.argument = argument
+        self.separator = separator
+    def toRTF(self):
+        return self.name + repr(self.argument) + self.separator
+    def __repr__(self):
+        return self.name + repr(self.argument) + self.separator
 
-def sanitize_head(match):
-    x = match.group(1)
-    x = _span_pat.sub('', x)
-    return '<head>\n%s\n</head>' % x
+class tokenControlSymbol():
+    def __init__(self, name):
+        self.name = name
+    def toRTF(self):
+        return self.name
+    def __repr__(self):
+        return self.name
 
-def chap_head(match):
-    chap = match.group('chap')
-    title = match.group('title')
-    if not title:
-               return '<h1>'+chap+'</h1><br/>\n'
-    else:
-               return '<h1>'+chap+'</h1>\n<h3>'+title+'</h3>\n'
+class tokenData():
+    def __init__(self, data):
+        self.data = data
+    def toRTF(self):
+        return self.data
+    def __repr__(self):
+        return self.data
 
-def wrap_lines(match):
-    ital = match.group('ital')
-    if not ital:
-               return ' '
-    else:
-               return ital+' '
+class tokenBinN():
+    def __init__(self, data, separator = ''):
+        self.data = data
+        self.separator = separator
+    def toRTF(self):
+        return "\\bin" + repr(len(self.data)) + self.separator + self.data
+    def __repr__(self):
+        return "\\bin" + repr(len(self.data)) + self.separator + self.data
+
+class token8bitChar():
+    def __init__(self, data):
+        self.data = data
+    def toRTF(self):
+        return "\\'" + self.data
+    def __repr__(self):
+        return "\\'" + self.data
+
+class tokenUnicode():
+    def __init__(self, data, separator = '', current_ucn = 1, eqList = []):
+        self.data = data
+        self.separator = separator
+        self.current_ucn = current_ucn
+        self.eqList = eqList
+    def toRTF(self):
+        result = '\\u' + repr(self.data) + ' '
+        ucn = self.current_ucn
+        if len(self.eqList) < ucn:
+            ucn = len(self.eqList)
+            result =  tokenControlWordWithNumericArgument('\\uc', ucn).toRTF() + result
+        i = 0
+        for eq in self.eqList:
+            if i >= ucn:
+                break
+            result = result + eq.toRTF()
+        return result
+    def __repr__(self):
+        return '\\u' + repr(self.data)
 
 
-def line_length(format, raw, percent):
-    '''
-    raw is the raw text to find the line length to use for wrapping.
-    percentage is a decimal number, 0 - 1 which is used to determine
-    how far in the list of line lengths to use. The list of line lengths is
-    ordered smallest to larged and does not include duplicates. 0.5 is the
-    median value.
-    '''
-    raw = raw.replace('&nbsp;', ' ')
-    if format == 'html':
-        linere = re.compile('(?<=<p).*?(?=</p>)', re.DOTALL)
-    elif format == 'pdf':
-        linere = re.compile('(?<=<br>).*?(?=<br>)', re.DOTALL)
-    lines = linere.findall(raw)
-    print "percent is " + str(percent)
+def isAsciiLetter(value):
+    return ((value >= 'a') and (value <= 'z')) or ((value >= 'A') and (value <= 'Z'))
 
-    lengths = []
-    for line in lines:
-        if len(line) > 0:
-            lengths.append(len(line))
+def isDigit(value):
+    return (value >= '0') and (value <= '9')
 
-    if not lengths:
-        return 0
+def isChar(value, char):
+    return value == char
 
-    lengths = list(set(lengths))
-    total = sum(lengths)
-    avg = total / len(lengths)
-    max_line = avg * 2
-
-    lengths = sorted(lengths)
-    for i in range(len(lengths) - 1, -1, -1):
-        if lengths[i] > max_line:
-            del lengths[i]
-
-    if percent > 1:
-        percent = 1
-    if percent < 0:
-        percent = 0
-
-    index = int(len(lengths) * percent) - 1
-
-    return lengths[index]
+def isString(buffer, string):
+    return buffer == string
 
 
-class CSSPreProcessor(object):
+class RtfTokenParser():
+    def __init__(self, tokens):
+        self.tokens = tokens
+        self.process()
+        self.processUnicode()
 
-    PAGE_PAT   = re.compile(r'@page[^{]*?{[^}]*?}')
+    def process(self):
+        i = 0
+        newTokens = []
+        while i < len(self.tokens):
+            if isinstance(self.tokens[i], tokenControlSymbol):
+                if isString(self.tokens[i].name, "\\'"):
+                    i = i + 1
+                    if not isinstance(self.tokens[i], tokenData):
+                        raise Exception('Error: token8bitChar without data.')
+                    if len(self.tokens[i].data) < 2:
+                        raise Exception('Error: token8bitChar without data.')
+                    newTokens.append(token8bitChar(self.tokens[i].data[0:2]))
+                    if len(self.tokens[i].data) > 2:
+                        newTokens.append(tokenData(self.tokens[i].data[2:]))
+                    i = i + 1
+                    continue
 
-    def __call__(self, data, add_namespace=False):
-        from calibre.ebooks.oeb.base import XHTML_CSS_NAMESPACE
-        data = self.PAGE_PAT.sub('', data)
-        if not add_namespace:
-            return data
-        ans, namespaced = [], False
-        for line in data.splitlines():
-            ll = line.lstrip()
-            if not (namespaced or ll.startswith('@import') or
-                        ll.startswith('@charset')):
-                ans.append(XHTML_CSS_NAMESPACE.strip())
-                namespaced = True
-            ans.append(line)
+            newTokens.append(self.tokens[i])
+            i = i + 1
 
-        return u'\n'.join(ans)
+        self.tokens = list(newTokens)
 
-class HTMLPreProcessor(object):
+    def processUnicode(self):
+        i = 0
+        newTokens = []
+        ucNbStack = [1]
+        while i < len(self.tokens):
+            if isinstance(self.tokens[i], tokenDelimitatorStart):
+                ucNbStack.append(ucNbStack[len(ucNbStack) - 1])
+                newTokens.append(self.tokens[i])
+                i = i + 1
+                continue
+            if isinstance(self.tokens[i], tokenDelimitatorEnd):
+                ucNbStack.pop()
+                newTokens.append(self.tokens[i])
+                i = i + 1
+                continue
+            if isinstance(self.tokens[i], tokenControlWordWithNumericArgument):
+                if isString(self.tokens[i].name, '\\uc'):
+                    ucNbStack[len(ucNbStack) - 1] = self.tokens[i].argument
+                    newTokens.append(self.tokens[i])
+                    i = i + 1
+                    continue
+                if isString(self.tokens[i].name, '\\u'):
+                    x = i
+                    j = 0
+                    i = i + 1
+                    replace = []
+                    partialData = None
+                    ucn = ucNbStack[len(ucNbStack) - 1]
+                    while (i < len(self.tokens)) and (j < ucn):
+                        if isinstance(self.tokens[i], tokenDelimitatorStart):
+                            break
+                        if isinstance(self.tokens[i], tokenDelimitatorEnd):
+                            break
+                        if isinstance(self.tokens[i], tokenData):
+                            if len(self.tokens[i].data) >= ucn - j:
+                                replace.append(tokenData(self.tokens[i].data[0 : ucn - j]))
+                                if len(self.tokens[i].data) > ucn - j:
+                                    partialData = tokenData(self.tokens[i].data[ucn - j:])
+                                i = i + 1
+                                break
+                            else:
+                                replace.append(self.tokens[i])
+                                j = j + len(self.tokens[i].data)
+                                i = i + 1
+                                continue
+                        if isinstance(self.tokens[i], token8bitChar) or isinstance(self.tokens[i], tokenBinN):
+                            replace.append(self.tokens[i])
+                            i = i + 1
+                            j = j + 1
+                            continue
+                        raise Exception('Error: incorect utf replacement.')
 
-    PREPROCESS = [
-                  # Some idiotic HTML generators (Frontpage I'm looking at you)
-                  # Put all sorts of crap into <head>. This messes up lxml
-                  (re.compile(r'<head[^>]*>\n*(.*?)\n*</head>', re.IGNORECASE|re.DOTALL),
-                   sanitize_head),
-                  # Convert all entities, since lxml doesn't handle them well
-                  (re.compile(r'&(\S+?);'), convert_entities),
-                  # Remove the <![if/endif tags inserted by everybody's darling, MS Word
-                  (re.compile(r'</{0,1}!\[(end){0,1}if\]{0,1}>', re.IGNORECASE),
-                   lambda match: ''),
-                  ]
+                    #calibre rtf2xml does not support utfreplace
+                    replace = []
 
-    # Fix pdftohtml markup
-    PDFTOHTML  = [
-                  # Fix umlauts
-                  # ¨
-                  (re.compile(u'¨\s*(<br.*?>)*\s*a', re.UNICODE), lambda match: u'ä'),
-                  (re.compile(u'¨\s*(<br.*?>)*\s*A', re.UNICODE), lambda match: u'Ä'),
-                  (re.compile(u'¨\s*(<br.*?>)*\s*e', re.UNICODE), lambda match: u'ë'),
-                  (re.compile(u'¨\s*(<br.*?>)*\s*E', re.UNICODE), lambda match: u'Ë'),
-                  (re.compile(u'¨\s*(<br.*?>)*\s*i', re.UNICODE), lambda match: u'ï'),
-                  (re.compile(u'¨\s*(<br.*?>)*\s*I', re.UNICODE), lambda match: u'Ï'),
-                  (re.compile(u'¨\s*(<br.*?>)*\s*o', re.UNICODE), lambda match: u'ö'),
-                  (re.compile(u'¨\s*(<br.*?>)*\s*O', re.UNICODE), lambda match: u'Ö'),
-                  (re.compile(u'¨\s*(<br.*?>)*\s*u', re.UNICODE), lambda match: u'ü'),
-                  (re.compile(u'¨\s*(<br.*?>)*\s*U', re.UNICODE), lambda match: u'Ü'),
+                    newTokens.append(tokenUnicode(self.tokens[x].argument, self.tokens[x].separator, ucNbStack[len(ucNbStack) - 1], replace))
+                    if partialData != None:
+                        newTokens.append(partialData)
+                    continue
 
-                  # Fix accents
-                  # `
-                  (re.compile(u'`\s*(<br.*?>)*\s*a', re.UNICODE), lambda match: u'à'),
-                  (re.compile(u'`\s*(<br.*?>)*\s*A', re.UNICODE), lambda match: u'À'),
-                  (re.compile(u'`\s*(<br.*?>)*\s*e', re.UNICODE), lambda match: u'è'),
-                  (re.compile(u'`\s*(<br.*?>)*\s*E', re.UNICODE), lambda match: u'È'),
-                  (re.compile(u'`\s*(<br.*?>)*\s*i', re.UNICODE), lambda match: u'ì'),
-                  (re.compile(u'`\s*(<br.*?>)*\s*I', re.UNICODE), lambda match: u'Ì'),
-                  (re.compile(u'`\s*(<br.*?>)*\s*o', re.UNICODE), lambda match: u'ò'),
-                  (re.compile(u'`\s*(<br.*?>)*\s*O', re.UNICODE), lambda match: u'Ò'),
-                  (re.compile(u'`\s*(<br.*?>)*\s*u', re.UNICODE), lambda match: u'ù'),
-                  (re.compile(u'`\s*(<br.*?>)*\s*U', re.UNICODE), lambda match: u'Ù'),
+            newTokens.append(self.tokens[i])
+            i = i + 1
 
-                  # ´
-                  (re.compile(u'´\s*(<br.*?>)*\s*a', re.UNICODE), lambda match: u'á'),
-                  (re.compile(u'´\s*(<br.*?>)*\s*A', re.UNICODE), lambda match: u'Á'),
-                  (re.compile(u'´\s*(<br.*?>)*\s*c', re.UNICODE), lambda match: u'ć'),
-                  (re.compile(u'´\s*(<br.*?>)*\s*C', re.UNICODE), lambda match: u'Ć'),
-                  (re.compile(u'´\s*(<br.*?>)*\s*e', re.UNICODE), lambda match: u'é'),
-                  (re.compile(u'´\s*(<br.*?>)*\s*E', re.UNICODE), lambda match: u'É'),
-                  (re.compile(u'´\s*(<br.*?>)*\s*i', re.UNICODE), lambda match: u'í'),
-                  (re.compile(u'´\s*(<br.*?>)*\s*I', re.UNICODE), lambda match: u'Í'),
-                  (re.compile(u'´\s*(<br.*?>)*\s*o', re.UNICODE), lambda match: u'ó'),
-                  (re.compile(u'´\s*(<br.*?>)*\s*O', re.UNICODE), lambda match: u'Ó'),
-                  (re.compile(u'´\s*(<br.*?>)*\s*n', re.UNICODE), lambda match: u'ń'),
-                  (re.compile(u'´\s*(<br.*?>)*\s*N', re.UNICODE), lambda match: u'Ń'),
-                  (re.compile(u'´\s*(<br.*?>)*\s*s', re.UNICODE), lambda match: u'ś'),
-                  (re.compile(u'´\s*(<br.*?>)*\s*S', re.UNICODE), lambda match: u'Ś'),
-                  (re.compile(u'´\s*(<br.*?>)*\s*u', re.UNICODE), lambda match: u'ú'),
-                  (re.compile(u'´\s*(<br.*?>)*\s*U', re.UNICODE), lambda match: u'Ú'),
-                  (re.compile(u'´\s*(<br.*?>)*\s*z', re.UNICODE), lambda match: u'ź'),
-                  (re.compile(u'´\s*(<br.*?>)*\s*Z', re.UNICODE), lambda match: u'Ź'),
+        self.tokens = list(newTokens)
 
-                  # ˆ
-                  (re.compile(u'ˆ\s*(<br.*?>)*\s*a', re.UNICODE), lambda match: u'â'),
-                  (re.compile(u'ˆ\s*(<br.*?>)*\s*A', re.UNICODE), lambda match: u'Â'),
-                  (re.compile(u'ˆ\s*(<br.*?>)*\s*e', re.UNICODE), lambda match: u'ê'),
-                  (re.compile(u'ˆ\s*(<br.*?>)*\s*E', re.UNICODE), lambda match: u'Ê'),
-                  (re.compile(u'ˆ\s*(<br.*?>)*\s*i', re.UNICODE), lambda match: u'î'),
-                  (re.compile(u'ˆ\s*(<br.*?>)*\s*I', re.UNICODE), lambda match: u'Î'),
-                  (re.compile(u'ˆ\s*(<br.*?>)*\s*o', re.UNICODE), lambda match: u'ô'),
-                  (re.compile(u'ˆ\s*(<br.*?>)*\s*O', re.UNICODE), lambda match: u'Ô'),
-                  (re.compile(u'ˆ\s*(<br.*?>)*\s*u', re.UNICODE), lambda match: u'û'),
-                  (re.compile(u'ˆ\s*(<br.*?>)*\s*U', re.UNICODE), lambda match: u'Û'),
 
-                  # ¸
-                  (re.compile(u'¸\s*(<br.*?>)*\s*c', re.UNICODE), lambda match: u'ç'),
-                  (re.compile(u'¸\s*(<br.*?>)*\s*C', re.UNICODE), lambda match: u'Ç'),
+    def toRTF(self):
+        result = []
+        for token in self.tokens:
+            result.append(token.toRTF())
+        return "".join(result)
 
-                  # ˛
-                  (re.compile(u'˛\s*(<br.*?>)*\s*a', re.UNICODE), lambda match: u'ą'),
-                  (re.compile(u'˛\s*(<br.*?>)*\s*A', re.UNICODE), lambda match: u'Ą'),
-                  (re.compile(u'˛\s*(<br.*?>)*\s*e', re.UNICODE), lambda match: u'ę'),
-                  (re.compile(u'˛\s*(<br.*?>)*\s*E', re.UNICODE), lambda match: u'Ę'),
-                  
-                  # ˙
-                  (re.compile(u'˙\s*(<br.*?>)*\s*z', re.UNICODE), lambda match: u'ż'),
-                  (re.compile(u'˙\s*(<br.*?>)*\s*Z', re.UNICODE), lambda match: u'Ż'),
-                  
 
-                  # If pdf printed from a browser then the header/footer has a reliable pattern
-                  (re.compile(r'((?<=</a>)\s*file:////?[A-Z].*<br>|file:////?[A-Z].*<br>(?=\s*<hr>))', re.IGNORECASE), lambda match: ''),
+class RtfTokenizer():
+    def __init__(self, rtfData):
+        self.rtfData = []
+        self.tokens = []
+        self.rtfData = rtfData
+        self.tokenize()
 
-                  # Center separator lines
-                  (re.compile(u'<br>\s*(?P<break>([*#•]+\s*)+)\s*<br>'), lambda match: '<p>\n<p style="text-align:center">' + match.group(1) + '</p>'),
+    def tokenize(self):
+        i = 0
+        lastDataStart = -1
+        while i < len(self.rtfData):
 
-                  # Remove page links
-                  (re.compile(r'<a name=\d+></a>', re.IGNORECASE), lambda match: ''),
-                  # Remove <hr> tags
-                  (re.compile(r'<hr.*?>', re.IGNORECASE), lambda match: '<br />'),
-                  # Replace <br><br> with <p>
-                  # (re.compile(r'<br>\s*<br>', re.IGNORECASE), lambda match: '\n<p>'),
+            if isChar(self.rtfData[i], '{'):
+                if lastDataStart > -1:
+                    self.tokens.append(tokenData(self.rtfData[lastDataStart : i]))
+                    lastDataStart = -1
+                self.tokens.append(tokenDelimitatorStart())
+                i = i + 1
+                continue
 
-                  # unwrap hyphenation - don't delete the hyphen (often doesn't split words)
-                  (re.compile(u'(?<=[-–—])\s*<br>\s*(?=[[a-z\d])'), lambda match: ''),
+            if isChar(self.rtfData[i], '}'):
+                if lastDataStart > -1:
+                    self.tokens.append(tokenData(self.rtfData[lastDataStart : i]))
+                    lastDataStart = -1
+                self.tokens.append(tokenDelimitatorEnd())
+                i = i + 1
+                continue
 
-                  # Remove gray background
-                  (re.compile(r'<BODY[^<>]+>'), lambda match : '<BODY>'),
+            if isChar(self.rtfData[i], '\\'):
+                if i + 1 >= len(self.rtfData):
+                    raise Exception('Error: Control character found at the end of the document.')
 
-                  # Detect Chapters to match default XPATH in GUI
-                  (re.compile(r'(?=<(/?br|p))(<(/?br|p)[^>]*)?>\s*(?P<chap>(<(i|b)>(<(i|b)>)?)?.?(Introduction|Chapter|Epilogue|Prologue|Book|Part|Dedication|Volume|Preface|Acknowledgments)\s*([\d\w-]+\s*){0,3}\s*(</(i|b)>(</(i|b)>)?)?)\s*(</?(br|p)[^>]*>\s*){1,3}\s*(?P<title>(<(i|b)>)?(\s*\w+){1,4}\s*(</(i|b)>)?\s*(</?(br|p)[^>]*>))?', re.IGNORECASE), chap_head),
+                if lastDataStart > -1:
+                    self.tokens.append(tokenData(self.rtfData[lastDataStart : i]))
+                    lastDataStart = -1
 
-                  # Have paragraphs show better
-                  (re.compile(r'<br.*?>'), lambda match : '<p>'),
-                  # Clean up spaces
-                  (re.compile(u'(?<=[\.,;\?!”"\'])[\s^ ]*(?=<)'), lambda match: ' '),
-                  # Add space before and after italics
-                  (re.compile(u'(?<!“)<i>'), lambda match: ' <i>'),
-                  (re.compile(r'</i>(?=\w)'), lambda match: '</i> '),
-                                   
-                 ]
+                tokenStart = i
+                i = i + 1
 
-    # Fix Book Designer markup
-    BOOK_DESIGNER = [
-                     # HR
-                     (re.compile('<hr>', re.IGNORECASE),
-                      lambda match : '<span style="page-break-after:always"> </span>'),
-                     # Create header tags
-                     (re.compile('<h2[^><]*?id=BookTitle[^><]*?(align=)*(?(1)(\w+))*[^><]*?>[^><]*?</h2>', re.IGNORECASE),
-                      lambda match : '<h1 id="BookTitle" align="%s">%s</h1>'%(match.group(2) if match.group(2) else 'center', match.group(3))),
-                     (re.compile('<h2[^><]*?id=BookAuthor[^><]*?(align=)*(?(1)(\w+))*[^><]*?>[^><]*?</h2>', re.IGNORECASE),
-                      lambda match : '<h2 id="BookAuthor" align="%s">%s</h2>'%(match.group(2) if match.group(2) else 'center', match.group(3))),
-                     (re.compile('<span[^><]*?id=title[^><]*?>(.*?)</span>', re.IGNORECASE|re.DOTALL),
-                      lambda match : '<h2 class="title">%s</h2>'%(match.group(1),)),
-                     (re.compile('<span[^><]*?id=subtitle[^><]*?>(.*?)</span>', re.IGNORECASE|re.DOTALL),
-                      lambda match : '<h3 class="subtitle">%s</h3>'%(match.group(1),)),
-                     ]
-    def __init__(self, input_plugin_preprocess, plugin_preprocess,
-            extra_opts=None):
-        self.input_plugin_preprocess = input_plugin_preprocess
-        self.plugin_preprocess = plugin_preprocess
-        self.extra_opts = extra_opts
+                #Control Words
+                if isAsciiLetter(self.rtfData[i]):
+                    #consume <ASCII Letter Sequence>
+                    consumed = False
+                    while i < len(self.rtfData):
+                        if not isAsciiLetter(self.rtfData[i]):
+                            tokenEnd = i
+                            consumed = True
+                            break
+                        i = i + 1
 
-    def is_baen(self, src):
-        return re.compile(r'<meta\s+name="Publisher"\s+content=".*?Baen.*?"',
-                          re.IGNORECASE).search(src) is not None
+                    if not consumed:
+                        raise Exception('Error (at:%d): Control Word without end.'%(tokenStart))
 
-    def is_book_designer(self, raw):
-        return re.search('<H2[^><]*id=BookTitle', raw) is not None
+                    #we have numeric argument before delimiter
+                    if isChar(self.rtfData[i], '-') or isDigit(self.rtfData[i]):
+                        #consume the numeric argument
+                        consumed = False
+                        l = 0
+                        while i < len(self.rtfData):
+                            if not isDigit(self.rtfData[i]):
+                                consumed = True
+                                break
+                            l = l + 1
+                            i = i + 1
+                            if l > 10 :
+                                raise Exception('Error (at:%d): Too many digits in control word numeric argument.'%[tokenStart])
 
-    def is_pdftohtml(self, src):
-        return '<!-- created by calibre\'s pdftohtml -->' in src[:1000]
+                        if not consumed:
+                            raise Exception('Error (at:%d): Control Word without numeric argument end.'%[tokenStart])
 
-    def __call__(self, html, remove_special_chars=None,
-            get_preprocess_html=False):
-        if remove_special_chars is not None:
-            html = remove_special_chars.sub('', html)
-        html = html.replace('\0', '')
-        is_pdftohtml = self.is_pdftohtml(html)
-        if self.is_baen(html):
-            rules = []
-        elif self.is_book_designer(html):
-            rules = self.BOOK_DESIGNER
-        elif is_pdftohtml:
-            rules = self.PDFTOHTML
-        else:
-            rules = []
+                    separator = ''
+                    if isChar(self.rtfData[i], ' '):
+                        separator = ' '
 
-        start_rules = []
-        if is_pdftohtml:
-            # Remove non breaking spaces
-            start_rules.append((re.compile(ur'\u00a0'), lambda match : ' '))
+                    controlWord = self.rtfData[tokenStart: tokenEnd]
+                    if tokenEnd < i:
+                        value = int(self.rtfData[tokenEnd: i])
+                        if isString(controlWord, "\\bin"):
+                            i = i + value
+                            self.tokens.append(tokenBinN(self.rtfData[tokenStart:i], separator))
+                        else:
+                            self.tokens.append(tokenControlWordWithNumericArgument(controlWord, value, separator))
+                    else:
+                        self.tokens.append(tokenControlWord(controlWord, separator))
+                    #space delimiter, we should discard it
+                    if self.rtfData[i] == ' ':
+                        i = i + 1
 
-        if not getattr(self.extra_opts, 'keep_ligatures', False):
-            html = _ligpat.sub(lambda m:LIGATURES[m.group()], html)
+                #Control Symbol
+                else:
+                    self.tokens.append(tokenControlSymbol(self.rtfData[tokenStart : i + 1]))
+                    i = i + 1
+                continue
 
-        end_rules = []
-        if getattr(self.extra_opts, 'remove_header', None):
-            try:
-                rules.insert(0,
-                    (re.compile(self.extra_opts.header_regex), lambda match : '')
-                )
-            except:
-                import traceback
-                print 'Failed to parse remove_header regexp'
-                traceback.print_exc()
+            if lastDataStart < 0:
+                lastDataStart = i
+            i = i + 1
 
-        if getattr(self.extra_opts, 'remove_footer', None):
-            try:
-                rules.insert(0,
-                    (re.compile(self.extra_opts.footer_regex), lambda match : '')
-                )
-            except:
-                import traceback
-                print 'Failed to parse remove_footer regexp'
-                traceback.print_exc()
+    def toRTF(self):
+        result = []
+        for token in self.tokens:
+            result.append(token.toRTF())
+        return "".join(result)
 
-        # Make the more aggressive chapter marking regex optional with the preprocess option to reduce false positives
-        if getattr(self.extra_opts, 'preprocess_html', None):
-            if is_pdftohtml:
-                end_rules.append(
-                    (re.compile(r'(?=<(/?br|p|hr))(<(/?br|p|hr)[^>]*)?>\s*(<(i|b)>(<(i|b)>)?)?\s*(?P<chap>([A-Z-\'"!]{3,})\s*(\d+|[A-Z]+(\s*[A-Z]+)?)?|\d+\.?\s*([\d\w-]+\s*){0,4}\s*)\s*(</(i|b)>(</(i|b)>)?)?\s*(</?p[^>]*>|<br[^>]*>)\n?((?=(<i>)?\s*\w+(\s+\w+)?(</i>)?(<br[^>]*>|</?p[^>]*>))((?P<title>.*)(<br[^>]*>|</?p[^>]*>)))?'), chap_head),
-                )
 
-        if getattr(self.extra_opts, 'unwrap_factor', 0.0) > 0.01:
-            length = line_length('pdf', html, getattr(self.extra_opts, 'unwrap_factor'))
-            if length:
-                print "The pdf line length returned is " + str(length)
-                end_rules.append(
-                    # Un wrap using punctuation
-                    (re.compile(r'(?<=.{%i}[a-z,;:)\-IA])\s*(?P<ital></(i|b|u)>)?\s*(<p.*?>)\s*(?=(<(i|b|u)>)?\s*[\w\d(])' % length, re.UNICODE), wrap_lines),
-                )
+if __name__ == "__main__":
+    import sys
+    if len(sys.argv) < 2:
+        print ("Usage %prog rtfFileToConvert")
+        sys.exit()
+    f = open(sys.argv[1], 'rb')
+    data = f.read()
+    f.close()
 
-        for rule in self.PREPROCESS + start_rules:
-            html = rule[0].sub(rule[1], html)
+    tokenizer = RtfTokenizer(data)
+    parsedTokens = RtfTokenParser(tokenizer.tokens)
 
-        if get_preprocess_html:
-            return html
+    data = parsedTokens.toRTF()
 
-        def dump(raw, where):
-            import os
-            dp = getattr(self.extra_opts, 'debug_pipeline', None)
-            if dp and os.path.exists(dp):
-                odir = os.path.join(dp, 'input')
-                if os.path.exists(odir):
-                    odir = os.path.join(odir, where)
-                    if not os.path.exists(odir):
-                        os.makedirs(odir)
-                    name, i = None, 0
-                    while not name or os.path.exists(os.path.join(odir, name)):
-                        i += 1
-                        name = '%04d.html'%i
-                    with open(os.path.join(odir, name), 'wb') as f:
-                        f.write(raw.encode('utf-8'))
+    f = open(sys.argv[1], 'w')
+    f.write(data)
+    f.close()
 
-        #dump(html, 'pre-preprocess')
-
-        for rule in rules + end_rules:
-            html = rule[0].sub(rule[1], html)
-
-        #dump(html, 'post-preprocess')
-
-        # Handle broken XHTML w/ SVG (ugh)
-        if 'svg:' in html and SVG_NS not in html:
-            html = html.replace(
-                '<html', '<html xmlns:svg="%s"' % SVG_NS, 1)
-        if 'xlink:' in html and XLINK_NS not in html:
-            html = html.replace(
-                '<html', '<html xmlns:xlink="%s"' % XLINK_NS, 1)
-
-        html = XMLDECL_RE.sub('', html)
-
-        if getattr(self.extra_opts, 'asciiize', False):
-            from calibre.ebooks.unidecode.unidecoder import Unidecoder
-            unidecoder = Unidecoder()
-            html = unidecoder.decode(html)
-
-        if self.plugin_preprocess:
-            html = self.input_plugin_preprocess(html)
-
-        return html
 

From 9a06996b16486a3511e4055535a6be48f484a90a Mon Sep 17 00:00:00 2001
From: ldolse <ldolse@yahoo.com>
Date: Sun, 12 Sep 2010 11:17:49 +1000
Subject: [PATCH 09/12] minor tweaks to preprocessing, backed out reflow change

---
 src/calibre/ebooks/conversion/preprocess.py |  4 +--
 src/calibre/ebooks/conversion/utils.py      | 36 +++++++++++----------
 src/calibre/ebooks/pdf/reflow.py            |  4 ---
 3 files changed, 21 insertions(+), 23 deletions(-)

diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py
index 6123577191..46308b2ea0 100644
--- a/src/calibre/ebooks/conversion/preprocess.py
+++ b/src/calibre/ebooks/conversion/preprocess.py
@@ -319,8 +319,8 @@ class HTMLPreProcessor(object):
       
         # unwrap hyphenation - moved here so it's executed after header/footer removal
         if is_pdftohtml:
-            # unwrap visible dashes and hyphens - don't delete as 50% or more of the time these
-            # hyphens are for compound words, formatting, etc
+            # unwrap visible dashes and hyphens - don't delete them, as they are often hyphens
+            # for compound words, formatting, etc
             end_rules.append((re.compile(u'(?<=[-–—])\s*<p>\s*(?=[[a-z\d])'), lambda match: ''))
             # unwrap/delete soft hyphens
             end_rules.append((re.compile(u'[­](\s*<p>)+\s*(?=[[a-z\d])'), lambda match: ''))
diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py
index 68cebb3a11..fb683bdb12 100644
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@@ -29,16 +29,12 @@ class PreProcessor(object):
                    self.log("marked " + str(self.html_preprocess_sections) + " chapters & titles. - " + str(chap) + ", " + str(title))
                    return '<h2>'+chap+'</h2>\n<h3>'+title+'</h3>\n'
 
-    def chapter_link(self, match):
-        chap = match.group('sectionlink')
-        if not chap:
-                   self.html_preprocess_sections = self.html_preprocess_sections + 1
-                   self.log("marked " + str(self.html_preprocess_sections) + " section markers based on links")
-                   return '<br style="page-break-before:always">'
-        else:
-                   self.html_preprocess_sections = self.html_preprocess_sections + 1
-                   self.log("marked " + str(self.html_preprocess_sections) + " section markers based on links. - " + str(chap))
-                   return '<br clear="all" style="page-break-before:always">\n<h2>'+chap+'</h2>'
+    def chapter_break(self, match):
+        chap = match.group('section')
+        styles = match.group('styles')
+        self.html_preprocess_sections = self.html_preprocess_sections + 1
+        self.log("marked " + str(self.html_preprocess_sections) + " section markers based on punctuation. - " + str(chap))
+        return '<'+styles+' style="page-break-before:always">'+chap
 
     def no_markup(self, raw, percent):
         '''
@@ -74,7 +70,7 @@ class PreProcessor(object):
         html = re.sub(r"\s*<span[^>]*>\s*</span>", " ", html)
         
         # If more than 40% of the lines are empty paragraphs then delete them to clean up spacing
-        linereg = re.compile('(?<=<p).*?(?=</p>)', re.IGNORECASE)
+        linereg = re.compile('(?<=<p).*?(?=</p>)', re.IGNORECASE|re.DOTALL)
         blankreg = re.compile(r'\s*<p[^>]*>\s*(<(b|i|u)>)?\s*(</(b|i|u)>)?\s*</p>', re.IGNORECASE)
         blanklines = blankreg.findall(html)
         lines = linereg.findall(html)
@@ -100,8 +96,13 @@ class PreProcessor(object):
         chapdetect = re.compile(r'(?=</?(br|p))(<(/?br|p)[^>]*>)\s*(<(i|b|u)>){0,2}\s*(<span[^>]*>)?\s*(?P<chap>(<(i|b|u)>){0,2}s*(<span[^>]*>)?\s*.?(Introduction|Acknowledgements|Chapter|Epilogue|Volume|Prologue|Book\s|Part\s|Dedication)\s*([\d\w-]+\:?\s*){0,8}\s*(</(i|b|u)>){0,2})\s*(</span>)?s*(</span>)?\s*(</(i|b|u)>){0,2}\s*(</(p|/?br)>)\s*(<(/?br|p)[^>]*>\s*(<(i|b|u)>){0,2}\s*(<span[^>]*>)?\s*(?P<title>(<(i|b|u)>){0,2}(\s*[\w\'\"-]+){1,5}\s*(</(i|b|u)>){0,2})\s*(</span>)?\s*(</(i|b|u)>){0,2}\s*(</(br|p)>))?', re.IGNORECASE)
         html = chapdetect.sub(self.chapter_head, html)
         if self.html_preprocess_sections < 10:
-            self.log("not enough chapters, only " + str(self.html_preprocess_sections) + ", trying a more aggressive pattern")
-            chapdetect2 = re.compile(r'(?=</?(br|p))(<(/?br|p)[^>]*>)\s*(<(i|b|u)>){0,2}\s*(<span[^>]*>)?\s*(?P<chap>(<(i|b|u)>){0,2}\s*.?(([A-Z#-]+\s*){1,9}|\d+\.?|(CHAPTER\s*([\dA-Z\-\'\"\?\.!#,]+\s*){1,10}))\s*(</(i|b|u)>){0,2})\s*(</span>)?\s*(</(i|b|u)>){0,2}\s*(</(p|/?br)>)\s*(<(/?br|p)[^>]*>\s*(<(i|b|u)>){0,2}\s*(<span[^>]*>)?\s*(?P<title>(<(i|b|u)>){0,2}(\s*[\w\'\"-]+){1,5}\s*(</(i|b|u)>){0,2})\s*(</span>)?\s*(</(i|b|u)>){0,2}\s*(</(br|p)>))?', re.UNICODE)
+            self.log("not enough chapters, only " + str(self.html_preprocess_sections) + ", trying numeric chapters")
+            chapdetect2 = re.compile(r'(?=</?(br|p))(<(/?br|p)[^>]*>)\s*(<(i|b|u)>){0,2}\s*(<span[^>]*>)?\s*(?P<chap>(<(i|b|u)>){0,2}\s*.?(\d+\.?|(CHAPTER\s*([\dA-Z\-\'\"\?\.!#,]+\s*){1,10}))\s*(</(i|b|u)>){0,2})\s*(</span>)?\s*(</(i|b|u)>){0,2}\s*(</(p|/?br)>)\s*(<(/?br|p)[^>]*>\s*(<(i|b|u)>){0,2}\s*(<span[^>]*>)?\s*(?P<title>(<(i|b|u)>){0,2}(\s*[\w\'\"-]+){1,5}\s*(</(i|b|u)>){0,2})\s*(</span>)?\s*(</(i|b|u)>){0,2}\s*(</(br|p)>))?', re.UNICODE)
+            html = chapdetect2.sub(self.chapter_head, html)    
+
+        if self.html_preprocess_sections < 10:
+            self.log("not enough chapters, only " + str(self.html_preprocess_sections) + ", trying with uppercase words")
+            chapdetect2 = re.compile(r'(?=</?(br|p))(<(/?br|p)[^>]*>)\s*(<(i|b|u)>){0,2}\s*(<span[^>]*>)?\s*(?P<chap>(<(i|b|u)>){0,2}\s*.?(([A-Z#-]+\s*){1,9})\s*(</(i|b|u)>){0,2})\s*(</span>)?\s*(</(i|b|u)>){0,2}\s*(</(p|/?br)>)\s*(<(/?br|p)[^>]*>\s*(<(i|b|u)>){0,2}\s*(<span[^>]*>)?\s*(?P<title>(<(i|b|u)>){0,2}(\s*[\w\'\"-]+){1,5}\s*(</(i|b|u)>){0,2})\s*(</span>)?\s*(</(i|b|u)>){0,2}\s*(</(br|p)>))?', re.UNICODE)
             html = chapdetect2.sub(self.chapter_head, html)    
         #    
         # Unwrap lines using punctation if the median length of all lines is less than 200        
@@ -110,13 +111,14 @@ class PreProcessor(object):
         unwrap = re.compile(r"(?<=.{%i}[a-z,;:\IA])\s*</(span|p|div)>\s*(</(p|span|div)>)?\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)</(span|p|div)>\s*){0,3}\s*<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*" % length, re.UNICODE)
         if length < 200:
             self.log("Unwrapping Lines")
-            html = unwrap.sub(' ', html)        
+            html = unwrap.sub(' ', html)
+            
         # If still no sections after unwrapping lines break on lines with no punctuation
         if self.html_preprocess_sections < 10:
-            self.log("not enough chapters, only " + str(self.html_preprocess_sections) + ", splitting based on punctuation")
+            self.log(str(self.html_preprocess_sections) + " split points marked, matching based on punctuation")
             #self.log(html)
-            chapdetect3 = re.compile(r'(<p[^>]*>)\s*(<(i|b|u)>){0,2}\s*(<span[^>]*>)?\s*(?P<chap>(<(i|b|u)>){0,2}\s*.?([a-z]+\s*){1,5}\s*(</(i|b|u)>){0,2})\s*(</span>)?\s*(</(i|b|u)>){0,2}\s*(</p>)(?P<title>)?', re.IGNORECASE)
-            html = chapdetect3.sub(self.chapter_head, html)        
+            chapdetect3 = re.compile(r'<(?P<styles>(p|div)[^>]*)>\s*(?P<section>(<span[^>]*>)?\s*(<(i|b|u)>){0,2}\s*(<span[^>]*>)?\s*(<(i|b|u)>){0,2}\s*(<span[^>]*>)?\s*.?([a-z#-*]+\s*){1,5}\s*\s*(</span>)?(</(i|b|u)>){0,2}\s*(</span>)?\s*(</(i|b|u)>){0,2}\s*(</span>)?\s*</(p|div)>)', re.IGNORECASE)
+            html = chapdetect3.sub(self.chapter_break, html)      
         # search for places where a first or second level heading is immediately followed by another
         # top level heading.  demote the second heading to h3 to prevent splitting between chapter
         # headings and titles, images, etc
diff --git a/src/calibre/ebooks/pdf/reflow.py b/src/calibre/ebooks/pdf/reflow.py
index 36848ddb8b..584d631d0b 100644
--- a/src/calibre/ebooks/pdf/reflow.py
+++ b/src/calibre/ebooks/pdf/reflow.py
@@ -408,10 +408,6 @@ class Page(object):
     # Fraction of text height that two strings' bottoms can differ by
     # for them to be considered to be part of the same text fragment
     LINE_FACTOR = 0.4
-    
-    # Percentage of the page heigth which should be considered header
-    # or footer to be discarded from reflow considerations
-    HEAD_FOOTER_MARGIN
 
     # Multiplies the average line height when determining row height
     # of a particular element to detect columns.

From cdb696f63bc39b9327abe809fa71e94baa6e0b86 Mon Sep 17 00:00:00 2001
From: ldolse <ldolse@yahoo.com>
Date: Mon, 13 Sep 2010 00:12:21 +1000
Subject: [PATCH 10/12] enhanced preprocessing class - looking pretty good

---
 src/calibre/ebooks/conversion/preprocess.py | 18 ++--
 src/calibre/ebooks/conversion/utils.py      | 98 +++++++++++++++------
 2 files changed, 82 insertions(+), 34 deletions(-)

diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py
index 46308b2ea0..f6277956c8 100644
--- a/src/calibre/ebooks/conversion/preprocess.py
+++ b/src/calibre/ebooks/conversion/preprocess.py
@@ -62,7 +62,6 @@ def wrap_lines(match):
     else:
                return ital+' '
 
-
 def line_length(format, raw, percent):
     '''
     raw is the raw text to find the line length to use for wrapping.
@@ -76,6 +75,8 @@ def line_length(format, raw, percent):
         linere = re.compile('(?<=<p).*?(?=</p>)', re.DOTALL)
     elif format == 'pdf':
         linere = re.compile('(?<=<br>).*?(?=<br>)', re.DOTALL)
+    elif format == 'spanned_html':
+        linere = re.compile('(?<=<span).*?(?=</span>)', re.DOTALL)
     lines = linere.findall(raw)
 
     lengths = []
@@ -223,14 +224,15 @@ class HTMLPreProcessor(object):
                   # Remove page links
                   (re.compile(r'<a name=\d+></a>', re.IGNORECASE), lambda match: ''),
                   # Remove <hr> tags
-                  (re.compile(r'<hr.*?>', re.IGNORECASE), lambda match: '<br />'),
+                  (re.compile(r'<hr.*?>', re.IGNORECASE), lambda match: '<br>'),
 
                   # Remove gray background
                   (re.compile(r'<BODY[^<>]+>'), lambda match : '<BODY>'),
 
                   # Detect Chapters to match default XPATH in GUI
-                  (re.compile(r'(?=<(/?br|p))(<(/?br|p)[^>]*)?>\s*(?P<chap>(<(i|b)>(<(i|b)>)?)?.?(Introduction|Chapter|Epilogue|Prologue|Book|Part|Dedication|Volume|Preface|Acknowledgments)\s*([\d\w-]+\s*){0,3}\s*(</(i|b)>(</(i|b)>)?)?)\s*(</?(br|p)[^>]*>\s*){1,3}\s*(?P<title>(<(i|b)>)?(\s*\w+){1,4}\s*(</(i|b)>)?\s*(</?(br|p)[^>]*>))?', re.IGNORECASE), chap_head),
-                  (re.compile(r'<br\s*/?>\s*(?P<chap>([A-Z]\s+){4,}\s*([\d\w-]+\s*){0,3}\s*)\s*(<br>\s*){1,3}\s*(?P<title>(<(i|b)>)?(\s*\w+){1,4}\s*(</(i|b)>)?\s*(</?(br|p)[^>]*>))?'), chap_head),
+                  (re.compile(r'<br>\s*(?P<chap>(<[ibu]>){0,2}\s*.?(Introduction|Chapter|Epilogue|Prologue|Book|Part|Dedication|Volume|Preface|Acknowledgments)\s*([\d\w-]+\s*){0,3}\s*(</[ibu]>){0,2})\s*(<br>\s*){1,3}\s*(?P<title>(<[ibu]>){0,2}(\s*\w+){1,4}\s*(</[ibu]>){0,2}\s*<br>)?', re.IGNORECASE), chap_head),
+                  # Cover the case where every letter in a chapter title is separated by a space
+                  (re.compile(r'<br>\s*(?P<chap>([A-Z]\s+){4,}\s*([\d\w-]+\s*){0,3}\s*)\s*(<br>\s*){1,3}\s*(?P<title>(<[ibu]>){0,2}(\s*\w+){1,4}\s*(</[ibu]>){0,2}\s*(<br>))?'), chap_head),
                   
                   # Have paragraphs show better
                   (re.compile(r'<br.*?>'), lambda match : '<p>'),
@@ -238,8 +240,7 @@ class HTMLPreProcessor(object):
                   (re.compile(u'(?<=[\.,;\?!”"\'])[\s^ ]*(?=<)'), lambda match: ' '),
                   # Add space before and after italics
                   (re.compile(u'(?<!“)<i>'), lambda match: ' <i>'),
-                  (re.compile(r'</i>(?=\w)'), lambda match: '</i> '),
-                                   
+                  (re.compile(r'</i>(?=\w)'), lambda match: '</i> '),                            
                  ]
 
     # Fix Book Designer markup
@@ -327,10 +328,11 @@ class HTMLPreProcessor(object):
             # unwrap/delete soft hyphens with formatting
             end_rules.append((re.compile(u'[­]\s*(</(i|u|b)>)+(\s*<p>)+\s*(<(i|u|b)>)+\s*(?=[[a-z\d])'), lambda match: ''))
         
-        # Make the more aggressive chapter marking regex optional with the preprocess option to reduce false positives
+        # Make the more aggressive chapter marking regex optional with the preprocess option to 
+        # reduce false positives and move after header/footer removal
         if getattr(self.extra_opts, 'preprocess_html', None):
             if is_pdftohtml:
-                end_rules.append((re.compile(r'(?=<(/?br|p|hr))(<(/?br|p|hr)[^>]*)?>\s*(<(i|b)>(<(i|b)>)?)?\s*(?P<chap>([A-Z-\'"!]{3,})\s*(\d+|[A-Z]+(\s*[A-Z]+)?)?|\d+\.?\s*([\d\w-]+\s*){0,4}\s*)\s*(</(i|b)>(</(i|b)>)?)?\s*(</?p[^>]*>|<br[^>]*>)\n?((?=(<i>)?\s*\w+(\s+\w+)?(</i>)?(<br[^>]*>|</?p[^>]*>))((?P<title>.*)(<br[^>]*>|</?p[^>]*>)))?'), chap_head))
+                end_rules.append((re.compile(r'<p>\s*(?P<chap>(<[ibu]>){0,2}\s*([A-Z \'"!]{3,})\s*([\dA-Z:]+\s){0,4}\s*(</[ibu]>){0,2})\s*<p>\s*(?P<title>(<[ibu]>){0,2}(\s*\w+){1,4}\s*(</[ibu]>){0,2}\s*<p>)?'), chap_head),)
                 
         if getattr(self.extra_opts, 'unwrap_factor', 0.0) > 0.01:
             length = line_length('pdf', html, getattr(self.extra_opts, 'unwrap_factor'))
diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py
index fb683bdb12..abfa43e7ed 100644
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@@ -8,10 +8,10 @@ __docformat__ = 'restructuredtext en'
 import re
 from calibre.ebooks.conversion.preprocess import line_length
 from calibre.utils.logging import default_log
-from lxml import etree
 
 class PreProcessor(object):
     html_preprocess_sections = 0
+    found_indents = 0
 
     def __init__(self, args):
         self.args = args
@@ -22,11 +22,11 @@ class PreProcessor(object):
         title = match.group('title')
         if not title:
                    self.html_preprocess_sections = self.html_preprocess_sections + 1
-                   self.log("marked " + str(self.html_preprocess_sections) + " chapters. - " + str(chap))
+                   self.log("found " + str(self.html_preprocess_sections) + " chapters. - " + str(chap))
                    return '<h2>'+chap+'</h2>\n'
         else:
                    self.html_preprocess_sections = self.html_preprocess_sections + 1
-                   self.log("marked " + str(self.html_preprocess_sections) + " chapters & titles. - " + str(chap) + ", " + str(title))
+                   self.log("found " + str(self.html_preprocess_sections) + " chapters & titles. - " + str(chap) + ", " + str(title))
                    return '<h2>'+chap+'</h2>\n<h3>'+title+'</h3>\n'
 
     def chapter_break(self, match):
@@ -35,7 +35,22 @@ class PreProcessor(object):
         self.html_preprocess_sections = self.html_preprocess_sections + 1
         self.log("marked " + str(self.html_preprocess_sections) + " section markers based on punctuation. - " + str(chap))
         return '<'+styles+' style="page-break-before:always">'+chap
-
+    
+    def insert_indent(self, match):
+        pstyle = match.group('formatting')
+        span = match.group('span')
+        self.found_indents = self.found_indents + 1
+        if pstyle:
+            if not span:
+                return '<p '+pstyle+' style="text-indent:3%">'
+            else:
+                return '<p '+pstyle+' style="text-indent:3%">'+span
+        else:
+            if not span:
+                return '<p style="text-indent:3%">'
+            else:
+                return '<p style="text-indent:3%">'+span
+        
     def no_markup(self, raw, percent):
         '''
         Detects total marked up line endings in the file. raw is the text to 
@@ -48,7 +63,7 @@ class PreProcessor(object):
         line_end = line_end_ere.findall(raw)
         tot_htm_ends = len(htm_end)
         tot_ln_fds = len(line_end)
-        self.log("*** There are " + str(tot_ln_fds) + " total Line feeds, and " + str(tot_htm_ends) + " marked endings***")
+        self.log("There are " + str(tot_ln_fds) + " total Line feeds, and " + str(tot_htm_ends) + " marked up endings")
 
         if percent > 1:
             percent = 1
@@ -56,13 +71,18 @@ class PreProcessor(object):
             percent = 0    
     
         min_lns = tot_ln_fds * percent
-        self.log("There must be fewer than " + str(min_lns) + " unmarked lines to return true")
+        self.log("There must be fewer than " + str(min_lns) + " unmarked lines to add markup")
         if min_lns > tot_htm_ends:
             return True
             
     def __call__(self, html):
         self.log("*********  Preprocessing HTML  *********")
-        # remove non-breaking spaces
+        # Replace series of non-breaking spaces with text-indent
+        txtindent = re.compile(ur'<p(?P<formatting>[^>]*)>\s*(?P<span>(<span[^>]*>\s*)+)?\s*(\u00a0){2,}', re.IGNORECASE)
+        html = txtindent.sub(self.insert_indent, html)
+        if self.found_indents > 1:
+            self.log("replaced "+str(self.found_indents)+ " nbsp indents with inline styles")
+        # remove remaining non-breaking spaces
         html = re.sub(ur'\u00a0', ' ', html)
         # Get rid of empty <o:p> tags to simplify other processing
         html = re.sub(ur'\s*<o:p>\s*</o:p>', ' ', html)
@@ -83,41 +103,67 @@ class PreProcessor(object):
         html = re.sub(r"\s*</p>", "</p>\n", html)
         html = re.sub(r"\s*<p>\s*", "\n<p>", html)
         
-        # some lit files don't have any <p> tags or equivalent, check and 
-        # mark up line endings if required before proceeding
+        # some lit files don't have any <p> tags or equivalent (generally just plain text between
+        # <pre> tags), check and mark up line endings if required before proceeding
         if self.no_markup(html, 0.1):
              self.log("not enough paragraph markers, adding now")
              add_markup = re.compile('(?<!>)(\n)')
              html = add_markup.sub('</p>\n<p>', html)
         
         # detect chapters/sections to match xpath or splitting logic
+        heading = re.compile('<h(1|2)[^>]*>', re.IGNORECASE)
+        self.html_preprocess_sections = len(heading.findall(html))
+        self.log("found " + str(self.html_preprocess_sections) + " pre-existing headings")
         # 
-        # Start with most typical chapter headings       
-        chapdetect = re.compile(r'(?=</?(br|p))(<(/?br|p)[^>]*>)\s*(<(i|b|u)>){0,2}\s*(<span[^>]*>)?\s*(?P<chap>(<(i|b|u)>){0,2}s*(<span[^>]*>)?\s*.?(Introduction|Acknowledgements|Chapter|Epilogue|Volume|Prologue|Book\s|Part\s|Dedication)\s*([\d\w-]+\:?\s*){0,8}\s*(</(i|b|u)>){0,2})\s*(</span>)?s*(</span>)?\s*(</(i|b|u)>){0,2}\s*(</(p|/?br)>)\s*(<(/?br|p)[^>]*>\s*(<(i|b|u)>){0,2}\s*(<span[^>]*>)?\s*(?P<title>(<(i|b|u)>){0,2}(\s*[\w\'\"-]+){1,5}\s*(</(i|b|u)>){0,2})\s*(</span>)?\s*(</(i|b|u)>){0,2}\s*(</(br|p)>))?', re.IGNORECASE)
-        html = chapdetect.sub(self.chapter_head, html)
+        # Start with most typical chapter headings, get more aggressive until one works
+        if self.html_preprocess_sections < 10:
+            chapdetect = re.compile(r'(?=</?(br|p))(<(/?br|p)[^>]*>)\s*(<[ibu]>){0,2}\s*(<span[^>]*>)?\s*(?P<chap>(<[ibu]>){0,2}s*(<span[^>]*>)?\s*.?(Introduction|Synopsis|Acknowledgements|Chapter|Epilogue|Volume|Prologue|Book\s|Part\s|Dedication)\s*([\d\w-]+\:?\s*){0,8}\s*(</[ibu]>){0,2})\s*(</span>)?s*(</span>)?\s*(</[ibu]>){0,2}\s*(</(p|/?br)>)\s*(<(/?br|p)[^>]*>\s*(<[ibu]>){0,2}\s*(<span[^>]*>)?\s*(?P<title>(<[ibu]>){0,2}(\s*[\w\'\"-]+){1,5}\s*(</[ibu]>){0,2})\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</(br|p)>))?', re.IGNORECASE)
+            html = chapdetect.sub(self.chapter_head, html)
         if self.html_preprocess_sections < 10:
             self.log("not enough chapters, only " + str(self.html_preprocess_sections) + ", trying numeric chapters")
-            chapdetect2 = re.compile(r'(?=</?(br|p))(<(/?br|p)[^>]*>)\s*(<(i|b|u)>){0,2}\s*(<span[^>]*>)?\s*(?P<chap>(<(i|b|u)>){0,2}\s*.?(\d+\.?|(CHAPTER\s*([\dA-Z\-\'\"\?\.!#,]+\s*){1,10}))\s*(</(i|b|u)>){0,2})\s*(</span>)?\s*(</(i|b|u)>){0,2}\s*(</(p|/?br)>)\s*(<(/?br|p)[^>]*>\s*(<(i|b|u)>){0,2}\s*(<span[^>]*>)?\s*(?P<title>(<(i|b|u)>){0,2}(\s*[\w\'\"-]+){1,5}\s*(</(i|b|u)>){0,2})\s*(</span>)?\s*(</(i|b|u)>){0,2}\s*(</(br|p)>))?', re.UNICODE)
+            chapdetect2 = re.compile(r'(?=</?(br|p))(<(/?br|p)[^>]*>)\s*(<[ibu]>){0,2}\s*(<span[^>]*>)?\s*(?P<chap>(<[ibu]>){0,2}\s*.?(\d+\.?|(CHAPTER\s*([\dA-Z\-\'\"\?\.!#,]+\s*){1,10}))\s*(</[ibu]>){0,2})\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</(p|/?br)>)\s*(<(/?br|p)[^>]*>\s*(<[ibu]>){0,2}\s*(<span[^>]*>)?\s*(?P<title>(<[ibu]>){0,2}(\s*[\w\'\"-]+){1,5}\s*(</[ibu]>){0,2})\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</(br|p)>))?', re.UNICODE)
             html = chapdetect2.sub(self.chapter_head, html)    
 
         if self.html_preprocess_sections < 10:
             self.log("not enough chapters, only " + str(self.html_preprocess_sections) + ", trying with uppercase words")
-            chapdetect2 = re.compile(r'(?=</?(br|p))(<(/?br|p)[^>]*>)\s*(<(i|b|u)>){0,2}\s*(<span[^>]*>)?\s*(?P<chap>(<(i|b|u)>){0,2}\s*.?(([A-Z#-]+\s*){1,9})\s*(</(i|b|u)>){0,2})\s*(</span>)?\s*(</(i|b|u)>){0,2}\s*(</(p|/?br)>)\s*(<(/?br|p)[^>]*>\s*(<(i|b|u)>){0,2}\s*(<span[^>]*>)?\s*(?P<title>(<(i|b|u)>){0,2}(\s*[\w\'\"-]+){1,5}\s*(</(i|b|u)>){0,2})\s*(</span>)?\s*(</(i|b|u)>){0,2}\s*(</(br|p)>))?', re.UNICODE)
-            html = chapdetect2.sub(self.chapter_head, html)    
-        #    
-        # Unwrap lines using punctation if the median length of all lines is less than 200        
-        length = line_length('html', html, 0.4)
-        self.log("*** Median line length is " + str(length) + " ***")
-        unwrap = re.compile(r"(?<=.{%i}[a-z,;:\IA])\s*</(span|p|div)>\s*(</(p|span|div)>)?\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)</(span|p|div)>\s*){0,3}\s*<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*" % length, re.UNICODE)
-        if length < 200:
-            self.log("Unwrapping Lines")
-            html = unwrap.sub(' ', html)
+            chapdetect2 = re.compile(r'(?=</?(br|p))(<(/?br|p)[^>]*>)\s*(<[ibu]>){0,2}\s*(<span[^>]*>)?\s*(?P<chap>(<[ibu]>){0,2}\s*.?(([A-Z#-]+\s*){1,9})\s*(</[ibu]>){0,2})\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</(p|/?br)>)\s*(<(/?br|p)[^>]*>\s*(<[ibu]>){0,2}\s*(<span[^>]*>)?\s*(?P<title>(<[ibu]>){0,2}(\s*[\w\'\"-]+){1,5}\s*(</[ibu]>){0,2})\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</(br|p)>))?', re.UNICODE)
+            html = chapdetect2.sub(self.chapter_head, html)
             
-        # If still no sections after unwrapping lines break on lines with no punctuation
+        # Unwrap lines
+        # 
+        self.log("Unwrapping Lines")
+        # Some OCR sourced files have line breaks in the html using a combination of span & p tags
+        # span are used for hard line breaks, p for new paragraphs.  Determine which is used so 
+        # that lines can be wrapped across page boundaries
+        paras_reg = re.compile('<p[^>]*>', re.IGNORECASE)
+        spans_reg = re.compile('<span[^>]*>', re.IGNORECASE)
+        paras = len(paras_reg.findall(html))
+        spans = len(spans_reg.findall(html))
+        if spans > 1:
+            if float(paras) / float(spans) < 0.75:
+                format = 'spanned_html'
+            else:
+                format = 'html'
+        else:
+            format = 'html'
+        
+        # Calculate Length
+        length = line_length(format, html, 0.4)
+        self.log("*** Median line length is " + str(length) + ",calculated with " + format + " format ***")
+        #
+        # Unwrap and/or delete soft-hyphens, hyphens
+        html = re.sub(u'­\s*(</span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*', '', html)
+        html = re.sub(u'(?<=[-–—])\s*(?=<)(</span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*(?=[[a-z\d])', '', html)
+        
+        # Unwrap lines using punctuation, based on the line length calculated above
+        unwrap = re.compile(r"(?<=.{%i}[a-z,;:\IA])\s*</(span|p|div)>\s*(</(p|span|div)>)?\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)</(span|p|div)>\s*){0,3}\s*<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*" % length, re.UNICODE)
+        html = unwrap.sub(' ', html)
+
+        # If still no sections after unwrapping mark split points on lines with no punctuation
         if self.html_preprocess_sections < 10:
-            self.log(str(self.html_preprocess_sections) + " split points marked, matching based on punctuation")
+            self.log("Looking for more split points based on punctuation, currently have " + str(self.html_preprocess_sections))
             #self.log(html)
-            chapdetect3 = re.compile(r'<(?P<styles>(p|div)[^>]*)>\s*(?P<section>(<span[^>]*>)?\s*(<(i|b|u)>){0,2}\s*(<span[^>]*>)?\s*(<(i|b|u)>){0,2}\s*(<span[^>]*>)?\s*.?([a-z#-*]+\s*){1,5}\s*\s*(</span>)?(</(i|b|u)>){0,2}\s*(</span>)?\s*(</(i|b|u)>){0,2}\s*(</span>)?\s*</(p|div)>)', re.IGNORECASE)
+            chapdetect3 = re.compile(r'<(?P<styles>(p|div)[^>]*)>\s*(?P<section>(<span[^>]*>)?\s*(<[ibu]>){0,2}\s*(<span[^>]*>)?\s*(<[ibu]>){0,2}\s*(<span[^>]*>)?\s*.?([a-z#-*]+\s*){1,5}\s*\s*(</span>)?(</[ibu]>){0,2}\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</span>)?\s*</(p|div)>)', re.IGNORECASE)
             html = chapdetect3.sub(self.chapter_break, html)      
         # search for places where a first or second level heading is immediately followed by another
         # top level heading.  demote the second heading to h3 to prevent splitting between chapter

From 548417ea6b6157faf1688b3b082f3eac5476636f Mon Sep 17 00:00:00 2001
From: ldolse <ldolse@yahoo.com>
Date: Mon, 13 Sep 2010 09:18:45 +1000
Subject: [PATCH 11/12] comments and minor tweak

---
 src/calibre/ebooks/conversion/utils.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py
index abfa43e7ed..ecf030b27d 100644
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@@ -111,7 +111,7 @@ class PreProcessor(object):
              html = add_markup.sub('</p>\n<p>', html)
         
         # detect chapters/sections to match xpath or splitting logic
-        heading = re.compile('<h(1|2)[^>]*>', re.IGNORECASE)
+        heading = re.compile('<h[1-3][^>]*>', re.IGNORECASE)
         self.html_preprocess_sections = len(heading.findall(html))
         self.log("found " + str(self.html_preprocess_sections) + " pre-existing headings")
         # 
@@ -134,7 +134,7 @@ class PreProcessor(object):
         self.log("Unwrapping Lines")
         # Some OCR sourced files have line breaks in the html using a combination of span & p tags
         # span are used for hard line breaks, p for new paragraphs.  Determine which is used so 
-        # that lines can be wrapped across page boundaries
+        # that lines can be un-wrapped across page boundaries
         paras_reg = re.compile('<p[^>]*>', re.IGNORECASE)
         spans_reg = re.compile('<span[^>]*>', re.IGNORECASE)
         paras = len(paras_reg.findall(html))

From b73e1b3da50810e151d10a5d62251754a077e605 Mon Sep 17 00:00:00 2001
From: ldolse <ldolse@yahoo.com>
Date: Tue, 14 Sep 2010 02:56:56 +1000
Subject: [PATCH 12/12] tweaked preprocess for $, added rtf to new preprocess
 logic, changed last pdf default

---
 src/calibre/ebooks/conversion/preprocess.py |  2 +-
 src/calibre/ebooks/rtf/input.py             | 13 +++----------
 src/calibre/gui2/convert/pdf_input.ui       |  2 +-
 3 files changed, 5 insertions(+), 12 deletions(-)

diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py
index f6277956c8..9464be1210 100644
--- a/src/calibre/ebooks/conversion/preprocess.py
+++ b/src/calibre/ebooks/conversion/preprocess.py
@@ -340,7 +340,7 @@ class HTMLPreProcessor(object):
                 # print "The pdf line length returned is " + str(length)
                 end_rules.append(
                     # Un wrap using punctuation
-                    (re.compile(r'(?<=.{%i}[a-z,;:)\-IA])\s*(?P<ital></(i|b|u)>)?\s*(<p.*?>\s*)+\s*(?=(<(i|b|u)>)?\s*[\w\d(])' % length, re.UNICODE), wrap_lines),
+                    (re.compile(r'(?<=.{%i}[a-z,;:)\-IA])\s*(?P<ital></(i|b|u)>)?\s*(<p.*?>\s*)+\s*(?=(<(i|b|u)>)?\s*[\w\d$(])' % length, re.UNICODE), wrap_lines),
                 )
 
         for rule in self.PREPROCESS + start_rules:
diff --git a/src/calibre/ebooks/rtf/input.py b/src/calibre/ebooks/rtf/input.py
index 216ccf591d..d229b80c16 100644
--- a/src/calibre/ebooks/rtf/input.py
+++ b/src/calibre/ebooks/rtf/input.py
@@ -8,6 +8,7 @@ from lxml import etree
 
 from calibre.customize.conversion import InputFormatPlugin
 from calibre.ebooks.conversion.preprocess import line_length
+from calibre.ebooks.conversion.utils import PreProcessor
 
 class InlineClass(etree.XSLTExtension):
 
@@ -229,16 +230,8 @@ class RTFInput(InputFormatPlugin):
             res = transform.tostring(result)
             res = res[:100].replace('xmlns:html', 'xmlns') + res[100:]
             if self.options.preprocess_html:
-                self.log("*********  Preprocessing HTML  *********")
-                # Detect Chapters to match the xpath in the GUI
-                chapdetect = re.compile(r'<p[^>]*>\s*<span[^>]*>\s*(?P<chap>(<(i|b)>(<(i|b)>)?)?(.?Chapter|Epilogue|Prologue|Book|Part|Dedication)\s*([\d\w-]+(\s\w+)?)?(</(i|b)>(<(/i|b)>)?)?)\s*</span>\s*</p>', re.IGNORECASE)
-                res = chapdetect.sub('<h2>'+'\g<chap>'+'</h2>\n', res)
-                # Unwrap lines using punctation if the median length of all lines is less than 150
-                length = line_length('html', res, 0.4)
-                self.log("*** Median length is " + str(length) + " ***")
-                unwrap = re.compile(r"(?<=.{%i}[a-z,;:\IA])\s*</span>\s*</p>\s*(?P<up2threeblanks><p[^>]*>\s*(<span[^>]*>\s*</span>\s*)</p>\s*){0,3}\s*<p[^>]*>\s*<span[^>]*>\s*" % length, re.UNICODE)
-                if length < 150:
-                    res = unwrap.sub(' ', res)
+                preprocessor = PreProcessor(res)
+                res = preprocessor(res)
             f.write(res)
         self.write_inline_css(inline_class)
         stream.seek(0)
diff --git a/src/calibre/gui2/convert/pdf_input.ui b/src/calibre/gui2/convert/pdf_input.ui
index 626c68ea63..b2ee421922 100644
--- a/src/calibre/gui2/convert/pdf_input.ui
+++ b/src/calibre/gui2/convert/pdf_input.ui
@@ -46,7 +46,7 @@
       <double>0.010000000000000</double>
      </property>
      <property name="value">
-      <double>0.500000000000000</double>
+      <double>0.450000000000000</double>
      </property>
     </widget>
    </item>