From 0ad1f3c088f2ff0872de49171fd99a91a50a031a Mon Sep 17 00:00:00 2001 From: ldolse <ldolse@yahoo.com> Date: Wed, 25 Aug 2010 10:49:42 +1000 Subject: [PATCH 01/43] preprocessing regex tweaks --- src/calibre/ebooks/conversion/preprocess.py | 2 +- src/calibre/ebooks/rtf/input.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py index da652c1a38..940c27344b 100644 --- a/src/calibre/ebooks/conversion/preprocess.py +++ b/src/calibre/ebooks/conversion/preprocess.py @@ -209,7 +209,7 @@ class HTMLPreProcessor(object): (re.compile(ur'\u00a0'), lambda match : ' '), # Detect Chapters to match default XPATH in GUI - (re.compile(r'(?=<(/?br|p))(<(/?br|p)[^>]*)?>\s*(?P<chap>(<(i|b)>(<(i|b)>)?)?(.?Chapter|Epilogue|Prologue|Book|Part)\s*([\d\w-]+(\s\w+)?)?(</(i|b)>(</(i|b)>)?)?)</?(br|p)[^>]*>\s*(?P<title>(<(i|b)>)?\s*\w+(\s*\w+)?\s*(</(i|b)>)?\s*(</?(br|p)[^>]*>))?', re.IGNORECASE), chap_head), + (re.compile(r'(?=<(/?br|p))(<(/?br|p)[^>]*)?>\s*(?P<chap>(<(i|b)>(<(i|b)>)?)?(.?Chapter|Epilogue|Prologue|Book|Part|Dedication)\s*([\d\w-]+(\s\w+)?)?(</(i|b)>(</(i|b)>)?)?)</?(br|p)[^>]*>\s*(?P<title>(<(i|b)>)?\s*\w+(\s*\w+)?\s*(</(i|b)>)?\s*(</?(br|p)[^>]*>))?', re.IGNORECASE), chap_head), (re.compile(r'(?=<(/?br|p))(<(/?br|p)[^>]*)?>\s*(?P<chap>([A-Z \'"!]{5,})\s*(\d+|\w+)?)(</?p[^>]*>|<br[^>]*>)\n?((?=(<i>)?\s*\w+(\s+\w+)?(</i>)?(<br[^>]*>|</?p[^>]*>))((?P<title>.*)(<br[^>]*>|</?p[^>]*>)))?'), chap_head), # Have paragraphs show better diff --git a/src/calibre/ebooks/rtf/input.py b/src/calibre/ebooks/rtf/input.py index dcffbe68ca..eaba28e429 100644 --- a/src/calibre/ebooks/rtf/input.py +++ b/src/calibre/ebooks/rtf/input.py @@ -231,12 +231,12 @@ class RTFInput(InputFormatPlugin): if self.options.preprocess_html: print "********* Preprocessing HTML *********\n" # Detect Chapters to match the xpath in the GUI - chapdetect = re.compile(r'<p[^>]*>\s*<span[^>]*>\s*(?P<chap>(<(i|b)><(i|b)>|<(i|b)>)?(.?Chapter|Epilogue|Prologue|Book|Part|Dedication)\s*([\d\w-]+(\s\w+)?)?(</(i|b)></(i|b)>|</(i|b)>)?)\s*</span>\s*</p>', re.IGNORECASE) + chapdetect = re.compile(r'<p[^>]*>\s*<span[^>]*>\s*(?P<chap>(<(i|b)>(<(i|b)>)?)?(.?Chapter|Epilogue|Prologue|Book|Part|Dedication)\s*([\d\w-]+(\s\w+)?)?(</(i|b)>(<(/i|b)>)?)?)\s*</span>\s*</p>', re.IGNORECASE) res = chapdetect.sub('<h2>'+'\g<chap>'+'</h2>\n', res) # Unwrap lines using punctation if the median length of all lines is less than 150 length = line_length('html', res, 0.4) print "*** Median length is " + str(length) + " ***\n" - unwrap = re.compile(r"(?<=.{%i}[a-z,;:\IA])\s*</span>\s*(</p>)?\s*(?P<up2threeblanks><p[^>]*>\s*(<span[^>]*>\s*</span>\s*)</p>\s*){0,3}\s*<p[^>]*>\s*(<span[^>]*>)?\s*" % length, re.UNICODE) + unwrap = re.compile(r"(?<=.{%i}[a-z,;:\IA])\s*</span>\s*</p>\s*(?P<up2threeblanks><p[^>]*>\s*(<span[^>]*>\s*</span>\s*)</p>\s*){0,3}\s*<p[^>]*>\s*<span[^>]*>\s*" % length, re.UNICODE) if length < 150: res = unwrap.sub(' ', res) f.write(res) From 5c951fb9628617133f17ead6d1393ea84b7c6412 Mon Sep 17 00:00:00 2001 From: ldolse <ldolse@yahoo.com> Date: Sat, 4 Sep 2010 15:12:29 +1000 Subject: [PATCH 02/43] Preprocessing Updates --- src/calibre/ebooks/conversion/preprocess.py | 26 +++-- src/calibre/ebooks/html/input.py | 2 +- src/calibre/ebooks/lit/input.py | 104 ++++++++++++++++++-- src/calibre/ebooks/mobi/input.py | 10 ++ src/calibre/ebooks/pdf/reflow.py | 4 + 5 files changed, 132 insertions(+), 14 deletions(-) diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py index
957418f1fd..2954fd7c26 100644 --- a/src/calibre/ebooks/conversion/preprocess.py +++ b/src/calibre/ebooks/conversion/preprocess.py @@ -62,6 +62,7 @@ def wrap_lines(match): else: return ital+' ' + def line_length(format, raw, percent): ''' raw is the raw text to find the line length to use for wrapping. @@ -191,32 +192,36 @@ class HTMLPreProcessor(object): (re.compile(u'¸\s*(<br.*?>)*\s*c', re.UNICODE), lambda match: u'ç'), (re.compile(u'¸\s*(<br.*?>)*\s*C', re.UNICODE), lambda match: u'Ç'), + # If pdf printed from a browser then the header/footer has a reliable pattern + (re.compile(r'((?<=</a>)\s*file:////?[A-Z].*<br>|file:////?[A-Z].*<br>(?=\s*<hr>))', re.IGNORECASE), lambda match: ''), + + # Center separator lines + (re.compile(u'<br>\s*(?P<break>([*#•]+\s*)+)\s*<br>'), lambda match: '<p>\n<p style="text-align:center">' + match.group(1) + '</p>'), + # Remove page links (re.compile(r'<a name=\d+></a>', re.IGNORECASE), lambda match: ''), # Remove <hr> tags (re.compile(r'<hr.*?>', re.IGNORECASE), lambda match: '<br />'), # Replace <br><br> with <p> - (re.compile(r'<br.*?>\s*<br.*?>', re.IGNORECASE), lambda match: '<p>'), + # (re.compile(r'<br>\s*<br>', re.IGNORECASE), lambda match: '\n<p>'), - # Remove hyphenation - (re.compile(r'-<br.*?>\n\r?'), lambda match: ''), + # unwrap hyphenation - don't delete the hyphen (often doesn't split words) + (re.compile(r'(?<=[-–])\s*<br>\s*(?=[[a-z\d])'), lambda match: ''), # Remove gray background (re.compile(r'<BODY[^<>]+>'), lambda match : '<BODY>'), # Detect Chapters to match default XPATH in GUI - (re.compile(r'(?=<(/?br|p))(<(/?br|p)[^>]*)?>\s*(?P<chap>(<(i|b)>(<(i|b)>)?)?(.?Chapter|Epilogue|Prologue|Book|Part|Dedication)\s*([\d\w-]+(\s\w+)?)?(</(i|b)>(</(i|b)>)?)?)</?(br|p)[^>]*>\s*(?P<title>(<(i|b)>)?\s*\w+(\s*\w+)?\s*(</(i|b)>)?\s*(</?(br|p)[^>]*>))?', re.IGNORECASE), chap_head), - (re.compile(r'(?=<(/?br|p))(<(/?br|p)[^>]*)?>\s*(?P<chap>([A-Z \'"!]{5,})\s*(\d+|\w+)?)(</?p[^>]*>|<br[^>]*>)\n?((?=(<i>)?\s*\w+(\s+\w+)?(</i>)?(<br[^>]*>|</?p[^>]*>))((?P<title>.*)(<br[^>]*>|</?p[^>]*>)))?'), chap_head), + (re.compile(r'(?=<(/?br|p))(<(/?br|p)[^>]*)?>\s*(?P<chap>(<(i|b)>(<(i|b)>)?)?(.?Chapter|Epilogue|Prologue|Book|Part|Dedication|Volume|Preface|Acknowledgments)\s*([\d\w-]+(\s\w+)?)?\s*(</(i|b)>(</(i|b)>)?)?)\s*(</?(br|p)[^>]*>\s*){1,3}\s*(?P<title>(<(i|b)>)?(\s*\w+){1,4}\s*(</(i|b)>)?\s*(</?(br|p)[^>]*>))?', re.IGNORECASE), chap_head), # Have paragraphs show better (re.compile(r'<br.*?>'), lambda match : '<p>'), # Clean up spaces (re.compile(u'(?<=[\.,;\?!”"\'])[\s^ ]*(?=<)'), lambda match: ' '), - # Connect paragraphs split by - - (re.compile(u'(?<=[^\s][-–])[\s]*(</p>)*[\s]*(<p>)*\s*(?=[^\s])'), lambda match: ''), # Add space before and after italics (re.compile(u'(?<!“)<i>'), lambda match: ' <i>'), (re.compile(r'</i>(?=\w)'), lambda match: '</i> '), + ] # Fix Book Designer markup @@ -293,6 +298,13 @@ class HTMLPreProcessor(object): import traceback print 'Failed to parse remove_footer regexp' traceback.print_exc() + + # Make the more aggressive chapter marking regex optional with the preprocess option to reduce false positives + if getattr(self.extra_opts, 'preprocess_html', None): + if is_pdftohtml: + end_rules.append( + (re.compile(r'(?=<(/?br|p|hr))(<(/?br|p|hr)[^>]*)?>\s*(<(i|b)>(<(i|b)>)?)?\s*(?P<chap>([A-Z-\'"!]{3,})\s*(\d+|[A-Z]+(\s*[A-Z]+)?)?)\s*(</(i|b)>(</(i|b)>)?)?\s*(</?p[^>]*>|<br[^>]*>)\n?((?=(<i>)?\s*\w+(\s+\w+)?(</i>)?(<br[^>]*>|</?p[^>]*>))((?P<title>.*)(<br[^>]*>|</?p[^>]*>)))?'), chap_head), + ) if getattr(self.extra_opts, 
'unwrap_factor', 0.0) > 0.01: length = line_length('pdf', html, getattr(self.extra_opts, 'unwrap_factor')) diff --git a/src/calibre/ebooks/html/input.py b/src/calibre/ebooks/html/input.py index d57bfddd3e..35a8a1a9bc 100644 --- a/src/calibre/ebooks/html/input.py +++ b/src/calibre/ebooks/html/input.py @@ -494,7 +494,7 @@ class HTMLInput(InputFormatPlugin): if not hasattr(self, 'log'): from calibre.utils.logging import default_log self.log = default_log - self.log("********* Preprocessing HTML *********") + self.log("********* Preprocessing HTML - HTML Input plugin *********") # Detect Chapters to match the xpath in the GUI chapdetect = re.compile(r'(?=</?(br|p|span))(</?(br|p|span)[^>]*>)?\s*(?P<chap>(<(i|b)><(i|b)>|<(i|b)>)?(.?Chapter|Epilogue|Prologue|Book|Part|Dedication)\s*([\d\w-]+(\s\w+)?)?(</(i|b)></(i|b)>|</(i|b)>)?)(</?(p|br|span)[^>]*>)', re.IGNORECASE) html = chapdetect.sub('<h2>'+'\g<chap>'+'</h2>\n', html) diff --git a/src/calibre/ebooks/lit/input.py b/src/calibre/ebooks/lit/input.py index 9bf20fb1d4..f7bb0fbfd9 100644 --- a/src/calibre/ebooks/lit/input.py +++ b/src/calibre/ebooks/lit/input.py @@ -11,12 +11,14 @@ import re from calibre.customize.conversion import InputFormatPlugin from calibre.ebooks.conversion.preprocess import line_length + class LITInput(InputFormatPlugin): name = 'LIT Input' author = 'Marshall T. Vandegrift' description = 'Convert LIT files to HTML' file_types = set(['lit']) + html_preprocess_sections = 0 def convert(self, stream, options, file_ext, log, accelerators): @@ -55,14 +57,104 @@ class LITInput(InputFormatPlugin): def preprocess_html(self, html): + + def chapter_head(match): + chap = match.group('chap') + title = match.group('title') + if not title: + self.html_preprocess_sections = self.html_preprocess_sections + 1 + self.log("marked " + str(self.html_preprocess_sections) + " chapters. - " + str(chap)) + return '<h2>'+chap+'</h2>\n' + else: + self.html_preprocess_sections = self.html_preprocess_sections + 1 + self.log("marked " + str(self.html_preprocess_sections) + " chapters & titles. - " + str(chap) + ", " + str(title)) + return '<h2>'+chap+'</h2>\n<h3>'+title+'</h3>\n' + + def chapter_link(match): + chap = match.group('sectionlink') + if not chap: + self.html_preprocess_sections = self.html_preprocess_sections + 1 + self.log("marked " + str(self.html_preprocess_sections) + " section markers based on links") + return '<br style="page-break-before:always">' + else: + self.html_preprocess_sections = self.html_preprocess_sections + 1 + self.log("marked " + str(self.html_preprocess_sections) + " section markers based on links. - " + str(chap)) + return '<br clear="all" style="page-break-before:always">\n<h2>'+chap+'</h2>' + + + def no_markup(raw, percent): + ''' + Detects total marked up line endings in the file. raw is the text to + inspect. Percent is the minimum percent of line endings which should + be marked up to return true. 
+ ''' + htm_end_ere = re.compile('</p>', re.DOTALL) + line_end_ere = re.compile('(\n|\r|\r\n)', re.DOTALL) + htm_end = htm_end_ere.findall(raw) + line_end = line_end_ere.findall(raw) + tot_htm_ends = len(htm_end) + tot_ln_fds = len(line_end) + self.log("*** There are " + str(tot_ln_fds) + " total Line feeds, and " + str(tot_htm_ends) + " marked endings***") + + if percent > 1: + percent = 1 + if percent < 0: + percent = 0 + + min_lns = tot_ln_fds * percent + self.log("There must be more than " + str(min_lns) + " unmarked lines to be true") + if min_lns > tot_htm_ends: + return True + self.log("********* Preprocessing HTML *********") - # Detect Chapters to match the xpath in the GUI - chapdetect = re.compile(r'(?=</?(br|p|span))(</?(br|p|span)[^>]*>)?\s*(?P<chap>(<(i|b)><(i|b)>|<(i|b)>)?(.?Chapter|Epilogue|Prologue|Book|Part|Dedication)\s*([\d\w-]+(\s\w+)?)?(</(i|b)></(i|b)>|</(i|b)>)?)(</?(p|br|span)[^>]*>)', re.IGNORECASE) - html = chapdetect.sub('<h2>'+'\g<chap>'+'</h2>\n', html) - # Unwrap lines using punctation if the median length of all lines is less than 150 + # remove non-breaking spaces + html = re.sub(ur'\u00a0', ' ', html) + # Get rid of empty <o:p> tags to simplify other processing + html = re.sub(ur'\s*<o:p>\s*</o:p>', ' ', html) + # Get rid of empty span tags + html = re.sub(r"\s*<span[^>]*>\s*</span>", " ", html) + + # If more than 40% of the lines are empty paragraphs then delete them to clean up spacing + linereg = re.compile('(?<=<p).*?(?=</p>)', re.IGNORECASE) + blankreg = re.compile(r'\s*<p[^>]*>\s*(<(b|i|u)>)?\s*(</(b|i|u)>)?\s*</p>', re.IGNORECASE) + blanklines = blankreg.findall(html) + lines = linereg.findall(html) + if len(lines) > 1: + self.log("There are " + str(len(blanklines)) + " blank lines. " + str(float(len(blanklines)) / float(len(lines))) + " percent blank") + if float(len(blanklines)) / float(len(lines)) > 0.40: + self.log("deleting blank lines") + html = blankreg.sub('', html) + # Arrange line feeds and </p> tags so the line_length and no_markup functions work correctly + html = re.sub(r"\s*</p>", "</p>\n", html) + + # some lit files don't have any <p> tags or equivalent, check and + # mark up line endings if required before proceeding + if no_markup(html, 0.1): + self.log("not enough paragraph markers, adding now") + add_markup = re.compile('(?<!>)(\n)') + html = add_markup.sub('</p>\n<p>', html) + + # detect chapters/sections to match xpath or splitting logic # - # Insert extra line feeds so the line length regex functions properly - html = re.sub(r"</p>", "</p>\n", html) + # Mark split points based on embedded links + chaplink = re.compile(r'<a\sname[^>]*>\s*(<(i|b|u)>){0,2}\s*(<span[^>]*>)?\s*(?P<sectionlink>[^\s<]+(\s*[^\s<]+){0,4})?\s*(</span>)?\s*(</(i|b|u)>){0,2}\s*</a>', re.IGNORECASE) + html = chaplink.sub(chapter_link, html) + # Continue with alternate patterns, start with most typical chapter headings + if self.html_preprocess_sections < 10: + chapdetect = re.compile(r'(?=</?(br|p))(<(/?br|p)[^>]*>)\s*(<(i|b|u)>){0,2}\s*(<span[^>]*>)?\s*(?P<chap>(<(i|b|u)>){0,2}.?(\d+\.?|Chapter|Epilogue|Volume|Prologue|Book\s|Part\s|Dedication)\s*([\d\w-]+\s*){0,4}\s*(</(i|b|u)>){0,2})\s*(</span>)?\s*(</(i|b|u)>){0,2}\s*(</(p|/?br)>)\s*(<(/?br|p)[^>]*>\s*(<(i|b|u)>){0,2}\s*(<span[^>]*>)?\s*(?P<title>(<(i|b|u)>){0,2}(\s*[\w\'\"-]+){1,5}\s*(</(i|b|u)>){0,2})\s*(</span>)?\s*(</(i|b|u)>){0,2}\s*(</(br|p)>))?', re.IGNORECASE) + html = chapdetect.sub(chapter_head, html) + if self.html_preprocess_sections < 10: + self.log("not enough chapters, only " + 
str(self.html_preprocess_sections) + ", trying a more aggressive pattern") + chapdetect2 = re.compile(r'(?=</?(br|p))(<(/?br|p)[^>]*>)\s*(<(i|b|u)>){0,2}\s*(<span[^>]*>)?\s*(?P<chap>(<(i|b|u)>){0,2}\s*.?(([A-Z#]+\s*){1,9}|(CHAPTER\s*([\dA-Z\-\'\"\?\.!#,]+\s*){1,10}))\s*(</(i|b|u)>){0,2})\s*(</span>)?\s*(</(i|b|u)>){0,2}\s*(</(p|/?br)>)\s*(<(/?br|p)[^>]*>\s*(<(i|b|u)>){0,2}\s*(<span[^>]*>)?\s*(?P<title>(<(i|b|u)>){0,2}(\s*[\w\'\"-]+){1,5}\s*(</(i|b|u)>){0,2})\s*(</span>)?\s*(</(i|b|u)>){0,2}\s*(</(br|p)>))?', re.UNICODE) + html = chapdetect2.sub(chapter_head, html) + + # search for places where a first or second level heading is immediately followed by another + # top level heading. demote the second heading to h3 to prevent splitting between chapter + # headings and titles, images, etc + doubleheading = re.compile(r'(?P<firsthead><h(1|2)[^>]*>.+?</h(1|2)>\s*(<(?!h\d)[^>]*>\s*)*)<h(1|2)(?P<secondhead>[^>]*>.+?)</h(1|2)>', re.IGNORECASE) + html = doubleheading.sub('\g<firsthead>'+'<h3'+'\g<secondhead>'+'</h3>', html) + # + # Unwrap lines using punctation if the median length of all lines is less than 150 length = line_length('html', html, 0.4) self.log("*** Median length is " + str(length) + " ***") unwrap = re.compile(r"(?<=.{%i}[a-z,;:\IA])\s*</(span|p|div)>\s*(</(p|span|div)>)?\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)</(span|p|div)>\s*){0,3}\s*<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*" % length, re.UNICODE) diff --git a/src/calibre/ebooks/mobi/input.py b/src/calibre/ebooks/mobi/input.py index 487e70c04f..b8dc7a9560 100644 --- a/src/calibre/ebooks/mobi/input.py +++ b/src/calibre/ebooks/mobi/input.py @@ -3,6 +3,7 @@ __license__ = 'GPL 3' __copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>' __docformat__ = 'restructuredtext en' +import re from calibre.customize.conversion import InputFormatPlugin class MOBIInput(InputFormatPlugin): @@ -37,3 +38,12 @@ class MOBIInput(InputFormatPlugin): include_meta_content_type=False)) accelerators['pagebreaks'] = '//h:div[@class="mbp_pagebreak"]' return mr.created_opf_path + + def preprocess_html(self, html): + # search for places where a first or second level heading is immediately followed by another + # top level heading. demote the second heading to h3 to prevent splitting between chapter + # headings and titles, images, etc + doubleheading = re.compile(r'(?P<firsthead><h(1|2)[^>]*>.+?</h(1|2)>\s*(<(?!h\d)[^>]*>\s*)*)<h(1|2)(?P<secondhead>[^>]*>.+?)</h(1|2)>', re.IGNORECASE) + html = doubleheading.sub('\g<firsthead>'+'<h3'+'\g<secondhead>'+'</h3>', html) + return html + diff --git a/src/calibre/ebooks/pdf/reflow.py b/src/calibre/ebooks/pdf/reflow.py index 584d631d0b..36848ddb8b 100644 --- a/src/calibre/ebooks/pdf/reflow.py +++ b/src/calibre/ebooks/pdf/reflow.py @@ -408,6 +408,10 @@ class Page(object): # Fraction of text height that two strings' bottoms can differ by # for them to be considered to be part of the same text fragment LINE_FACTOR = 0.4 + + # Percentage of the page height which should be considered header + # or footer to be discarded from reflow considerations + HEAD_FOOTER_MARGIN # Multiplies the average line height when determining row height # of a particular element to detect columns.
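The chapter-detection machinery that patches 01 and 02 keep refining reduces to a single mechanism: a regular expression with named groups (chap, and optionally title) drives a substitution callback that rewrites each match as heading tags. A minimal runnable Python sketch of that mechanism follows; the simplified pattern is illustrative only, not the production regex from preprocess.py:

    import re

    def chap_head(match):
        # Promote the detected chapter text to <h2>; when a following title
        # paragraph was captured, emit it as <h3> so TOC and split logic see both.
        chap = match.group('chap')
        title = match.group('title')
        if title:
            return '<h2>%s</h2>\n<h3>%s</h3>\n' % (chap, title)
        return '<h2>%s</h2>\n' % chap

    # Hypothetical, simplified pattern: 'Chapter N' in its own paragraph,
    # optionally followed by a one-paragraph title.
    pattern = re.compile(
        r'<p>\s*(?P<chap>Chapter\s+\d+)\s*</p>\s*'
        r'(?:<p>\s*(?P<title>[^<]+?)\s*</p>)?', re.IGNORECASE)

    sample = '<p>Chapter 1</p><p>The Beginning</p><p>It was a dark night.</p>'
    print(pattern.sub(chap_head, sample))
    # -> <h2>Chapter 1</h2>
    #    <h3>The Beginning</h3>
    #    <p>It was a dark night.</p>

The production patterns additionally tolerate <br>-delimited markup, optional <i>/<b>/<u> wrappers and longer keyword lists (Epilogue, Prologue, Dedication, ...), which is why they grow so large.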
From 619ea1f5c258558067996e3902ebafe065dc1737 Mon Sep 17 00:00:00 2001 From: Timothy Legge <timlegge@gmail.com> Date: Fri, 10 Sep 2010 19:08:18 -0300 Subject: [PATCH 03/43] Move the lookup of content id by extension to a function - cleanup/preparation for device collections --- src/calibre/devices/kobo/driver.py | 29 ++++++++++++++--------------- 1 file changed, 14 insertions(+), 15 deletions(-) diff --git a/src/calibre/devices/kobo/driver.py b/src/calibre/devices/kobo/driver.py index 5e1c752c76..04c4256a49 100644 --- a/src/calibre/devices/kobo/driver.py +++ b/src/calibre/devices/kobo/driver.py @@ -231,21 +231,9 @@ class KOBO(USBMS): path = self.normalize_path(path) # print "Delete file normalized path: " + path extension = os.path.splitext(path)[1] - - if extension == '.kobo': - # Kobo books do not have book files. They do have some images though - #print "kobo book" - ContentType = 6 - ContentID = self.contentid_from_path(path, ContentType) - elif extension == '.pdf' or extension == '.epub': - # print "ePub or pdf" - ContentType = 16 - #print "Path: " + path - ContentID = self.contentid_from_path(path, ContentType) - # print "ContentID: " + ContentID - else: # if extension == '.html' or extension == '.txt': - ContentType = 999 # Yet another hack: to get around Kobo changing how ContentID is stored - ContentID = self.contentid_from_path(path, ContentType) + ContentType = self.get_content_type_from_extension(extension) + + ContentID = self.contentid_from_path(path, ContentType) ImageID = self.delete_via_sql(ContentID, ContentType) #print " We would now delete the Images for" + ImageID @@ -343,6 +331,17 @@ class KOBO(USBMS): ContentID = ContentID.replace("\\", '/') return ContentID + def get_content_type_from_extension(self, extension): + if extension == '.kobo': + # Kobo books do not have book files. 
They do have some images though + #print "kobo book" + ContentType = 6 + elif extension == '.pdf' or extension == '.epub': + # print "ePub or pdf" + ContentType = 16 + else: # if extension == '.html' or extension == '.txt': + ContentType = 999 # Yet another hack: to get around Kobo changing how ContentID is stored + return ContentType def path_from_contentid(self, ContentID, ContentType, oncard): path = ContentID From b1b099d7e8fc9484eca0ee0b197334969b30d312 Mon Sep 17 00:00:00 2001 From: Timothy Legge <timlegge@gmail.com> Date: Fri, 10 Sep 2010 19:12:52 -0300 Subject: [PATCH 04/43] Fix bug processing kobo books introduced in 6084.1.4 - no one mentioned it so it must not have caused any real issues --- src/calibre/devices/kobo/driver.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/calibre/devices/kobo/driver.py b/src/calibre/devices/kobo/driver.py index 04c4256a49..9bed6bf0bf 100644 --- a/src/calibre/devices/kobo/driver.py +++ b/src/calibre/devices/kobo/driver.py @@ -106,7 +106,10 @@ class KOBO(USBMS): changed = True bl[idx].device_collections = playlist_map.get(lpath, []) else: - book = self.book_from_path(prefix, lpath, title, authors, mime, date, ContentType, ImageID) + if ContentType == '6': + book = Book(prefix, lpath, title, authors, mime, date, ContentType, ImageID, size=1048576) + else: + book = self.book_from_path(prefix, lpath, title, authors, mime, date, ContentType, ImageID) # print 'Update booklist' if bl.add_book(book, replace_metadata=False): changed = True From 0fe81939ea5697b8b66863fb37ca6b5ab33ef5f6 Mon Sep 17 00:00:00 2001 From: Timothy Legge <timlegge@gmail.com> Date: Fri, 10 Sep 2010 20:53:26 -0300 Subject: [PATCH 05/43] Fix long standing bug in date shown for kobo books --- src/calibre/devices/kobo/books.py | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/src/calibre/devices/kobo/books.py b/src/calibre/devices/kobo/books.py index 9da99d75c8..496162d668 100644 --- a/src/calibre/devices/kobo/books.py +++ b/src/calibre/devices/kobo/books.py @@ -44,16 +44,17 @@ class Book(MetaInformation): self.mime = mime self.size = size # will be set later if None - try: - if ContentType == '6': - self.datetime = time.strptime(date, "%Y-%m-%dT%H:%M:%S.%f") - else: - self.datetime = time.gmtime(os.path.getctime(self.path)) - except: - self.datetime = time.gmtime() - if thumbnail_name is not None: - self.thumbnail = ImageWrapper(thumbnail_name) + if ContentType == '6': + self.datetime = time.strptime(date, "%Y-%m-%dT%H:%M:%S.%f") + else: + try: + self.datetime = time.gmtime(os.path.getctime(self.path)) + except: + self.datetime = time.gmtime() + + if thumbnail_name is not None: + self.thumbnail = ImageWrapper(thumbnail_name) self.tags = [] if other: self.smart_update(other) From fd7f23f7c2d6d2ec2594f48d02922ba7a00d01c7 Mon Sep 17 00:00:00 2001 From: Timothy Legge <timlegge@gmail.com> Date: Fri, 10 Sep 2010 22:13:52 -0300 Subject: [PATCH 06/43] Update book.device_collections before add_book is called --- src/calibre/devices/kobo/driver.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/calibre/devices/kobo/driver.py b/src/calibre/devices/kobo/driver.py index 9bed6bf0bf..f24e00143b 100644 --- a/src/calibre/devices/kobo/driver.py +++ b/src/calibre/devices/kobo/driver.py @@ -111,9 +111,9 @@ class KOBO(USBMS): else: book = self.book_from_path(prefix, lpath, title, authors, mime, date, ContentType, ImageID) # print 'Update booklist' + book.device_collections = playlist_map.get(book.lpath, []) if 
bl.add_book(book, replace_metadata=False): changed = True - book.device_collections = playlist_map.get(book.lpath, []) except: # Probably a path encoding error import traceback traceback.print_exc() From d1bbd2c498f528d6d7bd53fb65f5909987bb8573 Mon Sep 17 00:00:00 2001 From: Kovid Goyal <kovid@kovidgoyal.net> Date: Fri, 10 Sep 2010 19:34:30 -0600 Subject: [PATCH 07/43] Fix #6764 (maximum_recursion_depth on shorten_components_to (function recursively calls itself on the same data)) --- src/calibre/utils/filenames.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/calibre/utils/filenames.py b/src/calibre/utils/filenames.py index 01eb9f30a0..9fd57ab53c 100644 --- a/src/calibre/utils/filenames.py +++ b/src/calibre/utils/filenames.py @@ -55,6 +55,9 @@ def shorten_components_to(length, components): else: if x is components[-1]: b, _, e = x.rpartition('.') + if not b and e: + b = e + e = '' r = b[:-delta]+e if r.startswith('.'): r = x[0]+r else: From 4c7373026b9ee8a618dccf8602740d6a7d578aa2 Mon Sep 17 00:00:00 2001 From: ldolse <ldolse@yahoo.com> Date: Sat, 11 Sep 2010 12:10:49 +1000 Subject: [PATCH 08/43] preprocessing changes for lit & pdf, added utils.py, changed default unwrap_factor --- src/calibre/ebooks/conversion/preprocess.py | 15 ++++++++--- src/calibre/ebooks/conversion/utils.py | 6 +++++ src/calibre/ebooks/lit/input.py | 29 +++++++++++++-------- src/calibre/ebooks/pdf/input.py | 4 +-- 4 files changed, 37 insertions(+), 17 deletions(-) create mode 100644 src/calibre/ebooks/conversion/utils.py diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py index 2954fd7c26..452a322d95 100644 --- a/src/calibre/ebooks/conversion/preprocess.py +++ b/src/calibre/ebooks/conversion/preprocess.py @@ -77,6 +77,7 @@ def line_length(format, raw, percent): elif format == 'pdf': linere = re.compile('(?<=<br>).*?(?=<br>)', re.DOTALL) lines = linere.findall(raw) + print "percent is " + str(percent) lengths = [] for line in lines: @@ -165,6 +166,11 @@ class HTMLPreProcessor(object): (re.compile(u'`\s*(<br.*?>)*\s*I', re.UNICODE), lambda match: u'Ì'), (re.compile(u'`\s*(<br.*?>)*\s*a', re.UNICODE), lambda match: u'à'), (re.compile(u'`\s*(<br.*?>)*\s*A', re.UNICODE), lambda match: u'À'), + + #(re.compile(u'a\s*(<br.*?>)*\s*`', re.UNICODE), lambda match: u'à'), + #(re.compile(u'A\s*(<br.*?>)*\s*`', re.UNICODE), lambda match: u'À'), + #(re.compile(u'o\s*(<br.*?>)*\s*`', re.UNICODE), lambda match: u'ò'), + #(re.compile(u'O\s*(<br.*?>)*\s*`', re.UNICODE), lambda match: u'Ò'), (re.compile(u'´\s*(<br.*?>)*\s*o', re.UNICODE), lambda match: u'ó'), (re.compile(u'´\s*(<br.*?>)*\s*O', re.UNICODE), lambda match: u'Ó'), @@ -206,13 +212,13 @@ class HTMLPreProcessor(object): # (re.compile(r'<br>\s*<br>', re.IGNORECASE), lambda match: '\n<p>'), # unwrap hyphenation - don't delete the hyphen (often doesn't split words) - (re.compile(r'(?<=[-–])\s*<br>\s*(?=[[a-z\d])'), lambda match: ''), + (re.compile(u'(?<=[-–—])\s*<br>\s*(?=[[a-z\d])'), lambda match: ''), # Remove gray background (re.compile(r'<BODY[^<>]+>'), lambda match : '<BODY>'), # Detect Chapters to match default XPATH in GUI - (re.compile(r'(?=<(/?br|p))(<(/?br|p)[^>]*)?>\s*(?P<chap>(<(i|b)>(<(i|b)>)?)?(.?Chapter|Epilogue|Prologue|Book|Part|Dedication|Volume|Preface|Acknowledgments)\s*([\d\w-]+(\s\w+)?)?\s*(</(i|b)>(</(i|b)>)?)?)\s*(</?(br|p)[^>]*>\s*){1,3}\s*(?P<title>(<(i|b)>)?(\s*\w+){1,4}\s*(</(i|b)>)?\s*(</?(br|p)[^>]*>))?', re.IGNORECASE), chap_head), + 
(re.compile(r'(?=<(/?br|p))(<(/?br|p)[^>]*)?>\s*(?P<chap>(<(i|b)>(<(i|b)>)?)?.?(Introduction|Chapter|Epilogue|Prologue|Book|Part|Dedication|Volume|Preface|Acknowledgments)\s*([\d\w-]+\s*){0,3}\s*(</(i|b)>(</(i|b)>)?)?)\s*(</?(br|p)[^>]*>\s*){1,3}\s*(?P<title>(<(i|b)>)?(\s*\w+){1,4}\s*(</(i|b)>)?\s*(</?(br|p)[^>]*>))?', re.IGNORECASE), chap_head), # Have paragraphs show better (re.compile(r'<br.*?>'), lambda match : '<p>'), @@ -303,15 +309,16 @@ class HTMLPreProcessor(object): if getattr(self.extra_opts, 'preprocess_html', None): if is_pdftohtml: end_rules.append( - (re.compile(r'(?=<(/?br|p|hr))(<(/?br|p|hr)[^>]*)?>\s*(<(i|b)>(<(i|b)>)?)?\s*(?P<chap>([A-Z-\'"!]{3,})\s*(\d+|[A-Z]+(\s*[A-Z]+)?)?)\s*(</(i|b)>(</(i|b)>)?)?\s*(</?p[^>]*>|<br[^>]*>)\n?((?=(<i>)?\s*\w+(\s+\w+)?(</i>)?(<br[^>]*>|</?p[^>]*>))((?P<title>.*)(<br[^>]*>|</?p[^>]*>)))?'), chap_head), + (re.compile(r'(?=<(/?br|p|hr))(<(/?br|p|hr)[^>]*)?>\s*(<(i|b)>(<(i|b)>)?)?\s*(?P<chap>([A-Z-\'"!]{3,})\s*(\d+|[A-Z]+(\s*[A-Z]+)?)?|\d+\.?\s*([\d\w-]+\s*){0,4}\s*)\s*(</(i|b)>(</(i|b)>)?)?\s*(</?p[^>]*>|<br[^>]*>)\n?((?=(<i>)?\s*\w+(\s+\w+)?(</i>)?(<br[^>]*>|</?p[^>]*>))((?P<title>.*)(<br[^>]*>|</?p[^>]*>)))?'), chap_head), ) if getattr(self.extra_opts, 'unwrap_factor', 0.0) > 0.01: length = line_length('pdf', html, getattr(self.extra_opts, 'unwrap_factor')) if length: + print "The pdf line length returned is " + str(length) end_rules.append( # Un wrap using punctuation - (re.compile(r'(?<=.{%i}[a-z\.,;:)\-IA])\s*(?P<ital></(i|b|u)>)?\s*(<p.*?>)\s*(?=(<(i|b|u)>)?\s*[\w\d(])' % length, re.UNICODE), wrap_lines), + (re.compile(r'(?<=.{%i}[a-z,;:)\-IA])\s*(?P<ital></(i|b|u)>)?\s*(<p.*?>)\s*(?=(<(i|b|u)>)?\s*[\w\d(])' % length, re.UNICODE), wrap_lines), ) for rule in self.PREPROCESS + start_rules: diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py new file mode 100644 index 0000000000..52be473372 --- /dev/null +++ b/src/calibre/ebooks/conversion/utils.py @@ -0,0 +1,6 @@ +#!/usr/bin/env python +# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai + +__license__ = 'GPL v3' +__copyright__ = '2010, Kovid Goyal <kovid@kovidgoyal.net>' +__docformat__ = 'restructuredtext en' \ No newline at end of file diff --git a/src/calibre/ebooks/lit/input.py b/src/calibre/ebooks/lit/input.py index f7bb0fbfd9..35dad501be 100644 --- a/src/calibre/ebooks/lit/input.py +++ b/src/calibre/ebooks/lit/input.py @@ -102,7 +102,7 @@ class LITInput(InputFormatPlugin): percent = 0 min_lns = tot_ln_fds * percent - self.log("There must be more than " + str(min_lns) + " unmarked lines to be true") + self.log("There must be more than " + str(min_lns) + " unmarked lines to return true") if min_lns > tot_htm_ends: return True @@ -141,24 +141,31 @@ class LITInput(InputFormatPlugin): html = chaplink.sub(chapter_link, html) # Continue with alternate patterns, start with most typical chapter headings if self.html_preprocess_sections < 10: - chapdetect = re.compile(r'(?=</?(br|p))(<(/?br|p)[^>]*>)\s*(<(i|b|u)>){0,2}\s*(<span[^>]*>)?\s*(?P<chap>(<(i|b|u)>){0,2}.?(\d+\.?|Chapter|Epilogue|Volume|Prologue|Book\s|Part\s|Dedication)\s*([\d\w-]+\s*){0,4}\s*(</(i|b|u)>){0,2})\s*(</span>)?\s*(</(i|b|u)>){0,2}\s*(</(p|/?br)>)\s*(<(/?br|p)[^>]*>\s*(<(i|b|u)>){0,2}\s*(<span[^>]*>)?\s*(?P<title>(<(i|b|u)>){0,2}(\s*[\w\'\"-]+){1,5}\s*(</(i|b|u)>){0,2})\s*(</span>)?\s*(</(i|b|u)>){0,2}\s*(</(br|p)>))?', re.IGNORECASE) + chapdetect = 
re.compile(r'(?=</?(br|p))(<(/?br|p)[^>]*>)\s*(<(i|b|u)>){0,2}\s*(<span[^>]*>)?\s*(?P<chap>(<(i|b|u)>){0,2}s*(<span[^>]*>)?\s*.?(\d+\.?|Introduction|Acknowledgements|Chapter|Epilogue|Volume|Prologue|Book\s|Part\s|Dedication)\s*([\d\w-]+\:?\s*){0,8}\s*(</(i|b|u)>){0,2})\s*(</span>)?s*(</span>)?\s*(</(i|b|u)>){0,2}\s*(</(p|/?br)>)\s*(<(/?br|p)[^>]*>\s*(<(i|b|u)>){0,2}\s*(<span[^>]*>)?\s*(?P<title>(<(i|b|u)>){0,2}(\s*[\w\'\"-]+){1,5}\s*(</(i|b|u)>){0,2})\s*(</span>)?\s*(</(i|b|u)>){0,2}\s*(</(br|p)>))?', re.IGNORECASE) html = chapdetect.sub(chapter_head, html) if self.html_preprocess_sections < 10: self.log("not enough chapters, only " + str(self.html_preprocess_sections) + ", trying a more aggressive pattern") chapdetect2 = re.compile(r'(?=</?(br|p))(<(/?br|p)[^>]*>)\s*(<(i|b|u)>){0,2}\s*(<span[^>]*>)?\s*(?P<chap>(<(i|b|u)>){0,2}\s*.?(([A-Z#]+\s*){1,9}|(CHAPTER\s*([\dA-Z\-\'\"\?\.!#,]+\s*){1,10}))\s*(</(i|b|u)>){0,2})\s*(</span>)?\s*(</(i|b|u)>){0,2}\s*(</(p|/?br)>)\s*(<(/?br|p)[^>]*>\s*(<(i|b|u)>){0,2}\s*(<span[^>]*>)?\s*(?P<title>(<(i|b|u)>){0,2}(\s*[\w\'\"-]+){1,5}\s*(</(i|b|u)>){0,2})\s*(</span>)?\s*(</(i|b|u)>){0,2}\s*(</(br|p)>))?', re.UNICODE) - html = chapdetect2.sub(chapter_head, html) - + html = chapdetect2.sub(chapter_head, html) + # + # Unwrap lines using punctation if the median length of all lines is less than 150 + length = line_length('html', html, 0.4) + self.log("*** Median line length is " + str(length) + " ***") + unwrap = re.compile(r"(?<=.{%i}[a-z,;:\IA])\s*</(span|p|div)>\s*(</(p|span|div)>)?\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)</(span|p|div)>\s*){0,3}\s*<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*" % length, re.UNICODE) + if length < 150: + self.log("Unwrapping Lines") + html = unwrap.sub(' ', html) + # If still no sections after unwrapping lines break on lines with no punctuation + if self.html_preprocess_sections < 10: + self.log("not enough chapters, only " + str(self.html_preprocess_sections) + ", splitting based on punctuation") + #self.log(html) + chapdetect3 = re.compile(r'(<p[^>]*>)\s*(<(i|b|u)>){0,2}\s*(<span[^>]*>)?\s*(?P<chap>(<(i|b|u)>){0,2}\s*.?([a-z]+\s*){1,5}\s*(</(i|b|u)>){0,2})\s*(</span>)?\s*(</(i|b|u)>){0,2}\s*(</p>)(?P<title>)?', re.IGNORECASE) + html = chapdetect3.sub(chapter_head, html) # search for places where a first or second level heading is immediately followed by another # top level heading. 
demote the second heading to h3 to prevent splitting between chapter # headings and titles, images, etc doubleheading = re.compile(r'(?P<firsthead><h(1|2)[^>]*>.+?</h(1|2)>\s*(<(?!h\d)[^>]*>\s*)*)<h(1|2)(?P<secondhead>[^>]*>.+?)</h(1|2)>', re.IGNORECASE) html = doubleheading.sub('\g<firsthead>'+'<h3'+'\g<secondhead>'+'</h3>', html) - # - # Unwrap lines using punctation if the median length of all lines is less than 150 - length = line_length('html', html, 0.4) - self.log("*** Median length is " + str(length) + " ***") - unwrap = re.compile(r"(?<=.{%i}[a-z,;:\IA])\s*</(span|p|div)>\s*(</(p|span|div)>)?\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)</(span|p|div)>\s*){0,3}\s*<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*" % length, re.UNICODE) - if length < 150: - html = unwrap.sub(' ', html) + return html diff --git a/src/calibre/ebooks/pdf/input.py b/src/calibre/ebooks/pdf/input.py index 64a089281e..113c3d99d8 100644 --- a/src/calibre/ebooks/pdf/input.py +++ b/src/calibre/ebooks/pdf/input.py @@ -22,10 +22,10 @@ class PDFInput(InputFormatPlugin): options = set([ OptionRecommendation(name='no_images', recommended_value=False, help=_('Do not extract images from the document')), - OptionRecommendation(name='unwrap_factor', recommended_value=0.5, + OptionRecommendation(name='unwrap_factor', recommended_value=0.45, help=_('Scale used to determine the length at which a line should ' 'be unwrapped. Valid values are a decimal between 0 and 1. The ' - 'default is 0.5, this is the median line length.')), + 'default is 0.45, this is the median line length.')), OptionRecommendation(name='new_pdf_engine', recommended_value=False, help=_('Use the new PDF conversion engine.')) ]) From faf15b2f3d611594352721d4d06407025fea1320 Mon Sep 17 00:00:00 2001 From: ldolse <ldolse@yahoo.com> Date: Sat, 11 Sep 2010 13:09:23 +1000 Subject: [PATCH 09/43] preprocess merge gone wrong, fixing --- src/calibre/ebooks/conversion/preprocess.py | 25 ++++++--------------- 1 file changed, 7 insertions(+), 18 deletions(-) diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py index e2364d961f..24a389e65c 100644 --- a/src/calibre/ebooks/conversion/preprocess.py +++ b/src/calibre/ebooks/conversion/preprocess.py @@ -168,7 +168,6 @@ class HTMLPreProcessor(object): (re.compile(u'`\s*(<br.*?>)*\s*O', re.UNICODE), lambda match: u'Ò'), (re.compile(u'`\s*(<br.*?>)*\s*u', re.UNICODE), lambda match: u'ù'), (re.compile(u'`\s*(<br.*?>)*\s*U', re.UNICODE), lambda match: u'Ù'), -<<<<<<< TREE (re.compile(u'`\s*(<br.*?>)*\s*e', re.UNICODE), lambda match: u'è'), (re.compile(u'`\s*(<br.*?>)*\s*E', re.UNICODE), lambda match: u'È'), (re.compile(u'`\s*(<br.*?>)*\s*i', re.UNICODE), lambda match: u'ì'), @@ -176,13 +175,6 @@ class HTMLPreProcessor(object): (re.compile(u'`\s*(<br.*?>)*\s*a', re.UNICODE), lambda match: u'à'), (re.compile(u'`\s*(<br.*?>)*\s*A', re.UNICODE), lambda match: u'À'), - #(re.compile(u'a\s*(<br.*?>)*\s*`', re.UNICODE), lambda match: u'à'), - #(re.compile(u'A\s*(<br.*?>)*\s*`', re.UNICODE), lambda match: u'À'), - #(re.compile(u'o\s*(<br.*?>)*\s*`', re.UNICODE), lambda match: u'ò'), - #(re.compile(u'O\s*(<br.*?>)*\s*`', re.UNICODE), lambda match: u'Ò'), -======= ->>>>>>> MERGE-SOURCE - # ´ (re.compile(u'´\s*(<br.*?>)*\s*a', re.UNICODE), lambda match: u'á'), (re.compile(u'´\s*(<br.*?>)*\s*A', re.UNICODE), lambda match: u'Á'), @@ -218,14 +210,7 @@ class HTMLPreProcessor(object): # ¸ (re.compile(u'¸\s*(<br.*?>)*\s*c', re.UNICODE), lambda match: 
u'ç'), (re.compile(u'¸\s*(<br.*?>)*\s*C', re.UNICODE), lambda match: u'Ç'), - -<<<<<<< TREE - # If pdf printed from a browser then the header/footer has a reliable pattern - (re.compile(r'((?<=</a>)\s*file:////?[A-Z].*<br>|file:////?[A-Z].*<br>(?=\s*<hr>))', re.IGNORECASE), lambda match: ''), - - # Center separator lines - (re.compile(u'<br>\s*(?P<break>([*#•]+\s*)+)\s*<br>'), lambda match: '<p>\n<p style="text-align:center">' + match.group(1) + '</p>'), -======= + # ˛ (re.compile(u'˛\s*(<br.*?>)*\s*a', re.UNICODE), lambda match: u'ą'), (re.compile(u'˛\s*(<br.*?>)*\s*A', re.UNICODE), lambda match: u'Ą'), @@ -235,8 +220,12 @@ class HTMLPreProcessor(object): # ˙ (re.compile(u'˙\s*(<br.*?>)*\s*z', re.UNICODE), lambda match: u'ż'), (re.compile(u'˙\s*(<br.*?>)*\s*Z', re.UNICODE), lambda match: u'Ż'), - ->>>>>>> MERGE-SOURCE + + # If pdf printed from a browser then the header/footer has a reliable pattern + (re.compile(r'((?<=</a>)\s*file:////?[A-Z].*<br>|file:////?[A-Z].*<br>(?=\s*<hr>))', re.IGNORECASE), lambda match: ''), + + # Center separator lines + (re.compile(u'<br>\s*(?P<break>([*#•]+\s*)+)\s*<br>'), lambda match: '<p>\n<p style="text-align:center">' + match.group(1) + '</p>'), # Remove page links (re.compile(r'<a name=\d+></a>', re.IGNORECASE), lambda match: ''), From 2a906184ad4c56d3018806c03bf2647bd8ecc242 Mon Sep 17 00:00:00 2001 From: ldolse <ldolse@yahoo.com> Date: Sat, 11 Sep 2010 13:17:21 +1000 Subject: [PATCH 10/43] preprocess merge gone wrong, merged original accent code back --- src/calibre/ebooks/conversion/preprocess.py | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py index 24a389e65c..f2b19efa9b 100644 --- a/src/calibre/ebooks/conversion/preprocess.py +++ b/src/calibre/ebooks/conversion/preprocess.py @@ -168,13 +168,7 @@ class HTMLPreProcessor(object): (re.compile(u'`\s*(<br.*?>)*\s*O', re.UNICODE), lambda match: u'Ò'), (re.compile(u'`\s*(<br.*?>)*\s*u', re.UNICODE), lambda match: u'ù'), (re.compile(u'`\s*(<br.*?>)*\s*U', re.UNICODE), lambda match: u'Ù'), - (re.compile(u'`\s*(<br.*?>)*\s*e', re.UNICODE), lambda match: u'è'), - (re.compile(u'`\s*(<br.*?>)*\s*E', re.UNICODE), lambda match: u'È'), - (re.compile(u'`\s*(<br.*?>)*\s*i', re.UNICODE), lambda match: u'ì'), - (re.compile(u'`\s*(<br.*?>)*\s*I', re.UNICODE), lambda match: u'Ì'), - (re.compile(u'`\s*(<br.*?>)*\s*a', re.UNICODE), lambda match: u'à'), - (re.compile(u'`\s*(<br.*?>)*\s*A', re.UNICODE), lambda match: u'À'), - + # ´ (re.compile(u'´\s*(<br.*?>)*\s*a', re.UNICODE), lambda match: u'á'), (re.compile(u'´\s*(<br.*?>)*\s*A', re.UNICODE), lambda match: u'Á'), @@ -210,7 +204,7 @@ class HTMLPreProcessor(object): # ¸ (re.compile(u'¸\s*(<br.*?>)*\s*c', re.UNICODE), lambda match: u'ç'), (re.compile(u'¸\s*(<br.*?>)*\s*C', re.UNICODE), lambda match: u'Ç'), - + # ˛ (re.compile(u'˛\s*(<br.*?>)*\s*a', re.UNICODE), lambda match: u'ą'), (re.compile(u'˛\s*(<br.*?>)*\s*A', re.UNICODE), lambda match: u'Ą'), @@ -221,6 +215,7 @@ class HTMLPreProcessor(object): (re.compile(u'˙\s*(<br.*?>)*\s*z', re.UNICODE), lambda match: u'ż'), (re.compile(u'˙\s*(<br.*?>)*\s*Z', re.UNICODE), lambda match: u'Ż'), + # If pdf printed from a browser then the header/footer has a reliable pattern (re.compile(r'((?<=</a>)\s*file:////?[A-Z].*<br>|file:////?[A-Z].*<br>(?=\s*<hr>))', re.IGNORECASE), lambda match: ''), From 480eccb0b0c3921fd356d329e6d601b9207c2d26 Mon Sep 17 00:00:00 2001 From: ldolse <ldolse@yahoo.com> Date: Sat, 11 Sep 2010 
15:33:10 +1000 Subject: [PATCH 11/43] Fixed unwrapping for various hyphen and dash types, other minor tweaks to pdf --- src/calibre/ebooks/conversion/preprocess.py | 28 ++++++++++++++------- 1 file changed, 19 insertions(+), 9 deletions(-) diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py index f2b19efa9b..c120f0a560 100644 --- a/src/calibre/ebooks/conversion/preprocess.py +++ b/src/calibre/ebooks/conversion/preprocess.py @@ -77,7 +77,6 @@ def line_length(format, raw, percent): elif format == 'pdf': linere = re.compile('(?<=<br>).*?(?=<br>)', re.DOTALL) lines = linere.findall(raw) - print "percent is " + str(percent) lengths = [] for line in lines: @@ -230,14 +229,17 @@ class HTMLPreProcessor(object): # (re.compile(r'<br>\s*<br>', re.IGNORECASE), lambda match: '\n<p>'), # unwrap hyphenation - don't delete the hyphen (often doesn't split words) - (re.compile(u'(?<=[-–—])\s*<br>\s*(?=[[a-z\d])'), lambda match: ''), + #(re.compile(u'(?<=[-–—])\s*<br>\s*(?=[[a-z\d])'), lambda match: ''), + # unwrap/delete soft hyphens + #(re.compile(u'[­]\s*<br>\s*(?=[[a-z\d])'), lambda match: ''), # Remove gray background (re.compile(r'<BODY[^<>]+>'), lambda match : '<BODY>'), # Detect Chapters to match default XPATH in GUI (re.compile(r'(?=<(/?br|p))(<(/?br|p)[^>]*)?>\s*(?P<chap>(<(i|b)>(<(i|b)>)?)?.?(Introduction|Chapter|Epilogue|Prologue|Book|Part|Dedication|Volume|Preface|Acknowledgments)\s*([\d\w-]+\s*){0,3}\s*(</(i|b)>(</(i|b)>)?)?)\s*(</?(br|p)[^>]*>\s*){1,3}\s*(?P<title>(<(i|b)>)?(\s*\w+){1,4}\s*(</(i|b)>)?\s*(</?(br|p)[^>]*>))?', re.IGNORECASE), chap_head), - + (re.compile(r'<br\s*/?>\s*(?P<chap>([A-Z]\s+){4,}\s*([\d\w-]+\s*){0,3}\s*)\s*(<br>\s*){1,3}\s*(?P<title>(<(i|b)>)?(\s*\w+){1,4}\s*(</(i|b)>)?\s*(</?(br|p)[^>]*>))?'), chap_head), + # Have paragraphs show better (re.compile(r'<br.*?>'), lambda match : '<p>'), # Clean up spaces @@ -322,21 +324,29 @@ class HTMLPreProcessor(object): import traceback print 'Failed to parse remove_footer regexp' traceback.print_exc() + + # unwrap hyphenation - moved here so it's executed after header/footer removal + if is_pdftohtml: + # unwrap visible dashes and hyphens - don't delete as 50% or more of the time these + # hyphens are for compound words, formatting, etc + end_rules.append((re.compile(u'(?<=[-–—])\s*<p>\s*(?=[[a-z\d])'), lambda match: '')) + # unwrap/delete soft hyphens + end_rules.append((re.compile(u'[­](\s*<p>)+\s*(?=[[a-z\d])'), lambda match: '')) + # unwrap/delete soft hyphens with formatting + end_rules.append((re.compile(u'[­]\s*(</(i|u|b)>)+(\s*<p>)+\s*(<(i|u|b)>)+\s*(?=[[a-z\d])'), lambda match: '')) # Make the more aggressive chapter marking regex optional with the preprocess option to reduce false positives if getattr(self.extra_opts, 'preprocess_html', None): if is_pdftohtml: - end_rules.append( - (re.compile(r'(?=<(/?br|p|hr))(<(/?br|p|hr)[^>]*)?>\s*(<(i|b)>(<(i|b)>)?)?\s*(?P<chap>([A-Z-\'"!]{3,})\s*(\d+|[A-Z]+(\s*[A-Z]+)?)?|\d+\.?\s*([\d\w-]+\s*){0,4}\s*)\s*(</(i|b)>(</(i|b)>)?)?\s*(</?p[^>]*>|<br[^>]*>)\n?((?=(<i>)?\s*\w+(\s+\w+)?(</i>)?(<br[^>]*>|</?p[^>]*>))((?P<title>.*)(<br[^>]*>|</?p[^>]*>)))?'), chap_head), - ) - + end_rules.append((re.compile(r'(?=<(/?br|p|hr))(<(/?br|p|hr)[^>]*)?>\s*(<(i|b)>(<(i|b)>)?)?\s*(?P<chap>([A-Z-\'"!]{3,})\s*(\d+|[A-Z]+(\s*[A-Z]+)?)?|\d+\.?\s*([\d\w-]+\s*){0,4}\s*)\s*(</(i|b)>(</(i|b)>)?)?\s*(</?p[^>]*>|<br[^>]*>)\n?((?=(<i>)?\s*\w+(\s+\w+)?(</i>)?(<br[^>]*>|</?p[^>]*>))((?P<title>.*)(<br[^>]*>|</?p[^>]*>)))?'), chap_head)) + if 
getattr(self.extra_opts, 'unwrap_factor', 0.0) > 0.01: length = line_length('pdf', html, getattr(self.extra_opts, 'unwrap_factor')) if length: - print "The pdf line length returned is " + str(length) + # print "The pdf line length returned is " + str(length) end_rules.append( # Un wrap using punctuation - (re.compile(r'(?<=.{%i}[a-z,;:)\-IA])\s*(?P<ital></(i|b|u)>)?\s*(<p.*?>)\s*(?=(<(i|b|u)>)?\s*[\w\d(])' % length, re.UNICODE), wrap_lines), + (re.compile(r'(?<=.{%i}[a-z,;:)\-IA])\s*(?P<ital></(i|b|u)>)?\s*(<p.*?>\s*)+\s*(?=(<(i|b|u)>)?\s*[\w\d(])' % length, re.UNICODE), wrap_lines), ) for rule in self.PREPROCESS + start_rules: From f1be85806e73839fd9fbef1b6bd1af8619918fa1 Mon Sep 17 00:00:00 2001 From: Charles Haley <> Date: Sat, 11 Sep 2010 09:51:42 +0100 Subject: [PATCH 12/43] Fix #6771 - Search renaming exception. --- src/calibre/gui2/tag_view.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/calibre/gui2/tag_view.py b/src/calibre/gui2/tag_view.py index a64eb2eb9a..519d533ff6 100644 --- a/src/calibre/gui2/tag_view.py +++ b/src/calibre/gui2/tag_view.py @@ -512,7 +512,8 @@ class TagsModel(QAbstractItemModel): # {{{ _('The saved search name %s is already used.')%val).exec_() return False saved_searches().rename(unicode(item.data(role).toString()), val) - self.tags_view.search_item_renamed.emit() + item.tag.name = val + self.tags_view.search_item_renamed.emit() # Does a refresh else: if key == 'series': self.db.rename_series(item.tag.id, val) @@ -526,8 +527,8 @@ class TagsModel(QAbstractItemModel): # {{{ self.db.rename_custom_item(item.tag.id, val, label=self.db.field_metadata[key]['label']) self.tags_view.tag_item_renamed.emit() - item.tag.name = val - self.refresh() # Should work, because no categories can have disappeared + item.tag.name = val + self.refresh() # Should work, because no categories can have disappeared if path: idx = self.index_for_path(path) if idx.isValid(): @@ -669,7 +670,7 @@ class TagBrowserMixin(object): # {{{ self.tags_view.saved_search_edit.connect(self.do_saved_search_edit) self.tags_view.author_sort_edit.connect(self.do_author_sort_edit) self.tags_view.tag_item_renamed.connect(self.do_tag_item_renamed) - self.tags_view.search_item_renamed.connect(self.saved_search.clear_to_help) + self.tags_view.search_item_renamed.connect(self.saved_searches_changed) self.edit_categories.clicked.connect(lambda x: self.do_user_categories_edit()) From cf7cc4de4d9b9fa5e4b22c5ce2cb63c099165589 Mon Sep 17 00:00:00 2001 From: ldolse <ldolse@yahoo.com> Date: Sat, 11 Sep 2010 21:02:44 +1000 Subject: [PATCH 13/43] preprocess updates for lit, html, and pdf --- src/calibre/ebooks/conversion/preprocess.py | 8 -- src/calibre/ebooks/conversion/utils.py | 122 +++++++++++++++++++- src/calibre/ebooks/html/input.py | 20 +--- src/calibre/ebooks/lit/input.py | 117 +------------------ src/calibre/ebooks/pdb/pdf/reader.py | 2 +- src/calibre/ebooks/pdf/input.py | 2 +- 6 files changed, 129 insertions(+), 142 deletions(-) diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py index c120f0a560..6123577191 100644 --- a/src/calibre/ebooks/conversion/preprocess.py +++ b/src/calibre/ebooks/conversion/preprocess.py @@ -214,7 +214,6 @@ class HTMLPreProcessor(object): (re.compile(u'˙\s*(<br.*?>)*\s*z', re.UNICODE), lambda match: u'ż'), (re.compile(u'˙\s*(<br.*?>)*\s*Z', re.UNICODE), lambda match: u'Ż'), - # If pdf printed from a browser then the header/footer has a reliable pattern 
(re.compile(r'((?<=</a>)\s*file:////?[A-Z].*<br>|file:////?[A-Z].*<br>(?=\s*<hr>))', re.IGNORECASE), lambda match: ''), @@ -225,13 +224,6 @@ class HTMLPreProcessor(object): (re.compile(r'<a name=\d+></a>', re.IGNORECASE), lambda match: ''), # Remove <hr> tags (re.compile(r'<hr.*?>', re.IGNORECASE), lambda match: '<br />'), - # Replace <br><br> with <p> - # (re.compile(r'<br>\s*<br>', re.IGNORECASE), lambda match: '\n<p>'), - - # unwrap hyphenation - don't delete the hyphen (often doesn't split words) - #(re.compile(u'(?<=[-–—])\s*<br>\s*(?=[[a-z\d])'), lambda match: ''), - # unwrap/delete soft hyphens - #(re.compile(u'[­]\s*<br>\s*(?=[[a-z\d])'), lambda match: ''), # Remove gray background (re.compile(r'<BODY[^<>]+>'), lambda match : '<BODY>'), diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py index 52be473372..68cebb3a11 100644 --- a/src/calibre/ebooks/conversion/utils.py +++ b/src/calibre/ebooks/conversion/utils.py @@ -3,4 +3,124 @@ __license__ = 'GPL v3' __copyright__ = '2010, Kovid Goyal <kovid@kovidgoyal.net>' -__docformat__ = 'restructuredtext en' \ No newline at end of file +__docformat__ = 'restructuredtext en' + +import re +from calibre.ebooks.conversion.preprocess import line_length +from calibre.utils.logging import default_log +from lxml import etree + +class PreProcessor(object): + html_preprocess_sections = 0 + + def __init__(self, args): + self.args = args + self.log = default_log + + def chapter_head(self, match): + chap = match.group('chap') + title = match.group('title') + if not title: + self.html_preprocess_sections = self.html_preprocess_sections + 1 + self.log("marked " + str(self.html_preprocess_sections) + " chapters. - " + str(chap)) + return '<h2>'+chap+'</h2>\n' + else: + self.html_preprocess_sections = self.html_preprocess_sections + 1 + self.log("marked " + str(self.html_preprocess_sections) + " chapters & titles. - " + str(chap) + ", " + str(title)) + return '<h2>'+chap+'</h2>\n<h3>'+title+'</h3>\n' + + def chapter_link(self, match): + chap = match.group('sectionlink') + if not chap: + self.html_preprocess_sections = self.html_preprocess_sections + 1 + self.log("marked " + str(self.html_preprocess_sections) + " section markers based on links") + return '<br style="page-break-before:always">' + else: + self.html_preprocess_sections = self.html_preprocess_sections + 1 + self.log("marked " + str(self.html_preprocess_sections) + " section markers based on links. - " + str(chap)) + return '<br clear="all" style="page-break-before:always">\n<h2>'+chap+'</h2>' + + def no_markup(self, raw, percent): + ''' + Detects total marked up line endings in the file. raw is the text to + inspect. Percent is the minimum percent of line endings which should + be marked up to return true. 
+ ''' + htm_end_ere = re.compile('</p>', re.DOTALL) + line_end_ere = re.compile('(\n|\r|\r\n)', re.DOTALL) + htm_end = htm_end_ere.findall(raw) + line_end = line_end_ere.findall(raw) + tot_htm_ends = len(htm_end) + tot_ln_fds = len(line_end) + self.log("*** There are " + str(tot_ln_fds) + " total Line feeds, and " + str(tot_htm_ends) + " marked endings***") + + if percent > 1: + percent = 1 + if percent < 0: + percent = 0 + + min_lns = tot_ln_fds * percent + self.log("There must be fewer than " + str(min_lns) + " unmarked lines to return true") + if min_lns > tot_htm_ends: + return True + + def __call__(self, html): + self.log("********* Preprocessing HTML *********") + # remove non-breaking spaces + html = re.sub(ur'\u00a0', ' ', html) + # Get rid of empty <o:p> tags to simplify other processing + html = re.sub(ur'\s*<o:p>\s*</o:p>', ' ', html) + # Get rid of empty span tags + html = re.sub(r"\s*<span[^>]*>\s*</span>", " ", html) + + # If more than 40% of the lines are empty paragraphs then delete them to clean up spacing + linereg = re.compile('(?<=<p).*?(?=</p>)', re.IGNORECASE) + blankreg = re.compile(r'\s*<p[^>]*>\s*(<(b|i|u)>)?\s*(</(b|i|u)>)?\s*</p>', re.IGNORECASE) + blanklines = blankreg.findall(html) + lines = linereg.findall(html) + if len(lines) > 1: + self.log("There are " + str(len(blanklines)) + " blank lines. " + str(float(len(blanklines)) / float(len(lines))) + " percent blank") + if float(len(blanklines)) / float(len(lines)) > 0.40: + self.log("deleting blank lines") + html = blankreg.sub('', html) + # Arrange line feeds and </p> tags so the line_length and no_markup functions work correctly + html = re.sub(r"\s*</p>", "</p>\n", html) + html = re.sub(r"\s*<p>\s*", "\n<p>", html) + + # some lit files don't have any <p> tags or equivalent, check and + # mark up line endings if required before proceeding + if self.no_markup(html, 0.1): + self.log("not enough paragraph markers, adding now") + add_markup = re.compile('(?<!>)(\n)') + html = add_markup.sub('</p>\n<p>', html) + + # detect chapters/sections to match xpath or splitting logic + # + # Start with most typical chapter headings + chapdetect = re.compile(r'(?=</?(br|p))(<(/?br|p)[^>]*>)\s*(<(i|b|u)>){0,2}\s*(<span[^>]*>)?\s*(?P<chap>(<(i|b|u)>){0,2}s*(<span[^>]*>)?\s*.?(Introduction|Acknowledgements|Chapter|Epilogue|Volume|Prologue|Book\s|Part\s|Dedication)\s*([\d\w-]+\:?\s*){0,8}\s*(</(i|b|u)>){0,2})\s*(</span>)?s*(</span>)?\s*(</(i|b|u)>){0,2}\s*(</(p|/?br)>)\s*(<(/?br|p)[^>]*>\s*(<(i|b|u)>){0,2}\s*(<span[^>]*>)?\s*(?P<title>(<(i|b|u)>){0,2}(\s*[\w\'\"-]+){1,5}\s*(</(i|b|u)>){0,2})\s*(</span>)?\s*(</(i|b|u)>){0,2}\s*(</(br|p)>))?', re.IGNORECASE) + html = chapdetect.sub(self.chapter_head, html) + if self.html_preprocess_sections < 10: + self.log("not enough chapters, only " + str(self.html_preprocess_sections) + ", trying a more aggressive pattern") + chapdetect2 = re.compile(r'(?=</?(br|p))(<(/?br|p)[^>]*>)\s*(<(i|b|u)>){0,2}\s*(<span[^>]*>)?\s*(?P<chap>(<(i|b|u)>){0,2}\s*.?(([A-Z#-]+\s*){1,9}|\d+\.?|(CHAPTER\s*([\dA-Z\-\'\"\?\.!#,]+\s*){1,10}))\s*(</(i|b|u)>){0,2})\s*(</span>)?\s*(</(i|b|u)>){0,2}\s*(</(p|/?br)>)\s*(<(/?br|p)[^>]*>\s*(<(i|b|u)>){0,2}\s*(<span[^>]*>)?\s*(?P<title>(<(i|b|u)>){0,2}(\s*[\w\'\"-]+){1,5}\s*(</(i|b|u)>){0,2})\s*(</span>)?\s*(</(i|b|u)>){0,2}\s*(</(br|p)>))?', re.UNICODE) + html = chapdetect2.sub(self.chapter_head, html) + # + # Unwrap lines using punctation if the median length of all lines is less than 200 + length = line_length('html', html, 0.4) + self.log("*** Median line length is " + 
str(length) + " ***") + unwrap = re.compile(r"(?<=.{%i}[a-z,;:\IA])\s*</(span|p|div)>\s*(</(p|span|div)>)?\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)</(span|p|div)>\s*){0,3}\s*<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*" % length, re.UNICODE) + if length < 200: + self.log("Unwrapping Lines") + html = unwrap.sub(' ', html) + # If still no sections after unwrapping lines break on lines with no punctuation + if self.html_preprocess_sections < 10: + self.log("not enough chapters, only " + str(self.html_preprocess_sections) + ", splitting based on punctuation") + #self.log(html) + chapdetect3 = re.compile(r'(<p[^>]*>)\s*(<(i|b|u)>){0,2}\s*(<span[^>]*>)?\s*(?P<chap>(<(i|b|u)>){0,2}\s*.?([a-z]+\s*){1,5}\s*(</(i|b|u)>){0,2})\s*(</span>)?\s*(</(i|b|u)>){0,2}\s*(</p>)(?P<title>)?', re.IGNORECASE) + html = chapdetect3.sub(self.chapter_head, html) + # search for places where a first or second level heading is immediately followed by another + # top level heading. demote the second heading to h3 to prevent splitting between chapter + # headings and titles, images, etc + doubleheading = re.compile(r'(?P<firsthead><h(1|2)[^>]*>.+?</h(1|2)>\s*(<(?!h\d)[^>]*>\s*)*)<h(1|2)(?P<secondhead>[^>]*>.+?)</h(1|2)>', re.IGNORECASE) + html = doubleheading.sub('\g<firsthead>'+'<h3'+'\g<secondhead>'+'</h3>', html) + + return html \ No newline at end of file diff --git a/src/calibre/ebooks/html/input.py b/src/calibre/ebooks/html/input.py index 35a8a1a9bc..e83216ae1f 100644 --- a/src/calibre/ebooks/html/input.py +++ b/src/calibre/ebooks/html/input.py @@ -24,7 +24,7 @@ from calibre.constants import islinux, isfreebsd, iswindows from calibre import unicode_path from calibre.utils.localization import get_lang from calibre.utils.filenames import ascii_filename -from calibre.ebooks.conversion.preprocess import line_length +from calibre.ebooks.conversion.utils import PreProcessor class Link(object): ''' @@ -491,20 +491,6 @@ class HTMLInput(InputFormatPlugin): return (None, raw) def preprocess_html(self, html): - if not hasattr(self, 'log'): - from calibre.utils.logging import default_log - self.log = default_log - self.log("********* Preprocessing HTML - HTML Input plugin *********") - # Detect Chapters to match the xpath in the GUI - chapdetect = re.compile(r'(?=</?(br|p|span))(</?(br|p|span)[^>]*>)?\s*(?P<chap>(<(i|b)><(i|b)>|<(i|b)>)?(.?Chapter|Epilogue|Prologue|Book|Part|Dedication)\s*([\d\w-]+(\s\w+)?)?(</(i|b)></(i|b)>|</(i|b)>)?)(</?(p|br|span)[^>]*>)', re.IGNORECASE) - html = chapdetect.sub('<h2>'+'\g<chap>'+'</h2>\n', html) - # Unwrap lines using punctation if the median length of all lines is less than 150 - # - # Insert extra line feeds so the line length regex functions properly - html = re.sub(r"</p>", "</p>\n", html) - length = line_length('html', html, 0.4) - self.log.debug("*** Median length is " + str(length) + " ***") - unwrap = re.compile(r"(?<=.{%i}[a-z,;:\IA])\s*</(span|p|div)>\s*(</(p|span|div)>)?\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)</(span|p|div)>\s*){0,3}\s*<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*" % length, re.UNICODE) - if length < 150: - html = unwrap.sub(' ', html) + preprocessor = PreProcessor(html) + html = preprocessor(html) return html diff --git a/src/calibre/ebooks/lit/input.py b/src/calibre/ebooks/lit/input.py index 35dad501be..58e7bc84bf 100644 --- a/src/calibre/ebooks/lit/input.py +++ b/src/calibre/ebooks/lit/input.py @@ -6,10 +6,8 @@ __license__ = 'GPL v3' __copyright__ = '2009, Kovid 
Goyal <kovid@kovidgoyal.net>' __docformat__ = 'restructuredtext en' -import re - from calibre.customize.conversion import InputFormatPlugin -from calibre.ebooks.conversion.preprocess import line_length +from calibre.ebooks.conversion.utils import PreProcessor class LITInput(InputFormatPlugin): @@ -18,7 +16,6 @@ class LITInput(InputFormatPlugin): author = 'Marshall T. Vandegrift' description = 'Convert LIT files to HTML' file_types = set(['lit']) - html_preprocess_sections = 0 def convert(self, stream, options, file_ext, log, accelerators): @@ -57,115 +54,7 @@ class LITInput(InputFormatPlugin): def preprocess_html(self, html): - - def chapter_head(match): - chap = match.group('chap') - title = match.group('title') - if not title: - self.html_preprocess_sections = self.html_preprocess_sections + 1 - self.log("marked " + str(self.html_preprocess_sections) + " chapters. - " + str(chap)) - return '<h2>'+chap+'</h2>\n' - else: - self.html_preprocess_sections = self.html_preprocess_sections + 1 - self.log("marked " + str(self.html_preprocess_sections) + " chapters & titles. - " + str(chap) + ", " + str(title)) - return '<h2>'+chap+'</h2>\n<h3>'+title+'</h3>\n' - - def chapter_link(match): - chap = match.group('sectionlink') - if not chap: - self.html_preprocess_sections = self.html_preprocess_sections + 1 - self.log("marked " + str(self.html_preprocess_sections) + " section markers based on links") - return '<br style="page-break-before:always">' - else: - self.html_preprocess_sections = self.html_preprocess_sections + 1 - self.log("marked " + str(self.html_preprocess_sections) + " section markers based on links. - " + str(chap)) - return '<br clear="all" style="page-break-before:always">\n<h2>'+chap+'</h2>' - - - def no_markup(raw, percent): - ''' - Detects total marked up line endings in the file. raw is the text to - inspect. Percent is the minimum percent of line endings which should - be marked up to return true. - ''' - htm_end_ere = re.compile('</p>', re.DOTALL) - line_end_ere = re.compile('(\n|\r|\r\n)', re.DOTALL) - htm_end = htm_end_ere.findall(raw) - line_end = line_end_ere.findall(raw) - tot_htm_ends = len(htm_end) - tot_ln_fds = len(line_end) - self.log("*** There are " + str(tot_ln_fds) + " total Line feeds, and " + str(tot_htm_ends) + " marked endings***") - - if percent > 1: - percent = 1 - if percent < 0: - percent = 0 - - min_lns = tot_ln_fds * percent - self.log("There must be more than " + str(min_lns) + " unmarked lines to return true") - if min_lns > tot_htm_ends: - return True - - self.log("********* Preprocessing HTML *********") - # remove non-breaking spaces - html = re.sub(ur'\u00a0', ' ', html) - # Get rid of empty <o:p> tags to simplify other processing - html = re.sub(ur'\s*<o:p>\s*</o:p>', ' ', html) - # Get rid of empty span tags - html = re.sub(r"\s*<span[^>]*>\s*</span>", " ", html) - - # If more than 40% of the lines are empty paragraphs then delete them to clean up spacing - linereg = re.compile('(?<=<p).*?(?=</p>)', re.IGNORECASE) - blankreg = re.compile(r'\s*<p[^>]*>\s*(<(b|i|u)>)?\s*(</(b|i|u)>)?\s*</p>', re.IGNORECASE) - blanklines = blankreg.findall(html) - lines = linereg.findall(html) - if len(lines) > 1: - self.log("There are " + str(len(blanklines)) + " blank lines. 
" + str(float(len(blanklines)) / float(len(lines))) + " percent blank") - if float(len(blanklines)) / float(len(lines)) > 0.40: - self.log("deleting blank lines") - html = blankreg.sub('', html) - # Arrange line feeds and </p> tags so the line_length and no_markup functions work correctly - html = re.sub(r"\s*</p>", "</p>\n", html) - - # some lit files don't have any <p> tags or equivalent, check and - # mark up line endings if required before proceeding - if no_markup(html, 0.1): - self.log("not enough paragraph markers, adding now") - add_markup = re.compile('(?<!>)(\n)') - html = add_markup.sub('</p>\n<p>', html) - - # detect chapters/sections to match xpath or splitting logic - # - # Mark split points based on embedded links - chaplink = re.compile(r'<a\sname[^>]*>\s*(<(i|b|u)>){0,2}\s*(<span[^>]*>)?\s*(?P<sectionlink>[^\s<]+(\s*[^\s<]+){0,4})?\s*(</span>)?\s*(</(i|b|u)>){0,2}\s*</a>', re.IGNORECASE) - html = chaplink.sub(chapter_link, html) - # Continue with alternate patterns, start with most typical chapter headings - if self.html_preprocess_sections < 10: - chapdetect = re.compile(r'(?=</?(br|p))(<(/?br|p)[^>]*>)\s*(<(i|b|u)>){0,2}\s*(<span[^>]*>)?\s*(?P<chap>(<(i|b|u)>){0,2}s*(<span[^>]*>)?\s*.?(\d+\.?|Introduction|Acknowledgements|Chapter|Epilogue|Volume|Prologue|Book\s|Part\s|Dedication)\s*([\d\w-]+\:?\s*){0,8}\s*(</(i|b|u)>){0,2})\s*(</span>)?s*(</span>)?\s*(</(i|b|u)>){0,2}\s*(</(p|/?br)>)\s*(<(/?br|p)[^>]*>\s*(<(i|b|u)>){0,2}\s*(<span[^>]*>)?\s*(?P<title>(<(i|b|u)>){0,2}(\s*[\w\'\"-]+){1,5}\s*(</(i|b|u)>){0,2})\s*(</span>)?\s*(</(i|b|u)>){0,2}\s*(</(br|p)>))?', re.IGNORECASE) - html = chapdetect.sub(chapter_head, html) - if self.html_preprocess_sections < 10: - self.log("not enough chapters, only " + str(self.html_preprocess_sections) + ", trying a more aggressive pattern") - chapdetect2 = re.compile(r'(?=</?(br|p))(<(/?br|p)[^>]*>)\s*(<(i|b|u)>){0,2}\s*(<span[^>]*>)?\s*(?P<chap>(<(i|b|u)>){0,2}\s*.?(([A-Z#]+\s*){1,9}|(CHAPTER\s*([\dA-Z\-\'\"\?\.!#,]+\s*){1,10}))\s*(</(i|b|u)>){0,2})\s*(</span>)?\s*(</(i|b|u)>){0,2}\s*(</(p|/?br)>)\s*(<(/?br|p)[^>]*>\s*(<(i|b|u)>){0,2}\s*(<span[^>]*>)?\s*(?P<title>(<(i|b|u)>){0,2}(\s*[\w\'\"-]+){1,5}\s*(</(i|b|u)>){0,2})\s*(</span>)?\s*(</(i|b|u)>){0,2}\s*(</(br|p)>))?', re.UNICODE) - html = chapdetect2.sub(chapter_head, html) - # - # Unwrap lines using punctation if the median length of all lines is less than 150 - length = line_length('html', html, 0.4) - self.log("*** Median line length is " + str(length) + " ***") - unwrap = re.compile(r"(?<=.{%i}[a-z,;:\IA])\s*</(span|p|div)>\s*(</(p|span|div)>)?\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)</(span|p|div)>\s*){0,3}\s*<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*" % length, re.UNICODE) - if length < 150: - self.log("Unwrapping Lines") - html = unwrap.sub(' ', html) - # If still no sections after unwrapping lines break on lines with no punctuation - if self.html_preprocess_sections < 10: - self.log("not enough chapters, only " + str(self.html_preprocess_sections) + ", splitting based on punctuation") - #self.log(html) - chapdetect3 = re.compile(r'(<p[^>]*>)\s*(<(i|b|u)>){0,2}\s*(<span[^>]*>)?\s*(?P<chap>(<(i|b|u)>){0,2}\s*.?([a-z]+\s*){1,5}\s*(</(i|b|u)>){0,2})\s*(</span>)?\s*(</(i|b|u)>){0,2}\s*(</p>)(?P<title>)?', re.IGNORECASE) - html = chapdetect3.sub(chapter_head, html) - # search for places where a first or second level heading is immediately followed by another - # top level heading. 
demote the second heading to h3 to prevent splitting between chapter - # headings and titles, images, etc - doubleheading = re.compile(r'(?P<firsthead><h(1|2)[^>]*>.+?</h(1|2)>\s*(<(?!h\d)[^>]*>\s*)*)<h(1|2)(?P<secondhead>[^>]*>.+?)</h(1|2)>', re.IGNORECASE) - html = doubleheading.sub('\g<firsthead>'+'<h3'+'\g<secondhead>'+'</h3>', html) - + preprocessor = PreProcessor(html) + html = preprocessor(html) return html diff --git a/src/calibre/ebooks/pdb/pdf/reader.py b/src/calibre/ebooks/pdb/pdf/reader.py index 3ae9f8ccca..c151551866 100644 --- a/src/calibre/ebooks/pdb/pdf/reader.py +++ b/src/calibre/ebooks/pdb/pdf/reader.py @@ -21,7 +21,7 @@ class Reader(FormatReader): self.options = options setattr(self.options, 'new_pdf_engine', False) setattr(self.options, 'no_images', False) - setattr(self.options, 'unwrap_factor', 0.5) + setattr(self.options, 'unwrap_factor', 0.45) def extract_content(self, output_dir): self.log.info('Extracting PDF...') diff --git a/src/calibre/ebooks/pdf/input.py b/src/calibre/ebooks/pdf/input.py index 113c3d99d8..14b3552b04 100644 --- a/src/calibre/ebooks/pdf/input.py +++ b/src/calibre/ebooks/pdf/input.py @@ -25,7 +25,7 @@ class PDFInput(InputFormatPlugin): OptionRecommendation(name='unwrap_factor', recommended_value=0.45, help=_('Scale used to determine the length at which a line should ' 'be unwrapped. Valid values are a decimal between 0 and 1. The ' - 'default is 0.45, this is the median line length.')), + 'default is 0.45, just below the median line length.')), OptionRecommendation(name='new_pdf_engine', recommended_value=False, help=_('Use the new PDF conversion engine.')) ]) From f6de0bef13d7d1001b951d465cff3135aad616ed Mon Sep 17 00:00:00 2001 From: ldolse <ldolse@yahoo.com> Date: Sat, 11 Sep 2010 22:15:09 +1000 Subject: [PATCH 14/43] replaced messed-up RTF file --- src/calibre/ebooks/rtf/preprocess.py | 624 +++++++++++++-------------- 1 file changed, 289 insertions(+), 335 deletions(-) diff --git a/src/calibre/ebooks/rtf/preprocess.py b/src/calibre/ebooks/rtf/preprocess.py index ee45da697f..a3076651fd 100644 --- a/src/calibre/ebooks/rtf/preprocess.py +++ b/src/calibre/ebooks/rtf/preprocess.py @@ -1,390 +1,344 @@ #!/usr/bin/env python # vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai +from __future__ import with_statement __license__ = 'GPL v3' -__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>' +__copyright__ = '2010, Gerendi Sandor Attila' __docformat__ = 'restructuredtext en' -import functools, re +""" +RTF tokenizer and token parser. v.1.0 (1/17/2010) +Author: Gerendi Sandor Attila -from calibre import entity_to_unicode +At this point this will tokenize an RTF file and then rebuild it from the tokens. +In the process the UTF8 tokens are altered to be supported by RTF2XML while remaining compliant with the RTF specification.
+""" -XMLDECL_RE = re.compile(r'^\s*<[?]xml.*?[?]>') -SVG_NS = 'http://www.w3.org/2000/svg' -XLINK_NS = 'http://www.w3.org/1999/xlink' +class tokenDelimitatorStart(): + def __init__(self): + pass + def toRTF(self): + return b'{' + def __repr__(self): + return '{' -convert_entities = functools.partial(entity_to_unicode, - result_exceptions = { - u'<' : '<', - u'>' : '>', - u"'" : ''', - u'"' : '"', - u'&' : '&', - }) -_span_pat = re.compile('<span.*?</span>', re.DOTALL|re.IGNORECASE) +class tokenDelimitatorEnd(): + def __init__(self): + pass + def toRTF(self): + return b'}' + def __repr__(self): + return '}' -LIGATURES = { -# u'\u00c6': u'AE', -# u'\u00e6': u'ae', -# u'\u0152': u'OE', -# u'\u0153': u'oe', -# u'\u0132': u'IJ', -# u'\u0133': u'ij', -# u'\u1D6B': u'ue', - u'\uFB00': u'ff', - u'\uFB01': u'fi', - u'\uFB02': u'fl', - u'\uFB03': u'ffi', - u'\uFB04': u'ffl', - u'\uFB05': u'ft', - u'\uFB06': u'st', - } +class tokenControlWord(): + def __init__(self, name, separator = ''): + self.name = name + self.separator = separator + def toRTF(self): + return self.name + self.separator + def __repr__(self): + return self.name + self.separator -_ligpat = re.compile(u'|'.join(LIGATURES)) +class tokenControlWordWithNumericArgument(): + def __init__(self, name, argument, separator = ''): + self.name = name + self.argument = argument + self.separator = separator + def toRTF(self): + return self.name + repr(self.argument) + self.separator + def __repr__(self): + return self.name + repr(self.argument) + self.separator -def sanitize_head(match): - x = match.group(1) - x = _span_pat.sub('', x) - return '<head>\n%s\n</head>' % x +class tokenControlSymbol(): + def __init__(self, name): + self.name = name + def toRTF(self): + return self.name + def __repr__(self): + return self.name -def chap_head(match): - chap = match.group('chap') - title = match.group('title') - if not title: - return '<h1>'+chap+'</h1><br/>\n' - else: - return '<h1>'+chap+'</h1>\n<h3>'+title+'</h3>\n' +class tokenData(): + def __init__(self, data): + self.data = data + def toRTF(self): + return self.data + def __repr__(self): + return self.data -def wrap_lines(match): - ital = match.group('ital') - if not ital: - return ' ' - else: - return ital+' ' +class tokenBinN(): + def __init__(self, data, separator = ''): + self.data = data + self.separator = separator + def toRTF(self): + return "\\bin" + repr(len(self.data)) + self.separator + self.data + def __repr__(self): + return "\\bin" + repr(len(self.data)) + self.separator + self.data + +class token8bitChar(): + def __init__(self, data): + self.data = data + def toRTF(self): + return "\\'" + self.data + def __repr__(self): + return "\\'" + self.data + +class tokenUnicode(): + def __init__(self, data, separator = '', current_ucn = 1, eqList = []): + self.data = data + self.separator = separator + self.current_ucn = current_ucn + self.eqList = eqList + def toRTF(self): + result = '\\u' + repr(self.data) + ' ' + ucn = self.current_ucn + if len(self.eqList) < ucn: + ucn = len(self.eqList) + result = tokenControlWordWithNumericArgument('\\uc', ucn).toRTF() + result + i = 0 + for eq in self.eqList: + if i >= ucn: + break + result = result + eq.toRTF() + return result + def __repr__(self): + return '\\u' + repr(self.data) -def line_length(format, raw, percent): - ''' - raw is the raw text to find the line length to use for wrapping. - percentage is a decimal number, 0 - 1 which is used to determine - how far in the list of line lengths to use. 
The list of line lengths is - ordered smallest to larged and does not include duplicates. 0.5 is the - median value. - ''' - raw = raw.replace(' ', ' ') - if format == 'html': - linere = re.compile('(?<=<p).*?(?=</p>)', re.DOTALL) - elif format == 'pdf': - linere = re.compile('(?<=<br>).*?(?=<br>)', re.DOTALL) - lines = linere.findall(raw) - print "percent is " + str(percent) +def isAsciiLetter(value): + return ((value >= 'a') and (value <= 'z')) or ((value >= 'A') and (value <= 'Z')) - lengths = [] - for line in lines: - if len(line) > 0: - lengths.append(len(line)) +def isDigit(value): + return (value >= '0') and (value <= '9') - if not lengths: - return 0 +def isChar(value, char): + return value == char - lengths = list(set(lengths)) - total = sum(lengths) - avg = total / len(lengths) - max_line = avg * 2 - - lengths = sorted(lengths) - for i in range(len(lengths) - 1, -1, -1): - if lengths[i] > max_line: - del lengths[i] - - if percent > 1: - percent = 1 - if percent < 0: - percent = 0 - - index = int(len(lengths) * percent) - 1 - - return lengths[index] +def isString(buffer, string): + return buffer == string -class CSSPreProcessor(object): +class RtfTokenParser(): + def __init__(self, tokens): + self.tokens = tokens + self.process() + self.processUnicode() - PAGE_PAT = re.compile(r'@page[^{]*?{[^}]*?}') + def process(self): + i = 0 + newTokens = [] + while i < len(self.tokens): + if isinstance(self.tokens[i], tokenControlSymbol): + if isString(self.tokens[i].name, "\\'"): + i = i + 1 + if not isinstance(self.tokens[i], tokenData): + raise Exception('Error: token8bitChar without data.') + if len(self.tokens[i].data) < 2: + raise Exception('Error: token8bitChar without data.') + newTokens.append(token8bitChar(self.tokens[i].data[0:2])) + if len(self.tokens[i].data) > 2: + newTokens.append(tokenData(self.tokens[i].data[2:])) + i = i + 1 + continue - def __call__(self, data, add_namespace=False): - from calibre.ebooks.oeb.base import XHTML_CSS_NAMESPACE - data = self.PAGE_PAT.sub('', data) - if not add_namespace: - return data - ans, namespaced = [], False - for line in data.splitlines(): - ll = line.lstrip() - if not (namespaced or ll.startswith('@import') or - ll.startswith('@charset')): - ans.append(XHTML_CSS_NAMESPACE.strip()) - namespaced = True - ans.append(line) + newTokens.append(self.tokens[i]) + i = i + 1 - return u'\n'.join(ans) + self.tokens = list(newTokens) -class HTMLPreProcessor(object): + def processUnicode(self): + i = 0 + newTokens = [] + ucNbStack = [1] + while i < len(self.tokens): + if isinstance(self.tokens[i], tokenDelimitatorStart): + ucNbStack.append(ucNbStack[len(ucNbStack) - 1]) + newTokens.append(self.tokens[i]) + i = i + 1 + continue + if isinstance(self.tokens[i], tokenDelimitatorEnd): + ucNbStack.pop() + newTokens.append(self.tokens[i]) + i = i + 1 + continue + if isinstance(self.tokens[i], tokenControlWordWithNumericArgument): + if isString(self.tokens[i].name, '\\uc'): + ucNbStack[len(ucNbStack) - 1] = self.tokens[i].argument + newTokens.append(self.tokens[i]) + i = i + 1 + continue + if isString(self.tokens[i].name, '\\u'): + x = i + j = 0 + i = i + 1 + replace = [] + partialData = None + ucn = ucNbStack[len(ucNbStack) - 1] + while (i < len(self.tokens)) and (j < ucn): + if isinstance(self.tokens[i], tokenDelimitatorStart): + break + if isinstance(self.tokens[i], tokenDelimitatorEnd): + break + if isinstance(self.tokens[i], tokenData): + if len(self.tokens[i].data) >= ucn - j: + replace.append(tokenData(self.tokens[i].data[0 : ucn - j])) + if 
len(self.tokens[i].data) > ucn - j: + partialData = tokenData(self.tokens[i].data[ucn - j:]) + i = i + 1 + break + else: + replace.append(self.tokens[i]) + j = j + len(self.tokens[i].data) + i = i + 1 + continue + if isinstance(self.tokens[i], token8bitChar) or isinstance(self.tokens[i], tokenBinN): + replace.append(self.tokens[i]) + i = i + 1 + j = j + 1 + continue + raise Exception('Error: incorect utf replacement.') - PREPROCESS = [ - # Some idiotic HTML generators (Frontpage I'm looking at you) - # Put all sorts of crap into <head>. This messes up lxml - (re.compile(r'<head[^>]*>\n*(.*?)\n*</head>', re.IGNORECASE|re.DOTALL), - sanitize_head), - # Convert all entities, since lxml doesn't handle them well - (re.compile(r'&(\S+?);'), convert_entities), - # Remove the <![if/endif tags inserted by everybody's darling, MS Word - (re.compile(r'</{0,1}!\[(end){0,1}if\]{0,1}>', re.IGNORECASE), - lambda match: ''), - ] + #calibre rtf2xml does not support utfreplace + replace = [] - # Fix pdftohtml markup - PDFTOHTML = [ - # Fix umlauts - # ¨ - (re.compile(u'¨\s*(<br.*?>)*\s*a', re.UNICODE), lambda match: u'ä'), - (re.compile(u'¨\s*(<br.*?>)*\s*A', re.UNICODE), lambda match: u'Ä'), - (re.compile(u'¨\s*(<br.*?>)*\s*e', re.UNICODE), lambda match: u'ë'), - (re.compile(u'¨\s*(<br.*?>)*\s*E', re.UNICODE), lambda match: u'Ë'), - (re.compile(u'¨\s*(<br.*?>)*\s*i', re.UNICODE), lambda match: u'ï'), - (re.compile(u'¨\s*(<br.*?>)*\s*I', re.UNICODE), lambda match: u'Ï'), - (re.compile(u'¨\s*(<br.*?>)*\s*o', re.UNICODE), lambda match: u'ö'), - (re.compile(u'¨\s*(<br.*?>)*\s*O', re.UNICODE), lambda match: u'Ö'), - (re.compile(u'¨\s*(<br.*?>)*\s*u', re.UNICODE), lambda match: u'ü'), - (re.compile(u'¨\s*(<br.*?>)*\s*U', re.UNICODE), lambda match: u'Ü'), + newTokens.append(tokenUnicode(self.tokens[x].argument, self.tokens[x].separator, ucNbStack[len(ucNbStack) - 1], replace)) + if partialData != None: + newTokens.append(partialData) + continue - # Fix accents - # ` - (re.compile(u'`\s*(<br.*?>)*\s*a', re.UNICODE), lambda match: u'à'), - (re.compile(u'`\s*(<br.*?>)*\s*A', re.UNICODE), lambda match: u'À'), - (re.compile(u'`\s*(<br.*?>)*\s*e', re.UNICODE), lambda match: u'è'), - (re.compile(u'`\s*(<br.*?>)*\s*E', re.UNICODE), lambda match: u'È'), - (re.compile(u'`\s*(<br.*?>)*\s*i', re.UNICODE), lambda match: u'ì'), - (re.compile(u'`\s*(<br.*?>)*\s*I', re.UNICODE), lambda match: u'Ì'), - (re.compile(u'`\s*(<br.*?>)*\s*o', re.UNICODE), lambda match: u'ò'), - (re.compile(u'`\s*(<br.*?>)*\s*O', re.UNICODE), lambda match: u'Ò'), - (re.compile(u'`\s*(<br.*?>)*\s*u', re.UNICODE), lambda match: u'ù'), - (re.compile(u'`\s*(<br.*?>)*\s*U', re.UNICODE), lambda match: u'Ù'), + newTokens.append(self.tokens[i]) + i = i + 1 - # ´ - (re.compile(u'´\s*(<br.*?>)*\s*a', re.UNICODE), lambda match: u'á'), - (re.compile(u'´\s*(<br.*?>)*\s*A', re.UNICODE), lambda match: u'Á'), - (re.compile(u'´\s*(<br.*?>)*\s*c', re.UNICODE), lambda match: u'ć'), - (re.compile(u'´\s*(<br.*?>)*\s*C', re.UNICODE), lambda match: u'Ć'), - (re.compile(u'´\s*(<br.*?>)*\s*e', re.UNICODE), lambda match: u'é'), - (re.compile(u'´\s*(<br.*?>)*\s*E', re.UNICODE), lambda match: u'É'), - (re.compile(u'´\s*(<br.*?>)*\s*i', re.UNICODE), lambda match: u'í'), - (re.compile(u'´\s*(<br.*?>)*\s*I', re.UNICODE), lambda match: u'Í'), - (re.compile(u'´\s*(<br.*?>)*\s*o', re.UNICODE), lambda match: u'ó'), - (re.compile(u'´\s*(<br.*?>)*\s*O', re.UNICODE), lambda match: u'Ó'), - (re.compile(u'´\s*(<br.*?>)*\s*n', re.UNICODE), lambda match: u'ń'), - 
(re.compile(u'´\s*(<br.*?>)*\s*N', re.UNICODE), lambda match: u'Ń'), - (re.compile(u'´\s*(<br.*?>)*\s*s', re.UNICODE), lambda match: u'ś'), - (re.compile(u'´\s*(<br.*?>)*\s*S', re.UNICODE), lambda match: u'Ś'), - (re.compile(u'´\s*(<br.*?>)*\s*u', re.UNICODE), lambda match: u'ú'), - (re.compile(u'´\s*(<br.*?>)*\s*U', re.UNICODE), lambda match: u'Ú'), - (re.compile(u'´\s*(<br.*?>)*\s*z', re.UNICODE), lambda match: u'ź'), - (re.compile(u'´\s*(<br.*?>)*\s*Z', re.UNICODE), lambda match: u'Ź'), + self.tokens = list(newTokens) - # ˆ - (re.compile(u'ˆ\s*(<br.*?>)*\s*a', re.UNICODE), lambda match: u'â'), - (re.compile(u'ˆ\s*(<br.*?>)*\s*A', re.UNICODE), lambda match: u'Â'), - (re.compile(u'ˆ\s*(<br.*?>)*\s*e', re.UNICODE), lambda match: u'ê'), - (re.compile(u'ˆ\s*(<br.*?>)*\s*E', re.UNICODE), lambda match: u'Ê'), - (re.compile(u'ˆ\s*(<br.*?>)*\s*i', re.UNICODE), lambda match: u'î'), - (re.compile(u'ˆ\s*(<br.*?>)*\s*I', re.UNICODE), lambda match: u'Î'), - (re.compile(u'ˆ\s*(<br.*?>)*\s*o', re.UNICODE), lambda match: u'ô'), - (re.compile(u'ˆ\s*(<br.*?>)*\s*O', re.UNICODE), lambda match: u'Ô'), - (re.compile(u'ˆ\s*(<br.*?>)*\s*u', re.UNICODE), lambda match: u'û'), - (re.compile(u'ˆ\s*(<br.*?>)*\s*U', re.UNICODE), lambda match: u'Û'), - # ¸ - (re.compile(u'¸\s*(<br.*?>)*\s*c', re.UNICODE), lambda match: u'ç'), - (re.compile(u'¸\s*(<br.*?>)*\s*C', re.UNICODE), lambda match: u'Ç'), + def toRTF(self): + result = [] + for token in self.tokens: + result.append(token.toRTF()) + return "".join(result) - # ˛ - (re.compile(u'˛\s*(<br.*?>)*\s*a', re.UNICODE), lambda match: u'ą'), - (re.compile(u'˛\s*(<br.*?>)*\s*A', re.UNICODE), lambda match: u'Ą'), - (re.compile(u'˛\s*(<br.*?>)*\s*e', re.UNICODE), lambda match: u'ę'), - (re.compile(u'˛\s*(<br.*?>)*\s*E', re.UNICODE), lambda match: u'Ę'), - - # ˙ - (re.compile(u'˙\s*(<br.*?>)*\s*z', re.UNICODE), lambda match: u'ż'), - (re.compile(u'˙\s*(<br.*?>)*\s*Z', re.UNICODE), lambda match: u'Ż'), - - # If pdf printed from a browser then the header/footer has a reliable pattern - (re.compile(r'((?<=</a>)\s*file:////?[A-Z].*<br>|file:////?[A-Z].*<br>(?=\s*<hr>))', re.IGNORECASE), lambda match: ''), +class RtfTokenizer(): + def __init__(self, rtfData): + self.rtfData = [] + self.tokens = [] + self.rtfData = rtfData + self.tokenize() - # Center separator lines - (re.compile(u'<br>\s*(?P<break>([*#•]+\s*)+)\s*<br>'), lambda match: '<p>\n<p style="text-align:center">' + match.group(1) + '</p>'), + def tokenize(self): + i = 0 + lastDataStart = -1 + while i < len(self.rtfData): - # Remove page links - (re.compile(r'<a name=\d+></a>', re.IGNORECASE), lambda match: ''), - # Remove <hr> tags - (re.compile(r'<hr.*?>', re.IGNORECASE), lambda match: '<br />'), - # Replace <br><br> with <p> - # (re.compile(r'<br>\s*<br>', re.IGNORECASE), lambda match: '\n<p>'), + if isChar(self.rtfData[i], '{'): + if lastDataStart > -1: + self.tokens.append(tokenData(self.rtfData[lastDataStart : i])) + lastDataStart = -1 + self.tokens.append(tokenDelimitatorStart()) + i = i + 1 + continue - # unwrap hyphenation - don't delete the hyphen (often doesn't split words) - (re.compile(u'(?<=[-–—])\s*<br>\s*(?=[[a-z\d])'), lambda match: ''), + if isChar(self.rtfData[i], '}'): + if lastDataStart > -1: + self.tokens.append(tokenData(self.rtfData[lastDataStart : i])) + lastDataStart = -1 + self.tokens.append(tokenDelimitatorEnd()) + i = i + 1 + continue - # Remove gray background - (re.compile(r'<BODY[^<>]+>'), lambda match : '<BODY>'), + if isChar(self.rtfData[i], '\\'): + if i + 1 >= len(self.rtfData): + raise 
Exception('Error: Control character found at the end of the document.') - # Detect Chapters to match default XPATH in GUI - (re.compile(r'(?=<(/?br|p))(<(/?br|p)[^>]*)?>\s*(?P<chap>(<(i|b)>(<(i|b)>)?)?.?(Introduction|Chapter|Epilogue|Prologue|Book|Part|Dedication|Volume|Preface|Acknowledgments)\s*([\d\w-]+\s*){0,3}\s*(</(i|b)>(</(i|b)>)?)?)\s*(</?(br|p)[^>]*>\s*){1,3}\s*(?P<title>(<(i|b)>)?(\s*\w+){1,4}\s*(</(i|b)>)?\s*(</?(br|p)[^>]*>))?', re.IGNORECASE), chap_head), + if lastDataStart > -1: + self.tokens.append(tokenData(self.rtfData[lastDataStart : i])) + lastDataStart = -1 - # Have paragraphs show better - (re.compile(r'<br.*?>'), lambda match : '<p>'), - # Clean up spaces - (re.compile(u'(?<=[\.,;\?!”"\'])[\s^ ]*(?=<)'), lambda match: ' '), - # Add space before and after italics - (re.compile(u'(?<!“)<i>'), lambda match: ' <i>'), - (re.compile(r'</i>(?=\w)'), lambda match: '</i> '), - - ] + tokenStart = i + i = i + 1 - # Fix Book Designer markup - BOOK_DESIGNER = [ - # HR - (re.compile('<hr>', re.IGNORECASE), - lambda match : '<span style="page-break-after:always"> </span>'), - # Create header tags - (re.compile('<h2[^><]*?id=BookTitle[^><]*?(align=)*(?(1)(\w+))*[^><]*?>[^><]*?</h2>', re.IGNORECASE), - lambda match : '<h1 id="BookTitle" align="%s">%s</h1>'%(match.group(2) if match.group(2) else 'center', match.group(3))), - (re.compile('<h2[^><]*?id=BookAuthor[^><]*?(align=)*(?(1)(\w+))*[^><]*?>[^><]*?</h2>', re.IGNORECASE), - lambda match : '<h2 id="BookAuthor" align="%s">%s</h2>'%(match.group(2) if match.group(2) else 'center', match.group(3))), - (re.compile('<span[^><]*?id=title[^><]*?>(.*?)</span>', re.IGNORECASE|re.DOTALL), - lambda match : '<h2 class="title">%s</h2>'%(match.group(1),)), - (re.compile('<span[^><]*?id=subtitle[^><]*?>(.*?)</span>', re.IGNORECASE|re.DOTALL), - lambda match : '<h3 class="subtitle">%s</h3>'%(match.group(1),)), - ] - def __init__(self, input_plugin_preprocess, plugin_preprocess, - extra_opts=None): - self.input_plugin_preprocess = input_plugin_preprocess - self.plugin_preprocess = plugin_preprocess - self.extra_opts = extra_opts + #Control Words + if isAsciiLetter(self.rtfData[i]): + #consume <ASCII Letter Sequence> + consumed = False + while i < len(self.rtfData): + if not isAsciiLetter(self.rtfData[i]): + tokenEnd = i + consumed = True + break + i = i + 1 - def is_baen(self, src): - return re.compile(r'<meta\s+name="Publisher"\s+content=".*?Baen.*?"', - re.IGNORECASE).search(src) is not None + if not consumed: + raise Exception('Error (at:%d): Control Word without end.'%(tokenStart)) - def is_book_designer(self, raw): - return re.search('<H2[^><]*id=BookTitle', raw) is not None + #we have numeric argument before delimiter + if isChar(self.rtfData[i], '-') or isDigit(self.rtfData[i]): + #consume the numeric argument + consumed = False + l = 0 + while i < len(self.rtfData): + if not isDigit(self.rtfData[i]): + consumed = True + break + l = l + 1 + i = i + 1 + if l > 10 : + raise Exception('Error (at:%d): Too many digits in control word numeric argument.'%[tokenStart]) - def is_pdftohtml(self, src): - return '<!-- created by calibre\'s pdftohtml -->' in src[:1000] + if not consumed: + raise Exception('Error (at:%d): Control Word without numeric argument end.'%[tokenStart]) - def __call__(self, html, remove_special_chars=None, - get_preprocess_html=False): - if remove_special_chars is not None: - html = remove_special_chars.sub('', html) - html = html.replace('\0', '') - is_pdftohtml = self.is_pdftohtml(html) - if self.is_baen(html): - rules = [] - 
elif self.is_book_designer(html): - rules = self.BOOK_DESIGNER - elif is_pdftohtml: - rules = self.PDFTOHTML - else: - rules = [] + separator = '' + if isChar(self.rtfData[i], ' '): + separator = ' ' - start_rules = [] - if is_pdftohtml: - # Remove non breaking spaces - start_rules.append((re.compile(ur'\u00a0'), lambda match : ' ')) + controlWord = self.rtfData[tokenStart: tokenEnd] + if tokenEnd < i: + value = int(self.rtfData[tokenEnd: i]) + if isString(controlWord, "\\bin"): + i = i + value + self.tokens.append(tokenBinN(self.rtfData[tokenStart:i], separator)) + else: + self.tokens.append(tokenControlWordWithNumericArgument(controlWord, value, separator)) + else: + self.tokens.append(tokenControlWord(controlWord, separator)) + #space delimiter, we should discard it + if self.rtfData[i] == ' ': + i = i + 1 - if not getattr(self.extra_opts, 'keep_ligatures', False): - html = _ligpat.sub(lambda m:LIGATURES[m.group()], html) + #Control Symbol + else: + self.tokens.append(tokenControlSymbol(self.rtfData[tokenStart : i + 1])) + i = i + 1 + continue - end_rules = [] - if getattr(self.extra_opts, 'remove_header', None): - try: - rules.insert(0, - (re.compile(self.extra_opts.header_regex), lambda match : '') - ) - except: - import traceback - print 'Failed to parse remove_header regexp' - traceback.print_exc() + if lastDataStart < 0: + lastDataStart = i + i = i + 1 - if getattr(self.extra_opts, 'remove_footer', None): - try: - rules.insert(0, - (re.compile(self.extra_opts.footer_regex), lambda match : '') - ) - except: - import traceback - print 'Failed to parse remove_footer regexp' - traceback.print_exc() + def toRTF(self): + result = [] + for token in self.tokens: + result.append(token.toRTF()) + return "".join(result) - # Make the more aggressive chapter marking regex optional with the preprocess option to reduce false positives - if getattr(self.extra_opts, 'preprocess_html', None): - if is_pdftohtml: - end_rules.append( - (re.compile(r'(?=<(/?br|p|hr))(<(/?br|p|hr)[^>]*)?>\s*(<(i|b)>(<(i|b)>)?)?\s*(?P<chap>([A-Z-\'"!]{3,})\s*(\d+|[A-Z]+(\s*[A-Z]+)?)?|\d+\.?\s*([\d\w-]+\s*){0,4}\s*)\s*(</(i|b)>(</(i|b)>)?)?\s*(</?p[^>]*>|<br[^>]*>)\n?((?=(<i>)?\s*\w+(\s+\w+)?(</i>)?(<br[^>]*>|</?p[^>]*>))((?P<title>.*)(<br[^>]*>|</?p[^>]*>)))?'), chap_head), - ) - if getattr(self.extra_opts, 'unwrap_factor', 0.0) > 0.01: - length = line_length('pdf', html, getattr(self.extra_opts, 'unwrap_factor')) - if length: - print "The pdf line length returned is " + str(length) - end_rules.append( - # Un wrap using punctuation - (re.compile(r'(?<=.{%i}[a-z,;:)\-IA])\s*(?P<ital></(i|b|u)>)?\s*(<p.*?>)\s*(?=(<(i|b|u)>)?\s*[\w\d(])' % length, re.UNICODE), wrap_lines), - ) +if __name__ == "__main__": + import sys + if len(sys.argv) < 2: + print ("Usage %prog rtfFileToConvert") + sys.exit() + f = open(sys.argv[1], 'rb') + data = f.read() + f.close() - for rule in self.PREPROCESS + start_rules: - html = rule[0].sub(rule[1], html) + tokenizer = RtfTokenizer(data) + parsedTokens = RtfTokenParser(tokenizer.tokens) - if get_preprocess_html: - return html + data = parsedTokens.toRTF() - def dump(raw, where): - import os - dp = getattr(self.extra_opts, 'debug_pipeline', None) - if dp and os.path.exists(dp): - odir = os.path.join(dp, 'input') - if os.path.exists(odir): - odir = os.path.join(odir, where) - if not os.path.exists(odir): - os.makedirs(odir) - name, i = None, 0 - while not name or os.path.exists(os.path.join(odir, name)): - i += 1 - name = '%04d.html'%i - with open(os.path.join(odir, name), 'wb') as f: - 
f.write(raw.encode('utf-8')) + f = open(sys.argv[1], 'w') + f.write(data) + f.close() - #dump(html, 'pre-preprocess') - - for rule in rules + end_rules: - html = rule[0].sub(rule[1], html) - - #dump(html, 'post-preprocess') - - # Handle broken XHTML w/ SVG (ugh) - if 'svg:' in html and SVG_NS not in html: - html = html.replace( - '<html', '<html xmlns:svg="%s"' % SVG_NS, 1) - if 'xlink:' in html and XLINK_NS not in html: - html = html.replace( - '<html', '<html xmlns:xlink="%s"' % XLINK_NS, 1) - - html = XMLDECL_RE.sub('', html) - - if getattr(self.extra_opts, 'asciiize', False): - from calibre.ebooks.unidecode.unidecoder import Unidecoder - unidecoder = Unidecoder() - html = unidecoder.decode(html) - - if self.plugin_preprocess: - html = self.input_plugin_preprocess(html) - - return html From 86e68579f32972a2424771a7f3e84d046d630283 Mon Sep 17 00:00:00 2001 From: John Schember <john@nachtimwald.com> Date: Sat, 11 Sep 2010 08:39:40 -0400 Subject: [PATCH 15/43] PDF Input: Fix bug #6734, add additional matching for unicode characters. --- src/calibre/ebooks/conversion/preprocess.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py index f7b803974f..256bcce6fc 100644 --- a/src/calibre/ebooks/conversion/preprocess.py +++ b/src/calibre/ebooks/conversion/preprocess.py @@ -166,6 +166,17 @@ class HTMLPreProcessor(object): (re.compile(u'`\s*(<br.*?>)*\s*O', re.UNICODE), lambda match: u'Ò'), (re.compile(u'`\s*(<br.*?>)*\s*u', re.UNICODE), lambda match: u'ù'), (re.compile(u'`\s*(<br.*?>)*\s*U', re.UNICODE), lambda match: u'Ù'), + # ` with letter before + (re.compile(u'a\s*(<br.*?>)*\s*`', re.UNICODE), lambda match: u'à'), + (re.compile(u'A\s*(<br.*?>)*\s*`', re.UNICODE), lambda match: u'À'), + (re.compile(u'e\s*(<br.*?>)*\s*`', re.UNICODE), lambda match: u'è'), + (re.compile(u'E\s*(<br.*?>)*\s*`', re.UNICODE), lambda match: u'È'), + (re.compile(u'i\s*(<br.*?>)*\s*`', re.UNICODE), lambda match: u'ì'), + (re.compile(u'I\s*(<br.*?>)*\s*`', re.UNICODE), lambda match: u'Ì'), + (re.compile(u'o\s*(<br.*?>)*\s*`', re.UNICODE), lambda match: u'ò'), + (re.compile(u'O\s*(<br.*?>)*\s*`', re.UNICODE), lambda match: u'Ò'), + (re.compile(u'u\s*(<br.*?>)*\s*`', re.UNICODE), lambda match: u'ù'), + (re.compile(u'U\s*(<br.*?>)*\s*`', re.UNICODE), lambda match: u'Ù'), # ´ (re.compile(u'´\s*(<br.*?>)*\s*a', re.UNICODE), lambda match: u'á'), From c4071a245d256642568aa8fc827a8e8516f0df98 Mon Sep 17 00:00:00 2001 From: Charles Haley <> Date: Sat, 11 Sep 2010 13:40:27 +0100 Subject: [PATCH 16/43] Fix library sorting problem introduced by calling model.refresh() in the device connection sequence. 
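The rewritten resort() below replays the saved sort history over Python's stable list sort: entries are applied oldest to newest, so the most recent sort ends up as the primary key and the older ones survive as tie-breakers. A minimal standalone sketch of that replay idea (the records and field names here are illustrative, not calibre's actual model rows):

    from operator import itemgetter

    # Toy rows; calibre's model rows are far richer than this.
    books = [
        {'title': 'B', 'author': 'X'},
        {'title': 'A', 'author': 'Y'},
        {'title': 'A', 'author': 'X'},
    ]
    # Newest sort first, mirroring sort_history.insert(0, ...) below.
    sort_history = [('title', True), ('author', True)]

    # Replaying oldest-to-newest over a stable sort makes the newest
    # entry the primary key; older entries act as tie-breakers.
    for field, ascending in reversed(sort_history):
        books.sort(key=itemgetter(field), reverse=not ascending)

    for book in books:
        print('%(title)s by %(author)s' % book)
    # -> A by X, A by Y, B by X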
--- src/calibre/gui2/library/models.py | 30 ++++++++++++++---------------- 1 file changed, 14 insertions(+), 16 deletions(-) diff --git a/src/calibre/gui2/library/models.py b/src/calibre/gui2/library/models.py index bb47508531..8ad0cd6818 100644 --- a/src/calibre/gui2/library/models.py +++ b/src/calibre/gui2/library/models.py @@ -121,10 +121,8 @@ class BooksModel(QAbstractTableModel): # {{{ def set_device_connected(self, is_connected): self.device_connected = is_connected self.db.refresh_ondevice() - self.refresh() + self.refresh() # does a resort() self.research() - if is_connected and self.sorted_on[0] == 'ondevice': - self.resort() def set_book_on_device_func(self, func): self.book_on_device = func @@ -249,7 +247,7 @@ class BooksModel(QAbstractTableModel): # {{{ # the search and count records for restrictions self.searched.emit(True) - def sort(self, col, order, reset=True): + def sort(self, col, order, reset=True, update_history=True): if not self.db: return self.about_to_be_sorted.emit(self.db.id) @@ -260,23 +258,23 @@ class BooksModel(QAbstractTableModel): # {{{ self.clear_caches() self.reset() self.sorted_on = (label, order) - self.sort_history.insert(0, self.sorted_on) + if update_history: + self.sort_history.insert(0, self.sorted_on) self.sorting_done.emit(self.db.index) def refresh(self, reset=True): - try: - col = self.column_map.index(self.sorted_on[0]) - except: - col = 0 self.db.refresh(field=None) - self.sort(col, self.sorted_on[1], reset=reset) + self.resort(reset=reset) - def resort(self, reset=True): - try: - col = self.column_map.index(self.sorted_on[0]) - except ValueError: - col = 0 - self.sort(col, self.sorted_on[1], reset=reset) + def resort(self, reset=True, history=5): # Bug report needed history=4 :) + for col,ord in reversed(self.sort_history[:history]): + try: + col = self.column_map.index(col) + except ValueError: + col = 0 + self.sort(col, ord, reset=False, update_history=False) + if reset: + self.reset() def research(self, reset=True): self.search(self.last_search, reset=reset) From 96478da323e642febb94c2c1a2c9826a6b3dddb7 Mon Sep 17 00:00:00 2001 From: John Schember <john@nachtimwald.com> Date: Sat, 11 Sep 2010 08:48:47 -0400 Subject: [PATCH 17/43] PML Input: Fix cleanup code. --- src/calibre/ebooks/pml/pmlconverter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/calibre/ebooks/pml/pmlconverter.py b/src/calibre/ebooks/pml/pmlconverter.py index 166695ff5c..3a4454725a 100644 --- a/src/calibre/ebooks/pml/pmlconverter.py +++ b/src/calibre/ebooks/pml/pmlconverter.py @@ -216,7 +216,7 @@ class PML_HTMLizer(object): html = re.sub(r'(?u)%s\s*%s' % (open % '.*?', close), '', html) else: html = re.sub(r'(?u)%s\s*%s' % (open, close), '', html) - html = re.sub(r'<p>\s*</p>', '', html) + html = re.sub(r'(?imu)<p>\s*</p>', '', html) return html def start_line(self): From dc7bc5dd5d890278d7f43377e9df944675888fc6 Mon Sep 17 00:00:00 2001 From: John Schember <john@nachtimwald.com> Date: Sat, 11 Sep 2010 09:01:34 -0400 Subject: [PATCH 18/43] PML Input: Fix bug #6770, put toc link after header so the toc link goes to the correct page.
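The one-line change below swaps the order in which the heading text and its TOC anchor are emitted. With the anchor first, a renderer that breaks the page before the heading can strand the anchor, and therefore the TOC link target, on the previous page. A toy illustration of the two orderings, with made-up helper names:

    # Before: anchor precedes the heading, so a page break before the
    # heading can leave the TOC target on the preceding page.
    def anchor_then_heading(anchor_id, heading):
        return '<span id="%s"></span>%s' % (anchor_id, heading)

    # After (as in the hunk below): anchor follows the heading, so the
    # TOC link lands on the same page as the heading itself.
    def heading_then_anchor(anchor_id, heading):
        return '%s<span id="%s"></span>' % (heading, anchor_id)

    print(anchor_then_heading('toc-1', '<h1>Chapter 1</h1>'))
    print(heading_then_anchor('toc-1', '<h1>Chapter 1</h1>'))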
--- src/calibre/ebooks/pml/pmlconverter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/calibre/ebooks/pml/pmlconverter.py b/src/calibre/ebooks/pml/pmlconverter.py index 3a4454725a..6e479a71ef 100644 --- a/src/calibre/ebooks/pml/pmlconverter.py +++ b/src/calibre/ebooks/pml/pmlconverter.py @@ -556,7 +556,7 @@ class PML_HTMLizer(object): text = t else: self.toc.add_item(os.path.basename(self.file_name), id, value) - text = '<span id="%s"></span>%s' % (id, t) + text = '%s<span id="%s"></span>' % (t, id) elif c == 'm': empty = False src = self.code_value(line) From c2b3c445e17a38b5599393c943036c6c448886da Mon Sep 17 00:00:00 2001 From: John Schember <john@nachtimwald.com> Date: Sat, 11 Sep 2010 09:09:08 -0400 Subject: [PATCH 19/43] PML Input: Remove empty lines. --- src/calibre/ebooks/pml/pmlconverter.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/calibre/ebooks/pml/pmlconverter.py b/src/calibre/ebooks/pml/pmlconverter.py index 6e479a71ef..b0fc15197a 100644 --- a/src/calibre/ebooks/pml/pmlconverter.py +++ b/src/calibre/ebooks/pml/pmlconverter.py @@ -207,6 +207,7 @@ class PML_HTMLizer(object): while html != old: old = html html = self.cleanup_html_remove_redundant(html) + html = re.sub(r'(?imu)^\s*', '', html) return html def cleanup_html_remove_redundant(self, html): From ef8408869cebac380474deb971c4b6910680c895 Mon Sep 17 00:00:00 2001 From: John Schember <john@nachtimwald.com> Date: Sat, 11 Sep 2010 09:13:23 -0400 Subject: [PATCH 20/43] TXT Output: preserve spaces, handle tab character correctly. A tab is reduced to a single space by many renderers. --- src/calibre/ebooks/txt/processor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/calibre/ebooks/txt/processor.py b/src/calibre/ebooks/txt/processor.py index a12e8a0761..dac1e34df7 100644 --- a/src/calibre/ebooks/txt/processor.py +++ b/src/calibre/ebooks/txt/processor.py @@ -77,7 +77,7 @@ def separate_paragraphs_print_formatted(txt): def preserve_spaces(txt): txt = txt.replace(' ', '&nbsp;') - txt = txt.replace('\t', ' ') + txt = txt.replace('\t', '&nbsp;&nbsp;&nbsp;&nbsp;') return txt def opf_writer(path, opf_name, manifest, spine, mi): From a58aa5f0e5f455defefe94c10f372d33763e9b75 Mon Sep 17 00:00:00 2001 From: Charles Haley <> Date: Sat, 11 Sep 2010 15:37:11 +0100 Subject: [PATCH 21/43] Fix bug reported in forum: http://www.mobileread.com/forums/showthread.php?t=98242 cache.refresh still used a parameter when calling search that was removed some releases ago.
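The fix below simply drops an argument that ResultCache.search() no longer accepts; the saved restriction is read from the cache's own state instead. A simplified, self-contained sketch of how that restriction combines with a query, mirroring the search_getting_ids() logic that appears later in this series (compose_query is a made-up name, and the real code runs the result through a query parser rather than returning it):

    def compose_query(query, search_restriction):
        # An empty query searches the bare restriction; otherwise the
        # restriction wraps the query so both conditions must match.
        if not query or not query.strip():
            return search_restriction
        if search_restriction:
            return u'%s (%s)' % (search_restriction, query)
        return query

    print(compose_query('', 'tag:unread'))           # -> tag:unread
    print(compose_query('rating:>3', 'tag:unread'))  # -> tag:unread (rating:>3)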
--- src/calibre/library/caches.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/calibre/library/caches.py b/src/calibre/library/caches.py index b9c1211c7f..2096180f3c 100644 --- a/src/calibre/library/caches.py +++ b/src/calibre/library/caches.py @@ -549,7 +549,7 @@ class ResultCache(SearchQueryParser): self.sort(field, ascending) self._map_filtered = list(self._map) if self.search_restriction: - self.search('', return_matches=False, ignore_search_restriction=False) + self.search('', return_matches=False) def seriescmp(self, sidx, siidx, x, y, library_order=None): try: From 3766f34aab8b6ae8b78570fb51d17bd92edc39a7 Mon Sep 17 00:00:00 2001 From: Kovid Goyal <kovid@kovidgoyal.net> Date: Sat, 11 Sep 2010 11:54:54 -0600 Subject: [PATCH 22/43] Fix regression in filename shortening that caused loss of filename extension --- src/calibre/utils/filenames.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/src/calibre/utils/filenames.py b/src/calibre/utils/filenames.py index 9fd57ab53c..47ccbe73c2 100644 --- a/src/calibre/utils/filenames.py +++ b/src/calibre/utils/filenames.py @@ -54,10 +54,8 @@ def shorten_components_to(length, components): r = x[0] if x is components[-1] else '' else: if x is components[-1]: - b, _, e = x.rpartition('.') - if not b and e: - b = e - e = '' + b, e = os.path.splitext(x) + if e == '.': e = '' r = b[:-delta]+e if r.startswith('.'): r = x[0]+r else: From 6eaa75527b5754cfbb8df833ad3375b724d51cfd Mon Sep 17 00:00:00 2001 From: Charles Haley <> Date: Sat, 11 Sep 2010 21:01:26 +0100 Subject: [PATCH 23/43] resort maximum_resort_levels tweak implemented --- resources/default_tweaks.py | 7 +++++++ src/calibre/gui2/library/models.py | 4 ++-- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/resources/default_tweaks.py b/resources/default_tweaks.py index 66ee4d1471..9d9bc7651c 100644 --- a/resources/default_tweaks.py +++ b/resources/default_tweaks.py @@ -114,3 +114,10 @@ add_new_book_tags_when_importing_books = False # Set the maximum number of tags to show per book in the content server max_content_server_tags_shown=5 + +# Set the maximum number of sort 'levels' that calibre will use to resort the +# library after certain operations such as searches or device insertion. Each +# sort level adds a performance penalty. If the database is large (thousands of +# books) the penalty might be noticeable. If you are not concerned about multi- +# level sorts, and if you are seeing a slowdown, reduce the value of this tweak. 
+maximum_resort_levels = 5 \ No newline at end of file diff --git a/src/calibre/gui2/library/models.py b/src/calibre/gui2/library/models.py index 8ad0cd6818..d2f38cc0a1 100644 --- a/src/calibre/gui2/library/models.py +++ b/src/calibre/gui2/library/models.py @@ -266,8 +266,8 @@ class BooksModel(QAbstractTableModel): # {{{ self.db.refresh(field=None) self.resort(reset=reset) - def resort(self, reset=True, history=5): # Bug report needed history=4 :) - for col,ord in reversed(self.sort_history[:history]): + def resort(self, reset=True): + for col,ord in reversed(self.sort_history[:tweaks['maximum_resort_levels']]): try: col = self.column_map.index(col) except ValueError: From 721e61ef2a1fd090566e232ff9ca65e37400fe44 Mon Sep 17 00:00:00 2001 From: Charles Haley <> Date: Sat, 11 Sep 2010 21:05:05 +0100 Subject: [PATCH 24/43] Clean up tweaks.py formatting (add blank lines) --- resources/default_tweaks.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/resources/default_tweaks.py b/resources/default_tweaks.py index 9d9bc7651c..71bf2c6c37 100644 --- a/resources/default_tweaks.py +++ b/resources/default_tweaks.py @@ -120,4 +120,5 @@ max_content_server_tags_shown=5 # sort level adds a performance penalty. If the database is large (thousands of # books) the penalty might be noticeable. If you are not concerned about multi- # level sorts, and if you are seeing a slowdown, reduce the value of this tweak. -maximum_resort_levels = 5 \ No newline at end of file +maximum_resort_levels = 5 + From e531b517670e90cf99b8255fd47775e50450d7d1 Mon Sep 17 00:00:00 2001 From: Kovid Goyal <kovid@kovidgoyal.net> Date: Sat, 11 Sep 2010 16:16:57 -0600 Subject: [PATCH 25/43] Code organization --- src/calibre/library/caches.py | 48 ++++++++++++++++----------- src/calibre/library/field_metadata.py | 5 ++- 2 files changed, 33 insertions(+), 20 deletions(-) diff --git a/src/calibre/library/caches.py b/src/calibre/library/caches.py index 2096180f3c..eb0ceb3fe4 100644 --- a/src/calibre/library/caches.py +++ b/src/calibre/library/caches.py @@ -141,6 +141,8 @@ class ResultCache(SearchQueryParser): for x in self.iterall(): yield x[idx] + # Search functions {{{ + def universal_set(self): return set([i[0] for i in self._data if i is not None]) @@ -462,6 +464,30 @@ class ResultCache(SearchQueryParser): continue return matches + def search(self, query, return_matches=False): + ans = self.search_getting_ids(query, self.search_restriction) + if return_matches: + return ans + self._map_filtered = ans + + def search_getting_ids(self, query, search_restriction): + q = '' + if not query or not query.strip(): + q = search_restriction + else: + q = query + if search_restriction: + q = u'%s (%s)' % (search_restriction, query) + if not q: + return list(self._map) + matches = sorted(self.parse(q)) + return [id for id in self._map if id in matches] + + def set_search_restriction(self, s): + self.search_restriction = s + + # }}} + def remove(self, id): self._data[id] = None if id in self._map: @@ -551,6 +577,8 @@ class ResultCache(SearchQueryParser): if self.search_restriction: self.search('', return_matches=False) + # Sorting functions {{{ + def seriescmp(self, sidx, siidx, x, y, library_order=None): try: if library_order: @@ -615,24 +643,6 @@ class ResultCache(SearchQueryParser): self._map.sort(cmp=fcmp, reverse=not ascending) self._map_filtered = [id for id in self._map if id in self._map_filtered] - def search(self, query, return_matches=False): - ans = self.search_getting_ids(query, self.search_restriction) - if 
return_matches: - return ans - self._map_filtered = ans + # }}} - def search_getting_ids(self, query, search_restriction): - q = '' - if not query or not query.strip(): - q = search_restriction - else: - q = query - if search_restriction: - q = u'%s (%s)' % (search_restriction, query) - if not q: - return list(self._map) - matches = sorted(self.parse(q)) - return [id for id in self._map if id in matches] - def set_search_restriction(self, s): - self.search_restriction = s diff --git a/src/calibre/library/field_metadata.py b/src/calibre/library/field_metadata.py index 66cdee51f0..096dfa66fe 100644 --- a/src/calibre/library/field_metadata.py +++ b/src/calibre/library/field_metadata.py @@ -69,6 +69,8 @@ class FieldMetadata(dict): VALID_DATA_TYPES = frozenset([None, 'rating', 'text', 'comments', 'datetime', 'int', 'float', 'bool', 'series']) + # Builtin metadata {{{ + _field_metadata = [ ('authors', {'table':'authors', 'column':'name', @@ -287,7 +289,8 @@ class FieldMetadata(dict): 'search_terms':[], 'is_custom':False, 'is_category':False}), - ] + ] + # }}} # search labels that are not db columns search_items = [ 'all', From 7382552d18d604dff3b5472195fa9f3c07b0186c Mon Sep 17 00:00:00 2001 From: Kovid Goyal <kovid@kovidgoyal.net> Date: Sat, 11 Sep 2010 19:11:30 -0600 Subject: [PATCH 26/43] Much faster sorting code --- src/calibre/library/caches.py | 178 ++++++++++++++++++++++++++++++++-- 1 file changed, 171 insertions(+), 7 deletions(-) diff --git a/src/calibre/library/caches.py b/src/calibre/library/caches.py index eb0ceb3fe4..59d5b45d5f 100644 --- a/src/calibre/library/caches.py +++ b/src/calibre/library/caches.py @@ -607,16 +607,22 @@ class ResultCache(SearchQueryParser): y = UNDEFINED_DATE return cmp(x, y) if subsort and ans == 0: - return cmp(self._data[x][11].lower(), self._data[y][11].lower()) + idx = self.FIELD_MAP['sort'] + return cmp(self._data[x][idx].lower(), self._data[y][idx].lower()) return ans - def sort(self, field, ascending, subsort=False): + def sanitize_field_name(self, field): field = field.lower().strip() - if field in ('author', 'tag', 'comment'): - field += 's' - if field == 'date': field = 'timestamp' - elif field == 'title': field = 'sort' - elif field == 'authors': field = 'author_sort' + if field not in self.field_metadata.iterkeys(): + if field in ('author', 'tag', 'comment'): + field += 's' + if field == 'date': field = 'timestamp' + elif field == 'title': field = 'sort' + elif field == 'authors': field = 'author_sort' + return field + + def sort(self, field, ascending, subsort=False): + field = self.sanitize_field_name(field) as_string = field not in ('size', 'rating', 'timestamp') if self.first_sort: @@ -643,6 +649,164 @@ class ResultCache(SearchQueryParser): self._map.sort(cmp=fcmp, reverse=not ascending) self._map_filtered = [id for id in self._map if id in self._map_filtered] + def multisort(self, fields=[], subsort=False): + fields = [(self.sanitize_field_name(x), bool(y)) for x, y in fields] + if subsort and 'sort' not in [x[0] for x in fields]: + fields += [('sort', True)] + if not fields: + fields = [('timestamp', False)] + keys = self.field_metadata.keys() + for f, order in fields: + if f not in keys: + raise ValueError(f + ' not an existing field name') + + keyg = SortKeyGenerator(fields, self.field_metadata, self._data) + if len(fields) == 1: + self._map.sort(key=keyg, reverse=not fields[0][1]) + else: + self._map.sort(key=keyg) + self._map_filtered = [id for id in self._map if id in self._map_filtered] + + +class SortKey(object): + + def 
__init__(self, orders, values): + self.orders, self.values = orders, values + + def __cmp__(self, other): + for i, ascending in enumerate(self.orders): + ans = cmp(self.values[i], other.values[i]) + if ans != 0: + if not ascending: + ans *= -1 + return ans + return 0 + +class SortKeyGenerator(object): + + def __init__(self, fields, field_metadata, data): + self.field_metadata = field_metadata + self.orders = [x[1] for x in fields] + self.entries = [(x[0], field_metadata[x[0]]) for x in fields] + self.library_order = tweaks['title_series_sorting'] == 'library_order' + self.data = data + + def __call__(self, record): + values = tuple(self.itervals(self.data[record])) + if len(values) == 1: + return values[0] + return SortKey(self.orders, values) + + def itervals(self, record): + for name, fm in self.entries: + dt = fm['datatype'] + val = record[fm['rec_index']] + + if dt == 'datetime': + if val is None: + val = UNDEFINED_DATE + + elif dt == 'series': + if val is None: + val = ('', 1) + else: + val = val.lower() + if self.library_order: + val = title_sort(val) + sidx_fm = self.field_metadata[name + '_index'] + sidx = record[sidx_fm['rec_index']] + val = (val, sidx) + + elif dt in ('text', 'comments'): + if val is None: + val = '' + val = val.lower() + yield val + # }}} +if __name__ == '__main__': + # Testing.timing for new multi-sort {{{ + import time + + from calibre.library import db + db = db() + + db.refresh() + + fields = db.field_metadata.keys() + + print fields + + + def do_single_sort(meth, field, order): + if meth == 'old': + db.data.sort(field, order) + else: + db.data.multisort([(field, order)]) + + def test_single_sort(field): + for meth in ('old', 'new'): + ttime = 0 + NUM = 10 + asc = desc = None + for i in range(NUM): + db.data.sort('id', False) + st = time.time() + do_single_sort(meth, field, True) + asc = db.data._map + do_single_sort(meth, field, False) + desc = db.data._map + ttime += time.time() - st + yield (ttime/NUM, asc, desc) + + + print 'Running single sort differentials' + for field in fields: + if field in ('search', 'id', 'news', 'flags'): continue + print '\t', field + old, new = test_single_sort(field) + if old[1] != new[1] or old[2] != new[2]: + print '\t\t', 'Sort failure!' + raise SystemExit(1) + print '\t\t', 'Old:', old[0], 'New:', new[0], 'Ratio: %.2f'%(new[0]/old[0]) + + def do_multi_sort(meth, ms): + if meth == 'new': + db.data.multisort(ms) + else: + for s in reversed(ms): + db.data.sort(*s) + + def test_multi_sort(ms): + for meth in ('old', 'new'): + ttime = 0 + NUM = 10 + for i in range(NUM): + db.data.sort('id', False) + st = time.time() + do_multi_sort(meth, ms) + ttime += time.time() - st + yield (ttime/NUM, db.data._map) + + print 'Running multi-sort differentials' + + for ms in [ + [('timestamp', False), ('author', True), ('title', False)], + [('size', True), ('tags', True), ('author', False)], + [('series', False), ('title', True)], + [('size', True), ('tags', True), ('author', False), ('pubdate', + True), ('tags', False), ('formats', False), ('uuid', True)], + + ]: + print '\t', ms + db.data.sort('id', False) + old, new = test_multi_sort(ms) + if old[1] != new[1]: + print '\t\t', 'Sort failure!' 
+ raise SystemExit() + print '\t\t', 'Old:', old[0], 'New:', new[0], 'Ratio: %.2f'%(new[0]/old[0]) + + # }}} + From 9a06996b16486a3511e4055535a6be48f484a90a Mon Sep 17 00:00:00 2001 From: ldolse <ldolse@yahoo.com> Date: Sun, 12 Sep 2010 11:17:49 +1000 Subject: [PATCH 27/43] minor tweaks to preprocessing, backed out reflow change --- src/calibre/ebooks/conversion/preprocess.py | 4 +-- src/calibre/ebooks/conversion/utils.py | 36 +++++++++++---------- src/calibre/ebooks/pdf/reflow.py | 4 --- 3 files changed, 21 insertions(+), 23 deletions(-) diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py index 6123577191..46308b2ea0 100644 --- a/src/calibre/ebooks/conversion/preprocess.py +++ b/src/calibre/ebooks/conversion/preprocess.py @@ -319,8 +319,8 @@ class HTMLPreProcessor(object): # unwrap hyphenation - moved here so it's executed after header/footer removal if is_pdftohtml: - # unwrap visible dashes and hyphens - don't delete as 50% or more of the time these - # hyphens are for compound words, formatting, etc + # unwrap visible dashes and hyphens - don't delete they are often hyphens for + # for compound words, formatting, etc end_rules.append((re.compile(u'(?<=[-–—])\s*<p>\s*(?=[[a-z\d])'), lambda match: '')) # unwrap/delete soft hyphens end_rules.append((re.compile(u'[­](\s*<p>)+\s*(?=[[a-z\d])'), lambda match: '')) diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py index 68cebb3a11..fb683bdb12 100644 --- a/src/calibre/ebooks/conversion/utils.py +++ b/src/calibre/ebooks/conversion/utils.py @@ -29,16 +29,12 @@ class PreProcessor(object): self.log("marked " + str(self.html_preprocess_sections) + " chapters & titles. - " + str(chap) + ", " + str(title)) return '<h2>'+chap+'</h2>\n<h3>'+title+'</h3>\n' - def chapter_link(self, match): - chap = match.group('sectionlink') - if not chap: - self.html_preprocess_sections = self.html_preprocess_sections + 1 - self.log("marked " + str(self.html_preprocess_sections) + " section markers based on links") - return '<br style="page-break-before:always">' - else: - self.html_preprocess_sections = self.html_preprocess_sections + 1 - self.log("marked " + str(self.html_preprocess_sections) + " section markers based on links. - " + str(chap)) - return '<br clear="all" style="page-break-before:always">\n<h2>'+chap+'</h2>' + def chapter_break(self, match): + chap = match.group('section') + styles = match.group('styles') + self.html_preprocess_sections = self.html_preprocess_sections + 1 + self.log("marked " + str(self.html_preprocess_sections) + " section markers based on punctuation. 
- " + str(chap)) + return '<'+styles+' style="page-break-before:always">'+chap def no_markup(self, raw, percent): ''' @@ -74,7 +70,7 @@ class PreProcessor(object): html = re.sub(r"\s*<span[^>]*>\s*</span>", " ", html) # If more than 40% of the lines are empty paragraphs then delete them to clean up spacing - linereg = re.compile('(?<=<p).*?(?=</p>)', re.IGNORECASE) + linereg = re.compile('(?<=<p).*?(?=</p>)', re.IGNORECASE|re.DOTALL) blankreg = re.compile(r'\s*<p[^>]*>\s*(<(b|i|u)>)?\s*(</(b|i|u)>)?\s*</p>', re.IGNORECASE) blanklines = blankreg.findall(html) lines = linereg.findall(html) @@ -100,8 +96,13 @@ class PreProcessor(object): chapdetect = re.compile(r'(?=</?(br|p))(<(/?br|p)[^>]*>)\s*(<(i|b|u)>){0,2}\s*(<span[^>]*>)?\s*(?P<chap>(<(i|b|u)>){0,2}s*(<span[^>]*>)?\s*.?(Introduction|Acknowledgements|Chapter|Epilogue|Volume|Prologue|Book\s|Part\s|Dedication)\s*([\d\w-]+\:?\s*){0,8}\s*(</(i|b|u)>){0,2})\s*(</span>)?s*(</span>)?\s*(</(i|b|u)>){0,2}\s*(</(p|/?br)>)\s*(<(/?br|p)[^>]*>\s*(<(i|b|u)>){0,2}\s*(<span[^>]*>)?\s*(?P<title>(<(i|b|u)>){0,2}(\s*[\w\'\"-]+){1,5}\s*(</(i|b|u)>){0,2})\s*(</span>)?\s*(</(i|b|u)>){0,2}\s*(</(br|p)>))?', re.IGNORECASE) html = chapdetect.sub(self.chapter_head, html) if self.html_preprocess_sections < 10: - self.log("not enough chapters, only " + str(self.html_preprocess_sections) + ", trying a more aggressive pattern") - chapdetect2 = re.compile(r'(?=</?(br|p))(<(/?br|p)[^>]*>)\s*(<(i|b|u)>){0,2}\s*(<span[^>]*>)?\s*(?P<chap>(<(i|b|u)>){0,2}\s*.?(([A-Z#-]+\s*){1,9}|\d+\.?|(CHAPTER\s*([\dA-Z\-\'\"\?\.!#,]+\s*){1,10}))\s*(</(i|b|u)>){0,2})\s*(</span>)?\s*(</(i|b|u)>){0,2}\s*(</(p|/?br)>)\s*(<(/?br|p)[^>]*>\s*(<(i|b|u)>){0,2}\s*(<span[^>]*>)?\s*(?P<title>(<(i|b|u)>){0,2}(\s*[\w\'\"-]+){1,5}\s*(</(i|b|u)>){0,2})\s*(</span>)?\s*(</(i|b|u)>){0,2}\s*(</(br|p)>))?', re.UNICODE) + self.log("not enough chapters, only " + str(self.html_preprocess_sections) + ", trying numeric chapters") + chapdetect2 = re.compile(r'(?=</?(br|p))(<(/?br|p)[^>]*>)\s*(<(i|b|u)>){0,2}\s*(<span[^>]*>)?\s*(?P<chap>(<(i|b|u)>){0,2}\s*.?(\d+\.?|(CHAPTER\s*([\dA-Z\-\'\"\?\.!#,]+\s*){1,10}))\s*(</(i|b|u)>){0,2})\s*(</span>)?\s*(</(i|b|u)>){0,2}\s*(</(p|/?br)>)\s*(<(/?br|p)[^>]*>\s*(<(i|b|u)>){0,2}\s*(<span[^>]*>)?\s*(?P<title>(<(i|b|u)>){0,2}(\s*[\w\'\"-]+){1,5}\s*(</(i|b|u)>){0,2})\s*(</span>)?\s*(</(i|b|u)>){0,2}\s*(</(br|p)>))?', re.UNICODE) + html = chapdetect2.sub(self.chapter_head, html) + + if self.html_preprocess_sections < 10: + self.log("not enough chapters, only " + str(self.html_preprocess_sections) + ", trying with uppercase words") + chapdetect2 = re.compile(r'(?=</?(br|p))(<(/?br|p)[^>]*>)\s*(<(i|b|u)>){0,2}\s*(<span[^>]*>)?\s*(?P<chap>(<(i|b|u)>){0,2}\s*.?(([A-Z#-]+\s*){1,9})\s*(</(i|b|u)>){0,2})\s*(</span>)?\s*(</(i|b|u)>){0,2}\s*(</(p|/?br)>)\s*(<(/?br|p)[^>]*>\s*(<(i|b|u)>){0,2}\s*(<span[^>]*>)?\s*(?P<title>(<(i|b|u)>){0,2}(\s*[\w\'\"-]+){1,5}\s*(</(i|b|u)>){0,2})\s*(</span>)?\s*(</(i|b|u)>){0,2}\s*(</(br|p)>))?', re.UNICODE) html = chapdetect2.sub(self.chapter_head, html) # # Unwrap lines using punctation if the median length of all lines is less than 200 @@ -110,13 +111,14 @@ class PreProcessor(object): unwrap = re.compile(r"(?<=.{%i}[a-z,;:\IA])\s*</(span|p|div)>\s*(</(p|span|div)>)?\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)</(span|p|div)>\s*){0,3}\s*<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*" % length, re.UNICODE) if length < 200: self.log("Unwrapping Lines") - html = unwrap.sub(' ', html) + html = unwrap.sub(' ', html) + # 
If still no sections after unwrapping lines break on lines with no punctuation if self.html_preprocess_sections < 10: - self.log("not enough chapters, only " + str(self.html_preprocess_sections) + ", splitting based on punctuation") + self.log(str(self.html_preprocess_sections) + " split points marked, matching based on punctuation") #self.log(html) - chapdetect3 = re.compile(r'(<p[^>]*>)\s*(<(i|b|u)>){0,2}\s*(<span[^>]*>)?\s*(?P<chap>(<(i|b|u)>){0,2}\s*.?([a-z]+\s*){1,5}\s*(</(i|b|u)>){0,2})\s*(</span>)?\s*(</(i|b|u)>){0,2}\s*(</p>)(?P<title>)?', re.IGNORECASE) - html = chapdetect3.sub(self.chapter_head, html) + chapdetect3 = re.compile(r'<(?P<styles>(p|div)[^>]*)>\s*(?P<section>(<span[^>]*>)?\s*(<(i|b|u)>){0,2}\s*(<span[^>]*>)?\s*(<(i|b|u)>){0,2}\s*(<span[^>]*>)?\s*.?([a-z#-*]+\s*){1,5}\s*\s*(</span>)?(</(i|b|u)>){0,2}\s*(</span>)?\s*(</(i|b|u)>){0,2}\s*(</span>)?\s*</(p|div)>)', re.IGNORECASE) + html = chapdetect3.sub(self.chapter_break, html) # search for places where a first or second level heading is immediately followed by another # top level heading. demote the second heading to h3 to prevent splitting between chapter # headings and titles, images, etc diff --git a/src/calibre/ebooks/pdf/reflow.py b/src/calibre/ebooks/pdf/reflow.py index 36848ddb8b..584d631d0b 100644 --- a/src/calibre/ebooks/pdf/reflow.py +++ b/src/calibre/ebooks/pdf/reflow.py @@ -408,10 +408,6 @@ class Page(object): # Fraction of text height that two strings' bottoms can differ by # for them to be considered to be part of the same text fragment LINE_FACTOR = 0.4 - - # Percentage of the page heigth which should be considered header - # or footer to be discarded from reflow considerations - HEAD_FOOTER_MARGIN # Multiplies the average line height when determining row height # of a particular element to detect columns. From bcd0430791f44ec926910eeb8bb18d7cbbff5fc9 Mon Sep 17 00:00:00 2001 From: Charles Haley <> Date: Sun, 12 Sep 2010 13:37:28 +0100 Subject: [PATCH 28/43] Starting from Kovid's multisort: 1) change _map_filtered to an ordered dict to make 'in' operations much faster 2) add a method to field_metadata to return a dict of database fields. 3) fix a couple of places where field_metadata needed to be used. 
4) make changes so gui2.library.models.resort uses multisort --- src/calibre/gui2/library/models.py | 14 +++---- src/calibre/library/caches.py | 59 ++++++++++++++++----------- src/calibre/library/database2.py | 1 + src/calibre/library/field_metadata.py | 3 ++ 4 files changed, 45 insertions(+), 32 deletions(-) diff --git a/src/calibre/gui2/library/models.py b/src/calibre/gui2/library/models.py index d2f38cc0a1..d18516493a 100644 --- a/src/calibre/gui2/library/models.py +++ b/src/calibre/gui2/library/models.py @@ -247,7 +247,7 @@ class BooksModel(QAbstractTableModel): # {{{ # the search and count records for restrictions self.searched.emit(True) - def sort(self, col, order, reset=True, update_history=True): + def sort(self, col, order, reset=True): if not self.db: return self.about_to_be_sorted.emit(self.db.id) @@ -258,8 +258,7 @@ class BooksModel(QAbstractTableModel): # {{{ self.clear_caches() self.reset() self.sorted_on = (label, order) - if update_history: - self.sort_history.insert(0, self.sorted_on) + self.sort_history.insert(0, self.sorted_on) self.sorting_done.emit(self.db.index) def refresh(self, reset=True): @@ -267,12 +266,9 @@ class BooksModel(QAbstractTableModel): # {{{ self.resort(reset=reset) def resort(self, reset=True): - for col,ord in reversed(self.sort_history[:tweaks['maximum_resort_levels']]): - try: - col = self.column_map.index(col) - except ValueError: - col = 0 - self.sort(col, ord, reset=False, update_history=False) + if not self.db: + return + self.db.multisort(self.sort_history[:tweaks['maximum_resort_levels']]) if reset: self.reset() diff --git a/src/calibre/library/caches.py b/src/calibre/library/caches.py index 59d5b45d5f..c342d5ff15 100644 --- a/src/calibre/library/caches.py +++ b/src/calibre/library/caches.py @@ -20,6 +20,7 @@ from calibre.utils.search_query_parser import SearchQueryParser from calibre.utils.pyparsing import ParseException from calibre.ebooks.metadata import title_sort from calibre import fit_image +from calibre.utils.ordered_dict import OrderedDict class CoverCache(Thread): @@ -112,7 +113,8 @@ class ResultCache(SearchQueryParser): ''' def __init__(self, FIELD_MAP, field_metadata): self.FIELD_MAP = FIELD_MAP - self._map = self._map_filtered = self._data = [] + self._map = self._data = [] + self._map_filtered = OrderedDict() self.first_sort = True self.search_restriction = '' self.field_metadata = field_metadata @@ -122,14 +124,14 @@ class ResultCache(SearchQueryParser): self.build_numeric_relop_dict() def __getitem__(self, row): - return self._data[self._map_filtered[row]] + return self._data[self._map_filtered.keys()[row]] def __len__(self): return len(self._map_filtered) def __iter__(self): for id in self._map_filtered: - yield self._data[id] + yield id def iterall(self): for x in self._data: @@ -468,7 +470,7 @@ class ResultCache(SearchQueryParser): ans = self.search_getting_ids(query, self.search_restriction) if return_matches: return ans - self._map_filtered = ans + self._map_filtered = OrderedDict.fromkeys(ans, True) def search_getting_ids(self, query, search_restriction): q = '' @@ -480,7 +482,7 @@ class ResultCache(SearchQueryParser): q = u'%s (%s)' % (search_restriction, query) if not q: return list(self._map) - matches = sorted(self.parse(q)) + matches = self.parse(q) return [id for id in self._map if id in matches] def set_search_restriction(self, s): @@ -493,18 +495,18 @@ class ResultCache(SearchQueryParser): if id in self._map: self._map.remove(id) if id in self._map_filtered: - self._map_filtered.remove(id) + del 
self._map_filtered[id] def set(self, row, col, val, row_is_id=False): - id = row if row_is_id else self._map_filtered[row] + id = row if row_is_id else self._map_filtered.keys()[row] self._data[id][col] = val def get(self, row, col, row_is_id=False): - id = row if row_is_id else self._map_filtered[row] + id = row if row_is_id else self._map_filtered.keys()[row] return self._data[id][col] def index(self, id, cache=False): - x = self._map if cache else self._map_filtered + x = self._map if cache else self._map_filtered.keys() return x.index(id) def row(self, id): @@ -544,13 +546,18 @@ class ResultCache(SearchQueryParser): self._data[id].append(db.has_cover(id, index_is_id=True)) self._data[id].append(db.book_on_device_string(id)) self._map[0:0] = ids - self._map_filtered[0:0] = ids + mf = OrderedDict() + for id in ids: + mf[id] = True + for id in self._map_filtered: + mf[id] = True + self._map_filtered = mf def books_deleted(self, ids): for id in ids: self._data[id] = None if id in self._map: self._map.remove(id) - if id in self._map_filtered: self._map_filtered.remove(id) + if id in self._map_filtered: del self._map_filtered[id] def count(self): return len(self._map) @@ -573,7 +580,7 @@ class ResultCache(SearchQueryParser): self._map = [i[0] for i in self._data if i is not None] if field is not None: self.sort(field, ascending) - self._map_filtered = list(self._map) + self._map_filtered = OrderedDict.fromkeys(self._map, True) if self.search_restriction: self.search('', return_matches=False) @@ -644,10 +651,14 @@ class ResultCache(SearchQueryParser): self.FIELD_MAP['series_index'], library_order=tweaks['title_series_sorting'] == 'library_order') else: - fcmp = functools.partial(self.cmp, self.FIELD_MAP[field], + fcmp = functools.partial(self.cmp, self.field_metadata[field]['rec_index'], subsort=subsort, asstr=as_string) self._map.sort(cmp=fcmp, reverse=not ascending) - self._map_filtered = [id for id in self._map if id in self._map_filtered] + mf = OrderedDict() + for id in self._map: + if id in self._map_filtered: + mf[id] = True + self._map_filtered = mf def multisort(self, fields=[], subsort=False): fields = [(self.sanitize_field_name(x), bool(y)) for x, y in fields] @@ -655,7 +666,7 @@ class ResultCache(SearchQueryParser): fields += [('sort', True)] if not fields: fields = [('timestamp', False)] - keys = self.field_metadata.keys() + keys = self.field_metadata.field_keys() for f, order in fields: if f not in keys: raise ValueError(f + ' not an existing field name') @@ -665,7 +676,11 @@ class ResultCache(SearchQueryParser): self._map.sort(key=keyg, reverse=not fields[0][1]) else: self._map.sort(key=keyg) - self._map_filtered = [id for id in self._map if id in self._map_filtered] + mf = OrderedDict() + for id in self._map: + if id in self._map_filtered: + mf[id] = id + self._map_filtered = mf class SortKey(object): @@ -677,16 +692,14 @@ class SortKey(object): for i, ascending in enumerate(self.orders): ans = cmp(self.values[i], other.values[i]) if ans != 0: - if not ascending: - ans *= -1 - return ans + return ans * ascending return 0 class SortKeyGenerator(object): def __init__(self, fields, field_metadata, data): self.field_metadata = field_metadata - self.orders = [x[1] for x in fields] + self.orders = [-1 if x[1] else 1 for x in fields] self.entries = [(x[0], field_metadata[x[0]]) for x in fields] self.library_order = tweaks['title_series_sorting'] == 'library_order' self.data = data @@ -735,7 +748,7 @@ if __name__ == '__main__': db.refresh() - fields = db.field_metadata.keys() + fields 
= db.field_metadata.field_keys() print fields @@ -765,7 +778,7 @@ if __name__ == '__main__': print 'Running single sort differentials' for field in fields: if field in ('search', 'id', 'news', 'flags'): continue - print '\t', field + print '\t', field, db.field_metadata[field]['datatype'] old, new = test_single_sort(field) if old[1] != new[1] or old[2] != new[2]: print '\t\t', 'Sort failure!' @@ -797,7 +810,7 @@ if __name__ == '__main__': [('size', True), ('tags', True), ('author', False)], [('series', False), ('title', True)], [('size', True), ('tags', True), ('author', False), ('pubdate', - True), ('tags', False), ('formats', False), ('uuid', True)], + True), ('series', False), ('formats', False), ('uuid', True)], ]: print '\t', ms diff --git a/src/calibre/library/database2.py b/src/calibre/library/database2.py index 4106f8c965..8a5ab75c3c 100644 --- a/src/calibre/library/database2.py +++ b/src/calibre/library/database2.py @@ -311,6 +311,7 @@ class LibraryDatabase2(LibraryDatabase, SchemaUpgrade, CustomColumns): self.search_getting_ids = self.data.search_getting_ids self.refresh = functools.partial(self.data.refresh, self) self.sort = self.data.sort + self.multisort = self.data.multisort self.index = self.data.index self.refresh_ids = functools.partial(self.data.refresh_ids, self) self.row = self.data.row diff --git a/src/calibre/library/field_metadata.py b/src/calibre/library/field_metadata.py index 096dfa66fe..276a6ba971 100644 --- a/src/calibre/library/field_metadata.py +++ b/src/calibre/library/field_metadata.py @@ -335,6 +335,9 @@ class FieldMetadata(dict): def keys(self): return self._tb_cats.keys() + def field_keys(self): + return [k for k in self._tb_cats.keys() if self._tb_cats[k]['kind']=='field'] + def iterkeys(self): for key in self._tb_cats: yield key From 8b09f4c293e82ff797635320c42487d9be190831 Mon Sep 17 00:00:00 2001 From: Charles Haley <> Date: Sun, 12 Sep 2010 13:37:28 +0100 Subject: [PATCH 29/43] Restore the second 'tags' to the tests --- src/calibre/library/caches.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/calibre/library/caches.py b/src/calibre/library/caches.py index c342d5ff15..882de975db 100644 --- a/src/calibre/library/caches.py +++ b/src/calibre/library/caches.py @@ -810,7 +810,7 @@ if __name__ == '__main__': [('size', True), ('tags', True), ('author', False)], [('series', False), ('title', True)], [('size', True), ('tags', True), ('author', False), ('pubdate', - True), ('series', False), ('formats', False), ('uuid', True)], + True), ('tags', False), ('formats', False), ('uuid', True)], ]: print '\t', ms From 5626418d1a6993b16f3d6a83c22a761a7490b7ee Mon Sep 17 00:00:00 2001 From: Charles Haley <> Date: Sun, 12 Sep 2010 14:51:21 +0100 Subject: [PATCH 30/43] Correct regression in device handling -- sorting after sending a book. 
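The switch of _map_filtered from a plain list to an ordered dictionary (patch 28 above) is easiest to see in isolation. Below is a minimal sketch of the idea, using the standard collections.OrderedDict and invented names; the real ResultCache imports calibre.utils.ordered_dict for older interpreters and carries far more state:

    from collections import OrderedDict

    class FilteredView(object):
        # _map holds every book id in the current sort order; _map_filtered
        # holds the ids matching the current search, in display order.
        # Keeping the filtered view as OrderedDict keys preserves that order
        # while turning 'id in view' into a hash lookup instead of a list scan.
        def __init__(self, all_ids):
            self._map = list(all_ids)
            self._map_filtered = OrderedDict.fromkeys(all_ids, True)

        def apply_search(self, matching_ids):
            self._map_filtered = OrderedDict.fromkeys(matching_ids, True)

        def resorted(self, sorted_ids):
            # Mirrors the loop the patch adds after each sort: walk the full
            # map in its new order, keeping only ids still in the filter.
            self._map = list(sorted_ids)
            mf = OrderedDict()
            for book_id in self._map:
                if book_id in self._map_filtered:  # O(1) membership test
                    mf[book_id] = True
            self._map_filtered = mf

        def __getitem__(self, row):
            return list(self._map_filtered)[row]  # the patch indexes .keys()[row]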
--- src/calibre/gui2/library/models.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/calibre/gui2/library/models.py b/src/calibre/gui2/library/models.py index d18516493a..c746a5aa56 100644 --- a/src/calibre/gui2/library/models.py +++ b/src/calibre/gui2/library/models.py @@ -1024,6 +1024,11 @@ class DeviceBooksModel(BooksModel): # {{{ if reset: self.reset() + def resort(self, reset=True): + if self.sorted_on: + self.sort(self.column_map.index(self.sorted_on[0]), + self.sorted_on[1], reset=reset) + def columnCount(self, parent): if parent and parent.isValid(): return 0 From cdb696f63bc39b9327abe809fa71e94baa6e0b86 Mon Sep 17 00:00:00 2001 From: ldolse <ldolse@yahoo.com> Date: Mon, 13 Sep 2010 00:12:21 +1000 Subject: [PATCH 31/43] enhanced preprocessing class - looking pretty good --- src/calibre/ebooks/conversion/preprocess.py | 18 ++-- src/calibre/ebooks/conversion/utils.py | 98 +++++++++++++++------ 2 files changed, 82 insertions(+), 34 deletions(-) diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py index 46308b2ea0..f6277956c8 100644 --- a/src/calibre/ebooks/conversion/preprocess.py +++ b/src/calibre/ebooks/conversion/preprocess.py @@ -62,7 +62,6 @@ def wrap_lines(match): else: return ital+' ' - def line_length(format, raw, percent): ''' raw is the raw text to find the line length to use for wrapping. @@ -76,6 +75,8 @@ def line_length(format, raw, percent): linere = re.compile('(?<=<p).*?(?=</p>)', re.DOTALL) elif format == 'pdf': linere = re.compile('(?<=<br>).*?(?=<br>)', re.DOTALL) + elif format == 'spanned_html': + linere = re.compile('(?<=<span).*?(?=</span>)', re.DOTALL) lines = linere.findall(raw) lengths = [] @@ -223,14 +224,15 @@ class HTMLPreProcessor(object): # Remove page links (re.compile(r'<a name=\d+></a>', re.IGNORECASE), lambda match: ''), # Remove <hr> tags - (re.compile(r'<hr.*?>', re.IGNORECASE), lambda match: '<br />'), + (re.compile(r'<hr.*?>', re.IGNORECASE), lambda match: '<br>'), # Remove gray background (re.compile(r'<BODY[^<>]+>'), lambda match : '<BODY>'), # Detect Chapters to match default XPATH in GUI - (re.compile(r'(?=<(/?br|p))(<(/?br|p)[^>]*)?>\s*(?P<chap>(<(i|b)>(<(i|b)>)?)?.?(Introduction|Chapter|Epilogue|Prologue|Book|Part|Dedication|Volume|Preface|Acknowledgments)\s*([\d\w-]+\s*){0,3}\s*(</(i|b)>(</(i|b)>)?)?)\s*(</?(br|p)[^>]*>\s*){1,3}\s*(?P<title>(<(i|b)>)?(\s*\w+){1,4}\s*(</(i|b)>)?\s*(</?(br|p)[^>]*>))?', re.IGNORECASE), chap_head), - (re.compile(r'<br\s*/?>\s*(?P<chap>([A-Z]\s+){4,}\s*([\d\w-]+\s*){0,3}\s*)\s*(<br>\s*){1,3}\s*(?P<title>(<(i|b)>)?(\s*\w+){1,4}\s*(</(i|b)>)?\s*(</?(br|p)[^>]*>))?'), chap_head), + (re.compile(r'<br>\s*(?P<chap>(<[ibu]>){0,2}\s*.?(Introduction|Chapter|Epilogue|Prologue|Book|Part|Dedication|Volume|Preface|Acknowledgments)\s*([\d\w-]+\s*){0,3}\s*(</[ibu]>){0,2})\s*(<br>\s*){1,3}\s*(?P<title>(<[ibu]>){0,2}(\s*\w+){1,4}\s*(</[ibu]>){0,2}\s*<br>)?', re.IGNORECASE), chap_head), + # Cover the case where every letter in a chapter title is separated by a space + (re.compile(r'<br>\s*(?P<chap>([A-Z]\s+){4,}\s*([\d\w-]+\s*){0,3}\s*)\s*(<br>\s*){1,3}\s*(?P<title>(<[ibu]>){0,2}(\s*\w+){1,4}\s*(</[ibu]>){0,2}\s*(<br>))?'), chap_head), # Have paragraphs show better (re.compile(r'<br.*?>'), lambda match : '<p>'), @@ -238,8 +240,7 @@ class HTMLPreProcessor(object): (re.compile(u'(?<=[\.,;\?!”"\'])[\s^ ]*(?=<)'), lambda match: ' '), # Add space before and after italics (re.compile(u'(?<!“)<i>'), lambda match: ' <i>'), - (re.compile(r'</i>(?=\w)'), lambda match: '</i> 
'), - + (re.compile(r'</i>(?=\w)'), lambda match: '</i> '), ] # Fix Book Designer markup @@ -327,10 +328,11 @@ class HTMLPreProcessor(object): # unwrap/delete soft hyphens with formatting end_rules.append((re.compile(u'[­]\s*(</(i|u|b)>)+(\s*<p>)+\s*(<(i|u|b)>)+\s*(?=[[a-z\d])'), lambda match: '')) - # Make the more aggressive chapter marking regex optional with the preprocess option to reduce false positives + # Make the more aggressive chapter marking regex optional with the preprocess option to + # reduce false positives and move after header/footer removal if getattr(self.extra_opts, 'preprocess_html', None): if is_pdftohtml: - end_rules.append((re.compile(r'(?=<(/?br|p|hr))(<(/?br|p|hr)[^>]*)?>\s*(<(i|b)>(<(i|b)>)?)?\s*(?P<chap>([A-Z-\'"!]{3,})\s*(\d+|[A-Z]+(\s*[A-Z]+)?)?|\d+\.?\s*([\d\w-]+\s*){0,4}\s*)\s*(</(i|b)>(</(i|b)>)?)?\s*(</?p[^>]*>|<br[^>]*>)\n?((?=(<i>)?\s*\w+(\s+\w+)?(</i>)?(<br[^>]*>|</?p[^>]*>))((?P<title>.*)(<br[^>]*>|</?p[^>]*>)))?'), chap_head)) + end_rules.append((re.compile(r'<p>\s*(?P<chap>(<[ibu]>){0,2}\s*([A-Z \'"!]{3,})\s*([\dA-Z:]+\s){0,4}\s*(</[ibu]>){0,2})\s*<p>\s*(?P<title>(<[ibu]>){0,2}(\s*\w+){1,4}\s*(</[ibu]>){0,2}\s*<p>)?'), chap_head),) if getattr(self.extra_opts, 'unwrap_factor', 0.0) > 0.01: length = line_length('pdf', html, getattr(self.extra_opts, 'unwrap_factor')) diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py index fb683bdb12..abfa43e7ed 100644 --- a/src/calibre/ebooks/conversion/utils.py +++ b/src/calibre/ebooks/conversion/utils.py @@ -8,10 +8,10 @@ __docformat__ = 'restructuredtext en' import re from calibre.ebooks.conversion.preprocess import line_length from calibre.utils.logging import default_log -from lxml import etree class PreProcessor(object): html_preprocess_sections = 0 + found_indents = 0 def __init__(self, args): self.args = args @@ -22,11 +22,11 @@ class PreProcessor(object): title = match.group('title') if not title: self.html_preprocess_sections = self.html_preprocess_sections + 1 - self.log("marked " + str(self.html_preprocess_sections) + " chapters. - " + str(chap)) + self.log("found " + str(self.html_preprocess_sections) + " chapters. - " + str(chap)) return '<h2>'+chap+'</h2>\n' else: self.html_preprocess_sections = self.html_preprocess_sections + 1 - self.log("marked " + str(self.html_preprocess_sections) + " chapters & titles. - " + str(chap) + ", " + str(title)) + self.log("found " + str(self.html_preprocess_sections) + " chapters & titles. - " + str(chap) + ", " + str(title)) return '<h2>'+chap+'</h2>\n<h3>'+title+'</h3>\n' def chapter_break(self, match): @@ -35,7 +35,22 @@ class PreProcessor(object): self.html_preprocess_sections = self.html_preprocess_sections + 1 self.log("marked " + str(self.html_preprocess_sections) + " section markers based on punctuation. - " + str(chap)) return '<'+styles+' style="page-break-before:always">'+chap - + + def insert_indent(self, match): + pstyle = match.group('formatting') + span = match.group('span') + self.found_indents = self.found_indents + 1 + if pstyle: + if not span: + return '<p '+pstyle+' style="text-indent:3%">' + else: + return '<p '+pstyle+' style="text-indent:3%">'+span + else: + if not span: + return '<p style="text-indent:3%">' + else: + return '<p style="text-indent:3%">'+span + def no_markup(self, raw, percent): ''' Detects total marked up line endings in the file. 
raw is the text to @@ -48,7 +63,7 @@ class PreProcessor(object): line_end = line_end_ere.findall(raw) tot_htm_ends = len(htm_end) tot_ln_fds = len(line_end) - self.log("*** There are " + str(tot_ln_fds) + " total Line feeds, and " + str(tot_htm_ends) + " marked endings***") + self.log("There are " + str(tot_ln_fds) + " total Line feeds, and " + str(tot_htm_ends) + " marked up endings") if percent > 1: percent = 1 @@ -56,13 +71,18 @@ class PreProcessor(object): percent = 0 min_lns = tot_ln_fds * percent - self.log("There must be fewer than " + str(min_lns) + " unmarked lines to return true") + self.log("There must be fewer than " + str(min_lns) + " unmarked lines to add markup") if min_lns > tot_htm_ends: return True def __call__(self, html): self.log("********* Preprocessing HTML *********") - # remove non-breaking spaces + # Replace series of non-breaking spaces with text-indent + txtindent = re.compile(ur'<p(?P<formatting>[^>]*)>\s*(?P<span>(<span[^>]*>\s*)+)?\s*(\u00a0){2,}', re.IGNORECASE) + html = txtindent.sub(self.insert_indent, html) + if self.found_indents > 1: + self.log("replaced "+str(self.found_indents)+ " nbsp indents with inline styles") + # remove remaining non-breaking spaces html = re.sub(ur'\u00a0', ' ', html) # Get rid of empty <o:p> tags to simplify other processing html = re.sub(ur'\s*<o:p>\s*</o:p>', ' ', html) @@ -83,41 +103,67 @@ class PreProcessor(object): html = re.sub(r"\s*</p>", "</p>\n", html) html = re.sub(r"\s*<p>\s*", "\n<p>", html) - # some lit files don't have any <p> tags or equivalent, check and - # mark up line endings if required before proceeding + # some lit files don't have any <p> tags or equivalent (generally just plain text between + # <pre> tags), check and mark up line endings if required before proceeding if self.no_markup(html, 0.1): self.log("not enough paragraph markers, adding now") add_markup = re.compile('(?<!>)(\n)') html = add_markup.sub('</p>\n<p>', html) # detect chapters/sections to match xpath or splitting logic + heading = re.compile('<h(1|2)[^>]*>', re.IGNORECASE) + self.html_preprocess_sections = len(heading.findall(html)) + self.log("found " + str(self.html_preprocess_sections) + " pre-existing headings") # - # Start with most typical chapter headings - chapdetect = re.compile(r'(?=</?(br|p))(<(/?br|p)[^>]*>)\s*(<(i|b|u)>){0,2}\s*(<span[^>]*>)?\s*(?P<chap>(<(i|b|u)>){0,2}s*(<span[^>]*>)?\s*.?(Introduction|Acknowledgements|Chapter|Epilogue|Volume|Prologue|Book\s|Part\s|Dedication)\s*([\d\w-]+\:?\s*){0,8}\s*(</(i|b|u)>){0,2})\s*(</span>)?s*(</span>)?\s*(</(i|b|u)>){0,2}\s*(</(p|/?br)>)\s*(<(/?br|p)[^>]*>\s*(<(i|b|u)>){0,2}\s*(<span[^>]*>)?\s*(?P<title>(<(i|b|u)>){0,2}(\s*[\w\'\"-]+){1,5}\s*(</(i|b|u)>){0,2})\s*(</span>)?\s*(</(i|b|u)>){0,2}\s*(</(br|p)>))?', re.IGNORECASE) - html = chapdetect.sub(self.chapter_head, html) + # Start with most typical chapter headings, get more aggressive until one works + if self.html_preprocess_sections < 10: + chapdetect = re.compile(r'(?=</?(br|p))(<(/?br|p)[^>]*>)\s*(<[ibu]>){0,2}\s*(<span[^>]*>)?\s*(?P<chap>(<[ibu]>){0,2}s*(<span[^>]*>)?\s*.?(Introduction|Synopsis|Acknowledgements|Chapter|Epilogue|Volume|Prologue|Book\s|Part\s|Dedication)\s*([\d\w-]+\:?\s*){0,8}\s*(</[ibu]>){0,2})\s*(</span>)?s*(</span>)?\s*(</[ibu]>){0,2}\s*(</(p|/?br)>)\s*(<(/?br|p)[^>]*>\s*(<[ibu]>){0,2}\s*(<span[^>]*>)?\s*(?P<title>(<[ibu]>){0,2}(\s*[\w\'\"-]+){1,5}\s*(</[ibu]>){0,2})\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</(br|p)>))?', re.IGNORECASE) + html = chapdetect.sub(self.chapter_head, html) if 
self.html_preprocess_sections < 10: self.log("not enough chapters, only " + str(self.html_preprocess_sections) + ", trying numeric chapters") - chapdetect2 = re.compile(r'(?=</?(br|p))(<(/?br|p)[^>]*>)\s*(<(i|b|u)>){0,2}\s*(<span[^>]*>)?\s*(?P<chap>(<(i|b|u)>){0,2}\s*.?(\d+\.?|(CHAPTER\s*([\dA-Z\-\'\"\?\.!#,]+\s*){1,10}))\s*(</(i|b|u)>){0,2})\s*(</span>)?\s*(</(i|b|u)>){0,2}\s*(</(p|/?br)>)\s*(<(/?br|p)[^>]*>\s*(<(i|b|u)>){0,2}\s*(<span[^>]*>)?\s*(?P<title>(<(i|b|u)>){0,2}(\s*[\w\'\"-]+){1,5}\s*(</(i|b|u)>){0,2})\s*(</span>)?\s*(</(i|b|u)>){0,2}\s*(</(br|p)>))?', re.UNICODE) + chapdetect2 = re.compile(r'(?=</?(br|p))(<(/?br|p)[^>]*>)\s*(<[ibu]>){0,2}\s*(<span[^>]*>)?\s*(?P<chap>(<[ibu]>){0,2}\s*.?(\d+\.?|(CHAPTER\s*([\dA-Z\-\'\"\?\.!#,]+\s*){1,10}))\s*(</[ibu]>){0,2})\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</(p|/?br)>)\s*(<(/?br|p)[^>]*>\s*(<[ibu]>){0,2}\s*(<span[^>]*>)?\s*(?P<title>(<[ibu]>){0,2}(\s*[\w\'\"-]+){1,5}\s*(</[ibu]>){0,2})\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</(br|p)>))?', re.UNICODE) html = chapdetect2.sub(self.chapter_head, html) if self.html_preprocess_sections < 10: self.log("not enough chapters, only " + str(self.html_preprocess_sections) + ", trying with uppercase words") - chapdetect2 = re.compile(r'(?=</?(br|p))(<(/?br|p)[^>]*>)\s*(<(i|b|u)>){0,2}\s*(<span[^>]*>)?\s*(?P<chap>(<(i|b|u)>){0,2}\s*.?(([A-Z#-]+\s*){1,9})\s*(</(i|b|u)>){0,2})\s*(</span>)?\s*(</(i|b|u)>){0,2}\s*(</(p|/?br)>)\s*(<(/?br|p)[^>]*>\s*(<(i|b|u)>){0,2}\s*(<span[^>]*>)?\s*(?P<title>(<(i|b|u)>){0,2}(\s*[\w\'\"-]+){1,5}\s*(</(i|b|u)>){0,2})\s*(</span>)?\s*(</(i|b|u)>){0,2}\s*(</(br|p)>))?', re.UNICODE) - html = chapdetect2.sub(self.chapter_head, html) - # - # Unwrap lines using punctation if the median length of all lines is less than 200 - length = line_length('html', html, 0.4) - self.log("*** Median line length is " + str(length) + " ***") - unwrap = re.compile(r"(?<=.{%i}[a-z,;:\IA])\s*</(span|p|div)>\s*(</(p|span|div)>)?\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)</(span|p|div)>\s*){0,3}\s*<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*" % length, re.UNICODE) - if length < 200: - self.log("Unwrapping Lines") - html = unwrap.sub(' ', html) + chapdetect2 = re.compile(r'(?=</?(br|p))(<(/?br|p)[^>]*>)\s*(<[ibu]>){0,2}\s*(<span[^>]*>)?\s*(?P<chap>(<[ibu]>){0,2}\s*.?(([A-Z#-]+\s*){1,9})\s*(</[ibu]>){0,2})\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</(p|/?br)>)\s*(<(/?br|p)[^>]*>\s*(<[ibu]>){0,2}\s*(<span[^>]*>)?\s*(?P<title>(<[ibu]>){0,2}(\s*[\w\'\"-]+){1,5}\s*(</[ibu]>){0,2})\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</(br|p)>))?', re.UNICODE) + html = chapdetect2.sub(self.chapter_head, html) - # If still no sections after unwrapping lines break on lines with no punctuation + # Unwrap lines + # + self.log("Unwrapping Lines") + # Some OCR sourced files have line breaks in the html using a combination of span & p tags + # span are used for hard line breaks, p for new paragraphs. 
Determine which is used so + # that lines can be wrapped across page boundaries + paras_reg = re.compile('<p[^>]*>', re.IGNORECASE) + spans_reg = re.compile('<span[^>]*>', re.IGNORECASE) + paras = len(paras_reg.findall(html)) + spans = len(spans_reg.findall(html)) + if spans > 1: + if float(paras) / float(spans) < 0.75: + format = 'spanned_html' + else: + format = 'html' + else: + format = 'html' + + # Calculate Length + length = line_length(format, html, 0.4) + self.log("*** Median line length is " + str(length) + ", calculated with " + format + " format ***") + # + # Unwrap and/or delete soft-hyphens, hyphens + html = re.sub(u'­\s*(</span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*', '', html) + html = re.sub(u'(?<=[-–—])\s*(?=<)(</span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*(?=[[a-z\d])', '', html) + + # Unwrap lines using punctuation if the median length of all lines is less than 200 + unwrap = re.compile(r"(?<=.{%i}[a-z,;:\IA])\s*</(span|p|div)>\s*(</(p|span|div)>)?\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)</(span|p|div)>\s*){0,3}\s*<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*" % length, re.UNICODE) + html = unwrap.sub(' ', html) + + # If still no sections after unwrapping mark split points on lines with no punctuation if self.html_preprocess_sections < 10: - self.log(str(self.html_preprocess_sections) + " split points marked, matching based on punctuation") + self.log("Looking for more split points based on punctuation, currently have " + str(self.html_preprocess_sections)) #self.log(html) - chapdetect3 = re.compile(r'<(?P<styles>(p|div)[^>]*)>\s*(?P<section>(<span[^>]*>)?\s*(<(i|b|u)>){0,2}\s*(<span[^>]*>)?\s*(<(i|b|u)>){0,2}\s*(<span[^>]*>)?\s*.?([a-z#-*]+\s*){1,5}\s*\s*(</span>)?(</(i|b|u)>){0,2}\s*(</span>)?\s*(</(i|b|u)>){0,2}\s*(</span>)?\s*</(p|div)>)', re.IGNORECASE) + chapdetect3 = re.compile(r'<(?P<styles>(p|div)[^>]*)>\s*(?P<section>(<span[^>]*>)?\s*(<[ibu]>){0,2}\s*(<span[^>]*>)?\s*(<[ibu]>){0,2}\s*(<span[^>]*>)?\s*.?([a-z#-*]+\s*){1,5}\s*\s*(</span>)?(</[ibu]>){0,2}\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</span>)?\s*</(p|div)>)', re.IGNORECASE) html = chapdetect3.sub(self.chapter_break, html) # search for places where a first or second level heading is immediately followed by another # top level heading. 
demote the second heading to h3 to prevent splitting between chapter From 6cc332089a421e6100fa4937c5126309c483e132 Mon Sep 17 00:00:00 2001 From: Starson17 <starson17@gmail.com> Date: Sun, 12 Sep 2010 11:28:24 -0400 Subject: [PATCH 32/43] Change Merge and Safe Merge warnings re ISBN --- src/calibre/gui2/actions/edit_metadata.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/src/calibre/gui2/actions/edit_metadata.py b/src/calibre/gui2/actions/edit_metadata.py index f0232d9859..878ba77a43 100644 --- a/src/calibre/gui2/actions/edit_metadata.py +++ b/src/calibre/gui2/actions/edit_metadata.py @@ -209,8 +209,9 @@ class EditMetadataAction(InterfaceAction): dest_id, src_books, src_ids = self.books_to_merge(rows) if safe_merge: if not confirm('<p>'+_( - 'All book formats and metadata from the selected books ' - 'will be added to the <b>first selected book.</b><br><br> ' + 'Book formats and metadata from the selected books ' + 'will be added to the <b>first selected book.</b> ' + 'ISBN will <i>not</i> be merged.<br><br> ' 'The second and subsequently selected books will not ' 'be deleted or changed.<br><br>' 'Please confirm you want to proceed.') @@ -220,8 +221,9 @@ class EditMetadataAction(InterfaceAction): self.merge_metadata(dest_id, src_ids) else: if not confirm('<p>'+_( - 'All book formats and metadata from the selected books will be merged ' - 'into the <b>first selected book</b>.<br><br>' + 'Book formats and metadata from the selected books will be merged ' + 'into the <b>first selected book</b>. ' + 'ISBN will <i>not</i> be merged.<br><br>' 'After merger the second and ' 'subsequently selected books will be <b>deleted</b>. <br><br>' 'All book formats of the first selected book will be kept ' From 78874a9117941de749f3b09934be8588181dd4b7 Mon Sep 17 00:00:00 2001 From: Kovid Goyal <kovid@kovidgoyal.net> Date: Sun, 12 Sep 2010 09:32:16 -0600 Subject: [PATCH 33/43] Use the new sorting code in the content server as well. 
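Patches 28 and 33 both replace repeated cmp()-based sorts with a single sort over one composite key per record, generated by SortKeyGenerator. A rough sketch of the underlying idea follows, with invented names and none of calibre's special-casing for series, dates or custom columns:

    class SortValue(object):
        # Wraps one field's value so that a descending field reverses
        # its order inside an ordinary ascending tuple sort.
        def __init__(self, val, ascending):
            self.val = val
            self.ascending = ascending

        def __eq__(self, other):
            return self.val == other.val

        def __lt__(self, other):
            if self.val == other.val:
                return False
            # Ascending fields compare normally; descending ones invert.
            return (self.val < other.val) == self.ascending

    def multisort(records, fields, value_of):
        # fields: (name, ascending) pairs, most significant field first.
        # value_of(record, name) must return a comparable value.
        records.sort(key=lambda rec: tuple(
            SortValue(value_of(rec, name), asc) for name, asc in fields))

Building the key once per record and sorting once is both faster than one full sort pass per field and easy to share: the content server below reuses the same generator through the small CSSortKeyGenerator wrapper instead of its old ad hoc seriescmp logic.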
--- src/calibre/library/caches.py | 153 +------------------------- src/calibre/library/server/content.py | 38 +++---- 2 files changed, 18 insertions(+), 173 deletions(-) diff --git a/src/calibre/library/caches.py b/src/calibre/library/caches.py index dfd7086076..4f795ab733 100644 --- a/src/calibre/library/caches.py +++ b/src/calibre/library/caches.py @@ -6,7 +6,7 @@ __license__ = 'GPL v3' __copyright__ = '2010, Kovid Goyal <kovid@kovidgoyal.net>' __docformat__ = 'restructuredtext en' -import re, itertools, functools +import re, itertools from itertools import repeat from datetime import timedelta from threading import Thread, RLock @@ -584,39 +584,7 @@ class ResultCache(SearchQueryParser): # Sorting functions {{{ - def seriescmp(self, sidx, siidx, x, y, library_order=None): - try: - if library_order: - ans = cmp(title_sort(self._data[x][sidx].lower()), - title_sort(self._data[y][sidx].lower())) - else: - ans = cmp(self._data[x][sidx].lower(), - self._data[y][sidx].lower()) - except AttributeError: # Some entries may be None - ans = cmp(self._data[x][sidx], self._data[y][sidx]) - if ans != 0: return ans - return cmp(self._data[x][siidx], self._data[y][siidx]) - - def cmp(self, loc, x, y, asstr=True, subsort=False): - try: - ans = cmp(self._data[x][loc].lower(), self._data[y][loc].lower()) if \ - asstr else cmp(self._data[x][loc], self._data[y][loc]) - except AttributeError: # Some entries may be None - ans = cmp(self._data[x][loc], self._data[y][loc]) - except TypeError: ## raised when a datetime is None - x = self._data[x][loc] - if x is None: - x = UNDEFINED_DATE - y = self._data[y][loc] - if y is None: - y = UNDEFINED_DATE - return cmp(x, y) - if subsort and ans == 0: - idx = self.FIELD_MAP['sort'] - return cmp(self._data[x][idx].lower(), self._data[y][idx].lower()) - return ans - - def sanitize_field_name(self, field): + def sanitize_sort_field_name(self, field): field = field.lower().strip() if field not in self.field_metadata.iterkeys(): if field in ('author', 'tag', 'comment'): @@ -627,38 +595,10 @@ class ResultCache(SearchQueryParser): return field def sort(self, field, ascending, subsort=False): - field = self.sanitize_field_name(field) - as_string = field not in ('size', 'rating', 'timestamp') - - if self.first_sort: - subsort = True - self.first_sort = False - if self.field_metadata[field]['is_custom']: - if self.field_metadata[field]['datatype'] == 'series': - fcmp = functools.partial(self.seriescmp, - self.field_metadata[field]['rec_index'], - self.field_metadata.cc_series_index_column_for(field), - library_order=tweaks['title_series_sorting'] == 'library_order') - else: - as_string = self.field_metadata[field]['datatype'] in ('comments', 'text') - field = self.field_metadata[field]['colnum'] - fcmp = functools.partial(self.cmp, self.FIELD_MAP[field], - subsort=subsort, asstr=as_string) - elif field == 'series': - fcmp = functools.partial(self.seriescmp, self.FIELD_MAP['series'], - self.FIELD_MAP['series_index'], - library_order=tweaks['title_series_sorting'] == 'library_order') - else: - fcmp = functools.partial(self.cmp, self.field_metadata[field]['rec_index'], - subsort=subsort, asstr=as_string) - self._map.sort(cmp=fcmp, reverse=not ascending) - tmap = list(itertools.repeat(False, len(self._data))) - for x in self._map_filtered: - tmap[x] = True - self._map_filtered = [x for x in self._map if tmap[x]] + self.multisort([(field, ascending)]) def multisort(self, fields=[], subsort=False): - fields = [(self.sanitize_field_name(x), bool(y)) for x, y in fields] + fields = 
[(self.sanitize_sort_field_name(x), bool(y)) for x, y in fields] keys = self.field_metadata.field_keys() fields = [x for x in fields if x[0] in keys] if subsort and 'sort' not in [x[0] for x in fields]: @@ -671,6 +611,7 @@ class ResultCache(SearchQueryParser): self._map.sort(key=keyg, reverse=not fields[0][1]) else: self._map.sort(key=keyg) + tmap = list(itertools.repeat(False, len(self._data))) for x in self._map_filtered: tmap[x] = True @@ -733,87 +674,3 @@ class SortKeyGenerator(object): # }}} -if __name__ == '__main__': - # Testing.timing for new multi-sort {{{ - import time - - from calibre.library import db - db = db() - - db.refresh() - - fields = db.field_metadata.field_keys() - - print fields - - - def do_single_sort(meth, field, order): - if meth == 'old': - db.data.sort(field, order) - else: - db.data.multisort([(field, order)]) - - def test_single_sort(field): - for meth in ('old', 'new'): - ttime = 0 - NUM = 10 - asc = desc = None - for i in range(NUM): - db.data.sort('id', False) - st = time.time() - do_single_sort(meth, field, True) - asc = db.data._map - do_single_sort(meth, field, False) - desc = db.data._map - ttime += time.time() - st - yield (ttime/NUM, asc, desc) - - - print 'Running single sort differentials' - for field in fields: - if field in ('search', 'id', 'news', 'flags'): continue - print '\t', field, db.field_metadata[field]['datatype'] - old, new = test_single_sort(field) - if old[1] != new[1] or old[2] != new[2]: - print '\t\t', 'Sort failure!' - raise SystemExit(1) - print '\t\t', 'Old:', old[0], 'New:', new[0], 'Ratio: %.2f'%(new[0]/old[0]) - - def do_multi_sort(meth, ms): - if meth == 'new': - db.data.multisort(ms) - else: - for s in reversed(ms): - db.data.sort(*s) - - def test_multi_sort(ms): - for meth in ('old', 'new'): - ttime = 0 - NUM = 10 - for i in range(NUM): - db.data.sort('id', False) - st = time.time() - do_multi_sort(meth, ms) - ttime += time.time() - st - yield (ttime/NUM, db.data._map) - - print 'Running multi-sort differentials' - - for ms in [ - [('timestamp', False), ('author', True), ('title', False)], - [('size', True), ('tags', True), ('author', False)], - [('series', False), ('title', True)], - [('size', True), ('tags', True), ('author', False), ('pubdate', - True), ('tags', False), ('formats', False), ('uuid', True)], - - ]: - print '\t', ms - db.data.sort('id', False) - old, new = test_multi_sort(ms) - if old[1] != new[1]: - print '\t\t', 'Sort failure!' 
- raise SystemExit() - print '\t\t', 'Old:', old[0], 'New:', new[0], 'Ratio: %.2f'%(new[0]/old[0]) - - # }}} - diff --git a/src/calibre/library/server/content.py b/src/calibre/library/server/content.py index 6784abd8f4..ecb467b4c2 100644 --- a/src/calibre/library/server/content.py +++ b/src/calibre/library/server/content.py @@ -5,7 +5,7 @@ __license__ = 'GPL v3' __copyright__ = '2010, Kovid Goyal <kovid@kovidgoyal.net>' __docformat__ = 'restructuredtext en' -import re, os, cStringIO, operator +import re, os, cStringIO import cherrypy try: @@ -16,7 +16,15 @@ except ImportError: from calibre import fit_image, guess_type from calibre.utils.date import fromtimestamp -from calibre.ebooks.metadata import title_sort +from calibre.library.caches import SortKeyGenerator + +class CSSortKeyGenerator(SortKeyGenerator): + + def __init__(self, fields, fm): + SortKeyGenerator.__init__(self, fields, fm, None) + + def __call__(self, record): + return self.itervals(record).next() class ContentServer(object): @@ -47,32 +55,12 @@ class ContentServer(object): def sort(self, items, field, order): - field = field.lower().strip() - if field == 'author': - field = 'authors' - if field == 'date': - field = 'timestamp' + field = self.db.data.sanitize_sort_field_name(field) if field not in ('title', 'authors', 'rating', 'timestamp', 'tags', 'size', 'series'): raise cherrypy.HTTPError(400, '%s is not a valid sort field'%field) - cmpf = cmp if field in ('rating', 'size', 'timestamp') else \ - lambda x, y: cmp(x.lower() if x else '', y.lower() if y else '') - if field == 'series': - items.sort(cmp=self.seriescmp, reverse=not order) - else: - lookup = 'sort' if field == 'title' else field - lookup = 'author_sort' if field == 'authors' else field - field = self.db.FIELD_MAP[lookup] - getter = operator.itemgetter(field) - items.sort(cmp=lambda x, y: cmpf(getter(x), getter(y)), reverse=not order) + keyg = CSSortKeyGenerator([(field, order)], self.db.field_metadata) + items.sort(key=keyg, reverse=not order) - def seriescmp(self, x, y): - si = self.db.FIELD_MAP['series'] - try: - ans = cmp(title_sort(x[si].lower()), title_sort(y[si].lower())) - except AttributeError: # Some entries may be None - ans = cmp(x[si], y[si]) - if ans != 0: return ans - return cmp(x[self.db.FIELD_MAP['series_index']], y[self.db.FIELD_MAP['series_index']]) # }}} From 80c976e0f24f05a5ee7a9bfce50bf7745215e339 Mon Sep 17 00:00:00 2001 From: Kovid Goyal <kovid@kovidgoyal.net> Date: Sun, 12 Sep 2010 11:11:00 -0600 Subject: [PATCH 34/43] Fix #6794 (Updated recipes for Infobae and NSPM) --- resources/recipes/infobae.recipe | 82 ++++++++------------------------ resources/recipes/nspm.recipe | 11 ++++- 2 files changed, 30 insertions(+), 63 deletions(-) diff --git a/resources/recipes/infobae.recipe b/resources/recipes/infobae.recipe index cda9bf83d2..b7f9cd3c6c 100644 --- a/resources/recipes/infobae.recipe +++ b/resources/recipes/infobae.recipe @@ -1,12 +1,8 @@ -#!/usr/bin/env python - __license__ = 'GPL v3' -__copyright__ = '2008-2009, Darko Miletic <darko.miletic at gmail.com>' +__copyright__ = '2008-2010, Darko Miletic <darko.miletic at gmail.com>' ''' infobae.com ''' -import re -import urllib, urlparse from calibre.web.feeds.news import BasicNewsRecipe @@ -20,35 +16,24 @@ class Infobae(BasicNewsRecipe): max_articles_per_feed = 100 no_stylesheets = True use_embedded_content = False - language = 'es' - lang = 'es-AR' - + language = 'es' encoding = 'cp1252' - cover_url = 'http://www.infobae.com/imgs/header/header.gif' + masthead_url = 
'http://www.infobae.com/imgs/header/header.gif' remove_javascript = True - preprocess_regexps = [(re.compile( - r'<meta name="Description" content="[^"]+">'), lambda m:'')] - - - html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"\nlinearize_tables=True' - - extra_css = ''' - .col-center{font-family:Arial,Helvetica,sans-serif;} - h1{font-family:Arial,Helvetica,sans-serif; color:#0D4261;} - .fuenteIntNota{font-family:Arial,Helvetica,sans-serif; color:#1D1D1D; font-size:x-small;} - ''' - - keep_only_tags = [dict(name='div', attrs={'class':['content']})] - - - remove_tags = [ - dict(name='div', attrs={'class':['options','col-right','controles', 'bannerLibre','tiulo-masleidas','masleidas-h']}), - dict(name='a', attrs={'name' : 'comentario',}), - dict(name='iframe'), - dict(name='img', alt = "Ver galerias de imagenes"), - - ] - + remove_empty_feeds = True + extra_css = ''' + body{font-family:Arial,Helvetica,sans-serif;} + .popUpTitulo{color:#0D4261; font-size: xx-large} + ''' + + conversion_options = { + 'comment' : description + , 'tags' : category + , 'publisher' : publisher + , 'language' : language + , 'linearize_tables' : True + } + feeds = [ (u'Noticias' , u'http://www.infobae.com/adjuntos/html/RSS/hoy.xml' ) @@ -57,39 +42,14 @@ class Infobae(BasicNewsRecipe): ,(u'Deportes' , u'http://www.infobae.com/adjuntos/html/RSS/deportes.xml' ) ] -# def print_version(self, url): -# main, sep, article_part = url.partition('contenidos/') -# article_id, rsep, rrest = article_part.partition('-') -# return u'http://www.infobae.com/notas/nota_imprimir.php?Idx=' + article_id - - def get_article_url(self, article): - ans = article.get('link').encode('utf-8') - parts = list(urlparse.urlparse(ans)) - parts[2] = urllib.quote(parts[2]) - ans = urlparse.urlunparse(parts) - return ans.decode('utf-8') - - - def preprocess_html(self, soup): - - for tag in soup.head.findAll('strong'): - tag.extract() - for tag in soup.findAll('meta'): - del tag['content'] - tag.extract() - - mtag = '<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">\n<meta http-equiv="Content-Language" content="es-AR"/>\n' - soup.head.insert(0,mtag) - for item in soup.findAll(style=True): - del item['style'] - - return soup + def print_version(self, url): + article_part = url.rpartition('/')[2] + article_id= article_part.partition('-')[0] + return 'http://www.infobae.com/notas/nota_imprimir.php?Idx=' + article_id def postprocess_html(self, soup, first): - for tag in soup.findAll(name='strong'): tag.name = 'b' - return soup diff --git a/resources/recipes/nspm.recipe b/resources/recipes/nspm.recipe index 13ff42b277..29f2cfc5e3 100644 --- a/resources/recipes/nspm.recipe +++ b/resources/recipes/nspm.recipe @@ -6,6 +6,7 @@ nspm.rs import re from calibre.web.feeds.news import BasicNewsRecipe +from calibre.ebooks.BeautifulSoup import Tag, NavigableString class Nspm(BasicNewsRecipe): title = 'Nova srpska politicka misao' @@ -21,6 +22,7 @@ class Nspm(BasicNewsRecipe): encoding = 'utf-8' language = 'sr' delay = 2 + remove_empty_feeds = True publication_type = 'magazine' masthead_url = 'http://www.nspm.rs/templates/jsn_epic_pro/images/logol.jpg' extra_css = """ @font-face {font-family: "serif1";src:url(res:///opt/sony/ebook/FONT/tt0011m_.ttf)} @@ -45,8 +47,9 @@ class Nspm(BasicNewsRecipe): dict(name=['link','object','embed','script','meta','base','iframe']) ,dict(attrs={'class':'buttonheading'}) ] - remove_tags_after = dict(attrs={'class':'article_separator'}) - remove_attributes = 
['width','height'] + remove_tags_before = dict(attrs={'class':'contentheading'}) + remove_tags_after = dict(attrs={'class':'article_separator'}) + remove_attributes = ['width','height'] def get_browser(self): br = BasicNewsRecipe.get_browser() @@ -67,4 +70,8 @@ class Nspm(BasicNewsRecipe): def preprocess_html(self, soup): for item in soup.body.findAll(style=True): del item['style'] + for item in soup.body.findAll('h1'): + nh = NavigableString(item.a.string) + item.a.extract() + item.insert(0,nh) return self.adeify_images(soup) From 548417ea6b6157faf1688b3b082f3eac5476636f Mon Sep 17 00:00:00 2001 From: ldolse <ldolse@yahoo.com> Date: Mon, 13 Sep 2010 09:18:45 +1000 Subject: [PATCH 35/43] comments and minor tweak --- src/calibre/ebooks/conversion/utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py index abfa43e7ed..ecf030b27d 100644 --- a/src/calibre/ebooks/conversion/utils.py +++ b/src/calibre/ebooks/conversion/utils.py @@ -111,7 +111,7 @@ class PreProcessor(object): html = add_markup.sub('</p>\n<p>', html) # detect chapters/sections to match xpath or splitting logic - heading = re.compile('<h(1|2)[^>]*>', re.IGNORECASE) + heading = re.compile('<h[1-3][^>]*>', re.IGNORECASE) self.html_preprocess_sections = len(heading.findall(html)) self.log("found " + str(self.html_preprocess_sections) + " pre-existing headings") # @@ -134,7 +134,7 @@ class PreProcessor(object): self.log("Unwrapping Lines") # Some OCR sourced files have line breaks in the html using a combination of span & p tags # span are used for hard line breaks, p for new paragraphs. Determine which is used so - # that lines can be wrapped across page boundaries + # that lines can be un-wrapped across page boundaries paras_reg = re.compile('<p[^>]*>', re.IGNORECASE) spans_reg = re.compile('<span[^>]*>', re.IGNORECASE) paras = len(paras_reg.findall(html)) From de6aadee76d4dafe9b84133dc3af43ddef22fd0a Mon Sep 17 00:00:00 2001 From: Kovid Goyal <kovid@kovidgoyal.net> Date: Mon, 13 Sep 2010 10:15:35 -0600 Subject: [PATCH 36/43] News download: Fix bug that could break some downloads in non ASCII locales --- resources/recipes/xkcd.recipe | 6 +++--- src/calibre/web/feeds/__init__.py | 4 +++- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/resources/recipes/xkcd.recipe b/resources/recipes/xkcd.recipe index 312027004e..ad0d420deb 100644 --- a/resources/recipes/xkcd.recipe +++ b/resources/recipes/xkcd.recipe @@ -24,18 +24,18 @@ class XkcdCom(BasicNewsRecipe): (re.compile(r'(<img.*title=")([^"]+)(".*>)'), lambda m: '%s%s<p>%s</p>' % (m.group(1), m.group(3), m.group(2))) ] - + def parse_index(self): INDEX = 'http://xkcd.com/archive/' - soup = self.index_to_soup(INDEX) + soup = self.index_to_soup(INDEX) articles = [] for item in soup.findAll('a', title=True): articles.append({ 'date': item['title'], 'timestamp': time.mktime(time.strptime(item['title'], '%Y-%m-%d'))+1, 'url': 'http://xkcd.com' + item['href'], - 'title': self.tag_to_string(item).encode('UTF-8'), + 'title': self.tag_to_string(item), 'description': '', 'content': '', }) diff --git a/src/calibre/web/feeds/__init__.py b/src/calibre/web/feeds/__init__.py index a70cf8b664..8aef350498 100644 --- a/src/calibre/web/feeds/__init__.py +++ b/src/calibre/web/feeds/__init__.py @@ -165,7 +165,9 @@ class Feed(object): if delta.days*24*3600 + delta.seconds <= 24*3600*self.oldest_article: self.articles.append(article) else: - self.logger.debug('Skipping article %s (%s) from feed 
%s as it is too old.'%(title, article.localtime.strftime('%a, %d %b, %Y %H:%M'), self.title)) + t = strftime(u'%a, %d %b, %Y %H:%M', article.localtime.timetuple()) + self.logger.debug('Skipping article %s (%s) from feed %s as it is too old.'% + (title, t, self.title)) d = item.get('date', '') article.formatted_date = d From b73e1b3da50810e151d10a5d62251754a077e605 Mon Sep 17 00:00:00 2001 From: ldolse <ldolse@yahoo.com> Date: Tue, 14 Sep 2010 02:56:56 +1000 Subject: [PATCH 37/43] tweaked preprocess for $, added rtf to new preprocess logic, changed last pdf default --- src/calibre/ebooks/conversion/preprocess.py | 2 +- src/calibre/ebooks/rtf/input.py | 13 +++---------- src/calibre/gui2/convert/pdf_input.ui | 2 +- 3 files changed, 5 insertions(+), 12 deletions(-) diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py index f6277956c8..9464be1210 100644 --- a/src/calibre/ebooks/conversion/preprocess.py +++ b/src/calibre/ebooks/conversion/preprocess.py @@ -340,7 +340,7 @@ class HTMLPreProcessor(object): # print "The pdf line length returned is " + str(length) end_rules.append( # Un wrap using punctuation - (re.compile(r'(?<=.{%i}[a-z,;:)\-IA])\s*(?P<ital></(i|b|u)>)?\s*(<p.*?>\s*)+\s*(?=(<(i|b|u)>)?\s*[\w\d(])' % length, re.UNICODE), wrap_lines), + (re.compile(r'(?<=.{%i}[a-z,;:)\-IA])\s*(?P<ital></(i|b|u)>)?\s*(<p.*?>\s*)+\s*(?=(<(i|b|u)>)?\s*[\w\d$(])' % length, re.UNICODE), wrap_lines), ) for rule in self.PREPROCESS + start_rules: diff --git a/src/calibre/ebooks/rtf/input.py b/src/calibre/ebooks/rtf/input.py index 216ccf591d..d229b80c16 100644 --- a/src/calibre/ebooks/rtf/input.py +++ b/src/calibre/ebooks/rtf/input.py @@ -8,6 +8,7 @@ from lxml import etree from calibre.customize.conversion import InputFormatPlugin from calibre.ebooks.conversion.preprocess import line_length +from calibre.ebooks.conversion.utils import PreProcessor class InlineClass(etree.XSLTExtension): @@ -229,16 +230,8 @@ class RTFInput(InputFormatPlugin): res = transform.tostring(result) res = res[:100].replace('xmlns:html', 'xmlns') + res[100:] if self.options.preprocess_html: - self.log("********* Preprocessing HTML *********") - # Detect Chapters to match the xpath in the GUI - chapdetect = re.compile(r'<p[^>]*>\s*<span[^>]*>\s*(?P<chap>(<(i|b)>(<(i|b)>)?)?(.?Chapter|Epilogue|Prologue|Book|Part|Dedication)\s*([\d\w-]+(\s\w+)?)?(</(i|b)>(<(/i|b)>)?)?)\s*</span>\s*</p>', re.IGNORECASE) - res = chapdetect.sub('<h2>'+'\g<chap>'+'</h2>\n', res) - # Unwrap lines using punctation if the median length of all lines is less than 150 - length = line_length('html', res, 0.4) - self.log("*** Median length is " + str(length) + " ***") - unwrap = re.compile(r"(?<=.{%i}[a-z,;:\IA])\s*</span>\s*</p>\s*(?P<up2threeblanks><p[^>]*>\s*(<span[^>]*>\s*</span>\s*)</p>\s*){0,3}\s*<p[^>]*>\s*<span[^>]*>\s*" % length, re.UNICODE) - if length < 150: - res = unwrap.sub(' ', res) + preprocessor = PreProcessor(res) + res = preprocessor(res) f.write(res) self.write_inline_css(inline_class) stream.seek(0) diff --git a/src/calibre/gui2/convert/pdf_input.ui b/src/calibre/gui2/convert/pdf_input.ui index 626c68ea63..b2ee421922 100644 --- a/src/calibre/gui2/convert/pdf_input.ui +++ b/src/calibre/gui2/convert/pdf_input.ui @@ -46,7 +46,7 @@ <double>0.010000000000000</double> </property> <property name="value"> - <double>0.500000000000000</double> + <double>0.450000000000000</double> </property> </widget> </item> From 8b73bb52e8d551538d0c0e55e7b91b6b16f69977 Mon Sep 17 00:00:00 2001 From: Kovid Goyal 
<kovid@kovidgoyal.net> Date: Mon, 13 Sep 2010 16:42:22 -0600 Subject: [PATCH 38/43] Fix #6802 (Sovos E Reader Not Recognised / Floppy Drive Activation) --- src/calibre/customize/builtins.py | 3 ++- src/calibre/devices/teclast/driver.py | 11 +++++++++++ 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/src/calibre/customize/builtins.py b/src/calibre/customize/builtins.py index 4c87236e71..68df832048 100644 --- a/src/calibre/customize/builtins.py +++ b/src/calibre/customize/builtins.py @@ -459,7 +459,7 @@ from calibre.devices.iriver.driver import IRIVER_STORY from calibre.devices.binatone.driver import README from calibre.devices.hanvon.driver import N516, EB511, ALEX, AZBOOKA, THEBOOK from calibre.devices.edge.driver import EDGE -from calibre.devices.teclast.driver import TECLAST_K3, NEWSMY, IPAPYRUS +from calibre.devices.teclast.driver import TECLAST_K3, NEWSMY, IPAPYRUS, SOVOS from calibre.devices.sne.driver import SNE from calibre.devices.misc import PALMPRE, AVANT, SWEEX, PDNOVEL, KOGAN, GEMEI from calibre.devices.folder_device.driver import FOLDER_DEVICE_FOR_CONFIG @@ -557,6 +557,7 @@ plugins += [ TECLAST_K3, NEWSMY, IPAPYRUS, + SOVOS, EDGE, SNE, ALEX, diff --git a/src/calibre/devices/teclast/driver.py b/src/calibre/devices/teclast/driver.py index 0c60a367cf..2055ff9306 100644 --- a/src/calibre/devices/teclast/driver.py +++ b/src/calibre/devices/teclast/driver.py @@ -52,3 +52,14 @@ class IPAPYRUS(TECLAST_K3): VENDOR_NAME = 'E_READER' WINDOWS_MAIN_MEM = '' +class SOVOS(TECLAST_K3): + + name = 'Sovos device interface' + gui_name = 'Sovos' + description = _('Communicate with the Sovos reader.') + + FORMATS = ['epub', 'fb2', 'pdf', 'txt'] + + VENDOR_NAME = 'RK28XX' + WINDOWS_MAIN_MEM = WINDOWS_CARD_A_MEM = 'USB-MSC' + From fb053fe3f37d531a170bb2a1d67ccf70ea030351 Mon Sep 17 00:00:00 2001 From: Kovid Goyal <kovid@kovidgoyal.net> Date: Mon, 13 Sep 2010 16:58:09 -0600 Subject: [PATCH 39/43] Fix #6773 (Slightly broken CHM file) --- src/calibre/ebooks/chm/reader.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/calibre/ebooks/chm/reader.py b/src/calibre/ebooks/chm/reader.py index 67a2d36607..831c16bf6a 100644 --- a/src/calibre/ebooks/chm/reader.py +++ b/src/calibre/ebooks/chm/reader.py @@ -132,7 +132,11 @@ class CHMReader(CHMFile): for path in self.Contents(): lpath = os.path.join(output_dir, path) self._ensure_dir(lpath) - data = self.GetFile(path) + try: + data = self.GetFile(path) + except: + self.log.exception('Failed to extract %s from CHM, ignoring'%path) + continue if lpath.find(';') != -1: # fix file names with ";<junk>" at the end, see _reformat() lpath = lpath.split(';')[0] From ba5de1c92d797abc1f82782c7e15bd61dfa387c5 Mon Sep 17 00:00:00 2001 From: Kovid Goyal <kovid@kovidgoyal.net> Date: Mon, 13 Sep 2010 18:18:32 -0600 Subject: [PATCH 40/43] Conversion pipeline: When setting margins on <body> explicitly set padding to 0 to override any existing padding in the input document --- src/calibre/ebooks/oeb/transforms/flatcss.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/calibre/ebooks/oeb/transforms/flatcss.py b/src/calibre/ebooks/oeb/transforms/flatcss.py index f48bdb9934..ffdc641d1e 100644 --- a/src/calibre/ebooks/oeb/transforms/flatcss.py +++ b/src/calibre/ebooks/oeb/transforms/flatcss.py @@ -138,6 +138,7 @@ class CSSFlattener(object): float(self.context.margin_left)) bs.append('margin-right : %fpt'%\ float(self.context.margin_right)) + bs.extend(['padding-left: 0pt', 'padding-right: 0pt']) if self.context.change_justification != 
'original': bs.append('text-align: '+ self.context.change_justification) body.set('style', '; '.join(bs)) From c5063b8633506f3b661d3e3dcc84d7ec68e74345 Mon Sep 17 00:00:00 2001 From: Kovid Goyal <kovid@kovidgoyal.net> Date: Mon, 13 Sep 2010 18:26:51 -0600 Subject: [PATCH 41/43] Fix #6804 (Timeout error when browsing content server via browser) --- resources/content_server/gui.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/resources/content_server/gui.js b/resources/content_server/gui.js index 631fb8b617..d0fb49cc8e 100644 --- a/resources/content_server/gui.js +++ b/resources/content_server/gui.js @@ -26,7 +26,7 @@ var current_library_request = null; ////////////////////////////// GET BOOK LIST ////////////////////////////// -var LIBRARY_FETCH_TIMEOUT = 30000; // milliseconds +var LIBRARY_FETCH_TIMEOUT = 5*60000; // milliseconds function create_table_headers() { var thead = $('table#book_list thead tr'); From c5415bbe8012179b405f2c3ca3b5258e83a863b3 Mon Sep 17 00:00:00 2001 From: Kovid Goyal <kovid@kovidgoyal.net> Date: Mon, 13 Sep 2010 19:11:38 -0600 Subject: [PATCH 42/43] Fix #6806 (--start-in-tray switch displays hidden windows in metacity, xfwm4 and compiz) --- src/calibre/gui2/cover_flow.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/calibre/gui2/cover_flow.py b/src/calibre/gui2/cover_flow.py index 88bbae6c41..cb951b09be 100644 --- a/src/calibre/gui2/cover_flow.py +++ b/src/calibre/gui2/cover_flow.py @@ -155,6 +155,7 @@ class CoverFlowMixin(object): self.cb_splitter.action_toggle.triggered.connect(self.toggle_cover_browser) if CoverFlow is not None: self.cover_flow.stop.connect(self.hide_cover_browser) + self.cover_flow.setVisible(False) else: self.cb_splitter.insertWidget(self.cb_splitter.side_index, self.cover_flow) if CoverFlow is not None: From 6a3609f031bb9400630cd6418b278903a4883c8a Mon Sep 17 00:00:00 2001 From: Kovid Goyal <kovid@kovidgoyal.net> Date: Mon, 13 Sep 2010 19:58:22 -0600 Subject: [PATCH 43/43] Implement #6808 (Feature request: ability to convert all single/double quotes to "smart quotes") --- src/calibre/ebooks/conversion/cli.py | 2 +- src/calibre/ebooks/conversion/plumber.py | 8 + src/calibre/ebooks/conversion/preprocess.py | 23 +- src/calibre/gui2/convert/look_and_feel.py | 2 +- src/calibre/gui2/convert/look_and_feel.ui | 9 +- src/calibre/utils/smartypants.py | 899 ++++++++++++++++++++ 6 files changed, 933 insertions(+), 10 deletions(-) create mode 100755 src/calibre/utils/smartypants.py diff --git a/src/calibre/ebooks/conversion/cli.py b/src/calibre/ebooks/conversion/cli.py index 7439718cf6..2ef633d0bb 100644 --- a/src/calibre/ebooks/conversion/cli.py +++ b/src/calibre/ebooks/conversion/cli.py @@ -122,7 +122,7 @@ def add_pipeline_options(parser, plumber): 'font_size_mapping', 'line_height', 'linearize_tables', - 'extra_css', + 'extra_css', 'smarten_punctuation', 'margin_top', 'margin_left', 'margin_right', 'margin_bottom', 'change_justification', 'insert_blank_line', 'remove_paragraph_spacing','remove_paragraph_spacing_indent_size', diff --git a/src/calibre/ebooks/conversion/plumber.py b/src/calibre/ebooks/conversion/plumber.py index 24b35f804f..16282dd28d 100644 --- a/src/calibre/ebooks/conversion/plumber.py +++ b/src/calibre/ebooks/conversion/plumber.py @@ -362,6 +362,14 @@ OptionRecommendation(name='preprocess_html', ) ), +OptionRecommendation(name='smarten_punctuation', + recommended_value=False, level=OptionRecommendation.LOW, + help=_('Convert plain quotes, dashes and ellipsis to their ' + 'typographically correct 
equivalents. For details, see ' + 'http://daringfireball.net/projects/smartypants' + ) + ), + OptionRecommendation(name='remove_header', recommended_value=False, level=OptionRecommendation.LOW, help=_('Use a regular expression to try and remove the header.' diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py index 7742a20a21..4538af96c4 100644 --- a/src/calibre/ebooks/conversion/preprocess.py +++ b/src/calibre/ebooks/conversion/preprocess.py @@ -221,7 +221,7 @@ class HTMLPreProcessor(object): (re.compile(u'˛\s*(<br.*?>)*\s*A', re.UNICODE), lambda match: u'Ą'), (re.compile(u'˛\s*(<br.*?>)*\s*e', re.UNICODE), lambda match: u'ę'), (re.compile(u'˛\s*(<br.*?>)*\s*E', re.UNICODE), lambda match: u'Ę'), - + # ˙ (re.compile(u'˙\s*(<br.*?>)*\s*z', re.UNICODE), lambda match: u'ż'), (re.compile(u'˙\s*(<br.*?>)*\s*Z', re.UNICODE), lambda match: u'Ż'), @@ -244,14 +244,14 @@ class HTMLPreProcessor(object): (re.compile(r'<br>\s*(?P<chap>(<[ibu]>){0,2}\s*.?(Introduction|Chapter|Epilogue|Prologue|Book|Part|Dedication|Volume|Preface|Acknowledgments)\s*([\d\w-]+\s*){0,3}\s*(</[ibu]>){0,2})\s*(<br>\s*){1,3}\s*(?P<title>(<[ibu]>){0,2}(\s*\w+){1,4}\s*(</[ibu]>){0,2}\s*<br>)?', re.IGNORECASE), chap_head), # Cover the case where every letter in a chapter title is separated by a space (re.compile(r'<br>\s*(?P<chap>([A-Z]\s+){4,}\s*([\d\w-]+\s*){0,3}\s*)\s*(<br>\s*){1,3}\s*(?P<title>(<[ibu]>){0,2}(\s*\w+){1,4}\s*(</[ibu]>){0,2}\s*(<br>))?'), chap_head), - + # Have paragraphs show better (re.compile(r'<br.*?>'), lambda match : '<p>'), # Clean up spaces (re.compile(u'(?<=[\.,;\?!”"\'])[\s^ ]*(?=<)'), lambda match: ' '), # Add space before and after italics (re.compile(u'(?<!“)<i>'), lambda match: ' <i>'), - (re.compile(r'</i>(?=\w)'), lambda match: '</i> '), + (re.compile(r'</i>(?=\w)'), lambda match: '</i> '), ] # Fix Book Designer markup @@ -328,7 +328,7 @@ class HTMLPreProcessor(object): import traceback print 'Failed to parse remove_footer regexp' traceback.print_exc() - + # unwrap hyphenation - moved here so it's executed after header/footer removal if is_pdftohtml: # unwrap visible dashes and hyphens - don't delete they are often hyphens for @@ -338,13 +338,13 @@ class HTMLPreProcessor(object): end_rules.append((re.compile(u'[­](\s*<p>)+\s*(?=[[a-z\d])'), lambda match: '')) # unwrap/delete soft hyphens with formatting end_rules.append((re.compile(u'[­]\s*(</(i|u|b)>)+(\s*<p>)+\s*(<(i|u|b)>)+\s*(?=[[a-z\d])'), lambda match: '')) - - # Make the more aggressive chapter marking regex optional with the preprocess option to + + # Make the more aggressive chapter marking regex optional with the preprocess option to # reduce false positives and move after header/footer removal if getattr(self.extra_opts, 'preprocess_html', None): if is_pdftohtml: end_rules.append((re.compile(r'<p>\s*(?P<chap>(<[ibu]>){0,2}\s*([A-Z \'"!]{3,})\s*([\dA-Z:]+\s){0,4}\s*(</[ibu]>){0,2})\s*<p>\s*(?P<title>(<[ibu]>){0,2}(\s*\w+){1,4}\s*(</[ibu]>){0,2}\s*<p>)?'), chap_head),) - + if getattr(self.extra_opts, 'unwrap_factor', 0.0) > 0.01: length = line_length('pdf', html, getattr(self.extra_opts, 'unwrap_factor')) if length: @@ -401,5 +401,14 @@ class HTMLPreProcessor(object): if self.plugin_preprocess: html = self.input_plugin_preprocess(html) + if getattr(self.extra_opts, 'smarten_punctuation', False): + html = self.smarten_punctuation(html) + return html + def smarten_punctuation(self, html): + from calibre.utils.smartypants import smartyPants + from calibre.ebooks.chardet import 
substitute_entites + html = smartyPants(html) + return substitute_entites(html) + diff --git a/src/calibre/gui2/convert/look_and_feel.py b/src/calibre/gui2/convert/look_and_feel.py index b0403bf1dd..ec3f0b944d 100644 --- a/src/calibre/gui2/convert/look_and_feel.py +++ b/src/calibre/gui2/convert/look_and_feel.py @@ -22,7 +22,7 @@ class LookAndFeelWidget(Widget, Ui_Form): Widget.__init__(self, parent, ['change_justification', 'extra_css', 'base_font_size', 'font_size_mapping', 'line_height', - 'linearize_tables', + 'linearize_tables', 'smarten_punctuation', 'disable_font_rescaling', 'insert_blank_line', 'remove_paragraph_spacing', 'remove_paragraph_spacing_indent_size','input_encoding', 'asciiize', 'keep_ligatures'] diff --git a/src/calibre/gui2/convert/look_and_feel.ui b/src/calibre/gui2/convert/look_and_feel.ui index de48e7caf9..c683300854 100644 --- a/src/calibre/gui2/convert/look_and_feel.ui +++ b/src/calibre/gui2/convert/look_and_feel.ui @@ -178,7 +178,7 @@ </property> </widget> </item> - <item row="9" column="0" colspan="4"> + <item row="10" column="0" colspan="4"> <widget class="QGroupBox" name="groupBox"> <property name="title"> <string>Extra &CSS</string> @@ -214,6 +214,13 @@ </property> </widget> </item> + <item row="9" column="0"> + <widget class="QCheckBox" name="opt_smarten_punctuation"> + <property name="text"> + <string>Smarten &punctuation</string> + </property> + </widget> + </item> </layout> </widget> <resources> diff --git a/src/calibre/utils/smartypants.py b/src/calibre/utils/smartypants.py new file mode 100755 index 0000000000..44aac4de8c --- /dev/null +++ b/src/calibre/utils/smartypants.py @@ -0,0 +1,899 @@ +#!/usr/bin/python + +r""" +============== +smartypants.py +============== + +---------------------------- +SmartyPants ported to Python +---------------------------- + +Ported by `Chad Miller`_ +Copyright (c) 2004, 2007 Chad Miller + +original `SmartyPants`_ by `John Gruber`_ +Copyright (c) 2003 John Gruber + + +Synopsis +======== + +A smart-quotes plugin for Pyblosxom_. + +The priginal "SmartyPants" is a free web publishing plug-in for Movable Type, +Blosxom, and BBEdit that easily translates plain ASCII punctuation characters +into "smart" typographic punctuation HTML entities. + +This software, *smartypants.py*, endeavours to be a functional port of +SmartyPants to Python, for use with Pyblosxom_. + + +Description +=========== + +SmartyPants can perform the following transformations: + +- Straight quotes ( " and ' ) into "curly" quote HTML entities +- Backticks-style quotes (\`\`like this'') into "curly" quote HTML entities +- Dashes (``--`` and ``---``) into en- and em-dash entities +- Three consecutive dots (``...`` or ``. . .``) into an ellipsis entity + +This means you can write, edit, and save your posts using plain old +ASCII straight quotes, plain dashes, and plain dots, but your published +posts (and final HTML output) will appear with smart quotes, em-dashes, +and proper ellipses. + +SmartyPants does not modify characters within ``<pre>``, ``<code>``, ``<kbd>``, +``<math>`` or ``<script>`` tag blocks. Typically, these tags are used to +display text where smart quotes and other "smart punctuation" would not be +appropriate, such as source code or example markup. + + +Backslash Escapes +================= + +If you need to use literal straight quotes (or plain hyphens and +periods), SmartyPants accepts the following backslash escape sequences +to force non-smart punctuation. 
It does so by transforming the escape
+sequence into a decimal-encoded HTML entity:
+
+======  ======  =========
+Escape  Value   Character
+======  ======  =========
+\\\\    &#92;   \\
+\\"     &#34;   "
+\\'     &#39;   '
+\\.     &#46;   .
+\\-     &#45;   \-
+\\`     &#96;   \`
+======  ======  =========
+
+This is useful, for example, when you want to use straight quotes as
+foot and inch marks: 6'2" tall; a 17" iMac.
+
+Options
+=======
+
+For Pyblosxom users, the ``smartypants_attributes`` attribute is where you
+specify configuration options.
+
+Numeric values are the easiest way to configure SmartyPants' behavior:
+
+"0"
+    Suppress all transformations. (Do nothing.)
+"1"
+    Performs default SmartyPants transformations: quotes (including
+    \`\`backticks'' -style), em-dashes, and ellipses. "``--``" (dash dash)
+    is used to signify an em-dash; there is no support for en-dashes.
+
+"2"
+    Same as smarty_pants="1", except that it uses the old-school typewriter
+    shorthand for dashes: "``--``" (dash dash) for en-dashes, "``---``"
+    (dash dash dash) for em-dashes.
+
+"3"
+    Same as smarty_pants="2", but inverts the shorthand for dashes:
+    "``--``" (dash dash) for em-dashes, and "``---``" (dash dash dash) for
+    en-dashes.
+
+"-1"
+    Stupefy mode. Reverses the SmartyPants transformation process, turning
+    the HTML entities produced by SmartyPants into their ASCII equivalents.
+    E.g. "&#8220;" is turned into a simple double-quote ("), "&#8212;" is
+    turned into two dashes, etc.
+
+The following single-character attribute values can be combined to toggle
+individual transformations from within the smarty_pants attribute. For
+example, to educate normal quotes and em-dashes, but not ellipses or
+\`\`backticks'' -style quotes:
+
+``py['smartypants_attributes'] = "qd"``
+
+"q"
+    Educates normal quote characters: (") and (').
+
+"b"
+    Educates \`\`backticks'' -style double quotes.
+
+"B"
+    Educates \`\`backticks'' -style double quotes and \`single' quotes.
+
+"d"
+    Educates em-dashes.
+
+"D"
+    Educates em-dashes and en-dashes, using old-school typewriter shorthand:
+    (dash dash) for en-dashes, (dash dash dash) for em-dashes.
+
+"i"
+    Educates em-dashes and en-dashes, using inverted old-school typewriter
+    shorthand: (dash dash) for em-dashes, (dash dash dash) for en-dashes.
+
+"e"
+    Educates ellipses.
+
+"w"
+    Translates any instance of ``&quot;`` into a normal double-quote character.
+    This should be of no interest to most people, but of particular interest
+    to anyone who writes their posts using Dreamweaver, as Dreamweaver
+    inexplicably uses this entity to represent a literal double-quote
+    character. SmartyPants only educates normal quotes, not entities (because
+    ordinarily, entities are used for the explicit purpose of representing the
+    specific character they represent). The "w" option must be used in
+    conjunction with one (or both) of the other quote options ("q" or "b").
+    Thus, if you wish to apply all SmartyPants transformations (quotes, en-
+    and em-dashes, and ellipses) and also translate ``&quot;`` entities into
+    regular quotes so SmartyPants can educate them, you should pass "qDew"
+    to the smarty_pants attribute.
+
+The ``smartypants_forbidden_flavours`` list contains pyblosxom flavours for
+which no Smarty Pants rendering will occur.
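+
+For instance (a quick illustrative sketch of the attribute values above;
+``smartyPants`` is the module-level entry point defined below)::
+
+    from calibre.utils.smartypants import smartyPants
+
+    text = '"Isn\'t this fun?" -- she said...'
+    print smartyPants(text)         # "1" (default): quotes, backticks, dashes, ellipses
+    print smartyPants(text, "2")    # old-school dashes: -- en-dash, --- em-dash
+    print smartyPants(text, "qe")   # educate only quotes and ellipses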
+
+
+Caveats
+=======
+
+Why You Might Not Want to Use Smart Quotes in Your Weblog
+---------------------------------------------------------
+
+For one thing, you might not care.
+
+Most normal, mentally stable individuals do not take notice of proper
+typographic punctuation. Many design and typography nerds, however, break
+out in a nasty rash when they encounter, say, a restaurant sign that uses
+a straight apostrophe to spell "Joe's".
+
+If you're the sort of person who just doesn't care, you might well want to
+continue not caring. Using straight quotes -- and sticking to the 7-bit
+ASCII character set in general -- is certainly a simpler way to live.
+
+Even if you *do* care about accurate typography, you still might want to
+think twice before educating the quote characters in your weblog. One side
+effect of publishing curly quote HTML entities is that it makes your
+weblog a bit harder for others to quote from using copy-and-paste. What
+happens is that when someone copies text from your blog, the copied text
+contains the 8-bit curly quote characters (as well as the 8-bit characters
+for em-dashes and ellipses, if you use these options). These characters
+are not standard across different text encoding methods, which is why they
+need to be encoded as HTML entities.
+
+People copying text from your weblog, however, may not notice that you're
+using curly quotes, and they'll go ahead and paste the unencoded 8-bit
+characters copied from their browser into an email message or their own
+weblog. When pasted as raw "smart quotes", these characters are likely to
+get mangled beyond recognition.
+
+That said, my own opinion is that any decent text editor or email client
+makes it easy to stupefy smart quote characters into their 7-bit
+equivalents, and I don't consider it my problem if you're using an
+indecent text editor or email client.
+
+
+Algorithmic Shortcomings
+------------------------
+
+One situation in which quotes will get curled the wrong way is when
+apostrophes are used at the start of leading contractions. For example:
+
+``'Twas the night before Christmas.``
+
+In the case above, SmartyPants will turn the apostrophe into an opening
+single-quote, when in fact it should be a closing one. I don't think
+this problem can be solved in the general case -- every word processor
+I've tried gets this wrong as well. In such cases, it's best to use the
+proper HTML entity for closing single-quotes (``&#8217;``) by hand.
+
+
+Bugs
+====
+
+To file bug reports or feature requests (other than topics listed in the
+Caveats section above) please send email to: mailto:smartypantspy@chad.org
+
+If the bug involves quotes being curled the wrong way, please send example
+text to illustrate.
+
+To Do list
+----------
+
+- Provide a function for use within templates to quote anything at all.
+
+
+Version History
+===============
+
+1.5_1.6: Fri, 27 Jul 2007 07:06:40 -0400
+    - Fixed bug where blocks of precious unalterable text were instead
+      interpreted. Thanks to Le Roux and Dirk van Oosterbosch.
+
+1.5_1.5: Sat, 13 Aug 2005 15:50:24 -0400
+    - Fix bogus magical quotation when there is no hint that the
+      user wants it, e.g., in "21st century". Thanks to Nathan Hamblen.
+    - Be smarter about quotes before terminating numbers in an en-dash'ed
+      range.
+
+1.5_1.4: Thu, 10 Feb 2005 20:24:36 -0500
+    - Fix a date-processing bug, as reported by jacob childress.
+    - Begin a test-suite for ensuring correct output.
+    - Removed import of "string", since I didn't really need it.
+ (This was my first every Python program. Sue me!) + +1.5_1.3: Wed, 15 Sep 2004 18:25:58 -0400 + - Abort processing if the flavour is in forbidden-list. Default of + [ "rss" ] (Idea of Wolfgang SCHNERRING.) + - Remove stray virgules from en-dashes. Patch by Wolfgang SCHNERRING. + +1.5_1.2: Mon, 24 May 2004 08:14:54 -0400 + - Some single quotes weren't replaced properly. Diff-tesuji played + by Benjamin GEIGER. + +1.5_1.1: Sun, 14 Mar 2004 14:38:28 -0500 + - Support upcoming pyblosxom 0.9 plugin verification feature. + +1.5_1.0: Tue, 09 Mar 2004 08:08:35 -0500 + - Initial release + +Version Information +------------------- + +Version numbers will track the SmartyPants_ version numbers, with the addition +of an underscore and the smartypants.py version on the end. + +New versions will be available at `http://wiki.chad.org/SmartyPantsPy`_ + +.. _http://wiki.chad.org/SmartyPantsPy: http://wiki.chad.org/SmartyPantsPy + +Authors +======= + +`John Gruber`_ did all of the hard work of writing this software in Perl for +`Movable Type`_ and almost all of this useful documentation. `Chad Miller`_ +ported it to Python to use with Pyblosxom_. + + +Additional Credits +================== + +Portions of the SmartyPants original work are based on Brad Choate's nifty +MTRegex plug-in. `Brad Choate`_ also contributed a few bits of source code to +this plug-in. Brad Choate is a fine hacker indeed. + +`Jeremy Hedley`_ and `Charles Wiltgen`_ deserve mention for exemplary beta +testing of the original SmartyPants. + +`Rael Dornfest`_ ported SmartyPants to Blosxom. + +.. _Brad Choate: http://bradchoate.com/ +.. _Jeremy Hedley: http://antipixel.com/ +.. _Charles Wiltgen: http://playbacktime.com/ +.. _Rael Dornfest: http://raelity.org/ + + +Copyright and License +===================== + +SmartyPants_ license:: + + Copyright (c) 2003 John Gruber + (http://daringfireball.net/) + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + + * Neither the name "SmartyPants" nor the names of its contributors + may be used to endorse or promote products derived from this + software without specific prior written permission. + + This software is provided by the copyright holders and contributors "as + is" and any express or implied warranties, including, but not limited + to, the implied warranties of merchantability and fitness for a + particular purpose are disclaimed. In no event shall the copyright + owner or contributors be liable for any direct, indirect, incidental, + special, exemplary, or consequential damages (including, but not + limited to, procurement of substitute goods or services; loss of use, + data, or profits; or business interruption) however caused and on any + theory of liability, whether in contract, strict liability, or tort + (including negligence or otherwise) arising in any way out of the use + of this software, even if advised of the possibility of such damage. + + +smartypants.py license:: + + smartypants.py is a derivative work of SmartyPants. 
+ + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + + This software is provided by the copyright holders and contributors "as + is" and any express or implied warranties, including, but not limited + to, the implied warranties of merchantability and fitness for a + particular purpose are disclaimed. In no event shall the copyright + owner or contributors be liable for any direct, indirect, incidental, + special, exemplary, or consequential damages (including, but not + limited to, procurement of substitute goods or services; loss of use, + data, or profits; or business interruption) however caused and on any + theory of liability, whether in contract, strict liability, or tort + (including negligence or otherwise) arising in any way out of the use + of this software, even if advised of the possibility of such damage. + + + +.. _John Gruber: http://daringfireball.net/ +.. _Chad Miller: http://web.chad.org/ + +.. _Pyblosxom: http://roughingit.subtlehints.net/pyblosxom +.. _SmartyPants: http://daringfireball.net/projects/smartypants/ +.. _Movable Type: http://www.movabletype.org/ + +""" + +default_smartypants_attr = "1" + +import re + +tags_to_skip_regex = re.compile(r"<(/)?(pre|code|kbd|script|math)[^>]*>", re.I) + + +def verify_installation(request): + return 1 + # assert the plugin is functional + + +def cb_story(args): + global default_smartypants_attr + + try: + forbidden_flavours = args["entry"]["smartypants_forbidden_flavours"] + except KeyError: + forbidden_flavours = [ "rss" ] + + try: + attributes = args["entry"]["smartypants_attributes"] + except KeyError: + attributes = default_smartypants_attr + + if attributes is None: + attributes = default_smartypants_attr + + entryData = args["entry"].getData() + + try: + if args["request"]["flavour"] in forbidden_flavours: + return + except KeyError: + if "<" in args["entry"]["body"][0:15]: # sniff the stream + return # abort if it looks like escaped HTML. FIXME + + # FIXME: make these configurable, perhaps? + args["entry"]["body"] = smartyPants(entryData, attributes) + args["entry"]["title"] = smartyPants(args["entry"]["title"], attributes) + + +### interal functions below here + +def smartyPants(text, attr=default_smartypants_attr): + convert_quot = False # should we translate " entities into normal quotes? + + # Parse attributes: + # 0 : do nothing + # 1 : set all + # 2 : set all, using old school en- and em- dash shortcuts + # 3 : set all, using inverted old school en and em- dash shortcuts + # + # q : quotes + # b : backtick quotes (``double'' only) + # B : backtick quotes (``double'' and `single') + # d : dashes + # D : old school dashes + # i : inverted old school dashes + # e : ellipses + # w : convert " entities to " for Dreamweaver users + + skipped_tag_stack = [] + do_dashes = "0" + do_backticks = "0" + do_quotes = "0" + do_ellipses = "0" + do_stupefy = "0" + + if attr == "0": + # Do nothing. + return text + elif attr == "1": + do_quotes = "1" + do_backticks = "1" + do_dashes = "1" + do_ellipses = "1" + elif attr == "2": + # Do everything, turn all options on, use old school dash shorthand. 
+ do_quotes = "1" + do_backticks = "1" + do_dashes = "2" + do_ellipses = "1" + elif attr == "3": + # Do everything, turn all options on, use inverted old school dash shorthand. + do_quotes = "1" + do_backticks = "1" + do_dashes = "3" + do_ellipses = "1" + elif attr == "-1": + # Special "stupefy" mode. + do_stupefy = "1" + else: + for c in attr: + if c == "q": do_quotes = "1" + elif c == "b": do_backticks = "1" + elif c == "B": do_backticks = "2" + elif c == "d": do_dashes = "1" + elif c == "D": do_dashes = "2" + elif c == "i": do_dashes = "3" + elif c == "e": do_ellipses = "1" + elif c == "w": convert_quot = "1" + else: + pass + # ignore unknown option + + tokens = _tokenize(text) + result = [] + in_pre = False + + prev_token_last_char = "" + # This is a cheat, used to get some context + # for one-character tokens that consist of + # just a quote char. What we do is remember + # the last character of the previous text + # token, to use as context to curl single- + # character quote tokens correctly. + + for cur_token in tokens: + if cur_token[0] == "tag": + # Don't mess with quotes inside some tags. This does not handle self <closing/> tags! + result.append(cur_token[1]) + skip_match = tags_to_skip_regex.match(cur_token[1]) + if skip_match is not None: + if not skip_match.group(1): + skipped_tag_stack.append(skip_match.group(2).lower()) + in_pre = True + else: + if len(skipped_tag_stack) > 0: + if skip_match.group(2).lower() == skipped_tag_stack[-1]: + skipped_tag_stack.pop() + else: + pass + # This close doesn't match the open. This isn't XHTML. We should barf here. + if len(skipped_tag_stack) == 0: + in_pre = False + else: + t = cur_token[1] + last_char = t[-1:] # Remember last char of this token before processing. + if not in_pre: + t = processEscapes(t) + + if convert_quot != "0": + t = re.sub('"', '"', t) + + if do_dashes != "0": + if do_dashes == "1": + t = educateDashes(t) + if do_dashes == "2": + t = educateDashesOldSchool(t) + if do_dashes == "3": + t = educateDashesOldSchoolInverted(t) + + if do_ellipses != "0": + t = educateEllipses(t) + + # Note: backticks need to be processed before quotes. + if do_backticks != "0": + t = educateBackticks(t) + + if do_backticks == "2": + t = educateSingleBackticks(t) + + if do_quotes != "0": + if t == "'": + # Special case: single-character ' token + if re.match("\S", prev_token_last_char): + t = "’" + else: + t = "‘" + elif t == '"': + # Special case: single-character " token + if re.match("\S", prev_token_last_char): + t = "”" + else: + t = "“" + + else: + # Normal case: + t = educateQuotes(t) + + if do_stupefy == "1": + t = stupefyEntities(t) + + prev_token_last_char = last_char + result.append(t) + + return "".join(result) + + +def educateQuotes(str): + """ + Parameter: String. + + Returns: The string, with "educated" curly quote HTML entities. + + Example input: "Isn't this fun?" + Example output: “Isn’t this fun?” + """ + + punct_class = r"""[!"#\$\%'()*+,-.\/:;<=>?\@\[\\\]\^_`{|}~]""" + + # Special case if the very first character is a quote + # followed by punctuation at a non-word-break. 
Close the quotes by brute force: + str = re.sub(r"""^'(?=%s\\B)""" % (punct_class,), r"""’""", str) + str = re.sub(r"""^"(?=%s\\B)""" % (punct_class,), r"""”""", str) + + # Special case for double sets of quotes, e.g.: + # <p>He said, "'Quoted' words in a larger quote."</p> + str = re.sub(r""""'(?=\w)""", """“‘""", str) + str = re.sub(r"""'"(?=\w)""", """‘“""", str) + + # Special case for decade abbreviations (the '80s): + str = re.sub(r"""\b'(?=\d{2}s)""", r"""’""", str) + + close_class = r"""[^\ \t\r\n\[\{\(\-]""" + dec_dashes = r"""–|—""" + + # Get most opening single quotes: + opening_single_quotes_regex = re.compile(r""" + ( + \s | # a whitespace char, or +   | # a non-breaking space entity, or + -- | # dashes, or + &[mn]dash; | # named dash entities + %s | # or decimal entities + &\#x201[34]; # or hex + ) + ' # the quote + (?=\w) # followed by a word character + """ % (dec_dashes,), re.VERBOSE) + str = opening_single_quotes_regex.sub(r"""\1‘""", str) + + closing_single_quotes_regex = re.compile(r""" + (%s) + ' + (?!\s | s\b | \d) + """ % (close_class,), re.VERBOSE) + str = closing_single_quotes_regex.sub(r"""\1’""", str) + + closing_single_quotes_regex = re.compile(r""" + (%s) + ' + (\s | s\b) + """ % (close_class,), re.VERBOSE) + str = closing_single_quotes_regex.sub(r"""\1’\2""", str) + + # Any remaining single quotes should be opening ones: + str = re.sub(r"""'""", r"""‘""", str) + + # Get most opening double quotes: + opening_double_quotes_regex = re.compile(r""" + ( + \s | # a whitespace char, or +   | # a non-breaking space entity, or + -- | # dashes, or + &[mn]dash; | # named dash entities + %s | # or decimal entities + &\#x201[34]; # or hex + ) + " # the quote + (?=\w) # followed by a word character + """ % (dec_dashes,), re.VERBOSE) + str = opening_double_quotes_regex.sub(r"""\1“""", str) + + # Double closing quotes: + closing_double_quotes_regex = re.compile(r""" + #(%s)? # character that indicates the quote should be closing + " + (?=\s) + """ % (close_class,), re.VERBOSE) + str = closing_double_quotes_regex.sub(r"""”""", str) + + closing_double_quotes_regex = re.compile(r""" + (%s) # character that indicates the quote should be closing + " + """ % (close_class,), re.VERBOSE) + str = closing_double_quotes_regex.sub(r"""\1”""", str) + + # Any remaining quotes should be opening ones. + str = re.sub(r'"', r"""“""", str) + + return str + + +def educateBackticks(str): + """ + Parameter: String. + Returns: The string, with ``backticks'' -style double quotes + translated into HTML curly quote entities. + Example input: ``Isn't this fun?'' + Example output: “Isn't this fun?” + """ + + str = re.sub(r"""``""", r"""“""", str) + str = re.sub(r"""''""", r"""”""", str) + return str + + +def educateSingleBackticks(str): + """ + Parameter: String. + Returns: The string, with `backticks' -style single quotes + translated into HTML curly quote entities. + + Example input: `Isn't this fun?' + Example output: ‘Isn’t this fun?’ + """ + + str = re.sub(r"""`""", r"""‘""", str) + str = re.sub(r"""'""", r"""’""", str) + return str + + +def educateDashes(str): + """ + Parameter: String. + + Returns: The string, with each instance of "--" translated to + an em-dash HTML entity. + """ + + str = re.sub(r"""---""", r"""–""", str) # en (yes, backwards) + str = re.sub(r"""--""", r"""—""", str) # em (yes, backwards) + return str + + +def educateDashesOldSchool(str): + """ + Parameter: String. 
+ + Returns: The string, with each instance of "--" translated to + an en-dash HTML entity, and each "---" translated to + an em-dash HTML entity. + """ + + str = re.sub(r"""---""", r"""—""", str) # em (yes, backwards) + str = re.sub(r"""--""", r"""–""", str) # en (yes, backwards) + return str + + +def educateDashesOldSchoolInverted(str): + """ + Parameter: String. + + Returns: The string, with each instance of "--" translated to + an em-dash HTML entity, and each "---" translated to + an en-dash HTML entity. Two reasons why: First, unlike the + en- and em-dash syntax supported by + EducateDashesOldSchool(), it's compatible with existing + entries written before SmartyPants 1.1, back when "--" was + only used for em-dashes. Second, em-dashes are more + common than en-dashes, and so it sort of makes sense that + the shortcut should be shorter to type. (Thanks to Aaron + Swartz for the idea.) + """ + str = re.sub(r"""---""", r"""–""", str) # em + str = re.sub(r"""--""", r"""—""", str) # en + return str + + + +def educateEllipses(str): + """ + Parameter: String. + Returns: The string, with each instance of "..." translated to + an ellipsis HTML entity. + + Example input: Huh...? + Example output: Huh…? + """ + + str = re.sub(r"""\.\.\.""", r"""…""", str) + str = re.sub(r"""\. \. \.""", r"""…""", str) + return str + + +def stupefyEntities(str): + """ + Parameter: String. + Returns: The string, with each SmartyPants HTML entity translated to + its ASCII counterpart. + + Example input: “Hello — world.” + Example output: "Hello -- world." + """ + + str = re.sub(r"""–""", r"""-""", str) # en-dash + str = re.sub(r"""—""", r"""--""", str) # em-dash + + str = re.sub(r"""‘""", r"""'""", str) # open single quote + str = re.sub(r"""’""", r"""'""", str) # close single quote + + str = re.sub(r"""“""", r'''"''', str) # open double quote + str = re.sub(r"""”""", r'''"''', str) # close double quote + + str = re.sub(r"""…""", r"""...""", str)# ellipsis + + return str + + +def processEscapes(str): + r""" + Parameter: String. + Returns: The string, with after processing the following backslash + escape sequences. This is useful if you want to force a "dumb" + quote or other character to appear. + + Escape Value + ------ ----- + \\ \ + \" " + \' ' + \. . + \- - + \` ` + """ + str = re.sub(r"""\\\\""", r"""\""", str) + str = re.sub(r'''\\"''', r""""""", str) + str = re.sub(r"""\\'""", r"""'""", str) + str = re.sub(r"""\\\.""", r""".""", str) + str = re.sub(r"""\\-""", r"""-""", str) + str = re.sub(r"""\\`""", r"""`""", str) + + return str + + +def _tokenize(str): + """ + Parameter: String containing HTML markup. + Returns: Reference to an array of the tokens comprising the input + string. Each token is either a tag (possibly with nested, + tags contained therein, such as <a href="<MTFoo>">, or a + run of text between tags. Each element of the array is a + two-element array; the first is either 'tag' or 'text'; + the second is the actual value. + + Based on the _tokenize() subroutine from Brad Choate's MTRegex plugin. + <http://www.bradchoate.com/past/mtregex.php> + """ + + tokens = [] + + #depth = 6 + #nested_tags = "|".join(['(?:<(?:[^<>]',] * depth) + (')*>)' * depth) + #match = r"""(?: <! ( -- .*? -- \s* )+ > ) | # comments + # (?: <\? .*? 
\?> ) | # directives + # %s # nested tags """ % (nested_tags,) + tag_soup = re.compile(r"""([^<]*)(<[^>]*>)""") + + token_match = tag_soup.search(str) + + previous_end = 0 + while token_match is not None: + if token_match.group(1): + tokens.append(['text', token_match.group(1)]) + + tokens.append(['tag', token_match.group(2)]) + + previous_end = token_match.end() + token_match = tag_soup.search(str, token_match.end()) + + if previous_end < len(str): + tokens.append(['text', str[previous_end:]]) + + return tokens + + + +if __name__ == "__main__": + + import locale + + try: + locale.setlocale(locale.LC_ALL, '') + except: + pass + + from docutils.core import publish_string + docstring_html = publish_string(__doc__, writer_name='html') + + print docstring_html + + + # Unit test output goes out stderr. No worries. + import unittest + sp = smartyPants + + class TestSmartypantsAllAttributes(unittest.TestCase): + # the default attribute is "1", which means "all". + + def test_dates(self): + self.assertEqual(sp("1440-80's"), "1440-80’s") + self.assertEqual(sp("1440-'80s"), "1440-‘80s") + self.assertEqual(sp("1440---'80s"), "1440–‘80s") + self.assertEqual(sp("1960s"), "1960s") # no effect. + self.assertEqual(sp("1960's"), "1960’s") + self.assertEqual(sp("one two '60s"), "one two ‘60s") + self.assertEqual(sp("'60s"), "‘60s") + + def test_skip_tags(self): + self.assertEqual( + sp("""<script type="text/javascript">\n<!--\nvar href = "http://www.google.com";\nvar linktext = "google";\ndocument.write('<a href="' + href + '">' + linktext + "</a>");\n//-->\n</script>"""), + """<script type="text/javascript">\n<!--\nvar href = "http://www.google.com";\nvar linktext = "google";\ndocument.write('<a href="' + href + '">' + linktext + "</a>");\n//-->\n</script>""") + self.assertEqual( + sp("""<p>He said "Let's write some code." This code here <code>if True:\n\tprint "Okay"</code> is python code.</p>"""), + """<p>He said “Let’s write some code.” This code here <code>if True:\n\tprint "Okay"</code> is python code.</p>""") + + + def test_ordinal_numbers(self): + self.assertEqual(sp("21st century"), "21st century") # no effect. + self.assertEqual(sp("3rd"), "3rd") # no effect. + + def test_educated_quotes(self): + self.assertEqual(sp('''"Isn't this fun?"'''), '''“Isn’t this fun?”''') + + unittest.main() + + + + +__author__ = "Chad Miller <smartypantspy@chad.org>" +__version__ = "1.5_1.6: Fri, 27 Jul 2007 07:06:40 -0400" +__url__ = "http://wiki.chad.org/SmartyPantsPy" +__description__ = "Smart-quotes, smart-ellipses, and smart-dashes for weblog entries in pyblosxom"
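Two quick sanity checks of the "Algorithmic Shortcomings" caveat in the
module docstring (a minimal sketch; the behaviour shown follows from the
educateQuotes() rules above):

    from calibre.utils.smartypants import smartyPants

    # A leading contraction is curled into an *opening* single quote,
    # the known wrong-way case described in the docstring.
    print smartyPants("'Twas the night before Christmas.")

    # A hand-written closing-quote entity passes through untouched, which
    # is the workaround the docstring recommends.
    print smartyPants("&#8217;Twas the night before Christmas.")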
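To exercise the smarten_punctuation hook from patch 43 outside a full
conversion, the sketch below simply chains the same two calls that the new
HTMLPreProcessor.smarten_punctuation() method makes (the sample HTML string
is invented for illustration):

    from calibre.utils.smartypants import smartyPants
    from calibre.ebooks.chardet import substitute_entites

    html = '<p>"Drive," she said -- and that was that...</p>'
    # smartyPants emits decimal entities (&#8220; and friends);
    # substitute_entites then folds them back into literal characters,
    # which is exactly what smarten_punctuation() returns to the pipeline.
    print substitute_entites(smartyPants(html))

Since the option is plumbed through plumber.py, it should surface on the
command line as --smarten-punctuation (calibre derives switch names by
swapping underscores for hyphens) and in the GUI as the new "Smarten
punctuation" checkbox on the Look & Feel page.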