From ffe8fe5fd23a721af0fe1d07df109d78a39c743c Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Fri, 17 Sep 2010 10:42:11 -0600 Subject: [PATCH] Fix use of UTF-8 raw string --- src/calibre/ebooks/conversion/preprocess.py | 1 - src/calibre/ebooks/conversion/utils.py | 6 +++--- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py index 3e5de26766..03a0047927 100644 --- a/src/calibre/ebooks/conversion/preprocess.py +++ b/src/calibre/ebooks/conversion/preprocess.py @@ -144,7 +144,6 @@ class HTMLPreProcessor(object): # Fix pdftohtml markup PDFTOHTML = [ # Fix umlauts - # ¨ (re.compile(u'¨\s*()*\s*a', re.UNICODE), lambda match: u'ä'), (re.compile(u'¨\s*()*\s*A', re.UNICODE), lambda match: u'Ä'), (re.compile(u'¨\s*()*\s*e', re.UNICODE), lambda match: u'ë'), diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py index 8588ff65ad..37fd169cb1 100644 --- a/src/calibre/ebooks/conversion/utils.py +++ b/src/calibre/ebooks/conversion/utils.py @@ -176,8 +176,8 @@ class PreProcessor(object): self.log("*** Median line length is " + str(length) + ", calculated with " + format + " format ***") # # Unwrap and/or delete soft-hyphens, hyphens - html = re.sub(u'­\s*(\s*(\s*<[iubp][^>]*>\s*)?]*>|\s*<[iubp][^>]*>)?\s*', '', html) - html = re.sub(u'(?<=[-–—])\s*(?=<)(\s*(\s*<[iubp][^>]*>\s*)?]*>|\s*<[iubp][^>]*>)?\s*(?=[[a-z\d])', '', html) + html = re.sub(u'\xad\s*(\s*(\s*<[iubp][^>]*>\s*)?]*>|\s*<[iubp][^>]*>)?\s*', '', html) + html = re.sub(u'(?<=[-\u2013\u2014])\s*(?=<)(\s*(\s*<[iubp][^>]*>\s*)?]*>|\s*<[iubp][^>]*>)?\s*(?=[[a-z\d])', '', html) # Unwrap lines using punctation and line length unwrap = re.compile(r"(?<=.{%i}([a-z,;):\IA]|(?\s*()?\s*(?P<(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*\s*)\s*){0,3}\s*<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*" % length, re.UNICODE) @@ -195,6 +195,6 @@ class PreProcessor(object): html = doubleheading.sub('\g'+'\n'+'', html) # put back non-breaking spaces in empty paragraphs to preserve original formatting - html = blankreg.sub('\n'+'\g'+' '+'\g', html) + html = blankreg.sub('\n'+r'\g'+u'\u00a0'+r'\g', html) return html