From ff28f1741630669ed71a7dc75f37d7d82fb14367 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Fri, 19 Nov 2010 12:26:17 -0700 Subject: [PATCH] PDF Input: Fix unwrapping of accented characters --- src/calibre/ebooks/conversion/preprocess.py | 2 +- src/calibre/ebooks/conversion/utils.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py index b6b218e052..64dcd93f38 100644 --- a/src/calibre/ebooks/conversion/preprocess.py +++ b/src/calibre/ebooks/conversion/preprocess.py @@ -475,7 +475,7 @@ class HTMLPreProcessor(object): end_rules.append((re.compile(u'(?<=.{%i}[–—])\s*

\s*(?=[[a-z\d])' % length), lambda match: '')) end_rules.append( # Un wrap using punctuation - (re.compile(u'(?<=.{%i}([a-z,:)\IA\u00DF]|(?)?\s*(\s*)+\s*(?=(<(i|b|u)>)?\s*[\w\d$(])' % length, re.UNICODE), wrap_lines), + (re.compile(u'(?<=.{%i}([a-zäëïöüàèìòùáćéíóńśúâêîôûçąężı,:)\IA\u00DF]|(?)?\s*(\s*)+\s*(?=(<(i|b|u)>)?\s*[\w\d$(])' % length, re.UNICODE), wrap_lines), ) for rule in self.PREPROCESS + start_rules: diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py index 2faec27b68..f75fe71c82 100644 --- a/src/calibre/ebooks/conversion/utils.py +++ b/src/calibre/ebooks/conversion/utils.py @@ -228,7 +228,7 @@ class PreProcessor(object): html = dehyphenator(html,'html', length) self.log("Done dehyphenating") # Unwrap lines using punctation and line length - unwrap = re.compile(u"(?<=.{%i}([a-z,:)\IA\u00DF]|(?\s*()?\s*(?P<(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*\s*)\s*){0,3}\s*<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*" % length, re.UNICODE) + unwrap = re.compile(u"(?<=.{%i}([a-zäëïöüàèìòùáćéíóńśúâêîôûçąężı,:)\IA\u00DF]|(?\s*()?\s*(?P<(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*\s*)\s*){0,3}\s*<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*" % length, re.UNICODE) html = unwrap.sub(' ', html) #check any remaining hyphens, but only unwrap if there is a match dehyphenator = Dehyphenator()