From 9217e6bed381211842f56d25c9ed4957324a2b7e Mon Sep 17 00:00:00 2001 From: Lee Date: Sat, 21 Apr 2012 00:24:32 +0800 Subject: [PATCH 1/4] remove full stop punctuation from the line unwrap heuristic --- src/calibre/ebooks/conversion/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py index e2a02702df..24528d1fb8 100644 --- a/src/calibre/ebooks/conversion/utils.py +++ b/src/calibre/ebooks/conversion/utils.py @@ -319,7 +319,7 @@ class HeuristicProcessor(object): ''' # define the pieces of the regex - lookahead = "(?<=.{"+str(length)+u"}([a-zäëïöüàèìòùáćéíĺóŕńśúýâêîôûçąężıãõñæøþðßěľščťžňďřů,:“”)\IA\u00DF]|(?\s*()?" From 740c812de24e38120b33ba7d094ad288bc7cf234 Mon Sep 17 00:00:00 2001 From: Lee Date: Sat, 21 Apr 2012 00:52:13 +0800 Subject: [PATCH 2/4] expanded comments --- src/calibre/ebooks/conversion/utils.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py index 24528d1fb8..6dc3973213 100644 --- a/src/calibre/ebooks/conversion/utils.py +++ b/src/calibre/ebooks/conversion/utils.py @@ -316,9 +316,17 @@ class HeuristicProcessor(object): ''' Unwraps lines based on line length and punctuation supports a range of html markup and text files + + the lookahead regex below is meant to look for any non-full stop characters - punctuation + characters which can be used as a full stop should *not* be added below - e.g. ?!“”. etc + the reason for this is to prevent false positive wrapping. 
False positives are more + difficult to detect than false negatives during a manual review of the doc + + This function intentionally leaves hyphenated content alone as that is handled by the + dehyphenate routine in a future step ''' - # define the pieces of the regex + # define the pieces of the regex lookahead = "(?<=.{"+str(length)+u"}([a-zäëïöüàèìòùáćéíĺóŕńśúýâêîôûçąężıãõñæøþðßěľščťžňďřů,:)\IA\u00DF]|(? Date: Sat, 21 Apr 2012 00:54:35 +0800 Subject: [PATCH 3/4] ... --- src/calibre/ebooks/conversion/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py index 6dc3973213..acfa80e877 100644 --- a/src/calibre/ebooks/conversion/utils.py +++ b/src/calibre/ebooks/conversion/utils.py @@ -323,7 +323,7 @@ class HeuristicProcessor(object): difficult to detect than false negatives during a manual review of the doc This function intentionally leaves hyphenated content alone as that is handled by the - dehyphenate routine in a future step + dehyphenate routine in a separate step ''' # define the pieces of the regex From b717749138e144155edc86c7d61ff8c1413e7d9a Mon Sep 17 00:00:00 2001 From: Lee Date: Sat, 21 Apr 2012 00:59:30 +0800 Subject: [PATCH 4/4] fix the pattern in preprocess --- src/calibre/ebooks/conversion/preprocess.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py index c526cba8a9..16acaad383 100644 --- a/src/calibre/ebooks/conversion/preprocess.py +++ b/src/calibre/ebooks/conversion/preprocess.py @@ -559,7 +559,7 @@ class HTMLPreProcessor(object): end_rules.append((re.compile(u'(?<=.{%i}[–—])\s*

\s*(?=[[a-z\d])' % length), lambda match: '')) end_rules.append( # Un wrap using punctuation - (re.compile(u'(?<=.{%i}([a-zäëïöüàèìòùáćéíĺóŕńśúýâêîôûçąężıãõñæøþðßěľščťžňďřů,:“”)\IA\u00DF]|(?)?\s*(

\s*

\s*)+\s*(?=(<(i|b|u)>)?\s*[\w\d$(])' % length, re.UNICODE), wrap_lines), + (re.compile(u'(?<=.{%i}([a-zäëïöüàèìòùáćéíĺóŕńśúýâêîôûçąężıãõñæøþðßěľščťžňďřů,:)\IA\u00DF]|(?)?\s*(

\s*

\s*)+\s*(?=(<(i|b|u)>)?\s*[\w\d$(])' % length, re.UNICODE), wrap_lines), ) for rule in self.PREPROCESS + start_rules: