From 9217e6bed381211842f56d25c9ed4957324a2b7e Mon Sep 17 00:00:00 2001
From: Lee <ldolse@yahoo.com>
Date: Sat, 21 Apr 2012 00:24:32 +0800
Subject: [PATCH 1/4] remove full stop punctuation from the line unwrap
 heuristic

---
 src/calibre/ebooks/conversion/utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py
index e2a02702df..24528d1fb8 100644
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@@ -319,7 +319,7 @@ class HeuristicProcessor(object):
         '''
         # define the pieces of the regex
 
-        lookahead = "(?<=.{"+str(length)+u"}([a-zäëïöüàèìòùáćéíĺóŕńśúýâêîôûçąężıãõñæøþðßěľščťžňďřů,:“”)\IA\u00DF]|(?<!\&\w{4});))" # (?<!\&\w{4});) is a semicolon not part of an entity
+        lookahead = "(?<=.{"+str(length)+u"}([a-zäëïöüàèìòùáćéíĺóŕńśúýâêîôûçąężıãõñæøþðßěľščťžňďřů,:)\IA\u00DF]|(?<!\&\w{4});))" # (?<!\&\w{4});) is a semicolon not part of an entity
         em_en_lookahead = "(?<=.{"+str(length)+u"}[\u2013\u2014])"
         soft_hyphen = u"\xad"
         line_ending = "\s*</(span|[iubp]|div)>\s*(</(span|[iubp]|div)>)?"

From 740c812de24e38120b33ba7d094ad288bc7cf234 Mon Sep 17 00:00:00 2001
From: Lee <ldolse@yahoo.com>
Date: Sat, 21 Apr 2012 00:52:13 +0800
Subject: [PATCH 2/4] expanded comments

---
 src/calibre/ebooks/conversion/utils.py | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py
index 24528d1fb8..6dc3973213 100644
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@@ -316,9 +316,17 @@ class HeuristicProcessor(object):
         '''
         Unwraps lines based on line length and punctuation
         supports a range of html markup and text files
+        
+        the lookahead regex below is meant to look for any non-full stop characters - punctuation
+        characters which can be used as a full stop should *not* be added below - e.g. ?!“”. etc
+        the reason for this is to prevent false positive wrapping.  False positives are more
+        difficult to detect than false negatives during a manual review of the doc
+        
+        This function intentionally leaves hyphenated content alone as that is handled by the 
+        dehyphenate routine in a future step
         '''
-        # define the pieces of the regex
 
+        # define the pieces of the regex
         lookahead = "(?<=.{"+str(length)+u"}([a-zäëïöüàèìòùáćéíĺóŕńśúýâêîôûçąężıãõñæøþðßěľščťžňďřů,:)\IA\u00DF]|(?<!\&\w{4});))" # (?<!\&\w{4});) is a semicolon not part of an entity
         em_en_lookahead = "(?<=.{"+str(length)+u"}[\u2013\u2014])"
         soft_hyphen = u"\xad"

From c5c3354b338ac0bfc3fb1329923179638c02f7b1 Mon Sep 17 00:00:00 2001
From: Lee <ldolse@yahoo.com>
Date: Sat, 21 Apr 2012 00:54:35 +0800
Subject: [PATCH 3/4] docstring: dehyphenate runs in a separate step

---
 src/calibre/ebooks/conversion/utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py
index 6dc3973213..acfa80e877 100644
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@@ -323,7 +323,7 @@ class HeuristicProcessor(object):
         difficult to detect than false negatives during a manual review of the doc
         
         This function intentionally leaves hyphenated content alone as that is handled by the 
-        dehyphenate routine in a future step
+        dehyphenate routine in a separate step
         '''
 
         # define the pieces of the regex

From b717749138e144155edc86c7d61ff8c1413e7d9a Mon Sep 17 00:00:00 2001
From: Lee <ldolse@yahoo.com>
Date: Sat, 21 Apr 2012 00:59:30 +0800
Subject: [PATCH 4/4] fix the pattern in preprocess

---
 src/calibre/ebooks/conversion/preprocess.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py
index c526cba8a9..16acaad383 100644
--- a/src/calibre/ebooks/conversion/preprocess.py
+++ b/src/calibre/ebooks/conversion/preprocess.py
@@ -559,7 +559,7 @@ class HTMLPreProcessor(object):
                 end_rules.append((re.compile(u'(?<=.{%i}[–—])\s*<p>\s*(?=[[a-z\d])' % length), lambda match: ''))
                 end_rules.append(
                     # Un wrap using punctuation
-                    (re.compile(u'(?<=.{%i}([a-zäëïöüàèìòùáćéíĺóŕńśúýâêîôûçąężıãõñæøþðßěľščťžňďřů,:“”)\IA\u00DF]|(?<!\&\w{4});))\s*(?P<ital></(i|b|u)>)?\s*(</p>\s*<p>\s*)+\s*(?=(<(i|b|u)>)?\s*[\w\d$(])' % length, re.UNICODE), wrap_lines),
+                    (re.compile(u'(?<=.{%i}([a-zäëïöüàèìòùáćéíĺóŕńśúýâêîôûçąężıãõñæøþðßěľščťžňďřů,:)\IA\u00DF]|(?<!\&\w{4});))\s*(?P<ital></(i|b|u)>)?\s*(</p>\s*<p>\s*)+\s*(?=(<(i|b|u)>)?\s*[\w\d$(])' % length, re.UNICODE), wrap_lines),
                 )
 
         for rule in self.PREPROCESS + start_rules: