From c2cef786ce19b25cbdfc79c345d4cffa38885248 Mon Sep 17 00:00:00 2001
From: ldolse <ldolse@yahoo.com>
Date: Sun, 9 Jan 2011 19:34:02 +0800
Subject: [PATCH] added partial dehyphenation for markdown

---
 src/calibre/ebooks/conversion/preprocess.py | 16 +++++++--------
 src/calibre/ebooks/txt/input.py             | 22 +++++++++++++--------
 2 files changed, 22 insertions(+), 16 deletions(-)
diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py
index d9d735e391..e2c51846a4 100644
--- a/src/calibre/ebooks/conversion/preprocess.py
+++ b/src/calibre/ebooks/conversion/preprocess.py
@@ -201,15 +201,15 @@ class Dehyphenator(object):
             searchresult = self.html.find(lookupword.lower())
         except:
             return hyphenated
-        if self.format == 'html_cleanup':
+        if self.format == 'html_cleanup' or self.format == 'txt_cleanup':
             if self.html.find(lookupword) != -1 or searchresult != -1:
-                #print "Cleanup:returned dehyphenated word: " + str(dehyphenated)
+                print "Cleanup:returned dehyphenated word: " + str(dehyphenated)
                 return dehyphenated
             elif self.html.find(hyphenated) != -1:
-                #print "Cleanup:returned hyphenated word: " + str(hyphenated)
+                print "Cleanup:returned hyphenated word: " + str(hyphenated)
                 return hyphenated
             else:
-                #print "Cleanup:returning original text "+str(firsthalf)+" + linefeed "+str(secondhalf)
+                print "Cleanup:returning original text "+str(firsthalf)+" + linefeed "+str(secondhalf)
                 return firsthalf+u'\u2014'+wraptags+secondhalf
 
         else:
@@ -230,12 +230,12 @@ class Dehyphenator(object):
         elif format == 'txt':
             intextmatch = re.compile(u'(?<=.{%i})(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)(-|‐)(\u0020|\u0009)*(?P<wraptags>(\n(\u0020|\u0009)*)+)(?P<secondpart>[\w\d]+)'% length)
         elif format == 'individual_words':
-            intextmatch = re.compile(u'>[^<]*\b(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)"\s>]+)(-|‐)\u0020?(?P<secondpart>\w+)\b[^<]*<') # for later, not called anywhere yet
-        elif format == 'individual_words_txt':
-            intextmatch = re.compile(u'\b(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)"\s>]+)(-|‐)\u0020*(?P<secondpart>\w+)\b')
-
+            intextmatch = re.compile(u'>[^<]*\b(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)"\s>]+)(-|‐)\u0020*(?P<secondpart>\w+)\b[^<]*<') # for later, not called anywhere yet
         elif format == 'html_cleanup':
             intextmatch = re.compile(u'(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)(-|‐)\s*(?=<)(?P<wraptags></span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*(?P<secondpart>[\w\d]+)')
+        elif format == 'txt_cleanup':
+            intextmatch = re.compile(u'(?P<firstpart>\w+)(-|‐)(?P<wraptags>\s+)(?P<secondpart>[\w\d]+)')
+
 
         html = intextmatch.sub(self.dehyphenate, html)
         return html
diff --git a/src/calibre/ebooks/txt/input.py b/src/calibre/ebooks/txt/input.py
index 2e35e8e345..5fbdc7131a 100644
--- a/src/calibre/ebooks/txt/input.py
+++ b/src/calibre/ebooks/txt/input.py
@@ -73,6 +73,14 @@ class TXTInput(InputFormatPlugin):
         # followed by the &nbsp; entity.
         if options.preserve_spaces:
             txt = preserve_spaces(txt)
+
+        # Normalize line endings
+        txt = normalize_line_endings(txt)
+
+        # Get length for hyphen removal and punctuation unwrap
+        docanalysis = DocAnalysis('txt', txt)
+        length = docanalysis.line_length(.5)
+        print "length is "+str(length)
             
         if options.formatting_type == 'auto':
             options.formatting_type = detect_formatting_type(txt)
@@ -94,14 +102,6 @@ class TXTInput(InputFormatPlugin):
                 else:
                     log.debug('Auto detected paragraph type as %s' % options.paragraph_type)
 
-            # Normalize line endings
-            txt = normalize_line_endings(txt)
-
-            # Get length for hyphen removal and punctuation unwrap
-            docanalysis = DocAnalysis('txt', txt)
-            length = docanalysis.line_length(.5)
-            print "length is "+str(length)
-
             # Dehyphenate
             dehyphenator = Dehyphenator()
             txt = dehyphenator(txt,'txt', length)
@@ -129,6 +129,12 @@ class TXTInput(InputFormatPlugin):
             else:
                 html = convert_basic(txt, epub_split_size_kb=flow_size)
 
+        # Dehyphenate in cleanup mode to catch hyphenated words missed by the txt and markdown conversions
+        print "going through final dehyphenation"
+        dehyphenator = Dehyphenator()
+        html = dehyphenator(html,'txt_cleanup', length)
+        html = dehyphenator(html,'html_cleanup', length)
+
         from calibre.customize.ui import plugin_for_input_format
         html_input = plugin_for_input_format('html')
         for opt in html_input.options: